//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
//  Data for SIMD
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
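//
// As an illustration only (a hypothetical example, not tied to any particular
// target): given a scalar loop such as
//
//   for (int i = 0; i < n; ++i)
//     A[i] = B[i] + 42;
//
// vectorizing with VF=4 rewrites the body to operate on four elements per
// iteration and steps the induction variable by 4 instead of 1, with the
// remaining iterations handled by a scalar epilogue loop or by a predicated
// (tail-folded) vector body.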
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "LoopVectorizationPlanner.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanHCFGBuilder.h"
#include "VPlanPredicator.h"
#include "VPlanTransforms.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/InstructionCost.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <functional>
#include <iterator>
#include <limits>
#include <memory>
#include <string>
#include <tuple>
#include <utility>

using namespace llvm;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

#ifndef NDEBUG
const char VerboseDebug[] = DEBUG_TYPE "-verbose";
#endif

/// @{
/// Metadata attribute names
const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
const char LLVMLoopVectorizeFollowupVectorized[] =
    "llvm.loop.vectorize.followup_vectorized";
const char LLVMLoopVectorizeFollowupEpilogue[] =
    "llvm.loop.vectorize.followup_epilogue";
/// @}

STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");

static cl::opt<bool> EnableEpilogueVectorization(
    "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
    cl::desc("Enable vectorization of epilogue loops."));

static cl::opt<unsigned> EpilogueVectorizationForceVF(
    "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
    cl::desc("When epilogue vectorization is enabled, and a value greater than "
             "1 is specified, forces the given VF for all applicable epilogue "
             "loops."));

static cl::opt<unsigned> EpilogueVectorizationMinVF(
    "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
    cl::desc("Only loops with vectorization factor equal to or larger than "
             "the specified value are considered for epilogue vectorization."));

/// Loops with a known constant trip count below this number are vectorized only
/// if no scalar iteration overheads are incurred.
static cl::opt<unsigned> TinyTripCountVectorThreshold(
    "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
    cl::desc("Loops with a constant trip count that is smaller than this "
             "value are vectorized only if no scalar iteration overheads "
             "are incurred."));

static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold(
    "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
    cl::desc("The maximum allowed number of runtime memory checks with a "
             "vectorize(enable) pragma."));

// Option prefer-predicate-over-epilogue indicates that an epilogue is undesired,
// that predication is preferred, and this lists all options. I.e., the
// vectorizer will try to fold the tail-loop (epilogue) into the vector body
// and predicate the instructions accordingly.
// If tail-folding fails, there are different fallback strategies depending on
// these values:
namespace PreferPredicateTy {
enum Option {
  ScalarEpilogue = 0,
  PredicateElseScalarEpilogue,
  PredicateOrDontVectorize
};
} // namespace PreferPredicateTy

static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
    "prefer-predicate-over-epilogue",
    cl::init(PreferPredicateTy::ScalarEpilogue),
    cl::Hidden,
    cl::desc("Tail-folding and predication preferences over creating a scalar "
             "epilogue loop."),
    cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
                          "scalar-epilogue",
                          "Don't tail-predicate loops, create scalar epilogue"),
               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
                          "predicate-else-scalar-epilogue",
                          "prefer tail-folding, create scalar epilogue if tail "
                          "folding fails."),
               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
                          "predicate-dont-vectorize",
                          "prefer tail-folding, don't attempt vectorization if "
                          "tail-folding fails.")));

static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));

static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
             "below this number"));

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value."
             " Mostly useful for getting consistent testing."));

static cl::opt<bool> ForceTargetSupportsScalableVectors(
    "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
    cl::desc(
        "Pretend that scalable vectors are supported, even if the target does "
        "not support them. This flag should only be used for testing."));

static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// Interleave small loops with scalar reductions.
static cl::opt<bool> InterleaveSmallLoopScalarReduction(
    "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
    cl::desc("Enable interleaving for loops with small iteration counts that "
             "contain scalar reductions to expose ILP."));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if-predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

static cl::opt<bool>
    PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
                           cl::Hidden,
                           cl::desc("Prefer in-loop vector reductions, "
                                    "overriding the target's preference."));

static cl::opt<bool> ForceOrderedReductions(
    "force-ordered-reductions", cl::init(false), cl::Hidden,
    cl::desc("Enable the vectorization of loops with in-order (strict) "
             "FP reductions"));

static cl::opt<bool> PreferPredicatedReductionSelect(
    "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
    cl::desc(
        "Prefer predicating a reduction operation over a select after the loop."));

cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));

// FIXME: Remove this switch once we have divergence analysis. Currently we
// assume divergent non-backedge branches when this switch is true.
cl::opt<bool> EnableVPlanPredication(
    "enable-vplan-predication", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path predicator with "
             "support for outer loop vectorization."));

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

cl::opt<bool> PrintVPlansInDotFormat(
    "vplan-print-in-dot-format", cl::init(false), cl::Hidden,
    cl::desc("Use dot format instead of plain text when dumping VPlans"));

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type.
static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
  // Determine if an array of N elements of type Ty is "bitcast compatible"
  // with a <N x Ty> vector.
  // This is only true if there is no padding between the array elements.
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}

/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

/// Returns "best known" trip count for the specified loop \p L as defined by
/// the following procedure:
///   1) Returns exact trip count if it is known.
///   2) Returns expected trip count according to profile data if any.
///   3) Returns upper bound estimate if it is known.
///   4) Returns None if all of the above failed.
static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
  // Check if exact trip count is known.
  if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
    return ExpectedTC;

  // Check if there is an expected trip count available from profile data.
  if (LoopVectorizeWithBlockFrequency)
    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
      return EstimatedTC;

  // Check if upper bound estimate is known.
  if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
    return ExpectedTC;

  return None;
}

// Forward declare GeneratedRTChecks.
class GeneratedRTChecks;

namespace llvm {

AnalysisKey ShouldRunExtraVectorPasses::Key;

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found for a given vectorization factor.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                      ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
        PSI(PSI), RTChecks(RTChecks) {
    // Query this against the original loop and save it here because the profile
    // of the original loop header may change as the transformation happens.
    OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
        OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
  }

  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop that will contain vectorized instructions later
  /// on, while the old loop will be used as the scalar remainder. Control flow
  /// is generated around the vectorized (and scalar epilogue) loops consisting
  /// of various checks and bypasses. Return the pre-header block of the new
  /// loop and the start value for the canonical induction, if it is != 0. The
  /// latter is the case when vectorizing the epilogue loop. In the case of
  /// epilogue vectorization, this function is overridden to handle the more
  /// complex control flow around the loops.
  virtual std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton();

  /// Widen a single call instruction within the innermost loop.
  void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
                            VPTransformState &State);

  /// Fix the vectorized code, taking care of header phis, live-outs, and more.
  void fixVectorizedLoop(VPTransformState &State);

  // Return true if any runtime check is added.
  bool areSafetyChecksAdded() { return AddedSafetyChecks; }

  /// A type for vectorized values in the new loop.
  /// Each value from the original loop, when vectorized, is represented by UF
  /// vector values in the new unrolled loop, where UF is the unroll factor.
  using VectorParts = SmallVector<Value *, 2>;

  /// Vectorize a single first-order recurrence or pointer induction PHINode in
  /// a block. This method handles the induction variable canonicalization. It
  /// supports both VF = 1 for unrolled loops and arbitrary length vectors.
  void widenPHIInstruction(Instruction *PN, VPWidenPHIRecipe *PhiR,
                           VPTransformState &State);

  /// A helper function to scalarize a single Instruction in the innermost loop.
  /// Generates a sequence of scalar instances for each lane between \p MinLane
  /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
  /// inclusive. Uses the VPValue operands from \p RepRecipe instead of \p
  /// Instr's operands.
  void scalarizeInstruction(Instruction *Instr, VPReplicateRecipe *RepRecipe,
                            const VPIteration &Instance, bool IfPredicateInstr,
                            VPTransformState &State);

  /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
  /// is provided, the integer induction variable will first be truncated to
  /// the corresponding type. \p CanonicalIV is the scalar value generated for
  /// the canonical induction variable.
  void widenIntOrFpInduction(PHINode *IV, VPWidenIntOrFpInductionRecipe *Def,
                             VPTransformState &State, Value *CanonicalIV);

  /// Construct the vector value of a scalarized value \p V one lane at a time.
  void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance,
                                 VPTransformState &State);

  /// Try to vectorize interleaved access group \p Group with the base address
  /// given in \p Addr, optionally masking the vector operations if \p
  /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
  /// values in the vectorized loop.
  void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
                                ArrayRef<VPValue *> VPDefs,
                                VPTransformState &State, VPValue *Addr,
                                ArrayRef<VPValue *> StoredValues,
                                VPValue *BlockInMask = nullptr);

  /// Set the debug location in the builder using the debug location in \p V.
  /// If \p CustomBuilder is None then it uses the class member's Builder.
  void setDebugLocFromInst(const Value *V,
                           Optional<IRBuilderBase *> CustomBuilder = None);

  /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
  void fixNonInductionPHIs(VPTransformState &State);

  /// Returns true if the reordering of FP operations is not allowed, but we are
  /// able to vectorize with strict in-order reductions for the given RdxDesc.
  bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc);

  /// Create a broadcast instruction. This method generates a broadcast
  /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, ...;
  /// this is needed because each iteration in the loop corresponds to a SIMD
  /// element.
  virtual Value *getBroadcastInstrs(Value *V);

  /// Add metadata from one instruction to another.
  ///
  /// This includes both the original MDs from \p From and additional ones (\see
  /// addNewMetadata). Use this for *newly created* instructions in the vector
  /// loop.
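  ///
  /// For example (illustrative only): when a wide load is created for an
  /// original scalar load, this copies metadata such as !tbaa from the scalar
  /// instruction and, if the loop was versioned with runtime memory checks,
  /// also attaches the corresponding noalias scope metadata.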
  void addMetadata(Instruction *To, Instruction *From);

  /// Similar to the previous function but it adds the metadata to a
  /// vector of instructions.
  void addMetadata(ArrayRef<Value *> To, Instruction *From);

  // Returns the resume value (bc.merge.rdx) for a reduction as
  // generated by fixReduction.
  PHINode *getReductionResumeValue(const RecurrenceDescriptor &RdxDesc);

protected:
  friend class LoopVectorizationPlanner;

  /// A small list of PHINodes.
  using PhiVector = SmallVector<PHINode *, 4>;

  /// A type for scalarized values in the new loop. Each value from the
  /// original loop, when scalarized, is represented by UF x VF scalar values
  /// in the new unrolled loop, where UF is the unroll factor and VF is the
  /// vectorization factor.
  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;

  /// Set up the values of the IVs correctly when exiting the vector loop.
  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                    Value *CountRoundDown, Value *EndValue,
                    BasicBlock *MiddleBlock);

  /// Introduce a conditional branch (on true, condition to be set later) at the
  /// end of the header=latch connecting it to itself (across the backedge) and
  /// to the exit block of \p L.
  void createHeaderBranch(Loop *L);

  /// Handle all cross-iteration phis in the header.
  void fixCrossIterationPHIs(VPTransformState &State);

  /// Create the exit value of first order recurrences in the middle block and
  /// update their users.
  void fixFirstOrderRecurrence(VPFirstOrderRecurrencePHIRecipe *PhiR,
                               VPTransformState &State);

  /// Create code for the loop exit value of the reduction.
  void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State);

  /// Clear NSW/NUW flags from reduction instructions if necessary.
  void clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc,
                               VPTransformState &State);

  /// Fixup the LCSSA phi nodes in the unique exit block. This simply
  /// means we need to add the appropriate incoming value from the middle
  /// block as exiting edges from the scalar epilogue loop (if present) are
  /// already in place, and we exit the vector loop exclusively to the middle
  /// block.
  void fixLCSSAPHIs(VPTransformState &State);

  /// Iteratively sink the scalarized operands of a predicated instruction into
  /// the block that was created for it.
  void sinkScalarOperands(Instruction *PredInst);

  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
  /// represented as.
  void truncateToMinimalBitwidths(VPTransformState &State);

  /// Create a vector induction phi node based on an existing scalar one. \p
  /// EntryVal is the value from the original loop that maps to the vector phi
  /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
  /// truncate instruction, instead of widening the original IV, we widen a
  /// version of the IV truncated to \p EntryVal's type.
  void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
                                       Value *Step, Value *Start,
                                       Instruction *EntryVal, VPValue *Def,
                                       VPTransformState &State);

  /// Returns (and creates if needed) the original loop trip count.
  Value *getOrCreateTripCount(Loop *NewLoop);

  /// Returns (and creates if needed) the trip count of the widened loop.
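  /// When the tail is not folded, this is the original trip count rounded down
  /// to a multiple of VF * UF, i.e. TripCount - TripCount % (VF * UF)
  /// (see the VectorTripCount member below).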
  Value *getOrCreateVectorTripCount(Loop *NewLoop);

  /// Returns a bitcasted value to the requested vector type.
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                const DataLayout &DL);

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct. Returns the block containing the checks or
  /// nullptr if no checks have been added.
  BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  /// Returns the block containing the checks or nullptr if no checks have been
  /// added.
  BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);

  /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
  /// vector loop preheader, middle block and scalar preheader. Also
  /// allocate a loop object for the new vector loop and return it.
  Loop *createVectorLoopSkeleton(StringRef Prefix);

  /// Create new phi nodes for the induction variables to resume iteration count
  /// in the scalar epilogue, from where the vectorized loop left off.
  /// In cases where the loop skeleton is more complicated (e.g. epilogue
  /// vectorization) and the resume values can come from an additional bypass
  /// block, the \p AdditionalBypass pair provides information about the bypass
  /// block and the end value on the edge from bypass to this loop.
  void createInductionResumeValues(
      Loop *L,
      std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});

  /// Complete the loop skeleton by adding debug MDs, creating appropriate
  /// conditional branches in the middle block, preparing the builder and
  /// running the verifier. Take in the vector loop \p L as argument, and return
  /// the preheader of the completed vector loop.
  BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);

  /// Add additional metadata to \p To that was not present on \p Orig.
  ///
  /// Currently this is used to add the noalias annotations based on the
  /// inserted memchecks. Use this for instructions that are *cloned* into the
  /// vector loop.
  void addNewMetadata(Instruction *To, const Instruction *Orig);

  /// Collect poison-generating recipes that may generate a poison value that is
  /// used after vectorization, even when their operands are not poison. Those
  /// recipes meet the following conditions:
  /// * Contribute to the address computation of a recipe generating a widen
  ///   memory load/store (VPWidenMemoryInstructionRecipe or
  ///   VPInterleaveRecipe).
  /// * Such a widen memory load/store has at least one underlying Instruction
  ///   that is in a basic block that needs predication and after vectorization
  ///   the generated instruction won't be predicated.
  void collectPoisonGeneratingRecipes(VPTransformState &State);

  /// Allow subclasses to override and print debug traces before/after vplan
  /// execution, when trace information is requested.
  virtual void printDebugTracesAtStart(){};
  virtual void printDebugTracesAtEnd(){};

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks.
  /// Applies dynamic knowledge to simplify SCEV expressions and converts them
  /// to a more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Alias Analysis.
  AAResults *AA;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// LoopVersioning. It's only set up (non-null) if memchecks were
  /// used.
  ///
  /// This is currently only used to add no-alias metadata based on the
  /// memchecks. The actual versioning is performed manually.
  std::unique_ptr<LoopVersioning> LVer;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  ElementCount VF;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use.
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;

  /// The unique ExitBlock of the scalar loop if one exists. Note that
  /// there can be multiple exiting edges reaching this block.
  BasicBlock *LoopExitBlock;

  /// The vector loop body.
  BasicBlock *LoopVectorBody;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF)).
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
  // fixed up at the end of vector code generation.
  SmallVector<PHINode *, 8> OrigPHIsToFix;

  /// BFI and PSI are used to check for profile guided size optimizations.
  BlockFrequencyInfo *BFI;
  ProfileSummaryInfo *PSI;

  // Whether this loop should be optimized for size based on profile guided size
  // optimizations.
  bool OptForSizeBasedOnProfile;

  /// Structure to hold information about generated runtime checks, responsible
  /// for cleaning the checks, if vectorization turns out unprofitable.
  GeneratedRTChecks &RTChecks;

  // Holds the resume values for reductions in the loops, used to set the
  // correct start value of reduction PHIs when vectorizing the epilogue.
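  // Keyed by the RecurrenceDescriptor of each reduction; entries are looked up
  // via getReductionResumeValue().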
  SmallMapVector<const RecurrenceDescriptor *, PHINode *, 4>
      ReductionResumeValues;
};

class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                    ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            ElementCount::getFixed(1), UnrollFactor, LVL, CM,
                            BFI, PSI, Check) {}

private:
  Value *getBroadcastInstrs(Value *V) override;
};

/// Encapsulate information regarding vectorization of a loop and its epilogue.
/// This information is meant to be updated and used across two stages of
/// epilogue vectorization.
struct EpilogueLoopVectorizationInfo {
  ElementCount MainLoopVF = ElementCount::getFixed(0);
  unsigned MainLoopUF = 0;
  ElementCount EpilogueVF = ElementCount::getFixed(0);
  unsigned EpilogueUF = 0;
  BasicBlock *MainLoopIterationCountCheck = nullptr;
  BasicBlock *EpilogueIterationCountCheck = nullptr;
  BasicBlock *SCEVSafetyCheck = nullptr;
  BasicBlock *MemSafetyCheck = nullptr;
  Value *TripCount = nullptr;
  Value *VectorTripCount = nullptr;

  EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF,
                                ElementCount EVF, unsigned EUF)
      : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) {
    assert(EUF == 1 &&
           "A high UF for the epilogue loop is likely not beneficial.");
  }
};

/// An extension of the inner loop vectorizer that creates a skeleton for a
/// vectorized loop that has its epilogue (residual) also vectorized.
/// The idea is to run the vplan on a given loop twice, firstly to set up the
/// skeleton and vectorize the main loop, and secondly to complete the skeleton
/// from the first step and vectorize the epilogue. This is achieved by
/// deriving two concrete strategy classes from this base class and invoking
/// them in succession from the loop vectorizer planner.
class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
public:
  InnerLoopAndEpilogueVectorizer(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI,
                            Checks),
        EPI(EPI) {}

  // Override this function to handle the more complex control flow around the
  // three loops.
  std::pair<BasicBlock *, Value *>
  createVectorizedLoopSkeleton() final override {
    return createEpilogueVectorizedLoopSkeleton();
  }

  /// The interface for creating a vectorized skeleton using one of two
  /// different strategies, each corresponding to one execution of the vplan
  /// as described above.
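  /// Implemented by EpilogueVectorizerMainLoop (first pass) and
  /// EpilogueVectorizerEpilogueLoop (second pass) below.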
  virtual std::pair<BasicBlock *, Value *>
  createEpilogueVectorizedLoopSkeleton() = 0;

  /// Holds and updates state information required to vectorize the main loop
  /// and its epilogue in two separate passes. This setup helps us avoid
  /// regenerating and recomputing runtime safety checks. It also helps us to
  /// shorten the iteration-count-check path length for the cases where the
  /// iteration count of the loop is so small that the main vector loop is
  /// completely skipped.
  EpilogueLoopVectorizationInfo &EPI;
};

/// A specialized derived class of inner loop vectorizer that performs
/// vectorization of *main* loops in the process of vectorizing loops and their
/// epilogues.
class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerMainLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Check)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Check) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *main loop* strategy (i.e. the first pass of vplan execution).
  std::pair<BasicBlock *, Value *>
  createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check once for the main loop (when \p
  /// ForEpilogue is false) and once for the epilogue loop (when \p
  /// ForEpilogue is true).
  BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass,
                                             bool ForEpilogue);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};

// A specialized derived class of inner loop vectorizer that performs
// vectorization of *epilogue* loops in the process of vectorizing loops and
// their epilogues.
class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerEpilogueLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Checks) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *epilogue loop* strategy (i.e. the second pass of vplan execution).
  std::pair<BasicBlock *, Value *>
  createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check after the main vector loop has
  /// finished to see if there are any iterations left to execute by either
  /// the vector epilogue or the scalar epilogue.
  BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L,
                                                      BasicBlock *Bypass,
                                                      BasicBlock *Insert);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};
} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its
/// operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return I;

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I;

  for (Use &Op : I->operands()) {
    if (Instruction *OpInst = dyn_cast<Instruction>(Op))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }

  return I;
}

void InnerLoopVectorizer::setDebugLocFromInst(
    const Value *V, Optional<IRBuilderBase *> CustomBuilder) {
  IRBuilderBase *B = (CustomBuilder == None) ? &Builder : *CustomBuilder;
  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(V)) {
    const DILocation *DIL = Inst->getDebugLoc();

    // When a FSDiscriminator is enabled, we don't need to add the multiply
    // factors to the discriminators.
    if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
        !isa<DbgInfoIntrinsic>(Inst) && !EnableFSDiscriminator) {
      // FIXME: For scalable vectors, assume vscale=1.
      auto NewDIL =
          DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
      if (NewDIL)
        B->SetCurrentDebugLocation(NewDIL.getValue());
      else
        LLVM_DEBUG(dbgs()
                   << "Failed to create new discriminator: "
                   << DIL->getFilename() << " Line: " << DIL->getLine());
    } else
      B->SetCurrentDebugLocation(DIL);
  } else
    B->SetCurrentDebugLocation(DebugLoc());
}

/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
/// is passed, the message relates to that particular instruction.
#ifndef NDEBUG
static void debugVectorizationMessage(const StringRef Prefix,
                                      const StringRef DebugMsg,
                                      Instruction *I) {
  dbgs() << "LV: " << Prefix << DebugMsg;
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}
#endif

/// Create an analysis remark that explains why vectorization failed
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
/// RemarkName is the identifier for the remark. If \p I is passed it is an
/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
/// the location of the remark. \return the remark object that can be
/// streamed to.
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
                                                   StringRef RemarkName,
                                                   Loop *TheLoop,
                                                   Instruction *I) {
  Value *CodeRegion = TheLoop->getHeader();
  DebugLoc DL = TheLoop->getStartLoc();

  if (I) {
    CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, revert to
    // using the loop's.
    if (I->getDebugLoc())
      DL = I->getDebugLoc();
  }

  return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
}

namespace llvm {

/// Return a value for Step multiplied by VF.
Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
                       int64_t Step) {
  assert(Ty->isIntegerTy() && "Expected an integer step");
  Constant *StepVal = ConstantInt::get(Ty, Step * VF.getKnownMinValue());
  return VF.isScalable() ?
             B.CreateVScale(StepVal) : StepVal;
}

/// Return the runtime value for VF.
Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) {
  Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue());
  return VF.isScalable() ? B.CreateVScale(EC) : EC;
}

static Value *getRuntimeVFAsFloat(IRBuilderBase &B, Type *FTy,
                                  ElementCount VF) {
  assert(FTy->isFloatingPointTy() && "Expected floating point type!");
  Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits());
  Value *RuntimeVF = getRuntimeVF(B, IntTy, VF);
  return B.CreateUIToFP(RuntimeVF, FTy);
}

void reportVectorizationFailure(const StringRef DebugMsg,
                                const StringRef OREMsg, const StringRef ORETag,
                                OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                                Instruction *I) {
  LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(
      createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
      << "loop not vectorized: " << OREMsg);
}

void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
                             OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                             Instruction *I) {
  LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(
      createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
      << Msg);
}

} // end namespace llvm

#ifndef NDEBUG
/// \return string containing a file name and a line # for the given loop.
static std::string getDebugLocString(const Loop *L) {
  std::string Result;
  if (L) {
    raw_string_ostream OS(Result);
    if (const DebugLoc LoopDbgLoc = L->getStartLoc())
      LoopDbgLoc.print(OS);
    else
      // Just print the module name.
      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
    OS.flush();
  }
  return Result;
}
#endif

void InnerLoopVectorizer::addNewMetadata(Instruction *To,
                                         const Instruction *Orig) {
  // If the loop was versioned with memchecks, add the corresponding no-alias
  // metadata.
  if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
    LVer->annotateInstWithNoAlias(To, Orig);
}

void InnerLoopVectorizer::collectPoisonGeneratingRecipes(
    VPTransformState &State) {

  // Collect recipes in the backward slice of `Root` that may generate a poison
  // value that is used after vectorization.
  SmallPtrSet<VPRecipeBase *, 16> Visited;
  auto collectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
    SmallVector<VPRecipeBase *, 16> Worklist;
    Worklist.push_back(Root);

    // Traverse the backward slice of Root through its use-def chain.
    while (!Worklist.empty()) {
      VPRecipeBase *CurRec = Worklist.back();
      Worklist.pop_back();

      if (!Visited.insert(CurRec).second)
        continue;

      // Prune search if we find another recipe generating a widen memory
      // instruction. Widen memory instructions involved in address computation
      // will lead to gather/scatter instructions, which don't need to be
      // handled.
      if (isa<VPWidenMemoryInstructionRecipe>(CurRec) ||
          isa<VPInterleaveRecipe>(CurRec) ||
          isa<VPScalarIVStepsRecipe>(CurRec) ||
          isa<VPCanonicalIVPHIRecipe>(CurRec))
        continue;

      // This recipe contributes to the address computation of a widen
      // load/store. Collect recipe if its underlying instruction has
      // poison-generating flags.
      Instruction *Instr = CurRec->getUnderlyingInstr();
      if (Instr && Instr->hasPoisonGeneratingFlags())
        State.MayGeneratePoisonRecipes.insert(CurRec);

      // Add new definitions to the worklist.
      for (VPValue *operand : CurRec->operands())
        if (VPDef *OpDef = operand->getDef())
          Worklist.push_back(cast<VPRecipeBase>(OpDef));
    }
  });

  // Traverse all the recipes in the VPlan and collect the poison-generating
  // recipes in the backward slice starting at the address of a
  // VPWidenMemoryInstructionRecipe or VPInterleaveRecipe.
  auto Iter = depth_first(
      VPBlockRecursiveTraversalWrapper<VPBlockBase *>(State.Plan->getEntry()));
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
    for (VPRecipeBase &Recipe : *VPBB) {
      if (auto *WidenRec = dyn_cast<VPWidenMemoryInstructionRecipe>(&Recipe)) {
        Instruction *UnderlyingInstr = WidenRec->getUnderlyingInstr();
        VPDef *AddrDef = WidenRec->getAddr()->getDef();
        if (AddrDef && WidenRec->isConsecutive() && UnderlyingInstr &&
            Legal->blockNeedsPredication(UnderlyingInstr->getParent()))
          collectPoisonGeneratingInstrsInBackwardSlice(
              cast<VPRecipeBase>(AddrDef));
      } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
        VPDef *AddrDef = InterleaveRec->getAddr()->getDef();
        if (AddrDef) {
          // Check if any member of the interleave group needs predication.
          const InterleaveGroup<Instruction> *InterGroup =
              InterleaveRec->getInterleaveGroup();
          bool NeedPredication = false;
          for (int I = 0, NumMembers = InterGroup->getNumMembers();
               I < NumMembers; ++I) {
            Instruction *Member = InterGroup->getMember(I);
            if (Member)
              NeedPredication |=
                  Legal->blockNeedsPredication(Member->getParent());
          }

          if (NeedPredication)
            collectPoisonGeneratingInstrsInBackwardSlice(
                cast<VPRecipeBase>(AddrDef));
        }
      }
    }
  }
}

void InnerLoopVectorizer::addMetadata(Instruction *To,
                                      Instruction *From) {
  propagateMetadata(To, From);
  addNewMetadata(To, From);
}

void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
                                      Instruction *From) {
  for (Value *V : To) {
    if (Instruction *I = dyn_cast<Instruction>(V))
      addMetadata(I, From);
  }
}

PHINode *InnerLoopVectorizer::getReductionResumeValue(
    const RecurrenceDescriptor &RdxDesc) {
  auto It = ReductionResumeValues.find(&RdxDesc);
  assert(It != ReductionResumeValues.end() &&
         "Expected to find a resume value for the reduction.");
  return It->second;
}

namespace llvm {

// Loop vectorization cost-model hints how the scalar epilogue loop should be
// lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorization with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate,

  // Directive indicating we must either tail fold or not vectorize.
  CM_ScalarEpilogueNotAllowedUsePredicate
};

/// ElementCountComparator creates a total ordering for ElementCount
/// for the purposes of using it in a set structure.
struct ElementCountComparator {
  bool operator()(const ElementCount &LHS, const ElementCount &RHS) const {
    return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) <
           std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue());
  }
};
using ElementCountSet = SmallSet<ElementCount, 16, ElementCountComparator>;

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because of
/// a number of reasons. In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
class LoopVectorizationCostModel {
public:
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {}

  /// \return An upper bound for the vectorization factors (both fixed and
  /// scalable). If the factors are 0, vectorization and interleaving should be
  /// avoided up front.
  FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
  bool runtimeChecksRequired();

  /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every VF in \p CandidateVFs. If UserVF is not ZERO
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
  VectorizationFactor
  selectVectorizationFactor(const ElementCountSet &CandidateVFs);

  VectorizationFactor
  selectEpilogueVectorizationFactor(const ElementCount MaxVF,
                                    const LoopVectorizationPlanner &LVP);

  /// Set up cost-based decisions for user vectorization factor.
  /// \return true if the UserVF is a feasible VF to be chosen.
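  /// (Feasible here means that the cost of every instruction in the loop at
  /// \p UserVF could be computed, i.e. expectedCost(UserVF) is valid.)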
  bool selectUserVectorizationFactor(ElementCount UserVF) {
    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);
    return expectedCost(UserVF).first.isValid();
  }

  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// \return The desired interleave count.
  /// If interleave count has been specified by metadata it will be returned.
  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
  /// are the selected vectorization factor and the cost of the selected VF.
  unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);

  /// A memory access instruction may be vectorized in more than one way.
  /// The form of the instruction after vectorization depends on cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decision map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with the widening decision in order to
  /// avoid redundant calculations.
  void setCostBasedWideningDecision(ElementCount VF);

  /// A struct that represents some properties of the register usage
  /// of a loop.
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
    /// Holds the maximum number of concurrent live intervals in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
  };

  /// \return Returns information about the register usages of the loop for the
  /// given vectorization factors.
  SmallVector<RegisterUsage, 8>
  calculateRegisterUsage(ArrayRef<ElementCount> VFs);

  /// Collect values we want to ignore in the cost model.
  void collectValuesToIgnore();

  /// Collect all element types in the loop for which widening is needed.
  void collectElementTypesForWidening();

  /// Split reductions into those that happen in the loop, and those that happen
  /// outside. In-loop reductions are collected into InLoopReductionChains.
  void collectInLoopReductions();

  /// Returns true if we should use strict in-order reductions for the given
  /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
  /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
  /// of FP operations.
  bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) {
    return !Hints->allowReordering() && RdxDesc.isOrdered();
  }

  /// \returns The smallest bitwidth each instruction can be represented with.
  /// The vector equivalents of these instructions should be truncated to this
  /// type.
  const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
    return MinBWs;
  }

  /// \returns True if it is more profitable to scalarize instruction \p I for
  /// vectorization factor \p VF.
1359 bool isProfitableToScalarize(Instruction *I, ElementCount VF) const { 1360 assert(VF.isVector() && 1361 "Profitable to scalarize relevant only for VF > 1."); 1362 1363 // Cost model is not run in the VPlan-native path - return conservative 1364 // result until this changes. 1365 if (EnableVPlanNativePath) 1366 return false; 1367 1368 auto Scalars = InstsToScalarize.find(VF); 1369 assert(Scalars != InstsToScalarize.end() && 1370 "VF not yet analyzed for scalarization profitability"); 1371 return Scalars->second.find(I) != Scalars->second.end(); 1372 } 1373 1374 /// Returns true if \p I is known to be uniform after vectorization. 1375 bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const { 1376 if (VF.isScalar()) 1377 return true; 1378 1379 // Cost model is not run in the VPlan-native path - return conservative 1380 // result until this changes. 1381 if (EnableVPlanNativePath) 1382 return false; 1383 1384 auto UniformsPerVF = Uniforms.find(VF); 1385 assert(UniformsPerVF != Uniforms.end() && 1386 "VF not yet analyzed for uniformity"); 1387 return UniformsPerVF->second.count(I); 1388 } 1389 1390 /// Returns true if \p I is known to be scalar after vectorization. 1391 bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const { 1392 if (VF.isScalar()) 1393 return true; 1394 1395 // Cost model is not run in the VPlan-native path - return conservative 1396 // result until this changes. 1397 if (EnableVPlanNativePath) 1398 return false; 1399 1400 auto ScalarsPerVF = Scalars.find(VF); 1401 assert(ScalarsPerVF != Scalars.end() && 1402 "Scalar values are not calculated for VF"); 1403 return ScalarsPerVF->second.count(I); 1404 } 1405 1406 /// \returns True if instruction \p I can be truncated to a smaller bitwidth 1407 /// for vectorization factor \p VF. 1408 bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const { 1409 return VF.isVector() && MinBWs.find(I) != MinBWs.end() && 1410 !isProfitableToScalarize(I, VF) && 1411 !isScalarAfterVectorization(I, VF); 1412 } 1413 1414 /// Decision that was taken during cost calculation for memory instruction. 1415 enum InstWidening { 1416 CM_Unknown, 1417 CM_Widen, // For consecutive accesses with stride +1. 1418 CM_Widen_Reverse, // For consecutive accesses with stride -1. 1419 CM_Interleave, 1420 CM_GatherScatter, 1421 CM_Scalarize 1422 }; 1423 1424 /// Save vectorization decision \p W and \p Cost taken by the cost model for 1425 /// instruction \p I and vector width \p VF. 1426 void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W, 1427 InstructionCost Cost) { 1428 assert(VF.isVector() && "Expected VF >=2"); 1429 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost); 1430 } 1431 1432 /// Save vectorization decision \p W and \p Cost taken by the cost model for 1433 /// interleaving group \p Grp and vector width \p VF. 1434 void setWideningDecision(const InterleaveGroup<Instruction> *Grp, 1435 ElementCount VF, InstWidening W, 1436 InstructionCost Cost) { 1437 assert(VF.isVector() && "Expected VF >=2"); 1438 /// Broadcast this decision to all instructions inside the group. 1439 /// But the cost will be assigned to one instruction only.
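    // For illustration (hypothetical factor-3 group {A, B, C} whose insert
    // position is B): B is recorded with (W, Cost) while A and C are recorded
    // with (W, 0), so the group's cost is counted exactly once.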
1440 for (unsigned i = 0; i < Grp->getFactor(); ++i) { 1441 if (auto *I = Grp->getMember(i)) { 1442 if (Grp->getInsertPos() == I) 1443 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost); 1444 else 1445 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0); 1446 } 1447 } 1448 } 1449 1450 /// Return the cost model decision for the given instruction \p I and vector 1451 /// width \p VF. Return CM_Unknown if this instruction did not pass 1452 /// through the cost modeling. 1453 InstWidening getWideningDecision(Instruction *I, ElementCount VF) const { 1454 assert(VF.isVector() && "Expected VF to be a vector VF"); 1455 // Cost model is not run in the VPlan-native path - return conservative 1456 // result until this changes. 1457 if (EnableVPlanNativePath) 1458 return CM_GatherScatter; 1459 1460 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF); 1461 auto Itr = WideningDecisions.find(InstOnVF); 1462 if (Itr == WideningDecisions.end()) 1463 return CM_Unknown; 1464 return Itr->second.first; 1465 } 1466 1467 /// Return the vectorization cost for the given instruction \p I and vector 1468 /// width \p VF. 1469 InstructionCost getWideningCost(Instruction *I, ElementCount VF) { 1470 assert(VF.isVector() && "Expected VF >=2"); 1471 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF); 1472 assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() && 1473 "The cost is not calculated"); 1474 return WideningDecisions[InstOnVF].second; 1475 } 1476 1477 /// Return True if instruction \p I is an optimizable truncate whose operand 1478 /// is an induction variable. Such a truncate will be removed by adding a new 1479 /// induction variable with the destination type. 1480 bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) { 1481 // If the instruction is not a truncate, return false. 1482 auto *Trunc = dyn_cast<TruncInst>(I); 1483 if (!Trunc) 1484 return false; 1485 1486 // Get the source and destination types of the truncate. 1487 Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF); 1488 Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF); 1489 1490 // If the truncate is free for the given types, return false. Replacing a 1491 // free truncate with an induction variable would add an induction variable 1492 // update instruction to each iteration of the loop. We exclude from this 1493 // check the primary induction variable since it will need an update 1494 // instruction regardless. 1495 Value *Op = Trunc->getOperand(0); 1496 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy)) 1497 return false; 1498 1499 // If the truncated value is not an induction variable, return false. 1500 return Legal->isInductionPhi(Op); 1501 } 1502 1503 /// Collects the instructions to scalarize for each predicated instruction in 1504 /// the loop. 1505 void collectInstsToScalarize(ElementCount VF); 1506 1507 /// Collect Uniform and Scalar values for the given \p VF. 1508 /// The sets depend on CM decision for Load/Store instructions 1509 /// that may be vectorized as interleave, gather-scatter or scalarized. 1510 void collectUniformsAndScalars(ElementCount VF) { 1511 // Do the analysis once. 
1512 if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end()) 1513 return; 1514 setCostBasedWideningDecision(VF); 1515 collectLoopUniforms(VF); 1516 collectLoopScalars(VF); 1517 } 1518 1519 /// Returns true if the target machine supports masked store operation 1520 /// for the given \p DataType and kind of access to \p Ptr. 1521 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const { 1522 return Legal->isConsecutivePtr(DataType, Ptr) && 1523 TTI.isLegalMaskedStore(DataType, Alignment); 1524 } 1525 1526 /// Returns true if the target machine supports masked load operation 1527 /// for the given \p DataType and kind of access to \p Ptr. 1528 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const { 1529 return Legal->isConsecutivePtr(DataType, Ptr) && 1530 TTI.isLegalMaskedLoad(DataType, Alignment); 1531 } 1532 1533 /// Returns true if the target machine can represent \p V as a masked gather 1534 /// or scatter operation. 1535 bool isLegalGatherOrScatter(Value *V, 1536 ElementCount VF = ElementCount::getFixed(1)) { 1537 bool LI = isa<LoadInst>(V); 1538 bool SI = isa<StoreInst>(V); 1539 if (!LI && !SI) 1540 return false; 1541 auto *Ty = getLoadStoreType(V); 1542 Align Align = getLoadStoreAlignment(V); 1543 if (VF.isVector()) 1544 Ty = VectorType::get(Ty, VF); 1545 return (LI && TTI.isLegalMaskedGather(Ty, Align)) || 1546 (SI && TTI.isLegalMaskedScatter(Ty, Align)); 1547 } 1548 1549 /// Returns true if the target machine supports all of the reduction 1550 /// variables found for the given VF. 1551 bool canVectorizeReductions(ElementCount VF) const { 1552 return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 1553 const RecurrenceDescriptor &RdxDesc = Reduction.second; 1554 return TTI.isLegalToVectorizeReduction(RdxDesc, VF); 1555 })); 1556 } 1557 1558 /// Returns true if \p I is an instruction that will be scalarized with 1559 /// predication when vectorizing \p I with vectorization factor \p VF. Such 1560 /// instructions include conditional stores and instructions that may divide 1561 /// by zero. 1562 bool isScalarWithPredication(Instruction *I, ElementCount VF) const; 1563 1564 // Returns true if \p I is an instruction that will be predicated either 1565 // through scalar predication or masked load/store or masked gather/scatter. 1566 // \p VF is the vectorization factor that will be used to vectorize \p I. 1567 // Superset of instructions that return true for isScalarWithPredication. 1568 bool isPredicatedInst(Instruction *I, ElementCount VF, 1569 bool IsKnownUniform = false) { 1570 // When we know the load is uniform and the original scalar loop was not 1571 // predicated we don't need to mark it as a predicated instruction. Any 1572 // vectorised blocks created when tail-folding are something artificial we 1573 // have introduced and we know there is always at least one active lane. 1574 // That's why we call Legal->blockNeedsPredication here because it doesn't 1575 // query tail-folding. 1576 if (IsKnownUniform && isa<LoadInst>(I) && 1577 !Legal->blockNeedsPredication(I->getParent())) 1578 return false; 1579 if (!blockNeedsPredicationForAnyReason(I->getParent())) 1580 return false; 1581 // Loads and stores that need some form of masked operation are predicated 1582 // instructions. 
1583 if (isa<LoadInst>(I) || isa<StoreInst>(I)) 1584 return Legal->isMaskRequired(I); 1585 return isScalarWithPredication(I, VF); 1586 } 1587 1588 /// Returns true if \p I is a memory instruction with consecutive memory 1589 /// access that can be widened. 1590 bool 1591 memoryInstructionCanBeWidened(Instruction *I, 1592 ElementCount VF = ElementCount::getFixed(1)); 1593 1594 /// Returns true if \p I is a memory instruction in an interleaved-group 1595 /// of memory accesses that can be vectorized with wide vector loads/stores 1596 /// and shuffles. 1597 bool 1598 interleavedAccessCanBeWidened(Instruction *I, 1599 ElementCount VF = ElementCount::getFixed(1)); 1600 1601 /// Check if \p Instr belongs to any interleaved access group. 1602 bool isAccessInterleaved(Instruction *Instr) { 1603 return InterleaveInfo.isInterleaved(Instr); 1604 } 1605 1606 /// Get the interleaved access group that \p Instr belongs to. 1607 const InterleaveGroup<Instruction> * 1608 getInterleavedAccessGroup(Instruction *Instr) { 1609 return InterleaveInfo.getInterleaveGroup(Instr); 1610 } 1611 1612 /// Returns true if we're required to use a scalar epilogue for at least 1613 /// the final iteration of the original loop. 1614 bool requiresScalarEpilogue(ElementCount VF) const { 1615 if (!isScalarEpilogueAllowed()) 1616 return false; 1617 // If we might exit from anywhere but the latch, we must run the exiting 1618 // iteration in scalar form. 1619 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) 1620 return true; 1621 return VF.isVector() && InterleaveInfo.requiresScalarEpilogue(); 1622 } 1623 1624 /// Returns true if a scalar epilogue is allowed, i.e. it has not been 1625 /// disallowed by optsize or a loop hint annotation. 1626 bool isScalarEpilogueAllowed() const { 1627 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed; 1628 } 1629 1630 /// Returns true if all loop blocks should be masked to fold the tail of the 1631 /// loop. 1631 bool foldTailByMasking() const { return FoldTailByMasking; } 1632 1633 /// Returns true if the instructions in this block require predication 1634 /// for any reason, e.g. because tail folding now requires a predicate 1635 /// or because the block in the original loop was predicated. 1636 bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const { 1637 return foldTailByMasking() || Legal->blockNeedsPredication(BB); 1638 } 1639 1640 /// A SmallMapVector to store the InLoop reduction op chains, mapping phi 1641 /// nodes to the chain of instructions representing the reductions. Uses a 1642 /// MapVector to ensure deterministic iteration order. 1643 using ReductionChainMap = 1644 SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>; 1645 1646 /// Return the chain of instructions representing an inloop reduction. 1647 const ReductionChainMap &getInLoopReductionChains() const { 1648 return InLoopReductionChains; 1649 } 1650 1651 /// Returns true if the Phi is part of an inloop reduction. 1652 bool isInLoopReduction(PHINode *Phi) const { 1653 return InLoopReductionChains.count(Phi); 1654 } 1655 1656 /// Estimate cost of an intrinsic call instruction CI if it were vectorized 1657 /// with factor VF. Return the cost of the instruction, including 1658 /// scalarization overhead if it's needed. 1659 InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const; 1660 1661 /// Estimate cost of a call instruction CI if it were vectorized with factor 1662 /// VF. Return the cost of the instruction, including scalarization overhead 1663 /// if it's needed.
The flag NeedToScalarize is set if the call needs to be 1664 /// scalarized - 1665 /// i.e. either a vector version isn't available, or it is too expensive. 1666 InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF, 1667 bool &NeedToScalarize) const; 1668 1669 /// Returns true if the per-lane cost of VectorizationFactor A is lower than 1670 /// that of B. 1671 bool isMoreProfitable(const VectorizationFactor &A, 1672 const VectorizationFactor &B) const; 1673 1674 /// Invalidates decisions already taken by the cost model. 1675 void invalidateCostModelingDecisions() { 1676 WideningDecisions.clear(); 1677 Uniforms.clear(); 1678 Scalars.clear(); 1679 } 1680 1681 private: 1682 unsigned NumPredStores = 0; 1683 1684 /// Convenience function that returns the value of vscale_range iff 1685 /// vscale_range.min == vscale_range.max or otherwise returns the value 1686 /// returned by the corresponding TTI method. 1687 Optional<unsigned> getVScaleForTuning() const; 1688 1689 /// \return An upper bound for the vectorization factors for both 1690 /// fixed and scalable vectorization, where the minimum-known number of 1691 /// elements is a power-of-2 larger than zero. If scalable vectorization is 1692 /// disabled or unsupported, then the scalable part will be equal to 1693 /// ElementCount::getScalable(0). 1694 FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount, 1695 ElementCount UserVF, 1696 bool FoldTailByMasking); 1697 1698 /// \return the maximized element count based on the target's vector 1699 /// registers and the loop trip-count, but limited to a maximum safe VF. 1700 /// This is a helper function of computeFeasibleMaxVF. 1701 /// FIXME: MaxSafeVF is currently passed by reference to avoid some obscure 1702 /// issue that occurred on one of the buildbots which cannot be reproduced 1703 /// without having access to the proprietary compiler (see comments on 1704 /// D98509). The issue is currently under investigation and this workaround 1705 /// will be removed as soon as possible. 1706 ElementCount getMaximizedVFForTarget(unsigned ConstTripCount, 1707 unsigned SmallestType, 1708 unsigned WidestType, 1709 const ElementCount &MaxSafeVF, 1710 bool FoldTailByMasking); 1711 1712 /// \return the maximum legal scalable VF, based on the safe max number 1713 /// of elements. 1714 ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements); 1715 1716 /// The vectorization cost is a combination of the cost itself and a boolean 1717 /// indicating whether any of the contributing operations will actually 1718 /// operate on vector values after type legalization in the backend. If this 1719 /// latter value is false, then all operations will be scalarized (i.e. no 1720 /// vectorization has actually taken place). 1721 using VectorizationCostTy = std::pair<InstructionCost, bool>; 1722 1723 /// Returns the expected execution cost. The unit of the cost does 1724 /// not matter because we use the 'cost' units to compare different 1725 /// vector widths. The cost that is returned is *not* normalized by 1726 /// the factor width. If \p Invalid is not nullptr, this function 1727 /// will add a pair(Instruction*, ElementCount) to \p Invalid for 1728 /// each instruction that has an Invalid cost for the given VF. 1729 using InstructionVFPair = std::pair<Instruction *, ElementCount>; 1730 VectorizationCostTy 1731 expectedCost(ElementCount VF, 1732 SmallVectorImpl<InstructionVFPair> *Invalid = nullptr); 1733 1734 /// Returns the execution time cost of an instruction for a given vector 1735 /// width.
Vector width of one means scalar. 1736 VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF); 1737 1738 /// The cost-computation logic from getInstructionCost which provides 1739 /// the vector type as an output parameter. 1740 InstructionCost getInstructionCost(Instruction *I, ElementCount VF, 1741 Type *&VectorTy); 1742 1743 /// Return the cost of instructions in an inloop reduction pattern, if I is 1744 /// part of that pattern. 1745 Optional<InstructionCost> 1746 getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy, 1747 TTI::TargetCostKind CostKind); 1748 1749 /// Calculate vectorization cost of memory instruction \p I. 1750 InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF); 1751 1752 /// The cost computation for scalarized memory instruction. 1753 InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF); 1754 1755 /// The cost computation for interleaving group of memory instructions. 1756 InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF); 1757 1758 /// The cost computation for Gather/Scatter instruction. 1759 InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF); 1760 1761 /// The cost computation for widening instruction \p I with consecutive 1762 /// memory access. 1763 InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF); 1764 1765 /// The cost calculation for Load/Store instruction \p I with uniform pointer - 1766 /// Load: scalar load + broadcast. 1767 /// Store: scalar store + (loop invariant value stored? 0 : extract of last 1768 /// element) 1769 InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF); 1770 1771 /// Estimate the overhead of scalarizing an instruction. This is a 1772 /// convenience wrapper for the type-based getScalarizationOverhead API. 1773 InstructionCost getScalarizationOverhead(Instruction *I, 1774 ElementCount VF) const; 1775 1776 /// Returns whether the instruction is a load or store and will be emitted 1777 /// as a vector operation. 1778 bool isConsecutiveLoadOrStore(Instruction *I); 1779 1780 /// Returns true if an artificially high cost for emulated masked memrefs 1781 /// should be used. 1782 bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF); 1783 1784 /// Map of scalar integer values to the smallest bitwidth they can be legally 1785 /// represented as. The vector equivalents of these values should be truncated 1786 /// to this type. 1787 MapVector<Instruction *, uint64_t> MinBWs; 1788 1789 /// A type representing the costs for instructions if they were to be 1790 /// scalarized rather than vectorized. The entries are Instruction-Cost 1791 /// pairs. 1792 using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>; 1793 1794 /// A set containing all BasicBlocks that are known to be present after 1795 /// vectorization as a predicated block. 1796 SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization; 1797 1798 /// Records whether it is allowed to have the original scalar loop execute at 1799 /// least once. This may be needed as a fallback loop in case runtime 1800 /// aliasing/dependence checks fail, or to handle the tail/remainder 1801 /// iterations when the trip count is unknown or doesn't divide by the VF, 1802 /// or as a peel-loop to handle gaps in interleave-groups. 1803 /// Under optsize and when the trip count is very small we don't allow any 1804 /// iterations to execute in the scalar loop.
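  /// For example (illustrative), under CM_ScalarEpilogueNotAllowedOptSize the
  /// cost model must either fold the tail by masking or give up on
  /// vectorization, since no scalar remainder iterations may execute.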
1805 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 1806 1807 /// All blocks of loop are to be masked to fold tail of scalar iterations. 1808 bool FoldTailByMasking = false; 1809 1810 /// A map holding scalar costs for different vectorization factors. The 1811 /// presence of a cost for an instruction in the mapping indicates that the 1812 /// instruction will be scalarized when vectorizing with the associated 1813 /// vectorization factor. The entries are VF-ScalarCostTy pairs. 1814 DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize; 1815 1816 /// Holds the instructions known to be uniform after vectorization. 1817 /// The data is collected per VF. 1818 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms; 1819 1820 /// Holds the instructions known to be scalar after vectorization. 1821 /// The data is collected per VF. 1822 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars; 1823 1824 /// Holds the instructions (address computations) that are forced to be 1825 /// scalarized. 1826 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars; 1827 1828 /// PHINodes of the reductions that should be expanded in-loop along with 1829 /// their associated chains of reduction operations, in program order from top 1830 /// (PHI) to bottom 1831 ReductionChainMap InLoopReductionChains; 1832 1833 /// A Map of inloop reduction operations and their immediate chain operand. 1834 /// FIXME: This can be removed once reductions can be costed correctly in 1835 /// vplan. This was added to allow quick lookup to the inloop operations, 1836 /// without having to loop through InLoopReductionChains. 1837 DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains; 1838 1839 /// Returns the expected difference in cost from scalarizing the expression 1840 /// feeding a predicated instruction \p PredInst. The instructions to 1841 /// scalarize and their scalar costs are collected in \p ScalarCosts. A 1842 /// non-negative return value implies the expression will be scalarized. 1843 /// Currently, only single-use chains are considered for scalarization. 1844 int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts, 1845 ElementCount VF); 1846 1847 /// Collect the instructions that are uniform after vectorization. An 1848 /// instruction is uniform if we represent it with a single scalar value in 1849 /// the vectorized loop corresponding to each vector iteration. Examples of 1850 /// uniform instructions include pointer operands of consecutive or 1851 /// interleaved memory accesses. Note that although uniformity implies an 1852 /// instruction will be scalar, the reverse is not true. In general, a 1853 /// scalarized instruction will be represented by VF scalar values in the 1854 /// vectorized loop, each corresponding to an iteration of the original 1855 /// scalar loop. 1856 void collectLoopUniforms(ElementCount VF); 1857 1858 /// Collect the instructions that are scalar after vectorization. An 1859 /// instruction is scalar if it is known to be uniform or will be scalarized 1860 /// during vectorization. collectLoopScalars should only add non-uniform nodes 1861 /// to the list if they are used by a load/store instruction that is marked as 1862 /// CM_Scalarize. Non-uniform scalarized instructions will be represented by 1863 /// VF values in the vectorized loop, each corresponding to an iteration of 1864 /// the original scalar loop. 
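  /// For example (illustrative), a getelementptr whose only user is a store
  /// with a CM_Scalarize widening decision is added to Scalars[VF] even though
  /// it is not uniform, and is therefore replicated VF times per vector
  /// iteration.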
1865 void collectLoopScalars(ElementCount VF); 1866 1867 /// Keeps cost model vectorization decision and cost for instructions. 1868 /// Right now it is used for memory instructions only. 1869 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>, 1870 std::pair<InstWidening, InstructionCost>>; 1871 1872 DecisionList WideningDecisions; 1873 1874 /// Returns true if \p V is expected to be vectorized and it needs to be 1875 /// extracted. 1876 bool needsExtract(Value *V, ElementCount VF) const { 1877 Instruction *I = dyn_cast<Instruction>(V); 1878 if (VF.isScalar() || !I || !TheLoop->contains(I) || 1879 TheLoop->isLoopInvariant(I)) 1880 return false; 1881 1882 // Assume we can vectorize V (and hence we need extraction) if the 1883 // scalars are not computed yet. This can happen, because it is called 1884 // via getScalarizationOverhead from setCostBasedWideningDecision, before 1885 // the scalars are collected. That should be a safe assumption in most 1886 // cases, because we check if the operands have vectorizable types 1887 // beforehand in LoopVectorizationLegality. 1888 return Scalars.find(VF) == Scalars.end() || 1889 !isScalarAfterVectorization(I, VF); 1890 }; 1891 1892 /// Returns a range containing only operands needing to be extracted. 1893 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, 1894 ElementCount VF) const { 1895 return SmallVector<Value *, 4>(make_filter_range( 1896 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); 1897 } 1898 1899 /// Determines if we have the infrastructure to vectorize loop \p L and its 1900 /// epilogue, assuming the main loop is vectorized by \p VF. 1901 bool isCandidateForEpilogueVectorization(const Loop &L, 1902 const ElementCount VF) const; 1903 1904 /// Returns true if epilogue vectorization is considered profitable, and 1905 /// false otherwise. 1906 /// \p VF is the vectorization factor chosen for the original loop. 1907 bool isEpilogueVectorizationProfitable(const ElementCount VF) const; 1908 1909 public: 1910 /// The loop that we evaluate. 1911 Loop *TheLoop; 1912 1913 /// Predicated scalar evolution analysis. 1914 PredicatedScalarEvolution &PSE; 1915 1916 /// Loop Info analysis. 1917 LoopInfo *LI; 1918 1919 /// Vectorization legality. 1920 LoopVectorizationLegality *Legal; 1921 1922 /// Vector target information. 1923 const TargetTransformInfo &TTI; 1924 1925 /// Target Library Info. 1926 const TargetLibraryInfo *TLI; 1927 1928 /// Demanded bits analysis. 1929 DemandedBits *DB; 1930 1931 /// Assumption cache. 1932 AssumptionCache *AC; 1933 1934 /// Interface to emit optimization remarks. 1935 OptimizationRemarkEmitter *ORE; 1936 1937 const Function *TheFunction; 1938 1939 /// Loop Vectorize Hint. 1940 const LoopVectorizeHints *Hints; 1941 1942 /// The interleave access information contains groups of interleaved accesses 1943 /// with the same stride and close to each other. 1944 InterleavedAccessInfo &InterleaveInfo; 1945 1946 /// Values to ignore in the cost model. 1947 SmallPtrSet<const Value *, 16> ValuesToIgnore; 1948 1949 /// Values to ignore in the cost model when VF > 1. 1950 SmallPtrSet<const Value *, 16> VecValuesToIgnore; 1951 1952 /// All element types found in the loop. 1953 SmallPtrSet<Type *, 16> ElementTypesInLoop; 1954 1955 /// Profitable vector factors. 1956 SmallVector<VectorizationFactor, 8> ProfitableVFs; 1957 }; 1958 } // end namespace llvm 1959 1960 /// Helper struct to manage generating runtime checks for vectorization. 
1961 /// 1962 /// The runtime checks are created up-front in temporary blocks to allow better 1963 /// cost estimation, and are un-linked from the existing IR. After deciding to 1964 /// vectorize, the checks are moved back. If deciding not to vectorize, the 1965 /// temporary blocks are completely removed. 1966 class GeneratedRTChecks { 1967 /// Basic block which contains the generated SCEV checks, if any. 1968 BasicBlock *SCEVCheckBlock = nullptr; 1969 1970 /// The value representing the result of the generated SCEV checks. If it is 1971 /// nullptr, either no SCEV checks have been generated or they have been used. 1972 Value *SCEVCheckCond = nullptr; 1973 1974 /// Basic block which contains the generated memory runtime checks, if any. 1975 BasicBlock *MemCheckBlock = nullptr; 1976 1977 /// The value representing the result of the generated memory runtime checks. 1978 /// If it is nullptr, either no memory runtime checks have been generated or 1979 /// they have been used. 1980 Value *MemRuntimeCheckCond = nullptr; 1981 1982 DominatorTree *DT; 1983 LoopInfo *LI; 1984 1985 SCEVExpander SCEVExp; 1986 SCEVExpander MemCheckExp; 1987 1988 public: 1989 GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI, 1990 const DataLayout &DL) 1991 : DT(DT), LI(LI), SCEVExp(SE, DL, "scev.check"), 1992 MemCheckExp(SE, DL, "scev.check") {} 1993 1994 /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can 1995 /// accurately estimate the cost of the runtime checks. The blocks are 1996 /// un-linked from the IR and are added back during vector code generation. If 1997 /// there is no vector code generation, the check blocks are removed 1998 /// completely. 1999 void Create(Loop *L, const LoopAccessInfo &LAI, 2000 const SCEVPredicate &Pred) { 2001 2002 BasicBlock *LoopHeader = L->getHeader(); 2003 BasicBlock *Preheader = L->getLoopPreheader(); 2004 2005 // Use SplitBlock to create blocks for SCEV & memory runtime checks to 2006 // ensure the blocks are properly added to LoopInfo & DominatorTree. Those 2007 // may be used by SCEVExpander. The blocks will be un-linked from their 2008 // predecessors and removed from LI & DT at the end of the function. 2009 if (!Pred.isAlwaysTrue()) { 2010 SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI, 2011 nullptr, "vector.scevcheck"); 2012 2013 SCEVCheckCond = SCEVExp.expandCodeForPredicate( 2014 &Pred, SCEVCheckBlock->getTerminator()); 2015 } 2016 2017 const auto &RtPtrChecking = *LAI.getRuntimePointerChecking(); 2018 if (RtPtrChecking.Need) { 2019 auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader; 2020 MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr, 2021 "vector.memcheck"); 2022 2023 MemRuntimeCheckCond = 2024 addRuntimeChecks(MemCheckBlock->getTerminator(), L, 2025 RtPtrChecking.getChecks(), MemCheckExp); 2026 assert(MemRuntimeCheckCond && 2027 "no RT checks generated although RtPtrChecking " 2028 "claimed checks are required"); 2029 } 2030 2031 if (!MemCheckBlock && !SCEVCheckBlock) 2032 return; 2033 2034 // Unhook the temporary blocks containing the checks and update various 2035 // places accordingly.
2036 if (SCEVCheckBlock) 2037 SCEVCheckBlock->replaceAllUsesWith(Preheader); 2038 if (MemCheckBlock) 2039 MemCheckBlock->replaceAllUsesWith(Preheader); 2040 2041 if (SCEVCheckBlock) { 2042 SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 2043 new UnreachableInst(Preheader->getContext(), SCEVCheckBlock); 2044 Preheader->getTerminator()->eraseFromParent(); 2045 } 2046 if (MemCheckBlock) { 2047 MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 2048 new UnreachableInst(Preheader->getContext(), MemCheckBlock); 2049 Preheader->getTerminator()->eraseFromParent(); 2050 } 2051 2052 DT->changeImmediateDominator(LoopHeader, Preheader); 2053 if (MemCheckBlock) { 2054 DT->eraseNode(MemCheckBlock); 2055 LI->removeBlock(MemCheckBlock); 2056 } 2057 if (SCEVCheckBlock) { 2058 DT->eraseNode(SCEVCheckBlock); 2059 LI->removeBlock(SCEVCheckBlock); 2060 } 2061 } 2062 2063 /// Remove the created SCEV & memory runtime check blocks & instructions, if 2064 /// unused. 2065 ~GeneratedRTChecks() { 2066 SCEVExpanderCleaner SCEVCleaner(SCEVExp); 2067 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp); 2068 if (!SCEVCheckCond) 2069 SCEVCleaner.markResultUsed(); 2070 2071 if (!MemRuntimeCheckCond) 2072 MemCheckCleaner.markResultUsed(); 2073 2074 if (MemRuntimeCheckCond) { 2075 auto &SE = *MemCheckExp.getSE(); 2076 // Memory runtime check generation creates compares that use expanded 2077 // values. Remove them before running the SCEVExpanderCleaners. 2078 for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) { 2079 if (MemCheckExp.isInsertedInstruction(&I)) 2080 continue; 2081 SE.forgetValue(&I); 2082 I.eraseFromParent(); 2083 } 2084 } 2085 MemCheckCleaner.cleanup(); 2086 SCEVCleaner.cleanup(); 2087 2088 if (SCEVCheckCond) 2089 SCEVCheckBlock->eraseFromParent(); 2090 if (MemRuntimeCheckCond) 2091 MemCheckBlock->eraseFromParent(); 2092 } 2093 2094 /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and 2095 /// adjusts the branches to branch to the vector preheader or \p Bypass, 2096 /// depending on the generated condition. 2097 BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass, 2098 BasicBlock *LoopVectorPreHeader, 2099 BasicBlock *LoopExitBlock) { 2100 if (!SCEVCheckCond) 2101 return nullptr; 2102 if (auto *C = dyn_cast<ConstantInt>(SCEVCheckCond)) 2103 if (C->isZero()) 2104 return nullptr; 2105 2106 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2107 2108 BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock); 2109 // Create new preheader for vector loop. 2110 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 2111 PL->addBasicBlockToLoop(SCEVCheckBlock, *LI); 2112 2113 SCEVCheckBlock->getTerminator()->eraseFromParent(); 2114 SCEVCheckBlock->moveBefore(LoopVectorPreHeader); 2115 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2116 SCEVCheckBlock); 2117 2118 DT->addNewBlock(SCEVCheckBlock, Pred); 2119 DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock); 2120 2121 ReplaceInstWithInst( 2122 SCEVCheckBlock->getTerminator(), 2123 BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheckCond)); 2124 // Mark the check as used, to prevent it from being removed during cleanup. 2125 SCEVCheckCond = nullptr; 2126 return SCEVCheckBlock; 2127 } 2128 2129 /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts 2130 /// the branches to branch to the vector preheader or \p Bypass, depending on 2131 /// the generated condition. 
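  /// The resulting control flow, sketched (block names illustrative):
  ///   Pred --> MemCheckBlock --(checks fail)--> Bypass
  ///                          --(checks pass)--> LoopVectorPreHeader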
2132 BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass, 2133 BasicBlock *LoopVectorPreHeader) { 2134 // Check if we generated code that checks in runtime if arrays overlap. 2135 if (!MemRuntimeCheckCond) 2136 return nullptr; 2137 2138 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2139 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2140 MemCheckBlock); 2141 2142 DT->addNewBlock(MemCheckBlock, Pred); 2143 DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock); 2144 MemCheckBlock->moveBefore(LoopVectorPreHeader); 2145 2146 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 2147 PL->addBasicBlockToLoop(MemCheckBlock, *LI); 2148 2149 ReplaceInstWithInst( 2150 MemCheckBlock->getTerminator(), 2151 BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond)); 2152 MemCheckBlock->getTerminator()->setDebugLoc( 2153 Pred->getTerminator()->getDebugLoc()); 2154 2155 // Mark the check as used, to prevent it from being removed during cleanup. 2156 MemRuntimeCheckCond = nullptr; 2157 return MemCheckBlock; 2158 } 2159 }; 2160 2161 // Return true if \p OuterLp is an outer loop annotated with hints for explicit 2162 // vectorization. The loop needs to be annotated with #pragma omp simd 2163 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the 2164 // vector length information is not provided, vectorization is not considered 2165 // explicit. Interleave hints are not allowed either. These limitations will be 2166 // relaxed in the future. 2167 // Please, note that we are currently forced to abuse the pragma 'clang 2168 // vectorize' semantics. This pragma provides *auto-vectorization hints* 2169 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' 2170 // provides *explicit vectorization hints* (LV can bypass legal checks and 2171 // assume that vectorization is legal). However, both hints are implemented 2172 // using the same metadata (llvm.loop.vectorize, processed by 2173 // LoopVectorizeHints). This will be fixed in the future when the native IR 2174 // representation for pragma 'omp simd' is introduced. 2175 static bool isExplicitVecOuterLoop(Loop *OuterLp, 2176 OptimizationRemarkEmitter *ORE) { 2177 assert(!OuterLp->isInnermost() && "This is not an outer loop"); 2178 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); 2179 2180 // Only outer loops with an explicit vectorization hint are supported. 2181 // Unannotated outer loops are ignored. 2182 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) 2183 return false; 2184 2185 Function *Fn = OuterLp->getHeader()->getParent(); 2186 if (!Hints.allowVectorization(Fn, OuterLp, 2187 true /*VectorizeOnlyWhenForced*/)) { 2188 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); 2189 return false; 2190 } 2191 2192 if (Hints.getInterleave() > 1) { 2193 // TODO: Interleave support is future work. 2194 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " 2195 "outer loops.\n"); 2196 Hints.emitRemarkWithHints(); 2197 return false; 2198 } 2199 2200 return true; 2201 } 2202 2203 static void collectSupportedLoops(Loop &L, LoopInfo *LI, 2204 OptimizationRemarkEmitter *ORE, 2205 SmallVectorImpl<Loop *> &V) { 2206 // Collect inner loops and outer loops without irreducible control flow. For 2207 // now, only collect outer loops that have explicit vectorization hints. If we 2208 // are stress testing the VPlan H-CFG construction, we collect the outermost 2209 // loop of every loop nest. 
2210 if (L.isInnermost() || VPlanBuildStressTest || 2211 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { 2212 LoopBlocksRPO RPOT(&L); 2213 RPOT.perform(LI); 2214 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { 2215 V.push_back(&L); 2216 // TODO: Collect inner loops inside marked outer loops in case 2217 // vectorization fails for the outer loop. Do not invoke 2218 // 'containsIrreducibleCFG' again for inner loops when the outer loop is 2219 // already known to be reducible. We can use an inherited attribute for 2220 // that. 2221 return; 2222 } 2223 } 2224 for (Loop *InnerL : L) 2225 collectSupportedLoops(*InnerL, LI, ORE, V); 2226 } 2227 2228 namespace { 2229 2230 /// The LoopVectorize Pass. 2231 struct LoopVectorize : public FunctionPass { 2232 /// Pass identification, replacement for typeid 2233 static char ID; 2234 2235 LoopVectorizePass Impl; 2236 2237 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false, 2238 bool VectorizeOnlyWhenForced = false) 2239 : FunctionPass(ID), 2240 Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) { 2241 initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); 2242 } 2243 2244 bool runOnFunction(Function &F) override { 2245 if (skipFunction(F)) 2246 return false; 2247 2248 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 2249 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 2250 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 2251 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 2252 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); 2253 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); 2254 auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr; 2255 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); 2256 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 2257 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); 2258 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); 2259 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 2260 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 2261 2262 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 2263 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; 2264 2265 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC, 2266 GetLAA, *ORE, PSI).MadeAnyChange; 2267 } 2268 2269 void getAnalysisUsage(AnalysisUsage &AU) const override { 2270 AU.addRequired<AssumptionCacheTracker>(); 2271 AU.addRequired<BlockFrequencyInfoWrapperPass>(); 2272 AU.addRequired<DominatorTreeWrapperPass>(); 2273 AU.addRequired<LoopInfoWrapperPass>(); 2274 AU.addRequired<ScalarEvolutionWrapperPass>(); 2275 AU.addRequired<TargetTransformInfoWrapperPass>(); 2276 AU.addRequired<AAResultsWrapperPass>(); 2277 AU.addRequired<LoopAccessLegacyAnalysis>(); 2278 AU.addRequired<DemandedBitsWrapperPass>(); 2279 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 2280 AU.addRequired<InjectTLIMappingsLegacy>(); 2281 2282 // We currently do not preserve loopinfo/dominator analyses with outer loop 2283 // vectorization. Until this is addressed, mark these analyses as preserved 2284 // only for non-VPlan-native path. 2285 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 
2286 if (!EnableVPlanNativePath) { 2287 AU.addPreserved<LoopInfoWrapperPass>(); 2288 AU.addPreserved<DominatorTreeWrapperPass>(); 2289 } 2290 2291 AU.addPreserved<BasicAAWrapperPass>(); 2292 AU.addPreserved<GlobalsAAWrapperPass>(); 2293 AU.addRequired<ProfileSummaryInfoWrapperPass>(); 2294 } 2295 }; 2296 2297 } // end anonymous namespace 2298 2299 //===----------------------------------------------------------------------===// 2300 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 2301 // LoopVectorizationCostModel and LoopVectorizationPlanner. 2302 //===----------------------------------------------------------------------===// 2303 2304 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { 2305 // We need to place the broadcast of invariant variables outside the loop, 2306 // but only if it's proven safe to do so. Else, broadcast will be inside 2307 // vector loop body. 2308 Instruction *Instr = dyn_cast<Instruction>(V); 2309 bool SafeToHoist = OrigLoop->isLoopInvariant(V) && 2310 (!Instr || 2311 DT->dominates(Instr->getParent(), LoopVectorPreHeader)); 2312 // Place the code for broadcasting invariant variables in the new preheader. 2313 IRBuilder<>::InsertPointGuard Guard(Builder); 2314 if (SafeToHoist) 2315 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2316 2317 // Broadcast the scalar into all locations in the vector. 2318 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); 2319 2320 return Shuf; 2321 } 2322 2323 /// This function adds 2324 /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...) 2325 /// to each vector element of Val. The sequence starts at StartIndex. 2326 /// \p Opcode is relevant for FP induction variable. 2327 static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step, 2328 Instruction::BinaryOps BinOp, ElementCount VF, 2329 IRBuilderBase &Builder) { 2330 assert(VF.isVector() && "only vector VFs are supported"); 2331 2332 // Create and check the types. 2333 auto *ValVTy = cast<VectorType>(Val->getType()); 2334 ElementCount VLen = ValVTy->getElementCount(); 2335 2336 Type *STy = Val->getType()->getScalarType(); 2337 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && 2338 "Induction Step must be an integer or FP"); 2339 assert(Step->getType() == STy && "Step has wrong type"); 2340 2341 SmallVector<Constant *, 8> Indices; 2342 2343 // Create a vector of consecutive numbers from zero to VF. 2344 VectorType *InitVecValVTy = ValVTy; 2345 if (STy->isFloatingPointTy()) { 2346 Type *InitVecValSTy = 2347 IntegerType::get(STy->getContext(), STy->getScalarSizeInBits()); 2348 InitVecValVTy = VectorType::get(InitVecValSTy, VLen); 2349 } 2350 Value *InitVec = Builder.CreateStepVector(InitVecValVTy); 2351 2352 // Splat the StartIdx 2353 Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx); 2354 2355 if (STy->isIntegerTy()) { 2356 InitVec = Builder.CreateAdd(InitVec, StartIdxSplat); 2357 Step = Builder.CreateVectorSplat(VLen, Step); 2358 assert(Step->getType() == Val->getType() && "Invalid step vec"); 2359 // FIXME: The newly created binary instructions should contain nsw/nuw 2360 // flags, which can be found from the original scalar operations. 2361 Step = Builder.CreateMul(InitVec, Step); 2362 return Builder.CreateAdd(Val, Step, "induction"); 2363 } 2364 2365 // Floating point induction. 
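  // E.g. (illustrative) for VF = 4, StartIdx = 0, Step = 0.5 and BinOp = FAdd,
  // the result is Val + <0.0, 0.5, 1.0, 1.5>.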
2366 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && 2367 "Binary Opcode should be specified for FP induction"); 2368 InitVec = Builder.CreateUIToFP(InitVec, ValVTy); 2369 InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat); 2370 2371 Step = Builder.CreateVectorSplat(VLen, Step); 2372 Value *MulOp = Builder.CreateFMul(InitVec, Step); 2373 return Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); 2374 } 2375 2376 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( 2377 const InductionDescriptor &II, Value *Step, Value *Start, 2378 Instruction *EntryVal, VPValue *Def, VPTransformState &State) { 2379 IRBuilderBase &Builder = State.Builder; 2380 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 2381 "Expected either an induction phi-node or a truncate of it!"); 2382 2383 // Construct the initial value of the vector IV in the vector loop preheader 2384 auto CurrIP = Builder.saveIP(); 2385 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2386 if (isa<TruncInst>(EntryVal)) { 2387 assert(Start->getType()->isIntegerTy() && 2388 "Truncation requires an integer type"); 2389 auto *TruncType = cast<IntegerType>(EntryVal->getType()); 2390 Step = Builder.CreateTrunc(Step, TruncType); 2391 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); 2392 } 2393 2394 Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0); 2395 Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start); 2396 Value *SteppedStart = getStepVector( 2397 SplatStart, Zero, Step, II.getInductionOpcode(), State.VF, State.Builder); 2398 2399 // We create vector phi nodes for both integer and floating-point induction 2400 // variables. Here, we determine the kind of arithmetic we will perform. 2401 Instruction::BinaryOps AddOp; 2402 Instruction::BinaryOps MulOp; 2403 if (Step->getType()->isIntegerTy()) { 2404 AddOp = Instruction::Add; 2405 MulOp = Instruction::Mul; 2406 } else { 2407 AddOp = II.getInductionOpcode(); 2408 MulOp = Instruction::FMul; 2409 } 2410 2411 // Multiply the vectorization factor by the step using integer or 2412 // floating-point arithmetic as appropriate. 2413 Type *StepType = Step->getType(); 2414 Value *RuntimeVF; 2415 if (Step->getType()->isFloatingPointTy()) 2416 RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF); 2417 else 2418 RuntimeVF = getRuntimeVF(Builder, StepType, State.VF); 2419 Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF); 2420 2421 // Create a vector splat to use in the induction update. 2422 // 2423 // FIXME: If the step is non-constant, we create the vector splat with 2424 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't 2425 // handle a constant vector splat. 2426 Value *SplatVF = isa<Constant>(Mul) 2427 ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul)) 2428 : Builder.CreateVectorSplat(State.VF, Mul); 2429 Builder.restoreIP(CurrIP); 2430 2431 // We may need to add the step a number of times, depending on the unroll 2432 // factor. The last of those goes into the PHI. 
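  // E.g. (illustrative) with UF = 2: part 0 uses vec.ind, part 1 uses
  // vec.ind + VF * Step ("step.add"), and vec.ind + 2 * VF * Step
  // ("vec.ind.next") becomes the back-edge value of the phi.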
2433 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", 2434 &*LoopVectorBody->getFirstInsertionPt()); 2435 VecInd->setDebugLoc(EntryVal->getDebugLoc()); 2436 Instruction *LastInduction = VecInd; 2437 for (unsigned Part = 0; Part < UF; ++Part) { 2438 State.set(Def, LastInduction, Part); 2439 2440 if (isa<TruncInst>(EntryVal)) 2441 addMetadata(LastInduction, EntryVal); 2442 2443 LastInduction = cast<Instruction>( 2444 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")); 2445 LastInduction->setDebugLoc(EntryVal->getDebugLoc()); 2446 } 2447 2448 // Move the last step to the end of the latch block. This ensures consistent 2449 // placement of all induction updates. 2450 auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 2451 auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator()); 2452 LastInduction->moveBefore(Br); 2453 LastInduction->setName("vec.ind.next"); 2454 2455 VecInd->addIncoming(SteppedStart, LoopVectorPreHeader); 2456 VecInd->addIncoming(LastInduction, LoopVectorLatch); 2457 } 2458 2459 /// Compute scalar induction steps. \p ScalarIV is the scalar induction 2460 /// variable on which to base the steps, \p Step is the size of the step. 2461 static void buildScalarSteps(Value *ScalarIV, Value *Step, 2462 const InductionDescriptor &ID, VPValue *Def, 2463 VPTransformState &State) { 2464 IRBuilderBase &Builder = State.Builder; 2465 // We shouldn't have to build scalar steps if we aren't vectorizing. 2466 assert(State.VF.isVector() && "VF should be greater than one"); 2467 // Get the value type and ensure it and the step have the same integer type. 2468 Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); 2469 assert(ScalarIVTy == Step->getType() && 2470 "Val and Step should have the same type"); 2471 2472 // We build scalar steps for both integer and floating-point induction 2473 // variables. Here, we determine the kind of arithmetic we will perform. 2474 Instruction::BinaryOps AddOp; 2475 Instruction::BinaryOps MulOp; 2476 if (ScalarIVTy->isIntegerTy()) { 2477 AddOp = Instruction::Add; 2478 MulOp = Instruction::Mul; 2479 } else { 2480 AddOp = ID.getInductionOpcode(); 2481 MulOp = Instruction::FMul; 2482 } 2483 2484 // Determine the number of scalars we need to generate for each unroll 2485 // iteration. 2486 bool FirstLaneOnly = vputils::onlyFirstLaneUsed(Def); 2487 unsigned Lanes = FirstLaneOnly ? 1 : State.VF.getKnownMinValue(); 2488 // Compute the scalar steps and save the results in State. 
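  // E.g. (illustrative) for a fixed VF = 4, UF = 2 and Step = 1, part 0 yields
  // ScalarIV + {0, 1, 2, 3} and part 1 yields ScalarIV + {4, 5, 6, 7}.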
2489 Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(), 2490 ScalarIVTy->getScalarSizeInBits()); 2491 Type *VecIVTy = nullptr; 2492 Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr; 2493 if (!FirstLaneOnly && State.VF.isScalable()) { 2494 VecIVTy = VectorType::get(ScalarIVTy, State.VF); 2495 UnitStepVec = 2496 Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF)); 2497 SplatStep = Builder.CreateVectorSplat(State.VF, Step); 2498 SplatIV = Builder.CreateVectorSplat(State.VF, ScalarIV); 2499 } 2500 2501 for (unsigned Part = 0; Part < State.UF; ++Part) { 2502 Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part); 2503 2504 if (!FirstLaneOnly && State.VF.isScalable()) { 2505 auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0); 2506 auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec); 2507 if (ScalarIVTy->isFloatingPointTy()) 2508 InitVec = Builder.CreateSIToFP(InitVec, VecIVTy); 2509 auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep); 2510 auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul); 2511 State.set(Def, Add, Part); 2512 // It's useful to record the lane values too for the known minimum number 2513 // of elements so we do those below. This improves the code quality when 2514 // trying to extract the first element, for example. 2515 } 2516 2517 if (ScalarIVTy->isFloatingPointTy()) 2518 StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy); 2519 2520 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 2521 Value *StartIdx = Builder.CreateBinOp( 2522 AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane)); 2523 // The step returned by `createStepForVF` is a runtime-evaluated value 2524 // when VF is scalable. Otherwise, it should be folded into a Constant. 2525 assert((State.VF.isScalable() || isa<Constant>(StartIdx)) && 2526 "Expected StartIdx to be folded to a constant when VF is not " 2527 "scalable"); 2528 auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step); 2529 auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul); 2530 State.set(Def, Add, VPIteration(Part, Lane)); 2531 } 2532 } 2533 } 2534 2535 // Generate code for the induction step. Note that induction steps are 2536 // required to be loop-invariant 2537 static Value *CreateStepValue(const SCEV *Step, ScalarEvolution &SE, 2538 Instruction *InsertBefore, 2539 Loop *OrigLoop = nullptr) { 2540 const DataLayout &DL = SE.getDataLayout(); 2541 assert((!OrigLoop || SE.isLoopInvariant(Step, OrigLoop)) && 2542 "Induction step should be loop invariant"); 2543 if (auto *E = dyn_cast<SCEVUnknown>(Step)) 2544 return E->getValue(); 2545 2546 SCEVExpander Exp(SE, DL, "induction"); 2547 return Exp.expandCodeFor(Step, Step->getType(), InsertBefore); 2548 } 2549 2550 /// Compute the transformed value of Index at offset StartValue using step 2551 /// StepValue. 2552 /// For integer induction, returns StartValue + Index * StepValue. 2553 /// For pointer induction, returns StartValue[Index * StepValue]. 2554 /// FIXME: The newly created binary instructions should contain nsw/nuw 2555 /// flags, which can be found from the original scalar operations. 2556 static Value *emitTransformedIndex(IRBuilderBase &B, Value *Index, 2557 Value *StartValue, Value *Step, 2558 const InductionDescriptor &ID) { 2559 assert(Index->getType()->getScalarType() == Step->getType() && 2560 "Index scalar type does not match StepValue type"); 2561 2562 // Note: the IR at this point is broken. 
We cannot use SE to create any new 2563 // SCEV and then expand it, hoping that SCEV's simplification will give us 2564 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may 2565 // lead to various SCEV crashes. So all we can do is to use builder and rely 2566 // on InstCombine for future simplifications. Here we handle some trivial 2567 // cases only. 2568 auto CreateAdd = [&B](Value *X, Value *Y) { 2569 assert(X->getType() == Y->getType() && "Types don't match!"); 2570 if (auto *CX = dyn_cast<ConstantInt>(X)) 2571 if (CX->isZero()) 2572 return Y; 2573 if (auto *CY = dyn_cast<ConstantInt>(Y)) 2574 if (CY->isZero()) 2575 return X; 2576 return B.CreateAdd(X, Y); 2577 }; 2578 2579 // We allow X to be a vector type, in which case Y will potentially be 2580 // splatted into a vector with the same element count. 2581 auto CreateMul = [&B](Value *X, Value *Y) { 2582 assert(X->getType()->getScalarType() == Y->getType() && 2583 "Types don't match!"); 2584 if (auto *CX = dyn_cast<ConstantInt>(X)) 2585 if (CX->isOne()) 2586 return Y; 2587 if (auto *CY = dyn_cast<ConstantInt>(Y)) 2588 if (CY->isOne()) 2589 return X; 2590 VectorType *XVTy = dyn_cast<VectorType>(X->getType()); 2591 if (XVTy && !isa<VectorType>(Y->getType())) 2592 Y = B.CreateVectorSplat(XVTy->getElementCount(), Y); 2593 return B.CreateMul(X, Y); 2594 }; 2595 2596 switch (ID.getKind()) { 2597 case InductionDescriptor::IK_IntInduction: { 2598 assert(!isa<VectorType>(Index->getType()) && 2599 "Vector indices not supported for integer inductions yet"); 2600 assert(Index->getType() == StartValue->getType() && 2601 "Index type does not match StartValue type"); 2602 if (isa<ConstantInt>(Step) && cast<ConstantInt>(Step)->isMinusOne()) 2603 return B.CreateSub(StartValue, Index); 2604 auto *Offset = CreateMul(Index, Step); 2605 return CreateAdd(StartValue, Offset); 2606 } 2607 case InductionDescriptor::IK_PtrInduction: { 2608 assert(isa<Constant>(Step) && 2609 "Expected constant step for pointer induction"); 2610 return B.CreateGEP(ID.getElementType(), StartValue, CreateMul(Index, Step)); 2611 } 2612 case InductionDescriptor::IK_FpInduction: { 2613 assert(!isa<VectorType>(Index->getType()) && 2614 "Vector indices not supported for FP inductions yet"); 2615 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); 2616 auto InductionBinOp = ID.getInductionBinOp(); 2617 assert(InductionBinOp && 2618 (InductionBinOp->getOpcode() == Instruction::FAdd || 2619 InductionBinOp->getOpcode() == Instruction::FSub) && 2620 "Original bin op should be defined for FP induction"); 2621 2622 Value *MulExp = B.CreateFMul(Step, Index); 2623 return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, 2624 "induction"); 2625 } 2626 case InductionDescriptor::IK_NoInduction: 2627 return nullptr; 2628 } 2629 llvm_unreachable("invalid enum"); 2630 } 2631 2632 void InnerLoopVectorizer::widenIntOrFpInduction( 2633 PHINode *IV, VPWidenIntOrFpInductionRecipe *Def, VPTransformState &State, 2634 Value *CanonicalIV) { 2635 Value *Start = Def->getStartValue()->getLiveInIRValue(); 2636 const InductionDescriptor &ID = Def->getInductionDescriptor(); 2637 TruncInst *Trunc = Def->getTruncInst(); 2638 IRBuilderBase &Builder = State.Builder; 2639 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); 2640 assert(State.VF.isVector() && "must have vector VF"); 2641 2642 // The value from the original loop to which we are mapping the new induction 2643 // variable. 2644 Instruction *EntryVal = Trunc ? 
cast<Instruction>(Trunc) : IV; 2645 2646 auto &DL = EntryVal->getModule()->getDataLayout(); 2647 2648 // Generate code for the induction step. Note that induction steps are 2649 // required to be loop-invariant 2650 auto CreateStepValue = [&](const SCEV *Step) -> Value * { 2651 assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) && 2652 "Induction step should be loop invariant"); 2653 if (PSE.getSE()->isSCEVable(IV->getType())) { 2654 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 2655 return Exp.expandCodeFor(Step, Step->getType(), 2656 State.CFG.VectorPreHeader->getTerminator()); 2657 } 2658 return cast<SCEVUnknown>(Step)->getValue(); 2659 }; 2660 2661 // The scalar value to broadcast. This is derived from the canonical 2662 // induction variable. If a truncation type is given, truncate the canonical 2663 // induction variable and step. Otherwise, derive these values from the 2664 // induction descriptor. 2665 auto CreateScalarIV = [&](Value *&Step) -> Value * { 2666 Value *ScalarIV = CanonicalIV; 2667 Type *NeededType = IV->getType(); 2668 if (!Def->isCanonical() || ScalarIV->getType() != NeededType) { 2669 ScalarIV = 2670 NeededType->isIntegerTy() 2671 ? Builder.CreateSExtOrTrunc(ScalarIV, NeededType) 2672 : Builder.CreateCast(Instruction::SIToFP, ScalarIV, NeededType); 2673 ScalarIV = emitTransformedIndex(Builder, ScalarIV, Start, Step, ID); 2674 ScalarIV->setName("offset.idx"); 2675 } 2676 if (Trunc) { 2677 auto *TruncType = cast<IntegerType>(Trunc->getType()); 2678 assert(Step->getType()->isIntegerTy() && 2679 "Truncation requires an integer step"); 2680 ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType); 2681 Step = Builder.CreateTrunc(Step, TruncType); 2682 } 2683 return ScalarIV; 2684 }; 2685 2686 // Fast-math-flags propagate from the original induction instruction. 2687 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 2688 if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp())) 2689 Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags()); 2690 2691 // Now do the actual transformations, and start with creating the step value. 2692 Value *Step = CreateStepValue(ID.getStep()); 2693 2694 // Create a new independent vector induction variable. Later VPlan2VPlan 2695 // optimizations will remove it, if it won't be needed, e.g. because all users 2696 // of it access scalar values. 2697 createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, State); 2698 2699 if (Def->needsScalarIV()) { 2700 // Create scalar steps that can be used by instructions we will later 2701 // scalarize. Note that the addition of the scalar steps will not increase 2702 // the number of instructions in the loop in the common case prior to 2703 // InstCombine. We will be trading one vector extract for each scalar step. 2704 Value *ScalarIV = CreateScalarIV(Step); 2705 buildScalarSteps(ScalarIV, Step, ID, Def, State); 2706 } 2707 } 2708 2709 void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def, 2710 const VPIteration &Instance, 2711 VPTransformState &State) { 2712 Value *ScalarInst = State.get(Def, Instance); 2713 Value *VectorValue = State.get(Def, Instance.Part); 2714 VectorValue = Builder.CreateInsertElement( 2715 VectorValue, ScalarInst, 2716 Instance.Lane.getAsRuntimeExpr(State.Builder, VF)); 2717 State.set(Def, VectorValue, Instance.Part); 2718 } 2719 2720 // Return whether we allow using masked interleave-groups (for dealing with 2721 // strided loads/stores that reside in predicated blocks, or for dealing 2722 // with gaps). 
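// E.g. (illustrative) a load group of factor 2 with a gap:
//   for (i = 0; i < N; i += 2)
//     sum += A[i];   // member 0 is read, member 1 (A[i+1]) is a gap
// widening this group also reads the A[i+1] lanes, so it needs either a mask
// for the gap elements or a scalar epilogue to avoid accessing memory past the
// last valid element.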
2723 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { 2724 // If an override option has been passed in for interleaved accesses, use it. 2725 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) 2726 return EnableMaskedInterleavedMemAccesses; 2727 2728 return TTI.enableMaskedInterleavedAccessVectorization(); 2729 } 2730 2731 // Try to vectorize the interleave group that \p Instr belongs to. 2732 // 2733 // E.g. Translate following interleaved load group (factor = 3): 2734 // for (i = 0; i < N; i+=3) { 2735 // R = Pic[i]; // Member of index 0 2736 // G = Pic[i+1]; // Member of index 1 2737 // B = Pic[i+2]; // Member of index 2 2738 // ... // do something to R, G, B 2739 // } 2740 // To: 2741 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2742 // %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements 2743 // %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements 2744 // %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements 2745 // 2746 // Or translate following interleaved store group (factor = 3): 2747 // for (i = 0; i < N; i+=3) { 2748 // ... do something to R, G, B 2749 // Pic[i] = R; // Member of index 0 2750 // Pic[i+1] = G; // Member of index 1 2751 // Pic[i+2] = B; // Member of index 2 2752 // } 2753 // To: 2754 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> 2755 // %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u> 2756 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, 2757 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements 2758 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B 2759 void InnerLoopVectorizer::vectorizeInterleaveGroup( 2760 const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs, 2761 VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues, 2762 VPValue *BlockInMask) { 2763 Instruction *Instr = Group->getInsertPos(); 2764 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2765 2766 // Prepare for the vector type of the interleaved load/store. 2767 Type *ScalarTy = getLoadStoreType(Instr); 2768 unsigned InterleaveFactor = Group->getFactor(); 2769 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2770 auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor); 2771 2772 // Prepare for the new pointers. 2773 SmallVector<Value *, 2> AddrParts; 2774 unsigned Index = Group->getIndex(Instr); 2775 2776 // TODO: extend the masked interleaved-group support to reversed access. 2777 assert((!BlockInMask || !Group->isReverse()) && 2778 "Reversed masked interleave-group not supported."); 2779 2780 // If the group is reverse, adjust the index to refer to the last vector lane 2781 // instead of the first. We adjust the index from the first vector lane, 2782 // rather than directly getting the pointer for lane VF - 1, because the 2783 // pointer operand of the interleaved access is supposed to be uniform. For 2784 // uniform instructions, we're only required to generate a value for the 2785 // first vector lane in each unroll iteration. 2786 if (Group->isReverse()) 2787 Index += (VF.getKnownMinValue() - 1) * Group->getFactor(); 2788 2789 for (unsigned Part = 0; Part < UF; Part++) { 2790 Value *AddrPart = State.get(Addr, VPIteration(Part, 0)); 2791 setDebugLocFromInst(AddrPart); 2792 2793 // Notice current instruction could be any index. Need to adjust the address 2794 // to the member of index 0. 2795 // 2796 // E.g. 
a = A[i+1]; // Member of index 1 (Current instruction) 2797 // b = A[i]; // Member of index 0 2798 // Current pointer is pointed to A[i+1], adjust it to A[i]. 2799 // 2800 // E.g. A[i+1] = a; // Member of index 1 2801 // A[i] = b; // Member of index 0 2802 // A[i+2] = c; // Member of index 2 (Current instruction) 2803 // Current pointer is pointed to A[i+2], adjust it to A[i]. 2804 2805 bool InBounds = false; 2806 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts())) 2807 InBounds = gep->isInBounds(); 2808 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index)); 2809 cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds); 2810 2811 // Cast to the vector pointer type. 2812 unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace(); 2813 Type *PtrTy = VecTy->getPointerTo(AddressSpace); 2814 AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy)); 2815 } 2816 2817 setDebugLocFromInst(Instr); 2818 Value *PoisonVec = PoisonValue::get(VecTy); 2819 2820 Value *MaskForGaps = nullptr; 2821 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) { 2822 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); 2823 assert(MaskForGaps && "Mask for Gaps is required but it is null"); 2824 } 2825 2826 // Vectorize the interleaved load group. 2827 if (isa<LoadInst>(Instr)) { 2828 // For each unroll part, create a wide load for the group. 2829 SmallVector<Value *, 2> NewLoads; 2830 for (unsigned Part = 0; Part < UF; Part++) { 2831 Instruction *NewLoad; 2832 if (BlockInMask || MaskForGaps) { 2833 assert(useMaskedInterleavedAccesses(*TTI) && 2834 "masked interleaved groups are not allowed."); 2835 Value *GroupMask = MaskForGaps; 2836 if (BlockInMask) { 2837 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2838 Value *ShuffledMask = Builder.CreateShuffleVector( 2839 BlockInMaskPart, 2840 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2841 "interleaved.mask"); 2842 GroupMask = MaskForGaps 2843 ? Builder.CreateBinOp(Instruction::And, ShuffledMask, 2844 MaskForGaps) 2845 : ShuffledMask; 2846 } 2847 NewLoad = 2848 Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(), 2849 GroupMask, PoisonVec, "wide.masked.vec"); 2850 } 2851 else 2852 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], 2853 Group->getAlign(), "wide.vec"); 2854 Group->addMetadata(NewLoad); 2855 NewLoads.push_back(NewLoad); 2856 } 2857 2858 // For each member in the group, shuffle out the appropriate data from the 2859 // wide loads. 2860 unsigned J = 0; 2861 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2862 Instruction *Member = Group->getMember(I); 2863 2864 // Skip the gaps in the group. 2865 if (!Member) 2866 continue; 2867 2868 auto StrideMask = 2869 createStrideMask(I, InterleaveFactor, VF.getKnownMinValue()); 2870 for (unsigned Part = 0; Part < UF; Part++) { 2871 Value *StridedVec = Builder.CreateShuffleVector( 2872 NewLoads[Part], StrideMask, "strided.vec"); 2873 2874 // If this member has different type, cast the result type. 
2875 if (Member->getType() != ScalarTy) { 2876 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2877 VectorType *OtherVTy = VectorType::get(Member->getType(), VF); 2878 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); 2879 } 2880 2881 if (Group->isReverse()) 2882 StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse"); 2883 2884 State.set(VPDefs[J], StridedVec, Part); 2885 } 2886 ++J; 2887 } 2888 return; 2889 } 2890 2891 // The sub vector type for current instruction. 2892 auto *SubVT = VectorType::get(ScalarTy, VF); 2893 2894 // Vectorize the interleaved store group. 2895 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); 2896 assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) && 2897 "masked interleaved groups are not allowed."); 2898 assert((!MaskForGaps || !VF.isScalable()) && 2899 "masking gaps for scalable vectors is not yet supported."); 2900 for (unsigned Part = 0; Part < UF; Part++) { 2901 // Collect the stored vector from each member. 2902 SmallVector<Value *, 4> StoredVecs; 2903 for (unsigned i = 0; i < InterleaveFactor; i++) { 2904 assert((Group->getMember(i) || MaskForGaps) && 2905 "Fail to get a member from an interleaved store group"); 2906 Instruction *Member = Group->getMember(i); 2907 2908 // Skip the gaps in the group. 2909 if (!Member) { 2910 Value *Undef = PoisonValue::get(SubVT); 2911 StoredVecs.push_back(Undef); 2912 continue; 2913 } 2914 2915 Value *StoredVec = State.get(StoredValues[i], Part); 2916 2917 if (Group->isReverse()) 2918 StoredVec = Builder.CreateVectorReverse(StoredVec, "reverse"); 2919 2920 // If this member has different type, cast it to a unified type. 2921 2922 if (StoredVec->getType() != SubVT) 2923 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL); 2924 2925 StoredVecs.push_back(StoredVec); 2926 } 2927 2928 // Concatenate all vectors into a wide vector. 2929 Value *WideVec = concatenateVectors(Builder, StoredVecs); 2930 2931 // Interleave the elements in the wide vector. 2932 Value *IVec = Builder.CreateShuffleVector( 2933 WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor), 2934 "interleaved.vec"); 2935 2936 Instruction *NewStoreInstr; 2937 if (BlockInMask || MaskForGaps) { 2938 Value *GroupMask = MaskForGaps; 2939 if (BlockInMask) { 2940 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2941 Value *ShuffledMask = Builder.CreateShuffleVector( 2942 BlockInMaskPart, 2943 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2944 "interleaved.mask"); 2945 GroupMask = MaskForGaps ? Builder.CreateBinOp(Instruction::And, 2946 ShuffledMask, MaskForGaps) 2947 : ShuffledMask; 2948 } 2949 NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part], 2950 Group->getAlign(), GroupMask); 2951 } else 2952 NewStoreInstr = 2953 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign()); 2954 2955 Group->addMetadata(NewStoreInstr); 2956 } 2957 } 2958 2959 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, 2960 VPReplicateRecipe *RepRecipe, 2961 const VPIteration &Instance, 2962 bool IfPredicateInstr, 2963 VPTransformState &State) { 2964 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 2965 2966 // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for 2967 // the first lane and part. 2968 if (isa<NoAliasScopeDeclInst>(Instr)) 2969 if (!Instance.isFirstIteration()) 2970 return; 2971 2972 setDebugLocFromInst(Instr); 2973 2974 // Does this instruction return a value ? 
2975 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 2976 2977 Instruction *Cloned = Instr->clone(); 2978 if (!IsVoidRetTy) 2979 Cloned->setName(Instr->getName() + ".cloned"); 2980 2981 // If the scalarized instruction contributes to the address computation of a 2982 // widen masked load/store which was in a basic block that needed predication 2983 // and is not predicated after vectorization, we can't propagate 2984 // poison-generating flags (nuw/nsw, exact, inbounds, etc.). The scalarized 2985 // instruction could feed a poison value to the base address of the widen 2986 // load/store. 2987 if (State.MayGeneratePoisonRecipes.contains(RepRecipe)) 2988 Cloned->dropPoisonGeneratingFlags(); 2989 2990 State.Builder.SetInsertPoint(Builder.GetInsertBlock(), 2991 Builder.GetInsertPoint()); 2992 // Replace the operands of the cloned instructions with their scalar 2993 // equivalents in the new loop. 2994 for (auto &I : enumerate(RepRecipe->operands())) { 2995 auto InputInstance = Instance; 2996 VPValue *Operand = I.value(); 2997 VPReplicateRecipe *OperandR = dyn_cast<VPReplicateRecipe>(Operand); 2998 if (OperandR && OperandR->isUniform()) 2999 InputInstance.Lane = VPLane::getFirstLane(); 3000 Cloned->setOperand(I.index(), State.get(Operand, InputInstance)); 3001 } 3002 addNewMetadata(Cloned, Instr); 3003 3004 // Place the cloned scalar in the new loop. 3005 Builder.Insert(Cloned); 3006 3007 State.set(RepRecipe, Cloned, Instance); 3008 3009 // If we just cloned a new assumption, add it the assumption cache. 3010 if (auto *II = dyn_cast<AssumeInst>(Cloned)) 3011 AC->registerAssumption(II); 3012 3013 // End if-block. 3014 if (IfPredicateInstr) 3015 PredicatedInstructions.push_back(Cloned); 3016 } 3017 3018 void InnerLoopVectorizer::createHeaderBranch(Loop *L) { 3019 BasicBlock *Header = L->getHeader(); 3020 assert(!L->getLoopLatch() && "loop should not have a latch at this point"); 3021 3022 IRBuilder<> B(Header->getTerminator()); 3023 Instruction *OldInst = 3024 getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()); 3025 setDebugLocFromInst(OldInst, &B); 3026 3027 // Connect the header to the exit and header blocks and replace the old 3028 // terminator. 3029 B.CreateCondBr(B.getTrue(), L->getUniqueExitBlock(), Header); 3030 3031 // Now we have two terminators. Remove the old one from the block. 3032 Header->getTerminator()->eraseFromParent(); 3033 } 3034 3035 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { 3036 if (TripCount) 3037 return TripCount; 3038 3039 assert(L && "Create Trip Count for null loop."); 3040 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 3041 // Find the loop boundaries. 3042 ScalarEvolution *SE = PSE.getSE(); 3043 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 3044 assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && 3045 "Invalid loop count"); 3046 3047 Type *IdxTy = Legal->getWidestInductionType(); 3048 assert(IdxTy && "No type for induction"); 3049 3050 // The exit count might have the type of i64 while the phi is i32. This can 3051 // happen if we have an induction variable that is sign extended before the 3052 // compare. The only way that we get a backedge taken count is that the 3053 // induction variable was signed and as such will not overflow. In such a case 3054 // truncation is legal. 
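  // As a rough sketch of the case described above, a source loop like
  //   for (int i = 0; (long)i < n; ++i)   // i32 induction, compare done in i64
  // can yield a backedge-taken count of type i64 while the widest induction
  // type is i32; because the sign-extended induction cannot wrap, truncating
  // the count to the induction type below is safe.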
3055 if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) > 3056 IdxTy->getPrimitiveSizeInBits()) 3057 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy); 3058 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); 3059 3060 // Get the total trip count from the count by adding 1. 3061 const SCEV *ExitCount = SE->getAddExpr( 3062 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 3063 3064 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); 3065 3066 // Expand the trip count and place the new instructions in the preheader. 3067 // Notice that the pre-header does not change, only the loop body. 3068 SCEVExpander Exp(*SE, DL, "induction"); 3069 3070 // Count holds the overall loop count (N). 3071 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), 3072 L->getLoopPreheader()->getTerminator()); 3073 3074 if (TripCount->getType()->isPointerTy()) 3075 TripCount = 3076 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", 3077 L->getLoopPreheader()->getTerminator()); 3078 3079 return TripCount; 3080 } 3081 3082 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { 3083 if (VectorTripCount) 3084 return VectorTripCount; 3085 3086 Value *TC = getOrCreateTripCount(L); 3087 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 3088 3089 Type *Ty = TC->getType(); 3090 // This is where we can make the step a runtime constant. 3091 Value *Step = createStepForVF(Builder, Ty, VF, UF); 3092 3093 // If the tail is to be folded by masking, round the number of iterations N 3094 // up to a multiple of Step instead of rounding down. This is done by first 3095 // adding Step-1 and then rounding down. Note that it's ok if this addition 3096 // overflows: the vector induction variable will eventually wrap to zero given 3097 // that it starts at zero and its Step is a power of two; the loop will then 3098 // exit, with the last early-exit vector comparison also producing all-true. 3099 if (Cost->foldTailByMasking()) { 3100 assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && 3101 "VF*UF must be a power of 2 when folding tail by masking"); 3102 Value *NumLanes = getRuntimeVF(Builder, Ty, VF * UF); 3103 TC = Builder.CreateAdd( 3104 TC, Builder.CreateSub(NumLanes, ConstantInt::get(Ty, 1)), "n.rnd.up"); 3105 } 3106 3107 // Now we need to generate the expression for the part of the loop that the 3108 // vectorized body will execute. This is equal to N - (N % Step) if scalar 3109 // iterations are not required for correctness, or N - Step, otherwise. Step 3110 // is equal to the vectorization factor (number of SIMD elements) times the 3111 // unroll factor (number of SIMD instructions). 3112 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); 3113 3114 // There are cases where we *must* run at least one iteration in the remainder 3115 // loop. See the cost model for when this can happen. If the step evenly 3116 // divides the trip count, we set the remainder to be equal to the step. If 3117 // the step does not evenly divide the trip count, no adjustment is necessary 3118 // since there will already be scalar iterations. Note that the minimum 3119 // iterations check ensures that N >= Step. 
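  // For example (assuming VF * UF == 4 and a required scalar epilogue): with
  // N == 13 the remainder R is 1, so the vector loop covers 12 iterations and
  // the epilogue runs 1; with N == 12 the remainder would be 0, so R is bumped
  // to 4 below and the vector loop covers only 8 iterations, leaving 4 for the
  // epilogue.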
3120 if (Cost->requiresScalarEpilogue(VF)) { 3121 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); 3122 R = Builder.CreateSelect(IsZero, Step, R); 3123 } 3124 3125 VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); 3126 3127 return VectorTripCount; 3128 } 3129 3130 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, 3131 const DataLayout &DL) { 3132 // Verify that V is a vector type with same number of elements as DstVTy. 3133 auto *DstFVTy = cast<FixedVectorType>(DstVTy); 3134 unsigned VF = DstFVTy->getNumElements(); 3135 auto *SrcVecTy = cast<FixedVectorType>(V->getType()); 3136 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); 3137 Type *SrcElemTy = SrcVecTy->getElementType(); 3138 Type *DstElemTy = DstFVTy->getElementType(); 3139 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 3140 "Vector elements must have same size"); 3141 3142 // Do a direct cast if element types are castable. 3143 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 3144 return Builder.CreateBitOrPointerCast(V, DstFVTy); 3145 } 3146 // V cannot be directly casted to desired vector type. 3147 // May happen when V is a floating point vector but DstVTy is a vector of 3148 // pointers or vice-versa. Handle this using a two-step bitcast using an 3149 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 3150 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 3151 "Only one type should be a pointer type"); 3152 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 3153 "Only one type should be a floating point type"); 3154 Type *IntTy = 3155 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 3156 auto *VecIntTy = FixedVectorType::get(IntTy, VF); 3157 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 3158 return Builder.CreateBitOrPointerCast(CastVal, DstFVTy); 3159 } 3160 3161 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, 3162 BasicBlock *Bypass) { 3163 Value *Count = getOrCreateTripCount(L); 3164 // Reuse existing vector loop preheader for TC checks. 3165 // Note that new preheader block is generated for vector loop. 3166 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 3167 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 3168 3169 // Generate code to check if the loop's trip count is less than VF * UF, or 3170 // equal to it in case a scalar epilogue is required; this implies that the 3171 // vector trip count is zero. This check also covers the case where adding one 3172 // to the backedge-taken count overflowed leading to an incorrect trip count 3173 // of zero. In this case we will also jump to the scalar loop. 3174 auto P = Cost->requiresScalarEpilogue(VF) ? ICmpInst::ICMP_ULE 3175 : ICmpInst::ICMP_ULT; 3176 3177 // If tail is to be folded, vector loop takes care of all iterations. 3178 Value *CheckMinIters = Builder.getFalse(); 3179 if (!Cost->foldTailByMasking()) { 3180 Value *Step = createStepForVF(Builder, Count->getType(), VF, UF); 3181 CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check"); 3182 } 3183 // Create new preheader for vector loop. 
3184 LoopVectorPreHeader = 3185 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, 3186 "vector.ph"); 3187 3188 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 3189 DT->getNode(Bypass)->getIDom()) && 3190 "TC check is expected to dominate Bypass"); 3191 3192 // Update dominator for Bypass & LoopExit (if needed). 3193 DT->changeImmediateDominator(Bypass, TCCheckBlock); 3194 if (!Cost->requiresScalarEpilogue(VF)) 3195 // If there is an epilogue which must run, there's no edge from the 3196 // middle block to exit blocks and thus no need to update the immediate 3197 // dominator of the exit blocks. 3198 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 3199 3200 ReplaceInstWithInst( 3201 TCCheckBlock->getTerminator(), 3202 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 3203 LoopBypassBlocks.push_back(TCCheckBlock); 3204 } 3205 3206 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { 3207 3208 BasicBlock *const SCEVCheckBlock = 3209 RTChecks.emitSCEVChecks(L, Bypass, LoopVectorPreHeader, LoopExitBlock); 3210 if (!SCEVCheckBlock) 3211 return nullptr; 3212 3213 assert(!(SCEVCheckBlock->getParent()->hasOptSize() || 3214 (OptForSizeBasedOnProfile && 3215 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) && 3216 "Cannot SCEV check stride or overflow when optimizing for size"); 3217 3218 3219 // Update dominator only if this is first RT check. 3220 if (LoopBypassBlocks.empty()) { 3221 DT->changeImmediateDominator(Bypass, SCEVCheckBlock); 3222 if (!Cost->requiresScalarEpilogue(VF)) 3223 // If there is an epilogue which must run, there's no edge from the 3224 // middle block to exit blocks and thus no need to update the immediate 3225 // dominator of the exit blocks. 3226 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock); 3227 } 3228 3229 LoopBypassBlocks.push_back(SCEVCheckBlock); 3230 AddedSafetyChecks = true; 3231 return SCEVCheckBlock; 3232 } 3233 3234 BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, 3235 BasicBlock *Bypass) { 3236 // VPlan-native path does not do any analysis for runtime checks currently. 3237 if (EnableVPlanNativePath) 3238 return nullptr; 3239 3240 BasicBlock *const MemCheckBlock = 3241 RTChecks.emitMemRuntimeChecks(L, Bypass, LoopVectorPreHeader); 3242 3243 // Check if we generated code that checks in runtime if arrays overlap. We put 3244 // the checks into a separate block to make the more common case of few 3245 // elements faster. 3246 if (!MemCheckBlock) 3247 return nullptr; 3248 3249 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) { 3250 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && 3251 "Cannot emit memory checks when optimizing for size, unless forced " 3252 "to vectorize."); 3253 ORE->emit([&]() { 3254 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize", 3255 L->getStartLoc(), L->getHeader()) 3256 << "Code-size may be reduced by not forcing " 3257 "vectorization, or by source-code modifications " 3258 "eliminating the need for runtime checks " 3259 "(e.g., adding 'restrict')."; 3260 }); 3261 } 3262 3263 LoopBypassBlocks.push_back(MemCheckBlock); 3264 3265 AddedSafetyChecks = true; 3266 3267 // We currently don't use LoopVersioning for the actual loop cloning but we 3268 // still use it to add the noalias metadata. 
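  // For example (sketch), once the runtime check has proven the two pointer
  // groups disjoint, the memory accesses in the vector body can be annotated
  // roughly as
  //   %v = load <4 x i32>, ... !alias.scope !1
  //   store <4 x i32> %v, ... !noalias !1
  // letting later passes assume the checked accesses do not alias.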
3269   LVer = std::make_unique<LoopVersioning>(
3270       *Legal->getLAI(),
3271       Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI,
3272       DT, PSE.getSE());
3273   LVer->prepareNoAliasMetadata();
3274   return MemCheckBlock;
3275 }
3276 
3277 Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
3278   LoopScalarBody = OrigLoop->getHeader();
3279   LoopVectorPreHeader = OrigLoop->getLoopPreheader();
3280   assert(LoopVectorPreHeader && "Invalid loop structure");
3281   LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
3282   assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF)) &&
3283          "multiple exit loop without required epilogue?");
3284 
3285   LoopMiddleBlock =
3286       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3287                  LI, nullptr, Twine(Prefix) + "middle.block");
3288   LoopScalarPreHeader =
3289       SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
3290                  nullptr, Twine(Prefix) + "scalar.ph");
3291 
3292   auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3293 
3294   // Set up the middle block terminator. Two cases:
3295   // 1) If we know that we must execute the scalar epilogue, emit an
3296   //    unconditional branch.
3297   // 2) Otherwise, we must have a single unique exit block (due to how we
3298   //    implement the multiple exit case). In this case, set up a conditional
3299   //    branch from the middle block to the loop scalar preheader, and the
3300   //    exit block. completeLoopSkeleton will update the condition to use an
3301   //    iteration check, if required to decide whether to execute the remainder.
3302   BranchInst *BrInst = Cost->requiresScalarEpilogue(VF) ?
3303     BranchInst::Create(LoopScalarPreHeader) :
3304     BranchInst::Create(LoopExitBlock, LoopScalarPreHeader,
3305                        Builder.getTrue());
3306   BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3307   ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
3308 
3309   // We intentionally don't let SplitBlock update LoopInfo, since
3310   // LoopVectorBody should belong to a different loop than LoopVectorPreHeader.
3311   // LoopVectorBody is explicitly added to the correct place a few lines later.
3312   LoopVectorBody =
3313       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3314                  nullptr, nullptr, Twine(Prefix) + "vector.body");
3315 
3316   // Update dominator for loop exit.
3317   if (!Cost->requiresScalarEpilogue(VF))
3318     // If there is an epilogue which must run, there's no edge from the
3319     // middle block to exit blocks and thus no need to update the immediate
3320     // dominator of the exit blocks.
3321     DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
3322 
3323   // Create and register the new vector loop.
3324   Loop *Lp = LI->AllocateLoop();
3325   Loop *ParentLoop = OrigLoop->getParentLoop();
3326 
3327   // Insert the new loop into the loop nest and register the new basic blocks
3328   // before calling any utilities such as SCEV that require valid LoopInfo.
3329 if (ParentLoop) { 3330 ParentLoop->addChildLoop(Lp); 3331 } else { 3332 LI->addTopLevelLoop(Lp); 3333 } 3334 Lp->addBasicBlockToLoop(LoopVectorBody, *LI); 3335 return Lp; 3336 } 3337 3338 void InnerLoopVectorizer::createInductionResumeValues( 3339 Loop *L, std::pair<BasicBlock *, Value *> AdditionalBypass) { 3340 assert(((AdditionalBypass.first && AdditionalBypass.second) || 3341 (!AdditionalBypass.first && !AdditionalBypass.second)) && 3342 "Inconsistent information about additional bypass."); 3343 3344 Value *VectorTripCount = getOrCreateVectorTripCount(L); 3345 assert(VectorTripCount && L && "Expected valid arguments"); 3346 // We are going to resume the execution of the scalar loop. 3347 // Go over all of the induction variables that we found and fix the 3348 // PHIs that are left in the scalar version of the loop. 3349 // The starting values of PHI nodes depend on the counter of the last 3350 // iteration in the vectorized loop. 3351 // If we come from a bypass edge then we need to start from the original 3352 // start value. 3353 Instruction *OldInduction = Legal->getPrimaryInduction(); 3354 for (auto &InductionEntry : Legal->getInductionVars()) { 3355 PHINode *OrigPhi = InductionEntry.first; 3356 InductionDescriptor II = InductionEntry.second; 3357 3358 // Create phi nodes to merge from the backedge-taken check block. 3359 PHINode *BCResumeVal = 3360 PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", 3361 LoopScalarPreHeader->getTerminator()); 3362 // Copy original phi DL over to the new one. 3363 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); 3364 Value *&EndValue = IVEndValues[OrigPhi]; 3365 Value *EndValueFromAdditionalBypass = AdditionalBypass.second; 3366 if (OrigPhi == OldInduction) { 3367 // We know what the end value is. 3368 EndValue = VectorTripCount; 3369 } else { 3370 IRBuilder<> B(L->getLoopPreheader()->getTerminator()); 3371 3372 // Fast-math-flags propagate from the original induction instruction. 3373 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) 3374 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); 3375 3376 Type *StepType = II.getStep()->getType(); 3377 Instruction::CastOps CastOp = 3378 CastInst::getCastOpcode(VectorTripCount, true, StepType, true); 3379 Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd"); 3380 Value *Step = 3381 CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint()); 3382 EndValue = emitTransformedIndex(B, CRD, II.getStartValue(), Step, II); 3383 EndValue->setName("ind.end"); 3384 3385 // Compute the end value for the additional bypass (if applicable). 3386 if (AdditionalBypass.first) { 3387 B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt())); 3388 CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true, 3389 StepType, true); 3390 Value *Step = 3391 CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint()); 3392 CRD = 3393 B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd"); 3394 EndValueFromAdditionalBypass = 3395 emitTransformedIndex(B, CRD, II.getStartValue(), Step, II); 3396 EndValueFromAdditionalBypass->setName("ind.end"); 3397 } 3398 } 3399 // The new PHI merges the original incoming value, in case of a bypass, 3400 // or the value at the end of the vectorized loop. 3401 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); 3402 3403 // Fix the scalar body counter (PHI node). 3404 // The old induction's phi node in the scalar body needs the truncated 3405 // value. 
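    // For example (sketch), for a primary i64 induction starting at zero the
    // scalar preheader ends up with roughly
    //   %bc.resume.val = phi i64 [ %n.vec, %middle.block ], [ 0, <bypass blocks> ]
    // i.e. the remainder loop resumes where the vector loop stopped, or at the
    // original start value when a bypass edge was taken.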
3406 for (BasicBlock *BB : LoopBypassBlocks) 3407 BCResumeVal->addIncoming(II.getStartValue(), BB); 3408 3409 if (AdditionalBypass.first) 3410 BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first, 3411 EndValueFromAdditionalBypass); 3412 3413 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); 3414 } 3415 } 3416 3417 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L, 3418 MDNode *OrigLoopID) { 3419 assert(L && "Expected valid loop."); 3420 3421 // The trip counts should be cached by now. 3422 Value *Count = getOrCreateTripCount(L); 3423 Value *VectorTripCount = getOrCreateVectorTripCount(L); 3424 3425 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3426 3427 // Add a check in the middle block to see if we have completed 3428 // all of the iterations in the first vector loop. Three cases: 3429 // 1) If we require a scalar epilogue, there is no conditional branch as 3430 // we unconditionally branch to the scalar preheader. Do nothing. 3431 // 2) If (N - N%VF) == N, then we *don't* need to run the remainder. 3432 // Thus if tail is to be folded, we know we don't need to run the 3433 // remainder and we can use the previous value for the condition (true). 3434 // 3) Otherwise, construct a runtime check. 3435 if (!Cost->requiresScalarEpilogue(VF) && !Cost->foldTailByMasking()) { 3436 Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, 3437 Count, VectorTripCount, "cmp.n", 3438 LoopMiddleBlock->getTerminator()); 3439 3440 // Here we use the same DebugLoc as the scalar loop latch terminator instead 3441 // of the corresponding compare because they may have ended up with 3442 // different line numbers and we want to avoid awkward line stepping while 3443 // debugging. Eg. if the compare has got a line number inside the loop. 3444 CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3445 cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN); 3446 } 3447 3448 // Get ready to start creating new instructions into the vectorized body. 3449 assert(LoopVectorPreHeader == L->getLoopPreheader() && 3450 "Inconsistent vector loop preheader"); 3451 Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); 3452 3453 #ifdef EXPENSIVE_CHECKS 3454 assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 3455 LI->verify(*DT); 3456 #endif 3457 3458 return LoopVectorPreHeader; 3459 } 3460 3461 std::pair<BasicBlock *, Value *> 3462 InnerLoopVectorizer::createVectorizedLoopSkeleton() { 3463 /* 3464 In this function we generate a new loop. The new loop will contain 3465 the vectorized instructions while the old loop will continue to run the 3466 scalar remainder. 3467 3468 [ ] <-- loop iteration number check. 3469 / | 3470 / v 3471 | [ ] <-- vector loop bypass (may consist of multiple blocks). 3472 | / | 3473 | / v 3474 || [ ] <-- vector pre header. 3475 |/ | 3476 | v 3477 | [ ] \ 3478 | [ ]_| <-- vector loop. 3479 | | 3480 | v 3481 \ -[ ] <--- middle-block. 3482 \/ | 3483 /\ v 3484 | ->[ ] <--- new preheader. 3485 | | 3486 (opt) v <-- edge from middle to exit iff epilogue is not required. 3487 | [ ] \ 3488 | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue). 3489 \ | 3490 \ v 3491 >[ ] <-- exit block(s). 3492 ... 3493 */ 3494 3495 // Get the metadata of the original loop before it gets modified. 3496 MDNode *OrigLoopID = OrigLoop->getLoopID(); 3497 3498 // Workaround! Compute the trip count of the original loop and cache it 3499 // before we start modifying the CFG. 
This code has a systemic problem 3500 // wherein it tries to run analysis over partially constructed IR; this is 3501 // wrong, and not simply for SCEV. The trip count of the original loop 3502 // simply happens to be prone to hitting this in practice. In theory, we 3503 // can hit the same issue for any SCEV, or ValueTracking query done during 3504 // mutation. See PR49900. 3505 getOrCreateTripCount(OrigLoop); 3506 3507 // Create an empty vector loop, and prepare basic blocks for the runtime 3508 // checks. 3509 Loop *Lp = createVectorLoopSkeleton(""); 3510 3511 // Now, compare the new count to zero. If it is zero skip the vector loop and 3512 // jump to the scalar loop. This check also covers the case where the 3513 // backedge-taken count is uint##_max: adding one to it will overflow leading 3514 // to an incorrect trip count of zero. In this (rare) case we will also jump 3515 // to the scalar loop. 3516 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader); 3517 3518 // Generate the code to check any assumptions that we've made for SCEV 3519 // expressions. 3520 emitSCEVChecks(Lp, LoopScalarPreHeader); 3521 3522 // Generate the code that checks in runtime if arrays overlap. We put the 3523 // checks into a separate block to make the more common case of few elements 3524 // faster. 3525 emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 3526 3527 createHeaderBranch(Lp); 3528 3529 // Emit phis for the new starting index of the scalar loop. 3530 createInductionResumeValues(Lp); 3531 3532 return {completeLoopSkeleton(Lp, OrigLoopID), nullptr}; 3533 } 3534 3535 // Fix up external users of the induction variable. At this point, we are 3536 // in LCSSA form, with all external PHIs that use the IV having one input value, 3537 // coming from the remainder loop. We need those PHIs to also have a correct 3538 // value for the IV when arriving directly from the middle block. 3539 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, 3540 const InductionDescriptor &II, 3541 Value *CountRoundDown, Value *EndValue, 3542 BasicBlock *MiddleBlock) { 3543 // There are two kinds of external IV usages - those that use the value 3544 // computed in the last iteration (the PHI) and those that use the penultimate 3545 // value (the value that feeds into the phi from the loop latch). 3546 // We allow both, but they, obviously, have different values. 3547 3548 assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block"); 3549 3550 DenseMap<Value *, Value *> MissingVals; 3551 3552 // An external user of the last iteration's value should see the value that 3553 // the remainder loop uses to initialize its own IV. 3554 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch()); 3555 for (User *U : PostInc->users()) { 3556 Instruction *UI = cast<Instruction>(U); 3557 if (!OrigLoop->contains(UI)) { 3558 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3559 MissingVals[UI] = EndValue; 3560 } 3561 } 3562 3563 // An external user of the penultimate value need to see EndValue - Step. 3564 // The simplest way to get this is to recompute it from the constituent SCEVs, 3565 // that is Start + (Step * (CRD - 1)). 3566 for (User *U : OrigPhi->users()) { 3567 auto *UI = cast<Instruction>(U); 3568 if (!OrigLoop->contains(UI)) { 3569 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3570 3571 IRBuilder<> B(MiddleBlock->getTerminator()); 3572 3573 // Fast-math-flags propagate from the original induction instruction. 
3574 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) 3575 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); 3576 3577 Value *CountMinusOne = B.CreateSub( 3578 CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1)); 3579 Value *CMO = 3580 !II.getStep()->getType()->isIntegerTy() 3581 ? B.CreateCast(Instruction::SIToFP, CountMinusOne, 3582 II.getStep()->getType()) 3583 : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType()); 3584 CMO->setName("cast.cmo"); 3585 3586 Value *Step = CreateStepValue(II.getStep(), *PSE.getSE(), 3587 LoopVectorBody->getTerminator()); 3588 Value *Escape = 3589 emitTransformedIndex(B, CMO, II.getStartValue(), Step, II); 3590 Escape->setName("ind.escape"); 3591 MissingVals[UI] = Escape; 3592 } 3593 } 3594 3595 for (auto &I : MissingVals) { 3596 PHINode *PHI = cast<PHINode>(I.first); 3597 // One corner case we have to handle is two IVs "chasing" each-other, 3598 // that is %IV2 = phi [...], [ %IV1, %latch ] 3599 // In this case, if IV1 has an external use, we need to avoid adding both 3600 // "last value of IV1" and "penultimate value of IV2". So, verify that we 3601 // don't already have an incoming value for the middle block. 3602 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) 3603 PHI->addIncoming(I.second, MiddleBlock); 3604 } 3605 } 3606 3607 namespace { 3608 3609 struct CSEDenseMapInfo { 3610 static bool canHandle(const Instruction *I) { 3611 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || 3612 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); 3613 } 3614 3615 static inline Instruction *getEmptyKey() { 3616 return DenseMapInfo<Instruction *>::getEmptyKey(); 3617 } 3618 3619 static inline Instruction *getTombstoneKey() { 3620 return DenseMapInfo<Instruction *>::getTombstoneKey(); 3621 } 3622 3623 static unsigned getHashValue(const Instruction *I) { 3624 assert(canHandle(I) && "Unknown instruction!"); 3625 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), 3626 I->value_op_end())); 3627 } 3628 3629 static bool isEqual(const Instruction *LHS, const Instruction *RHS) { 3630 if (LHS == getEmptyKey() || RHS == getEmptyKey() || 3631 LHS == getTombstoneKey() || RHS == getTombstoneKey()) 3632 return LHS == RHS; 3633 return LHS->isIdenticalTo(RHS); 3634 } 3635 }; 3636 3637 } // end anonymous namespace 3638 3639 ///Perform cse of induction variable instructions. 3640 static void cse(BasicBlock *BB) { 3641 // Perform simple cse. 3642 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 3643 for (Instruction &In : llvm::make_early_inc_range(*BB)) { 3644 if (!CSEDenseMapInfo::canHandle(&In)) 3645 continue; 3646 3647 // Check if we can replace this instruction with any of the 3648 // visited instructions. 3649 if (Instruction *V = CSEMap.lookup(&In)) { 3650 In.replaceAllUsesWith(V); 3651 In.eraseFromParent(); 3652 continue; 3653 } 3654 3655 CSEMap[&In] = &In; 3656 } 3657 } 3658 3659 InstructionCost 3660 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF, 3661 bool &NeedToScalarize) const { 3662 Function *F = CI->getCalledFunction(); 3663 Type *ScalarRetTy = CI->getType(); 3664 SmallVector<Type *, 4> Tys, ScalarTys; 3665 for (auto &ArgOp : CI->args()) 3666 ScalarTys.push_back(ArgOp->getType()); 3667 3668 // Estimate cost of scalarized vector call. 
The source operands are assumed 3669 // to be vectors, so we need to extract individual elements from there, 3670 // execute VF scalar calls, and then gather the result into the vector return 3671 // value. 3672 InstructionCost ScalarCallCost = 3673 TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput); 3674 if (VF.isScalar()) 3675 return ScalarCallCost; 3676 3677 // Compute corresponding vector type for return value and arguments. 3678 Type *RetTy = ToVectorTy(ScalarRetTy, VF); 3679 for (Type *ScalarTy : ScalarTys) 3680 Tys.push_back(ToVectorTy(ScalarTy, VF)); 3681 3682 // Compute costs of unpacking argument values for the scalar calls and 3683 // packing the return values to a vector. 3684 InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF); 3685 3686 InstructionCost Cost = 3687 ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost; 3688 3689 // If we can't emit a vector call for this function, then the currently found 3690 // cost is the cost we need to return. 3691 NeedToScalarize = true; 3692 VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 3693 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); 3694 3695 if (!TLI || CI->isNoBuiltin() || !VecFunc) 3696 return Cost; 3697 3698 // If the corresponding vector cost is cheaper, return its cost. 3699 InstructionCost VectorCallCost = 3700 TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput); 3701 if (VectorCallCost < Cost) { 3702 NeedToScalarize = false; 3703 Cost = VectorCallCost; 3704 } 3705 return Cost; 3706 } 3707 3708 static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) { 3709 if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy())) 3710 return Elt; 3711 return VectorType::get(Elt, VF); 3712 } 3713 3714 InstructionCost 3715 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, 3716 ElementCount VF) const { 3717 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 3718 assert(ID && "Expected intrinsic call!"); 3719 Type *RetTy = MaybeVectorizeType(CI->getType(), VF); 3720 FastMathFlags FMF; 3721 if (auto *FPMO = dyn_cast<FPMathOperator>(CI)) 3722 FMF = FPMO->getFastMathFlags(); 3723 3724 SmallVector<const Value *> Arguments(CI->args()); 3725 FunctionType *FTy = CI->getCalledFunction()->getFunctionType(); 3726 SmallVector<Type *> ParamTys; 3727 std::transform(FTy->param_begin(), FTy->param_end(), 3728 std::back_inserter(ParamTys), 3729 [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); }); 3730 3731 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF, 3732 dyn_cast<IntrinsicInst>(CI)); 3733 return TTI.getIntrinsicInstrCost(CostAttrs, 3734 TargetTransformInfo::TCK_RecipThroughput); 3735 } 3736 3737 static Type *smallestIntegerVectorType(Type *T1, Type *T2) { 3738 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3739 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3740 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; 3741 } 3742 3743 static Type *largestIntegerVectorType(Type *T1, Type *T2) { 3744 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3745 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3746 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; 3747 } 3748 3749 void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) { 3750 // For every instruction `I` in MinBWs, truncate the operands, create a 3751 // truncated version of `I` and reextend its result. 
InstCombine runs 3752 // later and will remove any ext/trunc pairs. 3753 SmallPtrSet<Value *, 4> Erased; 3754 for (const auto &KV : Cost->getMinimalBitwidths()) { 3755 // If the value wasn't vectorized, we must maintain the original scalar 3756 // type. The absence of the value from State indicates that it 3757 // wasn't vectorized. 3758 // FIXME: Should not rely on getVPValue at this point. 3759 VPValue *Def = State.Plan->getVPValue(KV.first, true); 3760 if (!State.hasAnyVectorValue(Def)) 3761 continue; 3762 for (unsigned Part = 0; Part < UF; ++Part) { 3763 Value *I = State.get(Def, Part); 3764 if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I)) 3765 continue; 3766 Type *OriginalTy = I->getType(); 3767 Type *ScalarTruncatedTy = 3768 IntegerType::get(OriginalTy->getContext(), KV.second); 3769 auto *TruncatedTy = VectorType::get( 3770 ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getElementCount()); 3771 if (TruncatedTy == OriginalTy) 3772 continue; 3773 3774 IRBuilder<> B(cast<Instruction>(I)); 3775 auto ShrinkOperand = [&](Value *V) -> Value * { 3776 if (auto *ZI = dyn_cast<ZExtInst>(V)) 3777 if (ZI->getSrcTy() == TruncatedTy) 3778 return ZI->getOperand(0); 3779 return B.CreateZExtOrTrunc(V, TruncatedTy); 3780 }; 3781 3782 // The actual instruction modification depends on the instruction type, 3783 // unfortunately. 3784 Value *NewI = nullptr; 3785 if (auto *BO = dyn_cast<BinaryOperator>(I)) { 3786 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)), 3787 ShrinkOperand(BO->getOperand(1))); 3788 3789 // Any wrapping introduced by shrinking this operation shouldn't be 3790 // considered undefined behavior. So, we can't unconditionally copy 3791 // arithmetic wrapping flags to NewI. 3792 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false); 3793 } else if (auto *CI = dyn_cast<ICmpInst>(I)) { 3794 NewI = 3795 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)), 3796 ShrinkOperand(CI->getOperand(1))); 3797 } else if (auto *SI = dyn_cast<SelectInst>(I)) { 3798 NewI = B.CreateSelect(SI->getCondition(), 3799 ShrinkOperand(SI->getTrueValue()), 3800 ShrinkOperand(SI->getFalseValue())); 3801 } else if (auto *CI = dyn_cast<CastInst>(I)) { 3802 switch (CI->getOpcode()) { 3803 default: 3804 llvm_unreachable("Unhandled cast!"); 3805 case Instruction::Trunc: 3806 NewI = ShrinkOperand(CI->getOperand(0)); 3807 break; 3808 case Instruction::SExt: 3809 NewI = B.CreateSExtOrTrunc( 3810 CI->getOperand(0), 3811 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3812 break; 3813 case Instruction::ZExt: 3814 NewI = B.CreateZExtOrTrunc( 3815 CI->getOperand(0), 3816 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3817 break; 3818 } 3819 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { 3820 auto Elements0 = 3821 cast<VectorType>(SI->getOperand(0)->getType())->getElementCount(); 3822 auto *O0 = B.CreateZExtOrTrunc( 3823 SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0)); 3824 auto Elements1 = 3825 cast<VectorType>(SI->getOperand(1)->getType())->getElementCount(); 3826 auto *O1 = B.CreateZExtOrTrunc( 3827 SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1)); 3828 3829 NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask()); 3830 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) { 3831 // Don't do anything with the operands, just extend the result. 
3832 continue; 3833 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { 3834 auto Elements = 3835 cast<VectorType>(IE->getOperand(0)->getType())->getElementCount(); 3836 auto *O0 = B.CreateZExtOrTrunc( 3837 IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 3838 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); 3839 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); 3840 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { 3841 auto Elements = 3842 cast<VectorType>(EE->getOperand(0)->getType())->getElementCount(); 3843 auto *O0 = B.CreateZExtOrTrunc( 3844 EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 3845 NewI = B.CreateExtractElement(O0, EE->getOperand(2)); 3846 } else { 3847 // If we don't know what to do, be conservative and don't do anything. 3848 continue; 3849 } 3850 3851 // Lastly, extend the result. 3852 NewI->takeName(cast<Instruction>(I)); 3853 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); 3854 I->replaceAllUsesWith(Res); 3855 cast<Instruction>(I)->eraseFromParent(); 3856 Erased.insert(I); 3857 State.reset(Def, Res, Part); 3858 } 3859 } 3860 3861 // We'll have created a bunch of ZExts that are now parentless. Clean up. 3862 for (const auto &KV : Cost->getMinimalBitwidths()) { 3863 // If the value wasn't vectorized, we must maintain the original scalar 3864 // type. The absence of the value from State indicates that it 3865 // wasn't vectorized. 3866 // FIXME: Should not rely on getVPValue at this point. 3867 VPValue *Def = State.Plan->getVPValue(KV.first, true); 3868 if (!State.hasAnyVectorValue(Def)) 3869 continue; 3870 for (unsigned Part = 0; Part < UF; ++Part) { 3871 Value *I = State.get(Def, Part); 3872 ZExtInst *Inst = dyn_cast<ZExtInst>(I); 3873 if (Inst && Inst->use_empty()) { 3874 Value *NewI = Inst->getOperand(0); 3875 Inst->eraseFromParent(); 3876 State.reset(Def, NewI, Part); 3877 } 3878 } 3879 } 3880 } 3881 3882 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) { 3883 // Insert truncates and extends for any truncated instructions as hints to 3884 // InstCombine. 3885 if (VF.isVector()) 3886 truncateToMinimalBitwidths(State); 3887 3888 // Fix widened non-induction PHIs by setting up the PHI operands. 3889 if (OrigPHIsToFix.size()) { 3890 assert(EnableVPlanNativePath && 3891 "Unexpected non-induction PHIs for fixup in non VPlan-native path"); 3892 fixNonInductionPHIs(State); 3893 } 3894 3895 // At this point every instruction in the original loop is widened to a 3896 // vector form. Now we need to fix the recurrences in the loop. These PHI 3897 // nodes are currently empty because we did not want to introduce cycles. 3898 // This is the second stage of vectorizing recurrences. 3899 fixCrossIterationPHIs(State); 3900 3901 // Forget the original basic block. 3902 PSE.getSE()->forgetLoop(OrigLoop); 3903 3904 // If we inserted an edge from the middle block to the unique exit block, 3905 // update uses outside the loop (phis) to account for the newly inserted 3906 // edge. 3907 if (!Cost->requiresScalarEpilogue(VF)) { 3908 // Fix-up external users of the induction variables. 3909 for (auto &Entry : Legal->getInductionVars()) 3910 fixupIVUsers(Entry.first, Entry.second, 3911 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)), 3912 IVEndValues[Entry.first], LoopMiddleBlock); 3913 3914 fixLCSSAPHIs(State); 3915 } 3916 3917 for (Instruction *PI : PredicatedInstructions) 3918 sinkScalarOperands(&*PI); 3919 3920 // Remove redundant induction instructions. 
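  // For example (sketch), unrolling the induction updates can leave several
  // identical address computations behind, such as two copies of
  //   %gep = getelementptr inbounds i32, i32* %base, i64 %index
  // in the vector body; the simple CSE below collapses such duplicates.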
3921   cse(LoopVectorBody);
3922 
3923   // Set/update profile weights for the vector and remainder loops as original
3924   // loop iterations are now distributed among them. Note that the original loop
3925   // represented by LoopScalarBody becomes the remainder loop after vectorization.
3926   //
3927   // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
3928   // end up with a slightly less accurate result, but that should be OK since
3929   // the profile is not inherently precise anyway. Note also that a possible
3930   // bypass of vector code caused by legality checks is ignored, optimistically
3931   // assigning all the weight to the vector loop.
3932   //
3933   // For scalable vectorization we can't know at compile time how many iterations
3934   // of the loop are handled in one vector iteration, so instead assume a pessimistic
3935   // vscale of '1'.
3936   setProfileInfoAfterUnrolling(
3937       LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody),
3938       LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF);
3939 }
3940 
3941 void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) {
3942   // In order to support recurrences we need to be able to vectorize Phi nodes.
3943   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3944   // stage #2: We now need to fix the recurrences by adding incoming edges to
3945   // the currently empty PHI nodes. At this point every instruction in the
3946   // original loop is widened to a vector form, so we can use the widened values
3947   // to construct the incoming edges.
3948   VPBasicBlock *Header = State.Plan->getEntry()->getEntryBasicBlock();
3949   for (VPRecipeBase &R : Header->phis()) {
3950     if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R))
3951       fixReduction(ReductionPhi, State);
3952     else if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
3953       fixFirstOrderRecurrence(FOR, State);
3954   }
3955 }
3956 
3957 void InnerLoopVectorizer::fixFirstOrderRecurrence(
3958     VPFirstOrderRecurrencePHIRecipe *PhiR, VPTransformState &State) {
3959   // This is the second phase of vectorizing first-order recurrences. An
3960   // overview of the transformation is described below. Suppose we have the
3961   // following loop.
3962   //
3963   //   for (int i = 0; i < n; ++i)
3964   //     b[i] = a[i] - a[i - 1];
3965   //
3966   // There is a first-order recurrence on "a". For this loop, the shorthand
3967   // scalar IR looks like:
3968   //
3969   //   scalar.ph:
3970   //     s_init = a[-1]
3971   //     br scalar.body
3972   //
3973   //   scalar.body:
3974   //     i = phi [0, scalar.ph], [i+1, scalar.body]
3975   //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3976   //     s2 = a[i]
3977   //     b[i] = s2 - s1
3978   //     br cond, scalar.body, ...
3979   //
3980   // In this example, s1 is a recurrence because its value depends on the
3981   // previous iteration. In the first phase of vectorization, we created a
3982   // vector phi v1 for s1. We now complete the vectorization and produce the
3983   // shorthand vector IR shown below (for VF = 4, UF = 1).
3984 // 3985 // vector.ph: 3986 // v_init = vector(..., ..., ..., a[-1]) 3987 // br vector.body 3988 // 3989 // vector.body 3990 // i = phi [0, vector.ph], [i+4, vector.body] 3991 // v1 = phi [v_init, vector.ph], [v2, vector.body] 3992 // v2 = a[i, i+1, i+2, i+3]; 3993 // v3 = vector(v1(3), v2(0, 1, 2)) 3994 // b[i, i+1, i+2, i+3] = v2 - v3 3995 // br cond, vector.body, middle.block 3996 // 3997 // middle.block: 3998 // x = v2(3) 3999 // br scalar.ph 4000 // 4001 // scalar.ph: 4002 // s_init = phi [x, middle.block], [a[-1], otherwise] 4003 // br scalar.body 4004 // 4005 // After execution completes the vector loop, we extract the next value of 4006 // the recurrence (x) to use as the initial value in the scalar loop. 4007 4008 // Extract the last vector element in the middle block. This will be the 4009 // initial value for the recurrence when jumping to the scalar loop. 4010 VPValue *PreviousDef = PhiR->getBackedgeValue(); 4011 Value *Incoming = State.get(PreviousDef, UF - 1); 4012 auto *ExtractForScalar = Incoming; 4013 auto *IdxTy = Builder.getInt32Ty(); 4014 if (VF.isVector()) { 4015 auto *One = ConstantInt::get(IdxTy, 1); 4016 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4017 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF); 4018 auto *LastIdx = Builder.CreateSub(RuntimeVF, One); 4019 ExtractForScalar = Builder.CreateExtractElement(ExtractForScalar, LastIdx, 4020 "vector.recur.extract"); 4021 } 4022 // Extract the second last element in the middle block if the 4023 // Phi is used outside the loop. We need to extract the phi itself 4024 // and not the last element (the phi update in the current iteration). This 4025 // will be the value when jumping to the exit block from the LoopMiddleBlock, 4026 // when the scalar loop is not run at all. 4027 Value *ExtractForPhiUsedOutsideLoop = nullptr; 4028 if (VF.isVector()) { 4029 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF); 4030 auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2)); 4031 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement( 4032 Incoming, Idx, "vector.recur.extract.for.phi"); 4033 } else if (UF > 1) 4034 // When loop is unrolled without vectorizing, initialize 4035 // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value 4036 // of `Incoming`. This is analogous to the vectorized case above: extracting 4037 // the second last element when VF > 1. 4038 ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2); 4039 4040 // Fix the initial value of the original recurrence in the scalar loop. 4041 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin()); 4042 PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue()); 4043 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init"); 4044 auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue(); 4045 for (auto *BB : predecessors(LoopScalarPreHeader)) { 4046 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit; 4047 Start->addIncoming(Incoming, BB); 4048 } 4049 4050 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start); 4051 Phi->setName("scalar.recur"); 4052 4053 // Finally, fix users of the recurrence outside the loop. The users will need 4054 // either the last value of the scalar recurrence or the last value of the 4055 // vector recurrence we extracted in the middle block. Since the loop is in 4056 // LCSSA form, we just need to find all the phi nodes for the original scalar 4057 // recurrence in the exit block, and then add an edge for the middle block. 
  // Note that LCSSA does not imply single entry when the original scalar loop
  // had multiple exiting edges (as we always run the last iteration in the
  // scalar epilogue); in that case, there is no edge from middle to exit and
  // thus no phis that need updating.
  if (!Cost->requiresScalarEpilogue(VF))
    for (PHINode &LCSSAPhi : LoopExitBlock->phis())
      if (llvm::is_contained(LCSSAPhi.incoming_values(), Phi))
        LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
}

void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
                                       VPTransformState &State) {
  PHINode *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
  // Get its reduction variable descriptor.
  assert(Legal->isReductionVariable(OrigPhi) &&
         "Unable to find the reduction variable");
  const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();

  RecurKind RK = RdxDesc.getRecurrenceKind();
  TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
  Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
  setDebugLocFromInst(ReductionStartValue);

  VPValue *LoopExitInstDef = PhiR->getBackedgeValue();
  // This is the vector-clone of the value that leaves the loop.
  Type *VecTy = State.get(LoopExitInstDef, 0)->getType();

  // Wrap flags are in general invalid after vectorization, so clear them.
  clearReductionWrapFlags(RdxDesc, State);

  // Before each round, move the insertion point right between
  // the PHIs and the values we are going to write.
  // This allows us to write both PHINodes and the extractelement
  // instructions.
  Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());

  setDebugLocFromInst(LoopExitInst);

  Type *PhiTy = OrigPhi->getType();
  // If the tail is folded by masking, the vector value that leaves the loop
  // should be a select choosing between the vectorized LoopExitInst and the
  // vectorized Phi, instead of the former. For an inloop reduction the
  // reduction will already be predicated, and does not need to be handled
  // here.
  if (Cost->foldTailByMasking() && !PhiR->isInLoop()) {
    for (unsigned Part = 0; Part < UF; ++Part) {
      Value *VecLoopExitInst = State.get(LoopExitInstDef, Part);
      Value *Sel = nullptr;
      for (User *U : VecLoopExitInst->users()) {
        if (isa<SelectInst>(U)) {
          assert(!Sel && "Reduction exit feeding two selects");
          Sel = U;
        } else
          assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
      }
      assert(Sel && "Reduction exit feeds no select");
      State.reset(LoopExitInstDef, Sel, Part);

      // If the target can create a predicated operator for the reduction at no
      // extra cost in the loop (for example a predicated vadd), it can be
      // cheaper for the select to remain in the loop than be sunk out of it,
      // and so use the select value for the phi instead of the old
      // LoopExitValue.
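      //
      // A shorthand sketch (illustrative only) for an add reduction with a
      // folded tail; the select is created in the loop body in both cases,
      // the difference is which value feeds the recurrence phi on the
      // backedge:
      //
      //   ; default: the phi is fed by the add, the select only exits the loop
      //   vec.phi = phi [zeroinitializer, ph], [add, latch]
      //   add     = add <4 x i32> vec.phi, val
      //   sel     = select <4 x i1> mask, <4 x i32> add, <4 x i32> vec.phi
      //
      //   ; preferred predicated form: the select also feeds the phi
      //   vec.phi = phi [zeroinitializer, ph], [sel, latch]
      //   add     = add <4 x i32> vec.phi, val
      //   sel     = select <4 x i1> mask, <4 x i32> add, <4 x i32> vec.phi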
4120 if (PreferPredicatedReductionSelect || 4121 TTI->preferPredicatedReductionSelect( 4122 RdxDesc.getOpcode(), PhiTy, 4123 TargetTransformInfo::ReductionFlags())) { 4124 auto *VecRdxPhi = 4125 cast<PHINode>(State.get(PhiR, Part)); 4126 VecRdxPhi->setIncomingValueForBlock( 4127 LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel); 4128 } 4129 } 4130 } 4131 4132 // If the vector reduction can be performed in a smaller type, we truncate 4133 // then extend the loop exit value to enable InstCombine to evaluate the 4134 // entire expression in the smaller type. 4135 if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) { 4136 assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!"); 4137 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); 4138 Builder.SetInsertPoint( 4139 LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator()); 4140 VectorParts RdxParts(UF); 4141 for (unsigned Part = 0; Part < UF; ++Part) { 4142 RdxParts[Part] = State.get(LoopExitInstDef, Part); 4143 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4144 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy) 4145 : Builder.CreateZExt(Trunc, VecTy); 4146 for (User *U : llvm::make_early_inc_range(RdxParts[Part]->users())) 4147 if (U != Trunc) { 4148 U->replaceUsesOfWith(RdxParts[Part], Extnd); 4149 RdxParts[Part] = Extnd; 4150 } 4151 } 4152 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 4153 for (unsigned Part = 0; Part < UF; ++Part) { 4154 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4155 State.reset(LoopExitInstDef, RdxParts[Part], Part); 4156 } 4157 } 4158 4159 // Reduce all of the unrolled parts into a single vector. 4160 Value *ReducedPartRdx = State.get(LoopExitInstDef, 0); 4161 unsigned Op = RecurrenceDescriptor::getOpcode(RK); 4162 4163 // The middle block terminator has already been assigned a DebugLoc here (the 4164 // OrigLoop's single latch terminator). We want the whole middle block to 4165 // appear to execute on this line because: (a) it is all compiler generated, 4166 // (b) these instructions are always executed after evaluating the latch 4167 // conditional branch, and (c) other passes may add new predecessors which 4168 // terminate on this line. This is the easiest way to ensure we don't 4169 // accidentally cause an extra step back into the loop while debugging. 4170 setDebugLocFromInst(LoopMiddleBlock->getTerminator()); 4171 if (PhiR->isOrdered()) 4172 ReducedPartRdx = State.get(LoopExitInstDef, UF - 1); 4173 else { 4174 // Floating-point operations should have some FMF to enable the reduction. 4175 IRBuilderBase::FastMathFlagGuard FMFG(Builder); 4176 Builder.setFastMathFlags(RdxDesc.getFastMathFlags()); 4177 for (unsigned Part = 1; Part < UF; ++Part) { 4178 Value *RdxPart = State.get(LoopExitInstDef, Part); 4179 if (Op != Instruction::ICmp && Op != Instruction::FCmp) { 4180 ReducedPartRdx = Builder.CreateBinOp( 4181 (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx"); 4182 } else if (RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK)) 4183 ReducedPartRdx = createSelectCmpOp(Builder, ReductionStartValue, RK, 4184 ReducedPartRdx, RdxPart); 4185 else 4186 ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart); 4187 } 4188 } 4189 4190 // Create the reduction after the loop. Note that inloop reductions create the 4191 // target reduction in the loop using a Reduction recipe. 
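  // For a not-in-loop add reduction with VF = 4 this amounts to something like
  // the following in the middle block (shorthand, illustrative only):
  //
  //   rdx = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> bin.rdx)
  //
  // followed, if the reduction was narrowed above, by a sext/zext back to the
  // original phi type.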
4192 if (VF.isVector() && !PhiR->isInLoop()) { 4193 ReducedPartRdx = 4194 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, OrigPhi); 4195 // If the reduction can be performed in a smaller type, we need to extend 4196 // the reduction to the wider type before we branch to the original loop. 4197 if (PhiTy != RdxDesc.getRecurrenceType()) 4198 ReducedPartRdx = RdxDesc.isSigned() 4199 ? Builder.CreateSExt(ReducedPartRdx, PhiTy) 4200 : Builder.CreateZExt(ReducedPartRdx, PhiTy); 4201 } 4202 4203 PHINode *ResumePhi = 4204 dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue()); 4205 4206 // Create a phi node that merges control-flow from the backedge-taken check 4207 // block and the middle block. 4208 PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx", 4209 LoopScalarPreHeader->getTerminator()); 4210 4211 // If we are fixing reductions in the epilogue loop then we should already 4212 // have created a bc.merge.rdx Phi after the main vector body. Ensure that 4213 // we carry over the incoming values correctly. 4214 for (auto *Incoming : predecessors(LoopScalarPreHeader)) { 4215 if (Incoming == LoopMiddleBlock) 4216 BCBlockPhi->addIncoming(ReducedPartRdx, Incoming); 4217 else if (ResumePhi && llvm::is_contained(ResumePhi->blocks(), Incoming)) 4218 BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming), 4219 Incoming); 4220 else 4221 BCBlockPhi->addIncoming(ReductionStartValue, Incoming); 4222 } 4223 4224 // Set the resume value for this reduction 4225 ReductionResumeValues.insert({&RdxDesc, BCBlockPhi}); 4226 4227 // Now, we need to fix the users of the reduction variable 4228 // inside and outside of the scalar remainder loop. 4229 4230 // We know that the loop is in LCSSA form. We need to update the PHI nodes 4231 // in the exit blocks. See comment on analogous loop in 4232 // fixFirstOrderRecurrence for a more complete explaination of the logic. 4233 if (!Cost->requiresScalarEpilogue(VF)) 4234 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) 4235 if (llvm::is_contained(LCSSAPhi.incoming_values(), LoopExitInst)) 4236 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock); 4237 4238 // Fix the scalar loop reduction variable with the incoming reduction sum 4239 // from the vector body and from the backedge value. 4240 int IncomingEdgeBlockIdx = 4241 OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch()); 4242 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); 4243 // Pick the other block. 4244 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); 4245 OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); 4246 OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); 4247 } 4248 4249 void InnerLoopVectorizer::clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc, 4250 VPTransformState &State) { 4251 RecurKind RK = RdxDesc.getRecurrenceKind(); 4252 if (RK != RecurKind::Add && RK != RecurKind::Mul) 4253 return; 4254 4255 Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr(); 4256 assert(LoopExitInstr && "null loop exit instruction"); 4257 SmallVector<Instruction *, 8> Worklist; 4258 SmallPtrSet<Instruction *, 8> Visited; 4259 Worklist.push_back(LoopExitInstr); 4260 Visited.insert(LoopExitInstr); 4261 4262 while (!Worklist.empty()) { 4263 Instruction *Cur = Worklist.pop_back_val(); 4264 if (isa<OverflowingBinaryOperator>(Cur)) 4265 for (unsigned Part = 0; Part < UF; ++Part) { 4266 // FIXME: Should not rely on getVPValue at this point. 
4267 Value *V = State.get(State.Plan->getVPValue(Cur, true), Part); 4268 cast<Instruction>(V)->dropPoisonGeneratingFlags(); 4269 } 4270 4271 for (User *U : Cur->users()) { 4272 Instruction *UI = cast<Instruction>(U); 4273 if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) && 4274 Visited.insert(UI).second) 4275 Worklist.push_back(UI); 4276 } 4277 } 4278 } 4279 4280 void InnerLoopVectorizer::fixLCSSAPHIs(VPTransformState &State) { 4281 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 4282 if (LCSSAPhi.getBasicBlockIndex(LoopMiddleBlock) != -1) 4283 // Some phis were already hand updated by the reduction and recurrence 4284 // code above, leave them alone. 4285 continue; 4286 4287 auto *IncomingValue = LCSSAPhi.getIncomingValue(0); 4288 // Non-instruction incoming values will have only one value. 4289 4290 VPLane Lane = VPLane::getFirstLane(); 4291 if (isa<Instruction>(IncomingValue) && 4292 !Cost->isUniformAfterVectorization(cast<Instruction>(IncomingValue), 4293 VF)) 4294 Lane = VPLane::getLastLaneForVF(VF); 4295 4296 // Can be a loop invariant incoming value or the last scalar value to be 4297 // extracted from the vectorized loop. 4298 // FIXME: Should not rely on getVPValue at this point. 4299 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4300 Value *lastIncomingValue = 4301 OrigLoop->isLoopInvariant(IncomingValue) 4302 ? IncomingValue 4303 : State.get(State.Plan->getVPValue(IncomingValue, true), 4304 VPIteration(UF - 1, Lane)); 4305 LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock); 4306 } 4307 } 4308 4309 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { 4310 // The basic block and loop containing the predicated instruction. 4311 auto *PredBB = PredInst->getParent(); 4312 auto *VectorLoop = LI->getLoopFor(PredBB); 4313 4314 // Initialize a worklist with the operands of the predicated instruction. 4315 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end()); 4316 4317 // Holds instructions that we need to analyze again. An instruction may be 4318 // reanalyzed if we don't yet know if we can sink it or not. 4319 SmallVector<Instruction *, 8> InstsToReanalyze; 4320 4321 // Returns true if a given use occurs in the predicated block. Phi nodes use 4322 // their operands in their corresponding predecessor blocks. 4323 auto isBlockOfUsePredicated = [&](Use &U) -> bool { 4324 auto *I = cast<Instruction>(U.getUser()); 4325 BasicBlock *BB = I->getParent(); 4326 if (auto *Phi = dyn_cast<PHINode>(I)) 4327 BB = Phi->getIncomingBlock( 4328 PHINode::getIncomingValueNumForOperand(U.getOperandNo())); 4329 return BB == PredBB; 4330 }; 4331 4332 // Iteratively sink the scalarized operands of the predicated instruction 4333 // into the block we created for it. When an instruction is sunk, it's 4334 // operands are then added to the worklist. The algorithm ends after one pass 4335 // through the worklist doesn't sink a single instruction. 4336 bool Changed; 4337 do { 4338 // Add the instructions that need to be reanalyzed to the worklist, and 4339 // reset the changed indicator. 4340 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end()); 4341 InstsToReanalyze.clear(); 4342 Changed = false; 4343 4344 while (!Worklist.empty()) { 4345 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val()); 4346 4347 // We can't sink an instruction if it is a phi node, is not in the loop, 4348 // or may have side effects. 
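      // (For instance, a call that may write memory is never sunk, even if its
      // only user is the predicated instruction, since executing it
      // conditionally would change observable behaviour; it is simply skipped
      // here.)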
4349 if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) || 4350 I->mayHaveSideEffects()) 4351 continue; 4352 4353 // If the instruction is already in PredBB, check if we can sink its 4354 // operands. In that case, VPlan's sinkScalarOperands() succeeded in 4355 // sinking the scalar instruction I, hence it appears in PredBB; but it 4356 // may have failed to sink I's operands (recursively), which we try 4357 // (again) here. 4358 if (I->getParent() == PredBB) { 4359 Worklist.insert(I->op_begin(), I->op_end()); 4360 continue; 4361 } 4362 4363 // It's legal to sink the instruction if all its uses occur in the 4364 // predicated block. Otherwise, there's nothing to do yet, and we may 4365 // need to reanalyze the instruction. 4366 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) { 4367 InstsToReanalyze.push_back(I); 4368 continue; 4369 } 4370 4371 // Move the instruction to the beginning of the predicated block, and add 4372 // it's operands to the worklist. 4373 I->moveBefore(&*PredBB->getFirstInsertionPt()); 4374 Worklist.insert(I->op_begin(), I->op_end()); 4375 4376 // The sinking may have enabled other instructions to be sunk, so we will 4377 // need to iterate. 4378 Changed = true; 4379 } 4380 } while (Changed); 4381 } 4382 4383 void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) { 4384 for (PHINode *OrigPhi : OrigPHIsToFix) { 4385 VPWidenPHIRecipe *VPPhi = 4386 cast<VPWidenPHIRecipe>(State.Plan->getVPValue(OrigPhi)); 4387 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0)); 4388 // Make sure the builder has a valid insert point. 4389 Builder.SetInsertPoint(NewPhi); 4390 for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) { 4391 VPValue *Inc = VPPhi->getIncomingValue(i); 4392 VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i); 4393 NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]); 4394 } 4395 } 4396 } 4397 4398 bool InnerLoopVectorizer::useOrderedReductions( 4399 const RecurrenceDescriptor &RdxDesc) { 4400 return Cost->useOrderedReductions(RdxDesc); 4401 } 4402 4403 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, 4404 VPWidenPHIRecipe *PhiR, 4405 VPTransformState &State) { 4406 PHINode *P = cast<PHINode>(PN); 4407 if (EnableVPlanNativePath) { 4408 // Currently we enter here in the VPlan-native path for non-induction 4409 // PHIs where all control flow is uniform. We simply widen these PHIs. 4410 // Create a vector phi with no operands - the vector phi operands will be 4411 // set at the end of vector code generation. 4412 Type *VecTy = (State.VF.isScalar()) 4413 ? PN->getType() 4414 : VectorType::get(PN->getType(), State.VF); 4415 Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi"); 4416 State.set(PhiR, VecPhi, 0); 4417 OrigPHIsToFix.push_back(P); 4418 4419 return; 4420 } 4421 4422 assert(PN->getParent() == OrigLoop->getHeader() && 4423 "Non-header phis should have been handled elsewhere"); 4424 4425 // In order to support recurrences we need to be able to vectorize Phi nodes. 4426 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 4427 // stage #1: We create a new vector PHI node with no incoming edges. We'll use 4428 // this value when we vectorize all of the instructions that use the PHI. 4429 4430 assert(!Legal->isReductionVariable(P) && 4431 "reductions should be handled elsewhere"); 4432 4433 setDebugLocFromInst(P); 4434 4435 // This PHINode must be an induction variable. 4436 // Make sure that we know about it. 
4437 assert(Legal->getInductionVars().count(P) && "Not an induction variable"); 4438 4439 InductionDescriptor II = Legal->getInductionVars().lookup(P); 4440 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 4441 4442 auto *IVR = PhiR->getParent()->getPlan()->getCanonicalIV(); 4443 PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0)); 4444 4445 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 4446 // which can be found from the original scalar operations. 4447 switch (II.getKind()) { 4448 case InductionDescriptor::IK_NoInduction: 4449 llvm_unreachable("Unknown induction"); 4450 case InductionDescriptor::IK_IntInduction: 4451 case InductionDescriptor::IK_FpInduction: 4452 llvm_unreachable("Integer/fp induction is handled elsewhere."); 4453 case InductionDescriptor::IK_PtrInduction: { 4454 // Handle the pointer induction variable case. 4455 assert(P->getType()->isPointerTy() && "Unexpected type."); 4456 4457 if (Cost->isScalarAfterVectorization(P, State.VF)) { 4458 // This is the normalized GEP that starts counting at zero. 4459 Value *PtrInd = 4460 Builder.CreateSExtOrTrunc(CanonicalIV, II.getStep()->getType()); 4461 // Determine the number of scalars we need to generate for each unroll 4462 // iteration. If the instruction is uniform, we only need to generate the 4463 // first lane. Otherwise, we generate all VF values. 4464 bool IsUniform = vputils::onlyFirstLaneUsed(PhiR); 4465 assert((IsUniform || !State.VF.isScalable()) && 4466 "Cannot scalarize a scalable VF"); 4467 unsigned Lanes = IsUniform ? 1 : State.VF.getFixedValue(); 4468 4469 for (unsigned Part = 0; Part < UF; ++Part) { 4470 Value *PartStart = 4471 createStepForVF(Builder, PtrInd->getType(), VF, Part); 4472 4473 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 4474 Value *Idx = Builder.CreateAdd( 4475 PartStart, ConstantInt::get(PtrInd->getType(), Lane)); 4476 Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); 4477 4478 Value *Step = CreateStepValue(II.getStep(), *PSE.getSE(), 4479 State.CFG.PrevBB->getTerminator()); 4480 Value *SclrGep = emitTransformedIndex(Builder, GlobalIdx, 4481 II.getStartValue(), Step, II); 4482 SclrGep->setName("next.gep"); 4483 State.set(PhiR, SclrGep, VPIteration(Part, Lane)); 4484 } 4485 } 4486 return; 4487 } 4488 assert(isa<SCEVConstant>(II.getStep()) && 4489 "Induction step not a SCEV constant!"); 4490 Type *PhiType = II.getStep()->getType(); 4491 4492 // Build a pointer phi 4493 Value *ScalarStartValue = PhiR->getStartValue()->getLiveInIRValue(); 4494 Type *ScStValueType = ScalarStartValue->getType(); 4495 PHINode *NewPointerPhi = 4496 PHINode::Create(ScStValueType, 2, "pointer.phi", CanonicalIV); 4497 NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader); 4498 4499 // A pointer induction, performed by using a gep 4500 BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 4501 Instruction *InductionLoc = LoopLatch->getTerminator(); 4502 const SCEV *ScalarStep = II.getStep(); 4503 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 4504 Value *ScalarStepValue = 4505 Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc); 4506 Value *RuntimeVF = getRuntimeVF(Builder, PhiType, VF); 4507 Value *NumUnrolledElems = 4508 Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF)); 4509 Value *InductionGEP = GetElementPtrInst::Create( 4510 II.getElementType(), NewPointerPhi, 4511 Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind", 4512 InductionLoc); 4513 NewPointerPhi->addIncoming(InductionGEP, 
LoopLatch); 4514 4515 // Create UF many actual address geps that use the pointer 4516 // phi as base and a vectorized version of the step value 4517 // (<step*0, ..., step*N>) as offset. 4518 for (unsigned Part = 0; Part < State.UF; ++Part) { 4519 Type *VecPhiType = VectorType::get(PhiType, State.VF); 4520 Value *StartOffsetScalar = 4521 Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part)); 4522 Value *StartOffset = 4523 Builder.CreateVectorSplat(State.VF, StartOffsetScalar); 4524 // Create a vector of consecutive numbers from zero to VF. 4525 StartOffset = 4526 Builder.CreateAdd(StartOffset, Builder.CreateStepVector(VecPhiType)); 4527 4528 Value *GEP = Builder.CreateGEP( 4529 II.getElementType(), NewPointerPhi, 4530 Builder.CreateMul( 4531 StartOffset, Builder.CreateVectorSplat(State.VF, ScalarStepValue), 4532 "vector.gep")); 4533 State.set(PhiR, GEP, Part); 4534 } 4535 } 4536 } 4537 } 4538 4539 /// A helper function for checking whether an integer division-related 4540 /// instruction may divide by zero (in which case it must be predicated if 4541 /// executed conditionally in the scalar code). 4542 /// TODO: It may be worthwhile to generalize and check isKnownNonZero(). 4543 /// Non-zero divisors that are non compile-time constants will not be 4544 /// converted into multiplication, so we will still end up scalarizing 4545 /// the division, but can do so w/o predication. 4546 static bool mayDivideByZero(Instruction &I) { 4547 assert((I.getOpcode() == Instruction::UDiv || 4548 I.getOpcode() == Instruction::SDiv || 4549 I.getOpcode() == Instruction::URem || 4550 I.getOpcode() == Instruction::SRem) && 4551 "Unexpected instruction"); 4552 Value *Divisor = I.getOperand(1); 4553 auto *CInt = dyn_cast<ConstantInt>(Divisor); 4554 return !CInt || CInt->isZero(); 4555 } 4556 4557 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, 4558 VPUser &ArgOperands, 4559 VPTransformState &State) { 4560 assert(!isa<DbgInfoIntrinsic>(I) && 4561 "DbgInfoIntrinsic should have been dropped during VPlan construction"); 4562 setDebugLocFromInst(&I); 4563 4564 Module *M = I.getParent()->getParent()->getParent(); 4565 auto *CI = cast<CallInst>(&I); 4566 4567 SmallVector<Type *, 4> Tys; 4568 for (Value *ArgOperand : CI->args()) 4569 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue())); 4570 4571 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 4572 4573 // The flag shows whether we use Intrinsic or a usual Call for vectorized 4574 // version of the instruction. 4575 // Is it beneficial to perform intrinsic call compared to lib call? 4576 bool NeedToScalarize = false; 4577 InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); 4578 InstructionCost IntrinsicCost = ID ? Cost->getVectorIntrinsicCost(CI, VF) : 0; 4579 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 4580 assert((UseVectorIntrinsic || !NeedToScalarize) && 4581 "Instruction should be scalarized elsewhere."); 4582 assert((IntrinsicCost.isValid() || CallCost.isValid()) && 4583 "Either the intrinsic cost or vector call cost must be valid"); 4584 4585 for (unsigned Part = 0; Part < UF; ++Part) { 4586 SmallVector<Type *, 2> TysForDecl = {CI->getType()}; 4587 SmallVector<Value *, 4> Args; 4588 for (auto &I : enumerate(ArgOperands.operands())) { 4589 // Some intrinsics have a scalar argument - don't replace it with a 4590 // vector. 
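      // (For example, the i32 exponent operand of llvm.powi and the i1 flag
      // operand of llvm.ctlz / llvm.cttz remain scalar in the vectorized
      // call.)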
4591 Value *Arg; 4592 if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index())) 4593 Arg = State.get(I.value(), Part); 4594 else { 4595 Arg = State.get(I.value(), VPIteration(0, 0)); 4596 if (hasVectorInstrinsicOverloadedScalarOpd(ID, I.index())) 4597 TysForDecl.push_back(Arg->getType()); 4598 } 4599 Args.push_back(Arg); 4600 } 4601 4602 Function *VectorF; 4603 if (UseVectorIntrinsic) { 4604 // Use vector version of the intrinsic. 4605 if (VF.isVector()) 4606 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); 4607 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); 4608 assert(VectorF && "Can't retrieve vector intrinsic."); 4609 } else { 4610 // Use vector version of the function call. 4611 const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 4612 #ifndef NDEBUG 4613 assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr && 4614 "Can't create vector function."); 4615 #endif 4616 VectorF = VFDatabase(*CI).getVectorizedFunction(Shape); 4617 } 4618 SmallVector<OperandBundleDef, 1> OpBundles; 4619 CI->getOperandBundlesAsDefs(OpBundles); 4620 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles); 4621 4622 if (isa<FPMathOperator>(V)) 4623 V->copyFastMathFlags(CI); 4624 4625 State.set(Def, V, Part); 4626 addMetadata(V, &I); 4627 } 4628 } 4629 4630 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { 4631 // We should not collect Scalars more than once per VF. Right now, this 4632 // function is called from collectUniformsAndScalars(), which already does 4633 // this check. Collecting Scalars for VF=1 does not make any sense. 4634 assert(VF.isVector() && Scalars.find(VF) == Scalars.end() && 4635 "This function should not be visited twice for the same VF"); 4636 4637 SmallSetVector<Instruction *, 8> Worklist; 4638 4639 // These sets are used to seed the analysis with pointers used by memory 4640 // accesses that will remain scalar. 4641 SmallSetVector<Instruction *, 8> ScalarPtrs; 4642 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; 4643 auto *Latch = TheLoop->getLoopLatch(); 4644 4645 // A helper that returns true if the use of Ptr by MemAccess will be scalar. 4646 // The pointer operands of loads and stores will be scalar as long as the 4647 // memory access is not a gather or scatter operation. The value operand of a 4648 // store will remain scalar if the store is scalarized. 4649 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) { 4650 InstWidening WideningDecision = getWideningDecision(MemAccess, VF); 4651 assert(WideningDecision != CM_Unknown && 4652 "Widening decision should be ready at this moment"); 4653 if (auto *Store = dyn_cast<StoreInst>(MemAccess)) 4654 if (Ptr == Store->getValueOperand()) 4655 return WideningDecision == CM_Scalarize; 4656 assert(Ptr == getLoadStorePointerOperand(MemAccess) && 4657 "Ptr is neither a value or pointer operand"); 4658 return WideningDecision != CM_GatherScatter; 4659 }; 4660 4661 // A helper that returns true if the given value is a bitcast or 4662 // getelementptr instruction contained in the loop. 4663 auto isLoopVaryingBitCastOrGEP = [&](Value *V) { 4664 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) || 4665 isa<GetElementPtrInst>(V)) && 4666 !TheLoop->isLoopInvariant(V); 4667 }; 4668 4669 // A helper that evaluates a memory access's use of a pointer. If the use will 4670 // be a scalar use and the pointer is only used by memory accesses, we place 4671 // the pointer in ScalarPtrs. 
Otherwise, the pointer is placed in 4672 // PossibleNonScalarPtrs. 4673 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) { 4674 // We only care about bitcast and getelementptr instructions contained in 4675 // the loop. 4676 if (!isLoopVaryingBitCastOrGEP(Ptr)) 4677 return; 4678 4679 // If the pointer has already been identified as scalar (e.g., if it was 4680 // also identified as uniform), there's nothing to do. 4681 auto *I = cast<Instruction>(Ptr); 4682 if (Worklist.count(I)) 4683 return; 4684 4685 // If the use of the pointer will be a scalar use, and all users of the 4686 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise, 4687 // place the pointer in PossibleNonScalarPtrs. 4688 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) { 4689 return isa<LoadInst>(U) || isa<StoreInst>(U); 4690 })) 4691 ScalarPtrs.insert(I); 4692 else 4693 PossibleNonScalarPtrs.insert(I); 4694 }; 4695 4696 // We seed the scalars analysis with three classes of instructions: (1) 4697 // instructions marked uniform-after-vectorization and (2) bitcast, 4698 // getelementptr and (pointer) phi instructions used by memory accesses 4699 // requiring a scalar use. 4700 // 4701 // (1) Add to the worklist all instructions that have been identified as 4702 // uniform-after-vectorization. 4703 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end()); 4704 4705 // (2) Add to the worklist all bitcast and getelementptr instructions used by 4706 // memory accesses requiring a scalar use. The pointer operands of loads and 4707 // stores will be scalar as long as the memory accesses is not a gather or 4708 // scatter operation. The value operand of a store will remain scalar if the 4709 // store is scalarized. 4710 for (auto *BB : TheLoop->blocks()) 4711 for (auto &I : *BB) { 4712 if (auto *Load = dyn_cast<LoadInst>(&I)) { 4713 evaluatePtrUse(Load, Load->getPointerOperand()); 4714 } else if (auto *Store = dyn_cast<StoreInst>(&I)) { 4715 evaluatePtrUse(Store, Store->getPointerOperand()); 4716 evaluatePtrUse(Store, Store->getValueOperand()); 4717 } 4718 } 4719 for (auto *I : ScalarPtrs) 4720 if (!PossibleNonScalarPtrs.count(I)) { 4721 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n"); 4722 Worklist.insert(I); 4723 } 4724 4725 // Insert the forced scalars. 4726 // FIXME: Currently widenPHIInstruction() often creates a dead vector 4727 // induction variable when the PHI user is scalarized. 4728 auto ForcedScalar = ForcedScalars.find(VF); 4729 if (ForcedScalar != ForcedScalars.end()) 4730 for (auto *I : ForcedScalar->second) 4731 Worklist.insert(I); 4732 4733 // Expand the worklist by looking through any bitcasts and getelementptr 4734 // instructions we've already identified as scalar. This is similar to the 4735 // expansion step in collectLoopUniforms(); however, here we're only 4736 // expanding to include additional bitcasts and getelementptr instructions. 
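  // A small sketch of the expansion (names are illustrative): if the worklist
  // already contains
  //
  //   gep = getelementptr inbounds i32, i32* base.gep, i64 i
  //
  // and every in-loop user of the loop-varying base.gep is either already in
  // the worklist or a memory access that uses it as a scalar address, then
  // base.gep is added as well (and may itself be looked through in a later
  // iteration).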
4737 unsigned Idx = 0; 4738 while (Idx != Worklist.size()) { 4739 Instruction *Dst = Worklist[Idx++]; 4740 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0))) 4741 continue; 4742 auto *Src = cast<Instruction>(Dst->getOperand(0)); 4743 if (llvm::all_of(Src->users(), [&](User *U) -> bool { 4744 auto *J = cast<Instruction>(U); 4745 return !TheLoop->contains(J) || Worklist.count(J) || 4746 ((isa<LoadInst>(J) || isa<StoreInst>(J)) && 4747 isScalarUse(J, Src)); 4748 })) { 4749 Worklist.insert(Src); 4750 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); 4751 } 4752 } 4753 4754 // An induction variable will remain scalar if all users of the induction 4755 // variable and induction variable update remain scalar. 4756 for (auto &Induction : Legal->getInductionVars()) { 4757 auto *Ind = Induction.first; 4758 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4759 4760 // If tail-folding is applied, the primary induction variable will be used 4761 // to feed a vector compare. 4762 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking()) 4763 continue; 4764 4765 // Returns true if \p Indvar is a pointer induction that is used directly by 4766 // load/store instruction \p I. 4767 auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar, 4768 Instruction *I) { 4769 return Induction.second.getKind() == 4770 InductionDescriptor::IK_PtrInduction && 4771 (isa<LoadInst>(I) || isa<StoreInst>(I)) && 4772 Indvar == getLoadStorePointerOperand(I) && isScalarUse(I, Indvar); 4773 }; 4774 4775 // Determine if all users of the induction variable are scalar after 4776 // vectorization. 4777 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 4778 auto *I = cast<Instruction>(U); 4779 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 4780 IsDirectLoadStoreFromPtrIndvar(Ind, I); 4781 }); 4782 if (!ScalarInd) 4783 continue; 4784 4785 // Determine if all users of the induction variable update instruction are 4786 // scalar after vectorization. 4787 auto ScalarIndUpdate = 4788 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 4789 auto *I = cast<Instruction>(U); 4790 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 4791 IsDirectLoadStoreFromPtrIndvar(IndUpdate, I); 4792 }); 4793 if (!ScalarIndUpdate) 4794 continue; 4795 4796 // The induction variable and its update instruction will remain scalar. 4797 Worklist.insert(Ind); 4798 Worklist.insert(IndUpdate); 4799 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 4800 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 4801 << "\n"); 4802 } 4803 4804 Scalars[VF].insert(Worklist.begin(), Worklist.end()); 4805 } 4806 4807 bool LoopVectorizationCostModel::isScalarWithPredication( 4808 Instruction *I, ElementCount VF) const { 4809 if (!blockNeedsPredicationForAnyReason(I->getParent())) 4810 return false; 4811 switch(I->getOpcode()) { 4812 default: 4813 break; 4814 case Instruction::Load: 4815 case Instruction::Store: { 4816 if (!Legal->isMaskRequired(I)) 4817 return false; 4818 auto *Ptr = getLoadStorePointerOperand(I); 4819 auto *Ty = getLoadStoreType(I); 4820 Type *VTy = Ty; 4821 if (VF.isVector()) 4822 VTy = VectorType::get(Ty, VF); 4823 const Align Alignment = getLoadStoreAlignment(I); 4824 return isa<LoadInst>(I) ? 
!(isLegalMaskedLoad(Ty, Ptr, Alignment) || 4825 TTI.isLegalMaskedGather(VTy, Alignment)) 4826 : !(isLegalMaskedStore(Ty, Ptr, Alignment) || 4827 TTI.isLegalMaskedScatter(VTy, Alignment)); 4828 } 4829 case Instruction::UDiv: 4830 case Instruction::SDiv: 4831 case Instruction::SRem: 4832 case Instruction::URem: 4833 return mayDivideByZero(*I); 4834 } 4835 return false; 4836 } 4837 4838 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( 4839 Instruction *I, ElementCount VF) { 4840 assert(isAccessInterleaved(I) && "Expecting interleaved access."); 4841 assert(getWideningDecision(I, VF) == CM_Unknown && 4842 "Decision should not be set yet."); 4843 auto *Group = getInterleavedAccessGroup(I); 4844 assert(Group && "Must have a group."); 4845 4846 // If the instruction's allocated size doesn't equal it's type size, it 4847 // requires padding and will be scalarized. 4848 auto &DL = I->getModule()->getDataLayout(); 4849 auto *ScalarTy = getLoadStoreType(I); 4850 if (hasIrregularType(ScalarTy, DL)) 4851 return false; 4852 4853 // Check if masking is required. 4854 // A Group may need masking for one of two reasons: it resides in a block that 4855 // needs predication, or it was decided to use masking to deal with gaps 4856 // (either a gap at the end of a load-access that may result in a speculative 4857 // load, or any gaps in a store-access). 4858 bool PredicatedAccessRequiresMasking = 4859 blockNeedsPredicationForAnyReason(I->getParent()) && 4860 Legal->isMaskRequired(I); 4861 bool LoadAccessWithGapsRequiresEpilogMasking = 4862 isa<LoadInst>(I) && Group->requiresScalarEpilogue() && 4863 !isScalarEpilogueAllowed(); 4864 bool StoreAccessWithGapsRequiresMasking = 4865 isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()); 4866 if (!PredicatedAccessRequiresMasking && 4867 !LoadAccessWithGapsRequiresEpilogMasking && 4868 !StoreAccessWithGapsRequiresMasking) 4869 return true; 4870 4871 // If masked interleaving is required, we expect that the user/target had 4872 // enabled it, because otherwise it either wouldn't have been created or 4873 // it should have been invalidated by the CostModel. 4874 assert(useMaskedInterleavedAccesses(TTI) && 4875 "Masked interleave-groups for predicated accesses are not enabled."); 4876 4877 if (Group->isReverse()) 4878 return false; 4879 4880 auto *Ty = getLoadStoreType(I); 4881 const Align Alignment = getLoadStoreAlignment(I); 4882 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment) 4883 : TTI.isLegalMaskedStore(Ty, Alignment); 4884 } 4885 4886 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( 4887 Instruction *I, ElementCount VF) { 4888 // Get and ensure we have a valid memory instruction. 4889 assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction"); 4890 4891 auto *Ptr = getLoadStorePointerOperand(I); 4892 auto *ScalarTy = getLoadStoreType(I); 4893 4894 // In order to be widened, the pointer should be consecutive, first of all. 4895 if (!Legal->isConsecutivePtr(ScalarTy, Ptr)) 4896 return false; 4897 4898 // If the instruction is a store located in a predicated block, it will be 4899 // scalarized. 4900 if (isScalarWithPredication(I, VF)) 4901 return false; 4902 4903 // If the instruction's allocated size doesn't equal it's type size, it 4904 // requires padding and will be scalarized. 
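  // (A type is "irregular" here when its allocation size differs from its
  // nominal size; e.g. on typical 64-bit data layouts x86_fp80 occupies 128
  // allocated bits for 80 significant bits, so consecutive vector accesses
  // cannot be formed by simple widening.)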
4905 auto &DL = I->getModule()->getDataLayout(); 4906 if (hasIrregularType(ScalarTy, DL)) 4907 return false; 4908 4909 return true; 4910 } 4911 4912 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { 4913 // We should not collect Uniforms more than once per VF. Right now, 4914 // this function is called from collectUniformsAndScalars(), which 4915 // already does this check. Collecting Uniforms for VF=1 does not make any 4916 // sense. 4917 4918 assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() && 4919 "This function should not be visited twice for the same VF"); 4920 4921 // Visit the list of Uniforms. If we'll not find any uniform value, we'll 4922 // not analyze again. Uniforms.count(VF) will return 1. 4923 Uniforms[VF].clear(); 4924 4925 // We now know that the loop is vectorizable! 4926 // Collect instructions inside the loop that will remain uniform after 4927 // vectorization. 4928 4929 // Global values, params and instructions outside of current loop are out of 4930 // scope. 4931 auto isOutOfScope = [&](Value *V) -> bool { 4932 Instruction *I = dyn_cast<Instruction>(V); 4933 return (!I || !TheLoop->contains(I)); 4934 }; 4935 4936 // Worklist containing uniform instructions demanding lane 0. 4937 SetVector<Instruction *> Worklist; 4938 BasicBlock *Latch = TheLoop->getLoopLatch(); 4939 4940 // Add uniform instructions demanding lane 0 to the worklist. Instructions 4941 // that are scalar with predication must not be considered uniform after 4942 // vectorization, because that would create an erroneous replicating region 4943 // where only a single instance out of VF should be formed. 4944 // TODO: optimize such seldom cases if found important, see PR40816. 4945 auto addToWorklistIfAllowed = [&](Instruction *I) -> void { 4946 if (isOutOfScope(I)) { 4947 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: " 4948 << *I << "\n"); 4949 return; 4950 } 4951 if (isScalarWithPredication(I, VF)) { 4952 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " 4953 << *I << "\n"); 4954 return; 4955 } 4956 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n"); 4957 Worklist.insert(I); 4958 }; 4959 4960 // Start with the conditional branch. If the branch condition is an 4961 // instruction contained in the loop that is only used by the branch, it is 4962 // uniform. 4963 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 4964 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) 4965 addToWorklistIfAllowed(Cmp); 4966 4967 auto isUniformDecision = [&](Instruction *I, ElementCount VF) { 4968 InstWidening WideningDecision = getWideningDecision(I, VF); 4969 assert(WideningDecision != CM_Unknown && 4970 "Widening decision should be ready at this moment"); 4971 4972 // A uniform memory op is itself uniform. We exclude uniform stores 4973 // here as they demand the last lane, not the first one. 4974 if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) { 4975 assert(WideningDecision == CM_Scalarize); 4976 return true; 4977 } 4978 4979 return (WideningDecision == CM_Widen || 4980 WideningDecision == CM_Widen_Reverse || 4981 WideningDecision == CM_Interleave); 4982 }; 4983 4984 4985 // Returns true if Ptr is the pointer operand of a memory access instruction 4986 // I, and I is known to not require scalarization. 
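  // (For instance, once the cost model has decided to widen a consecutive
  // load, its address computation is only needed for lane 0 of each unrolled
  // part, so the load's use of that address counts as a uniform use here.)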
4987 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { 4988 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); 4989 }; 4990 4991 // Holds a list of values which are known to have at least one uniform use. 4992 // Note that there may be other uses which aren't uniform. A "uniform use" 4993 // here is something which only demands lane 0 of the unrolled iterations; 4994 // it does not imply that all lanes produce the same value (e.g. this is not 4995 // the usual meaning of uniform) 4996 SetVector<Value *> HasUniformUse; 4997 4998 // Scan the loop for instructions which are either a) known to have only 4999 // lane 0 demanded or b) are uses which demand only lane 0 of their operand. 5000 for (auto *BB : TheLoop->blocks()) 5001 for (auto &I : *BB) { 5002 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) { 5003 switch (II->getIntrinsicID()) { 5004 case Intrinsic::sideeffect: 5005 case Intrinsic::experimental_noalias_scope_decl: 5006 case Intrinsic::assume: 5007 case Intrinsic::lifetime_start: 5008 case Intrinsic::lifetime_end: 5009 if (TheLoop->hasLoopInvariantOperands(&I)) 5010 addToWorklistIfAllowed(&I); 5011 break; 5012 default: 5013 break; 5014 } 5015 } 5016 5017 // ExtractValue instructions must be uniform, because the operands are 5018 // known to be loop-invariant. 5019 if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) { 5020 assert(isOutOfScope(EVI->getAggregateOperand()) && 5021 "Expected aggregate value to be loop invariant"); 5022 addToWorklistIfAllowed(EVI); 5023 continue; 5024 } 5025 5026 // If there's no pointer operand, there's nothing to do. 5027 auto *Ptr = getLoadStorePointerOperand(&I); 5028 if (!Ptr) 5029 continue; 5030 5031 // A uniform memory op is itself uniform. We exclude uniform stores 5032 // here as they demand the last lane, not the first one. 5033 if (isa<LoadInst>(I) && Legal->isUniformMemOp(I)) 5034 addToWorklistIfAllowed(&I); 5035 5036 if (isUniformDecision(&I, VF)) { 5037 assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check"); 5038 HasUniformUse.insert(Ptr); 5039 } 5040 } 5041 5042 // Add to the worklist any operands which have *only* uniform (e.g. lane 0 5043 // demanding) users. Since loops are assumed to be in LCSSA form, this 5044 // disallows uses outside the loop as well. 5045 for (auto *V : HasUniformUse) { 5046 if (isOutOfScope(V)) 5047 continue; 5048 auto *I = cast<Instruction>(V); 5049 auto UsersAreMemAccesses = 5050 llvm::all_of(I->users(), [&](User *U) -> bool { 5051 return isVectorizedMemAccessUse(cast<Instruction>(U), V); 5052 }); 5053 if (UsersAreMemAccesses) 5054 addToWorklistIfAllowed(I); 5055 } 5056 5057 // Expand Worklist in topological order: whenever a new instruction 5058 // is added , its users should be already inside Worklist. It ensures 5059 // a uniform instruction will only be used by uniform instructions. 5060 unsigned idx = 0; 5061 while (idx != Worklist.size()) { 5062 Instruction *I = Worklist[idx++]; 5063 5064 for (auto OV : I->operand_values()) { 5065 // isOutOfScope operands cannot be uniform instructions. 5066 if (isOutOfScope(OV)) 5067 continue; 5068 // First order recurrence Phi's should typically be considered 5069 // non-uniform. 5070 auto *OP = dyn_cast<PHINode>(OV); 5071 if (OP && Legal->isFirstOrderRecurrence(OP)) 5072 continue; 5073 // If all the users of the operand are uniform, then add the 5074 // operand into the uniform worklist. 
5075 auto *OI = cast<Instruction>(OV); 5076 if (llvm::all_of(OI->users(), [&](User *U) -> bool { 5077 auto *J = cast<Instruction>(U); 5078 return Worklist.count(J) || isVectorizedMemAccessUse(J, OI); 5079 })) 5080 addToWorklistIfAllowed(OI); 5081 } 5082 } 5083 5084 // For an instruction to be added into Worklist above, all its users inside 5085 // the loop should also be in Worklist. However, this condition cannot be 5086 // true for phi nodes that form a cyclic dependence. We must process phi 5087 // nodes separately. An induction variable will remain uniform if all users 5088 // of the induction variable and induction variable update remain uniform. 5089 // The code below handles both pointer and non-pointer induction variables. 5090 for (auto &Induction : Legal->getInductionVars()) { 5091 auto *Ind = Induction.first; 5092 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 5093 5094 // Determine if all users of the induction variable are uniform after 5095 // vectorization. 5096 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 5097 auto *I = cast<Instruction>(U); 5098 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 5099 isVectorizedMemAccessUse(I, Ind); 5100 }); 5101 if (!UniformInd) 5102 continue; 5103 5104 // Determine if all users of the induction variable update instruction are 5105 // uniform after vectorization. 5106 auto UniformIndUpdate = 5107 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 5108 auto *I = cast<Instruction>(U); 5109 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 5110 isVectorizedMemAccessUse(I, IndUpdate); 5111 }); 5112 if (!UniformIndUpdate) 5113 continue; 5114 5115 // The induction variable and its update instruction will remain uniform. 5116 addToWorklistIfAllowed(Ind); 5117 addToWorklistIfAllowed(IndUpdate); 5118 } 5119 5120 Uniforms[VF].insert(Worklist.begin(), Worklist.end()); 5121 } 5122 5123 bool LoopVectorizationCostModel::runtimeChecksRequired() { 5124 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); 5125 5126 if (Legal->getRuntimePointerChecking()->Need) { 5127 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz", 5128 "runtime pointer checks needed. Enable vectorization of this " 5129 "loop with '#pragma clang loop vectorize(enable)' when " 5130 "compiling with -Os/-Oz", 5131 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5132 return true; 5133 } 5134 5135 if (!PSE.getPredicate().isAlwaysTrue()) { 5136 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz", 5137 "runtime SCEV checks needed. Enable vectorization of this " 5138 "loop with '#pragma clang loop vectorize(enable)' when " 5139 "compiling with -Os/-Oz", 5140 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5141 return true; 5142 } 5143 5144 // FIXME: Avoid specializing for stride==1 instead of bailing out. 5145 if (!Legal->getLAI()->getSymbolicStrides().empty()) { 5146 reportVectorizationFailure("Runtime stride check for small trip count", 5147 "runtime stride == 1 checks needed. 
Enable vectorization of " 5148 "this loop without such check by compiling with -Os/-Oz", 5149 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5150 return true; 5151 } 5152 5153 return false; 5154 } 5155 5156 ElementCount 5157 LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) { 5158 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) 5159 return ElementCount::getScalable(0); 5160 5161 if (Hints->isScalableVectorizationDisabled()) { 5162 reportVectorizationInfo("Scalable vectorization is explicitly disabled", 5163 "ScalableVectorizationDisabled", ORE, TheLoop); 5164 return ElementCount::getScalable(0); 5165 } 5166 5167 LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n"); 5168 5169 auto MaxScalableVF = ElementCount::getScalable( 5170 std::numeric_limits<ElementCount::ScalarTy>::max()); 5171 5172 // Test that the loop-vectorizer can legalize all operations for this MaxVF. 5173 // FIXME: While for scalable vectors this is currently sufficient, this should 5174 // be replaced by a more detailed mechanism that filters out specific VFs, 5175 // instead of invalidating vectorization for a whole set of VFs based on the 5176 // MaxVF. 5177 5178 // Disable scalable vectorization if the loop contains unsupported reductions. 5179 if (!canVectorizeReductions(MaxScalableVF)) { 5180 reportVectorizationInfo( 5181 "Scalable vectorization not supported for the reduction " 5182 "operations found in this loop.", 5183 "ScalableVFUnfeasible", ORE, TheLoop); 5184 return ElementCount::getScalable(0); 5185 } 5186 5187 // Disable scalable vectorization if the loop contains any instructions 5188 // with element types not supported for scalable vectors. 5189 if (any_of(ElementTypesInLoop, [&](Type *Ty) { 5190 return !Ty->isVoidTy() && 5191 !this->TTI.isElementTypeLegalForScalableVector(Ty); 5192 })) { 5193 reportVectorizationInfo("Scalable vectorization is not supported " 5194 "for all element types found in this loop.", 5195 "ScalableVFUnfeasible", ORE, TheLoop); 5196 return ElementCount::getScalable(0); 5197 } 5198 5199 if (Legal->isSafeForAnyVectorWidth()) 5200 return MaxScalableVF; 5201 5202 // Limit MaxScalableVF by the maximum safe dependence distance. 5203 Optional<unsigned> MaxVScale = TTI.getMaxVScale(); 5204 if (!MaxVScale && TheFunction->hasFnAttribute(Attribute::VScaleRange)) 5205 MaxVScale = 5206 TheFunction->getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax(); 5207 MaxScalableVF = ElementCount::getScalable( 5208 MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0); 5209 if (!MaxScalableVF) 5210 reportVectorizationInfo( 5211 "Max legal vector width too small, scalable vectorization " 5212 "unfeasible.", 5213 "ScalableVFUnfeasible", ORE, TheLoop); 5214 5215 return MaxScalableVF; 5216 } 5217 5218 FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF( 5219 unsigned ConstTripCount, ElementCount UserVF, bool FoldTailByMasking) { 5220 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); 5221 unsigned SmallestType, WidestType; 5222 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); 5223 5224 // Get the maximum safe dependence distance in bits computed by LAA. 5225 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from 5226 // the memory accesses that is most restrictive (involved in the smallest 5227 // dependence distance). 
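  // A worked example (illustrative): if LAA reports a maximum safe vector
  // width of 384 bits and the widest accessed type is i32, then
  // MaxSafeElements = PowerOf2Floor(384 / 32) = PowerOf2Floor(12) = 8, i.e.
  // at most 8 lanes (fixed or scalable) can be in flight without breaking the
  // dependence.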
5228 unsigned MaxSafeElements = 5229 PowerOf2Floor(Legal->getMaxSafeVectorWidthInBits() / WidestType); 5230 5231 auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements); 5232 auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements); 5233 5234 LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF 5235 << ".\n"); 5236 LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF 5237 << ".\n"); 5238 5239 // First analyze the UserVF, fall back if the UserVF should be ignored. 5240 if (UserVF) { 5241 auto MaxSafeUserVF = 5242 UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF; 5243 5244 if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) { 5245 // If `VF=vscale x N` is safe, then so is `VF=N` 5246 if (UserVF.isScalable()) 5247 return FixedScalableVFPair( 5248 ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF); 5249 else 5250 return UserVF; 5251 } 5252 5253 assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF)); 5254 5255 // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it 5256 // is better to ignore the hint and let the compiler choose a suitable VF. 5257 if (!UserVF.isScalable()) { 5258 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 5259 << " is unsafe, clamping to max safe VF=" 5260 << MaxSafeFixedVF << ".\n"); 5261 ORE->emit([&]() { 5262 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 5263 TheLoop->getStartLoc(), 5264 TheLoop->getHeader()) 5265 << "User-specified vectorization factor " 5266 << ore::NV("UserVectorizationFactor", UserVF) 5267 << " is unsafe, clamping to maximum safe vectorization factor " 5268 << ore::NV("VectorizationFactor", MaxSafeFixedVF); 5269 }); 5270 return MaxSafeFixedVF; 5271 } 5272 5273 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) { 5274 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 5275 << " is ignored because scalable vectors are not " 5276 "available.\n"); 5277 ORE->emit([&]() { 5278 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 5279 TheLoop->getStartLoc(), 5280 TheLoop->getHeader()) 5281 << "User-specified vectorization factor " 5282 << ore::NV("UserVectorizationFactor", UserVF) 5283 << " is ignored because the target does not support scalable " 5284 "vectors. The compiler will pick a more suitable value."; 5285 }); 5286 } else { 5287 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 5288 << " is unsafe. Ignoring scalable UserVF.\n"); 5289 ORE->emit([&]() { 5290 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 5291 TheLoop->getStartLoc(), 5292 TheLoop->getHeader()) 5293 << "User-specified vectorization factor " 5294 << ore::NV("UserVectorizationFactor", UserVF) 5295 << " is unsafe. 
Ignoring the hint to let the compiler pick a " 5296 "more suitable value."; 5297 }); 5298 } 5299 } 5300 5301 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType 5302 << " / " << WidestType << " bits.\n"); 5303 5304 FixedScalableVFPair Result(ElementCount::getFixed(1), 5305 ElementCount::getScalable(0)); 5306 if (auto MaxVF = 5307 getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType, 5308 MaxSafeFixedVF, FoldTailByMasking)) 5309 Result.FixedVF = MaxVF; 5310 5311 if (auto MaxVF = 5312 getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType, 5313 MaxSafeScalableVF, FoldTailByMasking)) 5314 if (MaxVF.isScalable()) { 5315 Result.ScalableVF = MaxVF; 5316 LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF 5317 << "\n"); 5318 } 5319 5320 return Result; 5321 } 5322 5323 FixedScalableVFPair 5324 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { 5325 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { 5326 // TODO: It may by useful to do since it's still likely to be dynamically 5327 // uniform if the target can skip. 5328 reportVectorizationFailure( 5329 "Not inserting runtime ptr check for divergent target", 5330 "runtime pointer checks needed. Not enabled for divergent target", 5331 "CantVersionLoopWithDivergentTarget", ORE, TheLoop); 5332 return FixedScalableVFPair::getNone(); 5333 } 5334 5335 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); 5336 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); 5337 if (TC == 1) { 5338 reportVectorizationFailure("Single iteration (non) loop", 5339 "loop trip count is one, irrelevant for vectorization", 5340 "SingleIterationLoop", ORE, TheLoop); 5341 return FixedScalableVFPair::getNone(); 5342 } 5343 5344 switch (ScalarEpilogueStatus) { 5345 case CM_ScalarEpilogueAllowed: 5346 return computeFeasibleMaxVF(TC, UserVF, false); 5347 case CM_ScalarEpilogueNotAllowedUsePredicate: 5348 LLVM_FALLTHROUGH; 5349 case CM_ScalarEpilogueNotNeededUsePredicate: 5350 LLVM_DEBUG( 5351 dbgs() << "LV: vector predicate hint/switch found.\n" 5352 << "LV: Not allowing scalar epilogue, creating predicated " 5353 << "vector loop.\n"); 5354 break; 5355 case CM_ScalarEpilogueNotAllowedLowTripLoop: 5356 // fallthrough as a special case of OptForSize 5357 case CM_ScalarEpilogueNotAllowedOptSize: 5358 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize) 5359 LLVM_DEBUG( 5360 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n"); 5361 else 5362 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip " 5363 << "count.\n"); 5364 5365 // Bail if runtime checks are required, which are not good when optimising 5366 // for size. 5367 if (runtimeChecksRequired()) 5368 return FixedScalableVFPair::getNone(); 5369 5370 break; 5371 } 5372 5373 // The only loops we can vectorize without a scalar epilogue, are loops with 5374 // a bottom-test and a single exiting block. We'd have to handle the fact 5375 // that not every instruction executes on the last iteration. This will 5376 // require a lane mask which varies through the vector loop body. (TODO) 5377 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) { 5378 // If there was a tail-folding hint/switch, but we can't fold the tail by 5379 // masking, fallback to a vectorization with a scalar epilogue. 
5380 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { 5381 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " 5382 "scalar epilogue instead.\n"); 5383 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 5384 return computeFeasibleMaxVF(TC, UserVF, false); 5385 } 5386 return FixedScalableVFPair::getNone(); 5387 } 5388 5389 // Now try the tail folding 5390 5391 // Invalidate interleave groups that require an epilogue if we can't mask 5392 // the interleave-group. 5393 if (!useMaskedInterleavedAccesses(TTI)) { 5394 assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() && 5395 "No decisions should have been taken at this point"); 5396 // Note: There is no need to invalidate any cost modeling decisions here, as 5397 // non where taken so far. 5398 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue(); 5399 } 5400 5401 FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF, true); 5402 // Avoid tail folding if the trip count is known to be a multiple of any VF 5403 // we chose. 5404 // FIXME: The condition below pessimises the case for fixed-width vectors, 5405 // when scalable VFs are also candidates for vectorization. 5406 if (MaxFactors.FixedVF.isVector() && !MaxFactors.ScalableVF) { 5407 ElementCount MaxFixedVF = MaxFactors.FixedVF; 5408 assert((UserVF.isNonZero() || isPowerOf2_32(MaxFixedVF.getFixedValue())) && 5409 "MaxFixedVF must be a power of 2"); 5410 unsigned MaxVFtimesIC = UserIC ? MaxFixedVF.getFixedValue() * UserIC 5411 : MaxFixedVF.getFixedValue(); 5412 ScalarEvolution *SE = PSE.getSE(); 5413 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 5414 const SCEV *ExitCount = SE->getAddExpr( 5415 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 5416 const SCEV *Rem = SE->getURemExpr( 5417 SE->applyLoopGuards(ExitCount, TheLoop), 5418 SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC)); 5419 if (Rem->isZero()) { 5420 // Accept MaxFixedVF if we do not have a tail. 5421 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); 5422 return MaxFactors; 5423 } 5424 } 5425 5426 // For scalable vectors don't use tail folding for low trip counts or 5427 // optimizing for code size. We only permit this if the user has explicitly 5428 // requested it. 5429 if (ScalarEpilogueStatus != CM_ScalarEpilogueNotNeededUsePredicate && 5430 ScalarEpilogueStatus != CM_ScalarEpilogueNotAllowedUsePredicate && 5431 MaxFactors.ScalableVF.isVector()) 5432 MaxFactors.ScalableVF = ElementCount::getScalable(0); 5433 5434 // If we don't know the precise trip count, or if the trip count that we 5435 // found modulo the vectorization factor is not zero, try to fold the tail 5436 // by masking. 5437 // FIXME: look for a smaller MaxVF that does divide TC rather than masking. 5438 if (Legal->prepareToFoldTailByMasking()) { 5439 FoldTailByMasking = true; 5440 return MaxFactors; 5441 } 5442 5443 // If there was a tail-folding hint/switch, but we can't fold the tail by 5444 // masking, fallback to a vectorization with a scalar epilogue. 
5445 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { 5446 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " 5447 "scalar epilogue instead.\n"); 5448 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 5449 return MaxFactors; 5450 } 5451 5452 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) { 5453 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n"); 5454 return FixedScalableVFPair::getNone(); 5455 } 5456 5457 if (TC == 0) { 5458 reportVectorizationFailure( 5459 "Unable to calculate the loop count due to complex control flow", 5460 "unable to calculate the loop count due to complex control flow", 5461 "UnknownLoopCountComplexCFG", ORE, TheLoop); 5462 return FixedScalableVFPair::getNone(); 5463 } 5464 5465 reportVectorizationFailure( 5466 "Cannot optimize for size and vectorize at the same time.", 5467 "cannot optimize for size and vectorize at the same time. " 5468 "Enable vectorization of this loop with '#pragma clang loop " 5469 "vectorize(enable)' when compiling with -Os/-Oz", 5470 "NoTailLoopWithOptForSize", ORE, TheLoop); 5471 return FixedScalableVFPair::getNone(); 5472 } 5473 5474 ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( 5475 unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType, 5476 const ElementCount &MaxSafeVF, bool FoldTailByMasking) { 5477 bool ComputeScalableMaxVF = MaxSafeVF.isScalable(); 5478 TypeSize WidestRegister = TTI.getRegisterBitWidth( 5479 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector 5480 : TargetTransformInfo::RGK_FixedWidthVector); 5481 5482 // Convenience function to return the minimum of two ElementCounts. 5483 auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) { 5484 assert((LHS.isScalable() == RHS.isScalable()) && 5485 "Scalable flags must match"); 5486 return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS; 5487 }; 5488 5489 // Ensure MaxVF is a power of 2; the dependence distance bound may not be. 5490 // Note that both WidestRegister and WidestType may not be a powers of 2. 5491 auto MaxVectorElementCount = ElementCount::get( 5492 PowerOf2Floor(WidestRegister.getKnownMinSize() / WidestType), 5493 ComputeScalableMaxVF); 5494 MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF); 5495 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " 5496 << (MaxVectorElementCount * WidestType) << " bits.\n"); 5497 5498 if (!MaxVectorElementCount) { 5499 LLVM_DEBUG(dbgs() << "LV: The target has no " 5500 << (ComputeScalableMaxVF ? "scalable" : "fixed") 5501 << " vector registers.\n"); 5502 return ElementCount::getFixed(1); 5503 } 5504 5505 const auto TripCountEC = ElementCount::getFixed(ConstTripCount); 5506 if (ConstTripCount && 5507 ElementCount::isKnownLE(TripCountEC, MaxVectorElementCount) && 5508 (!FoldTailByMasking || isPowerOf2_32(ConstTripCount))) { 5509 // If loop trip count (TC) is known at compile time there is no point in 5510 // choosing VF greater than TC (as done in the loop below). Select maximum 5511 // power of two which doesn't exceed TC. 5512 // If MaxVectorElementCount is scalable, we only fall back on a fixed VF 5513 // when the TC is less than or equal to the known number of lanes. 
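// Illustrative example (numbers assumed, not from any particular target):
// with ConstTripCount == 6 and MaxVectorElementCount == 8 the condition
// above holds when not folding the tail, PowerOf2Floor(6) == 4, and the
// clamped VF below becomes 4, leaving 2 iterations for the scalar epilogue.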
5514 auto ClampedConstTripCount = PowerOf2Floor(ConstTripCount); 5515 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not " 5516 "exceeding the constant trip count: " 5517 << ClampedConstTripCount << "\n"); 5518 return ElementCount::getFixed(ClampedConstTripCount); 5519 } 5520 5521 ElementCount MaxVF = MaxVectorElementCount; 5522 if (TTI.shouldMaximizeVectorBandwidth() || 5523 (MaximizeBandwidth && isScalarEpilogueAllowed())) { 5524 auto MaxVectorElementCountMaxBW = ElementCount::get( 5525 PowerOf2Floor(WidestRegister.getKnownMinSize() / SmallestType), 5526 ComputeScalableMaxVF); 5527 MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF); 5528 5529 // Collect all viable vectorization factors larger than the default MaxVF 5530 // (i.e. MaxVectorElementCount). 5531 SmallVector<ElementCount, 8> VFs; 5532 for (ElementCount VS = MaxVectorElementCount * 2; 5533 ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2) 5534 VFs.push_back(VS); 5535 5536 // For each VF calculate its register usage. 5537 auto RUs = calculateRegisterUsage(VFs); 5538 5539 // Select the largest VF which doesn't require more registers than existing 5540 // ones. 5541 for (int i = RUs.size() - 1; i >= 0; --i) { 5542 bool Selected = true; 5543 for (auto &pair : RUs[i].MaxLocalUsers) { 5544 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5545 if (pair.second > TargetNumRegisters) 5546 Selected = false; 5547 } 5548 if (Selected) { 5549 MaxVF = VFs[i]; 5550 break; 5551 } 5552 } 5553 if (ElementCount MinVF = 5554 TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) { 5555 if (ElementCount::isKnownLT(MaxVF, MinVF)) { 5556 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF 5557 << ") with target's minimum: " << MinVF << '\n'); 5558 MaxVF = MinVF; 5559 } 5560 } 5561 } 5562 return MaxVF; 5563 } 5564 5565 Optional<unsigned> LoopVectorizationCostModel::getVScaleForTuning() const { 5566 if (TheFunction->hasFnAttribute(Attribute::VScaleRange)) { 5567 auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange); 5568 auto Min = Attr.getVScaleRangeMin(); 5569 auto Max = Attr.getVScaleRangeMax(); 5570 if (Max && Min == Max) 5571 return Max; 5572 } 5573 5574 return TTI.getVScaleForTuning(); 5575 } 5576 5577 bool LoopVectorizationCostModel::isMoreProfitable( 5578 const VectorizationFactor &A, const VectorizationFactor &B) const { 5579 InstructionCost CostA = A.Cost; 5580 InstructionCost CostB = B.Cost; 5581 5582 unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop); 5583 5584 if (!A.Width.isScalable() && !B.Width.isScalable() && FoldTailByMasking && 5585 MaxTripCount) { 5586 // If we are folding the tail and the trip count is a known (possibly small) 5587 // constant, the trip count will be rounded up to an integer number of 5588 // iterations. The total cost will be PerIterationCost*ceil(TripCount/VF), 5589 // which we compare directly. When not folding the tail, the total cost will 5590 // be PerIterationCost*floor(TC/VF) + Scalar remainder cost, and so is 5591 // approximated with the per-lane cost below instead of using the tripcount 5592 // as here. 5593 auto RTCostA = CostA * divideCeil(MaxTripCount, A.Width.getFixedValue()); 5594 auto RTCostB = CostB * divideCeil(MaxTripCount, B.Width.getFixedValue()); 5595 return RTCostA < RTCostB; 5596 } 5597 5598 // Improve estimate for the vector width if it is scalable. 
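// For instance (illustrative numbers only): a candidate of vscale x 4 with a
// tuning value of vscale == 2 is compared below as if it had an effective
// width of 8 lanes, so its cost is weighed per-lane against fixed-width
// candidates on an equal footing.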
5599 unsigned EstimatedWidthA = A.Width.getKnownMinValue(); 5600 unsigned EstimatedWidthB = B.Width.getKnownMinValue(); 5601 if (Optional<unsigned> VScale = getVScaleForTuning()) { 5602 if (A.Width.isScalable()) 5603 EstimatedWidthA *= VScale.getValue(); 5604 if (B.Width.isScalable()) 5605 EstimatedWidthB *= VScale.getValue(); 5606 } 5607 5608 // Assume vscale may be larger than 1 (or the value being tuned for), 5609 // so that scalable vectorization is slightly favorable over fixed-width 5610 // vectorization. 5611 if (A.Width.isScalable() && !B.Width.isScalable()) 5612 return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA); 5613 5614 // To avoid the need for FP division: 5615 // (CostA / A.Width) < (CostB / B.Width) 5616 // <=> (CostA * B.Width) < (CostB * A.Width) 5617 return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA); 5618 } 5619 5620 VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor( 5621 const ElementCountSet &VFCandidates) { 5622 InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first; 5623 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n"); 5624 assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop"); 5625 assert(VFCandidates.count(ElementCount::getFixed(1)) && 5626 "Expected Scalar VF to be a candidate"); 5627 5628 const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost); 5629 VectorizationFactor ChosenFactor = ScalarCost; 5630 5631 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; 5632 if (ForceVectorization && VFCandidates.size() > 1) { 5633 // Ignore scalar width, because the user explicitly wants vectorization. 5634 // Initialize cost to max so that VF = 2 is, at least, chosen during cost 5635 // evaluation. 5636 ChosenFactor.Cost = InstructionCost::getMax(); 5637 } 5638 5639 SmallVector<InstructionVFPair> InvalidCosts; 5640 for (const auto &i : VFCandidates) { 5641 // The cost for scalar VF=1 is already calculated, so ignore it. 5642 if (i.isScalar()) 5643 continue; 5644 5645 VectorizationCostTy C = expectedCost(i, &InvalidCosts); 5646 VectorizationFactor Candidate(i, C.first); 5647 5648 #ifndef NDEBUG 5649 unsigned AssumedMinimumVscale = 1; 5650 if (Optional<unsigned> VScale = getVScaleForTuning()) 5651 AssumedMinimumVscale = VScale.getValue(); 5652 unsigned Width = 5653 Candidate.Width.isScalable() 5654 ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale 5655 : Candidate.Width.getFixedValue(); 5656 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i 5657 << " costs: " << (Candidate.Cost / Width)); 5658 if (i.isScalable()) 5659 LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of " 5660 << AssumedMinimumVscale << ")"); 5661 LLVM_DEBUG(dbgs() << ".\n"); 5662 #endif 5663 5664 if (!C.second && !ForceVectorization) { 5665 LLVM_DEBUG( 5666 dbgs() << "LV: Not considering vector loop of width " << i 5667 << " because it will not generate any vector instructions.\n"); 5668 continue; 5669 } 5670 5671 // If profitable add it to ProfitableVF list. 5672 if (isMoreProfitable(Candidate, ScalarCost)) 5673 ProfitableVFs.push_back(Candidate); 5674 5675 if (isMoreProfitable(Candidate, ChosenFactor)) 5676 ChosenFactor = Candidate; 5677 } 5678 5679 // Emit a report of VFs with invalid costs in the loop. 5680 if (!InvalidCosts.empty()) { 5681 // Group the remarks per instruction, keeping the instruction order from 5682 // InvalidCosts. 
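// (The numbering below just gives each distinct instruction the index of its
// first appearance in InvalidCosts; sorting by that number and then by VF
// groups all VFs of one instruction together without disturbing the original
// instruction order.)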
5683 std::map<Instruction *, unsigned> Numbering; 5684 unsigned I = 0; 5685 for (auto &Pair : InvalidCosts) 5686 if (!Numbering.count(Pair.first)) 5687 Numbering[Pair.first] = I++; 5688 5689 // Sort the list, first on instruction(number) then on VF. 5690 llvm::sort(InvalidCosts, 5691 [&Numbering](InstructionVFPair &A, InstructionVFPair &B) { 5692 if (Numbering[A.first] != Numbering[B.first]) 5693 return Numbering[A.first] < Numbering[B.first]; 5694 ElementCountComparator ECC; 5695 return ECC(A.second, B.second); 5696 }); 5697 5698 // For a list of ordered instruction-vf pairs: 5699 // [(load, vf1), (load, vf2), (store, vf1)] 5700 // Group the instructions together to emit separate remarks for: 5701 // load (vf1, vf2) 5702 // store (vf1) 5703 auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts); 5704 auto Subset = ArrayRef<InstructionVFPair>(); 5705 do { 5706 if (Subset.empty()) 5707 Subset = Tail.take_front(1); 5708 5709 Instruction *I = Subset.front().first; 5710 5711 // If the next instruction is different, or if there are no other pairs, 5712 // emit a remark for the collated subset. e.g. 5713 // [(load, vf1), (load, vf2))] 5714 // to emit: 5715 // remark: invalid costs for 'load' at VF=(vf, vf2) 5716 if (Subset == Tail || Tail[Subset.size()].first != I) { 5717 std::string OutString; 5718 raw_string_ostream OS(OutString); 5719 assert(!Subset.empty() && "Unexpected empty range"); 5720 OS << "Instruction with invalid costs prevented vectorization at VF=("; 5721 for (auto &Pair : Subset) 5722 OS << (Pair.second == Subset.front().second ? "" : ", ") 5723 << Pair.second; 5724 OS << "):"; 5725 if (auto *CI = dyn_cast<CallInst>(I)) 5726 OS << " call to " << CI->getCalledFunction()->getName(); 5727 else 5728 OS << " " << I->getOpcodeName(); 5729 OS.flush(); 5730 reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I); 5731 Tail = Tail.drop_front(Subset.size()); 5732 Subset = {}; 5733 } else 5734 // Grow the subset by one element 5735 Subset = Tail.take_front(Subset.size() + 1); 5736 } while (!Tail.empty()); 5737 } 5738 5739 if (!EnableCondStoresVectorization && NumPredStores) { 5740 reportVectorizationFailure("There are conditional stores.", 5741 "store that is conditionally executed prevents vectorization", 5742 "ConditionalStore", ORE, TheLoop); 5743 ChosenFactor = ScalarCost; 5744 } 5745 5746 LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() && 5747 ChosenFactor.Cost >= ScalarCost.Cost) dbgs() 5748 << "LV: Vectorization seems to be not beneficial, " 5749 << "but was forced by a user.\n"); 5750 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n"); 5751 return ChosenFactor; 5752 } 5753 5754 bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization( 5755 const Loop &L, ElementCount VF) const { 5756 // Cross iteration phis such as reductions need special handling and are 5757 // currently unsupported. 5758 if (any_of(L.getHeader()->phis(), 5759 [&](PHINode &Phi) { return Legal->isFirstOrderRecurrence(&Phi); })) 5760 return false; 5761 5762 // Phis with uses outside of the loop require special handling and are 5763 // currently unsupported. 5764 for (auto &Entry : Legal->getInductionVars()) { 5765 // Look for uses of the value of the induction at the last iteration. 5766 Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch()); 5767 for (User *U : PostInc->users()) 5768 if (!L.contains(cast<Instruction>(U))) 5769 return false; 5770 // Look for uses of penultimate value of the induction. 
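// (Illustrative case that is rejected here: in 'for (i = 0; i < n; ++i)
// {...} use(i);' the induction value escapes the loop, so one of the checks
// here finds a user outside of L and epilogue vectorization is not
// attempted.)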
5771 for (User *U : Entry.first->users())
5772 if (!L.contains(cast<Instruction>(U)))
5773 return false;
5774 }
5775 
5776 // Induction variables that are widened require special handling that is
5777 // currently not supported.
5778 if (any_of(Legal->getInductionVars(), [&](auto &Entry) {
5779 return !(this->isScalarAfterVectorization(Entry.first, VF) ||
5780 this->isProfitableToScalarize(Entry.first, VF));
5781 }))
5782 return false;
5783 
5784 // Epilogue vectorization code has not been audited to ensure it handles
5785 // non-latch exits properly. It may be fine, but it needs to be audited and
5786 // tested.
5787 if (L.getExitingBlock() != L.getLoopLatch())
5788 return false;
5789 
5790 return true;
5791 }
5792 
5793 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
5794 const ElementCount VF) const {
5795 // FIXME: We need a much better cost-model to take different parameters such
5796 // as register pressure, code size increase and cost of extra branches into
5797 // account. For now we apply a very crude heuristic and only consider loops
5798 // with vectorization factors larger than a certain value.
5799 // We also consider epilogue vectorization unprofitable for targets that don't
5800 // consider interleaving beneficial (e.g. MVE).
5801 if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1)
5802 return false;
5803 // FIXME: We should consider changing the threshold for scalable
5804 // vectors to take VScaleForTuning into account.
5805 if (VF.getKnownMinValue() >= EpilogueVectorizationMinVF)
5806 return true;
5807 return false;
5808 }
5809 
5810 VectorizationFactor
5811 LoopVectorizationCostModel::selectEpilogueVectorizationFactor(
5812 const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) {
5813 VectorizationFactor Result = VectorizationFactor::Disabled();
5814 if (!EnableEpilogueVectorization) {
5815 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";);
5816 return Result;
5817 }
5818 
5819 if (!isScalarEpilogueAllowed()) {
5820 LLVM_DEBUG(
5821 dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is "
5822 "allowed.\n";);
5823 return Result;
5824 }
5825 
5826 // Not really a cost consideration, but check for unsupported cases here to
5827 // simplify the logic.
5828 if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) {
5829 LLVM_DEBUG(
5830 dbgs() << "LEV: Unable to vectorize epilogue because the loop is "
5831 "not a supported candidate.\n";);
5832 return Result;
5833 }
5834 
5835 if (EpilogueVectorizationForceVF > 1) {
5836 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";);
5837 ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF);
5838 if (LVP.hasPlanWithVF(ForcedEC))
5839 return {ForcedEC, 0};
5840 else {
5841 LLVM_DEBUG(
5842 dbgs()
5843 << "LEV: Epilogue vectorization forced factor is not viable.\n";);
5844 return Result;
5845 }
5846 }
5847 
5848 if (TheLoop->getHeader()->getParent()->hasOptSize() ||
5849 TheLoop->getHeader()->getParent()->hasMinSize()) {
5850 LLVM_DEBUG(
5851 dbgs()
5852 << "LEV: Epilogue vectorization skipped due to opt for size.\n";);
5853 return Result;
5854 }
5855 
5856 if (!isEpilogueVectorizationProfitable(MainLoopVF)) {
5857 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
5858 "this loop\n");
5859 return Result;
5860 }
5861 
5862 // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know
5863 // the main loop handles 8 lanes per iteration.
We could still benefit from 5864 // vectorizing the epilogue loop with VF=4. 5865 ElementCount EstimatedRuntimeVF = MainLoopVF; 5866 if (MainLoopVF.isScalable()) { 5867 EstimatedRuntimeVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue()); 5868 if (Optional<unsigned> VScale = getVScaleForTuning()) 5869 EstimatedRuntimeVF *= VScale.getValue(); 5870 } 5871 5872 for (auto &NextVF : ProfitableVFs) 5873 if (((!NextVF.Width.isScalable() && MainLoopVF.isScalable() && 5874 ElementCount::isKnownLT(NextVF.Width, EstimatedRuntimeVF)) || 5875 ElementCount::isKnownLT(NextVF.Width, MainLoopVF)) && 5876 (Result.Width.isScalar() || isMoreProfitable(NextVF, Result)) && 5877 LVP.hasPlanWithVF(NextVF.Width)) 5878 Result = NextVF; 5879 5880 if (Result != VectorizationFactor::Disabled()) 5881 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = " 5882 << Result.Width << "\n";); 5883 return Result; 5884 } 5885 5886 std::pair<unsigned, unsigned> 5887 LoopVectorizationCostModel::getSmallestAndWidestTypes() { 5888 unsigned MinWidth = -1U; 5889 unsigned MaxWidth = 8; 5890 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 5891 // For in-loop reductions, no element types are added to ElementTypesInLoop 5892 // if there are no loads/stores in the loop. In this case, check through the 5893 // reduction variables to determine the maximum width. 5894 if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) { 5895 // Reset MaxWidth so that we can find the smallest type used by recurrences 5896 // in the loop. 5897 MaxWidth = -1U; 5898 for (auto &PhiDescriptorPair : Legal->getReductionVars()) { 5899 const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second; 5900 // When finding the min width used by the recurrence we need to account 5901 // for casts on the input operands of the recurrence. 5902 MaxWidth = std::min<unsigned>( 5903 MaxWidth, std::min<unsigned>( 5904 RdxDesc.getMinWidthCastToRecurrenceTypeInBits(), 5905 RdxDesc.getRecurrenceType()->getScalarSizeInBits())); 5906 } 5907 } else { 5908 for (Type *T : ElementTypesInLoop) { 5909 MinWidth = std::min<unsigned>( 5910 MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize()); 5911 MaxWidth = std::max<unsigned>( 5912 MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize()); 5913 } 5914 } 5915 return {MinWidth, MaxWidth}; 5916 } 5917 5918 void LoopVectorizationCostModel::collectElementTypesForWidening() { 5919 ElementTypesInLoop.clear(); 5920 // For each block. 5921 for (BasicBlock *BB : TheLoop->blocks()) { 5922 // For each instruction in the loop. 5923 for (Instruction &I : BB->instructionsWithoutDebug()) { 5924 Type *T = I.getType(); 5925 5926 // Skip ignored values. 5927 if (ValuesToIgnore.count(&I)) 5928 continue; 5929 5930 // Only examine Loads, Stores and PHINodes. 5931 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I)) 5932 continue; 5933 5934 // Examine PHI nodes that are reduction variables. Update the type to 5935 // account for the recurrence type. 5936 if (auto *PN = dyn_cast<PHINode>(&I)) { 5937 if (!Legal->isReductionVariable(PN)) 5938 continue; 5939 const RecurrenceDescriptor &RdxDesc = 5940 Legal->getReductionVars().find(PN)->second; 5941 if (PreferInLoopReductions || useOrderedReductions(RdxDesc) || 5942 TTI.preferInLoopReduction(RdxDesc.getOpcode(), 5943 RdxDesc.getRecurrenceType(), 5944 TargetTransformInfo::ReductionFlags())) 5945 continue; 5946 T = RdxDesc.getRecurrenceType(); 5947 } 5948 5949 // Examine the stored values. 
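// (For stores, the element type of interest is that of the stored value, so
// e.g. a 'store i8' contributes an 8-bit element type even though the store
// instruction itself is of void type.)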
5950 if (auto *ST = dyn_cast<StoreInst>(&I)) 5951 T = ST->getValueOperand()->getType(); 5952 5953 assert(T->isSized() && 5954 "Expected the load/store/recurrence type to be sized"); 5955 5956 ElementTypesInLoop.insert(T); 5957 } 5958 } 5959 } 5960 5961 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, 5962 unsigned LoopCost) { 5963 // -- The interleave heuristics -- 5964 // We interleave the loop in order to expose ILP and reduce the loop overhead. 5965 // There are many micro-architectural considerations that we can't predict 5966 // at this level. For example, frontend pressure (on decode or fetch) due to 5967 // code size, or the number and capabilities of the execution ports. 5968 // 5969 // We use the following heuristics to select the interleave count: 5970 // 1. If the code has reductions, then we interleave to break the cross 5971 // iteration dependency. 5972 // 2. If the loop is really small, then we interleave to reduce the loop 5973 // overhead. 5974 // 3. We don't interleave if we think that we will spill registers to memory 5975 // due to the increased register pressure. 5976 5977 if (!isScalarEpilogueAllowed()) 5978 return 1; 5979 5980 // We used the distance for the interleave count. 5981 if (Legal->getMaxSafeDepDistBytes() != -1U) 5982 return 1; 5983 5984 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); 5985 const bool HasReductions = !Legal->getReductionVars().empty(); 5986 // Do not interleave loops with a relatively small known or estimated trip 5987 // count. But we will interleave when InterleaveSmallLoopScalarReduction is 5988 // enabled, and the code has scalar reductions(HasReductions && VF = 1), 5989 // because with the above conditions interleaving can expose ILP and break 5990 // cross iteration dependences for reductions. 5991 if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) && 5992 !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar())) 5993 return 1; 5994 5995 RegisterUsage R = calculateRegisterUsage({VF})[0]; 5996 // We divide by these constants so assume that we have at least one 5997 // instruction that uses at least one register. 5998 for (auto& pair : R.MaxLocalUsers) { 5999 pair.second = std::max(pair.second, 1U); 6000 } 6001 6002 // We calculate the interleave count using the following formula. 6003 // Subtract the number of loop invariants from the number of available 6004 // registers. These registers are used by all of the interleaved instances. 6005 // Next, divide the remaining registers by the number of registers that is 6006 // required by the loop, in order to estimate how many parallel instances 6007 // fit without causing spills. All of this is rounded down if necessary to be 6008 // a power of two. We want power of two interleave count to simplify any 6009 // addressing operations or alignment considerations. 6010 // We also want power of two interleave counts to ensure that the induction 6011 // variable of the vector loop wraps to zero, when tail is folded by masking; 6012 // this currently happens when OptForSize, in which case IC is set to 1 above. 
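// Worked example with assumed numbers (no particular target): with 32
// registers in a class, 2 of them tied up by loop-invariant values and a
// maximum of 6 values live at once, the loop below computes
//   PowerOf2Floor((32 - 2) / 6) = PowerOf2Floor(5) = 4
// interleaved copies; with EnableIndVarRegisterHeur the induction variable
// is discounted and the estimate becomes
//   PowerOf2Floor((32 - 2 - 1) / (6 - 1)) = PowerOf2Floor(5) = 4
// (all divisions are integer divisions).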
6013 unsigned IC = UINT_MAX; 6014 6015 for (auto& pair : R.MaxLocalUsers) { 6016 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 6017 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters 6018 << " registers of " 6019 << TTI.getRegisterClassName(pair.first) << " register class\n"); 6020 if (VF.isScalar()) { 6021 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) 6022 TargetNumRegisters = ForceTargetNumScalarRegs; 6023 } else { 6024 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) 6025 TargetNumRegisters = ForceTargetNumVectorRegs; 6026 } 6027 unsigned MaxLocalUsers = pair.second; 6028 unsigned LoopInvariantRegs = 0; 6029 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) 6030 LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; 6031 6032 unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers); 6033 // Don't count the induction variable as interleaved. 6034 if (EnableIndVarRegisterHeur) { 6035 TmpIC = 6036 PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) / 6037 std::max(1U, (MaxLocalUsers - 1))); 6038 } 6039 6040 IC = std::min(IC, TmpIC); 6041 } 6042 6043 // Clamp the interleave ranges to reasonable counts. 6044 unsigned MaxInterleaveCount = 6045 TTI.getMaxInterleaveFactor(VF.getKnownMinValue()); 6046 6047 // Check if the user has overridden the max. 6048 if (VF.isScalar()) { 6049 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) 6050 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; 6051 } else { 6052 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) 6053 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; 6054 } 6055 6056 // If trip count is known or estimated compile time constant, limit the 6057 // interleave count to be less than the trip count divided by VF, provided it 6058 // is at least 1. 6059 // 6060 // For scalable vectors we can't know if interleaving is beneficial. It may 6061 // not be beneficial for small loops if none of the lanes in the second vector 6062 // iterations is enabled. However, for larger loops, there is likely to be a 6063 // similar benefit as for fixed-width vectors. For now, we choose to leave 6064 // the InterleaveCount as if vscale is '1', although if some information about 6065 // the vector is known (e.g. min vector size), we can make a better decision. 6066 if (BestKnownTC) { 6067 MaxInterleaveCount = 6068 std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount); 6069 // Make sure MaxInterleaveCount is greater than 0. 6070 MaxInterleaveCount = std::max(1u, MaxInterleaveCount); 6071 } 6072 6073 assert(MaxInterleaveCount > 0 && 6074 "Maximum interleave count must be greater than 0"); 6075 6076 // Clamp the calculated IC to be between the 1 and the max interleave count 6077 // that the target and trip count allows. 6078 if (IC > MaxInterleaveCount) 6079 IC = MaxInterleaveCount; 6080 else 6081 // Make sure IC is greater than 0. 6082 IC = std::max(1u, IC); 6083 6084 assert(IC > 0 && "Interleave count must be greater than 0."); 6085 6086 // If we did not calculate the cost for VF (because the user selected the VF) 6087 // then we calculate the cost of VF here. 
6088 if (LoopCost == 0) { 6089 InstructionCost C = expectedCost(VF).first; 6090 assert(C.isValid() && "Expected to have chosen a VF with valid cost"); 6091 LoopCost = *C.getValue(); 6092 } 6093 6094 assert(LoopCost && "Non-zero loop cost expected"); 6095 6096 // Interleave if we vectorized this loop and there is a reduction that could 6097 // benefit from interleaving. 6098 if (VF.isVector() && HasReductions) { 6099 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); 6100 return IC; 6101 } 6102 6103 // For any scalar loop that either requires runtime checks or predication we 6104 // are better off leaving this to the unroller. Note that if we've already 6105 // vectorized the loop we will have done the runtime check and so interleaving 6106 // won't require further checks. 6107 bool ScalarInterleavingRequiresPredication = 6108 (VF.isScalar() && any_of(TheLoop->blocks(), [this](BasicBlock *BB) { 6109 return Legal->blockNeedsPredication(BB); 6110 })); 6111 bool ScalarInterleavingRequiresRuntimePointerCheck = 6112 (VF.isScalar() && Legal->getRuntimePointerChecking()->Need); 6113 6114 // We want to interleave small loops in order to reduce the loop overhead and 6115 // potentially expose ILP opportunities. 6116 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n' 6117 << "LV: IC is " << IC << '\n' 6118 << "LV: VF is " << VF << '\n'); 6119 const bool AggressivelyInterleaveReductions = 6120 TTI.enableAggressiveInterleaving(HasReductions); 6121 if (!ScalarInterleavingRequiresRuntimePointerCheck && 6122 !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) { 6123 // We assume that the cost overhead is 1 and we use the cost model 6124 // to estimate the cost of the loop and interleave until the cost of the 6125 // loop overhead is about 5% of the cost of the loop. 6126 unsigned SmallIC = 6127 std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost)); 6128 6129 // Interleave until store/load ports (estimated by max interleave count) are 6130 // saturated. 6131 unsigned NumStores = Legal->getNumStores(); 6132 unsigned NumLoads = Legal->getNumLoads(); 6133 unsigned StoresIC = IC / (NumStores ? NumStores : 1); 6134 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1); 6135 6136 // There is little point in interleaving for reductions containing selects 6137 // and compares when VF=1 since it may just create more overhead than it's 6138 // worth for loops with small trip counts. This is because we still have to 6139 // do the final reduction after the loop. 6140 bool HasSelectCmpReductions = 6141 HasReductions && 6142 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 6143 const RecurrenceDescriptor &RdxDesc = Reduction.second; 6144 return RecurrenceDescriptor::isSelectCmpRecurrenceKind( 6145 RdxDesc.getRecurrenceKind()); 6146 }); 6147 if (HasSelectCmpReductions) { 6148 LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n"); 6149 return 1; 6150 } 6151 6152 // If we have a scalar reduction (vector reductions are already dealt with 6153 // by this point), we can increase the critical path length if the loop 6154 // we're interleaving is inside another loop. For tree-wise reductions 6155 // set the limit to 2, and for ordered reductions it's best to disable 6156 // interleaving entirely. 
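// (Rationale: each interleaved copy keeps its own partial result, and those
// partial results have to be combined again on every outer-loop iteration;
// ordered FP reductions cannot be split into independent partial results at
// all, so interleaving them gains nothing.)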
6157 if (HasReductions && TheLoop->getLoopDepth() > 1) { 6158 bool HasOrderedReductions = 6159 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 6160 const RecurrenceDescriptor &RdxDesc = Reduction.second; 6161 return RdxDesc.isOrdered(); 6162 }); 6163 if (HasOrderedReductions) { 6164 LLVM_DEBUG( 6165 dbgs() << "LV: Not interleaving scalar ordered reductions.\n"); 6166 return 1; 6167 } 6168 6169 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC); 6170 SmallIC = std::min(SmallIC, F); 6171 StoresIC = std::min(StoresIC, F); 6172 LoadsIC = std::min(LoadsIC, F); 6173 } 6174 6175 if (EnableLoadStoreRuntimeInterleave && 6176 std::max(StoresIC, LoadsIC) > SmallIC) { 6177 LLVM_DEBUG( 6178 dbgs() << "LV: Interleaving to saturate store or load ports.\n"); 6179 return std::max(StoresIC, LoadsIC); 6180 } 6181 6182 // If there are scalar reductions and TTI has enabled aggressive 6183 // interleaving for reductions, we will interleave to expose ILP. 6184 if (InterleaveSmallLoopScalarReduction && VF.isScalar() && 6185 AggressivelyInterleaveReductions) { 6186 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 6187 // Interleave no less than SmallIC but not as aggressive as the normal IC 6188 // to satisfy the rare situation when resources are too limited. 6189 return std::max(IC / 2, SmallIC); 6190 } else { 6191 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n"); 6192 return SmallIC; 6193 } 6194 } 6195 6196 // Interleave if this is a large loop (small loops are already dealt with by 6197 // this point) that could benefit from interleaving. 6198 if (AggressivelyInterleaveReductions) { 6199 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 6200 return IC; 6201 } 6202 6203 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n"); 6204 return 1; 6205 } 6206 6207 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8> 6208 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) { 6209 // This function calculates the register usage by measuring the highest number 6210 // of values that are alive at a single location. Obviously, this is a very 6211 // rough estimation. We scan the loop in a topological order in order and 6212 // assign a number to each instruction. We use RPO to ensure that defs are 6213 // met before their users. We assume that each instruction that has in-loop 6214 // users starts an interval. We record every time that an in-loop value is 6215 // used, so we have a list of the first and last occurrences of each 6216 // instruction. Next, we transpose this data structure into a multi map that 6217 // holds the list of intervals that *end* at a specific location. This multi 6218 // map allows us to perform a linear search. We scan the instructions linearly 6219 // and record each time that a new interval starts, by placing it in a set. 6220 // If we find this value in the multi-map then we remove it from the set. 6221 // The max register usage is the maximum size of the set. 6222 // We also search for instructions that are defined outside the loop, but are 6223 // used inside the loop. We need this number separately from the max-interval 6224 // usage number because when we unroll, loop-invariant values do not take 6225 // more register. 6226 LoopBlocksDFS DFS(TheLoop); 6227 DFS.perform(LI); 6228 6229 RegisterUsage RU; 6230 6231 // Each 'key' in the map opens a new interval. The values 6232 // of the map are the index of the 'last seen' usage of the 6233 // instruction that is the key. 
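// Small illustrative example: for a body such as
//   %a = load ...; %b = add %a, 1; %c = mul %a, %b; store %c, ...
// both %a and %b are last used by %c, so when %c is visited two values are
// live at once and the maximum usage recorded for VF=1 is 2 registers
// (loop-invariant operands are accounted for separately).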
6234 using IntervalMap = DenseMap<Instruction *, unsigned>; 6235 6236 // Maps instruction to its index. 6237 SmallVector<Instruction *, 64> IdxToInstr; 6238 // Marks the end of each interval. 6239 IntervalMap EndPoint; 6240 // Saves the list of instruction indices that are used in the loop. 6241 SmallPtrSet<Instruction *, 8> Ends; 6242 // Saves the list of values that are used in the loop but are 6243 // defined outside the loop, such as arguments and constants. 6244 SmallPtrSet<Value *, 8> LoopInvariants; 6245 6246 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 6247 for (Instruction &I : BB->instructionsWithoutDebug()) { 6248 IdxToInstr.push_back(&I); 6249 6250 // Save the end location of each USE. 6251 for (Value *U : I.operands()) { 6252 auto *Instr = dyn_cast<Instruction>(U); 6253 6254 // Ignore non-instruction values such as arguments, constants, etc. 6255 if (!Instr) 6256 continue; 6257 6258 // If this instruction is outside the loop then record it and continue. 6259 if (!TheLoop->contains(Instr)) { 6260 LoopInvariants.insert(Instr); 6261 continue; 6262 } 6263 6264 // Overwrite previous end points. 6265 EndPoint[Instr] = IdxToInstr.size(); 6266 Ends.insert(Instr); 6267 } 6268 } 6269 } 6270 6271 // Saves the list of intervals that end with the index in 'key'. 6272 using InstrList = SmallVector<Instruction *, 2>; 6273 DenseMap<unsigned, InstrList> TransposeEnds; 6274 6275 // Transpose the EndPoints to a list of values that end at each index. 6276 for (auto &Interval : EndPoint) 6277 TransposeEnds[Interval.second].push_back(Interval.first); 6278 6279 SmallPtrSet<Instruction *, 8> OpenIntervals; 6280 SmallVector<RegisterUsage, 8> RUs(VFs.size()); 6281 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size()); 6282 6283 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); 6284 6285 // A lambda that gets the register usage for the given type and VF. 6286 const auto &TTICapture = TTI; 6287 auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned { 6288 if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty)) 6289 return 0; 6290 InstructionCost::CostType RegUsage = 6291 *TTICapture.getRegUsageForType(VectorType::get(Ty, VF)).getValue(); 6292 assert(RegUsage >= 0 && RegUsage <= std::numeric_limits<unsigned>::max() && 6293 "Nonsensical values for register usage."); 6294 return RegUsage; 6295 }; 6296 6297 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { 6298 Instruction *I = IdxToInstr[i]; 6299 6300 // Remove all of the instructions that end at this location. 6301 InstrList &List = TransposeEnds[i]; 6302 for (Instruction *ToRemove : List) 6303 OpenIntervals.erase(ToRemove); 6304 6305 // Ignore instructions that are never used within the loop. 6306 if (!Ends.count(I)) 6307 continue; 6308 6309 // Skip ignored values. 6310 if (ValuesToIgnore.count(I)) 6311 continue; 6312 6313 // For each VF find the maximum usage of registers. 6314 for (unsigned j = 0, e = VFs.size(); j < e; ++j) { 6315 // Count the number of live intervals. 6316 SmallMapVector<unsigned, unsigned, 4> RegUsage; 6317 6318 if (VFs[j].isScalar()) { 6319 for (auto Inst : OpenIntervals) { 6320 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6321 if (RegUsage.find(ClassID) == RegUsage.end()) 6322 RegUsage[ClassID] = 1; 6323 else 6324 RegUsage[ClassID] += 1; 6325 } 6326 } else { 6327 collectUniformsAndScalars(VFs[j]); 6328 for (auto Inst : OpenIntervals) { 6329 // Skip ignored values for VF > 1. 
6330 if (VecValuesToIgnore.count(Inst)) 6331 continue; 6332 if (isScalarAfterVectorization(Inst, VFs[j])) { 6333 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6334 if (RegUsage.find(ClassID) == RegUsage.end()) 6335 RegUsage[ClassID] = 1; 6336 else 6337 RegUsage[ClassID] += 1; 6338 } else { 6339 unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType()); 6340 if (RegUsage.find(ClassID) == RegUsage.end()) 6341 RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]); 6342 else 6343 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); 6344 } 6345 } 6346 } 6347 6348 for (auto& pair : RegUsage) { 6349 if (MaxUsages[j].find(pair.first) != MaxUsages[j].end()) 6350 MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second); 6351 else 6352 MaxUsages[j][pair.first] = pair.second; 6353 } 6354 } 6355 6356 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " 6357 << OpenIntervals.size() << '\n'); 6358 6359 // Add the current instruction to the list of open intervals. 6360 OpenIntervals.insert(I); 6361 } 6362 6363 for (unsigned i = 0, e = VFs.size(); i < e; ++i) { 6364 SmallMapVector<unsigned, unsigned, 4> Invariant; 6365 6366 for (auto Inst : LoopInvariants) { 6367 unsigned Usage = 6368 VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]); 6369 unsigned ClassID = 6370 TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType()); 6371 if (Invariant.find(ClassID) == Invariant.end()) 6372 Invariant[ClassID] = Usage; 6373 else 6374 Invariant[ClassID] += Usage; 6375 } 6376 6377 LLVM_DEBUG({ 6378 dbgs() << "LV(REG): VF = " << VFs[i] << '\n'; 6379 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size() 6380 << " item\n"; 6381 for (const auto &pair : MaxUsages[i]) { 6382 dbgs() << "LV(REG): RegisterClass: " 6383 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6384 << " registers\n"; 6385 } 6386 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size() 6387 << " item\n"; 6388 for (const auto &pair : Invariant) { 6389 dbgs() << "LV(REG): RegisterClass: " 6390 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6391 << " registers\n"; 6392 } 6393 }); 6394 6395 RU.LoopInvariantRegs = Invariant; 6396 RU.MaxLocalUsers = MaxUsages[i]; 6397 RUs[i] = RU; 6398 } 6399 6400 return RUs; 6401 } 6402 6403 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I, 6404 ElementCount VF) { 6405 // TODO: Cost model for emulated masked load/store is completely 6406 // broken. This hack guides the cost model to use an artificially 6407 // high enough value to practically disable vectorization with such 6408 // operations, except where previously deployed legality hack allowed 6409 // using very low cost values. This is to avoid regressions coming simply 6410 // from moving "masked load/store" check from legality to cost model. 6411 // Masked Load/Gather emulation was previously never allowed. 6412 // Limited number of Masked Store/Scatter emulation was allowed. 6413 assert(isPredicatedInst(I, VF) && "Expecting a scalar emulated instruction"); 6414 return isa<LoadInst>(I) || 6415 (isa<StoreInst>(I) && 6416 NumPredStores > NumberOfStoresToPredicate); 6417 } 6418 6419 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) { 6420 // If we aren't vectorizing the loop, or if we've already collected the 6421 // instructions to scalarize, there's nothing to do. 
Collection may already
6422 // have occurred if we have a user-selected VF and are now computing the
6423 // expected cost for interleaving.
6424 if (VF.isScalar() || VF.isZero() ||
6425 InstsToScalarize.find(VF) != InstsToScalarize.end())
6426 return;
6427 
6428 // Initialize a mapping for VF in InstsToScalarize. If we find that it's
6429 // not profitable to scalarize any instructions, the presence of VF in the
6430 // map will indicate that we've analyzed it already.
6431 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
6432 
6433 // Find all the instructions that are scalar with predication in the loop and
6434 // determine if it would be better to not if-convert the blocks they are in.
6435 // If so, we also record the instructions to scalarize.
6436 for (BasicBlock *BB : TheLoop->blocks()) {
6437 if (!blockNeedsPredicationForAnyReason(BB))
6438 continue;
6439 for (Instruction &I : *BB)
6440 if (isScalarWithPredication(&I, VF)) {
6441 ScalarCostsTy ScalarCosts;
6442 // Do not apply the discount if scalable, because that would lead to
6443 // invalid scalarization costs.
6444 // Do not apply the discount logic if a hacked cost is needed
6445 // for emulated masked memrefs.
6446 if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I, VF) &&
6447 computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
6448 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
6449 // Remember that BB will remain after vectorization.
6450 PredicatedBBsAfterVectorization.insert(BB);
6451 }
6452 }
6453 }
6454 
6455 int LoopVectorizationCostModel::computePredInstDiscount(
6456 Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
6457 assert(!isUniformAfterVectorization(PredInst, VF) &&
6458 "Instruction marked uniform-after-vectorization will be predicated");
6459 
6460 // Initialize the discount to zero, meaning that the scalar version and the
6461 // vector version cost the same.
6462 InstructionCost Discount = 0;
6463 
6464 // Holds instructions to analyze. The instructions we visit are mapped in
6465 // ScalarCosts. Those instructions are the ones that would be scalarized if
6466 // we find that the scalar version costs less.
6467 SmallVector<Instruction *, 8> Worklist;
6468 
6469 // Returns true if the given instruction can be scalarized.
6470 auto canBeScalarized = [&](Instruction *I) -> bool {
6471 // We only attempt to scalarize instructions forming a single-use chain
6472 // from the original predicated block that would otherwise be vectorized.
6473 // Although not strictly necessary, we give up on instructions we know will
6474 // already be scalar to avoid traversing chains that are unlikely to be
6475 // beneficial.
6476 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
6477 isScalarAfterVectorization(I, VF))
6478 return false;
6479 
6480 // If the instruction is scalar with predication, it will be analyzed
6481 // separately. We ignore it within the context of PredInst.
6482 if (isScalarWithPredication(I, VF))
6483 return false;
6484 
6485 // If any of the instruction's operands are uniform after vectorization,
6486 // the instruction cannot be scalarized. This prevents, for example, a
6487 // masked load from being scalarized.
6488 //
6489 // We assume we will only emit a value for lane zero of an instruction
6490 // marked uniform after vectorization, rather than VF identical values.
6491 // Thus, if we scalarize an instruction that uses a uniform, we would
6492 // create uses of values corresponding to the lanes we aren't emitting code
6493 // for.
This behavior can be changed by allowing getScalarValue to clone 6494 // the lane zero values for uniforms rather than asserting. 6495 for (Use &U : I->operands()) 6496 if (auto *J = dyn_cast<Instruction>(U.get())) 6497 if (isUniformAfterVectorization(J, VF)) 6498 return false; 6499 6500 // Otherwise, we can scalarize the instruction. 6501 return true; 6502 }; 6503 6504 // Compute the expected cost discount from scalarizing the entire expression 6505 // feeding the predicated instruction. We currently only consider expressions 6506 // that are single-use instruction chains. 6507 Worklist.push_back(PredInst); 6508 while (!Worklist.empty()) { 6509 Instruction *I = Worklist.pop_back_val(); 6510 6511 // If we've already analyzed the instruction, there's nothing to do. 6512 if (ScalarCosts.find(I) != ScalarCosts.end()) 6513 continue; 6514 6515 // Compute the cost of the vector instruction. Note that this cost already 6516 // includes the scalarization overhead of the predicated instruction. 6517 InstructionCost VectorCost = getInstructionCost(I, VF).first; 6518 6519 // Compute the cost of the scalarized instruction. This cost is the cost of 6520 // the instruction as if it wasn't if-converted and instead remained in the 6521 // predicated block. We will scale this cost by block probability after 6522 // computing the scalarization overhead. 6523 InstructionCost ScalarCost = 6524 VF.getFixedValue() * 6525 getInstructionCost(I, ElementCount::getFixed(1)).first; 6526 6527 // Compute the scalarization overhead of needed insertelement instructions 6528 // and phi nodes. 6529 if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) { 6530 ScalarCost += TTI.getScalarizationOverhead( 6531 cast<VectorType>(ToVectorTy(I->getType(), VF)), 6532 APInt::getAllOnes(VF.getFixedValue()), true, false); 6533 ScalarCost += 6534 VF.getFixedValue() * 6535 TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput); 6536 } 6537 6538 // Compute the scalarization overhead of needed extractelement 6539 // instructions. For each of the instruction's operands, if the operand can 6540 // be scalarized, add it to the worklist; otherwise, account for the 6541 // overhead. 6542 for (Use &U : I->operands()) 6543 if (auto *J = dyn_cast<Instruction>(U.get())) { 6544 assert(VectorType::isValidElementType(J->getType()) && 6545 "Instruction has non-scalar type"); 6546 if (canBeScalarized(J)) 6547 Worklist.push_back(J); 6548 else if (needsExtract(J, VF)) { 6549 ScalarCost += TTI.getScalarizationOverhead( 6550 cast<VectorType>(ToVectorTy(J->getType(), VF)), 6551 APInt::getAllOnes(VF.getFixedValue()), false, true); 6552 } 6553 } 6554 6555 // Scale the total scalar cost by block probability. 6556 ScalarCost /= getReciprocalPredBlockProb(); 6557 6558 // Compute the discount. A non-negative discount means the vector version 6559 // of the instruction costs more, and scalarizing would be beneficial. 6560 Discount += VectorCost - ScalarCost; 6561 ScalarCosts[I] = ScalarCost; 6562 } 6563 6564 return *Discount.getValue(); 6565 } 6566 6567 LoopVectorizationCostModel::VectorizationCostTy 6568 LoopVectorizationCostModel::expectedCost( 6569 ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) { 6570 VectorizationCostTy Cost; 6571 6572 // For each block. 6573 for (BasicBlock *BB : TheLoop->blocks()) { 6574 VectorizationCostTy BlockCost; 6575 6576 // For each instruction in the old loop. 6577 for (Instruction &I : BB->instructionsWithoutDebug()) { 6578 // Skip ignored values. 
6579 if (ValuesToIgnore.count(&I) || 6580 (VF.isVector() && VecValuesToIgnore.count(&I))) 6581 continue; 6582 6583 VectorizationCostTy C = getInstructionCost(&I, VF); 6584 6585 // Check if we should override the cost. 6586 if (C.first.isValid() && 6587 ForceTargetInstructionCost.getNumOccurrences() > 0) 6588 C.first = InstructionCost(ForceTargetInstructionCost); 6589 6590 // Keep a list of instructions with invalid costs. 6591 if (Invalid && !C.first.isValid()) 6592 Invalid->emplace_back(&I, VF); 6593 6594 BlockCost.first += C.first; 6595 BlockCost.second |= C.second; 6596 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first 6597 << " for VF " << VF << " For instruction: " << I 6598 << '\n'); 6599 } 6600 6601 // If we are vectorizing a predicated block, it will have been 6602 // if-converted. This means that the block's instructions (aside from 6603 // stores and instructions that may divide by zero) will now be 6604 // unconditionally executed. For the scalar case, we may not always execute 6605 // the predicated block, if it is an if-else block. Thus, scale the block's 6606 // cost by the probability of executing it. blockNeedsPredication from 6607 // Legal is used so as to not include all blocks in tail folded loops. 6608 if (VF.isScalar() && Legal->blockNeedsPredication(BB)) 6609 BlockCost.first /= getReciprocalPredBlockProb(); 6610 6611 Cost.first += BlockCost.first; 6612 Cost.second |= BlockCost.second; 6613 } 6614 6615 return Cost; 6616 } 6617 6618 /// Gets Address Access SCEV after verifying that the access pattern 6619 /// is loop invariant except the induction variable dependence. 6620 /// 6621 /// This SCEV can be sent to the Target in order to estimate the address 6622 /// calculation cost. 6623 static const SCEV *getAddressAccessSCEV( 6624 Value *Ptr, 6625 LoopVectorizationLegality *Legal, 6626 PredicatedScalarEvolution &PSE, 6627 const Loop *TheLoop) { 6628 6629 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); 6630 if (!Gep) 6631 return nullptr; 6632 6633 // We are looking for a gep with all loop invariant indices except for one 6634 // which should be an induction variable. 6635 auto SE = PSE.getSE(); 6636 unsigned NumOperands = Gep->getNumOperands(); 6637 for (unsigned i = 1; i < NumOperands; ++i) { 6638 Value *Opd = Gep->getOperand(i); 6639 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 6640 !Legal->isInductionVariable(Opd)) 6641 return nullptr; 6642 } 6643 6644 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. 6645 return PSE.getSCEV(Ptr); 6646 } 6647 6648 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { 6649 return Legal->hasStride(I->getOperand(0)) || 6650 Legal->hasStride(I->getOperand(1)); 6651 } 6652 6653 InstructionCost 6654 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, 6655 ElementCount VF) { 6656 assert(VF.isVector() && 6657 "Scalarization cost of instruction implies vectorization."); 6658 if (VF.isScalable()) 6659 return InstructionCost::getInvalid(); 6660 6661 Type *ValTy = getLoadStoreType(I); 6662 auto SE = PSE.getSE(); 6663 6664 unsigned AS = getLoadStoreAddressSpace(I); 6665 Value *Ptr = getLoadStorePointerOperand(I); 6666 Type *PtrTy = ToVectorTy(Ptr->getType(), VF); 6667 // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost` 6668 // that it is being called from this specific place. 
6669 6670 // Figure out whether the access is strided and get the stride value 6671 // if it's known in compile time 6672 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); 6673 6674 // Get the cost of the scalar memory instruction and address computation. 6675 InstructionCost Cost = 6676 VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); 6677 6678 // Don't pass *I here, since it is scalar but will actually be part of a 6679 // vectorized loop where the user of it is a vectorized instruction. 6680 const Align Alignment = getLoadStoreAlignment(I); 6681 Cost += VF.getKnownMinValue() * 6682 TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, 6683 AS, TTI::TCK_RecipThroughput); 6684 6685 // Get the overhead of the extractelement and insertelement instructions 6686 // we might create due to scalarization. 6687 Cost += getScalarizationOverhead(I, VF); 6688 6689 // If we have a predicated load/store, it will need extra i1 extracts and 6690 // conditional branches, but may not be executed for each vector lane. Scale 6691 // the cost by the probability of executing the predicated block. 6692 if (isPredicatedInst(I, VF)) { 6693 Cost /= getReciprocalPredBlockProb(); 6694 6695 // Add the cost of an i1 extract and a branch 6696 auto *Vec_i1Ty = 6697 VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF); 6698 Cost += TTI.getScalarizationOverhead( 6699 Vec_i1Ty, APInt::getAllOnes(VF.getKnownMinValue()), 6700 /*Insert=*/false, /*Extract=*/true); 6701 Cost += TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput); 6702 6703 if (useEmulatedMaskMemRefHack(I, VF)) 6704 // Artificially setting to a high enough value to practically disable 6705 // vectorization with such operations. 6706 Cost = 3000000; 6707 } 6708 6709 return Cost; 6710 } 6711 6712 InstructionCost 6713 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, 6714 ElementCount VF) { 6715 Type *ValTy = getLoadStoreType(I); 6716 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6717 Value *Ptr = getLoadStorePointerOperand(I); 6718 unsigned AS = getLoadStoreAddressSpace(I); 6719 int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr); 6720 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6721 6722 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6723 "Stride should be 1 or -1 for consecutive memory access"); 6724 const Align Alignment = getLoadStoreAlignment(I); 6725 InstructionCost Cost = 0; 6726 if (Legal->isMaskRequired(I)) 6727 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6728 CostKind); 6729 else 6730 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6731 CostKind, I); 6732 6733 bool Reverse = ConsecutiveStride < 0; 6734 if (Reverse) 6735 Cost += 6736 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0); 6737 return Cost; 6738 } 6739 6740 InstructionCost 6741 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, 6742 ElementCount VF) { 6743 assert(Legal->isUniformMemOp(*I)); 6744 6745 Type *ValTy = getLoadStoreType(I); 6746 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6747 const Align Alignment = getLoadStoreAlignment(I); 6748 unsigned AS = getLoadStoreAddressSpace(I); 6749 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6750 if (isa<LoadInst>(I)) { 6751 return TTI.getAddressComputationCost(ValTy) + 6752 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS, 6753 CostKind) + 6754 
TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); 6755 } 6756 StoreInst *SI = cast<StoreInst>(I); 6757 6758 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); 6759 return TTI.getAddressComputationCost(ValTy) + 6760 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, 6761 CostKind) + 6762 (isLoopInvariantStoreValue 6763 ? 0 6764 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, 6765 VF.getKnownMinValue() - 1)); 6766 } 6767 6768 InstructionCost 6769 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, 6770 ElementCount VF) { 6771 Type *ValTy = getLoadStoreType(I); 6772 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6773 const Align Alignment = getLoadStoreAlignment(I); 6774 const Value *Ptr = getLoadStorePointerOperand(I); 6775 6776 return TTI.getAddressComputationCost(VectorTy) + 6777 TTI.getGatherScatterOpCost( 6778 I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment, 6779 TargetTransformInfo::TCK_RecipThroughput, I); 6780 } 6781 6782 InstructionCost 6783 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, 6784 ElementCount VF) { 6785 // TODO: Once we have support for interleaving with scalable vectors 6786 // we can calculate the cost properly here. 6787 if (VF.isScalable()) 6788 return InstructionCost::getInvalid(); 6789 6790 Type *ValTy = getLoadStoreType(I); 6791 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6792 unsigned AS = getLoadStoreAddressSpace(I); 6793 6794 auto Group = getInterleavedAccessGroup(I); 6795 assert(Group && "Fail to get an interleaved access group."); 6796 6797 unsigned InterleaveFactor = Group->getFactor(); 6798 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); 6799 6800 // Holds the indices of existing members in the interleaved group. 6801 SmallVector<unsigned, 4> Indices; 6802 for (unsigned IF = 0; IF < InterleaveFactor; IF++) 6803 if (Group->getMember(IF)) 6804 Indices.push_back(IF); 6805 6806 // Calculate the cost of the whole interleaved group. 6807 bool UseMaskForGaps = 6808 (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) || 6809 (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor())); 6810 InstructionCost Cost = TTI.getInterleavedMemoryOpCost( 6811 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(), 6812 AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps); 6813 6814 if (Group->isReverse()) { 6815 // TODO: Add support for reversed masked interleaved access. 6816 assert(!Legal->isMaskRequired(I) && 6817 "Reverse masked interleaved access not supported."); 6818 Cost += 6819 Group->getNumMembers() * 6820 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0); 6821 } 6822 return Cost; 6823 } 6824 6825 Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost( 6826 Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) { 6827 using namespace llvm::PatternMatch; 6828 // Early exit for no inloop reductions 6829 if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty)) 6830 return None; 6831 auto *VectorTy = cast<VectorType>(Ty); 6832 6833 // We are looking for a pattern of, and finding the minimal acceptable cost: 6834 // reduce(mul(ext(A), ext(B))) or 6835 // reduce(mul(A, B)) or 6836 // reduce(ext(A)) or 6837 // reduce(A). 6838 // The basic idea is that we walk down the tree to do that, finding the root 6839 // reduction instruction in InLoopReductionImmediateChains. 
From there we find
6840 // the pattern of mul/ext and test the cost of the entire pattern vs the cost
6841 // of the components. If the reduction cost is lower, we return it for the
6842 // reduction instruction and 0 for the other instructions in the pattern. If
6843 // it is not, we return None, specifying that the original cost method
6844 // should be used.
6845 Instruction *RetI = I;
6846 if (match(RetI, m_ZExtOrSExt(m_Value()))) {
6847 if (!RetI->hasOneUser())
6848 return None;
6849 RetI = RetI->user_back();
6850 }
6851 if (match(RetI, m_Mul(m_Value(), m_Value())) &&
6852 RetI->user_back()->getOpcode() == Instruction::Add) {
6853 if (!RetI->hasOneUser())
6854 return None;
6855 RetI = RetI->user_back();
6856 }
6857
6858 // Test if the found instruction is a reduction, and if not return None so
6859 // that the parent uses the original cost modelling.
6860 if (!InLoopReductionImmediateChains.count(RetI))
6861 return None;
6862
6863 // Find the reduction this chain is a part of and calculate the basic cost of
6864 // the reduction on its own.
6865 Instruction *LastChain = InLoopReductionImmediateChains[RetI];
6866 Instruction *ReductionPhi = LastChain;
6867 while (!isa<PHINode>(ReductionPhi))
6868 ReductionPhi = InLoopReductionImmediateChains[ReductionPhi];
6869
6870 const RecurrenceDescriptor &RdxDesc =
6871 Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second;
6872
6873 InstructionCost BaseCost = TTI.getArithmeticReductionCost(
6874 RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind);
6875
6876 // For a call to the llvm.fmuladd intrinsic we need to add the cost of a
6877 // normal fmul instruction to the cost of the fadd reduction.
6878 if (RdxDesc.getRecurrenceKind() == RecurKind::FMulAdd)
6879 BaseCost +=
6880 TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind);
6881
6882 // If we're using ordered reductions then we can just return the base cost
6883 // here, since getArithmeticReductionCost calculates the full ordered
6884 // reduction cost when FP reassociation is not allowed.
6885 if (useOrderedReductions(RdxDesc))
6886 return BaseCost;
6887
6888 // Get the operand that was not the reduction chain and match it to one of the
6889 // patterns, returning the better cost if it is found.
6890 Instruction *RedOp = RetI->getOperand(1) == LastChain
6891 ? dyn_cast<Instruction>(RetI->getOperand(0))
6892 : dyn_cast<Instruction>(RetI->getOperand(1));
6893
6894 VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy);
6895
6896 Instruction *Op0, *Op1;
6897 if (RedOp &&
6898 match(RedOp,
6899 m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) &&
6900 match(Op0, m_ZExtOrSExt(m_Value())) &&
6901 Op0->getOpcode() == Op1->getOpcode() &&
6902 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
6903 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) &&
6904 (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {
6905
6906 // Matched reduce(ext(mul(ext(A), ext(B))))
6907 // Note that the extend opcodes need to all match, or if A==B they will have
6908 // been converted to zext(mul(sext(A), sext(A))) as it is known positive,
6909 // which is equally fine.
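// For example (illustrative only), with i8 elements A and B sign-extended to
// i16, multiplied as i16, and the product sign-extended into an i32
// accumulator, a target with a dot-product style reduction can report a
// cheaper cost for the whole pattern than for the separate ext/mul/add
// instructions.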
6910 bool IsUnsigned = isa<ZExtInst>(Op0); 6911 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy); 6912 auto *MulType = VectorType::get(Op0->getType(), VectorTy); 6913 6914 InstructionCost ExtCost = 6915 TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType, 6916 TTI::CastContextHint::None, CostKind, Op0); 6917 InstructionCost MulCost = 6918 TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind); 6919 InstructionCost Ext2Cost = 6920 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType, 6921 TTI::CastContextHint::None, CostKind, RedOp); 6922 6923 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 6924 /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 6925 CostKind); 6926 6927 if (RedCost.isValid() && 6928 RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost) 6929 return I == RetI ? RedCost : 0; 6930 } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) && 6931 !TheLoop->isLoopInvariant(RedOp)) { 6932 // Matched reduce(ext(A)) 6933 bool IsUnsigned = isa<ZExtInst>(RedOp); 6934 auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy); 6935 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 6936 /*IsMLA=*/false, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 6937 CostKind); 6938 6939 InstructionCost ExtCost = 6940 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType, 6941 TTI::CastContextHint::None, CostKind, RedOp); 6942 if (RedCost.isValid() && RedCost < BaseCost + ExtCost) 6943 return I == RetI ? RedCost : 0; 6944 } else if (RedOp && 6945 match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) { 6946 if (match(Op0, m_ZExtOrSExt(m_Value())) && 6947 Op0->getOpcode() == Op1->getOpcode() && 6948 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) { 6949 bool IsUnsigned = isa<ZExtInst>(Op0); 6950 Type *Op0Ty = Op0->getOperand(0)->getType(); 6951 Type *Op1Ty = Op1->getOperand(0)->getType(); 6952 Type *LargestOpTy = 6953 Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty 6954 : Op0Ty; 6955 auto *ExtType = VectorType::get(LargestOpTy, VectorTy); 6956 6957 // Matched reduce(mul(ext(A), ext(B))), where the two ext may be of 6958 // different sizes. We take the largest type as the ext to reduce, and add 6959 // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))). 6960 InstructionCost ExtCost0 = TTI.getCastInstrCost( 6961 Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy), 6962 TTI::CastContextHint::None, CostKind, Op0); 6963 InstructionCost ExtCost1 = TTI.getCastInstrCost( 6964 Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy), 6965 TTI::CastContextHint::None, CostKind, Op1); 6966 InstructionCost MulCost = 6967 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 6968 6969 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 6970 /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 6971 CostKind); 6972 InstructionCost ExtraExtCost = 0; 6973 if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) { 6974 Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1; 6975 ExtraExtCost = TTI.getCastInstrCost( 6976 ExtraExtOp->getOpcode(), ExtType, 6977 VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy), 6978 TTI::CastContextHint::None, CostKind, ExtraExtOp); 6979 } 6980 6981 if (RedCost.isValid() && 6982 (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost)) 6983 return I == RetI ? 
RedCost : 0; 6984 } else if (!match(I, m_ZExtOrSExt(m_Value()))) { 6985 // Matched reduce(mul()) 6986 InstructionCost MulCost = 6987 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 6988 6989 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 6990 /*IsMLA=*/true, true, RdxDesc.getRecurrenceType(), VectorTy, 6991 CostKind); 6992 6993 if (RedCost.isValid() && RedCost < MulCost + BaseCost) 6994 return I == RetI ? RedCost : 0; 6995 } 6996 } 6997 6998 return I == RetI ? Optional<InstructionCost>(BaseCost) : None; 6999 } 7000 7001 InstructionCost 7002 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, 7003 ElementCount VF) { 7004 // Calculate scalar cost only. Vectorization cost should be ready at this 7005 // moment. 7006 if (VF.isScalar()) { 7007 Type *ValTy = getLoadStoreType(I); 7008 const Align Alignment = getLoadStoreAlignment(I); 7009 unsigned AS = getLoadStoreAddressSpace(I); 7010 7011 return TTI.getAddressComputationCost(ValTy) + 7012 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, 7013 TTI::TCK_RecipThroughput, I); 7014 } 7015 return getWideningCost(I, VF); 7016 } 7017 7018 LoopVectorizationCostModel::VectorizationCostTy 7019 LoopVectorizationCostModel::getInstructionCost(Instruction *I, 7020 ElementCount VF) { 7021 // If we know that this instruction will remain uniform, check the cost of 7022 // the scalar version. 7023 if (isUniformAfterVectorization(I, VF)) 7024 VF = ElementCount::getFixed(1); 7025 7026 if (VF.isVector() && isProfitableToScalarize(I, VF)) 7027 return VectorizationCostTy(InstsToScalarize[VF][I], false); 7028 7029 // Forced scalars do not have any scalarization overhead. 7030 auto ForcedScalar = ForcedScalars.find(VF); 7031 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) { 7032 auto InstSet = ForcedScalar->second; 7033 if (InstSet.count(I)) 7034 return VectorizationCostTy( 7035 (getInstructionCost(I, ElementCount::getFixed(1)).first * 7036 VF.getKnownMinValue()), 7037 false); 7038 } 7039 7040 Type *VectorTy; 7041 InstructionCost C = getInstructionCost(I, VF, VectorTy); 7042 7043 bool TypeNotScalarized = false; 7044 if (VF.isVector() && VectorTy->isVectorTy()) { 7045 unsigned NumParts = TTI.getNumberOfParts(VectorTy); 7046 if (NumParts) 7047 TypeNotScalarized = NumParts < VF.getKnownMinValue(); 7048 else 7049 C = InstructionCost::getInvalid(); 7050 } 7051 return VectorizationCostTy(C, TypeNotScalarized); 7052 } 7053 7054 InstructionCost 7055 LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, 7056 ElementCount VF) const { 7057 7058 // There is no mechanism yet to create a scalable scalarization loop, 7059 // so this is currently Invalid. 7060 if (VF.isScalable()) 7061 return InstructionCost::getInvalid(); 7062 7063 if (VF.isScalar()) 7064 return 0; 7065 7066 InstructionCost Cost = 0; 7067 Type *RetTy = ToVectorTy(I->getType(), VF); 7068 if (!RetTy->isVoidTy() && 7069 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) 7070 Cost += TTI.getScalarizationOverhead( 7071 cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()), true, 7072 false); 7073 7074 // Some targets keep addresses scalar. 7075 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) 7076 return Cost; 7077 7078 // Some targets support efficient element stores. 7079 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore()) 7080 return Cost; 7081 7082 // Collect operands to consider. 7083 CallInst *CI = dyn_cast<CallInst>(I); 7084 Instruction::op_range Ops = CI ? 
CI->args() : I->operands(); 7085 7086 // Skip operands that do not require extraction/scalarization and do not incur 7087 // any overhead. 7088 SmallVector<Type *> Tys; 7089 for (auto *V : filterExtractingOperands(Ops, VF)) 7090 Tys.push_back(MaybeVectorizeType(V->getType(), VF)); 7091 return Cost + TTI.getOperandsScalarizationOverhead( 7092 filterExtractingOperands(Ops, VF), Tys); 7093 } 7094 7095 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { 7096 if (VF.isScalar()) 7097 return; 7098 NumPredStores = 0; 7099 for (BasicBlock *BB : TheLoop->blocks()) { 7100 // For each instruction in the old loop. 7101 for (Instruction &I : *BB) { 7102 Value *Ptr = getLoadStorePointerOperand(&I); 7103 if (!Ptr) 7104 continue; 7105 7106 // TODO: We should generate better code and update the cost model for 7107 // predicated uniform stores. Today they are treated as any other 7108 // predicated store (see added test cases in 7109 // invariant-store-vectorization.ll). 7110 if (isa<StoreInst>(&I) && isScalarWithPredication(&I, VF)) 7111 NumPredStores++; 7112 7113 if (Legal->isUniformMemOp(I)) { 7114 // TODO: Avoid replicating loads and stores instead of 7115 // relying on instcombine to remove them. 7116 // Load: Scalar load + broadcast 7117 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract 7118 InstructionCost Cost; 7119 if (isa<StoreInst>(&I) && VF.isScalable() && 7120 isLegalGatherOrScatter(&I, VF)) { 7121 Cost = getGatherScatterCost(&I, VF); 7122 setWideningDecision(&I, VF, CM_GatherScatter, Cost); 7123 } else { 7124 assert((isa<LoadInst>(&I) || !VF.isScalable()) && 7125 "Cannot yet scalarize uniform stores"); 7126 Cost = getUniformMemOpCost(&I, VF); 7127 setWideningDecision(&I, VF, CM_Scalarize, Cost); 7128 } 7129 continue; 7130 } 7131 7132 // We assume that widening is the best solution when possible. 7133 if (memoryInstructionCanBeWidened(&I, VF)) { 7134 InstructionCost Cost = getConsecutiveMemOpCost(&I, VF); 7135 int ConsecutiveStride = Legal->isConsecutivePtr( 7136 getLoadStoreType(&I), getLoadStorePointerOperand(&I)); 7137 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 7138 "Expected consecutive stride."); 7139 InstWidening Decision = 7140 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; 7141 setWideningDecision(&I, VF, Decision, Cost); 7142 continue; 7143 } 7144 7145 // Choose between Interleaving, Gather/Scatter or Scalarization. 7146 InstructionCost InterleaveCost = InstructionCost::getInvalid(); 7147 unsigned NumAccesses = 1; 7148 if (isAccessInterleaved(&I)) { 7149 auto Group = getInterleavedAccessGroup(&I); 7150 assert(Group && "Fail to get an interleaved access group."); 7151 7152 // Make one decision for the whole group. 7153 if (getWideningDecision(&I, VF) != CM_Unknown) 7154 continue; 7155 7156 NumAccesses = Group->getNumMembers(); 7157 if (interleavedAccessCanBeWidened(&I, VF)) 7158 InterleaveCost = getInterleaveGroupCost(&I, VF); 7159 } 7160 7161 InstructionCost GatherScatterCost = 7162 isLegalGatherOrScatter(&I, VF) 7163 ? getGatherScatterCost(&I, VF) * NumAccesses 7164 : InstructionCost::getInvalid(); 7165 7166 InstructionCost ScalarizationCost = 7167 getMemInstScalarizationCost(&I, VF) * NumAccesses; 7168 7169 // Choose better solution for the current VF, 7170 // write down this decision and use it during vectorization. 
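// Ties between interleaving and gather/scatter go to interleaving, and
// either of them must be strictly cheaper than scalarization to be chosen;
// otherwise we fall back to scalarizing the access.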
7171 InstructionCost Cost; 7172 InstWidening Decision; 7173 if (InterleaveCost <= GatherScatterCost && 7174 InterleaveCost < ScalarizationCost) { 7175 Decision = CM_Interleave; 7176 Cost = InterleaveCost; 7177 } else if (GatherScatterCost < ScalarizationCost) { 7178 Decision = CM_GatherScatter; 7179 Cost = GatherScatterCost; 7180 } else { 7181 Decision = CM_Scalarize; 7182 Cost = ScalarizationCost; 7183 } 7184 // If the instructions belongs to an interleave group, the whole group 7185 // receives the same decision. The whole group receives the cost, but 7186 // the cost will actually be assigned to one instruction. 7187 if (auto Group = getInterleavedAccessGroup(&I)) 7188 setWideningDecision(Group, VF, Decision, Cost); 7189 else 7190 setWideningDecision(&I, VF, Decision, Cost); 7191 } 7192 } 7193 7194 // Make sure that any load of address and any other address computation 7195 // remains scalar unless there is gather/scatter support. This avoids 7196 // inevitable extracts into address registers, and also has the benefit of 7197 // activating LSR more, since that pass can't optimize vectorized 7198 // addresses. 7199 if (TTI.prefersVectorizedAddressing()) 7200 return; 7201 7202 // Start with all scalar pointer uses. 7203 SmallPtrSet<Instruction *, 8> AddrDefs; 7204 for (BasicBlock *BB : TheLoop->blocks()) 7205 for (Instruction &I : *BB) { 7206 Instruction *PtrDef = 7207 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 7208 if (PtrDef && TheLoop->contains(PtrDef) && 7209 getWideningDecision(&I, VF) != CM_GatherScatter) 7210 AddrDefs.insert(PtrDef); 7211 } 7212 7213 // Add all instructions used to generate the addresses. 7214 SmallVector<Instruction *, 4> Worklist; 7215 append_range(Worklist, AddrDefs); 7216 while (!Worklist.empty()) { 7217 Instruction *I = Worklist.pop_back_val(); 7218 for (auto &Op : I->operands()) 7219 if (auto *InstOp = dyn_cast<Instruction>(Op)) 7220 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) && 7221 AddrDefs.insert(InstOp).second) 7222 Worklist.push_back(InstOp); 7223 } 7224 7225 for (auto *I : AddrDefs) { 7226 if (isa<LoadInst>(I)) { 7227 // Setting the desired widening decision should ideally be handled in 7228 // by cost functions, but since this involves the task of finding out 7229 // if the loaded register is involved in an address computation, it is 7230 // instead changed here when we know this is the case. 7231 InstWidening Decision = getWideningDecision(I, VF); 7232 if (Decision == CM_Widen || Decision == CM_Widen_Reverse) 7233 // Scalarize a widened load of address. 7234 setWideningDecision( 7235 I, VF, CM_Scalarize, 7236 (VF.getKnownMinValue() * 7237 getMemoryInstructionCost(I, ElementCount::getFixed(1)))); 7238 else if (auto Group = getInterleavedAccessGroup(I)) { 7239 // Scalarize an interleave group of address loads. 7240 for (unsigned I = 0; I < Group->getFactor(); ++I) { 7241 if (Instruction *Member = Group->getMember(I)) 7242 setWideningDecision( 7243 Member, VF, CM_Scalarize, 7244 (VF.getKnownMinValue() * 7245 getMemoryInstructionCost(Member, ElementCount::getFixed(1)))); 7246 } 7247 } 7248 } else 7249 // Make sure I gets scalarized and a cost estimate without 7250 // scalarization overhead. 
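// (ForcedScalars entries are later costed as VF copies of the scalar cost,
// with no scalarization overhead added; see getInstructionCost.)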
7251 ForcedScalars[VF].insert(I); 7252 } 7253 } 7254 7255 InstructionCost 7256 LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, 7257 Type *&VectorTy) { 7258 Type *RetTy = I->getType(); 7259 if (canTruncateToMinimalBitwidth(I, VF)) 7260 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); 7261 auto SE = PSE.getSE(); 7262 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 7263 7264 auto hasSingleCopyAfterVectorization = [this](Instruction *I, 7265 ElementCount VF) -> bool { 7266 if (VF.isScalar()) 7267 return true; 7268 7269 auto Scalarized = InstsToScalarize.find(VF); 7270 assert(Scalarized != InstsToScalarize.end() && 7271 "VF not yet analyzed for scalarization profitability"); 7272 return !Scalarized->second.count(I) && 7273 llvm::all_of(I->users(), [&](User *U) { 7274 auto *UI = cast<Instruction>(U); 7275 return !Scalarized->second.count(UI); 7276 }); 7277 }; 7278 (void) hasSingleCopyAfterVectorization; 7279 7280 if (isScalarAfterVectorization(I, VF)) { 7281 // With the exception of GEPs and PHIs, after scalarization there should 7282 // only be one copy of the instruction generated in the loop. This is 7283 // because the VF is either 1, or any instructions that need scalarizing 7284 // have already been dealt with by the the time we get here. As a result, 7285 // it means we don't have to multiply the instruction cost by VF. 7286 assert(I->getOpcode() == Instruction::GetElementPtr || 7287 I->getOpcode() == Instruction::PHI || 7288 (I->getOpcode() == Instruction::BitCast && 7289 I->getType()->isPointerTy()) || 7290 hasSingleCopyAfterVectorization(I, VF)); 7291 VectorTy = RetTy; 7292 } else 7293 VectorTy = ToVectorTy(RetTy, VF); 7294 7295 // TODO: We need to estimate the cost of intrinsic calls. 7296 switch (I->getOpcode()) { 7297 case Instruction::GetElementPtr: 7298 // We mark this instruction as zero-cost because the cost of GEPs in 7299 // vectorized code depends on whether the corresponding memory instruction 7300 // is scalarized or not. Therefore, we handle GEPs with the memory 7301 // instruction cost. 7302 return 0; 7303 case Instruction::Br: { 7304 // In cases of scalarized and predicated instructions, there will be VF 7305 // predicated blocks in the vectorized loop. Each branch around these 7306 // blocks requires also an extract of its vector compare i1 element. 7307 bool ScalarPredicatedBB = false; 7308 BranchInst *BI = cast<BranchInst>(I); 7309 if (VF.isVector() && BI->isConditional() && 7310 (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) || 7311 PredicatedBBsAfterVectorization.count(BI->getSuccessor(1)))) 7312 ScalarPredicatedBB = true; 7313 7314 if (ScalarPredicatedBB) { 7315 // Not possible to scalarize scalable vector with predicated instructions. 7316 if (VF.isScalable()) 7317 return InstructionCost::getInvalid(); 7318 // Return cost for branches around scalarized and predicated blocks. 7319 auto *Vec_i1Ty = 7320 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); 7321 return ( 7322 TTI.getScalarizationOverhead( 7323 Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()), false, true) + 7324 (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue())); 7325 } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar()) 7326 // The back-edge branch will remain, as will all scalar branches. 7327 return TTI.getCFInstrCost(Instruction::Br, CostKind); 7328 else 7329 // This branch will be eliminated by if-conversion. 
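// (Its successor blocks are if-converted into the vector loop body, with
// masks and selects taking the place of the control flow.)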
7330 return 0; 7331 // Note: We currently assume zero cost for an unconditional branch inside 7332 // a predicated block since it will become a fall-through, although we 7333 // may decide in the future to call TTI for all branches. 7334 } 7335 case Instruction::PHI: { 7336 auto *Phi = cast<PHINode>(I); 7337 7338 // First-order recurrences are replaced by vector shuffles inside the loop. 7339 // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. 7340 if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi)) 7341 return TTI.getShuffleCost( 7342 TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy), 7343 None, VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1)); 7344 7345 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 7346 // converted into select instructions. We require N - 1 selects per phi 7347 // node, where N is the number of incoming values. 7348 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) 7349 return (Phi->getNumIncomingValues() - 1) * 7350 TTI.getCmpSelInstrCost( 7351 Instruction::Select, ToVectorTy(Phi->getType(), VF), 7352 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF), 7353 CmpInst::BAD_ICMP_PREDICATE, CostKind); 7354 7355 return TTI.getCFInstrCost(Instruction::PHI, CostKind); 7356 } 7357 case Instruction::UDiv: 7358 case Instruction::SDiv: 7359 case Instruction::URem: 7360 case Instruction::SRem: 7361 // If we have a predicated instruction, it may not be executed for each 7362 // vector lane. Get the scalarization cost and scale this amount by the 7363 // probability of executing the predicated block. If the instruction is not 7364 // predicated, we fall through to the next case. 7365 if (VF.isVector() && isScalarWithPredication(I, VF)) { 7366 InstructionCost Cost = 0; 7367 7368 // These instructions have a non-void type, so account for the phi nodes 7369 // that we will create. This cost is likely to be zero. The phi node 7370 // cost, if any, should be scaled by the block probability because it 7371 // models a copy at the end of each predicated block. 7372 Cost += VF.getKnownMinValue() * 7373 TTI.getCFInstrCost(Instruction::PHI, CostKind); 7374 7375 // The cost of the non-predicated instruction. 7376 Cost += VF.getKnownMinValue() * 7377 TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind); 7378 7379 // The cost of insertelement and extractelement instructions needed for 7380 // scalarization. 7381 Cost += getScalarizationOverhead(I, VF); 7382 7383 // Scale the cost by the probability of executing the predicated blocks. 7384 // This assumes the predicated block for each vector lane is equally 7385 // likely. 7386 return Cost / getReciprocalPredBlockProb(); 7387 } 7388 LLVM_FALLTHROUGH; 7389 case Instruction::Add: 7390 case Instruction::FAdd: 7391 case Instruction::Sub: 7392 case Instruction::FSub: 7393 case Instruction::Mul: 7394 case Instruction::FMul: 7395 case Instruction::FDiv: 7396 case Instruction::FRem: 7397 case Instruction::Shl: 7398 case Instruction::LShr: 7399 case Instruction::AShr: 7400 case Instruction::And: 7401 case Instruction::Or: 7402 case Instruction::Xor: { 7403 // Since we will replace the stride by 1 the multiplication should go away. 
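// For example, for an access such as A[i * Stride] where the loop has been
// versioned for Stride == 1, the multiply by the stride is expected to fold
// away.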
7404 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) 7405 return 0; 7406 7407 // Detect reduction patterns 7408 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7409 return *RedCost; 7410 7411 // Certain instructions can be cheaper to vectorize if they have a constant 7412 // second vector operand. One example of this are shifts on x86. 7413 Value *Op2 = I->getOperand(1); 7414 TargetTransformInfo::OperandValueProperties Op2VP; 7415 TargetTransformInfo::OperandValueKind Op2VK = 7416 TTI.getOperandInfo(Op2, Op2VP); 7417 if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) 7418 Op2VK = TargetTransformInfo::OK_UniformValue; 7419 7420 SmallVector<const Value *, 4> Operands(I->operand_values()); 7421 return TTI.getArithmeticInstrCost( 7422 I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, 7423 Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); 7424 } 7425 case Instruction::FNeg: { 7426 return TTI.getArithmeticInstrCost( 7427 I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, 7428 TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None, 7429 TargetTransformInfo::OP_None, I->getOperand(0), I); 7430 } 7431 case Instruction::Select: { 7432 SelectInst *SI = cast<SelectInst>(I); 7433 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 7434 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 7435 7436 const Value *Op0, *Op1; 7437 using namespace llvm::PatternMatch; 7438 if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) || 7439 match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) { 7440 // select x, y, false --> x & y 7441 // select x, true, y --> x | y 7442 TTI::OperandValueProperties Op1VP = TTI::OP_None; 7443 TTI::OperandValueProperties Op2VP = TTI::OP_None; 7444 TTI::OperandValueKind Op1VK = TTI::getOperandInfo(Op0, Op1VP); 7445 TTI::OperandValueKind Op2VK = TTI::getOperandInfo(Op1, Op2VP); 7446 assert(Op0->getType()->getScalarSizeInBits() == 1 && 7447 Op1->getType()->getScalarSizeInBits() == 1); 7448 7449 SmallVector<const Value *, 2> Operands{Op0, Op1}; 7450 return TTI.getArithmeticInstrCost( 7451 match(I, m_LogicalOr()) ? 
Instruction::Or : Instruction::And, VectorTy, 7452 CostKind, Op1VK, Op2VK, Op1VP, Op2VP, Operands, I); 7453 } 7454 7455 Type *CondTy = SI->getCondition()->getType(); 7456 if (!ScalarCond) 7457 CondTy = VectorType::get(CondTy, VF); 7458 7459 CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE; 7460 if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition())) 7461 Pred = Cmp->getPredicate(); 7462 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred, 7463 CostKind, I); 7464 } 7465 case Instruction::ICmp: 7466 case Instruction::FCmp: { 7467 Type *ValTy = I->getOperand(0)->getType(); 7468 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 7469 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 7470 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); 7471 VectorTy = ToVectorTy(ValTy, VF); 7472 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, 7473 cast<CmpInst>(I)->getPredicate(), CostKind, 7474 I); 7475 } 7476 case Instruction::Store: 7477 case Instruction::Load: { 7478 ElementCount Width = VF; 7479 if (Width.isVector()) { 7480 InstWidening Decision = getWideningDecision(I, Width); 7481 assert(Decision != CM_Unknown && 7482 "CM decision should be taken at this point"); 7483 if (Decision == CM_Scalarize) 7484 Width = ElementCount::getFixed(1); 7485 } 7486 VectorTy = ToVectorTy(getLoadStoreType(I), Width); 7487 return getMemoryInstructionCost(I, VF); 7488 } 7489 case Instruction::BitCast: 7490 if (I->getType()->isPointerTy()) 7491 return 0; 7492 LLVM_FALLTHROUGH; 7493 case Instruction::ZExt: 7494 case Instruction::SExt: 7495 case Instruction::FPToUI: 7496 case Instruction::FPToSI: 7497 case Instruction::FPExt: 7498 case Instruction::PtrToInt: 7499 case Instruction::IntToPtr: 7500 case Instruction::SIToFP: 7501 case Instruction::UIToFP: 7502 case Instruction::Trunc: 7503 case Instruction::FPTrunc: { 7504 // Computes the CastContextHint from a Load/Store instruction. 7505 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint { 7506 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 7507 "Expected a load or a store!"); 7508 7509 if (VF.isScalar() || !TheLoop->contains(I)) 7510 return TTI::CastContextHint::Normal; 7511 7512 switch (getWideningDecision(I, VF)) { 7513 case LoopVectorizationCostModel::CM_GatherScatter: 7514 return TTI::CastContextHint::GatherScatter; 7515 case LoopVectorizationCostModel::CM_Interleave: 7516 return TTI::CastContextHint::Interleave; 7517 case LoopVectorizationCostModel::CM_Scalarize: 7518 case LoopVectorizationCostModel::CM_Widen: 7519 return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked 7520 : TTI::CastContextHint::Normal; 7521 case LoopVectorizationCostModel::CM_Widen_Reverse: 7522 return TTI::CastContextHint::Reversed; 7523 case LoopVectorizationCostModel::CM_Unknown: 7524 llvm_unreachable("Instr did not go through cost modelling?"); 7525 } 7526 7527 llvm_unreachable("Unhandled case!"); 7528 }; 7529 7530 unsigned Opcode = I->getOpcode(); 7531 TTI::CastContextHint CCH = TTI::CastContextHint::None; 7532 // For Trunc, the context is the only user, which must be a StoreInst. 7533 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) { 7534 if (I->hasOneUse()) 7535 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin())) 7536 CCH = ComputeCCH(Store); 7537 } 7538 // For Z/Sext, the context is the operand, which must be a LoadInst. 
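// For example, a zext whose operand is a reversed consecutive load may be
// costed with the Reversed context hint, which some targets can fold into
// the widened memory operation.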
7539 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || 7540 Opcode == Instruction::FPExt) { 7541 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0))) 7542 CCH = ComputeCCH(Load); 7543 } 7544 7545 // We optimize the truncation of induction variables having constant 7546 // integer steps. The cost of these truncations is the same as the scalar 7547 // operation. 7548 if (isOptimizableIVTruncate(I, VF)) { 7549 auto *Trunc = cast<TruncInst>(I); 7550 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 7551 Trunc->getSrcTy(), CCH, CostKind, Trunc); 7552 } 7553 7554 // Detect reduction patterns 7555 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7556 return *RedCost; 7557 7558 Type *SrcScalarTy = I->getOperand(0)->getType(); 7559 Type *SrcVecTy = 7560 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; 7561 if (canTruncateToMinimalBitwidth(I, VF)) { 7562 // This cast is going to be shrunk. This may remove the cast or it might 7563 // turn it into slightly different cast. For example, if MinBW == 16, 7564 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". 7565 // 7566 // Calculate the modified src and dest types. 7567 Type *MinVecTy = VectorTy; 7568 if (Opcode == Instruction::Trunc) { 7569 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); 7570 VectorTy = 7571 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7572 } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) { 7573 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); 7574 VectorTy = 7575 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7576 } 7577 } 7578 7579 return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); 7580 } 7581 case Instruction::Call: { 7582 if (RecurrenceDescriptor::isFMulAddIntrinsic(I)) 7583 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7584 return *RedCost; 7585 bool NeedToScalarize; 7586 CallInst *CI = cast<CallInst>(I); 7587 InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize); 7588 if (getVectorIntrinsicIDForCall(CI, TLI)) { 7589 InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF); 7590 return std::min(CallCost, IntrinsicCost); 7591 } 7592 return CallCost; 7593 } 7594 case Instruction::ExtractValue: 7595 return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput); 7596 case Instruction::Alloca: 7597 // We cannot easily widen alloca to a scalable alloca, as 7598 // the result would need to be a vector of pointers. 7599 if (VF.isScalable()) 7600 return InstructionCost::getInvalid(); 7601 LLVM_FALLTHROUGH; 7602 default: 7603 // This opcode is unknown. Assume that it is the same as 'mul'. 7604 return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 7605 } // end of switch. 
7606 } 7607 7608 char LoopVectorize::ID = 0; 7609 7610 static const char lv_name[] = "Loop Vectorization"; 7611 7612 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 7613 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 7614 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) 7615 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 7616 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) 7617 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 7618 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 7619 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 7620 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 7621 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 7622 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) 7623 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) 7624 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 7625 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 7626 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy) 7627 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 7628 7629 namespace llvm { 7630 7631 Pass *createLoopVectorizePass() { return new LoopVectorize(); } 7632 7633 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced, 7634 bool VectorizeOnlyWhenForced) { 7635 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced); 7636 } 7637 7638 } // end namespace llvm 7639 7640 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { 7641 // Check if the pointer operand of a load or store instruction is 7642 // consecutive. 7643 if (auto *Ptr = getLoadStorePointerOperand(Inst)) 7644 return Legal->isConsecutivePtr(getLoadStoreType(Inst), Ptr); 7645 return false; 7646 } 7647 7648 void LoopVectorizationCostModel::collectValuesToIgnore() { 7649 // Ignore ephemeral values. 7650 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); 7651 7652 // Ignore type-promoting instructions we identified during reduction 7653 // detection. 7654 for (auto &Reduction : Legal->getReductionVars()) { 7655 const RecurrenceDescriptor &RedDes = Reduction.second; 7656 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); 7657 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7658 } 7659 // Ignore type-casting instructions we identified during induction 7660 // detection. 7661 for (auto &Induction : Legal->getInductionVars()) { 7662 const InductionDescriptor &IndDes = Induction.second; 7663 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 7664 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7665 } 7666 } 7667 7668 void LoopVectorizationCostModel::collectInLoopReductions() { 7669 for (auto &Reduction : Legal->getReductionVars()) { 7670 PHINode *Phi = Reduction.first; 7671 const RecurrenceDescriptor &RdxDesc = Reduction.second; 7672 7673 // We don't collect reductions that are type promoted (yet). 7674 if (RdxDesc.getRecurrenceType() != Phi->getType()) 7675 continue; 7676 7677 // If the target would prefer this reduction to happen "in-loop", then we 7678 // want to record it as such. 7679 unsigned Opcode = RdxDesc.getOpcode(); 7680 if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) && 7681 !TTI.preferInLoopReduction(Opcode, Phi->getType(), 7682 TargetTransformInfo::ReductionFlags())) 7683 continue; 7684 7685 // Check that we can correctly put the reductions into the loop, by 7686 // finding the chain of operations that leads from the phi to the loop 7687 // exit value. 
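// An in-loop reduction keeps a scalar accumulator and reduces each vector of
// partial values inside the loop (e.g. via a vector.reduce intrinsic),
// instead of keeping a vector accumulator that is only reduced horizontally
// after the loop.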
7688 SmallVector<Instruction *, 4> ReductionOperations = 7689 RdxDesc.getReductionOpChain(Phi, TheLoop); 7690 bool InLoop = !ReductionOperations.empty(); 7691 if (InLoop) { 7692 InLoopReductionChains[Phi] = ReductionOperations; 7693 // Add the elements to InLoopReductionImmediateChains for cost modelling. 7694 Instruction *LastChain = Phi; 7695 for (auto *I : ReductionOperations) { 7696 InLoopReductionImmediateChains[I] = LastChain; 7697 LastChain = I; 7698 } 7699 } 7700 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop") 7701 << " reduction for phi: " << *Phi << "\n"); 7702 } 7703 } 7704 7705 // TODO: we could return a pair of values that specify the max VF and 7706 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of 7707 // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment 7708 // doesn't have a cost model that can choose which plan to execute if 7709 // more than one is generated. 7710 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits, 7711 LoopVectorizationCostModel &CM) { 7712 unsigned WidestType; 7713 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes(); 7714 return WidestVectorRegBits / WidestType; 7715 } 7716 7717 VectorizationFactor 7718 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { 7719 assert(!UserVF.isScalable() && "scalable vectors not yet supported"); 7720 ElementCount VF = UserVF; 7721 // Outer loop handling: They may require CFG and instruction level 7722 // transformations before even evaluating whether vectorization is profitable. 7723 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 7724 // the vectorization pipeline. 7725 if (!OrigLoop->isInnermost()) { 7726 // If the user doesn't provide a vectorization factor, determine a 7727 // reasonable one. 7728 if (UserVF.isZero()) { 7729 VF = ElementCount::getFixed(determineVPlanVF( 7730 TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector) 7731 .getFixedSize(), 7732 CM)); 7733 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n"); 7734 7735 // Make sure we have a VF > 1 for stress testing. 7736 if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) { 7737 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: " 7738 << "overriding computed VF.\n"); 7739 VF = ElementCount::getFixed(4); 7740 } 7741 } 7742 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 7743 assert(isPowerOf2_32(VF.getKnownMinValue()) && 7744 "VF needs to be a power of two"); 7745 LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "") 7746 << "VF " << VF << " to build VPlans.\n"); 7747 buildVPlans(VF, VF); 7748 7749 // For VPlan build stress testing, we bail out after VPlan construction. 7750 if (VPlanBuildStressTest) 7751 return VectorizationFactor::Disabled(); 7752 7753 return {VF, 0 /*Cost*/}; 7754 } 7755 7756 LLVM_DEBUG( 7757 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the " 7758 "VPlan-native path.\n"); 7759 return VectorizationFactor::Disabled(); 7760 } 7761 7762 Optional<VectorizationFactor> 7763 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { 7764 assert(OrigLoop->isInnermost() && "Inner loop expected."); 7765 FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC); 7766 if (!MaxFactors) // Cases that should not to be vectorized nor interleaved. 7767 return None; 7768 7769 // Invalidate interleave groups if all blocks of loop will be predicated. 
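// (When the tail is folded by masking, every access in a group becomes
// conditional, so the group can only be kept if the target supports masked
// interleaved loads and stores.)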
7770 if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) && 7771 !useMaskedInterleavedAccesses(*TTI)) { 7772 LLVM_DEBUG( 7773 dbgs() 7774 << "LV: Invalidate all interleaved groups due to fold-tail by masking " 7775 "which requires masked-interleaved support.\n"); 7776 if (CM.InterleaveInfo.invalidateGroups()) 7777 // Invalidating interleave groups also requires invalidating all decisions 7778 // based on them, which includes widening decisions and uniform and scalar 7779 // values. 7780 CM.invalidateCostModelingDecisions(); 7781 } 7782 7783 ElementCount MaxUserVF = 7784 UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF; 7785 bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF); 7786 if (!UserVF.isZero() && UserVFIsLegal) { 7787 assert(isPowerOf2_32(UserVF.getKnownMinValue()) && 7788 "VF needs to be a power of two"); 7789 // Collect the instructions (and their associated costs) that will be more 7790 // profitable to scalarize. 7791 if (CM.selectUserVectorizationFactor(UserVF)) { 7792 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); 7793 CM.collectInLoopReductions(); 7794 buildVPlansWithVPRecipes(UserVF, UserVF); 7795 LLVM_DEBUG(printPlans(dbgs())); 7796 return {{UserVF, 0}}; 7797 } else 7798 reportVectorizationInfo("UserVF ignored because of invalid costs.", 7799 "InvalidCost", ORE, OrigLoop); 7800 } 7801 7802 // Populate the set of Vectorization Factor Candidates. 7803 ElementCountSet VFCandidates; 7804 for (auto VF = ElementCount::getFixed(1); 7805 ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2) 7806 VFCandidates.insert(VF); 7807 for (auto VF = ElementCount::getScalable(1); 7808 ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2) 7809 VFCandidates.insert(VF); 7810 7811 for (const auto &VF : VFCandidates) { 7812 // Collect Uniform and Scalar instructions after vectorization with VF. 7813 CM.collectUniformsAndScalars(VF); 7814 7815 // Collect the instructions (and their associated costs) that will be more 7816 // profitable to scalarize. 7817 if (VF.isVector()) 7818 CM.collectInstsToScalarize(VF); 7819 } 7820 7821 CM.collectInLoopReductions(); 7822 buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF); 7823 buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF); 7824 7825 LLVM_DEBUG(printPlans(dbgs())); 7826 if (!MaxFactors.hasVector()) 7827 return VectorizationFactor::Disabled(); 7828 7829 // Select the optimal vectorization factor. 7830 auto SelectedVF = CM.selectVectorizationFactor(VFCandidates); 7831 7832 // Check if it is profitable to vectorize with runtime checks. 
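// Bail out when the number of runtime pointer checks exceeds the pragma
// threshold, or exceeds the default threshold while reordering of memory
// operations has not been explicitly allowed.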
7833 unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks(); 7834 if (SelectedVF.Width.getKnownMinValue() > 1 && NumRuntimePointerChecks) { 7835 bool PragmaThresholdReached = 7836 NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold; 7837 bool ThresholdReached = 7838 NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold; 7839 if ((ThresholdReached && !Hints.allowReordering()) || 7840 PragmaThresholdReached) { 7841 ORE->emit([&]() { 7842 return OptimizationRemarkAnalysisAliasing( 7843 DEBUG_TYPE, "CantReorderMemOps", OrigLoop->getStartLoc(), 7844 OrigLoop->getHeader()) 7845 << "loop not vectorized: cannot prove it is safe to reorder " 7846 "memory operations"; 7847 }); 7848 LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n"); 7849 Hints.emitRemarkWithHints(); 7850 return VectorizationFactor::Disabled(); 7851 } 7852 } 7853 return SelectedVF; 7854 } 7855 7856 VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const { 7857 assert(count_if(VPlans, 7858 [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) == 7859 1 && 7860 "Best VF has not a single VPlan."); 7861 7862 for (const VPlanPtr &Plan : VPlans) { 7863 if (Plan->hasVF(VF)) 7864 return *Plan.get(); 7865 } 7866 llvm_unreachable("No plan found!"); 7867 } 7868 7869 static void AddRuntimeUnrollDisableMetaData(Loop *L) { 7870 SmallVector<Metadata *, 4> MDs; 7871 // Reserve first location for self reference to the LoopID metadata node. 7872 MDs.push_back(nullptr); 7873 bool IsUnrollMetadata = false; 7874 MDNode *LoopID = L->getLoopID(); 7875 if (LoopID) { 7876 // First find existing loop unrolling disable metadata. 7877 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { 7878 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); 7879 if (MD) { 7880 const auto *S = dyn_cast<MDString>(MD->getOperand(0)); 7881 IsUnrollMetadata = 7882 S && S->getString().startswith("llvm.loop.unroll.disable"); 7883 } 7884 MDs.push_back(LoopID->getOperand(i)); 7885 } 7886 } 7887 7888 if (!IsUnrollMetadata) { 7889 // Add runtime unroll disable metadata. 7890 LLVMContext &Context = L->getHeader()->getContext(); 7891 SmallVector<Metadata *, 1> DisableOperands; 7892 DisableOperands.push_back( 7893 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 7894 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 7895 MDs.push_back(DisableNode); 7896 MDNode *NewLoopID = MDNode::get(Context, MDs); 7897 // Set operand 0 to refer to the loop id itself. 7898 NewLoopID->replaceOperandWith(0, NewLoopID); 7899 L->setLoopID(NewLoopID); 7900 } 7901 } 7902 7903 void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF, 7904 VPlan &BestVPlan, 7905 InnerLoopVectorizer &ILV, 7906 DominatorTree *DT) { 7907 LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF << ", UF=" << BestUF 7908 << '\n'); 7909 7910 // Perform the actual loop transformation. 7911 7912 // 1. Create a new empty loop. Unlink the old loop and connect the new one. 7913 VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan}; 7914 Value *CanonicalIVStartValue; 7915 std::tie(State.CFG.PrevBB, CanonicalIVStartValue) = 7916 ILV.createVectorizedLoopSkeleton(); 7917 ILV.collectPoisonGeneratingRecipes(State); 7918 7919 ILV.printDebugTracesAtStart(); 7920 7921 //===------------------------------------------------===// 7922 // 7923 // Notice: any optimization or new instruction that go 7924 // into the code below should also be implemented in 7925 // the cost-model. 
7926 //
7927 //===------------------------------------------------===//
7928
7929 // 2. Copy and widen instructions from the old loop into the new loop.
7930 BestVPlan.prepareToExecute(ILV.getOrCreateTripCount(nullptr),
7931 ILV.getOrCreateVectorTripCount(nullptr),
7932 CanonicalIVStartValue, State);
7933 BestVPlan.execute(&State);
7934
7935 // Keep all loop hints from the original loop on the vector loop (we'll
7936 // replace the vectorizer-specific hints below).
7937 MDNode *OrigLoopID = OrigLoop->getLoopID();
7938
7939 Optional<MDNode *> VectorizedLoopID =
7940 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
7941 LLVMLoopVectorizeFollowupVectorized});
7942
7943 Loop *L = LI->getLoopFor(State.CFG.PrevBB);
7944 if (VectorizedLoopID.hasValue())
7945 L->setLoopID(VectorizedLoopID.getValue());
7946 else {
7947 // Keep all loop hints from the original loop on the vector loop (we'll
7948 // replace the vectorizer-specific hints below).
7949 if (MDNode *LID = OrigLoop->getLoopID())
7950 L->setLoopID(LID);
7951
7952 LoopVectorizeHints Hints(L, true, *ORE);
7953 Hints.setAlreadyVectorized();
7954 }
7955 // Disable runtime unrolling when vectorizing the epilogue loop.
7956 if (CanonicalIVStartValue)
7957 AddRuntimeUnrollDisableMetaData(L);
7958
7959 // 3. Fix the vectorized code: take care of header phi's, live-outs,
7960 // predication, updating analyses.
7961 ILV.fixVectorizedLoop(State);
7962
7963 ILV.printDebugTracesAtEnd();
7964 }
7965
7966 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
7967 void LoopVectorizationPlanner::printPlans(raw_ostream &O) {
7968 for (const auto &Plan : VPlans)
7969 if (PrintVPlansInDotFormat)
7970 Plan->printDOT(O);
7971 else
7972 Plan->print(O);
7973 }
7974 #endif
7975
7976 void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
7977 SmallPtrSetImpl<Instruction *> &DeadInstructions) {
7978
7979 // We create new control-flow for the vectorized loop, so the original exit
7980 // conditions will be dead after vectorization if they are only used by the
7981 // terminator.
7982 SmallVector<BasicBlock*> ExitingBlocks;
7983 OrigLoop->getExitingBlocks(ExitingBlocks);
7984 for (auto *BB : ExitingBlocks) {
7985 auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0));
7986 if (!Cmp || !Cmp->hasOneUse())
7987 continue;
7988
7989 // TODO: we should introduce a getUniqueExitingBlocks on Loop
7990 if (!DeadInstructions.insert(Cmp).second)
7991 continue;
7992
7993 // An operand of the icmp is often a dead trunc, used by IndUpdate.
7994 // TODO: can recurse through operands in general
7995 for (Value *Op : Cmp->operands()) {
7996 if (isa<TruncInst>(Op) && Op->hasOneUse())
7997 DeadInstructions.insert(cast<Instruction>(Op));
7998 }
7999 }
8000
8001 // We create new "steps" for induction variable updates to which the original
8002 // induction variables map. An original update instruction will be dead if
8003 // all its users except the induction variable are dead.
8004 auto *Latch = OrigLoop->getLoopLatch();
8005 for (auto &Induction : Legal->getInductionVars()) {
8006 PHINode *Ind = Induction.first;
8007 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
8008
8009 // If the tail is to be folded by masking, the primary induction variable,
8010 // if it exists, isn't dead: it will be used for masking. Don't kill it.
8011 if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction()) 8012 continue; 8013 8014 if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 8015 return U == Ind || DeadInstructions.count(cast<Instruction>(U)); 8016 })) 8017 DeadInstructions.insert(IndUpdate); 8018 } 8019 } 8020 8021 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } 8022 8023 //===--------------------------------------------------------------------===// 8024 // EpilogueVectorizerMainLoop 8025 //===--------------------------------------------------------------------===// 8026 8027 /// This function is partially responsible for generating the control flow 8028 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 8029 std::pair<BasicBlock *, Value *> 8030 EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { 8031 MDNode *OrigLoopID = OrigLoop->getLoopID(); 8032 Loop *Lp = createVectorLoopSkeleton(""); 8033 8034 // Generate the code to check the minimum iteration count of the vector 8035 // epilogue (see below). 8036 EPI.EpilogueIterationCountCheck = 8037 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, true); 8038 EPI.EpilogueIterationCountCheck->setName("iter.check"); 8039 8040 // Generate the code to check any assumptions that we've made for SCEV 8041 // expressions. 8042 EPI.SCEVSafetyCheck = emitSCEVChecks(Lp, LoopScalarPreHeader); 8043 8044 // Generate the code that checks at runtime if arrays overlap. We put the 8045 // checks into a separate block to make the more common case of few elements 8046 // faster. 8047 EPI.MemSafetyCheck = emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 8048 8049 // Generate the iteration count check for the main loop, *after* the check 8050 // for the epilogue loop, so that the path-length is shorter for the case 8051 // that goes directly through the vector epilogue. The longer-path length for 8052 // the main loop is compensated for, by the gain from vectorizing the larger 8053 // trip count. Note: the branch will get updated later on when we vectorize 8054 // the epilogue. 8055 EPI.MainLoopIterationCountCheck = 8056 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false); 8057 8058 // Generate the induction variable. 8059 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 8060 EPI.VectorTripCount = CountRoundDown; 8061 createHeaderBranch(Lp); 8062 8063 // Skip induction resume value creation here because they will be created in 8064 // the second pass. If we created them here, they wouldn't be used anyway, 8065 // because the vplan in the second pass still contains the inductions from the 8066 // original loop. 
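// The second element of the returned pair is the canonical IV start value
// handed back to executePlan; the main-loop pass has none, while the
// epilogue pass below returns the vec.epilog.resume.val phi.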
8067 8068 return {completeLoopSkeleton(Lp, OrigLoopID), nullptr}; 8069 } 8070 8071 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { 8072 LLVM_DEBUG({ 8073 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n" 8074 << "Main Loop VF:" << EPI.MainLoopVF 8075 << ", Main Loop UF:" << EPI.MainLoopUF 8076 << ", Epilogue Loop VF:" << EPI.EpilogueVF 8077 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 8078 }); 8079 } 8080 8081 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() { 8082 DEBUG_WITH_TYPE(VerboseDebug, { 8083 dbgs() << "intermediate fn:\n" 8084 << *OrigLoop->getHeader()->getParent() << "\n"; 8085 }); 8086 } 8087 8088 BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck( 8089 Loop *L, BasicBlock *Bypass, bool ForEpilogue) { 8090 assert(L && "Expected valid Loop."); 8091 assert(Bypass && "Expected valid bypass basic block."); 8092 ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF; 8093 unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF; 8094 Value *Count = getOrCreateTripCount(L); 8095 // Reuse existing vector loop preheader for TC checks. 8096 // Note that new preheader block is generated for vector loop. 8097 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 8098 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 8099 8100 // Generate code to check if the loop's trip count is less than VF * UF of the 8101 // main vector loop. 8102 auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF : VF) ? 8103 ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 8104 8105 Value *CheckMinIters = Builder.CreateICmp( 8106 P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor), 8107 "min.iters.check"); 8108 8109 if (!ForEpilogue) 8110 TCCheckBlock->setName("vector.main.loop.iter.check"); 8111 8112 // Create new preheader for vector loop. 8113 LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), 8114 DT, LI, nullptr, "vector.ph"); 8115 8116 if (ForEpilogue) { 8117 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 8118 DT->getNode(Bypass)->getIDom()) && 8119 "TC check is expected to dominate Bypass"); 8120 8121 // Update dominator for Bypass & LoopExit. 8122 DT->changeImmediateDominator(Bypass, TCCheckBlock); 8123 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF)) 8124 // For loops with multiple exits, there's no edge from the middle block 8125 // to exit blocks (as the epilogue must run) and thus no need to update 8126 // the immediate dominator of the exit blocks. 8127 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 8128 8129 LoopBypassBlocks.push_back(TCCheckBlock); 8130 8131 // Save the trip count so we don't have to regenerate it in the 8132 // vec.epilog.iter.check. This is safe to do because the trip count 8133 // generated here dominates the vector epilog iter check. 8134 EPI.TripCount = Count; 8135 } 8136 8137 ReplaceInstWithInst( 8138 TCCheckBlock->getTerminator(), 8139 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 8140 8141 return TCCheckBlock; 8142 } 8143 8144 //===--------------------------------------------------------------------===// 8145 // EpilogueVectorizerEpilogueLoop 8146 //===--------------------------------------------------------------------===// 8147 8148 /// This function is partially responsible for generating the control flow 8149 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 
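/// In this second pass, the preheader created by createVectorLoopSkeleton is
/// repurposed as the vec.epilog.iter.check block, and a new vec.epilog.ph
/// preheader is split off for the epilogue vector loop.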
8150 std::pair<BasicBlock *, Value *> 8151 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { 8152 MDNode *OrigLoopID = OrigLoop->getLoopID(); 8153 Loop *Lp = createVectorLoopSkeleton("vec.epilog."); 8154 8155 // Now, compare the remaining count and if there aren't enough iterations to 8156 // execute the vectorized epilogue skip to the scalar part. 8157 BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader; 8158 VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check"); 8159 LoopVectorPreHeader = 8160 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 8161 LI, nullptr, "vec.epilog.ph"); 8162 emitMinimumVectorEpilogueIterCountCheck(Lp, LoopScalarPreHeader, 8163 VecEpilogueIterationCountCheck); 8164 8165 // Adjust the control flow taking the state info from the main loop 8166 // vectorization into account. 8167 assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck && 8168 "expected this to be saved from the previous pass."); 8169 EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith( 8170 VecEpilogueIterationCountCheck, LoopVectorPreHeader); 8171 8172 DT->changeImmediateDominator(LoopVectorPreHeader, 8173 EPI.MainLoopIterationCountCheck); 8174 8175 EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith( 8176 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8177 8178 if (EPI.SCEVSafetyCheck) 8179 EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith( 8180 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8181 if (EPI.MemSafetyCheck) 8182 EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith( 8183 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8184 8185 DT->changeImmediateDominator( 8186 VecEpilogueIterationCountCheck, 8187 VecEpilogueIterationCountCheck->getSinglePredecessor()); 8188 8189 DT->changeImmediateDominator(LoopScalarPreHeader, 8190 EPI.EpilogueIterationCountCheck); 8191 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF)) 8192 // If there is an epilogue which must run, there's no edge from the 8193 // middle block to exit blocks and thus no need to update the immediate 8194 // dominator of the exit blocks. 8195 DT->changeImmediateDominator(LoopExitBlock, 8196 EPI.EpilogueIterationCountCheck); 8197 8198 // Keep track of bypass blocks, as they feed start values to the induction 8199 // phis in the scalar loop preheader. 8200 if (EPI.SCEVSafetyCheck) 8201 LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck); 8202 if (EPI.MemSafetyCheck) 8203 LoopBypassBlocks.push_back(EPI.MemSafetyCheck); 8204 LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck); 8205 8206 // The vec.epilog.iter.check block may contain Phi nodes from reductions which 8207 // merge control-flow from the latch block and the middle block. Update the 8208 // incoming values here and move the Phi into the preheader. 
8209   SmallVector<PHINode *, 4> PhisInBlock;
8210   for (PHINode &Phi : VecEpilogueIterationCountCheck->phis())
8211     PhisInBlock.push_back(&Phi);
8212
8213   for (PHINode *Phi : PhisInBlock) {
8214     Phi->replaceIncomingBlockWith(
8215         VecEpilogueIterationCountCheck->getSinglePredecessor(),
8216         VecEpilogueIterationCountCheck);
8217     Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck);
8218     if (EPI.SCEVSafetyCheck)
8219       Phi->removeIncomingValue(EPI.SCEVSafetyCheck);
8220     if (EPI.MemSafetyCheck)
8221       Phi->removeIncomingValue(EPI.MemSafetyCheck);
8222     Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHI());
8223   }
8224
8225   // Generate a resume induction for the vector epilogue and put it in the
8226   // vector epilogue preheader.
8227   Type *IdxTy = Legal->getWidestInductionType();
8228   PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val",
8229                                          LoopVectorPreHeader->getFirstNonPHI());
8230   EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
8231   EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
8232                            EPI.MainLoopIterationCountCheck);
8233
8234   // Generate the induction variable.
8235   createHeaderBranch(Lp);
8236
8237   // Generate induction resume values. These variables save the new starting
8238   // indexes for the scalar loop. They are used to test if there are any tail
8239   // iterations left once the vector loop has completed.
8240   // Note that when the vectorized epilogue is skipped due to the iteration
8241   // count check, the resume value for the induction variable comes from the
8242   // trip count of the main vector loop, hence the AdditionalBypass argument
8243   // passed below.
8244   createInductionResumeValues(Lp, {VecEpilogueIterationCountCheck,
8245                                    EPI.VectorTripCount} /* AdditionalBypass */);
8246
8247   return {completeLoopSkeleton(Lp, OrigLoopID), EPResumeVal};
8248 }
8249
8250 BasicBlock *
8251 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
8252     Loop *L, BasicBlock *Bypass, BasicBlock *Insert) {
8253
8254   assert(EPI.TripCount &&
8255          "Expected trip count to have been saved in the first pass.");
8256   assert(
8257       (!isa<Instruction>(EPI.TripCount) ||
8258        DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
8259       "saved trip count does not dominate insertion point.");
8260   Value *TC = EPI.TripCount;
8261   IRBuilder<> Builder(Insert->getTerminator());
8262   Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");
8263
8264   // Generate code to check if the loop's trip count is less than VF * UF of
8265   // the vector epilogue loop.
8266   auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF) ?
8267 ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 8268 8269 Value *CheckMinIters = 8270 Builder.CreateICmp(P, Count, 8271 createStepForVF(Builder, Count->getType(), 8272 EPI.EpilogueVF, EPI.EpilogueUF), 8273 "min.epilog.iters.check"); 8274 8275 ReplaceInstWithInst( 8276 Insert->getTerminator(), 8277 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 8278 8279 LoopBypassBlocks.push_back(Insert); 8280 return Insert; 8281 } 8282 8283 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() { 8284 LLVM_DEBUG({ 8285 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n" 8286 << "Epilogue Loop VF:" << EPI.EpilogueVF 8287 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 8288 }); 8289 } 8290 8291 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() { 8292 DEBUG_WITH_TYPE(VerboseDebug, { 8293 dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n"; 8294 }); 8295 } 8296 8297 bool LoopVectorizationPlanner::getDecisionAndClampRange( 8298 const std::function<bool(ElementCount)> &Predicate, VFRange &Range) { 8299 assert(!Range.isEmpty() && "Trying to test an empty VF range."); 8300 bool PredicateAtRangeStart = Predicate(Range.Start); 8301 8302 for (ElementCount TmpVF = Range.Start * 2; 8303 ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2) 8304 if (Predicate(TmpVF) != PredicateAtRangeStart) { 8305 Range.End = TmpVF; 8306 break; 8307 } 8308 8309 return PredicateAtRangeStart; 8310 } 8311 8312 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, 8313 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range 8314 /// of VF's starting at a given VF and extending it as much as possible. Each 8315 /// vectorization decision can potentially shorten this sub-range during 8316 /// buildVPlan(). 8317 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF, 8318 ElementCount MaxVF) { 8319 auto MaxVFPlusOne = MaxVF.getWithIncrement(1); 8320 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { 8321 VFRange SubRange = {VF, MaxVFPlusOne}; 8322 VPlans.push_back(buildVPlan(SubRange)); 8323 VF = SubRange.End; 8324 } 8325 } 8326 8327 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, 8328 VPlanPtr &Plan) { 8329 assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); 8330 8331 // Look for cached value. 8332 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 8333 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); 8334 if (ECEntryIt != EdgeMaskCache.end()) 8335 return ECEntryIt->second; 8336 8337 VPValue *SrcMask = createBlockInMask(Src, Plan); 8338 8339 // The terminator has to be a branch inst! 8340 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 8341 assert(BI && "Unexpected terminator found"); 8342 8343 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) 8344 return EdgeMaskCache[Edge] = SrcMask; 8345 8346 // If source is an exiting block, we know the exit edge is dynamically dead 8347 // in the vector loop, and thus we don't need to restrict the mask. Avoid 8348 // adding uses of an otherwise potentially dead instruction. 
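  // For example (an illustrative sketch, not the exact IR): if Src is the
  // loop latch ending in
  //   br i1 %exitcond, label %exit, label %loop.header
  // the edge to %exit is never taken while the vector loop executes, so the
  // block mask of Src can be reused for this edge without AND'ing in (the
  // negation of) %exitcond.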
8349 if (OrigLoop->isLoopExiting(Src)) 8350 return EdgeMaskCache[Edge] = SrcMask; 8351 8352 VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition()); 8353 assert(EdgeMask && "No Edge Mask found for condition"); 8354 8355 if (BI->getSuccessor(0) != Dst) 8356 EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc()); 8357 8358 if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND. 8359 // The condition is 'SrcMask && EdgeMask', which is equivalent to 8360 // 'select i1 SrcMask, i1 EdgeMask, i1 false'. 8361 // The select version does not introduce new UB if SrcMask is false and 8362 // EdgeMask is poison. Using 'and' here introduces undefined behavior. 8363 VPValue *False = Plan->getOrAddVPValue( 8364 ConstantInt::getFalse(BI->getCondition()->getType())); 8365 EdgeMask = 8366 Builder.createSelect(SrcMask, EdgeMask, False, BI->getDebugLoc()); 8367 } 8368 8369 return EdgeMaskCache[Edge] = EdgeMask; 8370 } 8371 8372 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { 8373 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 8374 8375 // Look for cached value. 8376 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); 8377 if (BCEntryIt != BlockMaskCache.end()) 8378 return BCEntryIt->second; 8379 8380 // All-one mask is modelled as no-mask following the convention for masked 8381 // load/store/gather/scatter. Initialize BlockMask to no-mask. 8382 VPValue *BlockMask = nullptr; 8383 8384 if (OrigLoop->getHeader() == BB) { 8385 if (!CM.blockNeedsPredicationForAnyReason(BB)) 8386 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. 8387 8388 // Introduce the early-exit compare IV <= BTC to form header block mask. 8389 // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by 8390 // constructing the desired canonical IV in the header block as its first 8391 // non-phi instructions. 8392 assert(CM.foldTailByMasking() && "must fold the tail"); 8393 VPBasicBlock *HeaderVPBB = Plan->getEntry()->getEntryBasicBlock(); 8394 auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi(); 8395 auto *IV = new VPWidenCanonicalIVRecipe(Plan->getCanonicalIV()); 8396 HeaderVPBB->insert(IV, HeaderVPBB->getFirstNonPhi()); 8397 8398 VPBuilder::InsertPointGuard Guard(Builder); 8399 Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint); 8400 if (CM.TTI.emitGetActiveLaneMask()) { 8401 VPValue *TC = Plan->getOrCreateTripCount(); 8402 BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC}); 8403 } else { 8404 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); 8405 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); 8406 } 8407 return BlockMaskCache[BB] = BlockMask; 8408 } 8409 8410 // This is the block mask. We OR all incoming edges. 8411 for (auto *Predecessor : predecessors(BB)) { 8412 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); 8413 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too. 8414 return BlockMaskCache[BB] = EdgeMask; 8415 8416 if (!BlockMask) { // BlockMask has its initialized nullptr value. 
8417 BlockMask = EdgeMask; 8418 continue; 8419 } 8420 8421 BlockMask = Builder.createOr(BlockMask, EdgeMask, {}); 8422 } 8423 8424 return BlockMaskCache[BB] = BlockMask; 8425 } 8426 8427 VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, 8428 ArrayRef<VPValue *> Operands, 8429 VFRange &Range, 8430 VPlanPtr &Plan) { 8431 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 8432 "Must be called with either a load or store"); 8433 8434 auto willWiden = [&](ElementCount VF) -> bool { 8435 if (VF.isScalar()) 8436 return false; 8437 LoopVectorizationCostModel::InstWidening Decision = 8438 CM.getWideningDecision(I, VF); 8439 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 8440 "CM decision should be taken at this point."); 8441 if (Decision == LoopVectorizationCostModel::CM_Interleave) 8442 return true; 8443 if (CM.isScalarAfterVectorization(I, VF) || 8444 CM.isProfitableToScalarize(I, VF)) 8445 return false; 8446 return Decision != LoopVectorizationCostModel::CM_Scalarize; 8447 }; 8448 8449 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8450 return nullptr; 8451 8452 VPValue *Mask = nullptr; 8453 if (Legal->isMaskRequired(I)) 8454 Mask = createBlockInMask(I->getParent(), Plan); 8455 8456 // Determine if the pointer operand of the access is either consecutive or 8457 // reverse consecutive. 8458 LoopVectorizationCostModel::InstWidening Decision = 8459 CM.getWideningDecision(I, Range.Start); 8460 bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse; 8461 bool Consecutive = 8462 Reverse || Decision == LoopVectorizationCostModel::CM_Widen; 8463 8464 if (LoadInst *Load = dyn_cast<LoadInst>(I)) 8465 return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask, 8466 Consecutive, Reverse); 8467 8468 StoreInst *Store = cast<StoreInst>(I); 8469 return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0], 8470 Mask, Consecutive, Reverse); 8471 } 8472 8473 static VPWidenIntOrFpInductionRecipe * 8474 createWidenInductionRecipe(PHINode *Phi, Instruction *PhiOrTrunc, 8475 VPValue *Start, const InductionDescriptor &IndDesc, 8476 LoopVectorizationCostModel &CM, Loop &OrigLoop, 8477 VFRange &Range) { 8478 // Returns true if an instruction \p I should be scalarized instead of 8479 // vectorized for the chosen vectorization factor. 8480 auto ShouldScalarizeInstruction = [&CM](Instruction *I, ElementCount VF) { 8481 return CM.isScalarAfterVectorization(I, VF) || 8482 CM.isProfitableToScalarize(I, VF); 8483 }; 8484 8485 bool NeedsScalarIV = LoopVectorizationPlanner::getDecisionAndClampRange( 8486 [&](ElementCount VF) { 8487 // Returns true if we should generate a scalar version of \p IV. 
8488 if (ShouldScalarizeInstruction(PhiOrTrunc, VF)) 8489 return true; 8490 auto isScalarInst = [&](User *U) -> bool { 8491 auto *I = cast<Instruction>(U); 8492 return OrigLoop.contains(I) && ShouldScalarizeInstruction(I, VF); 8493 }; 8494 return any_of(PhiOrTrunc->users(), isScalarInst); 8495 }, 8496 Range); 8497 bool NeedsScalarIVOnly = LoopVectorizationPlanner::getDecisionAndClampRange( 8498 [&](ElementCount VF) { 8499 return ShouldScalarizeInstruction(PhiOrTrunc, VF); 8500 }, 8501 Range); 8502 assert(IndDesc.getStartValue() == 8503 Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader())); 8504 if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) { 8505 return new VPWidenIntOrFpInductionRecipe(Phi, Start, IndDesc, TruncI, 8506 NeedsScalarIV, !NeedsScalarIVOnly); 8507 } 8508 assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here"); 8509 return new VPWidenIntOrFpInductionRecipe(Phi, Start, IndDesc, NeedsScalarIV, 8510 !NeedsScalarIVOnly); 8511 } 8512 8513 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionPHI( 8514 PHINode *Phi, ArrayRef<VPValue *> Operands, VFRange &Range) const { 8515 8516 // Check if this is an integer or fp induction. If so, build the recipe that 8517 // produces its scalar and vector values. 8518 if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi)) 8519 return createWidenInductionRecipe(Phi, Phi, Operands[0], *II, CM, *OrigLoop, 8520 Range); 8521 8522 return nullptr; 8523 } 8524 8525 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate( 8526 TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range, 8527 VPlan &Plan) const { 8528 // Optimize the special case where the source is a constant integer 8529 // induction variable. Notice that we can only optimize the 'trunc' case 8530 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and 8531 // (c) other casts depend on pointer size. 8532 8533 // Determine whether \p K is a truncation based on an induction variable that 8534 // can be optimized. 8535 auto isOptimizableIVTruncate = 8536 [&](Instruction *K) -> std::function<bool(ElementCount)> { 8537 return [=](ElementCount VF) -> bool { 8538 return CM.isOptimizableIVTruncate(K, VF); 8539 }; 8540 }; 8541 8542 if (LoopVectorizationPlanner::getDecisionAndClampRange( 8543 isOptimizableIVTruncate(I), Range)) { 8544 8545 auto *Phi = cast<PHINode>(I->getOperand(0)); 8546 const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi); 8547 VPValue *Start = Plan.getOrAddVPValue(II.getStartValue()); 8548 return createWidenInductionRecipe(Phi, I, Start, II, CM, *OrigLoop, Range); 8549 } 8550 return nullptr; 8551 } 8552 8553 VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi, 8554 ArrayRef<VPValue *> Operands, 8555 VPlanPtr &Plan) { 8556 // If all incoming values are equal, the incoming VPValue can be used directly 8557 // instead of creating a new VPBlendRecipe. 8558 VPValue *FirstIncoming = Operands[0]; 8559 if (all_of(Operands, [FirstIncoming](const VPValue *Inc) { 8560 return FirstIncoming == Inc; 8561 })) { 8562 return Operands[0]; 8563 } 8564 8565 unsigned NumIncoming = Phi->getNumIncomingValues(); 8566 // For in-loop reductions, we do not need to create an additional select. 
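  // Minimal sketch, with made-up names: for a conditionally-updated in-loop
  // reduction the non-header phi may look like
  //   %sum.merge = phi i32 [ %sum.phi, %for.body ], [ %sum.add, %if.then ]
  // where %sum.phi is the in-loop reduction header phi. The reduction recipe
  // already applies the block mask, so the blend can simply forward the other
  // operand (%sum.add) instead of emitting a select.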
8567 VPValue *InLoopVal = nullptr; 8568 for (unsigned In = 0; In < NumIncoming; In++) { 8569 PHINode *PhiOp = 8570 dyn_cast_or_null<PHINode>(Operands[In]->getUnderlyingValue()); 8571 if (PhiOp && CM.isInLoopReduction(PhiOp)) { 8572 assert(!InLoopVal && "Found more than one in-loop reduction!"); 8573 InLoopVal = Operands[In]; 8574 } 8575 } 8576 8577 assert((!InLoopVal || NumIncoming == 2) && 8578 "Found an in-loop reduction for PHI with unexpected number of " 8579 "incoming values"); 8580 if (InLoopVal) 8581 return Operands[Operands[0] == InLoopVal ? 1 : 0]; 8582 8583 // We know that all PHIs in non-header blocks are converted into selects, so 8584 // we don't have to worry about the insertion order and we can just use the 8585 // builder. At this point we generate the predication tree. There may be 8586 // duplications since this is a simple recursive scan, but future 8587 // optimizations will clean it up. 8588 SmallVector<VPValue *, 2> OperandsWithMask; 8589 8590 for (unsigned In = 0; In < NumIncoming; In++) { 8591 VPValue *EdgeMask = 8592 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan); 8593 assert((EdgeMask || NumIncoming == 1) && 8594 "Multiple predecessors with one having a full mask"); 8595 OperandsWithMask.push_back(Operands[In]); 8596 if (EdgeMask) 8597 OperandsWithMask.push_back(EdgeMask); 8598 } 8599 return toVPRecipeResult(new VPBlendRecipe(Phi, OperandsWithMask)); 8600 } 8601 8602 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, 8603 ArrayRef<VPValue *> Operands, 8604 VFRange &Range) const { 8605 8606 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8607 [this, CI](ElementCount VF) { 8608 return CM.isScalarWithPredication(CI, VF); 8609 }, 8610 Range); 8611 8612 if (IsPredicated) 8613 return nullptr; 8614 8615 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8616 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || 8617 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect || 8618 ID == Intrinsic::pseudoprobe || 8619 ID == Intrinsic::experimental_noalias_scope_decl)) 8620 return nullptr; 8621 8622 auto willWiden = [&](ElementCount VF) -> bool { 8623 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8624 // The following case may be scalarized depending on the VF. 8625 // The flag shows whether we use Intrinsic or a usual Call for vectorized 8626 // version of the instruction. 8627 // Is it beneficial to perform intrinsic call compared to lib call? 8628 bool NeedToScalarize = false; 8629 InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); 8630 InstructionCost IntrinsicCost = ID ? CM.getVectorIntrinsicCost(CI, VF) : 0; 8631 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 8632 return UseVectorIntrinsic || !NeedToScalarize; 8633 }; 8634 8635 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8636 return nullptr; 8637 8638 ArrayRef<VPValue *> Ops = Operands.take_front(CI->arg_size()); 8639 return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end())); 8640 } 8641 8642 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { 8643 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) && 8644 !isa<StoreInst>(I) && "Instruction should have been handled earlier"); 8645 // Instruction should be widened, unless it is scalar after vectorization, 8646 // scalarization is profitable or it is predicated. 
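  // Illustrative example of how the query below interacts with
  // getDecisionAndClampRange: with Range = {4, 8, 16} and WillScalarize false
  // for VF=4 and VF=8 but true for VF=16, the call returns false (the answer
  // for Range.Start) and clamps Range.End to 16, so this VPlan widens the
  // instruction for VF=4 and VF=8 while VF=16 is left to a later plan.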
8647 auto WillScalarize = [this, I](ElementCount VF) -> bool { 8648 return CM.isScalarAfterVectorization(I, VF) || 8649 CM.isProfitableToScalarize(I, VF) || 8650 CM.isScalarWithPredication(I, VF); 8651 }; 8652 return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize, 8653 Range); 8654 } 8655 8656 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, 8657 ArrayRef<VPValue *> Operands) const { 8658 auto IsVectorizableOpcode = [](unsigned Opcode) { 8659 switch (Opcode) { 8660 case Instruction::Add: 8661 case Instruction::And: 8662 case Instruction::AShr: 8663 case Instruction::BitCast: 8664 case Instruction::FAdd: 8665 case Instruction::FCmp: 8666 case Instruction::FDiv: 8667 case Instruction::FMul: 8668 case Instruction::FNeg: 8669 case Instruction::FPExt: 8670 case Instruction::FPToSI: 8671 case Instruction::FPToUI: 8672 case Instruction::FPTrunc: 8673 case Instruction::FRem: 8674 case Instruction::FSub: 8675 case Instruction::ICmp: 8676 case Instruction::IntToPtr: 8677 case Instruction::LShr: 8678 case Instruction::Mul: 8679 case Instruction::Or: 8680 case Instruction::PtrToInt: 8681 case Instruction::SDiv: 8682 case Instruction::Select: 8683 case Instruction::SExt: 8684 case Instruction::Shl: 8685 case Instruction::SIToFP: 8686 case Instruction::SRem: 8687 case Instruction::Sub: 8688 case Instruction::Trunc: 8689 case Instruction::UDiv: 8690 case Instruction::UIToFP: 8691 case Instruction::URem: 8692 case Instruction::Xor: 8693 case Instruction::ZExt: 8694 return true; 8695 } 8696 return false; 8697 }; 8698 8699 if (!IsVectorizableOpcode(I->getOpcode())) 8700 return nullptr; 8701 8702 // Success: widen this instruction. 8703 return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end())); 8704 } 8705 8706 void VPRecipeBuilder::fixHeaderPhis() { 8707 BasicBlock *OrigLatch = OrigLoop->getLoopLatch(); 8708 for (VPHeaderPHIRecipe *R : PhisToFix) { 8709 auto *PN = cast<PHINode>(R->getUnderlyingValue()); 8710 VPRecipeBase *IncR = 8711 getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch))); 8712 R->addOperand(IncR->getVPSingleValue()); 8713 } 8714 } 8715 8716 VPBasicBlock *VPRecipeBuilder::handleReplication( 8717 Instruction *I, VFRange &Range, VPBasicBlock *VPBB, 8718 VPlanPtr &Plan) { 8719 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( 8720 [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); }, 8721 Range); 8722 8723 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8724 [&](ElementCount VF) { return CM.isPredicatedInst(I, VF, IsUniform); }, 8725 Range); 8726 8727 // Even if the instruction is not marked as uniform, there are certain 8728 // intrinsic calls that can be effectively treated as such, so we check for 8729 // them here. Conservatively, we only do this for scalable vectors, since 8730 // for fixed-width VFs we can always fall back on full scalarization. 8731 if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) { 8732 switch (cast<IntrinsicInst>(I)->getIntrinsicID()) { 8733 case Intrinsic::assume: 8734 case Intrinsic::lifetime_start: 8735 case Intrinsic::lifetime_end: 8736 // For scalable vectors if one of the operands is variant then we still 8737 // want to mark as uniform, which will generate one instruction for just 8738 // the first lane of the vector. We can't scalarize the call in the same 8739 // way as for fixed-width vectors because we don't know how many lanes 8740 // there are. 
8741       //
8742       // The reasons for doing it this way for scalable vectors are:
8743       //   1. For the assume intrinsic, generating the instruction for the
8744       //      first lane is still better than not generating any at all. For
8745       //      example, the input may be a splat across all lanes.
8746       //   2. For the lifetime start/end intrinsics the pointer operand only
8747       //      does anything useful when the input comes from a stack object,
8748       //      which suggests it should always be uniform. For non-stack objects
8749       //      the effect is to poison the object, which still allows us to
8750       //      remove the call.
8751       IsUniform = true;
8752       break;
8753     default:
8754       break;
8755     }
8756   }
8757
8758   auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
8759                                        IsUniform, IsPredicated);
8760   setRecipe(I, Recipe);
8761   Plan->addVPValue(I, Recipe);
8762
8763   // Find if I uses a predicated instruction. If so, it will use its scalar
8764   // value. Avoid hoisting the insert-element which packs the scalar value into
8765   // a vector value, as that happens iff all users use the vector value.
8766   for (VPValue *Op : Recipe->operands()) {
8767     auto *PredR = dyn_cast_or_null<VPPredInstPHIRecipe>(Op->getDef());
8768     if (!PredR)
8769       continue;
8770     auto *RepR =
8771         cast_or_null<VPReplicateRecipe>(PredR->getOperand(0)->getDef());
8772     assert(RepR->isPredicated() &&
8773            "expected Replicate recipe to be predicated");
8774     RepR->setAlsoPack(false);
8775   }
8776
8777   // Finalize the recipe for Instr, handling the non-predicated case first.
8778   if (!IsPredicated) {
8779     LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
8780     VPBB->appendRecipe(Recipe);
8781     return VPBB;
8782   }
8783   LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
8784
8785   VPBlockBase *SingleSucc = VPBB->getSingleSuccessor();
8786   assert(SingleSucc && "VPBB must have a single successor when handling "
8787                        "predicated replication.");
8788   VPBlockUtils::disconnectBlocks(VPBB, SingleSucc);
8789   // Record predicated instructions for above packing optimizations.
8790   VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
8791   VPBlockUtils::insertBlockAfter(Region, VPBB);
8792   auto *RegSucc = new VPBasicBlock();
8793   VPBlockUtils::insertBlockAfter(RegSucc, Region);
8794   VPBlockUtils::connectBlocks(RegSucc, SingleSucc);
8795   return RegSucc;
8796 }
8797
8798 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
8799                                                       VPRecipeBase *PredRecipe,
8800                                                       VPlanPtr &Plan) {
8801   // Instructions marked for predication are replicated and placed under an
8802   // if-then construct to prevent side-effects.
8803
8804   // Generate recipes to compute the block mask for this region.
8805   VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
8806
8807   // Build the triangular if-then region.
8808   std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
8809   assert(Instr->getParent() && "Predicated instruction not in any basic block");
8810   auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
8811   auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
8812   auto *PHIRecipe = Instr->getType()->isVoidTy()
8813                         ?
nullptr 8814 : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr)); 8815 if (PHIRecipe) { 8816 Plan->removeVPValueFor(Instr); 8817 Plan->addVPValue(Instr, PHIRecipe); 8818 } 8819 auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); 8820 auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); 8821 VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true); 8822 8823 // Note: first set Entry as region entry and then connect successors starting 8824 // from it in order, to propagate the "parent" of each VPBasicBlock. 8825 VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry); 8826 VPBlockUtils::connectBlocks(Pred, Exit); 8827 8828 return Region; 8829 } 8830 8831 VPRecipeOrVPValueTy 8832 VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, 8833 ArrayRef<VPValue *> Operands, 8834 VFRange &Range, VPlanPtr &Plan) { 8835 // First, check for specific widening recipes that deal with calls, memory 8836 // operations, inductions and Phi nodes. 8837 if (auto *CI = dyn_cast<CallInst>(Instr)) 8838 return toVPRecipeResult(tryToWidenCall(CI, Operands, Range)); 8839 8840 if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr)) 8841 return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan)); 8842 8843 VPRecipeBase *Recipe; 8844 if (auto Phi = dyn_cast<PHINode>(Instr)) { 8845 if (Phi->getParent() != OrigLoop->getHeader()) 8846 return tryToBlend(Phi, Operands, Plan); 8847 if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, Range))) 8848 return toVPRecipeResult(Recipe); 8849 8850 VPHeaderPHIRecipe *PhiRecipe = nullptr; 8851 if (Legal->isReductionVariable(Phi) || Legal->isFirstOrderRecurrence(Phi)) { 8852 VPValue *StartV = Operands[0]; 8853 if (Legal->isReductionVariable(Phi)) { 8854 const RecurrenceDescriptor &RdxDesc = 8855 Legal->getReductionVars().find(Phi)->second; 8856 assert(RdxDesc.getRecurrenceStartValue() == 8857 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); 8858 PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV, 8859 CM.isInLoopReduction(Phi), 8860 CM.useOrderedReductions(RdxDesc)); 8861 } else { 8862 PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV); 8863 } 8864 8865 // Record the incoming value from the backedge, so we can add the incoming 8866 // value from the backedge after all recipes have been created. 8867 recordRecipeOf(cast<Instruction>( 8868 Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch()))); 8869 PhisToFix.push_back(PhiRecipe); 8870 } else { 8871 // TODO: record backedge value for remaining pointer induction phis. 
8872       assert(Phi->getType()->isPointerTy() &&
8873              "only pointer phis should be handled here");
8874       assert(Legal->getInductionVars().count(Phi) &&
8875              "Not an induction variable");
8876       InductionDescriptor II = Legal->getInductionVars().lookup(Phi);
8877       VPValue *Start = Plan->getOrAddVPValue(II.getStartValue());
8878       PhiRecipe = new VPWidenPHIRecipe(Phi, Start);
8879     }
8880
8881     return toVPRecipeResult(PhiRecipe);
8882   }
8883
8884   if (isa<TruncInst>(Instr) &&
8885       (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Operands,
8886                                                Range, *Plan)))
8887     return toVPRecipeResult(Recipe);
8888
8889   if (!shouldWiden(Instr, Range))
8890     return nullptr;
8891
8892   if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
8893     return toVPRecipeResult(new VPWidenGEPRecipe(
8894         GEP, make_range(Operands.begin(), Operands.end()), OrigLoop));
8895
8896   if (auto *SI = dyn_cast<SelectInst>(Instr)) {
8897     bool InvariantCond =
8898         PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
8899     return toVPRecipeResult(new VPWidenSelectRecipe(
8900         *SI, make_range(Operands.begin(), Operands.end()), InvariantCond));
8901   }
8902
8903   return toVPRecipeResult(tryToWiden(Instr, Operands));
8904 }
8905
8906 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
8907                                                         ElementCount MaxVF) {
8908   assert(OrigLoop->isInnermost() && "Inner loop expected.");
8909
8910   // Collect instructions from the original loop that will become trivially dead
8911   // in the vectorized loop. We don't need to vectorize these instructions. For
8912   // example, original induction update instructions can become dead because we
8913   // separately emit induction "steps" when generating code for the new loop.
8914   // Similarly, we create a new latch condition when setting up the structure
8915   // of the new loop, so the old one can become dead.
8916   SmallPtrSet<Instruction *, 4> DeadInstructions;
8917   collectTriviallyDeadInstructions(DeadInstructions);
8918
8919   // Add assume instructions we need to drop to DeadInstructions, to prevent
8920   // them from being added to the VPlan.
8921   // TODO: We only need to drop assumes in blocks that get flattened. If the
8922   // control flow is preserved, we should keep them.
8923   auto &ConditionalAssumes = Legal->getConditionalAssumes();
8924   DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
8925
8926   MapVector<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
8927   // Dead instructions do not need sinking. Remove them from SinkAfter.
8928   for (Instruction *I : DeadInstructions)
8929     SinkAfter.erase(I);
8930
8931   // Cannot sink instructions after dead instructions (there won't be any
8932   // recipes for them). Instead, find the first non-dead previous instruction.
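  // Illustrative example: if SinkAfter maps S -> T but T is, say, a dead
  // induction update such as
  //   %iv.next = add nuw i64 %iv, 1
  // no recipe exists for T; the loop below walks backwards from T to the
  // closest live instruction and uses that as the new sink target.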
8933 for (auto &P : Legal->getSinkAfter()) { 8934 Instruction *SinkTarget = P.second; 8935 Instruction *FirstInst = &*SinkTarget->getParent()->begin(); 8936 (void)FirstInst; 8937 while (DeadInstructions.contains(SinkTarget)) { 8938 assert( 8939 SinkTarget != FirstInst && 8940 "Must find a live instruction (at least the one feeding the " 8941 "first-order recurrence PHI) before reaching beginning of the block"); 8942 SinkTarget = SinkTarget->getPrevNode(); 8943 assert(SinkTarget != P.first && 8944 "sink source equals target, no sinking required"); 8945 } 8946 P.second = SinkTarget; 8947 } 8948 8949 auto MaxVFPlusOne = MaxVF.getWithIncrement(1); 8950 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { 8951 VFRange SubRange = {VF, MaxVFPlusOne}; 8952 VPlans.push_back( 8953 buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter)); 8954 VF = SubRange.End; 8955 } 8956 } 8957 8958 // Add a VPCanonicalIVPHIRecipe starting at 0 to the header, a 8959 // CanonicalIVIncrement{NUW} VPInstruction to increment it by VF * UF and a 8960 // BranchOnCount VPInstruction to the latch. 8961 static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL, 8962 bool HasNUW, bool IsVPlanNative) { 8963 Value *StartIdx = ConstantInt::get(IdxTy, 0); 8964 auto *StartV = Plan.getOrAddVPValue(StartIdx); 8965 8966 auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL); 8967 VPRegionBlock *TopRegion = Plan.getVectorLoopRegion(); 8968 VPBasicBlock *Header = TopRegion->getEntryBasicBlock(); 8969 if (IsVPlanNative) 8970 Header = cast<VPBasicBlock>(Header->getSingleSuccessor()); 8971 Header->insert(CanonicalIVPHI, Header->begin()); 8972 8973 auto *CanonicalIVIncrement = 8974 new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementNUW 8975 : VPInstruction::CanonicalIVIncrement, 8976 {CanonicalIVPHI}, DL); 8977 CanonicalIVPHI->addOperand(CanonicalIVIncrement); 8978 8979 VPBasicBlock *EB = TopRegion->getExitBasicBlock(); 8980 if (IsVPlanNative) { 8981 EB = cast<VPBasicBlock>(EB->getSinglePredecessor()); 8982 EB->setCondBit(nullptr); 8983 } 8984 EB->appendRecipe(CanonicalIVIncrement); 8985 8986 auto *BranchOnCount = 8987 new VPInstruction(VPInstruction::BranchOnCount, 8988 {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL); 8989 EB->appendRecipe(BranchOnCount); 8990 } 8991 8992 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( 8993 VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions, 8994 const MapVector<Instruction *, Instruction *> &SinkAfter) { 8995 8996 SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups; 8997 8998 VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder); 8999 9000 // --------------------------------------------------------------------------- 9001 // Pre-construction: record ingredients whose recipes we'll need to further 9002 // process after constructing the initial VPlan. 9003 // --------------------------------------------------------------------------- 9004 9005 // Mark instructions we'll need to sink later and their targets as 9006 // ingredients whose recipe we'll need to record. 
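  // Illustrative example of why sinking is requested (names are made up):
  // for a first-order recurrence like
  //   t = init; for (i = 0; i < n; ++i) { out[i] = t + in[i]; t = in[i]; }
  // legality may ask to sink the add (the user of the recurrence phi) after
  // the load producing the "previous" value, so both sides of each SinkAfter
  // entry need their recipes recorded up front.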
9007   for (auto &Entry : SinkAfter) {
9008     RecipeBuilder.recordRecipeOf(Entry.first);
9009     RecipeBuilder.recordRecipeOf(Entry.second);
9010   }
9011   for (auto &Reduction : CM.getInLoopReductionChains()) {
9012     PHINode *Phi = Reduction.first;
9013     RecurKind Kind =
9014         Legal->getReductionVars().find(Phi)->second.getRecurrenceKind();
9015     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
9016
9017     RecipeBuilder.recordRecipeOf(Phi);
9018     for (auto &R : ReductionOperations) {
9019       RecipeBuilder.recordRecipeOf(R);
9020       // For min/max reductions, where we have a pair of icmp/select, we also
9021       // need to record the ICmp recipe, so it can be removed later.
9022       assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
9023              "Only min/max recurrences allowed for inloop reductions");
9024       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
9025         RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0)));
9026     }
9027   }
9028
9029   // For each interleave group which is relevant for this (possibly trimmed)
9030   // Range, add it to the set of groups to be later applied to the VPlan and add
9031   // placeholders for its members' Recipes which we'll be replacing with a
9032   // single VPInterleaveRecipe.
9033   for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
9034     auto applyIG = [IG, this](ElementCount VF) -> bool {
9035       return (VF.isVector() && // Query is illegal for VF == 1
9036               CM.getWideningDecision(IG->getInsertPos(), VF) ==
9037                   LoopVectorizationCostModel::CM_Interleave);
9038     };
9039     if (!getDecisionAndClampRange(applyIG, Range))
9040       continue;
9041     InterleaveGroups.insert(IG);
9042     for (unsigned i = 0; i < IG->getFactor(); i++)
9043       if (Instruction *Member = IG->getMember(i))
9044         RecipeBuilder.recordRecipeOf(Member);
9045   }
9046
9047   // ---------------------------------------------------------------------------
9048   // Build initial VPlan: Scan the body of the loop in a topological order to
9049   // visit each basic block after having visited its predecessor basic blocks.
9050   // ---------------------------------------------------------------------------
9051
9052   // Create initial VPlan skeleton, with separate header and latch blocks.
9053   VPBasicBlock *HeaderVPBB = new VPBasicBlock();
9054   VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch");
9055   VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB);
9056   auto *TopRegion = new VPRegionBlock(HeaderVPBB, LatchVPBB, "vector loop");
9057   auto Plan = std::make_unique<VPlan>(TopRegion);
9058
9059   Instruction *DLInst =
9060       getDebugLocFromInstOrOperands(Legal->getPrimaryInduction());
9061   addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(),
9062                         DLInst ? DLInst->getDebugLoc() : DebugLoc(),
9063                         !CM.foldTailByMasking(), false);
9064
9065   // Scan the body of the loop in a topological order to visit each basic block
9066   // after having visited its predecessor basic blocks.
9067   LoopBlocksDFS DFS(OrigLoop);
9068   DFS.perform(LI);
9069
9070   VPBasicBlock *VPBB = HeaderVPBB;
9071   SmallVector<VPWidenIntOrFpInductionRecipe *> InductionsToMove;
9072   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
9073     // Relevant instructions from basic block BB will be grouped into VPRecipe
9074     // ingredients and fill a new VPBasicBlock.
9075     unsigned VPBBsForBB = 0;
9076     VPBB->setName(BB->getName());
9077     Builder.setInsertPoint(VPBB);
9078
9079     // Introduce each ingredient into VPlan.
9080     // TODO: Model and preserve debug intrinsics in VPlan.
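    // Rough sketch of the mapping performed below (illustrative IR): a body
    // instruction such as
    //   %mul = fmul fast float %a, %b
    // typically becomes a VPWidenRecipe over the VPValues of %a and %b, while
    // calls, memory operations, GEPs, selects, phis and induction truncates
    // are matched by the more specific tryTo* helpers first.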
9081 for (Instruction &I : BB->instructionsWithoutDebug()) { 9082 Instruction *Instr = &I; 9083 9084 // First filter out irrelevant instructions, to ensure no recipes are 9085 // built for them. 9086 if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr)) 9087 continue; 9088 9089 SmallVector<VPValue *, 4> Operands; 9090 auto *Phi = dyn_cast<PHINode>(Instr); 9091 if (Phi && Phi->getParent() == OrigLoop->getHeader()) { 9092 Operands.push_back(Plan->getOrAddVPValue( 9093 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()))); 9094 } else { 9095 auto OpRange = Plan->mapToVPValues(Instr->operands()); 9096 Operands = {OpRange.begin(), OpRange.end()}; 9097 } 9098 if (auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe( 9099 Instr, Operands, Range, Plan)) { 9100 // If Instr can be simplified to an existing VPValue, use it. 9101 if (RecipeOrValue.is<VPValue *>()) { 9102 auto *VPV = RecipeOrValue.get<VPValue *>(); 9103 Plan->addVPValue(Instr, VPV); 9104 // If the re-used value is a recipe, register the recipe for the 9105 // instruction, in case the recipe for Instr needs to be recorded. 9106 if (auto *R = dyn_cast_or_null<VPRecipeBase>(VPV->getDef())) 9107 RecipeBuilder.setRecipe(Instr, R); 9108 continue; 9109 } 9110 // Otherwise, add the new recipe. 9111 VPRecipeBase *Recipe = RecipeOrValue.get<VPRecipeBase *>(); 9112 for (auto *Def : Recipe->definedValues()) { 9113 auto *UV = Def->getUnderlyingValue(); 9114 Plan->addVPValue(UV, Def); 9115 } 9116 9117 if (isa<VPWidenIntOrFpInductionRecipe>(Recipe) && 9118 HeaderVPBB->getFirstNonPhi() != VPBB->end()) { 9119 // Keep track of VPWidenIntOrFpInductionRecipes not in the phi section 9120 // of the header block. That can happen for truncates of induction 9121 // variables. Those recipes are moved to the phi section of the header 9122 // block after applying SinkAfter, which relies on the original 9123 // position of the trunc. 9124 assert(isa<TruncInst>(Instr)); 9125 InductionsToMove.push_back( 9126 cast<VPWidenIntOrFpInductionRecipe>(Recipe)); 9127 } 9128 RecipeBuilder.setRecipe(Instr, Recipe); 9129 VPBB->appendRecipe(Recipe); 9130 continue; 9131 } 9132 9133 // Otherwise, if all widening options failed, Instruction is to be 9134 // replicated. This may create a successor for VPBB. 9135 VPBasicBlock *NextVPBB = 9136 RecipeBuilder.handleReplication(Instr, Range, VPBB, Plan); 9137 if (NextVPBB != VPBB) { 9138 VPBB = NextVPBB; 9139 VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++) 9140 : ""); 9141 } 9142 } 9143 9144 VPBlockUtils::insertBlockAfter(new VPBasicBlock(), VPBB); 9145 VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor()); 9146 } 9147 9148 // Fold the last, empty block into its predecessor. 9149 VPBB = VPBlockUtils::tryToMergeBlockIntoPredecessor(VPBB); 9150 assert(VPBB && "expected to fold last (empty) block"); 9151 // After here, VPBB should not be used. 9152 VPBB = nullptr; 9153 9154 assert(isa<VPRegionBlock>(Plan->getEntry()) && 9155 !Plan->getEntry()->getEntryBasicBlock()->empty() && 9156 "entry block must be set to a VPRegionBlock having a non-empty entry " 9157 "VPBasicBlock"); 9158 RecipeBuilder.fixHeaderPhis(); 9159 9160 // --------------------------------------------------------------------------- 9161 // Transform initial VPlan: Apply previously taken decisions, in order, to 9162 // bring the VPlan to its final state. 9163 // --------------------------------------------------------------------------- 9164 9165 // Apply Sink-After legal constraints. 
9166 auto GetReplicateRegion = [](VPRecipeBase *R) -> VPRegionBlock * { 9167 auto *Region = dyn_cast_or_null<VPRegionBlock>(R->getParent()->getParent()); 9168 if (Region && Region->isReplicator()) { 9169 assert(Region->getNumSuccessors() == 1 && 9170 Region->getNumPredecessors() == 1 && "Expected SESE region!"); 9171 assert(R->getParent()->size() == 1 && 9172 "A recipe in an original replicator region must be the only " 9173 "recipe in its block"); 9174 return Region; 9175 } 9176 return nullptr; 9177 }; 9178 for (auto &Entry : SinkAfter) { 9179 VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first); 9180 VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second); 9181 9182 auto *TargetRegion = GetReplicateRegion(Target); 9183 auto *SinkRegion = GetReplicateRegion(Sink); 9184 if (!SinkRegion) { 9185 // If the sink source is not a replicate region, sink the recipe directly. 9186 if (TargetRegion) { 9187 // The target is in a replication region, make sure to move Sink to 9188 // the block after it, not into the replication region itself. 9189 VPBasicBlock *NextBlock = 9190 cast<VPBasicBlock>(TargetRegion->getSuccessors().front()); 9191 Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi()); 9192 } else 9193 Sink->moveAfter(Target); 9194 continue; 9195 } 9196 9197 // The sink source is in a replicate region. Unhook the region from the CFG. 9198 auto *SinkPred = SinkRegion->getSinglePredecessor(); 9199 auto *SinkSucc = SinkRegion->getSingleSuccessor(); 9200 VPBlockUtils::disconnectBlocks(SinkPred, SinkRegion); 9201 VPBlockUtils::disconnectBlocks(SinkRegion, SinkSucc); 9202 VPBlockUtils::connectBlocks(SinkPred, SinkSucc); 9203 9204 if (TargetRegion) { 9205 // The target recipe is also in a replicate region, move the sink region 9206 // after the target region. 9207 auto *TargetSucc = TargetRegion->getSingleSuccessor(); 9208 VPBlockUtils::disconnectBlocks(TargetRegion, TargetSucc); 9209 VPBlockUtils::connectBlocks(TargetRegion, SinkRegion); 9210 VPBlockUtils::connectBlocks(SinkRegion, TargetSucc); 9211 } else { 9212 // The sink source is in a replicate region, we need to move the whole 9213 // replicate region, which should only contain a single recipe in the 9214 // main block. 9215 auto *SplitBlock = 9216 Target->getParent()->splitAt(std::next(Target->getIterator())); 9217 9218 auto *SplitPred = SplitBlock->getSinglePredecessor(); 9219 9220 VPBlockUtils::disconnectBlocks(SplitPred, SplitBlock); 9221 VPBlockUtils::connectBlocks(SplitPred, SinkRegion); 9222 VPBlockUtils::connectBlocks(SinkRegion, SplitBlock); 9223 } 9224 } 9225 9226 VPlanTransforms::removeRedundantCanonicalIVs(*Plan); 9227 VPlanTransforms::removeRedundantInductionCasts(*Plan); 9228 9229 // Now that sink-after is done, move induction recipes for optimized truncates 9230 // to the phi section of the header block. 9231 for (VPWidenIntOrFpInductionRecipe *Ind : InductionsToMove) 9232 Ind->moveBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi()); 9233 9234 // Adjust the recipes for any inloop reductions. 9235 adjustRecipesForReductions(cast<VPBasicBlock>(TopRegion->getExit()), Plan, 9236 RecipeBuilder, Range.Start); 9237 9238 // Introduce a recipe to combine the incoming and previous values of a 9239 // first-order recurrence. 
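  // Illustrative semantics of the splice created below (VF=4, lane 0 shown
  // first): given the previous iteration's vector <p0 p1 p2 p3> and the
  // current one <c0 c1 c2 c3>, the splice yields <p3 c0 c1 c2>, i.e. each
  // lane sees the value the scalar loop would have seen as "previous".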
9240 for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) { 9241 auto *RecurPhi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R); 9242 if (!RecurPhi) 9243 continue; 9244 9245 VPRecipeBase *PrevRecipe = RecurPhi->getBackedgeRecipe(); 9246 VPBasicBlock *InsertBlock = PrevRecipe->getParent(); 9247 auto *Region = GetReplicateRegion(PrevRecipe); 9248 if (Region) 9249 InsertBlock = cast<VPBasicBlock>(Region->getSingleSuccessor()); 9250 if (Region || PrevRecipe->isPhi()) 9251 Builder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi()); 9252 else 9253 Builder.setInsertPoint(InsertBlock, std::next(PrevRecipe->getIterator())); 9254 9255 auto *RecurSplice = cast<VPInstruction>( 9256 Builder.createNaryOp(VPInstruction::FirstOrderRecurrenceSplice, 9257 {RecurPhi, RecurPhi->getBackedgeValue()})); 9258 9259 RecurPhi->replaceAllUsesWith(RecurSplice); 9260 // Set the first operand of RecurSplice to RecurPhi again, after replacing 9261 // all users. 9262 RecurSplice->setOperand(0, RecurPhi); 9263 } 9264 9265 // Interleave memory: for each Interleave Group we marked earlier as relevant 9266 // for this VPlan, replace the Recipes widening its memory instructions with a 9267 // single VPInterleaveRecipe at its insertion point. 9268 for (auto IG : InterleaveGroups) { 9269 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( 9270 RecipeBuilder.getRecipe(IG->getInsertPos())); 9271 SmallVector<VPValue *, 4> StoredValues; 9272 for (unsigned i = 0; i < IG->getFactor(); ++i) 9273 if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) { 9274 auto *StoreR = 9275 cast<VPWidenMemoryInstructionRecipe>(RecipeBuilder.getRecipe(SI)); 9276 StoredValues.push_back(StoreR->getStoredValue()); 9277 } 9278 9279 auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues, 9280 Recipe->getMask()); 9281 VPIG->insertBefore(Recipe); 9282 unsigned J = 0; 9283 for (unsigned i = 0; i < IG->getFactor(); ++i) 9284 if (Instruction *Member = IG->getMember(i)) { 9285 if (!Member->getType()->isVoidTy()) { 9286 VPValue *OriginalV = Plan->getVPValue(Member); 9287 Plan->removeVPValueFor(Member); 9288 Plan->addVPValue(Member, VPIG->getVPValue(J)); 9289 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J)); 9290 J++; 9291 } 9292 RecipeBuilder.getRecipe(Member)->eraseFromParent(); 9293 } 9294 } 9295 9296 // From this point onwards, VPlan-to-VPlan transformations may change the plan 9297 // in ways that accessing values using original IR values is incorrect. 9298 Plan->disableValue2VPValue(); 9299 9300 VPlanTransforms::optimizeInductions(*Plan, *PSE.getSE()); 9301 VPlanTransforms::sinkScalarOperands(*Plan); 9302 VPlanTransforms::mergeReplicateRegions(*Plan); 9303 VPlanTransforms::removeDeadRecipes(*Plan, *OrigLoop); 9304 9305 std::string PlanName; 9306 raw_string_ostream RSO(PlanName); 9307 ElementCount VF = Range.Start; 9308 Plan->addVF(VF); 9309 RSO << "Initial VPlan for VF={" << VF; 9310 for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) { 9311 Plan->addVF(VF); 9312 RSO << "," << VF; 9313 } 9314 RSO << "},UF>=1"; 9315 RSO.flush(); 9316 Plan->setName(PlanName); 9317 9318 // Fold Exit block into its predecessor if possible. 9319 // TODO: Fold block earlier once all VPlan transforms properly maintain a 9320 // VPBasicBlock as exit. 
9321   VPBlockUtils::tryToMergeBlockIntoPredecessor(TopRegion->getExit());
9322
9323   assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid");
9324   return Plan;
9325 }
9326
9327 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
9328   // Outer loop handling: outer loops may require CFG and instruction level
9329   // transformations before even evaluating whether vectorization is profitable.
9330   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
9331   // the vectorization pipeline.
9332   assert(!OrigLoop->isInnermost());
9333   assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
9334
9335   // Create new empty VPlan.
9336   auto Plan = std::make_unique<VPlan>();
9337
9338   // Build hierarchical CFG.
9339   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
9340   HCFGBuilder.buildHierarchicalCFG();
9341
9342   for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End);
9343        VF *= 2)
9344     Plan->addVF(VF);
9345
9346   if (EnableVPlanPredication) {
9347     VPlanPredicator VPP(*Plan);
9348     VPP.predicate();
9349
9350     // Avoid running transformation to recipes until masked code generation in
9351     // VPlan-native path is in place.
9352     return Plan;
9353   }
9354
9355   SmallPtrSet<Instruction *, 1> DeadInstructions;
9356   VPlanTransforms::VPInstructionsToVPRecipes(
9357       OrigLoop, Plan,
9358       [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); },
9359       DeadInstructions, *PSE.getSE());
9360
9361   addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DebugLoc(),
9362                         true, true);
9363   return Plan;
9364 }
9365
9366 // Adjust the recipes for reductions. For in-loop reductions the chain of
9367 // instructions leading from the loop exit instr to the phi needs to be
9368 // converted to reductions, with one operand being vector and the other being
9369 // the scalar reduction chain. For other reductions, a select is introduced
9370 // between the phi and live-out recipes when folding the tail.
9371 void LoopVectorizationPlanner::adjustRecipesForReductions(
9372     VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder,
9373     ElementCount MinVF) {
9374   for (auto &Reduction : CM.getInLoopReductionChains()) {
9375     PHINode *Phi = Reduction.first;
9376     const RecurrenceDescriptor &RdxDesc =
9377         Legal->getReductionVars().find(Phi)->second;
9378     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
9379
9380     if (MinVF.isScalar() && !CM.useOrderedReductions(RdxDesc))
9381       continue;
9382
9383     // ReductionOperations are ordered top-down from the phi's use to the
9384     // LoopExitValue. We keep track of the previous item (the Chain) to tell
9385     // which of the two operands will remain scalar and which will be reduced.
9386     // For minmax the chain will be the select instructions.
9387     Instruction *Chain = Phi;
9388     for (Instruction *R : ReductionOperations) {
9389       VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
9390       RecurKind Kind = RdxDesc.getRecurrenceKind();
9391
9392       VPValue *ChainOp = Plan->getVPValue(Chain);
9393       unsigned FirstOpId;
9394       assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
9395              "Only min/max recurrences allowed for inloop reductions");
9396       // Recognize a call to the llvm.fmuladd intrinsic.
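      // Illustrative example: a reduction update of the form
      //   %acc.next = call float @llvm.fmuladd.f32(float %a, float %b, float %acc)
      // is split below into an FMul recipe computing %a * %b, which then
      // feeds the fadd reduction recipe together with the chain value (%acc).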
9397 bool IsFMulAdd = (Kind == RecurKind::FMulAdd); 9398 assert((!IsFMulAdd || RecurrenceDescriptor::isFMulAddIntrinsic(R)) && 9399 "Expected instruction to be a call to the llvm.fmuladd intrinsic"); 9400 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9401 assert(isa<VPWidenSelectRecipe>(WidenRecipe) && 9402 "Expected to replace a VPWidenSelectSC"); 9403 FirstOpId = 1; 9404 } else { 9405 assert((MinVF.isScalar() || isa<VPWidenRecipe>(WidenRecipe) || 9406 (IsFMulAdd && isa<VPWidenCallRecipe>(WidenRecipe))) && 9407 "Expected to replace a VPWidenSC"); 9408 FirstOpId = 0; 9409 } 9410 unsigned VecOpId = 9411 R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId; 9412 VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId)); 9413 9414 auto *CondOp = CM.blockNeedsPredicationForAnyReason(R->getParent()) 9415 ? RecipeBuilder.createBlockInMask(R->getParent(), Plan) 9416 : nullptr; 9417 9418 if (IsFMulAdd) { 9419 // If the instruction is a call to the llvm.fmuladd intrinsic then we 9420 // need to create an fmul recipe to use as the vector operand for the 9421 // fadd reduction. 9422 VPInstruction *FMulRecipe = new VPInstruction( 9423 Instruction::FMul, {VecOp, Plan->getVPValue(R->getOperand(1))}); 9424 FMulRecipe->setFastMathFlags(R->getFastMathFlags()); 9425 WidenRecipe->getParent()->insert(FMulRecipe, 9426 WidenRecipe->getIterator()); 9427 VecOp = FMulRecipe; 9428 } 9429 VPReductionRecipe *RedRecipe = 9430 new VPReductionRecipe(&RdxDesc, R, ChainOp, VecOp, CondOp, TTI); 9431 WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe); 9432 Plan->removeVPValueFor(R); 9433 Plan->addVPValue(R, RedRecipe); 9434 WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator()); 9435 WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe); 9436 WidenRecipe->eraseFromParent(); 9437 9438 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9439 VPRecipeBase *CompareRecipe = 9440 RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0))); 9441 assert(isa<VPWidenRecipe>(CompareRecipe) && 9442 "Expected to replace a VPWidenSC"); 9443 assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 && 9444 "Expected no remaining users"); 9445 CompareRecipe->eraseFromParent(); 9446 } 9447 Chain = R; 9448 } 9449 } 9450 9451 // If tail is folded by masking, introduce selects between the phi 9452 // and the live-out instruction of each reduction, at the beginning of the 9453 // dedicated latch block. 
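  // Rough sketch of the select introduced below (illustrative names): with a
  // header mask %header.mask, the latch gets
  //   %rdx.select = select <VF x i1> %header.mask,
  //                        <VF x i32> %rdx.next, <VF x i32> %rdx.phi
  // so lanes masked off by tail folding keep the previous partial result.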
9454 if (CM.foldTailByMasking()) { 9455 Builder.setInsertPoint(LatchVPBB, LatchVPBB->begin()); 9456 for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) { 9457 VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R); 9458 if (!PhiR || PhiR->isInLoop()) 9459 continue; 9460 VPValue *Cond = 9461 RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan); 9462 VPValue *Red = PhiR->getBackedgeValue(); 9463 assert(cast<VPRecipeBase>(Red->getDef())->getParent() != LatchVPBB && 9464 "reduction recipe must be defined before latch"); 9465 Builder.createNaryOp(Instruction::Select, {Cond, Red, PhiR}); 9466 } 9467 } 9468 } 9469 9470 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 9471 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, 9472 VPSlotTracker &SlotTracker) const { 9473 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; 9474 IG->getInsertPos()->printAsOperand(O, false); 9475 O << ", "; 9476 getAddr()->printAsOperand(O, SlotTracker); 9477 VPValue *Mask = getMask(); 9478 if (Mask) { 9479 O << ", "; 9480 Mask->printAsOperand(O, SlotTracker); 9481 } 9482 9483 unsigned OpIdx = 0; 9484 for (unsigned i = 0; i < IG->getFactor(); ++i) { 9485 if (!IG->getMember(i)) 9486 continue; 9487 if (getNumStoreOperands() > 0) { 9488 O << "\n" << Indent << " store "; 9489 getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker); 9490 O << " to index " << i; 9491 } else { 9492 O << "\n" << Indent << " "; 9493 getVPValue(OpIdx)->printAsOperand(O, SlotTracker); 9494 O << " = load from index " << i; 9495 } 9496 ++OpIdx; 9497 } 9498 } 9499 #endif 9500 9501 void VPWidenCallRecipe::execute(VPTransformState &State) { 9502 State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this, 9503 *this, State); 9504 } 9505 9506 void VPWidenSelectRecipe::execute(VPTransformState &State) { 9507 auto &I = *cast<SelectInst>(getUnderlyingInstr()); 9508 State.ILV->setDebugLocFromInst(&I); 9509 9510 // The condition can be loop invariant but still defined inside the 9511 // loop. This means that we can't just use the original 'cond' value. 9512 // We have to take the 'vectorized' value and pick the first lane. 9513 // Instcombine will make this a no-op. 9514 auto *InvarCond = 9515 InvariantCond ? State.get(getOperand(0), VPIteration(0, 0)) : nullptr; 9516 9517 for (unsigned Part = 0; Part < State.UF; ++Part) { 9518 Value *Cond = InvarCond ? 
InvarCond : State.get(getOperand(0), Part); 9519 Value *Op0 = State.get(getOperand(1), Part); 9520 Value *Op1 = State.get(getOperand(2), Part); 9521 Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1); 9522 State.set(this, Sel, Part); 9523 State.ILV->addMetadata(Sel, &I); 9524 } 9525 } 9526 9527 void VPWidenRecipe::execute(VPTransformState &State) { 9528 auto &I = *cast<Instruction>(getUnderlyingValue()); 9529 auto &Builder = State.Builder; 9530 switch (I.getOpcode()) { 9531 case Instruction::Call: 9532 case Instruction::Br: 9533 case Instruction::PHI: 9534 case Instruction::GetElementPtr: 9535 case Instruction::Select: 9536 llvm_unreachable("This instruction is handled by a different recipe."); 9537 case Instruction::UDiv: 9538 case Instruction::SDiv: 9539 case Instruction::SRem: 9540 case Instruction::URem: 9541 case Instruction::Add: 9542 case Instruction::FAdd: 9543 case Instruction::Sub: 9544 case Instruction::FSub: 9545 case Instruction::FNeg: 9546 case Instruction::Mul: 9547 case Instruction::FMul: 9548 case Instruction::FDiv: 9549 case Instruction::FRem: 9550 case Instruction::Shl: 9551 case Instruction::LShr: 9552 case Instruction::AShr: 9553 case Instruction::And: 9554 case Instruction::Or: 9555 case Instruction::Xor: { 9556 // Just widen unops and binops. 9557 State.ILV->setDebugLocFromInst(&I); 9558 9559 for (unsigned Part = 0; Part < State.UF; ++Part) { 9560 SmallVector<Value *, 2> Ops; 9561 for (VPValue *VPOp : operands()) 9562 Ops.push_back(State.get(VPOp, Part)); 9563 9564 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); 9565 9566 if (auto *VecOp = dyn_cast<Instruction>(V)) { 9567 VecOp->copyIRFlags(&I); 9568 9569 // If the instruction is vectorized and was in a basic block that needed 9570 // predication, we can't propagate poison-generating flags (nuw/nsw, 9571 // exact, etc.). The control flow has been linearized and the 9572 // instruction is no longer guarded by the predicate, which could make 9573 // the flag properties to no longer hold. 9574 if (State.MayGeneratePoisonRecipes.contains(this)) 9575 VecOp->dropPoisonGeneratingFlags(); 9576 } 9577 9578 // Use this vector value for all users of the original instruction. 9579 State.set(this, V, Part); 9580 State.ILV->addMetadata(V, &I); 9581 } 9582 9583 break; 9584 } 9585 case Instruction::ICmp: 9586 case Instruction::FCmp: { 9587 // Widen compares. Generate vector compares. 9588 bool FCmp = (I.getOpcode() == Instruction::FCmp); 9589 auto *Cmp = cast<CmpInst>(&I); 9590 State.ILV->setDebugLocFromInst(Cmp); 9591 for (unsigned Part = 0; Part < State.UF; ++Part) { 9592 Value *A = State.get(getOperand(0), Part); 9593 Value *B = State.get(getOperand(1), Part); 9594 Value *C = nullptr; 9595 if (FCmp) { 9596 // Propagate fast math flags. 
9597 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 9598 Builder.setFastMathFlags(Cmp->getFastMathFlags()); 9599 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); 9600 } else { 9601 C = Builder.CreateICmp(Cmp->getPredicate(), A, B); 9602 } 9603 State.set(this, C, Part); 9604 State.ILV->addMetadata(C, &I); 9605 } 9606 9607 break; 9608 } 9609 9610 case Instruction::ZExt: 9611 case Instruction::SExt: 9612 case Instruction::FPToUI: 9613 case Instruction::FPToSI: 9614 case Instruction::FPExt: 9615 case Instruction::PtrToInt: 9616 case Instruction::IntToPtr: 9617 case Instruction::SIToFP: 9618 case Instruction::UIToFP: 9619 case Instruction::Trunc: 9620 case Instruction::FPTrunc: 9621 case Instruction::BitCast: { 9622 auto *CI = cast<CastInst>(&I); 9623 State.ILV->setDebugLocFromInst(CI); 9624 9625 /// Vectorize casts. 9626 Type *DestTy = (State.VF.isScalar()) 9627 ? CI->getType() 9628 : VectorType::get(CI->getType(), State.VF); 9629 9630 for (unsigned Part = 0; Part < State.UF; ++Part) { 9631 Value *A = State.get(getOperand(0), Part); 9632 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); 9633 State.set(this, Cast, Part); 9634 State.ILV->addMetadata(Cast, &I); 9635 } 9636 break; 9637 } 9638 default: 9639 // This instruction is not vectorized by simple widening. 9640 LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); 9641 llvm_unreachable("Unhandled instruction!"); 9642 } // end of switch. 9643 } 9644 9645 void VPWidenGEPRecipe::execute(VPTransformState &State) { 9646 auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr()); 9647 // Construct a vector GEP by widening the operands of the scalar GEP as 9648 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP 9649 // results in a vector of pointers when at least one operand of the GEP 9650 // is vector-typed. Thus, to keep the representation compact, we only use 9651 // vector-typed operands for loop-varying values. 9652 9653 if (State.VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { 9654 // If we are vectorizing, but the GEP has only loop-invariant operands, 9655 // the GEP we build (by only using vector-typed operands for 9656 // loop-varying values) would be a scalar pointer. Thus, to ensure we 9657 // produce a vector of pointers, we need to either arbitrarily pick an 9658 // operand to broadcast, or broadcast a clone of the original GEP. 9659 // Here, we broadcast a clone of the original. 9660 // 9661 // TODO: If at some point we decide to scalarize instructions having 9662 // loop-invariant operands, this special case will no longer be 9663 // required. We would add the scalarization decision to 9664 // collectLoopScalars() and teach getVectorValue() to broadcast 9665 // the lane-zero scalar value. 9666 auto *Clone = State.Builder.Insert(GEP->clone()); 9667 for (unsigned Part = 0; Part < State.UF; ++Part) { 9668 Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, Clone); 9669 State.set(this, EntryPart, Part); 9670 State.ILV->addMetadata(EntryPart, GEP); 9671 } 9672 } else { 9673 // If the GEP has at least one loop-varying operand, we are sure to 9674 // produce a vector of pointers. But if we are only unrolling, we want 9675 // to produce a scalar GEP for each unroll part. Thus, the GEP we 9676 // produce with the code below will be scalar (if VF == 1) or vector 9677 // (otherwise). Note that for the unroll-only case, we still maintain 9678 // values in the vector mapping with initVector, as we do for other 9679 // instructions. 
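    // Illustrative sketch (hypothetical names, VF=4): a scalar
    //   %gep = getelementptr inbounds float, float* %base, i64 %iv
    // with a loop-invariant base and a loop-varying index is widened below to
    //   %vgep = getelementptr inbounds float, float* %base, <4 x i64> %vec.iv
    // i.e. a single scalar base combined with a vector index yields a vector
    // of pointers.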
9680 for (unsigned Part = 0; Part < State.UF; ++Part) { 9681 // The pointer operand of the new GEP. If it's loop-invariant, we 9682 // won't broadcast it. 9683 auto *Ptr = IsPtrLoopInvariant 9684 ? State.get(getOperand(0), VPIteration(0, 0)) 9685 : State.get(getOperand(0), Part); 9686 9687 // Collect all the indices for the new GEP. If any index is 9688 // loop-invariant, we won't broadcast it. 9689 SmallVector<Value *, 4> Indices; 9690 for (unsigned I = 1, E = getNumOperands(); I < E; I++) { 9691 VPValue *Operand = getOperand(I); 9692 if (IsIndexLoopInvariant[I - 1]) 9693 Indices.push_back(State.get(Operand, VPIteration(0, 0))); 9694 else 9695 Indices.push_back(State.get(Operand, Part)); 9696 } 9697 9698 // If the GEP instruction is vectorized and was in a basic block that 9699 // needed predication, we can't propagate the poison-generating 'inbounds' 9700 // flag. The control flow has been linearized and the GEP is no longer 9701 // guarded by the predicate, which could make the 'inbounds' properties to 9702 // no longer hold. 9703 bool IsInBounds = 9704 GEP->isInBounds() && State.MayGeneratePoisonRecipes.count(this) == 0; 9705 9706 // Create the new GEP. Note that this GEP may be a scalar if VF == 1, 9707 // but it should be a vector, otherwise. 9708 auto *NewGEP = IsInBounds 9709 ? State.Builder.CreateInBoundsGEP( 9710 GEP->getSourceElementType(), Ptr, Indices) 9711 : State.Builder.CreateGEP(GEP->getSourceElementType(), 9712 Ptr, Indices); 9713 assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) && 9714 "NewGEP is not a pointer vector"); 9715 State.set(this, NewGEP, Part); 9716 State.ILV->addMetadata(NewGEP, GEP); 9717 } 9718 } 9719 } 9720 9721 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { 9722 assert(!State.Instance && "Int or FP induction being replicated."); 9723 auto *CanonicalIV = State.get(getParent()->getPlan()->getCanonicalIV(), 0); 9724 State.ILV->widenIntOrFpInduction(IV, this, State, CanonicalIV); 9725 } 9726 9727 void VPScalarIVStepsRecipe::execute(VPTransformState &State) { 9728 assert(!State.Instance && "VPScalarIVStepsRecipe being replicated."); 9729 9730 // Fast-math-flags propagate from the original induction instruction. 9731 IRBuilder<>::FastMathFlagGuard FMFG(State.Builder); 9732 if (IndDesc.getInductionBinOp() && 9733 isa<FPMathOperator>(IndDesc.getInductionBinOp())) 9734 State.Builder.setFastMathFlags( 9735 IndDesc.getInductionBinOp()->getFastMathFlags()); 9736 9737 Value *Step = State.get(getStepValue(), VPIteration(0, 0)); 9738 auto CreateScalarIV = [&](Value *&Step) -> Value * { 9739 Value *ScalarIV = State.get(getCanonicalIV(), VPIteration(0, 0)); 9740 auto *CanonicalIV = State.get(getParent()->getPlan()->getCanonicalIV(), 0); 9741 if (!isCanonical() || CanonicalIV->getType() != Ty) { 9742 ScalarIV = 9743 Ty->isIntegerTy() 9744 ? 
State.Builder.CreateSExtOrTrunc(ScalarIV, Ty) 9745 : State.Builder.CreateCast(Instruction::SIToFP, ScalarIV, Ty); 9746 ScalarIV = emitTransformedIndex(State.Builder, ScalarIV, 9747 getStartValue()->getLiveInIRValue(), Step, 9748 IndDesc); 9749 ScalarIV->setName("offset.idx"); 9750 } 9751 if (TruncToTy) { 9752 assert(Step->getType()->isIntegerTy() && 9753 "Truncation requires an integer step"); 9754 ScalarIV = State.Builder.CreateTrunc(ScalarIV, TruncToTy); 9755 Step = State.Builder.CreateTrunc(Step, TruncToTy); 9756 } 9757 return ScalarIV; 9758 }; 9759 9760 Value *ScalarIV = CreateScalarIV(Step); 9761 if (State.VF.isVector()) { 9762 buildScalarSteps(ScalarIV, Step, IndDesc, this, State); 9763 return; 9764 } 9765 9766 for (unsigned Part = 0; Part < State.UF; ++Part) { 9767 assert(!State.VF.isScalable() && "scalable vectors not yet supported."); 9768 Value *EntryPart; 9769 if (Step->getType()->isFloatingPointTy()) { 9770 Value *StartIdx = 9771 getRuntimeVFAsFloat(State.Builder, Step->getType(), State.VF * Part); 9772 // Floating-point operations inherit FMF via the builder's flags. 9773 Value *MulOp = State.Builder.CreateFMul(StartIdx, Step); 9774 EntryPart = State.Builder.CreateBinOp(IndDesc.getInductionOpcode(), 9775 ScalarIV, MulOp); 9776 } else { 9777 Value *StartIdx = 9778 getRuntimeVF(State.Builder, Step->getType(), State.VF * Part); 9779 EntryPart = State.Builder.CreateAdd( 9780 ScalarIV, State.Builder.CreateMul(StartIdx, Step), "induction"); 9781 } 9782 State.set(this, EntryPart, Part); 9783 } 9784 } 9785 9786 void VPWidenPHIRecipe::execute(VPTransformState &State) { 9787 State.ILV->widenPHIInstruction(cast<PHINode>(getUnderlyingValue()), this, 9788 State); 9789 } 9790 9791 void VPBlendRecipe::execute(VPTransformState &State) { 9792 State.ILV->setDebugLocFromInst(Phi, &State.Builder); 9793 // We know that all PHIs in non-header blocks are converted into 9794 // selects, so we don't have to worry about the insertion order and we 9795 // can just use the builder. 9796 // At this point we generate the predication tree. There may be 9797 // duplications since this is a simple recursive scan, but future 9798 // optimizations will clean it up. 9799 9800 unsigned NumIncoming = getNumIncomingValues(); 9801 9802 // Generate a sequence of selects of the form: 9803 // SELECT(Mask3, In3, 9804 // SELECT(Mask2, In2, 9805 // SELECT(Mask1, In1, 9806 // In0))) 9807 // Note that Mask0 is never used: lanes for which no path reaches this phi and 9808 // are essentially undef are taken from In0. 9809 InnerLoopVectorizer::VectorParts Entry(State.UF); 9810 for (unsigned In = 0; In < NumIncoming; ++In) { 9811 for (unsigned Part = 0; Part < State.UF; ++Part) { 9812 // We might have single edge PHIs (blocks) - use an identity 9813 // 'select' for the first PHI operand. 9814 Value *In0 = State.get(getIncomingValue(In), Part); 9815 if (In == 0) 9816 Entry[Part] = In0; // Initialize with the first incoming value. 9817 else { 9818 // Select between the current value and the previous incoming edge 9819 // based on the incoming mask. 
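        // Illustrative sketch (hypothetical names, VF=4, two incoming edges):
        //   %predphi = select <4 x i1> %mask1, <4 x i32> %in1, <4 x i32> %in0
        // Each further incoming edge wraps the previous result in another select.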
9820 Value *Cond = State.get(getMask(In), Part); 9821 Entry[Part] = 9822 State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi"); 9823 } 9824 } 9825 } 9826 for (unsigned Part = 0; Part < State.UF; ++Part) 9827 State.set(this, Entry[Part], Part); 9828 } 9829 9830 void VPInterleaveRecipe::execute(VPTransformState &State) { 9831 assert(!State.Instance && "Interleave group being replicated."); 9832 State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(), 9833 getStoredValues(), getMask()); 9834 } 9835 9836 void VPReductionRecipe::execute(VPTransformState &State) { 9837 assert(!State.Instance && "Reduction being replicated."); 9838 Value *PrevInChain = State.get(getChainOp(), 0); 9839 RecurKind Kind = RdxDesc->getRecurrenceKind(); 9840 bool IsOrdered = State.ILV->useOrderedReductions(*RdxDesc); 9841 // Propagate the fast-math flags carried by the underlying instruction. 9842 IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder); 9843 State.Builder.setFastMathFlags(RdxDesc->getFastMathFlags()); 9844 for (unsigned Part = 0; Part < State.UF; ++Part) { 9845 Value *NewVecOp = State.get(getVecOp(), Part); 9846 if (VPValue *Cond = getCondOp()) { 9847 Value *NewCond = State.get(Cond, Part); 9848 VectorType *VecTy = cast<VectorType>(NewVecOp->getType()); 9849 Value *Iden = RdxDesc->getRecurrenceIdentity( 9850 Kind, VecTy->getElementType(), RdxDesc->getFastMathFlags()); 9851 Value *IdenVec = 9852 State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden); 9853 Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec); 9854 NewVecOp = Select; 9855 } 9856 Value *NewRed; 9857 Value *NextInChain; 9858 if (IsOrdered) { 9859 if (State.VF.isVector()) 9860 NewRed = createOrderedReduction(State.Builder, *RdxDesc, NewVecOp, 9861 PrevInChain); 9862 else 9863 NewRed = State.Builder.CreateBinOp( 9864 (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), PrevInChain, 9865 NewVecOp); 9866 PrevInChain = NewRed; 9867 } else { 9868 PrevInChain = State.get(getChainOp(), Part); 9869 NewRed = createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp); 9870 } 9871 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9872 NextInChain = 9873 createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(), 9874 NewRed, PrevInChain); 9875 } else if (IsOrdered) 9876 NextInChain = NewRed; 9877 else 9878 NextInChain = State.Builder.CreateBinOp( 9879 (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), NewRed, 9880 PrevInChain); 9881 State.set(this, NextInChain, Part); 9882 } 9883 } 9884 9885 void VPReplicateRecipe::execute(VPTransformState &State) { 9886 if (State.Instance) { // Generate a single instance. 9887 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector"); 9888 State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *State.Instance, 9889 IsPredicated, State); 9890 // Insert scalar instance packing it into a vector. 9891 if (AlsoPack && State.VF.isVector()) { 9892 // If we're constructing lane 0, initialize to start from poison. 
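      // Illustrative sketch (hypothetical names, VF=4): the packed value is
      // built lane by lane, starting from poison for lane 0:
      //   %p0 = insertelement <4 x i32> poison, i32 %s0, i32 0
      //   %p1 = insertelement <4 x i32> %p0,    i32 %s1, i32 1
      //   ...
      // with each insertelement produced by packScalarIntoVectorValue.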
      if (State.Instance->Lane.isFirstLane()) {
        assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
        Value *Poison = PoisonValue::get(
            VectorType::get(getUnderlyingValue()->getType(), State.VF));
        State.set(this, Poison, State.Instance->Part);
      }
      State.ILV->packScalarIntoVectorValue(this, *State.Instance, State);
    }
    return;
  }

  // Generate scalar instances for all VF lanes of all UF parts, unless the
  // instruction is uniform, in which case generate only the first lane for
  // each of the UF parts.
  unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();
  assert((!State.VF.isScalable() || IsUniform) &&
         "Can't scalarize a scalable vector");
  for (unsigned Part = 0; Part < State.UF; ++Part)
    for (unsigned Lane = 0; Lane < EndLane; ++Lane)
      State.ILV->scalarizeInstruction(getUnderlyingInstr(), this,
                                      VPIteration(Part, Lane), IsPredicated,
                                      State);
}

void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Branch on Mask works only on single instance.");

  unsigned Part = State.Instance->Part;
  unsigned Lane = State.Instance->Lane.getKnownLane();

  Value *ConditionBit = nullptr;
  VPValue *BlockInMask = getMask();
  if (BlockInMask) {
    ConditionBit = State.get(BlockInMask, Part);
    if (ConditionBit->getType()->isVectorTy())
      ConditionBit = State.Builder.CreateExtractElement(
          ConditionBit, State.Builder.getInt32(Lane));
  } else // Block in mask is all-one.
    ConditionBit = State.Builder.getTrue();

  // Replace the temporary unreachable terminator with a new conditional
  // branch, whose two destinations will be set later when they are created.
  auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
  assert(isa<UnreachableInst>(CurrentTerminator) &&
         "Expected to replace unreachable terminator with conditional branch.");
  auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
  CondBr->setSuccessor(0, nullptr);
  ReplaceInstWithInst(CurrentTerminator, CondBr);
}

void VPPredInstPHIRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Predicated instruction PHI works per instance.");
  Instruction *ScalarPredInst =
      cast<Instruction>(State.get(getOperand(0), *State.Instance));
  BasicBlock *PredicatedBB = ScalarPredInst->getParent();
  BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
  assert(PredicatingBB && "Predicated block has no single predecessor.");
  assert(isa<VPReplicateRecipe>(getOperand(0)) &&
         "operand must be VPReplicateRecipe");

  // By current pack/unpack logic we need to generate only a single phi node:
  // if a vector value for the predicated instruction exists at this point it
  // means the instruction has vector users only, and a phi for the vector
  // value is needed. In this case the recipe of the predicated instruction is
  // marked to also do that packing, thereby "hoisting" the insert-element
  // sequence. Otherwise, a phi node for the scalar value is needed.
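  // Illustrative sketch of the vector case (hypothetical names, VF=4):
  //   pred.block:
  //     %ins  = insertelement <4 x i32> %vec, i32 %scalar, i32 %lane
  //   continue.block:
  //     %vphi = phi <4 x i32> [ %vec, %predicating.block ], [ %ins, %pred.block ]
  // The scalar case instead creates a phi of the scalar type with a poison
  // incoming value on the predicating edge.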
9959 unsigned Part = State.Instance->Part; 9960 if (State.hasVectorValue(getOperand(0), Part)) { 9961 Value *VectorValue = State.get(getOperand(0), Part); 9962 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); 9963 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); 9964 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. 9965 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. 9966 if (State.hasVectorValue(this, Part)) 9967 State.reset(this, VPhi, Part); 9968 else 9969 State.set(this, VPhi, Part); 9970 // NOTE: Currently we need to update the value of the operand, so the next 9971 // predicated iteration inserts its generated value in the correct vector. 9972 State.reset(getOperand(0), VPhi, Part); 9973 } else { 9974 Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType(); 9975 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); 9976 Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()), 9977 PredicatingBB); 9978 Phi->addIncoming(ScalarPredInst, PredicatedBB); 9979 if (State.hasScalarValue(this, *State.Instance)) 9980 State.reset(this, Phi, *State.Instance); 9981 else 9982 State.set(this, Phi, *State.Instance); 9983 // NOTE: Currently we need to update the value of the operand, so the next 9984 // predicated iteration inserts its generated value in the correct vector. 9985 State.reset(getOperand(0), Phi, *State.Instance); 9986 } 9987 } 9988 9989 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { 9990 VPValue *StoredValue = isStore() ? getStoredValue() : nullptr; 9991 9992 // Attempt to issue a wide load. 9993 LoadInst *LI = dyn_cast<LoadInst>(&Ingredient); 9994 StoreInst *SI = dyn_cast<StoreInst>(&Ingredient); 9995 9996 assert((LI || SI) && "Invalid Load/Store instruction"); 9997 assert((!SI || StoredValue) && "No stored value provided for widened store"); 9998 assert((!LI || !StoredValue) && "Stored value provided for widened load"); 9999 10000 Type *ScalarDataTy = getLoadStoreType(&Ingredient); 10001 10002 auto *DataTy = VectorType::get(ScalarDataTy, State.VF); 10003 const Align Alignment = getLoadStoreAlignment(&Ingredient); 10004 bool CreateGatherScatter = !Consecutive; 10005 10006 auto &Builder = State.Builder; 10007 InnerLoopVectorizer::VectorParts BlockInMaskParts(State.UF); 10008 bool isMaskRequired = getMask(); 10009 if (isMaskRequired) 10010 for (unsigned Part = 0; Part < State.UF; ++Part) 10011 BlockInMaskParts[Part] = State.get(getMask(), Part); 10012 10013 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { 10014 // Calculate the pointer for the specific unroll-part. 10015 GetElementPtrInst *PartPtr = nullptr; 10016 10017 bool InBounds = false; 10018 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) 10019 InBounds = gep->isInBounds(); 10020 if (Reverse) { 10021 // If the address is consecutive but reversed, then the 10022 // wide store needs to start at the last vector element. 
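      // Worked example for the computation below (fixed-width, VF=4, Part=1):
      //   RunTimeVF = 4, NumElt = -4, LastLane = -3, so the part pointer is
      //   &Ptr[-7] and the wide access covers Ptr[-7..-4]; after the vector
      //   reverse, the lanes see Ptr[-4], Ptr[-5], Ptr[-6], Ptr[-7] in
      //   original iteration order.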
10023 // RunTimeVF = VScale * VF.getKnownMinValue() 10024 // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue() 10025 Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), State.VF); 10026 // NumElt = -Part * RunTimeVF 10027 Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF); 10028 // LastLane = 1 - RunTimeVF 10029 Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF); 10030 PartPtr = 10031 cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt)); 10032 PartPtr->setIsInBounds(InBounds); 10033 PartPtr = cast<GetElementPtrInst>( 10034 Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane)); 10035 PartPtr->setIsInBounds(InBounds); 10036 if (isMaskRequired) // Reverse of a null all-one mask is a null mask. 10037 BlockInMaskParts[Part] = 10038 Builder.CreateVectorReverse(BlockInMaskParts[Part], "reverse"); 10039 } else { 10040 Value *Increment = 10041 createStepForVF(Builder, Builder.getInt32Ty(), State.VF, Part); 10042 PartPtr = cast<GetElementPtrInst>( 10043 Builder.CreateGEP(ScalarDataTy, Ptr, Increment)); 10044 PartPtr->setIsInBounds(InBounds); 10045 } 10046 10047 unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); 10048 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); 10049 }; 10050 10051 // Handle Stores: 10052 if (SI) { 10053 State.ILV->setDebugLocFromInst(SI); 10054 10055 for (unsigned Part = 0; Part < State.UF; ++Part) { 10056 Instruction *NewSI = nullptr; 10057 Value *StoredVal = State.get(StoredValue, Part); 10058 if (CreateGatherScatter) { 10059 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 10060 Value *VectorGep = State.get(getAddr(), Part); 10061 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, 10062 MaskPart); 10063 } else { 10064 if (Reverse) { 10065 // If we store to reverse consecutive memory locations, then we need 10066 // to reverse the order of elements in the stored value. 10067 StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse"); 10068 // We don't want to update the value in the map as it might be used in 10069 // another expression. So don't call resetVectorValue(StoredVal). 10070 } 10071 auto *VecPtr = 10072 CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0))); 10073 if (isMaskRequired) 10074 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, 10075 BlockInMaskParts[Part]); 10076 else 10077 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); 10078 } 10079 State.ILV->addMetadata(NewSI, SI); 10080 } 10081 return; 10082 } 10083 10084 // Handle loads. 10085 assert(LI && "Must have a load instruction"); 10086 State.ILV->setDebugLocFromInst(LI); 10087 for (unsigned Part = 0; Part < State.UF; ++Part) { 10088 Value *NewLI; 10089 if (CreateGatherScatter) { 10090 Value *MaskPart = isMaskRequired ? 
BlockInMaskParts[Part] : nullptr; 10091 Value *VectorGep = State.get(getAddr(), Part); 10092 NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart, 10093 nullptr, "wide.masked.gather"); 10094 State.ILV->addMetadata(NewLI, LI); 10095 } else { 10096 auto *VecPtr = 10097 CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0))); 10098 if (isMaskRequired) 10099 NewLI = Builder.CreateMaskedLoad( 10100 DataTy, VecPtr, Alignment, BlockInMaskParts[Part], 10101 PoisonValue::get(DataTy), "wide.masked.load"); 10102 else 10103 NewLI = 10104 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); 10105 10106 // Add metadata to the load, but setVectorValue to the reverse shuffle. 10107 State.ILV->addMetadata(NewLI, LI); 10108 if (Reverse) 10109 NewLI = Builder.CreateVectorReverse(NewLI, "reverse"); 10110 } 10111 10112 State.set(this, NewLI, Part); 10113 } 10114 } 10115 10116 // Determine how to lower the scalar epilogue, which depends on 1) optimising 10117 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing 10118 // predication, and 4) a TTI hook that analyses whether the loop is suitable 10119 // for predication. 10120 static ScalarEpilogueLowering getScalarEpilogueLowering( 10121 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, 10122 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, 10123 AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, 10124 LoopVectorizationLegality &LVL) { 10125 // 1) OptSize takes precedence over all other options, i.e. if this is set, 10126 // don't look at hints or options, and don't request a scalar epilogue. 10127 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from 10128 // LoopAccessInfo (due to code dependency and not being able to reliably get 10129 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection 10130 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without 10131 // versioning when the vectorization is forced, unlike hasOptSize. So revert 10132 // back to the old way and vectorize with versioning when forced. See D81345.) 10133 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, 10134 PGSOQueryType::IRPass) && 10135 Hints.getForce() != LoopVectorizeHints::FK_Enabled)) 10136 return CM_ScalarEpilogueNotAllowedOptSize; 10137 10138 // 2) If set, obey the directives 10139 if (PreferPredicateOverEpilogue.getNumOccurrences()) { 10140 switch (PreferPredicateOverEpilogue) { 10141 case PreferPredicateTy::ScalarEpilogue: 10142 return CM_ScalarEpilogueAllowed; 10143 case PreferPredicateTy::PredicateElseScalarEpilogue: 10144 return CM_ScalarEpilogueNotNeededUsePredicate; 10145 case PreferPredicateTy::PredicateOrDontVectorize: 10146 return CM_ScalarEpilogueNotAllowedUsePredicate; 10147 }; 10148 } 10149 10150 // 3) If set, obey the hints 10151 switch (Hints.getPredicate()) { 10152 case LoopVectorizeHints::FK_Enabled: 10153 return CM_ScalarEpilogueNotNeededUsePredicate; 10154 case LoopVectorizeHints::FK_Disabled: 10155 return CM_ScalarEpilogueAllowed; 10156 }; 10157 10158 // 4) if the TTI hook indicates this is profitable, request predication. 
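  // For example (illustrative): on a target whose TTI reports that masked
  // loads/stores are cheap enough for this loop, the hook below typically
  // returns true and the tail is folded by masking instead of emitting a
  // scalar remainder loop.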
10159 if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, 10160 LVL.getLAI())) 10161 return CM_ScalarEpilogueNotNeededUsePredicate; 10162 10163 return CM_ScalarEpilogueAllowed; 10164 } 10165 10166 Value *VPTransformState::get(VPValue *Def, unsigned Part) { 10167 // If Values have been set for this Def return the one relevant for \p Part. 10168 if (hasVectorValue(Def, Part)) 10169 return Data.PerPartOutput[Def][Part]; 10170 10171 if (!hasScalarValue(Def, {Part, 0})) { 10172 Value *IRV = Def->getLiveInIRValue(); 10173 Value *B = ILV->getBroadcastInstrs(IRV); 10174 set(Def, B, Part); 10175 return B; 10176 } 10177 10178 Value *ScalarValue = get(Def, {Part, 0}); 10179 // If we aren't vectorizing, we can just copy the scalar map values over 10180 // to the vector map. 10181 if (VF.isScalar()) { 10182 set(Def, ScalarValue, Part); 10183 return ScalarValue; 10184 } 10185 10186 auto *RepR = dyn_cast<VPReplicateRecipe>(Def); 10187 bool IsUniform = RepR && RepR->isUniform(); 10188 10189 unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1; 10190 // Check if there is a scalar value for the selected lane. 10191 if (!hasScalarValue(Def, {Part, LastLane})) { 10192 // At the moment, VPWidenIntOrFpInductionRecipes can also be uniform. 10193 assert((isa<VPWidenIntOrFpInductionRecipe>(Def->getDef()) || 10194 isa<VPScalarIVStepsRecipe>(Def->getDef())) && 10195 "unexpected recipe found to be invariant"); 10196 IsUniform = true; 10197 LastLane = 0; 10198 } 10199 10200 auto *LastInst = cast<Instruction>(get(Def, {Part, LastLane})); 10201 // Set the insert point after the last scalarized instruction or after the 10202 // last PHI, if LastInst is a PHI. This ensures the insertelement sequence 10203 // will directly follow the scalar definitions. 10204 auto OldIP = Builder.saveIP(); 10205 auto NewIP = 10206 isa<PHINode>(LastInst) 10207 ? BasicBlock::iterator(LastInst->getParent()->getFirstNonPHI()) 10208 : std::next(BasicBlock::iterator(LastInst)); 10209 Builder.SetInsertPoint(&*NewIP); 10210 10211 // However, if we are vectorizing, we need to construct the vector values. 10212 // If the value is known to be uniform after vectorization, we can just 10213 // broadcast the scalar value corresponding to lane zero for each unroll 10214 // iteration. Otherwise, we construct the vector values using 10215 // insertelement instructions. Since the resulting vectors are stored in 10216 // State, we will only generate the insertelements once. 10217 Value *VectorValue = nullptr; 10218 if (IsUniform) { 10219 VectorValue = ILV->getBroadcastInstrs(ScalarValue); 10220 set(Def, VectorValue, Part); 10221 } else { 10222 // Initialize packing with insertelements to start from undef. 10223 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 10224 Value *Undef = PoisonValue::get(VectorType::get(LastInst->getType(), VF)); 10225 set(Def, Undef, Part); 10226 for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane) 10227 ILV->packScalarIntoVectorValue(Def, {Part, Lane}, *this); 10228 VectorValue = get(Def, Part); 10229 } 10230 Builder.restoreIP(OldIP); 10231 return VectorValue; 10232 } 10233 10234 // Process the loop in the VPlan-native vectorization path. This path builds 10235 // VPlan upfront in the vectorization pipeline, which allows to apply 10236 // VPlan-to-VPlan transformations from the very beginning without modifying the 10237 // input LLVM IR. 
10238 static bool processLoopInVPlanNativePath( 10239 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, 10240 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, 10241 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, 10242 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, 10243 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints, 10244 LoopVectorizationRequirements &Requirements) { 10245 10246 if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) { 10247 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n"); 10248 return false; 10249 } 10250 assert(EnableVPlanNativePath && "VPlan-native path is disabled."); 10251 Function *F = L->getHeader()->getParent(); 10252 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); 10253 10254 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 10255 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL); 10256 10257 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, 10258 &Hints, IAI); 10259 // Use the planner for outer loop vectorization. 10260 // TODO: CM is not used at this point inside the planner. Turn CM into an 10261 // optional argument if we don't need it in the future. 10262 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints, 10263 Requirements, ORE); 10264 10265 // Get user vectorization factor. 10266 ElementCount UserVF = Hints.getWidth(); 10267 10268 CM.collectElementTypesForWidening(); 10269 10270 // Plan how to best vectorize, return the best VF and its cost. 10271 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF); 10272 10273 // If we are stress testing VPlan builds, do not attempt to generate vector 10274 // code. Masked vector code generation support will follow soon. 10275 // Also, do not attempt to vectorize if no vector code will be produced. 10276 if (VPlanBuildStressTest || EnableVPlanPredication || 10277 VectorizationFactor::Disabled() == VF) 10278 return false; 10279 10280 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); 10281 10282 { 10283 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, 10284 F->getParent()->getDataLayout()); 10285 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL, 10286 &CM, BFI, PSI, Checks); 10287 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" 10288 << L->getHeader()->getParent()->getName() << "\"\n"); 10289 LVP.executePlan(VF.Width, 1, BestPlan, LB, DT); 10290 } 10291 10292 // Mark the loop as already vectorized to avoid vectorizing again. 10293 Hints.setAlreadyVectorized(); 10294 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 10295 return true; 10296 } 10297 10298 // Emit a remark if there are stores to floats that required a floating point 10299 // extension. If the vectorized loop was generated with floating point there 10300 // will be a performance penalty from the conversion overhead and the change in 10301 // the vector width. 10302 static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) { 10303 SmallVector<Instruction *, 4> Worklist; 10304 for (BasicBlock *BB : L->getBlocks()) { 10305 for (Instruction &Inst : *BB) { 10306 if (auto *S = dyn_cast<StoreInst>(&Inst)) { 10307 if (S->getValueOperand()->getType()->isFloatTy()) 10308 Worklist.push_back(S); 10309 } 10310 } 10311 } 10312 10313 // Traverse the floating point stores upwards searching, for floating point 10314 // conversions. 
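  // Illustrative example of the pattern being diagnosed (hypothetical IR):
  //   %ext = fpext half %h to float
  //   store float %ext, float* %p
  // Widening this requires both <N x half> and <N x float> values, i.e. an
  // up-cast that changes the effective vector width.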
10315 SmallPtrSet<const Instruction *, 4> Visited; 10316 SmallPtrSet<const Instruction *, 4> EmittedRemark; 10317 while (!Worklist.empty()) { 10318 auto *I = Worklist.pop_back_val(); 10319 if (!L->contains(I)) 10320 continue; 10321 if (!Visited.insert(I).second) 10322 continue; 10323 10324 // Emit a remark if the floating point store required a floating 10325 // point conversion. 10326 // TODO: More work could be done to identify the root cause such as a 10327 // constant or a function return type and point the user to it. 10328 if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second) 10329 ORE->emit([&]() { 10330 return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision", 10331 I->getDebugLoc(), L->getHeader()) 10332 << "floating point conversion changes vector width. " 10333 << "Mixed floating point precision requires an up/down " 10334 << "cast that will negatively impact performance."; 10335 }); 10336 10337 for (Use &Op : I->operands()) 10338 if (auto *OpI = dyn_cast<Instruction>(Op)) 10339 Worklist.push_back(OpI); 10340 } 10341 } 10342 10343 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts) 10344 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced || 10345 !EnableLoopInterleaving), 10346 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced || 10347 !EnableLoopVectorization) {} 10348 10349 bool LoopVectorizePass::processLoop(Loop *L) { 10350 assert((EnableVPlanNativePath || L->isInnermost()) && 10351 "VPlan-native path is not enabled. Only process inner loops."); 10352 10353 #ifndef NDEBUG 10354 const std::string DebugLocStr = getDebugLocString(L); 10355 #endif /* NDEBUG */ 10356 10357 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '" 10358 << L->getHeader()->getParent()->getName() << "' from " 10359 << DebugLocStr << "\n"); 10360 10361 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI); 10362 10363 LLVM_DEBUG( 10364 dbgs() << "LV: Loop hints:" 10365 << " force=" 10366 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled 10367 ? "disabled" 10368 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled 10369 ? "enabled" 10370 : "?")) 10371 << " width=" << Hints.getWidth() 10372 << " interleave=" << Hints.getInterleave() << "\n"); 10373 10374 // Function containing loop 10375 Function *F = L->getHeader()->getParent(); 10376 10377 // Looking at the diagnostic output is the only way to determine if a loop 10378 // was vectorized (other than looking at the IR or machine code), so it 10379 // is important to generate an optimization remark for each loop. Most of 10380 // these messages are generated as OptimizationRemarkAnalysis. Remarks 10381 // generated as OptimizationRemark and OptimizationRemarkMissed are 10382 // less verbose reporting vectorized loops and unvectorized loops that may 10383 // benefit from vectorization, respectively. 10384 10385 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) { 10386 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); 10387 return false; 10388 } 10389 10390 PredicatedScalarEvolution PSE(*SE, *L); 10391 10392 // Check if it is legal to vectorize the loop. 
10393 LoopVectorizationRequirements Requirements; 10394 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE, 10395 &Requirements, &Hints, DB, AC, BFI, PSI); 10396 if (!LVL.canVectorize(EnableVPlanNativePath)) { 10397 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); 10398 Hints.emitRemarkWithHints(); 10399 return false; 10400 } 10401 10402 // Check the function attributes and profiles to find out if this function 10403 // should be optimized for size. 10404 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 10405 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL); 10406 10407 // Entrance to the VPlan-native vectorization path. Outer loops are processed 10408 // here. They may require CFG and instruction level transformations before 10409 // even evaluating whether vectorization is profitable. Since we cannot modify 10410 // the incoming IR, we need to build VPlan upfront in the vectorization 10411 // pipeline. 10412 if (!L->isInnermost()) 10413 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, 10414 ORE, BFI, PSI, Hints, Requirements); 10415 10416 assert(L->isInnermost() && "Inner loop expected."); 10417 10418 // Check the loop for a trip count threshold: vectorize loops with a tiny trip 10419 // count by optimizing for size, to minimize overheads. 10420 auto ExpectedTC = getSmallBestKnownTC(*SE, L); 10421 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) { 10422 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " 10423 << "This loop is worth vectorizing only if no scalar " 10424 << "iteration overheads are incurred."); 10425 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) 10426 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); 10427 else { 10428 LLVM_DEBUG(dbgs() << "\n"); 10429 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; 10430 } 10431 } 10432 10433 // Check the function attributes to see if implicit floats are allowed. 10434 // FIXME: This check doesn't seem possibly correct -- what if the loop is 10435 // an integer loop and the vector instructions selected are purely integer 10436 // vector instructions? 10437 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { 10438 reportVectorizationFailure( 10439 "Can't vectorize when the NoImplicitFloat attribute is used", 10440 "loop not vectorized due to NoImplicitFloat attribute", 10441 "NoImplicitFloat", ORE, L); 10442 Hints.emitRemarkWithHints(); 10443 return false; 10444 } 10445 10446 // Check if the target supports potentially unsafe FP vectorization. 10447 // FIXME: Add a check for the type of safety issue (denormal, signaling) 10448 // for the target we're vectorizing for, to make sure none of the 10449 // additional fp-math flags can help. 10450 if (Hints.isPotentiallyUnsafe() && 10451 TTI->isFPVectorizationPotentiallyUnsafe()) { 10452 reportVectorizationFailure( 10453 "Potentially unsafe FP op prevents vectorization", 10454 "loop not vectorized due to unsafe FP support.", 10455 "UnsafeFP", ORE, L); 10456 Hints.emitRemarkWithHints(); 10457 return false; 10458 } 10459 10460 bool AllowOrderedReductions; 10461 // If the flag is set, use that instead and override the TTI behaviour. 
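  // Illustrative example: a strict FP reduction such as
  //   for (i) Sum += A[i];   // no 'reassoc' fast-math flag
  // can only be vectorized as an in-order (ordered) reduction; whether that
  // is allowed is controlled by the flag / TTI query below and then checked
  // by canVectorizeFPMath.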
10462 if (ForceOrderedReductions.getNumOccurrences() > 0) 10463 AllowOrderedReductions = ForceOrderedReductions; 10464 else 10465 AllowOrderedReductions = TTI->enableOrderedReductions(); 10466 if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) { 10467 ORE->emit([&]() { 10468 auto *ExactFPMathInst = Requirements.getExactFPInst(); 10469 return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps", 10470 ExactFPMathInst->getDebugLoc(), 10471 ExactFPMathInst->getParent()) 10472 << "loop not vectorized: cannot prove it is safe to reorder " 10473 "floating-point operations"; 10474 }); 10475 LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to " 10476 "reorder floating-point operations\n"); 10477 Hints.emitRemarkWithHints(); 10478 return false; 10479 } 10480 10481 bool UseInterleaved = TTI->enableInterleavedAccessVectorization(); 10482 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI()); 10483 10484 // If an override option has been passed in for interleaved accesses, use it. 10485 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0) 10486 UseInterleaved = EnableInterleavedMemAccesses; 10487 10488 // Analyze interleaved memory accesses. 10489 if (UseInterleaved) { 10490 IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI)); 10491 } 10492 10493 // Use the cost model. 10494 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, 10495 F, &Hints, IAI); 10496 CM.collectValuesToIgnore(); 10497 CM.collectElementTypesForWidening(); 10498 10499 // Use the planner for vectorization. 10500 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints, 10501 Requirements, ORE); 10502 10503 // Get user vectorization factor and interleave count. 10504 ElementCount UserVF = Hints.getWidth(); 10505 unsigned UserIC = Hints.getInterleave(); 10506 10507 // Plan how to best vectorize, return the best VF and its cost. 10508 Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC); 10509 10510 VectorizationFactor VF = VectorizationFactor::Disabled(); 10511 unsigned IC = 1; 10512 10513 if (MaybeVF) { 10514 VF = *MaybeVF; 10515 // Select the interleave count. 10516 IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue()); 10517 } 10518 10519 // Identify the diagnostic messages that should be produced. 10520 std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg; 10521 bool VectorizeLoop = true, InterleaveLoop = true; 10522 if (VF.Width.isScalar()) { 10523 LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n"); 10524 VecDiagMsg = std::make_pair( 10525 "VectorizationNotBeneficial", 10526 "the cost-model indicates that vectorization is not beneficial"); 10527 VectorizeLoop = false; 10528 } 10529 10530 if (!MaybeVF && UserIC > 1) { 10531 // Tell the user interleaving was avoided up-front, despite being explicitly 10532 // requested. 10533 LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and " 10534 "interleaving should be avoided up front\n"); 10535 IntDiagMsg = std::make_pair( 10536 "InterleavingAvoided", 10537 "Ignoring UserIC, because interleaving was avoided up front"); 10538 InterleaveLoop = false; 10539 } else if (IC == 1 && UserIC <= 1) { 10540 // Tell the user interleaving is not beneficial. 
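    // These analysis remarks reach the user through the optimization-remark
    // machinery, e.g. surfaced with clang's -Rpass-analysis=loop-vectorize
    // (mentioned here only as an illustration).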
    LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
    IntDiagMsg = std::make_pair(
        "InterleavingNotBeneficial",
        "the cost-model indicates that interleaving is not beneficial");
    InterleaveLoop = false;
    if (UserIC == 1) {
      IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
      IntDiagMsg.second +=
          " and is explicitly disabled or interleave count is set to 1";
    }
  } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it is explicitly disabled.
    LLVM_DEBUG(
        dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
    IntDiagMsg = std::make_pair(
        "InterleavingBeneficialButDisabled",
        "the cost-model indicates that interleaving is beneficial "
        "but is explicitly disabled or interleave count is set to 1");
    InterleaveLoop = false;
  }

  // Override IC if user provided an interleave count.
  IC = UserIC > 0 ? UserIC : IC;

  // Emit diagnostic messages, if any.
  const char *VAPassName = Hints.vectorizeAnalysisPassName();
  if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
    ORE->emit([&]() {
      return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
    ORE->emit([&]() {
      return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
    return false;
  } else if (!VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
  } else if (VectorizeLoop && !InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
  } else if (VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
  }

  bool DisableRuntimeUnroll = false;
  MDNode *OrigLoopID = L->getLoopID();
  {
    // Optimistically generate runtime checks. Drop them if they turn out not
    // to be profitable. Limit the scope of Checks, so the cleanup happens
    // immediately after vector code generation is done.
    GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
                             F->getParent()->getDataLayout());
    if (!VF.Width.isScalar() || IC > 1)
      Checks.Create(L, *LVL.getLAI(), PSE.getPredicate());

    using namespace ore;
    if (!VectorizeLoop) {
      assert(IC > 1 && "interleave count should not be 1 or 0");
      // If we decided that it is not legal to vectorize the loop, then
      // interleave it.
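      // Interleave-only codegen: InnerLoopUnroller is an InnerLoopVectorizer
      // fixed to VF=1, so executing the plan below emits IC scalar copies of
      // the loop body per iteration rather than wide instructions.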
10617 InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, 10618 &CM, BFI, PSI, Checks); 10619 10620 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); 10621 LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT); 10622 10623 ORE->emit([&]() { 10624 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(), 10625 L->getHeader()) 10626 << "interleaved loop (interleaved count: " 10627 << NV("InterleaveCount", IC) << ")"; 10628 }); 10629 } else { 10630 // If we decided that it is *legal* to vectorize the loop, then do it. 10631 10632 // Consider vectorizing the epilogue too if it's profitable. 10633 VectorizationFactor EpilogueVF = 10634 CM.selectEpilogueVectorizationFactor(VF.Width, LVP); 10635 if (EpilogueVF.Width.isVector()) { 10636 10637 // The first pass vectorizes the main loop and creates a scalar epilogue 10638 // to be vectorized by executing the plan (potentially with a different 10639 // factor) again shortly afterwards. 10640 EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1); 10641 EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, 10642 EPI, &LVL, &CM, BFI, PSI, Checks); 10643 10644 VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF); 10645 LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV, 10646 DT); 10647 ++LoopsVectorized; 10648 10649 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */); 10650 formLCSSARecursively(*L, *DT, LI, SE); 10651 10652 // Second pass vectorizes the epilogue and adjusts the control flow 10653 // edges from the first pass. 10654 EPI.MainLoopVF = EPI.EpilogueVF; 10655 EPI.MainLoopUF = EPI.EpilogueUF; 10656 EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC, 10657 ORE, EPI, &LVL, &CM, BFI, PSI, 10658 Checks); 10659 10660 VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF); 10661 10662 // Ensure that the start values for any VPReductionPHIRecipes are 10663 // updated before vectorising the epilogue loop. 10664 VPBasicBlock *Header = BestEpiPlan.getEntry()->getEntryBasicBlock(); 10665 for (VPRecipeBase &R : Header->phis()) { 10666 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) { 10667 if (auto *Resume = MainILV.getReductionResumeValue( 10668 ReductionPhi->getRecurrenceDescriptor())) { 10669 VPValue *StartVal = new VPValue(Resume); 10670 BestEpiPlan.addExternalDef(StartVal); 10671 ReductionPhi->setOperand(0, StartVal); 10672 } 10673 } 10674 } 10675 10676 LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV, 10677 DT); 10678 ++LoopsEpilogueVectorized; 10679 10680 if (!MainILV.areSafetyChecksAdded()) 10681 DisableRuntimeUnroll = true; 10682 } else { 10683 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC, 10684 &LVL, &CM, BFI, PSI, Checks); 10685 10686 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); 10687 LVP.executePlan(VF.Width, IC, BestPlan, LB, DT); 10688 ++LoopsVectorized; 10689 10690 // Add metadata to disable runtime unrolling a scalar loop when there 10691 // are no runtime checks about strides and memory. A scalar loop that is 10692 // rarely used is not worth unrolling. 10693 if (!LB.areSafetyChecksAdded()) 10694 DisableRuntimeUnroll = true; 10695 } 10696 // Report the vectorization decision. 
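      // For example (illustrative), with -Rpass=loop-vectorize the remark
      // below reads:
      //   "vectorized loop (vectorization width: 4, interleaved count: 2)".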
10697 ORE->emit([&]() { 10698 return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(), 10699 L->getHeader()) 10700 << "vectorized loop (vectorization width: " 10701 << NV("VectorizationFactor", VF.Width) 10702 << ", interleaved count: " << NV("InterleaveCount", IC) << ")"; 10703 }); 10704 } 10705 10706 if (ORE->allowExtraAnalysis(LV_NAME)) 10707 checkMixedPrecision(L, ORE); 10708 } 10709 10710 Optional<MDNode *> RemainderLoopID = 10711 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 10712 LLVMLoopVectorizeFollowupEpilogue}); 10713 if (RemainderLoopID.hasValue()) { 10714 L->setLoopID(RemainderLoopID.getValue()); 10715 } else { 10716 if (DisableRuntimeUnroll) 10717 AddRuntimeUnrollDisableMetaData(L); 10718 10719 // Mark the loop as already vectorized to avoid vectorizing again. 10720 Hints.setAlreadyVectorized(); 10721 } 10722 10723 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 10724 return true; 10725 } 10726 10727 LoopVectorizeResult LoopVectorizePass::runImpl( 10728 Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_, 10729 DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_, 10730 DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_, 10731 std::function<const LoopAccessInfo &(Loop &)> &GetLAA_, 10732 OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) { 10733 SE = &SE_; 10734 LI = &LI_; 10735 TTI = &TTI_; 10736 DT = &DT_; 10737 BFI = &BFI_; 10738 TLI = TLI_; 10739 AA = &AA_; 10740 AC = &AC_; 10741 GetLAA = &GetLAA_; 10742 DB = &DB_; 10743 ORE = &ORE_; 10744 PSI = PSI_; 10745 10746 // Don't attempt if 10747 // 1. the target claims to have no vector registers, and 10748 // 2. interleaving won't help ILP. 10749 // 10750 // The second condition is necessary because, even if the target has no 10751 // vector registers, loop vectorization may still enable scalar 10752 // interleaving. 10753 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) && 10754 TTI->getMaxInterleaveFactor(1) < 2) 10755 return LoopVectorizeResult(false, false); 10756 10757 bool Changed = false, CFGChanged = false; 10758 10759 // The vectorizer requires loops to be in simplified form. 10760 // Since simplification may add new inner loops, it has to run before the 10761 // legality and profitability checks. This means running the loop vectorizer 10762 // will simplify all loops, regardless of whether anything end up being 10763 // vectorized. 10764 for (auto &L : *LI) 10765 Changed |= CFGChanged |= 10766 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */); 10767 10768 // Build up a worklist of inner-loops to vectorize. This is necessary as 10769 // the act of vectorizing or partially unrolling a loop creates new loops 10770 // and can invalidate iterators across the loops. 10771 SmallVector<Loop *, 8> Worklist; 10772 10773 for (Loop *L : *LI) 10774 collectSupportedLoops(*L, LI, ORE, Worklist); 10775 10776 LoopsAnalyzed += Worklist.size(); 10777 10778 // Now walk the identified inner loops. 10779 while (!Worklist.empty()) { 10780 Loop *L = Worklist.pop_back_val(); 10781 10782 // For the inner loops we actually process, form LCSSA to simplify the 10783 // transform. 10784 Changed |= formLCSSARecursively(*L, *DT, LI, SE); 10785 10786 Changed |= CFGChanged |= processLoop(L); 10787 } 10788 10789 // Process each loop nest in the function. 
  return LoopVectorizeResult(Changed, CFGChanged);
}

PreservedAnalyses LoopVectorizePass::run(Function &F,
                                         FunctionAnalysisManager &AM) {
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &LI = AM.getResult<LoopAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &AA = AM.getResult<AAManager>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);

  auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
      [&](Loop &L) -> const LoopAccessInfo & {
    LoopStandardAnalysisResults AR = {AA,  AC,  DT,      LI,      SE,
                                      TLI, TTI, nullptr, nullptr, nullptr};
    return LAM.getResult<LoopAccessAnalysis>(L, AR);
  };
  auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
  ProfileSummaryInfo *PSI =
      MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  LoopVectorizeResult Result =
      runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
  if (!Result.MadeAnyChange)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;

  // We currently do not preserve loop info / dominator analyses with outer
  // loop vectorization. Until this is addressed, mark these analyses as
  // preserved only for the non-VPlan-native path.
  // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
  if (!EnableVPlanNativePath) {
    PA.preserve<LoopAnalysis>();
    PA.preserve<DominatorTreeAnalysis>();
  }

  if (Result.MadeCFGChange) {
    // Making CFG changes likely means a loop got vectorized. Indicate that
    // extra simplification passes should be run.
    // TODO: MadeCFGChange is not a perfect proxy. Extra passes should only
    // be run if runtime checks have been added.
    AM.getResult<ShouldRunExtraVectorPasses>(F);
    PA.preserve<ShouldRunExtraVectorPasses>();
  } else {
    PA.preserveSet<CFGAnalyses>();
  }
  return PA;
}

void LoopVectorizePass::printPipeline(
    raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
  static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
      OS, MapClassName2PassName);

  OS << "<";
  OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
  OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
  OS << ">";
}
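// Example of the textual pipeline form produced by printPipeline above with
// default options (illustrative):
//   loop-vectorize<no-interleave-forced-only;no-vectorize-forced-only>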