//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
//  Data for SIMD
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "LoopVectorizationPlanner.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanHCFGBuilder.h"
#include "VPlanPredicator.h"
#include "VPlanTransforms.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/InstructionCost.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <functional>
#include <iterator>
#include <limits>
#include <memory>
#include <string>
#include <tuple>
#include <utility>

using namespace llvm;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

#ifndef NDEBUG
const char VerboseDebug[] = DEBUG_TYPE "-verbose";
#endif

/// @{
/// Metadata attribute names
const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
const char LLVMLoopVectorizeFollowupVectorized[] =
    "llvm.loop.vectorize.followup_vectorized";
const char LLVMLoopVectorizeFollowupEpilogue[] =
    "llvm.loop.vectorize.followup_epilogue";
/// @}

STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");

static cl::opt<bool> EnableEpilogueVectorization(
    "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
    cl::desc("Enable vectorization of epilogue loops."));

static cl::opt<unsigned> EpilogueVectorizationForceVF(
    "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
    cl::desc("When epilogue vectorization is enabled, and a value greater than "
             "1 is specified, forces the given VF for all applicable epilogue "
             "loops."));

static cl::opt<unsigned> EpilogueVectorizationMinVF(
    "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
    cl::desc("Only loops with vectorization factor equal to or larger than "
             "the specified value are considered for epilogue vectorization."));

/// Loops with a known constant trip count below this number are vectorized only
/// if no scalar iteration overheads are incurred.
static cl::opt<unsigned> TinyTripCountVectorThreshold(
    "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
    cl::desc("Loops with a constant trip count that is smaller than this "
             "value are vectorized only if no scalar iteration overheads "
             "are incurred."));

static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold(
    "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
    cl::desc("The maximum allowed number of runtime memory checks with a "
             "vectorize(enable) pragma."));

// Option prefer-predicate-over-epilogue indicates that an epilogue is
// undesired, that predication is preferred, and this lists all options. I.e.,
// the vectorizer will try to fold the tail-loop (epilogue) into the vector
// body and predicate the instructions accordingly. If tail-folding fails, there are
// different fallback strategies depending on these values:
namespace PreferPredicateTy {
enum Option {
  ScalarEpilogue = 0,
  PredicateElseScalarEpilogue,
  PredicateOrDontVectorize
};
} // namespace PreferPredicateTy

static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
    "prefer-predicate-over-epilogue",
    cl::init(PreferPredicateTy::ScalarEpilogue),
    cl::Hidden,
    cl::desc("Tail-folding and predication preferences over creating a scalar "
             "epilogue loop."),
    cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
                          "scalar-epilogue",
                          "Don't tail-predicate loops, create scalar epilogue"),
               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
                          "predicate-else-scalar-epilogue",
                          "prefer tail-folding, create scalar epilogue if tail "
                          "folding fails."),
               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
                          "predicate-dont-vectorize",
                          "prefers tail-folding, don't attempt vectorization if "
                          "tail-folding fails.")));

static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in the loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));

static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
             "below this number"));

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

static cl::opt<bool> ForceTargetSupportsScalableVectors(
    "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
    cl::desc(
        "Pretend that scalable vectors are supported, even if the target does "
        "not support them. This flag should only be used for testing."));

static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// Interleave small loops with scalar reductions.
static cl::opt<bool> InterleaveSmallLoopScalarReduction(
    "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
    cl::desc("Enable interleaving for loops with small iteration counts that "
             "contain scalar reductions to expose ILP."));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

static cl::opt<bool>
    PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
                           cl::Hidden,
                           cl::desc("Prefer in-loop vector reductions, "
                                    "overriding the target's preference."));

static cl::opt<bool> ForceOrderedReductions(
    "force-ordered-reductions", cl::init(false), cl::Hidden,
    cl::desc("Enable the vectorization of loops with in-order (strict) "
             "FP reductions"));

static cl::opt<bool> PreferPredicatedReductionSelect(
    "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
    cl::desc(
        "Prefer predicating a reduction operation over an after loop select."));

cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));

// FIXME: Remove this switch once we have divergence analysis. Currently we
// assume divergent non-backedge branches when this switch is true.
cl::opt<bool> EnableVPlanPredication(
    "enable-vplan-predication", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path predicator with "
             "support for outer loop vectorization."));

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

cl::opt<bool> PrintVPlansInDotFormat(
    "vplan-print-in-dot-format", cl::init(false), cl::Hidden,
    cl::desc("Use dot format instead of plain text when dumping VPlans"));

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type.
static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
  // Determine if an array of N elements of type Ty is "bitcast compatible"
  // with a <N x Ty> vector.
  // This is only true if there is no padding between the array elements.
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}

/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

/// Returns "best known" trip count for the specified loop \p L as defined by
/// the following procedure:
///   1) Returns exact trip count if it is known.
///   2) Returns expected trip count according to profile data if any.
///   3) Returns upper bound estimate if it is known.
///   4) Returns None if all of the above failed.
static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
  // Check if exact trip count is known.
  if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
    return ExpectedTC;

  // Check if there is an expected trip count available from profile data.
  if (LoopVectorizeWithBlockFrequency)
    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
      return EstimatedTC;

  // Check if upper bound estimate is known.
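  // (Note: this is a worst-case bound, e.g. derived from the range or type of
  // the induction variable, so it may significantly overestimate the number of
  // iterations actually executed; it is used only as a last resort here.)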
  if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
    return ExpectedTC;

  return None;
}

// Forward declare GeneratedRTChecks.
class GeneratedRTChecks;

namespace llvm {

AnalysisKey ShouldRunExtraVectorPasses::Key;

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found to a given vectorization factor.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                      ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
        PSI(PSI), RTChecks(RTChecks) {
    // Query this against the original loop and save it here because the
    // profile of the original loop header may change as the transformation
    // happens.
    OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
        OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
  }

  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop that will contain vectorized instructions later
  /// on, while the old loop will be used as the scalar remainder. Control flow
  /// is generated around the vectorized (and scalar epilogue) loops consisting
  /// of various checks and bypasses. Return the pre-header block of the new
  /// loop and the start value for the canonical induction, if it is != 0. The
  /// latter is the case when vectorizing the epilogue loop. In the case of
  /// epilogue vectorization, this function is overridden to handle the more
  /// complex control flow around the loops.
  virtual std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton();

  /// Widen a single call instruction within the innermost loop.
  void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
                            VPTransformState &State);

  /// Fix the vectorized code, taking care of header phis, live-outs, and more.
  void fixVectorizedLoop(VPTransformState &State);

  // Return true if any runtime check is added.
  bool areSafetyChecksAdded() { return AddedSafetyChecks; }

  /// A type for vectorized values in the new loop. Each value from the
  /// original loop, when vectorized, is represented by UF vector values in the
  /// new unrolled loop, where UF is the unroll factor.
  using VectorParts = SmallVector<Value *, 2>;

  /// Vectorize a single first-order recurrence or pointer induction PHINode in
  /// a block. This method handles the induction variable canonicalization. It
  /// supports both VF = 1 for unrolled loops and arbitrary length vectors.
  void widenPHIInstruction(Instruction *PN, VPWidenPHIRecipe *PhiR,
                           VPTransformState &State);

  /// A helper function to scalarize a single Instruction in the innermost
  /// loop. Generates a sequence of scalar instances for each lane between \p
  /// MinLane and \p MaxLane, times each part between \p MinPart and \p
  /// MaxPart, inclusive. Uses the VPValue operands from \p RepRecipe instead
  /// of \p Instr's operands.
  void scalarizeInstruction(Instruction *Instr, VPReplicateRecipe *RepRecipe,
                            const VPIteration &Instance, bool IfPredicateInstr,
                            VPTransformState &State);

  /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
  /// is provided, the integer induction variable will first be truncated to
  /// the corresponding type. \p CanonicalIV is the scalar value generated for
  /// the canonical induction variable.
  void widenIntOrFpInduction(PHINode *IV, VPWidenIntOrFpInductionRecipe *Def,
                             VPTransformState &State, Value *CanonicalIV);

  /// Construct the vector value of a scalarized value \p V one lane at a time.
  void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance,
                                 VPTransformState &State);

  /// Try to vectorize interleaved access group \p Group with the base address
  /// given in \p Addr, optionally masking the vector operations if \p
  /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
  /// values in the vectorized loop.
  void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
                                ArrayRef<VPValue *> VPDefs,
                                VPTransformState &State, VPValue *Addr,
                                ArrayRef<VPValue *> StoredValues,
                                VPValue *BlockInMask = nullptr);

  /// Set the debug location in the builder using the debug location in \p V.
  /// If \p CustomBuilder is None, the class member's Builder is used.
  void setDebugLocFromInst(const Value *V,
                           Optional<IRBuilderBase *> CustomBuilder = None);

  /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
  void fixNonInductionPHIs(VPTransformState &State);

  /// Returns true if the reordering of FP operations is not allowed, but we
  /// are able to vectorize with strict in-order reductions for the given
  /// RdxDesc.
  bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc);

  /// Create a broadcast instruction. This method generates a broadcast
  /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, ...
  /// this is needed because each iteration in the loop corresponds to a SIMD
  /// element.
  virtual Value *getBroadcastInstrs(Value *V);

  /// Add metadata from one instruction to another.
  ///
  /// This includes both the original MDs from \p From and additional ones (\see
  /// addNewMetadata). Use this for *newly created* instructions in the vector
  /// loop.
  void addMetadata(Instruction *To, Instruction *From);

  /// Similar to the previous function but it adds the metadata to a
  /// vector of instructions.
  void addMetadata(ArrayRef<Value *> To, Instruction *From);

  // Returns the resume value (bc.merge.rdx) for a reduction as
  // generated by fixReduction.
  PHINode *getReductionResumeValue(const RecurrenceDescriptor &RdxDesc);

protected:
  friend class LoopVectorizationPlanner;

  /// A small list of PHINodes.
  using PhiVector = SmallVector<PHINode *, 4>;

  /// A type for scalarized values in the new loop. Each value from the
  /// original loop, when scalarized, is represented by UF x VF scalar values
  /// in the new unrolled loop, where UF is the unroll factor and VF is the
  /// vectorization factor.
  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;

  /// Set up the values of the IVs correctly when exiting the vector loop.
  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                    Value *CountRoundDown, Value *EndValue,
                    BasicBlock *MiddleBlock);

  /// Introduce a conditional branch (on true, condition to be set later) at
  /// the end of the header=latch connecting it to itself (across the backedge)
  /// and to the exit block of \p L.
  void createHeaderBranch(Loop *L);

  /// Handle all cross-iteration phis in the header.
  void fixCrossIterationPHIs(VPTransformState &State);

  /// Create the exit value of first order recurrences in the middle block and
  /// update their users.
  void fixFirstOrderRecurrence(VPFirstOrderRecurrencePHIRecipe *PhiR,
                               VPTransformState &State);

  /// Create code for the loop exit value of the reduction.
  void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State);

  /// Clear NSW/NUW flags from reduction instructions if necessary.
  void clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc,
                               VPTransformState &State);

  /// Fixup the LCSSA phi nodes in the unique exit block. This simply
  /// means we need to add the appropriate incoming value from the middle
  /// block as exiting edges from the scalar epilogue loop (if present) are
  /// already in place, and we exit the vector loop exclusively to the middle
  /// block.
  void fixLCSSAPHIs(VPTransformState &State);

  /// Iteratively sink the scalarized operands of a predicated instruction into
  /// the block that was created for it.
  void sinkScalarOperands(Instruction *PredInst);

  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
  /// represented as.
  void truncateToMinimalBitwidths(VPTransformState &State);

  /// Create a vector induction phi node based on an existing scalar one. \p
  /// EntryVal is the value from the original loop that maps to the vector phi
  /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
  /// truncate instruction, instead of widening the original IV, we widen a
  /// version of the IV truncated to \p EntryVal's type.
  void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
                                       Value *Step, Value *Start,
                                       Instruction *EntryVal, VPValue *Def,
                                       VPTransformState &State);

  /// Returns (and creates if needed) the original loop trip count.
  Value *getOrCreateTripCount(Loop *NewLoop);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(Loop *NewLoop);

  /// Returns a bitcasted value to the requested vector type.
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                const DataLayout &DL);

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct. Returns the block containing the checks or
  /// nullptr if no checks have been added.
  BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  /// Returns the block containing the checks or nullptr if no checks have been
  /// added.
  BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);

  /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
  /// vector loop preheader, middle block and scalar preheader. Also
  /// allocate a loop object for the new vector loop and return it.
  Loop *createVectorLoopSkeleton(StringRef Prefix);

  /// Create new phi nodes for the induction variables to resume iteration
  /// count in the scalar epilogue, from where the vectorized loop left off.
  /// In cases where the loop skeleton is more complicated (eg. epilogue
  /// vectorization) and the resume values can come from an additional bypass
  /// block, the \p AdditionalBypass pair provides information about the bypass
  /// block and the end value on the edge from bypass to this loop.
  void createInductionResumeValues(
      Loop *L,
      std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});

  /// Complete the loop skeleton by adding debug MDs, creating appropriate
  /// conditional branches in the middle block, preparing the builder and
  /// running the verifier. Take in the vector loop \p L as argument, and
  /// return the preheader of the completed vector loop.
  BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);

  /// Add additional metadata to \p To that was not present on \p Orig.
  ///
  /// Currently this is used to add the noalias annotations based on the
  /// inserted memchecks. Use this for instructions that are *cloned* into the
  /// vector loop.
  void addNewMetadata(Instruction *To, const Instruction *Orig);

  /// Collect poison-generating recipes that may generate a poison value that
  /// is used after vectorization, even when their operands are not poison.
  /// Those recipes meet the following conditions:
  /// * Contribute to the address computation of a recipe generating a widen
  ///   memory load/store (VPWidenMemoryInstructionRecipe or
  ///   VPInterleaveRecipe).
  /// * Such a widen memory load/store has at least one underlying Instruction
  ///   that is in a basic block that needs predication and after vectorization
  ///   the generated instruction won't be predicated.
  void collectPoisonGeneratingRecipes(VPTransformState &State);

  /// Allow subclasses to override and print debug traces before/after vplan
  /// execution, when trace information is requested.
  virtual void printDebugTracesAtStart(){};
  virtual void printDebugTracesAtEnd(){};

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Alias Analysis.
  AAResults *AA;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// LoopVersioning. It's only set up (non-null) if memchecks were
  /// used.
  ///
  /// This is currently only used to add no-alias metadata based on the
  /// memchecks. The actual versioning is performed manually.
  std::unique_ptr<LoopVersioning> LVer;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  ElementCount VF;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use.
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;

  /// The unique ExitBlock of the scalar loop if one exists. Note that
  /// there can be multiple exiting edges reaching this block.
  BasicBlock *LoopExitBlock;

  /// The vector loop body.
  BasicBlock *LoopVectorBody;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF)).
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
  // fixed up at the end of vector code generation.
  SmallVector<PHINode *, 8> OrigPHIsToFix;

  /// BFI and PSI are used to check for profile guided size optimizations.
  BlockFrequencyInfo *BFI;
  ProfileSummaryInfo *PSI;

  // Whether this loop should be optimized for size based on profile guided
  // size optimizations.
  bool OptForSizeBasedOnProfile;

  /// Structure to hold information about generated runtime checks, responsible
  /// for cleaning the checks, if vectorization turns out unprofitable.
  GeneratedRTChecks &RTChecks;

  // Holds the resume values for reductions in the loops, used to set the
  // correct start value of reduction PHIs when vectorizing the epilogue.
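  // The map is keyed by the reduction's RecurrenceDescriptor and queried via
  // getReductionResumeValue().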
  SmallMapVector<const RecurrenceDescriptor *, PHINode *, 4>
      ReductionResumeValues;
};

class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                    ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            ElementCount::getFixed(1), UnrollFactor, LVL, CM,
                            BFI, PSI, Check) {}

private:
  Value *getBroadcastInstrs(Value *V) override;
};

/// Encapsulate information regarding vectorization of a loop and its epilogue.
/// This information is meant to be updated and used across two stages of
/// epilogue vectorization.
struct EpilogueLoopVectorizationInfo {
  ElementCount MainLoopVF = ElementCount::getFixed(0);
  unsigned MainLoopUF = 0;
  ElementCount EpilogueVF = ElementCount::getFixed(0);
  unsigned EpilogueUF = 0;
  BasicBlock *MainLoopIterationCountCheck = nullptr;
  BasicBlock *EpilogueIterationCountCheck = nullptr;
  BasicBlock *SCEVSafetyCheck = nullptr;
  BasicBlock *MemSafetyCheck = nullptr;
  Value *TripCount = nullptr;
  Value *VectorTripCount = nullptr;

  EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF,
                                ElementCount EVF, unsigned EUF)
      : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) {
    assert(EUF == 1 &&
           "A high UF for the epilogue loop is likely not beneficial.");
  }
};

/// An extension of the inner loop vectorizer that creates a skeleton for a
/// vectorized loop that has its epilogue (residual) also vectorized.
/// The idea is to run the vplan on a given loop twice, firstly to set up the
/// skeleton and vectorize the main loop, and secondly to complete the skeleton
/// from the first step and vectorize the epilogue. This is achieved by
/// deriving two concrete strategy classes from this base class and invoking
/// them in succession from the loop vectorizer planner.
class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
public:
  InnerLoopAndEpilogueVectorizer(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI,
                            Checks),
        EPI(EPI) {}

  // Override this function to handle the more complex control flow around the
  // three loops.
  std::pair<BasicBlock *, Value *>
  createVectorizedLoopSkeleton() final override {
    return createEpilogueVectorizedLoopSkeleton();
  }

  /// The interface for creating a vectorized skeleton using one of two
  /// different strategies, each corresponding to one execution of the vplan
  /// as described above.
  virtual std::pair<BasicBlock *, Value *>
  createEpilogueVectorizedLoopSkeleton() = 0;

  /// Holds and updates state information required to vectorize the main loop
  /// and its epilogue in two separate passes. This setup helps us avoid
  /// regenerating and recomputing runtime safety checks. It also helps us to
  /// shorten the iteration-count-check path length for the cases where the
  /// iteration count of the loop is so small that the main vector loop is
  /// completely skipped.
  EpilogueLoopVectorizationInfo &EPI;
};

/// A specialized derived class of inner loop vectorizer that performs
/// vectorization of *main* loops in the process of vectorizing loops and their
/// epilogues.
class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerMainLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Check)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Check) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *main loop* strategy (i.e. the first pass of vplan execution).
  std::pair<BasicBlock *, Value *>
  createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check once for the main loop (when \p
  /// ForEpilogue is false) and once for the epilogue loop (when \p
  /// ForEpilogue is true).
  BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass,
                                             bool ForEpilogue);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};

// A specialized derived class of inner loop vectorizer that performs
// vectorization of *epilogue* loops in the process of vectorizing loops and
// their epilogues.
class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerEpilogueLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Checks) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *epilogue loop* strategy (i.e. the second pass of vplan execution).
  std::pair<BasicBlock *, Value *>
  createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check after the main vector loop has
  /// finished to see if there are any iterations left to execute by either
  /// the vector epilogue or the scalar epilogue.
  BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L,
                                                      BasicBlock *Bypass,
                                                      BasicBlock *Insert);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};
} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its
/// operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return I;

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I;

  for (Use &Op : I->operands()) {
    if (Instruction *OpInst = dyn_cast<Instruction>(Op))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }

  return I;
}

void InnerLoopVectorizer::setDebugLocFromInst(
    const Value *V, Optional<IRBuilderBase *> CustomBuilder) {
  IRBuilderBase *B = (CustomBuilder == None) ? &Builder : *CustomBuilder;
  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(V)) {
    const DILocation *DIL = Inst->getDebugLoc();

    // When a FSDiscriminator is enabled, we don't need to add the multiply
    // factors to the discriminators.
    if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
        !isa<DbgInfoIntrinsic>(Inst) && !EnableFSDiscriminator) {
      // FIXME: For scalable vectors, assume vscale=1.
      auto NewDIL =
          DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
      if (NewDIL)
        B->SetCurrentDebugLocation(NewDIL.getValue());
      else
        LLVM_DEBUG(dbgs() << "Failed to create new discriminator: "
                          << DIL->getFilename() << " Line: " << DIL->getLine());
    } else
      B->SetCurrentDebugLocation(DIL);
  } else
    B->SetCurrentDebugLocation(DebugLoc());
}

/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
/// is passed, the message relates to that particular instruction.
#ifndef NDEBUG
static void debugVectorizationMessage(const StringRef Prefix,
                                      const StringRef DebugMsg,
                                      Instruction *I) {
  dbgs() << "LV: " << Prefix << DebugMsg;
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}
#endif

/// Create an analysis remark that explains why vectorization failed.
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
/// RemarkName is the identifier for the remark. If \p I is passed it is an
/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
/// the location of the remark. \return the remark object that can be
/// streamed to.
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
                                                   StringRef RemarkName,
                                                   Loop *TheLoop,
                                                   Instruction *I) {
  Value *CodeRegion = TheLoop->getHeader();
  DebugLoc DL = TheLoop->getStartLoc();

  if (I) {
    CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, revert back
    // to using the loop's.
    if (I->getDebugLoc())
      DL = I->getDebugLoc();
  }

  return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
}

namespace llvm {

/// Return a value for Step multiplied by VF.
Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
                       int64_t Step) {
  assert(Ty->isIntegerTy() && "Expected an integer step");
  Constant *StepVal = ConstantInt::get(Ty, Step * VF.getKnownMinValue());
  return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
}

/// Return the runtime value for VF.
Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) {
  Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue());
  return VF.isScalable() ? B.CreateVScale(EC) : EC;
}

static Value *getRuntimeVFAsFloat(IRBuilderBase &B, Type *FTy,
                                  ElementCount VF) {
  assert(FTy->isFloatingPointTy() && "Expected floating point type!");
  Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits());
  Value *RuntimeVF = getRuntimeVF(B, IntTy, VF);
  return B.CreateUIToFP(RuntimeVF, FTy);
}

void reportVectorizationFailure(const StringRef DebugMsg,
                                const StringRef OREMsg, const StringRef ORETag,
                                OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                                Instruction *I) {
  LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(
      createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
      << "loop not vectorized: " << OREMsg);
}

void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
                             OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                             Instruction *I) {
  LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(
      createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
      << Msg);
}

} // end namespace llvm

#ifndef NDEBUG
/// \return string containing a file name and a line # for the given loop.
static std::string getDebugLocString(const Loop *L) {
  std::string Result;
  if (L) {
    raw_string_ostream OS(Result);
    if (const DebugLoc LoopDbgLoc = L->getStartLoc())
      LoopDbgLoc.print(OS);
    else
      // Just print the module name.
      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
    OS.flush();
  }
  return Result;
}
#endif

void InnerLoopVectorizer::addNewMetadata(Instruction *To,
                                         const Instruction *Orig) {
  // If the loop was versioned with memchecks, add the corresponding no-alias
  // metadata.
  if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
    LVer->annotateInstWithNoAlias(To, Orig);
}

void InnerLoopVectorizer::collectPoisonGeneratingRecipes(
    VPTransformState &State) {

  // Collect recipes in the backward slice of `Root` that may generate a poison
  // value that is used after vectorization.
  SmallPtrSet<VPRecipeBase *, 16> Visited;
  auto collectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
    SmallVector<VPRecipeBase *, 16> Worklist;
    Worklist.push_back(Root);

    // Traverse the backward slice of Root through its use-def chain.
    while (!Worklist.empty()) {
      VPRecipeBase *CurRec = Worklist.back();
      Worklist.pop_back();

      if (!Visited.insert(CurRec).second)
        continue;

      // Prune search if we find another recipe generating a widen memory
      // instruction. Widen memory instructions involved in address computation
      // will lead to gather/scatter instructions, which don't need to be
      // handled.
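      // (For example, a widened load whose result feeds only the address of
      // another access turns that access into a gather/scatter.)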
      if (isa<VPWidenMemoryInstructionRecipe>(CurRec) ||
          isa<VPInterleaveRecipe>(CurRec) ||
          isa<VPScalarIVStepsRecipe>(CurRec) ||
          isa<VPCanonicalIVPHIRecipe>(CurRec))
        continue;

      // This recipe contributes to the address computation of a widen
      // load/store. Collect recipe if its underlying instruction has
      // poison-generating flags.
      Instruction *Instr = CurRec->getUnderlyingInstr();
      if (Instr && Instr->hasPoisonGeneratingFlags())
        State.MayGeneratePoisonRecipes.insert(CurRec);

      // Add new definitions to the worklist.
      for (VPValue *operand : CurRec->operands())
        if (VPDef *OpDef = operand->getDef())
          Worklist.push_back(cast<VPRecipeBase>(OpDef));
    }
  });

  // Traverse all the recipes in the VPlan and collect the poison-generating
  // recipes in the backward slice starting at the address of a
  // VPWidenMemoryInstructionRecipe or VPInterleaveRecipe.
  auto Iter = depth_first(
      VPBlockRecursiveTraversalWrapper<VPBlockBase *>(State.Plan->getEntry()));
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
    for (VPRecipeBase &Recipe : *VPBB) {
      if (auto *WidenRec = dyn_cast<VPWidenMemoryInstructionRecipe>(&Recipe)) {
        Instruction *UnderlyingInstr = WidenRec->getUnderlyingInstr();
        VPDef *AddrDef = WidenRec->getAddr()->getDef();
        if (AddrDef && WidenRec->isConsecutive() && UnderlyingInstr &&
            Legal->blockNeedsPredication(UnderlyingInstr->getParent()))
          collectPoisonGeneratingInstrsInBackwardSlice(
              cast<VPRecipeBase>(AddrDef));
      } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
        VPDef *AddrDef = InterleaveRec->getAddr()->getDef();
        if (AddrDef) {
          // Check if any member of the interleave group needs predication.
          const InterleaveGroup<Instruction> *InterGroup =
              InterleaveRec->getInterleaveGroup();
          bool NeedPredication = false;
          for (int I = 0, NumMembers = InterGroup->getNumMembers();
               I < NumMembers; ++I) {
            Instruction *Member = InterGroup->getMember(I);
            if (Member)
              NeedPredication |=
                  Legal->blockNeedsPredication(Member->getParent());
          }

          if (NeedPredication)
            collectPoisonGeneratingInstrsInBackwardSlice(
                cast<VPRecipeBase>(AddrDef));
        }
      }
    }
  }
}

void InnerLoopVectorizer::addMetadata(Instruction *To,
                                      Instruction *From) {
  propagateMetadata(To, From);
  addNewMetadata(To, From);
}

void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
                                      Instruction *From) {
  for (Value *V : To) {
    if (Instruction *I = dyn_cast<Instruction>(V))
      addMetadata(I, From);
  }
}

PHINode *InnerLoopVectorizer::getReductionResumeValue(
    const RecurrenceDescriptor &RdxDesc) {
  auto It = ReductionResumeValues.find(&RdxDesc);
  assert(It != ReductionResumeValues.end() &&
         "Expected to find a resume value for the reduction.");
  return It->second;
}

namespace llvm {

// Loop vectorization cost-model hints how the scalar epilogue loop should be
// lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorization with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate,

  // Directive indicating we must either tail fold or not vectorize.
  CM_ScalarEpilogueNotAllowedUsePredicate
};

/// ElementCountComparator creates a total ordering for ElementCount
/// for the purposes of using it in a set structure.
struct ElementCountComparator {
  bool operator()(const ElementCount &LHS, const ElementCount &RHS) const {
    return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) <
           std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue());
  }
};
using ElementCountSet = SmallSet<ElementCount, 16, ElementCountComparator>;

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because of
/// a number of reasons. In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
class LoopVectorizationCostModel {
public:
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {}

  /// \return An upper bound for the vectorization factors (both fixed and
  /// scalable). If the factors are 0, vectorization and interleaving should be
  /// avoided up front.
  FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
  bool runtimeChecksRequired();

  /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every VF in \p CandidateVFs. If UserVF is not ZERO
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
  VectorizationFactor
  selectVectorizationFactor(const ElementCountSet &CandidateVFs);

  VectorizationFactor
  selectEpilogueVectorizationFactor(const ElementCount MaxVF,
                                    const LoopVectorizationPlanner &LVP);

  /// Set up cost-based decisions for user vectorization factor.
  /// \return true if the UserVF is a feasible VF to be chosen.
  bool selectUserVectorizationFactor(ElementCount UserVF) {
    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);
    return expectedCost(UserVF).first.isValid();
  }

  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// \return The desired interleave count.
  /// If interleave count has been specified by metadata it will be returned.
  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
  /// are the selected vectorization factor and the cost of the selected VF.
  unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);

  /// A memory access instruction may be vectorized in more than one way, and
  /// the form it takes after vectorization depends on cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decisions map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with the widening decision in order to
  /// avoid redundant calculations.
  void setCostBasedWideningDecision(ElementCount VF);

  /// A struct that represents some properties of the register usage
  /// of a loop.
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
    /// Holds the maximum number of concurrent live intervals in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
  };

  /// \return Returns information about the register usages of the loop for the
  /// given vectorization factors.
  SmallVector<RegisterUsage, 8>
  calculateRegisterUsage(ArrayRef<ElementCount> VFs);

  /// Collect values we want to ignore in the cost model.
  void collectValuesToIgnore();

  /// Collect all element types in the loop for which widening is needed.
  void collectElementTypesForWidening();

  /// Split reductions into those that happen in the loop, and those that
  /// happen outside. In-loop reductions are collected into
  /// InLoopReductionChains.
  void collectInLoopReductions();

  /// Returns true if we should use strict in-order reductions for the given
  /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
  /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
  /// of FP operations.
  bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) {
    return !Hints->allowReordering() && RdxDesc.isOrdered();
  }

  /// \returns The smallest bitwidth each instruction can be represented with.
  /// The vector equivalents of these instructions should be truncated to this
  /// type.
  const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
    return MinBWs;
  }

  /// \returns True if it is more profitable to scalarize instruction \p I for
  /// vectorization factor \p VF.
1361 bool isProfitableToScalarize(Instruction *I, ElementCount VF) const { 1362 assert(VF.isVector() && 1363 "Profitable to scalarize relevant only for VF > 1."); 1364 1365 // Cost model is not run in the VPlan-native path - return conservative 1366 // result until this changes. 1367 if (EnableVPlanNativePath) 1368 return false; 1369 1370 auto Scalars = InstsToScalarize.find(VF); 1371 assert(Scalars != InstsToScalarize.end() && 1372 "VF not yet analyzed for scalarization profitability"); 1373 return Scalars->second.find(I) != Scalars->second.end(); 1374 } 1375 1376 /// Returns true if \p I is known to be uniform after vectorization. 1377 bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const { 1378 if (VF.isScalar()) 1379 return true; 1380 1381 // Cost model is not run in the VPlan-native path - return conservative 1382 // result until this changes. 1383 if (EnableVPlanNativePath) 1384 return false; 1385 1386 auto UniformsPerVF = Uniforms.find(VF); 1387 assert(UniformsPerVF != Uniforms.end() && 1388 "VF not yet analyzed for uniformity"); 1389 return UniformsPerVF->second.count(I); 1390 } 1391 1392 /// Returns true if \p I is known to be scalar after vectorization. 1393 bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const { 1394 if (VF.isScalar()) 1395 return true; 1396 1397 // Cost model is not run in the VPlan-native path - return conservative 1398 // result until this changes. 1399 if (EnableVPlanNativePath) 1400 return false; 1401 1402 auto ScalarsPerVF = Scalars.find(VF); 1403 assert(ScalarsPerVF != Scalars.end() && 1404 "Scalar values are not calculated for VF"); 1405 return ScalarsPerVF->second.count(I); 1406 } 1407 1408 /// \returns True if instruction \p I can be truncated to a smaller bitwidth 1409 /// for vectorization factor \p VF. 1410 bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const { 1411 return VF.isVector() && MinBWs.find(I) != MinBWs.end() && 1412 !isProfitableToScalarize(I, VF) && 1413 !isScalarAfterVectorization(I, VF); 1414 } 1415 1416 /// Decision that was taken during cost calculation for memory instruction. 1417 enum InstWidening { 1418 CM_Unknown, 1419 CM_Widen, // For consecutive accesses with stride +1. 1420 CM_Widen_Reverse, // For consecutive accesses with stride -1. 1421 CM_Interleave, 1422 CM_GatherScatter, 1423 CM_Scalarize 1424 }; 1425 1426 /// Save vectorization decision \p W and \p Cost taken by the cost model for 1427 /// instruction \p I and vector width \p VF. 1428 void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W, 1429 InstructionCost Cost) { 1430 assert(VF.isVector() && "Expected VF >=2"); 1431 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost); 1432 } 1433 1434 /// Save vectorization decision \p W and \p Cost taken by the cost model for 1435 /// interleaving group \p Grp and vector width \p VF. 1436 void setWideningDecision(const InterleaveGroup<Instruction> *Grp, 1437 ElementCount VF, InstWidening W, 1438 InstructionCost Cost) { 1439 assert(VF.isVector() && "Expected VF >=2"); 1440 /// Broadcast this decicion to all instructions inside the group. 1441 /// But the cost will be assigned to one instruction only. 
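// For example (illustrative), a factor-3 group {A[i], A[i+1], A[i+2]} whose
// insert position is A[i] is recorded as
//   (A[i],   VF) -> {W, Cost}
//   (A[i+1], VF) -> {W, 0}
//   (A[i+2], VF) -> {W, 0}
// so summing per-instruction costs over the group counts the cost only once.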
1442 for (unsigned i = 0; i < Grp->getFactor(); ++i) { 1443 if (auto *I = Grp->getMember(i)) { 1444 if (Grp->getInsertPos() == I) 1445 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost); 1446 else 1447 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0); 1448 } 1449 } 1450 } 1451 1452 /// Return the cost model decision for the given instruction \p I and vector 1453 /// width \p VF. Return CM_Unknown if this instruction did not pass 1454 /// through the cost modeling. 1455 InstWidening getWideningDecision(Instruction *I, ElementCount VF) const { 1456 assert(VF.isVector() && "Expected VF to be a vector VF"); 1457 // Cost model is not run in the VPlan-native path - return conservative 1458 // result until this changes. 1459 if (EnableVPlanNativePath) 1460 return CM_GatherScatter; 1461 1462 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF); 1463 auto Itr = WideningDecisions.find(InstOnVF); 1464 if (Itr == WideningDecisions.end()) 1465 return CM_Unknown; 1466 return Itr->second.first; 1467 } 1468 1469 /// Return the vectorization cost for the given instruction \p I and vector 1470 /// width \p VF. 1471 InstructionCost getWideningCost(Instruction *I, ElementCount VF) { 1472 assert(VF.isVector() && "Expected VF >=2"); 1473 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF); 1474 assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() && 1475 "The cost is not calculated"); 1476 return WideningDecisions[InstOnVF].second; 1477 } 1478 1479 /// Return True if instruction \p I is an optimizable truncate whose operand 1480 /// is an induction variable. Such a truncate will be removed by adding a new 1481 /// induction variable with the destination type. 1482 bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) { 1483 // If the instruction is not a truncate, return false. 1484 auto *Trunc = dyn_cast<TruncInst>(I); 1485 if (!Trunc) 1486 return false; 1487 1488 // Get the source and destination types of the truncate. 1489 Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF); 1490 Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF); 1491 1492 // If the truncate is free for the given types, return false. Replacing a 1493 // free truncate with an induction variable would add an induction variable 1494 // update instruction to each iteration of the loop. We exclude from this 1495 // check the primary induction variable since it will need an update 1496 // instruction regardless. 1497 Value *Op = Trunc->getOperand(0); 1498 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy)) 1499 return false; 1500 1501 // If the truncated value is not an induction variable, return false. 1502 return Legal->isInductionPhi(Op); 1503 } 1504 1505 /// Collects the instructions to scalarize for each predicated instruction in 1506 /// the loop. 1507 void collectInstsToScalarize(ElementCount VF); 1508 1509 /// Collect Uniform and Scalar values for the given \p VF. 1510 /// The sets depend on CM decision for Load/Store instructions 1511 /// that may be vectorized as interleave, gather-scatter or scalarized. 1512 void collectUniformsAndScalars(ElementCount VF) { 1513 // Do the analysis once. 
1514 if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end()) 1515 return; 1516 setCostBasedWideningDecision(VF); 1517 collectLoopUniforms(VF); 1518 collectLoopScalars(VF); 1519 } 1520 1521 /// Returns true if the target machine supports masked store operation 1522 /// for the given \p DataType and kind of access to \p Ptr. 1523 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const { 1524 return Legal->isConsecutivePtr(DataType, Ptr) && 1525 TTI.isLegalMaskedStore(DataType, Alignment); 1526 } 1527 1528 /// Returns true if the target machine supports masked load operation 1529 /// for the given \p DataType and kind of access to \p Ptr. 1530 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const { 1531 return Legal->isConsecutivePtr(DataType, Ptr) && 1532 TTI.isLegalMaskedLoad(DataType, Alignment); 1533 } 1534 1535 /// Returns true if the target machine can represent \p V as a masked gather 1536 /// or scatter operation. 1537 bool isLegalGatherOrScatter(Value *V, 1538 ElementCount VF = ElementCount::getFixed(1)) { 1539 bool LI = isa<LoadInst>(V); 1540 bool SI = isa<StoreInst>(V); 1541 if (!LI && !SI) 1542 return false; 1543 auto *Ty = getLoadStoreType(V); 1544 Align Align = getLoadStoreAlignment(V); 1545 if (VF.isVector()) 1546 Ty = VectorType::get(Ty, VF); 1547 return (LI && TTI.isLegalMaskedGather(Ty, Align)) || 1548 (SI && TTI.isLegalMaskedScatter(Ty, Align)); 1549 } 1550 1551 /// Returns true if the target machine supports all of the reduction 1552 /// variables found for the given VF. 1553 bool canVectorizeReductions(ElementCount VF) const { 1554 return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 1555 const RecurrenceDescriptor &RdxDesc = Reduction.second; 1556 return TTI.isLegalToVectorizeReduction(RdxDesc, VF); 1557 })); 1558 } 1559 1560 /// Returns true if \p I is an instruction that will be scalarized with 1561 /// predication when vectorizing \p I with vectorization factor \p VF. Such 1562 /// instructions include conditional stores and instructions that may divide 1563 /// by zero. 1564 bool isScalarWithPredication(Instruction *I, ElementCount VF) const; 1565 1566 // Returns true if \p I is an instruction that will be predicated either 1567 // through scalar predication or masked load/store or masked gather/scatter. 1568 // \p VF is the vectorization factor that will be used to vectorize \p I. 1569 // Superset of instructions that return true for isScalarWithPredication. 1570 bool isPredicatedInst(Instruction *I, ElementCount VF, 1571 bool IsKnownUniform = false) { 1572 // When we know the load is uniform and the original scalar loop was not 1573 // predicated we don't need to mark it as a predicated instruction. Any 1574 // vectorised blocks created when tail-folding are something artificial we 1575 // have introduced and we know there is always at least one active lane. 1576 // That's why we call Legal->blockNeedsPredication here because it doesn't 1577 // query tail-folding. 1578 if (IsKnownUniform && isa<LoadInst>(I) && 1579 !Legal->blockNeedsPredication(I->getParent())) 1580 return false; 1581 if (!blockNeedsPredicationForAnyReason(I->getParent())) 1582 return false; 1583 // Loads and stores that need some form of masked operation are predicated 1584 // instructions. 
1585 if (isa<LoadInst>(I) || isa<StoreInst>(I)) 1586 return Legal->isMaskRequired(I); 1587 return isScalarWithPredication(I, VF); 1588 } 1589 1590 /// Returns true if \p I is a memory instruction with consecutive memory 1591 /// access that can be widened. 1592 bool 1593 memoryInstructionCanBeWidened(Instruction *I, 1594 ElementCount VF = ElementCount::getFixed(1)); 1595 1596 /// Returns true if \p I is a memory instruction in an interleaved-group 1597 /// of memory accesses that can be vectorized with wide vector loads/stores 1598 /// and shuffles. 1599 bool 1600 interleavedAccessCanBeWidened(Instruction *I, 1601 ElementCount VF = ElementCount::getFixed(1)); 1602 1603 /// Check if \p Instr belongs to any interleaved access group. 1604 bool isAccessInterleaved(Instruction *Instr) { 1605 return InterleaveInfo.isInterleaved(Instr); 1606 } 1607 1608 /// Get the interleaved access group that \p Instr belongs to. 1609 const InterleaveGroup<Instruction> * 1610 getInterleavedAccessGroup(Instruction *Instr) { 1611 return InterleaveInfo.getInterleaveGroup(Instr); 1612 } 1613 1614 /// Returns true if we're required to use a scalar epilogue for at least 1615 /// the final iteration of the original loop. 1616 bool requiresScalarEpilogue(ElementCount VF) const { 1617 if (!isScalarEpilogueAllowed()) 1618 return false; 1619 // If we might exit from anywhere but the latch, must run the exiting 1620 // iteration in scalar form. 1621 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) 1622 return true; 1623 return VF.isVector() && InterleaveInfo.requiresScalarEpilogue(); 1624 } 1625 1626 /// Returns true if a scalar epilogue is not allowed due to optsize or a 1627 /// loop hint annotation. 1628 bool isScalarEpilogueAllowed() const { 1629 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed; 1630 } 1631 1632 /// Returns true if all loop blocks should be masked to fold tail loop. 1633 bool foldTailByMasking() const { return FoldTailByMasking; } 1634 1635 /// Returns true if the instructions in this block requires predication 1636 /// for any reason, e.g. because tail folding now requires a predicate 1637 /// or because the block in the original loop was predicated. 1638 bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const { 1639 return foldTailByMasking() || Legal->blockNeedsPredication(BB); 1640 } 1641 1642 /// A SmallMapVector to store the InLoop reduction op chains, mapping phi 1643 /// nodes to the chain of instructions representing the reductions. Uses a 1644 /// MapVector to ensure deterministic iteration order. 1645 using ReductionChainMap = 1646 SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>; 1647 1648 /// Return the chain of instructions representing an inloop reduction. 1649 const ReductionChainMap &getInLoopReductionChains() const { 1650 return InLoopReductionChains; 1651 } 1652 1653 /// Returns true if the Phi is part of an inloop reduction. 1654 bool isInLoopReduction(PHINode *Phi) const { 1655 return InLoopReductionChains.count(Phi); 1656 } 1657 1658 /// Estimate cost of an intrinsic call instruction CI if it were vectorized 1659 /// with factor VF. Return the cost of the instruction, including 1660 /// scalarization overhead if it's needed. 1661 InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const; 1662 1663 /// Estimate cost of a call instruction CI if it were vectorized with factor 1664 /// VF. Return the cost of the instruction, including scalarization overhead 1665 /// if it's needed. 
The flag NeedToScalarize shows if the call needs to be 1666 /// scalarized - 1667 /// i.e. either vector version isn't available, or is too expensive. 1668 InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF, 1669 bool &NeedToScalarize) const; 1670 1671 /// Returns true if the per-lane cost of VectorizationFactor A is lower than 1672 /// that of B. 1673 bool isMoreProfitable(const VectorizationFactor &A, 1674 const VectorizationFactor &B) const; 1675 1676 /// Invalidates decisions already taken by the cost model. 1677 void invalidateCostModelingDecisions() { 1678 WideningDecisions.clear(); 1679 Uniforms.clear(); 1680 Scalars.clear(); 1681 } 1682 1683 private: 1684 unsigned NumPredStores = 0; 1685 1686 /// Convenience function that returns the value of vscale_range iff 1687 /// vscale_range.min == vscale_range.max or otherwise returns the value 1688 /// returned by the corresponding TLI method. 1689 Optional<unsigned> getVScaleForTuning() const; 1690 1691 /// \return An upper bound for the vectorization factors for both 1692 /// fixed and scalable vectorization, where the minimum-known number of 1693 /// elements is a power-of-2 larger than zero. If scalable vectorization is 1694 /// disabled or unsupported, then the scalable part will be equal to 1695 /// ElementCount::getScalable(0). 1696 FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount, 1697 ElementCount UserVF, 1698 bool FoldTailByMasking); 1699 1700 /// \return the maximized element count based on the targets vector 1701 /// registers and the loop trip-count, but limited to a maximum safe VF. 1702 /// This is a helper function of computeFeasibleMaxVF. 1703 /// FIXME: MaxSafeVF is currently passed by reference to avoid some obscure 1704 /// issue that occurred on one of the buildbots which cannot be reproduced 1705 /// without having access to the properietary compiler (see comments on 1706 /// D98509). The issue is currently under investigation and this workaround 1707 /// will be removed as soon as possible. 1708 ElementCount getMaximizedVFForTarget(unsigned ConstTripCount, 1709 unsigned SmallestType, 1710 unsigned WidestType, 1711 const ElementCount &MaxSafeVF, 1712 bool FoldTailByMasking); 1713 1714 /// \return the maximum legal scalable VF, based on the safe max number 1715 /// of elements. 1716 ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements); 1717 1718 /// The vectorization cost is a combination of the cost itself and a boolean 1719 /// indicating whether any of the contributing operations will actually 1720 /// operate on vector values after type legalization in the backend. If this 1721 /// latter value is false, then all operations will be scalarized (i.e. no 1722 /// vectorization has actually taken place). 1723 using VectorizationCostTy = std::pair<InstructionCost, bool>; 1724 1725 /// Returns the expected execution cost. The unit of the cost does 1726 /// not matter because we use the 'cost' units to compare different 1727 /// vector widths. The cost that is returned is *not* normalized by 1728 /// the factor width. If \p Invalid is not nullptr, this function 1729 /// will add a pair(Instruction*, ElementCount) to \p Invalid for 1730 /// each instruction that has an Invalid cost for the given VF. 1731 using InstructionVFPair = std::pair<Instruction *, ElementCount>; 1732 VectorizationCostTy 1733 expectedCost(ElementCount VF, 1734 SmallVectorImpl<InstructionVFPair> *Invalid = nullptr); 1735 1736 /// Returns the execution time cost of an instruction for a given vector 1737 /// width. 
Vector width of one means scalar.
1738 VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
1739
1740 /// The cost-computation logic from getInstructionCost which provides
1741 /// the vector type as an output parameter.
1742 InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
1743 Type *&VectorTy);
1744
1745 /// Return the cost of instructions in an inloop reduction pattern, if I is
1746 /// part of that pattern.
1747 Optional<InstructionCost>
1748 getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
1749 TTI::TargetCostKind CostKind);
1750
1751 /// Calculate vectorization cost of memory instruction \p I.
1752 InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1753
1754 /// The cost computation for scalarized memory instruction.
1755 InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1756
1757 /// The cost computation for interleaving group of memory instructions.
1758 InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1759
1760 /// The cost computation for Gather/Scatter instruction.
1761 InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1762
1763 /// The cost computation for widening instruction \p I with consecutive
1764 /// memory access.
1765 InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1766
1767 /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1768 /// Load: scalar load + broadcast.
1769 /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1770 /// element)
1771 InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
1772
1773 /// Estimate the overhead of scalarizing an instruction. This is a
1774 /// convenience wrapper for the type-based getScalarizationOverhead API.
1775 InstructionCost getScalarizationOverhead(Instruction *I,
1776 ElementCount VF) const;
1777
1778 /// Returns whether the instruction is a load or store and will be emitted
1779 /// as a vector operation.
1780 bool isConsecutiveLoadOrStore(Instruction *I);
1781
1782 /// Returns true if an artificially high cost for emulated masked memrefs
1783 /// should be used.
1784 bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);
1785
1786 /// Map of scalar integer values to the smallest bitwidth they can be legally
1787 /// represented as. The vector equivalents of these values should be truncated
1788 /// to this type.
1789 MapVector<Instruction *, uint64_t> MinBWs;
1790
1791 /// A type representing the costs for instructions if they were to be
1792 /// scalarized rather than vectorized. The entries are Instruction-Cost
1793 /// pairs.
1794 using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
1795
1796 /// A set containing all BasicBlocks that are known to be present after
1797 /// vectorization as a predicated block.
1798 SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1799
1800 /// Records whether it is allowed to have the original scalar loop execute at
1801 /// least once. This may be needed as a fallback loop in case runtime
1802 /// aliasing/dependence checks fail, or to handle the tail/remainder
1803 /// iterations when the trip count is unknown or doesn't divide by the VF,
1804 /// or as a peel-loop to handle gaps in interleave-groups.
1805 /// Under optsize and when the trip count is very small we don't allow any
1806 /// iterations to execute in the scalar loop.
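/// For example (illustrative, not the exact generated IR), with VF = 4 and a
/// trip count of 10 the vector body covers 8 iterations and the remaining 2
/// run in the scalar epilogue:
///   for (i = 0; i + 4 <= 10; i += 4) { ... }  // vector body
///   for (; i < 10; ++i) { ... }               // scalar epilogue
/// When no scalar iterations are allowed, the tail must instead be folded
/// into the vector body under a mask (see FoldTailByMasking below).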
1807 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 1808 1809 /// All blocks of loop are to be masked to fold tail of scalar iterations. 1810 bool FoldTailByMasking = false; 1811 1812 /// A map holding scalar costs for different vectorization factors. The 1813 /// presence of a cost for an instruction in the mapping indicates that the 1814 /// instruction will be scalarized when vectorizing with the associated 1815 /// vectorization factor. The entries are VF-ScalarCostTy pairs. 1816 DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize; 1817 1818 /// Holds the instructions known to be uniform after vectorization. 1819 /// The data is collected per VF. 1820 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms; 1821 1822 /// Holds the instructions known to be scalar after vectorization. 1823 /// The data is collected per VF. 1824 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars; 1825 1826 /// Holds the instructions (address computations) that are forced to be 1827 /// scalarized. 1828 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars; 1829 1830 /// PHINodes of the reductions that should be expanded in-loop along with 1831 /// their associated chains of reduction operations, in program order from top 1832 /// (PHI) to bottom 1833 ReductionChainMap InLoopReductionChains; 1834 1835 /// A Map of inloop reduction operations and their immediate chain operand. 1836 /// FIXME: This can be removed once reductions can be costed correctly in 1837 /// vplan. This was added to allow quick lookup to the inloop operations, 1838 /// without having to loop through InLoopReductionChains. 1839 DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains; 1840 1841 /// Returns the expected difference in cost from scalarizing the expression 1842 /// feeding a predicated instruction \p PredInst. The instructions to 1843 /// scalarize and their scalar costs are collected in \p ScalarCosts. A 1844 /// non-negative return value implies the expression will be scalarized. 1845 /// Currently, only single-use chains are considered for scalarization. 1846 int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts, 1847 ElementCount VF); 1848 1849 /// Collect the instructions that are uniform after vectorization. An 1850 /// instruction is uniform if we represent it with a single scalar value in 1851 /// the vectorized loop corresponding to each vector iteration. Examples of 1852 /// uniform instructions include pointer operands of consecutive or 1853 /// interleaved memory accesses. Note that although uniformity implies an 1854 /// instruction will be scalar, the reverse is not true. In general, a 1855 /// scalarized instruction will be represented by VF scalar values in the 1856 /// vectorized loop, each corresponding to an iteration of the original 1857 /// scalar loop. 1858 void collectLoopUniforms(ElementCount VF); 1859 1860 /// Collect the instructions that are scalar after vectorization. An 1861 /// instruction is scalar if it is known to be uniform or will be scalarized 1862 /// during vectorization. collectLoopScalars should only add non-uniform nodes 1863 /// to the list if they are used by a load/store instruction that is marked as 1864 /// CM_Scalarize. Non-uniform scalarized instructions will be represented by 1865 /// VF values in the vectorized loop, each corresponding to an iteration of 1866 /// the original scalar loop. 
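/// For example (illustrative), if a strided load is assigned CM_Scalarize for
/// this VF, the getelementptr feeding its address is collected here even
/// though it is not uniform; with VF = 4 it is later emitted as four scalar
/// address computations, one per lane.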
1867 void collectLoopScalars(ElementCount VF); 1868 1869 /// Keeps cost model vectorization decision and cost for instructions. 1870 /// Right now it is used for memory instructions only. 1871 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>, 1872 std::pair<InstWidening, InstructionCost>>; 1873 1874 DecisionList WideningDecisions; 1875 1876 /// Returns true if \p V is expected to be vectorized and it needs to be 1877 /// extracted. 1878 bool needsExtract(Value *V, ElementCount VF) const { 1879 Instruction *I = dyn_cast<Instruction>(V); 1880 if (VF.isScalar() || !I || !TheLoop->contains(I) || 1881 TheLoop->isLoopInvariant(I)) 1882 return false; 1883 1884 // Assume we can vectorize V (and hence we need extraction) if the 1885 // scalars are not computed yet. This can happen, because it is called 1886 // via getScalarizationOverhead from setCostBasedWideningDecision, before 1887 // the scalars are collected. That should be a safe assumption in most 1888 // cases, because we check if the operands have vectorizable types 1889 // beforehand in LoopVectorizationLegality. 1890 return Scalars.find(VF) == Scalars.end() || 1891 !isScalarAfterVectorization(I, VF); 1892 }; 1893 1894 /// Returns a range containing only operands needing to be extracted. 1895 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, 1896 ElementCount VF) const { 1897 return SmallVector<Value *, 4>(make_filter_range( 1898 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); 1899 } 1900 1901 /// Determines if we have the infrastructure to vectorize loop \p L and its 1902 /// epilogue, assuming the main loop is vectorized by \p VF. 1903 bool isCandidateForEpilogueVectorization(const Loop &L, 1904 const ElementCount VF) const; 1905 1906 /// Returns true if epilogue vectorization is considered profitable, and 1907 /// false otherwise. 1908 /// \p VF is the vectorization factor chosen for the original loop. 1909 bool isEpilogueVectorizationProfitable(const ElementCount VF) const; 1910 1911 public: 1912 /// The loop that we evaluate. 1913 Loop *TheLoop; 1914 1915 /// Predicated scalar evolution analysis. 1916 PredicatedScalarEvolution &PSE; 1917 1918 /// Loop Info analysis. 1919 LoopInfo *LI; 1920 1921 /// Vectorization legality. 1922 LoopVectorizationLegality *Legal; 1923 1924 /// Vector target information. 1925 const TargetTransformInfo &TTI; 1926 1927 /// Target Library Info. 1928 const TargetLibraryInfo *TLI; 1929 1930 /// Demanded bits analysis. 1931 DemandedBits *DB; 1932 1933 /// Assumption cache. 1934 AssumptionCache *AC; 1935 1936 /// Interface to emit optimization remarks. 1937 OptimizationRemarkEmitter *ORE; 1938 1939 const Function *TheFunction; 1940 1941 /// Loop Vectorize Hint. 1942 const LoopVectorizeHints *Hints; 1943 1944 /// The interleave access information contains groups of interleaved accesses 1945 /// with the same stride and close to each other. 1946 InterleavedAccessInfo &InterleaveInfo; 1947 1948 /// Values to ignore in the cost model. 1949 SmallPtrSet<const Value *, 16> ValuesToIgnore; 1950 1951 /// Values to ignore in the cost model when VF > 1. 1952 SmallPtrSet<const Value *, 16> VecValuesToIgnore; 1953 1954 /// All element types found in the loop. 1955 SmallPtrSet<Type *, 16> ElementTypesInLoop; 1956 1957 /// Profitable vector factors. 1958 SmallVector<VectorizationFactor, 8> ProfitableVFs; 1959 }; 1960 } // end namespace llvm 1961 1962 /// Helper struct to manage generating runtime checks for vectorization. 
1963 /// 1964 /// The runtime checks are created up-front in temporary blocks to allow better 1965 /// estimating the cost and un-linked from the existing IR. After deciding to 1966 /// vectorize, the checks are moved back. If deciding not to vectorize, the 1967 /// temporary blocks are completely removed. 1968 class GeneratedRTChecks { 1969 /// Basic block which contains the generated SCEV checks, if any. 1970 BasicBlock *SCEVCheckBlock = nullptr; 1971 1972 /// The value representing the result of the generated SCEV checks. If it is 1973 /// nullptr, either no SCEV checks have been generated or they have been used. 1974 Value *SCEVCheckCond = nullptr; 1975 1976 /// Basic block which contains the generated memory runtime checks, if any. 1977 BasicBlock *MemCheckBlock = nullptr; 1978 1979 /// The value representing the result of the generated memory runtime checks. 1980 /// If it is nullptr, either no memory runtime checks have been generated or 1981 /// they have been used. 1982 Value *MemRuntimeCheckCond = nullptr; 1983 1984 DominatorTree *DT; 1985 LoopInfo *LI; 1986 1987 SCEVExpander SCEVExp; 1988 SCEVExpander MemCheckExp; 1989 1990 public: 1991 GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI, 1992 const DataLayout &DL) 1993 : DT(DT), LI(LI), SCEVExp(SE, DL, "scev.check"), 1994 MemCheckExp(SE, DL, "scev.check") {} 1995 1996 /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can 1997 /// accurately estimate the cost of the runtime checks. The blocks are 1998 /// un-linked from the IR and is added back during vector code generation. If 1999 /// there is no vector code generation, the check blocks are removed 2000 /// completely. 2001 void Create(Loop *L, const LoopAccessInfo &LAI, 2002 const SCEVPredicate &Pred) { 2003 2004 BasicBlock *LoopHeader = L->getHeader(); 2005 BasicBlock *Preheader = L->getLoopPreheader(); 2006 2007 // Use SplitBlock to create blocks for SCEV & memory runtime checks to 2008 // ensure the blocks are properly added to LoopInfo & DominatorTree. Those 2009 // may be used by SCEVExpander. The blocks will be un-linked from their 2010 // predecessors and removed from LI & DT at the end of the function. 2011 if (!Pred.isAlwaysTrue()) { 2012 SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI, 2013 nullptr, "vector.scevcheck"); 2014 2015 SCEVCheckCond = SCEVExp.expandCodeForPredicate( 2016 &Pred, SCEVCheckBlock->getTerminator()); 2017 } 2018 2019 const auto &RtPtrChecking = *LAI.getRuntimePointerChecking(); 2020 if (RtPtrChecking.Need) { 2021 auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader; 2022 MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr, 2023 "vector.memcheck"); 2024 2025 MemRuntimeCheckCond = 2026 addRuntimeChecks(MemCheckBlock->getTerminator(), L, 2027 RtPtrChecking.getChecks(), MemCheckExp); 2028 assert(MemRuntimeCheckCond && 2029 "no RT checks generated although RtPtrChecking " 2030 "claimed checks are required"); 2031 } 2032 2033 if (!MemCheckBlock && !SCEVCheckBlock) 2034 return; 2035 2036 // Unhook the temporary block with the checks, update various places 2037 // accordingly. 
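// Conceptually (sketch, assuming both kinds of checks were generated above):
//   before: Preheader -> vector.scevcheck -> vector.memcheck -> LoopHeader
//   after:  Preheader -> LoopHeader
// The detached check blocks are kept alive (terminated by 'unreachable')
// until they are re-linked by emitSCEVChecks/emitMemRuntimeChecks or deleted
// in the destructor.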
2038 if (SCEVCheckBlock) 2039 SCEVCheckBlock->replaceAllUsesWith(Preheader); 2040 if (MemCheckBlock) 2041 MemCheckBlock->replaceAllUsesWith(Preheader); 2042 2043 if (SCEVCheckBlock) { 2044 SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 2045 new UnreachableInst(Preheader->getContext(), SCEVCheckBlock); 2046 Preheader->getTerminator()->eraseFromParent(); 2047 } 2048 if (MemCheckBlock) { 2049 MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 2050 new UnreachableInst(Preheader->getContext(), MemCheckBlock); 2051 Preheader->getTerminator()->eraseFromParent(); 2052 } 2053 2054 DT->changeImmediateDominator(LoopHeader, Preheader); 2055 if (MemCheckBlock) { 2056 DT->eraseNode(MemCheckBlock); 2057 LI->removeBlock(MemCheckBlock); 2058 } 2059 if (SCEVCheckBlock) { 2060 DT->eraseNode(SCEVCheckBlock); 2061 LI->removeBlock(SCEVCheckBlock); 2062 } 2063 } 2064 2065 /// Remove the created SCEV & memory runtime check blocks & instructions, if 2066 /// unused. 2067 ~GeneratedRTChecks() { 2068 SCEVExpanderCleaner SCEVCleaner(SCEVExp); 2069 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp); 2070 if (!SCEVCheckCond) 2071 SCEVCleaner.markResultUsed(); 2072 2073 if (!MemRuntimeCheckCond) 2074 MemCheckCleaner.markResultUsed(); 2075 2076 if (MemRuntimeCheckCond) { 2077 auto &SE = *MemCheckExp.getSE(); 2078 // Memory runtime check generation creates compares that use expanded 2079 // values. Remove them before running the SCEVExpanderCleaners. 2080 for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) { 2081 if (MemCheckExp.isInsertedInstruction(&I)) 2082 continue; 2083 SE.forgetValue(&I); 2084 I.eraseFromParent(); 2085 } 2086 } 2087 MemCheckCleaner.cleanup(); 2088 SCEVCleaner.cleanup(); 2089 2090 if (SCEVCheckCond) 2091 SCEVCheckBlock->eraseFromParent(); 2092 if (MemRuntimeCheckCond) 2093 MemCheckBlock->eraseFromParent(); 2094 } 2095 2096 /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and 2097 /// adjusts the branches to branch to the vector preheader or \p Bypass, 2098 /// depending on the generated condition. 2099 BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass, 2100 BasicBlock *LoopVectorPreHeader, 2101 BasicBlock *LoopExitBlock) { 2102 if (!SCEVCheckCond) 2103 return nullptr; 2104 if (auto *C = dyn_cast<ConstantInt>(SCEVCheckCond)) 2105 if (C->isZero()) 2106 return nullptr; 2107 2108 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2109 2110 BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock); 2111 // Create new preheader for vector loop. 2112 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 2113 PL->addBasicBlockToLoop(SCEVCheckBlock, *LI); 2114 2115 SCEVCheckBlock->getTerminator()->eraseFromParent(); 2116 SCEVCheckBlock->moveBefore(LoopVectorPreHeader); 2117 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2118 SCEVCheckBlock); 2119 2120 DT->addNewBlock(SCEVCheckBlock, Pred); 2121 DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock); 2122 2123 ReplaceInstWithInst( 2124 SCEVCheckBlock->getTerminator(), 2125 BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheckCond)); 2126 // Mark the check as used, to prevent it from being removed during cleanup. 2127 SCEVCheckCond = nullptr; 2128 return SCEVCheckBlock; 2129 } 2130 2131 /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts 2132 /// the branches to branch to the vector preheader or \p Bypass, depending on 2133 /// the generated condition. 
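/// For example (illustrative CFG sketch using the field names above),
/// starting from
///   Pred -> LoopVectorPreHeader
/// this produces
///   Pred -> MemCheckBlock -> LoopVectorPreHeader   (checks passed)
///                       \--> Bypass                (checks failed)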
2134 BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass, 2135 BasicBlock *LoopVectorPreHeader) { 2136 // Check if we generated code that checks in runtime if arrays overlap. 2137 if (!MemRuntimeCheckCond) 2138 return nullptr; 2139 2140 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2141 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2142 MemCheckBlock); 2143 2144 DT->addNewBlock(MemCheckBlock, Pred); 2145 DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock); 2146 MemCheckBlock->moveBefore(LoopVectorPreHeader); 2147 2148 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 2149 PL->addBasicBlockToLoop(MemCheckBlock, *LI); 2150 2151 ReplaceInstWithInst( 2152 MemCheckBlock->getTerminator(), 2153 BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond)); 2154 MemCheckBlock->getTerminator()->setDebugLoc( 2155 Pred->getTerminator()->getDebugLoc()); 2156 2157 // Mark the check as used, to prevent it from being removed during cleanup. 2158 MemRuntimeCheckCond = nullptr; 2159 return MemCheckBlock; 2160 } 2161 }; 2162 2163 // Return true if \p OuterLp is an outer loop annotated with hints for explicit 2164 // vectorization. The loop needs to be annotated with #pragma omp simd 2165 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the 2166 // vector length information is not provided, vectorization is not considered 2167 // explicit. Interleave hints are not allowed either. These limitations will be 2168 // relaxed in the future. 2169 // Please, note that we are currently forced to abuse the pragma 'clang 2170 // vectorize' semantics. This pragma provides *auto-vectorization hints* 2171 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' 2172 // provides *explicit vectorization hints* (LV can bypass legal checks and 2173 // assume that vectorization is legal). However, both hints are implemented 2174 // using the same metadata (llvm.loop.vectorize, processed by 2175 // LoopVectorizeHints). This will be fixed in the future when the native IR 2176 // representation for pragma 'omp simd' is introduced. 2177 static bool isExplicitVecOuterLoop(Loop *OuterLp, 2178 OptimizationRemarkEmitter *ORE) { 2179 assert(!OuterLp->isInnermost() && "This is not an outer loop"); 2180 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); 2181 2182 // Only outer loops with an explicit vectorization hint are supported. 2183 // Unannotated outer loops are ignored. 2184 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) 2185 return false; 2186 2187 Function *Fn = OuterLp->getHeader()->getParent(); 2188 if (!Hints.allowVectorization(Fn, OuterLp, 2189 true /*VectorizeOnlyWhenForced*/)) { 2190 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); 2191 return false; 2192 } 2193 2194 if (Hints.getInterleave() > 1) { 2195 // TODO: Interleave support is future work. 2196 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " 2197 "outer loops.\n"); 2198 Hints.emitRemarkWithHints(); 2199 return false; 2200 } 2201 2202 return true; 2203 } 2204 2205 static void collectSupportedLoops(Loop &L, LoopInfo *LI, 2206 OptimizationRemarkEmitter *ORE, 2207 SmallVectorImpl<Loop *> &V) { 2208 // Collect inner loops and outer loops without irreducible control flow. For 2209 // now, only collect outer loops that have explicit vectorization hints. If we 2210 // are stress testing the VPlan H-CFG construction, we collect the outermost 2211 // loop of every loop nest. 
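// For example (illustrative nest), given
//   for (i)      // outer, annotated with an explicit vectorization hint
//     for (j)    // inner
// the VPlan-native path collects only the outer loop here and returns, while
// without the hint (or without the native path) the recursion below collects
// the innermost loop instead.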
2212 if (L.isInnermost() || VPlanBuildStressTest || 2213 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { 2214 LoopBlocksRPO RPOT(&L); 2215 RPOT.perform(LI); 2216 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { 2217 V.push_back(&L); 2218 // TODO: Collect inner loops inside marked outer loops in case 2219 // vectorization fails for the outer loop. Do not invoke 2220 // 'containsIrreducibleCFG' again for inner loops when the outer loop is 2221 // already known to be reducible. We can use an inherited attribute for 2222 // that. 2223 return; 2224 } 2225 } 2226 for (Loop *InnerL : L) 2227 collectSupportedLoops(*InnerL, LI, ORE, V); 2228 } 2229 2230 namespace { 2231 2232 /// The LoopVectorize Pass. 2233 struct LoopVectorize : public FunctionPass { 2234 /// Pass identification, replacement for typeid 2235 static char ID; 2236 2237 LoopVectorizePass Impl; 2238 2239 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false, 2240 bool VectorizeOnlyWhenForced = false) 2241 : FunctionPass(ID), 2242 Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) { 2243 initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); 2244 } 2245 2246 bool runOnFunction(Function &F) override { 2247 if (skipFunction(F)) 2248 return false; 2249 2250 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 2251 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 2252 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 2253 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 2254 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); 2255 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); 2256 auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr; 2257 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); 2258 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 2259 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); 2260 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); 2261 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 2262 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 2263 2264 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 2265 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; 2266 2267 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC, 2268 GetLAA, *ORE, PSI).MadeAnyChange; 2269 } 2270 2271 void getAnalysisUsage(AnalysisUsage &AU) const override { 2272 AU.addRequired<AssumptionCacheTracker>(); 2273 AU.addRequired<BlockFrequencyInfoWrapperPass>(); 2274 AU.addRequired<DominatorTreeWrapperPass>(); 2275 AU.addRequired<LoopInfoWrapperPass>(); 2276 AU.addRequired<ScalarEvolutionWrapperPass>(); 2277 AU.addRequired<TargetTransformInfoWrapperPass>(); 2278 AU.addRequired<AAResultsWrapperPass>(); 2279 AU.addRequired<LoopAccessLegacyAnalysis>(); 2280 AU.addRequired<DemandedBitsWrapperPass>(); 2281 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 2282 AU.addRequired<InjectTLIMappingsLegacy>(); 2283 2284 // We currently do not preserve loopinfo/dominator analyses with outer loop 2285 // vectorization. Until this is addressed, mark these analyses as preserved 2286 // only for non-VPlan-native path. 2287 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 
2288 if (!EnableVPlanNativePath) { 2289 AU.addPreserved<LoopInfoWrapperPass>(); 2290 AU.addPreserved<DominatorTreeWrapperPass>(); 2291 } 2292 2293 AU.addPreserved<BasicAAWrapperPass>(); 2294 AU.addPreserved<GlobalsAAWrapperPass>(); 2295 AU.addRequired<ProfileSummaryInfoWrapperPass>(); 2296 } 2297 }; 2298 2299 } // end anonymous namespace 2300 2301 //===----------------------------------------------------------------------===// 2302 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 2303 // LoopVectorizationCostModel and LoopVectorizationPlanner. 2304 //===----------------------------------------------------------------------===// 2305 2306 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { 2307 // We need to place the broadcast of invariant variables outside the loop, 2308 // but only if it's proven safe to do so. Else, broadcast will be inside 2309 // vector loop body. 2310 Instruction *Instr = dyn_cast<Instruction>(V); 2311 bool SafeToHoist = OrigLoop->isLoopInvariant(V) && 2312 (!Instr || 2313 DT->dominates(Instr->getParent(), LoopVectorPreHeader)); 2314 // Place the code for broadcasting invariant variables in the new preheader. 2315 IRBuilder<>::InsertPointGuard Guard(Builder); 2316 if (SafeToHoist) 2317 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2318 2319 // Broadcast the scalar into all locations in the vector. 2320 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); 2321 2322 return Shuf; 2323 } 2324 2325 /// This function adds 2326 /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...) 2327 /// to each vector element of Val. The sequence starts at StartIndex. 2328 /// \p Opcode is relevant for FP induction variable. 2329 static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step, 2330 Instruction::BinaryOps BinOp, ElementCount VF, 2331 IRBuilderBase &Builder) { 2332 assert(VF.isVector() && "only vector VFs are supported"); 2333 2334 // Create and check the types. 2335 auto *ValVTy = cast<VectorType>(Val->getType()); 2336 ElementCount VLen = ValVTy->getElementCount(); 2337 2338 Type *STy = Val->getType()->getScalarType(); 2339 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && 2340 "Induction Step must be an integer or FP"); 2341 assert(Step->getType() == STy && "Step has wrong type"); 2342 2343 SmallVector<Constant *, 8> Indices; 2344 2345 // Create a vector of consecutive numbers from zero to VF. 2346 VectorType *InitVecValVTy = ValVTy; 2347 if (STy->isFloatingPointTy()) { 2348 Type *InitVecValSTy = 2349 IntegerType::get(STy->getContext(), STy->getScalarSizeInBits()); 2350 InitVecValVTy = VectorType::get(InitVecValSTy, VLen); 2351 } 2352 Value *InitVec = Builder.CreateStepVector(InitVecValVTy); 2353 2354 // Splat the StartIdx 2355 Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx); 2356 2357 if (STy->isIntegerTy()) { 2358 InitVec = Builder.CreateAdd(InitVec, StartIdxSplat); 2359 Step = Builder.CreateVectorSplat(VLen, Step); 2360 assert(Step->getType() == Val->getType() && "Invalid step vec"); 2361 // FIXME: The newly created binary instructions should contain nsw/nuw 2362 // flags, which can be found from the original scalar operations. 2363 Step = Builder.CreateMul(InitVec, Step); 2364 return Builder.CreateAdd(Val, Step, "induction"); 2365 } 2366 2367 // Floating point induction. 
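// For example (illustrative values), with VF = 4, Val = <2.0, 2.0, 2.0, 2.0>,
// StartIdx = 0.0, Step = 0.5 and BinOp = FAdd this produces
//   <2.0, 2.0, 2.0, 2.0> + <0.0, 1.0, 2.0, 3.0> * 0.5 = <2.0, 2.5, 3.0, 3.5>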
2368 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && 2369 "Binary Opcode should be specified for FP induction"); 2370 InitVec = Builder.CreateUIToFP(InitVec, ValVTy); 2371 InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat); 2372 2373 Step = Builder.CreateVectorSplat(VLen, Step); 2374 Value *MulOp = Builder.CreateFMul(InitVec, Step); 2375 return Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); 2376 } 2377 2378 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( 2379 const InductionDescriptor &II, Value *Step, Value *Start, 2380 Instruction *EntryVal, VPValue *Def, VPTransformState &State) { 2381 IRBuilderBase &Builder = State.Builder; 2382 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 2383 "Expected either an induction phi-node or a truncate of it!"); 2384 2385 // Construct the initial value of the vector IV in the vector loop preheader 2386 auto CurrIP = Builder.saveIP(); 2387 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2388 if (isa<TruncInst>(EntryVal)) { 2389 assert(Start->getType()->isIntegerTy() && 2390 "Truncation requires an integer type"); 2391 auto *TruncType = cast<IntegerType>(EntryVal->getType()); 2392 Step = Builder.CreateTrunc(Step, TruncType); 2393 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); 2394 } 2395 2396 Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0); 2397 Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start); 2398 Value *SteppedStart = getStepVector( 2399 SplatStart, Zero, Step, II.getInductionOpcode(), State.VF, State.Builder); 2400 2401 // We create vector phi nodes for both integer and floating-point induction 2402 // variables. Here, we determine the kind of arithmetic we will perform. 2403 Instruction::BinaryOps AddOp; 2404 Instruction::BinaryOps MulOp; 2405 if (Step->getType()->isIntegerTy()) { 2406 AddOp = Instruction::Add; 2407 MulOp = Instruction::Mul; 2408 } else { 2409 AddOp = II.getInductionOpcode(); 2410 MulOp = Instruction::FMul; 2411 } 2412 2413 // Multiply the vectorization factor by the step using integer or 2414 // floating-point arithmetic as appropriate. 2415 Type *StepType = Step->getType(); 2416 Value *RuntimeVF; 2417 if (Step->getType()->isFloatingPointTy()) 2418 RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF); 2419 else 2420 RuntimeVF = getRuntimeVF(Builder, StepType, State.VF); 2421 Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF); 2422 2423 // Create a vector splat to use in the induction update. 2424 // 2425 // FIXME: If the step is non-constant, we create the vector splat with 2426 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't 2427 // handle a constant vector splat. 2428 Value *SplatVF = isa<Constant>(Mul) 2429 ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul)) 2430 : Builder.CreateVectorSplat(State.VF, Mul); 2431 Builder.restoreIP(CurrIP); 2432 2433 // We may need to add the step a number of times, depending on the unroll 2434 // factor. The last of those goes into the PHI. 
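// For example (sketch, UF = 2):
//   vec.ind      = phi [ SteppedStart, preheader ], [ vec.ind.next, latch ]
//   step.add     = vec.ind  + SplatVF    ; value used for part 1
//   vec.ind.next = step.add + SplatVF    ; moved to the latch below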
2435 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", 2436 &*LoopVectorBody->getFirstInsertionPt()); 2437 VecInd->setDebugLoc(EntryVal->getDebugLoc()); 2438 Instruction *LastInduction = VecInd; 2439 for (unsigned Part = 0; Part < UF; ++Part) { 2440 State.set(Def, LastInduction, Part); 2441 2442 if (isa<TruncInst>(EntryVal)) 2443 addMetadata(LastInduction, EntryVal); 2444 2445 LastInduction = cast<Instruction>( 2446 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")); 2447 LastInduction->setDebugLoc(EntryVal->getDebugLoc()); 2448 } 2449 2450 // Move the last step to the end of the latch block. This ensures consistent 2451 // placement of all induction updates. 2452 auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 2453 auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator()); 2454 LastInduction->moveBefore(Br); 2455 LastInduction->setName("vec.ind.next"); 2456 2457 VecInd->addIncoming(SteppedStart, LoopVectorPreHeader); 2458 VecInd->addIncoming(LastInduction, LoopVectorLatch); 2459 } 2460 2461 /// Compute scalar induction steps. \p ScalarIV is the scalar induction 2462 /// variable on which to base the steps, \p Step is the size of the step. 2463 static void buildScalarSteps(Value *ScalarIV, Value *Step, 2464 const InductionDescriptor &ID, VPValue *Def, 2465 VPTransformState &State) { 2466 IRBuilderBase &Builder = State.Builder; 2467 // We shouldn't have to build scalar steps if we aren't vectorizing. 2468 assert(State.VF.isVector() && "VF should be greater than one"); 2469 // Get the value type and ensure it and the step have the same integer type. 2470 Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); 2471 assert(ScalarIVTy == Step->getType() && 2472 "Val and Step should have the same type"); 2473 2474 // We build scalar steps for both integer and floating-point induction 2475 // variables. Here, we determine the kind of arithmetic we will perform. 2476 Instruction::BinaryOps AddOp; 2477 Instruction::BinaryOps MulOp; 2478 if (ScalarIVTy->isIntegerTy()) { 2479 AddOp = Instruction::Add; 2480 MulOp = Instruction::Mul; 2481 } else { 2482 AddOp = ID.getInductionOpcode(); 2483 MulOp = Instruction::FMul; 2484 } 2485 2486 // Determine the number of scalars we need to generate for each unroll 2487 // iteration. 2488 bool FirstLaneOnly = vputils::onlyFirstLaneUsed(Def); 2489 unsigned Lanes = FirstLaneOnly ? 1 : State.VF.getKnownMinValue(); 2490 // Compute the scalar steps and save the results in State. 
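// For example (illustrative, fixed VF = 4, UF = 2, integer IV), lane L of
// part P receives ScalarIV + (P * 4 + L) * Step, i.e. part 0 uses offsets
// {0,1,2,3} * Step and part 1 uses {4,5,6,7} * Step. For scalable VF the
// per-part base offset is a runtime value produced by createStepForVF.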
2491 Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(), 2492 ScalarIVTy->getScalarSizeInBits()); 2493 Type *VecIVTy = nullptr; 2494 Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr; 2495 if (!FirstLaneOnly && State.VF.isScalable()) { 2496 VecIVTy = VectorType::get(ScalarIVTy, State.VF); 2497 UnitStepVec = 2498 Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF)); 2499 SplatStep = Builder.CreateVectorSplat(State.VF, Step); 2500 SplatIV = Builder.CreateVectorSplat(State.VF, ScalarIV); 2501 } 2502 2503 for (unsigned Part = 0; Part < State.UF; ++Part) { 2504 Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part); 2505 2506 if (!FirstLaneOnly && State.VF.isScalable()) { 2507 auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0); 2508 auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec); 2509 if (ScalarIVTy->isFloatingPointTy()) 2510 InitVec = Builder.CreateSIToFP(InitVec, VecIVTy); 2511 auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep); 2512 auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul); 2513 State.set(Def, Add, Part); 2514 // It's useful to record the lane values too for the known minimum number 2515 // of elements so we do those below. This improves the code quality when 2516 // trying to extract the first element, for example. 2517 } 2518 2519 if (ScalarIVTy->isFloatingPointTy()) 2520 StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy); 2521 2522 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 2523 Value *StartIdx = Builder.CreateBinOp( 2524 AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane)); 2525 // The step returned by `createStepForVF` is a runtime-evaluated value 2526 // when VF is scalable. Otherwise, it should be folded into a Constant. 2527 assert((State.VF.isScalable() || isa<Constant>(StartIdx)) && 2528 "Expected StartIdx to be folded to a constant when VF is not " 2529 "scalable"); 2530 auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step); 2531 auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul); 2532 State.set(Def, Add, VPIteration(Part, Lane)); 2533 } 2534 } 2535 } 2536 2537 // Generate code for the induction step. Note that induction steps are 2538 // required to be loop-invariant 2539 static Value *CreateStepValue(const SCEV *Step, ScalarEvolution &SE, 2540 Instruction *InsertBefore, 2541 Loop *OrigLoop = nullptr) { 2542 const DataLayout &DL = SE.getDataLayout(); 2543 assert((!OrigLoop || SE.isLoopInvariant(Step, OrigLoop)) && 2544 "Induction step should be loop invariant"); 2545 if (auto *E = dyn_cast<SCEVUnknown>(Step)) 2546 return E->getValue(); 2547 2548 SCEVExpander Exp(SE, DL, "induction"); 2549 return Exp.expandCodeFor(Step, Step->getType(), InsertBefore); 2550 } 2551 2552 /// Compute the transformed value of Index at offset StartValue using step 2553 /// StepValue. 2554 /// For integer induction, returns StartValue + Index * StepValue. 2555 /// For pointer induction, returns StartValue[Index * StepValue]. 2556 /// FIXME: The newly created binary instructions should contain nsw/nuw 2557 /// flags, which can be found from the original scalar operations. 2558 static Value *emitTransformedIndex(IRBuilderBase &B, Value *Index, 2559 Value *StartValue, Value *Step, 2560 const InductionDescriptor &ID) { 2561 assert(Index->getType()->getScalarType() == Step->getType() && 2562 "Index scalar type does not match StepValue type"); 2563 2564 // Note: the IR at this point is broken. 
We cannot use SE to create any new 2565 // SCEV and then expand it, hoping that SCEV's simplification will give us 2566 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may 2567 // lead to various SCEV crashes. So all we can do is to use builder and rely 2568 // on InstCombine for future simplifications. Here we handle some trivial 2569 // cases only. 2570 auto CreateAdd = [&B](Value *X, Value *Y) { 2571 assert(X->getType() == Y->getType() && "Types don't match!"); 2572 if (auto *CX = dyn_cast<ConstantInt>(X)) 2573 if (CX->isZero()) 2574 return Y; 2575 if (auto *CY = dyn_cast<ConstantInt>(Y)) 2576 if (CY->isZero()) 2577 return X; 2578 return B.CreateAdd(X, Y); 2579 }; 2580 2581 // We allow X to be a vector type, in which case Y will potentially be 2582 // splatted into a vector with the same element count. 2583 auto CreateMul = [&B](Value *X, Value *Y) { 2584 assert(X->getType()->getScalarType() == Y->getType() && 2585 "Types don't match!"); 2586 if (auto *CX = dyn_cast<ConstantInt>(X)) 2587 if (CX->isOne()) 2588 return Y; 2589 if (auto *CY = dyn_cast<ConstantInt>(Y)) 2590 if (CY->isOne()) 2591 return X; 2592 VectorType *XVTy = dyn_cast<VectorType>(X->getType()); 2593 if (XVTy && !isa<VectorType>(Y->getType())) 2594 Y = B.CreateVectorSplat(XVTy->getElementCount(), Y); 2595 return B.CreateMul(X, Y); 2596 }; 2597 2598 switch (ID.getKind()) { 2599 case InductionDescriptor::IK_IntInduction: { 2600 assert(!isa<VectorType>(Index->getType()) && 2601 "Vector indices not supported for integer inductions yet"); 2602 assert(Index->getType() == StartValue->getType() && 2603 "Index type does not match StartValue type"); 2604 if (isa<ConstantInt>(Step) && cast<ConstantInt>(Step)->isMinusOne()) 2605 return B.CreateSub(StartValue, Index); 2606 auto *Offset = CreateMul(Index, Step); 2607 return CreateAdd(StartValue, Offset); 2608 } 2609 case InductionDescriptor::IK_PtrInduction: { 2610 assert(isa<Constant>(Step) && 2611 "Expected constant step for pointer induction"); 2612 return B.CreateGEP(ID.getElementType(), StartValue, CreateMul(Index, Step)); 2613 } 2614 case InductionDescriptor::IK_FpInduction: { 2615 assert(!isa<VectorType>(Index->getType()) && 2616 "Vector indices not supported for FP inductions yet"); 2617 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); 2618 auto InductionBinOp = ID.getInductionBinOp(); 2619 assert(InductionBinOp && 2620 (InductionBinOp->getOpcode() == Instruction::FAdd || 2621 InductionBinOp->getOpcode() == Instruction::FSub) && 2622 "Original bin op should be defined for FP induction"); 2623 2624 Value *MulExp = B.CreateFMul(Step, Index); 2625 return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, 2626 "induction"); 2627 } 2628 case InductionDescriptor::IK_NoInduction: 2629 return nullptr; 2630 } 2631 llvm_unreachable("invalid enum"); 2632 } 2633 2634 void InnerLoopVectorizer::widenIntOrFpInduction( 2635 PHINode *IV, VPWidenIntOrFpInductionRecipe *Def, VPTransformState &State, 2636 Value *CanonicalIV) { 2637 Value *Start = Def->getStartValue()->getLiveInIRValue(); 2638 const InductionDescriptor &ID = Def->getInductionDescriptor(); 2639 TruncInst *Trunc = Def->getTruncInst(); 2640 IRBuilderBase &Builder = State.Builder; 2641 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); 2642 assert(State.VF.isVector() && "must have vector VF"); 2643 2644 // The value from the original loop to which we are mapping the new induction 2645 // variable. 2646 Instruction *EntryVal = Trunc ? 
cast<Instruction>(Trunc) : IV; 2647 2648 auto &DL = EntryVal->getModule()->getDataLayout(); 2649 2650 // Generate code for the induction step. Note that induction steps are 2651 // required to be loop-invariant 2652 auto CreateStepValue = [&](const SCEV *Step) -> Value * { 2653 assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) && 2654 "Induction step should be loop invariant"); 2655 if (PSE.getSE()->isSCEVable(IV->getType())) { 2656 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 2657 return Exp.expandCodeFor(Step, Step->getType(), 2658 State.CFG.VectorPreHeader->getTerminator()); 2659 } 2660 return cast<SCEVUnknown>(Step)->getValue(); 2661 }; 2662 2663 // The scalar value to broadcast. This is derived from the canonical 2664 // induction variable. If a truncation type is given, truncate the canonical 2665 // induction variable and step. Otherwise, derive these values from the 2666 // induction descriptor. 2667 auto CreateScalarIV = [&](Value *&Step) -> Value * { 2668 Value *ScalarIV = CanonicalIV; 2669 Type *NeededType = IV->getType(); 2670 if (!Def->isCanonical() || ScalarIV->getType() != NeededType) { 2671 ScalarIV = 2672 NeededType->isIntegerTy() 2673 ? Builder.CreateSExtOrTrunc(ScalarIV, NeededType) 2674 : Builder.CreateCast(Instruction::SIToFP, ScalarIV, NeededType); 2675 ScalarIV = emitTransformedIndex(Builder, ScalarIV, Start, Step, ID); 2676 ScalarIV->setName("offset.idx"); 2677 } 2678 if (Trunc) { 2679 auto *TruncType = cast<IntegerType>(Trunc->getType()); 2680 assert(Step->getType()->isIntegerTy() && 2681 "Truncation requires an integer step"); 2682 ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType); 2683 Step = Builder.CreateTrunc(Step, TruncType); 2684 } 2685 return ScalarIV; 2686 }; 2687 2688 // Fast-math-flags propagate from the original induction instruction. 2689 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 2690 if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp())) 2691 Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags()); 2692 2693 // Now do the actual transformations, and start with creating the step value. 2694 Value *Step = CreateStepValue(ID.getStep()); 2695 2696 // Create a new independent vector induction variable. Later VPlan2VPlan 2697 // optimizations will remove it, if it won't be needed, e.g. because all users 2698 // of it access scalar values. 2699 createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, State); 2700 2701 if (Def->needsScalarIV()) { 2702 // Create scalar steps that can be used by instructions we will later 2703 // scalarize. Note that the addition of the scalar steps will not increase 2704 // the number of instructions in the loop in the common case prior to 2705 // InstCombine. We will be trading one vector extract for each scalar step. 2706 Value *ScalarIV = CreateScalarIV(Step); 2707 buildScalarSteps(ScalarIV, Step, ID, Def, State); 2708 } 2709 } 2710 2711 void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def, 2712 const VPIteration &Instance, 2713 VPTransformState &State) { 2714 Value *ScalarInst = State.get(Def, Instance); 2715 Value *VectorValue = State.get(Def, Instance.Part); 2716 VectorValue = Builder.CreateInsertElement( 2717 VectorValue, ScalarInst, 2718 Instance.Lane.getAsRuntimeExpr(State.Builder, VF)); 2719 State.set(Def, VectorValue, Instance.Part); 2720 } 2721 2722 // Return whether we allow using masked interleave-groups (for dealing with 2723 // strided loads/stores that reside in predicated blocks, or for dealing 2724 // with gaps). 
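// For example (illustrative), a factor-2 group with a gap, { A[2*i], <gap> },
// can still be widened into a single wide load when the target supports
// masked loads: with VF = 4 the 8-element wide load executes under the mask
//   <1,0,1,0,1,0,1,0>
// so the missing members are never accessed (see createBitMaskForGaps and
// the masked path in vectorizeInterleaveGroup).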
2725 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { 2726 // If an override option has been passed in for interleaved accesses, use it. 2727 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) 2728 return EnableMaskedInterleavedMemAccesses; 2729 2730 return TTI.enableMaskedInterleavedAccessVectorization(); 2731 } 2732 2733 // Try to vectorize the interleave group that \p Instr belongs to. 2734 // 2735 // E.g. Translate following interleaved load group (factor = 3): 2736 // for (i = 0; i < N; i+=3) { 2737 // R = Pic[i]; // Member of index 0 2738 // G = Pic[i+1]; // Member of index 1 2739 // B = Pic[i+2]; // Member of index 2 2740 // ... // do something to R, G, B 2741 // } 2742 // To: 2743 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2744 // %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements 2745 // %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements 2746 // %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements 2747 // 2748 // Or translate following interleaved store group (factor = 3): 2749 // for (i = 0; i < N; i+=3) { 2750 // ... do something to R, G, B 2751 // Pic[i] = R; // Member of index 0 2752 // Pic[i+1] = G; // Member of index 1 2753 // Pic[i+2] = B; // Member of index 2 2754 // } 2755 // To: 2756 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> 2757 // %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u> 2758 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, 2759 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements 2760 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B 2761 void InnerLoopVectorizer::vectorizeInterleaveGroup( 2762 const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs, 2763 VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues, 2764 VPValue *BlockInMask) { 2765 Instruction *Instr = Group->getInsertPos(); 2766 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2767 2768 // Prepare for the vector type of the interleaved load/store. 2769 Type *ScalarTy = getLoadStoreType(Instr); 2770 unsigned InterleaveFactor = Group->getFactor(); 2771 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2772 auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor); 2773 2774 // Prepare for the new pointers. 2775 SmallVector<Value *, 2> AddrParts; 2776 unsigned Index = Group->getIndex(Instr); 2777 2778 // TODO: extend the masked interleaved-group support to reversed access. 2779 assert((!BlockInMask || !Group->isReverse()) && 2780 "Reversed masked interleave-group not supported."); 2781 2782 // If the group is reverse, adjust the index to refer to the last vector lane 2783 // instead of the first. We adjust the index from the first vector lane, 2784 // rather than directly getting the pointer for lane VF - 1, because the 2785 // pointer operand of the interleaved access is supposed to be uniform. For 2786 // uniform instructions, we're only required to generate a value for the 2787 // first vector lane in each unroll iteration. 2788 if (Group->isReverse()) 2789 Index += (VF.getKnownMinValue() - 1) * Group->getFactor(); 2790 2791 for (unsigned Part = 0; Part < UF; Part++) { 2792 Value *AddrPart = State.get(Addr, VPIteration(Part, 0)); 2793 setDebugLocFromInst(AddrPart); 2794 2795 // Notice current instruction could be any index. Need to adjust the address 2796 // to the member of index 0. 2797 // 2798 // E.g. 
a = A[i+1]; // Member of index 1 (Current instruction) 2799 // b = A[i]; // Member of index 0 2800 // Current pointer is pointed to A[i+1], adjust it to A[i]. 2801 // 2802 // E.g. A[i+1] = a; // Member of index 1 2803 // A[i] = b; // Member of index 0 2804 // A[i+2] = c; // Member of index 2 (Current instruction) 2805 // Current pointer is pointed to A[i+2], adjust it to A[i]. 2806 2807 bool InBounds = false; 2808 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts())) 2809 InBounds = gep->isInBounds(); 2810 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index)); 2811 cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds); 2812 2813 // Cast to the vector pointer type. 2814 unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace(); 2815 Type *PtrTy = VecTy->getPointerTo(AddressSpace); 2816 AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy)); 2817 } 2818 2819 setDebugLocFromInst(Instr); 2820 Value *PoisonVec = PoisonValue::get(VecTy); 2821 2822 Value *MaskForGaps = nullptr; 2823 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) { 2824 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); 2825 assert(MaskForGaps && "Mask for Gaps is required but it is null"); 2826 } 2827 2828 // Vectorize the interleaved load group. 2829 if (isa<LoadInst>(Instr)) { 2830 // For each unroll part, create a wide load for the group. 2831 SmallVector<Value *, 2> NewLoads; 2832 for (unsigned Part = 0; Part < UF; Part++) { 2833 Instruction *NewLoad; 2834 if (BlockInMask || MaskForGaps) { 2835 assert(useMaskedInterleavedAccesses(*TTI) && 2836 "masked interleaved groups are not allowed."); 2837 Value *GroupMask = MaskForGaps; 2838 if (BlockInMask) { 2839 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2840 Value *ShuffledMask = Builder.CreateShuffleVector( 2841 BlockInMaskPart, 2842 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2843 "interleaved.mask"); 2844 GroupMask = MaskForGaps 2845 ? Builder.CreateBinOp(Instruction::And, ShuffledMask, 2846 MaskForGaps) 2847 : ShuffledMask; 2848 } 2849 NewLoad = 2850 Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(), 2851 GroupMask, PoisonVec, "wide.masked.vec"); 2852 } 2853 else 2854 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], 2855 Group->getAlign(), "wide.vec"); 2856 Group->addMetadata(NewLoad); 2857 NewLoads.push_back(NewLoad); 2858 } 2859 2860 // For each member in the group, shuffle out the appropriate data from the 2861 // wide loads. 2862 unsigned J = 0; 2863 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2864 Instruction *Member = Group->getMember(I); 2865 2866 // Skip the gaps in the group. 2867 if (!Member) 2868 continue; 2869 2870 auto StrideMask = 2871 createStrideMask(I, InterleaveFactor, VF.getKnownMinValue()); 2872 for (unsigned Part = 0; Part < UF; Part++) { 2873 Value *StridedVec = Builder.CreateShuffleVector( 2874 NewLoads[Part], StrideMask, "strided.vec"); 2875 2876 // If this member has different type, cast the result type. 
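// For illustration only (assuming VF = 4 and an interleave factor of 2): if
// the insert position loads i32 but another member loads float, the float
// member is first shuffled out of the wide <8 x i32> load as a <4 x i32>
// "strided.vec" and then bitcast to <4 x float> by createBitOrPointerCast
// below.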
2877 if (Member->getType() != ScalarTy) { 2878 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2879 VectorType *OtherVTy = VectorType::get(Member->getType(), VF); 2880 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); 2881 } 2882 2883 if (Group->isReverse()) 2884 StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse"); 2885 2886 State.set(VPDefs[J], StridedVec, Part); 2887 } 2888 ++J; 2889 } 2890 return; 2891 } 2892 2893 // The sub vector type for current instruction. 2894 auto *SubVT = VectorType::get(ScalarTy, VF); 2895 2896 // Vectorize the interleaved store group. 2897 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); 2898 assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) && 2899 "masked interleaved groups are not allowed."); 2900 assert((!MaskForGaps || !VF.isScalable()) && 2901 "masking gaps for scalable vectors is not yet supported."); 2902 for (unsigned Part = 0; Part < UF; Part++) { 2903 // Collect the stored vector from each member. 2904 SmallVector<Value *, 4> StoredVecs; 2905 for (unsigned i = 0; i < InterleaveFactor; i++) { 2906 assert((Group->getMember(i) || MaskForGaps) && 2907 "Fail to get a member from an interleaved store group"); 2908 Instruction *Member = Group->getMember(i); 2909 2910 // Skip the gaps in the group. 2911 if (!Member) { 2912 Value *Undef = PoisonValue::get(SubVT); 2913 StoredVecs.push_back(Undef); 2914 continue; 2915 } 2916 2917 Value *StoredVec = State.get(StoredValues[i], Part); 2918 2919 if (Group->isReverse()) 2920 StoredVec = Builder.CreateVectorReverse(StoredVec, "reverse"); 2921 2922 // If this member has different type, cast it to a unified type. 2923 2924 if (StoredVec->getType() != SubVT) 2925 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL); 2926 2927 StoredVecs.push_back(StoredVec); 2928 } 2929 2930 // Concatenate all vectors into a wide vector. 2931 Value *WideVec = concatenateVectors(Builder, StoredVecs); 2932 2933 // Interleave the elements in the wide vector. 2934 Value *IVec = Builder.CreateShuffleVector( 2935 WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor), 2936 "interleaved.vec"); 2937 2938 Instruction *NewStoreInstr; 2939 if (BlockInMask || MaskForGaps) { 2940 Value *GroupMask = MaskForGaps; 2941 if (BlockInMask) { 2942 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2943 Value *ShuffledMask = Builder.CreateShuffleVector( 2944 BlockInMaskPart, 2945 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2946 "interleaved.mask"); 2947 GroupMask = MaskForGaps ? Builder.CreateBinOp(Instruction::And, 2948 ShuffledMask, MaskForGaps) 2949 : ShuffledMask; 2950 } 2951 NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part], 2952 Group->getAlign(), GroupMask); 2953 } else 2954 NewStoreInstr = 2955 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign()); 2956 2957 Group->addMetadata(NewStoreInstr); 2958 } 2959 } 2960 2961 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, 2962 VPReplicateRecipe *RepRecipe, 2963 const VPIteration &Instance, 2964 bool IfPredicateInstr, 2965 VPTransformState &State) { 2966 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 2967 2968 // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for 2969 // the first lane and part. 2970 if (isa<NoAliasScopeDeclInst>(Instr)) 2971 if (!Instance.isFirstIteration()) 2972 return; 2973 2974 setDebugLocFromInst(Instr); 2975 2976 // Does this instruction return a value ? 
2977 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 2978 2979 Instruction *Cloned = Instr->clone(); 2980 if (!IsVoidRetTy) 2981 Cloned->setName(Instr->getName() + ".cloned"); 2982 2983 // If the scalarized instruction contributes to the address computation of a 2984 // widen masked load/store which was in a basic block that needed predication 2985 // and is not predicated after vectorization, we can't propagate 2986 // poison-generating flags (nuw/nsw, exact, inbounds, etc.). The scalarized 2987 // instruction could feed a poison value to the base address of the widen 2988 // load/store. 2989 if (State.MayGeneratePoisonRecipes.contains(RepRecipe)) 2990 Cloned->dropPoisonGeneratingFlags(); 2991 2992 State.Builder.SetInsertPoint(Builder.GetInsertBlock(), 2993 Builder.GetInsertPoint()); 2994 // Replace the operands of the cloned instructions with their scalar 2995 // equivalents in the new loop. 2996 for (auto &I : enumerate(RepRecipe->operands())) { 2997 auto InputInstance = Instance; 2998 VPValue *Operand = I.value(); 2999 VPReplicateRecipe *OperandR = dyn_cast<VPReplicateRecipe>(Operand); 3000 if (OperandR && OperandR->isUniform()) 3001 InputInstance.Lane = VPLane::getFirstLane(); 3002 Cloned->setOperand(I.index(), State.get(Operand, InputInstance)); 3003 } 3004 addNewMetadata(Cloned, Instr); 3005 3006 // Place the cloned scalar in the new loop. 3007 Builder.Insert(Cloned); 3008 3009 State.set(RepRecipe, Cloned, Instance); 3010 3011 // If we just cloned a new assumption, add it the assumption cache. 3012 if (auto *II = dyn_cast<AssumeInst>(Cloned)) 3013 AC->registerAssumption(II); 3014 3015 // End if-block. 3016 if (IfPredicateInstr) 3017 PredicatedInstructions.push_back(Cloned); 3018 } 3019 3020 void InnerLoopVectorizer::createHeaderBranch(Loop *L) { 3021 BasicBlock *Header = L->getHeader(); 3022 assert(!L->getLoopLatch() && "loop should not have a latch at this point"); 3023 3024 IRBuilder<> B(Header->getTerminator()); 3025 Instruction *OldInst = 3026 getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()); 3027 setDebugLocFromInst(OldInst, &B); 3028 3029 // Connect the header to the exit and header blocks and replace the old 3030 // terminator. 3031 B.CreateCondBr(B.getTrue(), L->getUniqueExitBlock(), Header); 3032 3033 // Now we have two terminators. Remove the old one from the block. 3034 Header->getTerminator()->eraseFromParent(); 3035 } 3036 3037 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { 3038 if (TripCount) 3039 return TripCount; 3040 3041 assert(L && "Create Trip Count for null loop."); 3042 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 3043 // Find the loop boundaries. 3044 ScalarEvolution *SE = PSE.getSE(); 3045 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 3046 assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && 3047 "Invalid loop count"); 3048 3049 Type *IdxTy = Legal->getWidestInductionType(); 3050 assert(IdxTy && "No type for induction"); 3051 3052 // The exit count might have the type of i64 while the phi is i32. This can 3053 // happen if we have an induction variable that is sign extended before the 3054 // compare. The only way that we get a backedge taken count is that the 3055 // induction variable was signed and as such will not overflow. In such a case 3056 // truncation is legal. 
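// A minimal sketch of that situation (types are illustrative):
//   for (i32 %i = 0; sext(%i) < %n; ++%i) ...
// yields a backedge-taken count of type i64 even though the widest induction
// type is i32; since the induction cannot overflow, truncating the count back
// to i32 below is safe.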
3057 if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) > 3058 IdxTy->getPrimitiveSizeInBits()) 3059 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy); 3060 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); 3061 3062 // Get the total trip count from the count by adding 1. 3063 const SCEV *ExitCount = SE->getAddExpr( 3064 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 3065 3066 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); 3067 3068 // Expand the trip count and place the new instructions in the preheader. 3069 // Notice that the pre-header does not change, only the loop body. 3070 SCEVExpander Exp(*SE, DL, "induction"); 3071 3072 // Count holds the overall loop count (N). 3073 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), 3074 L->getLoopPreheader()->getTerminator()); 3075 3076 if (TripCount->getType()->isPointerTy()) 3077 TripCount = 3078 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", 3079 L->getLoopPreheader()->getTerminator()); 3080 3081 return TripCount; 3082 } 3083 3084 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { 3085 if (VectorTripCount) 3086 return VectorTripCount; 3087 3088 Value *TC = getOrCreateTripCount(L); 3089 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 3090 3091 Type *Ty = TC->getType(); 3092 // This is where we can make the step a runtime constant. 3093 Value *Step = createStepForVF(Builder, Ty, VF, UF); 3094 3095 // If the tail is to be folded by masking, round the number of iterations N 3096 // up to a multiple of Step instead of rounding down. This is done by first 3097 // adding Step-1 and then rounding down. Note that it's ok if this addition 3098 // overflows: the vector induction variable will eventually wrap to zero given 3099 // that it starts at zero and its Step is a power of two; the loop will then 3100 // exit, with the last early-exit vector comparison also producing all-true. 3101 if (Cost->foldTailByMasking()) { 3102 assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && 3103 "VF*UF must be a power of 2 when folding tail by masking"); 3104 Value *NumLanes = getRuntimeVF(Builder, Ty, VF * UF); 3105 TC = Builder.CreateAdd( 3106 TC, Builder.CreateSub(NumLanes, ConstantInt::get(Ty, 1)), "n.rnd.up"); 3107 } 3108 3109 // Now we need to generate the expression for the part of the loop that the 3110 // vectorized body will execute. This is equal to N - (N % Step) if scalar 3111 // iterations are not required for correctness, or N - Step, otherwise. Step 3112 // is equal to the vectorization factor (number of SIMD elements) times the 3113 // unroll factor (number of SIMD instructions). 3114 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); 3115 3116 // There are cases where we *must* run at least one iteration in the remainder 3117 // loop. See the cost model for when this can happen. If the step evenly 3118 // divides the trip count, we set the remainder to be equal to the step. If 3119 // the step does not evenly divide the trip count, no adjustment is necessary 3120 // since there will already be scalar iterations. Note that the minimum 3121 // iterations check ensures that N >= Step. 
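// Worked example (numbers are illustrative): with Step = VF * UF = 8 and
// trip count N = 24, R = 24 % 8 = 0, so R is bumped to 8 below and the vector
// loop covers 16 iterations, leaving 8 for the required scalar epilogue; with
// N = 20, R = 4 already and no adjustment is needed.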
3122 if (Cost->requiresScalarEpilogue(VF)) { 3123 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); 3124 R = Builder.CreateSelect(IsZero, Step, R); 3125 } 3126 3127 VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); 3128 3129 return VectorTripCount; 3130 } 3131 3132 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, 3133 const DataLayout &DL) { 3134 // Verify that V is a vector type with same number of elements as DstVTy. 3135 auto *DstFVTy = cast<FixedVectorType>(DstVTy); 3136 unsigned VF = DstFVTy->getNumElements(); 3137 auto *SrcVecTy = cast<FixedVectorType>(V->getType()); 3138 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); 3139 Type *SrcElemTy = SrcVecTy->getElementType(); 3140 Type *DstElemTy = DstFVTy->getElementType(); 3141 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 3142 "Vector elements must have same size"); 3143 3144 // Do a direct cast if element types are castable. 3145 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 3146 return Builder.CreateBitOrPointerCast(V, DstFVTy); 3147 } 3148 // V cannot be directly casted to desired vector type. 3149 // May happen when V is a floating point vector but DstVTy is a vector of 3150 // pointers or vice-versa. Handle this using a two-step bitcast using an 3151 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 3152 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 3153 "Only one type should be a pointer type"); 3154 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 3155 "Only one type should be a floating point type"); 3156 Type *IntTy = 3157 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 3158 auto *VecIntTy = FixedVectorType::get(IntTy, VF); 3159 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 3160 return Builder.CreateBitOrPointerCast(CastVal, DstFVTy); 3161 } 3162 3163 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, 3164 BasicBlock *Bypass) { 3165 Value *Count = getOrCreateTripCount(L); 3166 // Reuse existing vector loop preheader for TC checks. 3167 // Note that new preheader block is generated for vector loop. 3168 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 3169 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 3170 3171 // Generate code to check if the loop's trip count is less than VF * UF, or 3172 // equal to it in case a scalar epilogue is required; this implies that the 3173 // vector trip count is zero. This check also covers the case where adding one 3174 // to the backedge-taken count overflowed leading to an incorrect trip count 3175 // of zero. In this case we will also jump to the scalar loop. 3176 auto P = Cost->requiresScalarEpilogue(VF) ? ICmpInst::ICMP_ULE 3177 : ICmpInst::ICMP_ULT; 3178 3179 // If tail is to be folded, vector loop takes care of all iterations. 3180 Value *CheckMinIters = Builder.getFalse(); 3181 if (!Cost->foldTailByMasking()) { 3182 Value *Step = createStepForVF(Builder, Count->getType(), VF, UF); 3183 CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check"); 3184 } 3185 // Create new preheader for vector loop. 
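// Roughly, the split and the branch created below leave the CFG looking like
// (block names are illustrative):
//   %min.iters.check = icmp ult i64 %count, %step
//   br i1 %min.iters.check, label %scalar.ph, label %vector.ph
// i.e. trip counts that are too small bypass the vector loop entirely (icmp
// ule is used instead when a scalar epilogue is required).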
3186 LoopVectorPreHeader = 3187 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, 3188 "vector.ph"); 3189 3190 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 3191 DT->getNode(Bypass)->getIDom()) && 3192 "TC check is expected to dominate Bypass"); 3193 3194 // Update dominator for Bypass & LoopExit (if needed). 3195 DT->changeImmediateDominator(Bypass, TCCheckBlock); 3196 if (!Cost->requiresScalarEpilogue(VF)) 3197 // If there is an epilogue which must run, there's no edge from the 3198 // middle block to exit blocks and thus no need to update the immediate 3199 // dominator of the exit blocks. 3200 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 3201 3202 ReplaceInstWithInst( 3203 TCCheckBlock->getTerminator(), 3204 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 3205 LoopBypassBlocks.push_back(TCCheckBlock); 3206 } 3207 3208 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { 3209 3210 BasicBlock *const SCEVCheckBlock = 3211 RTChecks.emitSCEVChecks(L, Bypass, LoopVectorPreHeader, LoopExitBlock); 3212 if (!SCEVCheckBlock) 3213 return nullptr; 3214 3215 assert(!(SCEVCheckBlock->getParent()->hasOptSize() || 3216 (OptForSizeBasedOnProfile && 3217 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) && 3218 "Cannot SCEV check stride or overflow when optimizing for size"); 3219 3220 3221 // Update dominator only if this is first RT check. 3222 if (LoopBypassBlocks.empty()) { 3223 DT->changeImmediateDominator(Bypass, SCEVCheckBlock); 3224 if (!Cost->requiresScalarEpilogue(VF)) 3225 // If there is an epilogue which must run, there's no edge from the 3226 // middle block to exit blocks and thus no need to update the immediate 3227 // dominator of the exit blocks. 3228 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock); 3229 } 3230 3231 LoopBypassBlocks.push_back(SCEVCheckBlock); 3232 AddedSafetyChecks = true; 3233 return SCEVCheckBlock; 3234 } 3235 3236 BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, 3237 BasicBlock *Bypass) { 3238 // VPlan-native path does not do any analysis for runtime checks currently. 3239 if (EnableVPlanNativePath) 3240 return nullptr; 3241 3242 BasicBlock *const MemCheckBlock = 3243 RTChecks.emitMemRuntimeChecks(L, Bypass, LoopVectorPreHeader); 3244 3245 // Check if we generated code that checks in runtime if arrays overlap. We put 3246 // the checks into a separate block to make the more common case of few 3247 // elements faster. 3248 if (!MemCheckBlock) 3249 return nullptr; 3250 3251 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) { 3252 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && 3253 "Cannot emit memory checks when optimizing for size, unless forced " 3254 "to vectorize."); 3255 ORE->emit([&]() { 3256 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize", 3257 L->getStartLoc(), L->getHeader()) 3258 << "Code-size may be reduced by not forcing " 3259 "vectorization, or by source-code modifications " 3260 "eliminating the need for runtime checks " 3261 "(e.g., adding 'restrict')."; 3262 }); 3263 } 3264 3265 LoopBypassBlocks.push_back(MemCheckBlock); 3266 3267 AddedSafetyChecks = true; 3268 3269 // We currently don't use LoopVersioning for the actual loop cloning but we 3270 // still use it to add the noalias metadata. 
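// Illustrative effect (the scope names are whatever LoopVersioning picks):
// once the runtime check has proven the pointer groups disjoint, memory
// instructions in the vector loop are annotated with !alias.scope / !noalias
// metadata so later passes may assume the checked pointers do not alias.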
3271 LVer = std::make_unique<LoopVersioning>( 3272 *Legal->getLAI(), 3273 Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, 3274 DT, PSE.getSE()); 3275 LVer->prepareNoAliasMetadata(); 3276 return MemCheckBlock; 3277 } 3278 3279 Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { 3280 LoopScalarBody = OrigLoop->getHeader(); 3281 LoopVectorPreHeader = OrigLoop->getLoopPreheader(); 3282 assert(LoopVectorPreHeader && "Invalid loop structure"); 3283 LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr 3284 assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF)) && 3285 "multiple exit loop without required epilogue?"); 3286 3287 LoopMiddleBlock = 3288 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 3289 LI, nullptr, Twine(Prefix) + "middle.block"); 3290 LoopScalarPreHeader = 3291 SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI, 3292 nullptr, Twine(Prefix) + "scalar.ph"); 3293 3294 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3295 3296 // Set up the middle block terminator. Two cases: 3297 // 1) If we know that we must execute the scalar epilogue, emit an 3298 // unconditional branch. 3299 // 2) Otherwise, we must have a single unique exit block (due to how we 3300 // implement the multiple exit case). In this case, set up a conditonal 3301 // branch from the middle block to the loop scalar preheader, and the 3302 // exit block. completeLoopSkeleton will update the condition to use an 3303 // iteration check, if required to decide whether to execute the remainder. 3304 BranchInst *BrInst = Cost->requiresScalarEpilogue(VF) ? 3305 BranchInst::Create(LoopScalarPreHeader) : 3306 BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, 3307 Builder.getTrue()); 3308 BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3309 ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst); 3310 3311 // We intentionally don't let SplitBlock to update LoopInfo since 3312 // LoopVectorBody should belong to another loop than LoopVectorPreHeader. 3313 // LoopVectorBody is explicitly added to the correct place few lines later. 3314 LoopVectorBody = 3315 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 3316 nullptr, nullptr, Twine(Prefix) + "vector.body"); 3317 3318 // Update dominator for loop exit. 3319 if (!Cost->requiresScalarEpilogue(VF)) 3320 // If there is an epilogue which must run, there's no edge from the 3321 // middle block to exit blocks and thus no need to update the immediate 3322 // dominator of the exit blocks. 3323 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); 3324 3325 // Create and register the new vector loop. 3326 Loop *Lp = LI->AllocateLoop(); 3327 Loop *ParentLoop = OrigLoop->getParentLoop(); 3328 3329 // Insert the new loop into the loop nest and register the new basic blocks 3330 // before calling any utilities such as SCEV that require valid LoopInfo. 
3331 if (ParentLoop) { 3332 ParentLoop->addChildLoop(Lp); 3333 } else { 3334 LI->addTopLevelLoop(Lp); 3335 } 3336 Lp->addBasicBlockToLoop(LoopVectorBody, *LI); 3337 return Lp; 3338 } 3339 3340 void InnerLoopVectorizer::createInductionResumeValues( 3341 Loop *L, std::pair<BasicBlock *, Value *> AdditionalBypass) { 3342 assert(((AdditionalBypass.first && AdditionalBypass.second) || 3343 (!AdditionalBypass.first && !AdditionalBypass.second)) && 3344 "Inconsistent information about additional bypass."); 3345 3346 Value *VectorTripCount = getOrCreateVectorTripCount(L); 3347 assert(VectorTripCount && L && "Expected valid arguments"); 3348 // We are going to resume the execution of the scalar loop. 3349 // Go over all of the induction variables that we found and fix the 3350 // PHIs that are left in the scalar version of the loop. 3351 // The starting values of PHI nodes depend on the counter of the last 3352 // iteration in the vectorized loop. 3353 // If we come from a bypass edge then we need to start from the original 3354 // start value. 3355 Instruction *OldInduction = Legal->getPrimaryInduction(); 3356 for (auto &InductionEntry : Legal->getInductionVars()) { 3357 PHINode *OrigPhi = InductionEntry.first; 3358 InductionDescriptor II = InductionEntry.second; 3359 3360 // Create phi nodes to merge from the backedge-taken check block. 3361 PHINode *BCResumeVal = 3362 PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", 3363 LoopScalarPreHeader->getTerminator()); 3364 // Copy original phi DL over to the new one. 3365 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); 3366 Value *&EndValue = IVEndValues[OrigPhi]; 3367 Value *EndValueFromAdditionalBypass = AdditionalBypass.second; 3368 if (OrigPhi == OldInduction) { 3369 // We know what the end value is. 3370 EndValue = VectorTripCount; 3371 } else { 3372 IRBuilder<> B(L->getLoopPreheader()->getTerminator()); 3373 3374 // Fast-math-flags propagate from the original induction instruction. 3375 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) 3376 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); 3377 3378 Type *StepType = II.getStep()->getType(); 3379 Instruction::CastOps CastOp = 3380 CastInst::getCastOpcode(VectorTripCount, true, StepType, true); 3381 Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd"); 3382 Value *Step = 3383 CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint()); 3384 EndValue = emitTransformedIndex(B, CRD, II.getStartValue(), Step, II); 3385 EndValue->setName("ind.end"); 3386 3387 // Compute the end value for the additional bypass (if applicable). 3388 if (AdditionalBypass.first) { 3389 B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt())); 3390 CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true, 3391 StepType, true); 3392 Value *Step = 3393 CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint()); 3394 CRD = 3395 B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd"); 3396 EndValueFromAdditionalBypass = 3397 emitTransformedIndex(B, CRD, II.getStartValue(), Step, II); 3398 EndValueFromAdditionalBypass->setName("ind.end"); 3399 } 3400 } 3401 // The new PHI merges the original incoming value, in case of a bypass, 3402 // or the value at the end of the vectorized loop. 3403 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); 3404 3405 // Fix the scalar body counter (PHI node). 3406 // The old induction's phi node in the scalar body needs the truncated 3407 // value. 
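// The resume phi built above ends up with roughly this shape (names and the
// i64 type are illustrative):
//   %bc.resume.val = phi i64 [ %ind.end, %middle.block ],
//                            [ %start, <bypass block> ], ...
// i.e. the vector-loop end value when arriving from the middle block, and the
// original start value along every bypass edge added below.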
3408 for (BasicBlock *BB : LoopBypassBlocks) 3409 BCResumeVal->addIncoming(II.getStartValue(), BB); 3410 3411 if (AdditionalBypass.first) 3412 BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first, 3413 EndValueFromAdditionalBypass); 3414 3415 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); 3416 } 3417 } 3418 3419 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L, 3420 MDNode *OrigLoopID) { 3421 assert(L && "Expected valid loop."); 3422 3423 // The trip counts should be cached by now. 3424 Value *Count = getOrCreateTripCount(L); 3425 Value *VectorTripCount = getOrCreateVectorTripCount(L); 3426 3427 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3428 3429 // Add a check in the middle block to see if we have completed 3430 // all of the iterations in the first vector loop. Three cases: 3431 // 1) If we require a scalar epilogue, there is no conditional branch as 3432 // we unconditionally branch to the scalar preheader. Do nothing. 3433 // 2) If (N - N%VF) == N, then we *don't* need to run the remainder. 3434 // Thus if tail is to be folded, we know we don't need to run the 3435 // remainder and we can use the previous value for the condition (true). 3436 // 3) Otherwise, construct a runtime check. 3437 if (!Cost->requiresScalarEpilogue(VF) && !Cost->foldTailByMasking()) { 3438 Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, 3439 Count, VectorTripCount, "cmp.n", 3440 LoopMiddleBlock->getTerminator()); 3441 3442 // Here we use the same DebugLoc as the scalar loop latch terminator instead 3443 // of the corresponding compare because they may have ended up with 3444 // different line numbers and we want to avoid awkward line stepping while 3445 // debugging. Eg. if the compare has got a line number inside the loop. 3446 CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3447 cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN); 3448 } 3449 3450 // Get ready to start creating new instructions into the vectorized body. 3451 assert(LoopVectorPreHeader == L->getLoopPreheader() && 3452 "Inconsistent vector loop preheader"); 3453 Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); 3454 3455 #ifdef EXPENSIVE_CHECKS 3456 assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 3457 LI->verify(*DT); 3458 #endif 3459 3460 return LoopVectorPreHeader; 3461 } 3462 3463 std::pair<BasicBlock *, Value *> 3464 InnerLoopVectorizer::createVectorizedLoopSkeleton() { 3465 /* 3466 In this function we generate a new loop. The new loop will contain 3467 the vectorized instructions while the old loop will continue to run the 3468 scalar remainder. 3469 3470 [ ] <-- loop iteration number check. 3471 / | 3472 / v 3473 | [ ] <-- vector loop bypass (may consist of multiple blocks). 3474 | / | 3475 | / v 3476 || [ ] <-- vector pre header. 3477 |/ | 3478 | v 3479 | [ ] \ 3480 | [ ]_| <-- vector loop. 3481 | | 3482 | v 3483 \ -[ ] <--- middle-block. 3484 \/ | 3485 /\ v 3486 | ->[ ] <--- new preheader. 3487 | | 3488 (opt) v <-- edge from middle to exit iff epilogue is not required. 3489 | [ ] \ 3490 | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue). 3491 \ | 3492 \ v 3493 >[ ] <-- exit block(s). 3494 ... 3495 */ 3496 3497 // Get the metadata of the original loop before it gets modified. 3498 MDNode *OrigLoopID = OrigLoop->getLoopID(); 3499 3500 // Workaround! Compute the trip count of the original loop and cache it 3501 // before we start modifying the CFG. 
This code has a systemic problem 3502 // wherein it tries to run analysis over partially constructed IR; this is 3503 // wrong, and not simply for SCEV. The trip count of the original loop 3504 // simply happens to be prone to hitting this in practice. In theory, we 3505 // can hit the same issue for any SCEV, or ValueTracking query done during 3506 // mutation. See PR49900. 3507 getOrCreateTripCount(OrigLoop); 3508 3509 // Create an empty vector loop, and prepare basic blocks for the runtime 3510 // checks. 3511 Loop *Lp = createVectorLoopSkeleton(""); 3512 3513 // Now, compare the new count to zero. If it is zero skip the vector loop and 3514 // jump to the scalar loop. This check also covers the case where the 3515 // backedge-taken count is uint##_max: adding one to it will overflow leading 3516 // to an incorrect trip count of zero. In this (rare) case we will also jump 3517 // to the scalar loop. 3518 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader); 3519 3520 // Generate the code to check any assumptions that we've made for SCEV 3521 // expressions. 3522 emitSCEVChecks(Lp, LoopScalarPreHeader); 3523 3524 // Generate the code that checks in runtime if arrays overlap. We put the 3525 // checks into a separate block to make the more common case of few elements 3526 // faster. 3527 emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 3528 3529 createHeaderBranch(Lp); 3530 3531 // Emit phis for the new starting index of the scalar loop. 3532 createInductionResumeValues(Lp); 3533 3534 return {completeLoopSkeleton(Lp, OrigLoopID), nullptr}; 3535 } 3536 3537 // Fix up external users of the induction variable. At this point, we are 3538 // in LCSSA form, with all external PHIs that use the IV having one input value, 3539 // coming from the remainder loop. We need those PHIs to also have a correct 3540 // value for the IV when arriving directly from the middle block. 3541 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, 3542 const InductionDescriptor &II, 3543 Value *CountRoundDown, Value *EndValue, 3544 BasicBlock *MiddleBlock) { 3545 // There are two kinds of external IV usages - those that use the value 3546 // computed in the last iteration (the PHI) and those that use the penultimate 3547 // value (the value that feeds into the phi from the loop latch). 3548 // We allow both, but they, obviously, have different values. 3549 3550 assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block"); 3551 3552 DenseMap<Value *, Value *> MissingVals; 3553 3554 // An external user of the last iteration's value should see the value that 3555 // the remainder loop uses to initialize its own IV. 3556 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch()); 3557 for (User *U : PostInc->users()) { 3558 Instruction *UI = cast<Instruction>(U); 3559 if (!OrigLoop->contains(UI)) { 3560 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3561 MissingVals[UI] = EndValue; 3562 } 3563 } 3564 3565 // An external user of the penultimate value need to see EndValue - Step. 3566 // The simplest way to get this is to recompute it from the constituent SCEVs, 3567 // that is Start + (Step * (CRD - 1)). 3568 for (User *U : OrigPhi->users()) { 3569 auto *UI = cast<Instruction>(U); 3570 if (!OrigLoop->contains(UI)) { 3571 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3572 3573 IRBuilder<> B(MiddleBlock->getTerminator()); 3574 3575 // Fast-math-flags propagate from the original induction instruction. 
3576 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) 3577 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); 3578 3579 Value *CountMinusOne = B.CreateSub( 3580 CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1)); 3581 Value *CMO = 3582 !II.getStep()->getType()->isIntegerTy() 3583 ? B.CreateCast(Instruction::SIToFP, CountMinusOne, 3584 II.getStep()->getType()) 3585 : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType()); 3586 CMO->setName("cast.cmo"); 3587 3588 Value *Step = CreateStepValue(II.getStep(), *PSE.getSE(), 3589 LoopVectorBody->getTerminator()); 3590 Value *Escape = 3591 emitTransformedIndex(B, CMO, II.getStartValue(), Step, II); 3592 Escape->setName("ind.escape"); 3593 MissingVals[UI] = Escape; 3594 } 3595 } 3596 3597 for (auto &I : MissingVals) { 3598 PHINode *PHI = cast<PHINode>(I.first); 3599 // One corner case we have to handle is two IVs "chasing" each-other, 3600 // that is %IV2 = phi [...], [ %IV1, %latch ] 3601 // In this case, if IV1 has an external use, we need to avoid adding both 3602 // "last value of IV1" and "penultimate value of IV2". So, verify that we 3603 // don't already have an incoming value for the middle block. 3604 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) 3605 PHI->addIncoming(I.second, MiddleBlock); 3606 } 3607 } 3608 3609 namespace { 3610 3611 struct CSEDenseMapInfo { 3612 static bool canHandle(const Instruction *I) { 3613 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || 3614 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); 3615 } 3616 3617 static inline Instruction *getEmptyKey() { 3618 return DenseMapInfo<Instruction *>::getEmptyKey(); 3619 } 3620 3621 static inline Instruction *getTombstoneKey() { 3622 return DenseMapInfo<Instruction *>::getTombstoneKey(); 3623 } 3624 3625 static unsigned getHashValue(const Instruction *I) { 3626 assert(canHandle(I) && "Unknown instruction!"); 3627 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), 3628 I->value_op_end())); 3629 } 3630 3631 static bool isEqual(const Instruction *LHS, const Instruction *RHS) { 3632 if (LHS == getEmptyKey() || RHS == getEmptyKey() || 3633 LHS == getTombstoneKey() || RHS == getTombstoneKey()) 3634 return LHS == RHS; 3635 return LHS->isIdenticalTo(RHS); 3636 } 3637 }; 3638 3639 } // end anonymous namespace 3640 3641 ///Perform cse of induction variable instructions. 3642 static void cse(BasicBlock *BB) { 3643 // Perform simple cse. 3644 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 3645 for (Instruction &In : llvm::make_early_inc_range(*BB)) { 3646 if (!CSEDenseMapInfo::canHandle(&In)) 3647 continue; 3648 3649 // Check if we can replace this instruction with any of the 3650 // visited instructions. 3651 if (Instruction *V = CSEMap.lookup(&In)) { 3652 In.replaceAllUsesWith(V); 3653 In.eraseFromParent(); 3654 continue; 3655 } 3656 3657 CSEMap[&In] = &In; 3658 } 3659 } 3660 3661 InstructionCost 3662 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF, 3663 bool &NeedToScalarize) const { 3664 Function *F = CI->getCalledFunction(); 3665 Type *ScalarRetTy = CI->getType(); 3666 SmallVector<Type *, 4> Tys, ScalarTys; 3667 for (auto &ArgOp : CI->args()) 3668 ScalarTys.push_back(ArgOp->getType()); 3669 3670 // Estimate cost of scalarized vector call. 
The source operands are assumed 3671 // to be vectors, so we need to extract individual elements from there, 3672 // execute VF scalar calls, and then gather the result into the vector return 3673 // value. 3674 InstructionCost ScalarCallCost = 3675 TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput); 3676 if (VF.isScalar()) 3677 return ScalarCallCost; 3678 3679 // Compute corresponding vector type for return value and arguments. 3680 Type *RetTy = ToVectorTy(ScalarRetTy, VF); 3681 for (Type *ScalarTy : ScalarTys) 3682 Tys.push_back(ToVectorTy(ScalarTy, VF)); 3683 3684 // Compute costs of unpacking argument values for the scalar calls and 3685 // packing the return values to a vector. 3686 InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF); 3687 3688 InstructionCost Cost = 3689 ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost; 3690 3691 // If we can't emit a vector call for this function, then the currently found 3692 // cost is the cost we need to return. 3693 NeedToScalarize = true; 3694 VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 3695 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); 3696 3697 if (!TLI || CI->isNoBuiltin() || !VecFunc) 3698 return Cost; 3699 3700 // If the corresponding vector cost is cheaper, return its cost. 3701 InstructionCost VectorCallCost = 3702 TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput); 3703 if (VectorCallCost < Cost) { 3704 NeedToScalarize = false; 3705 Cost = VectorCallCost; 3706 } 3707 return Cost; 3708 } 3709 3710 static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) { 3711 if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy())) 3712 return Elt; 3713 return VectorType::get(Elt, VF); 3714 } 3715 3716 InstructionCost 3717 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, 3718 ElementCount VF) const { 3719 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 3720 assert(ID && "Expected intrinsic call!"); 3721 Type *RetTy = MaybeVectorizeType(CI->getType(), VF); 3722 FastMathFlags FMF; 3723 if (auto *FPMO = dyn_cast<FPMathOperator>(CI)) 3724 FMF = FPMO->getFastMathFlags(); 3725 3726 SmallVector<const Value *> Arguments(CI->args()); 3727 FunctionType *FTy = CI->getCalledFunction()->getFunctionType(); 3728 SmallVector<Type *> ParamTys; 3729 std::transform(FTy->param_begin(), FTy->param_end(), 3730 std::back_inserter(ParamTys), 3731 [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); }); 3732 3733 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF, 3734 dyn_cast<IntrinsicInst>(CI)); 3735 return TTI.getIntrinsicInstrCost(CostAttrs, 3736 TargetTransformInfo::TCK_RecipThroughput); 3737 } 3738 3739 static Type *smallestIntegerVectorType(Type *T1, Type *T2) { 3740 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3741 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3742 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; 3743 } 3744 3745 static Type *largestIntegerVectorType(Type *T1, Type *T2) { 3746 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3747 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3748 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; 3749 } 3750 3751 void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) { 3752 // For every instruction `I` in MinBWs, truncate the operands, create a 3753 // truncated version of `I` and reextend its result. 
InstCombine runs 3754 // later and will remove any ext/trunc pairs. 3755 SmallPtrSet<Value *, 4> Erased; 3756 for (const auto &KV : Cost->getMinimalBitwidths()) { 3757 // If the value wasn't vectorized, we must maintain the original scalar 3758 // type. The absence of the value from State indicates that it 3759 // wasn't vectorized. 3760 // FIXME: Should not rely on getVPValue at this point. 3761 VPValue *Def = State.Plan->getVPValue(KV.first, true); 3762 if (!State.hasAnyVectorValue(Def)) 3763 continue; 3764 for (unsigned Part = 0; Part < UF; ++Part) { 3765 Value *I = State.get(Def, Part); 3766 if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I)) 3767 continue; 3768 Type *OriginalTy = I->getType(); 3769 Type *ScalarTruncatedTy = 3770 IntegerType::get(OriginalTy->getContext(), KV.second); 3771 auto *TruncatedTy = VectorType::get( 3772 ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getElementCount()); 3773 if (TruncatedTy == OriginalTy) 3774 continue; 3775 3776 IRBuilder<> B(cast<Instruction>(I)); 3777 auto ShrinkOperand = [&](Value *V) -> Value * { 3778 if (auto *ZI = dyn_cast<ZExtInst>(V)) 3779 if (ZI->getSrcTy() == TruncatedTy) 3780 return ZI->getOperand(0); 3781 return B.CreateZExtOrTrunc(V, TruncatedTy); 3782 }; 3783 3784 // The actual instruction modification depends on the instruction type, 3785 // unfortunately. 3786 Value *NewI = nullptr; 3787 if (auto *BO = dyn_cast<BinaryOperator>(I)) { 3788 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)), 3789 ShrinkOperand(BO->getOperand(1))); 3790 3791 // Any wrapping introduced by shrinking this operation shouldn't be 3792 // considered undefined behavior. So, we can't unconditionally copy 3793 // arithmetic wrapping flags to NewI. 3794 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false); 3795 } else if (auto *CI = dyn_cast<ICmpInst>(I)) { 3796 NewI = 3797 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)), 3798 ShrinkOperand(CI->getOperand(1))); 3799 } else if (auto *SI = dyn_cast<SelectInst>(I)) { 3800 NewI = B.CreateSelect(SI->getCondition(), 3801 ShrinkOperand(SI->getTrueValue()), 3802 ShrinkOperand(SI->getFalseValue())); 3803 } else if (auto *CI = dyn_cast<CastInst>(I)) { 3804 switch (CI->getOpcode()) { 3805 default: 3806 llvm_unreachable("Unhandled cast!"); 3807 case Instruction::Trunc: 3808 NewI = ShrinkOperand(CI->getOperand(0)); 3809 break; 3810 case Instruction::SExt: 3811 NewI = B.CreateSExtOrTrunc( 3812 CI->getOperand(0), 3813 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3814 break; 3815 case Instruction::ZExt: 3816 NewI = B.CreateZExtOrTrunc( 3817 CI->getOperand(0), 3818 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3819 break; 3820 } 3821 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { 3822 auto Elements0 = 3823 cast<VectorType>(SI->getOperand(0)->getType())->getElementCount(); 3824 auto *O0 = B.CreateZExtOrTrunc( 3825 SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0)); 3826 auto Elements1 = 3827 cast<VectorType>(SI->getOperand(1)->getType())->getElementCount(); 3828 auto *O1 = B.CreateZExtOrTrunc( 3829 SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1)); 3830 3831 NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask()); 3832 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) { 3833 // Don't do anything with the operands, just extend the result. 
3834 continue; 3835 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { 3836 auto Elements = 3837 cast<VectorType>(IE->getOperand(0)->getType())->getElementCount(); 3838 auto *O0 = B.CreateZExtOrTrunc( 3839 IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 3840 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); 3841 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); 3842 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { 3843 auto Elements = 3844 cast<VectorType>(EE->getOperand(0)->getType())->getElementCount(); 3845 auto *O0 = B.CreateZExtOrTrunc( 3846 EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 3847 NewI = B.CreateExtractElement(O0, EE->getOperand(2)); 3848 } else { 3849 // If we don't know what to do, be conservative and don't do anything. 3850 continue; 3851 } 3852 3853 // Lastly, extend the result. 3854 NewI->takeName(cast<Instruction>(I)); 3855 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); 3856 I->replaceAllUsesWith(Res); 3857 cast<Instruction>(I)->eraseFromParent(); 3858 Erased.insert(I); 3859 State.reset(Def, Res, Part); 3860 } 3861 } 3862 3863 // We'll have created a bunch of ZExts that are now parentless. Clean up. 3864 for (const auto &KV : Cost->getMinimalBitwidths()) { 3865 // If the value wasn't vectorized, we must maintain the original scalar 3866 // type. The absence of the value from State indicates that it 3867 // wasn't vectorized. 3868 // FIXME: Should not rely on getVPValue at this point. 3869 VPValue *Def = State.Plan->getVPValue(KV.first, true); 3870 if (!State.hasAnyVectorValue(Def)) 3871 continue; 3872 for (unsigned Part = 0; Part < UF; ++Part) { 3873 Value *I = State.get(Def, Part); 3874 ZExtInst *Inst = dyn_cast<ZExtInst>(I); 3875 if (Inst && Inst->use_empty()) { 3876 Value *NewI = Inst->getOperand(0); 3877 Inst->eraseFromParent(); 3878 State.reset(Def, NewI, Part); 3879 } 3880 } 3881 } 3882 } 3883 3884 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) { 3885 // Insert truncates and extends for any truncated instructions as hints to 3886 // InstCombine. 3887 if (VF.isVector()) 3888 truncateToMinimalBitwidths(State); 3889 3890 // Fix widened non-induction PHIs by setting up the PHI operands. 3891 if (OrigPHIsToFix.size()) { 3892 assert(EnableVPlanNativePath && 3893 "Unexpected non-induction PHIs for fixup in non VPlan-native path"); 3894 fixNonInductionPHIs(State); 3895 } 3896 3897 // At this point every instruction in the original loop is widened to a 3898 // vector form. Now we need to fix the recurrences in the loop. These PHI 3899 // nodes are currently empty because we did not want to introduce cycles. 3900 // This is the second stage of vectorizing recurrences. 3901 fixCrossIterationPHIs(State); 3902 3903 // Forget the original basic block. 3904 PSE.getSE()->forgetLoop(OrigLoop); 3905 3906 // If we inserted an edge from the middle block to the unique exit block, 3907 // update uses outside the loop (phis) to account for the newly inserted 3908 // edge. 3909 if (!Cost->requiresScalarEpilogue(VF)) { 3910 // Fix-up external users of the induction variables. 3911 for (auto &Entry : Legal->getInductionVars()) 3912 fixupIVUsers(Entry.first, Entry.second, 3913 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)), 3914 IVEndValues[Entry.first], LoopMiddleBlock); 3915 3916 fixLCSSAPHIs(State); 3917 } 3918 3919 for (Instruction *PI : PredicatedInstructions) 3920 sinkScalarOperands(&*PI); 3921 3922 // Remove redundant induction instructions. 
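// For example (purely illustrative), scalarized address computations can leave
// several identical GEPs or extractelement instructions in the vector body;
// the cse() call below replaces later duplicates with the first occurrence.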
  cse(LoopVectorBody);

  // Set/update profile weights for the vector and remainder loops as original
  // loop iterations are now distributed among them. Note that the original
  // loop, represented by LoopScalarBody, becomes the remainder loop after
  // vectorization.
  //
  // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
  // end up with a slightly less accurate result, but that should be OK since
  // the profile is not inherently precise anyway. Note also that a possible
  // bypass of the vector code caused by legality checks is ignored,
  // optimistically assigning all the weight to the vector loop.
  //
  // For scalable vectorization we can't know at compile time how many
  // iterations of the loop are handled in one vector iteration, so instead
  // assume a pessimistic vscale of '1'.
  setProfileInfoAfterUnrolling(
      LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody),
      LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF);
}

void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) {
  // In order to support recurrences we need to be able to vectorize Phi nodes.
  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
  // stage #2: We now need to fix the recurrences by adding incoming edges to
  // the currently empty PHI nodes. At this point every instruction in the
  // original loop is widened to a vector form so we can use them to construct
  // the incoming edges.
  VPBasicBlock *Header = State.Plan->getEntry()->getEntryBasicBlock();
  for (VPRecipeBase &R : Header->phis()) {
    if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R))
      fixReduction(ReductionPhi, State);
    else if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
      fixFirstOrderRecurrence(FOR, State);
  }
}

void InnerLoopVectorizer::fixFirstOrderRecurrence(
    VPFirstOrderRecurrencePHIRecipe *PhiR, VPTransformState &State) {
  // This is the second phase of vectorizing first-order recurrences. An
  // overview of the transformation is described below. Suppose we have the
  // following loop.
  //
  //   for (int i = 0; i < n; ++i)
  //     b[i] = a[i] - a[i - 1];
  //
  // There is a first-order recurrence on "a". For this loop, the shorthand
  // scalar IR looks like:
  //
  //   scalar.ph:
  //     s_init = a[-1]
  //     br scalar.body
  //
  //   scalar.body:
  //     i = phi [0, scalar.ph], [i+1, scalar.body]
  //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
  //     s2 = a[i]
  //     b[i] = s2 - s1
  //     br cond, scalar.body, ...
  //
  // In this example, s1 is a recurrence because its value depends on the
  // previous iteration. In the first phase of vectorization, we created a
  // vector phi v1 for s1. We now complete the vectorization and produce the
  // shorthand vector IR shown below (for VF = 4, UF = 1).
3986 // 3987 // vector.ph: 3988 // v_init = vector(..., ..., ..., a[-1]) 3989 // br vector.body 3990 // 3991 // vector.body 3992 // i = phi [0, vector.ph], [i+4, vector.body] 3993 // v1 = phi [v_init, vector.ph], [v2, vector.body] 3994 // v2 = a[i, i+1, i+2, i+3]; 3995 // v3 = vector(v1(3), v2(0, 1, 2)) 3996 // b[i, i+1, i+2, i+3] = v2 - v3 3997 // br cond, vector.body, middle.block 3998 // 3999 // middle.block: 4000 // x = v2(3) 4001 // br scalar.ph 4002 // 4003 // scalar.ph: 4004 // s_init = phi [x, middle.block], [a[-1], otherwise] 4005 // br scalar.body 4006 // 4007 // After execution completes the vector loop, we extract the next value of 4008 // the recurrence (x) to use as the initial value in the scalar loop. 4009 4010 // Extract the last vector element in the middle block. This will be the 4011 // initial value for the recurrence when jumping to the scalar loop. 4012 VPValue *PreviousDef = PhiR->getBackedgeValue(); 4013 Value *Incoming = State.get(PreviousDef, UF - 1); 4014 auto *ExtractForScalar = Incoming; 4015 auto *IdxTy = Builder.getInt32Ty(); 4016 if (VF.isVector()) { 4017 auto *One = ConstantInt::get(IdxTy, 1); 4018 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4019 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF); 4020 auto *LastIdx = Builder.CreateSub(RuntimeVF, One); 4021 ExtractForScalar = Builder.CreateExtractElement(ExtractForScalar, LastIdx, 4022 "vector.recur.extract"); 4023 } 4024 // Extract the second last element in the middle block if the 4025 // Phi is used outside the loop. We need to extract the phi itself 4026 // and not the last element (the phi update in the current iteration). This 4027 // will be the value when jumping to the exit block from the LoopMiddleBlock, 4028 // when the scalar loop is not run at all. 4029 Value *ExtractForPhiUsedOutsideLoop = nullptr; 4030 if (VF.isVector()) { 4031 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF); 4032 auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2)); 4033 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement( 4034 Incoming, Idx, "vector.recur.extract.for.phi"); 4035 } else if (UF > 1) 4036 // When loop is unrolled without vectorizing, initialize 4037 // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value 4038 // of `Incoming`. This is analogous to the vectorized case above: extracting 4039 // the second last element when VF > 1. 4040 ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2); 4041 4042 // Fix the initial value of the original recurrence in the scalar loop. 4043 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin()); 4044 PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue()); 4045 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init"); 4046 auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue(); 4047 for (auto *BB : predecessors(LoopScalarPreHeader)) { 4048 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit; 4049 Start->addIncoming(Incoming, BB); 4050 } 4051 4052 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start); 4053 Phi->setName("scalar.recur"); 4054 4055 // Finally, fix users of the recurrence outside the loop. The users will need 4056 // either the last value of the scalar recurrence or the last value of the 4057 // vector recurrence we extracted in the middle block. Since the loop is in 4058 // LCSSA form, we just need to find all the phi nodes for the original scalar 4059 // recurrence in the exit block, and then add an edge for the middle block. 
4060 // Note that LCSSA does not imply single entry when the original scalar loop 4061 // had multiple exiting edges (as we always run the last iteration in the 4062 // scalar epilogue); in that case, there is no edge from middle to exit, 4063 // and thus no phis which need to be updated. 4064 if (!Cost->requiresScalarEpilogue(VF)) 4065 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) 4066 if (llvm::is_contained(LCSSAPhi.incoming_values(), Phi)) 4067 LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock); 4068 } 4069 4070 void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR, 4071 VPTransformState &State) { 4072 PHINode *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue()); 4073 // Get its reduction variable descriptor. 4074 assert(Legal->isReductionVariable(OrigPhi) && 4075 "Unable to find the reduction variable"); 4076 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); 4077 4078 RecurKind RK = RdxDesc.getRecurrenceKind(); 4079 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue(); 4080 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr(); 4081 setDebugLocFromInst(ReductionStartValue); 4082 4083 VPValue *LoopExitInstDef = PhiR->getBackedgeValue(); 4084 // This is the vector-clone of the value that leaves the loop. 4085 Type *VecTy = State.get(LoopExitInstDef, 0)->getType(); 4086 4087 // Wrap flags are in general invalid after vectorization; clear them. 4088 clearReductionWrapFlags(RdxDesc, State); 4089 4090 // Before each round, move the insertion point right between 4091 // the PHIs and the values we are going to write. 4092 // This allows us to write both PHINodes and the extractelement 4093 // instructions. 4094 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 4095 4096 setDebugLocFromInst(LoopExitInst); 4097 4098 Type *PhiTy = OrigPhi->getType(); 4099 // If tail is folded by masking, the vector value to leave the loop should be 4100 // a Select choosing between the vectorized LoopExitInst and vectorized Phi, 4101 // instead of the former. For an inloop reduction the reduction will already 4102 // be predicated, and does not need to be handled here. 4103 if (Cost->foldTailByMasking() && !PhiR->isInLoop()) { 4104 for (unsigned Part = 0; Part < UF; ++Part) { 4105 Value *VecLoopExitInst = State.get(LoopExitInstDef, Part); 4106 Value *Sel = nullptr; 4107 for (User *U : VecLoopExitInst->users()) { 4108 if (isa<SelectInst>(U)) { 4109 assert(!Sel && "Reduction exit feeding two selects"); 4110 Sel = U; 4111 } else 4112 assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select"); 4113 } 4114 assert(Sel && "Reduction exit feeds no select"); 4115 State.reset(LoopExitInstDef, Sel, Part); 4116 4117 // If the target can create a predicated operator for the reduction at no 4118 // extra cost in the loop (for example a predicated vadd), it can be 4119 // cheaper for the select to remain in the loop than be sunk out of it, 4120 // and so use the select value for the phi instead of the old 4121 // LoopExitValue.
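// As a shorthand illustration only (not the exact IR that is generated): with
// a tail-folded add reduction the vector loop may contain
//   %rdx.next = add <4 x i32> %rdx.phi, %val
//   %rdx.sel  = select <4 x i1> %mask, <4 x i32> %rdx.next, <4 x i32> %rdx.phi
// and re-pointing the reduction phi at %rdx.sel below keeps the select inside
// the loop, where such a target can fold it into a single predicated add.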
4122 if (PreferPredicatedReductionSelect || 4123 TTI->preferPredicatedReductionSelect( 4124 RdxDesc.getOpcode(), PhiTy, 4125 TargetTransformInfo::ReductionFlags())) { 4126 auto *VecRdxPhi = 4127 cast<PHINode>(State.get(PhiR, Part)); 4128 VecRdxPhi->setIncomingValueForBlock( 4129 LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel); 4130 } 4131 } 4132 } 4133 4134 // If the vector reduction can be performed in a smaller type, we truncate 4135 // then extend the loop exit value to enable InstCombine to evaluate the 4136 // entire expression in the smaller type. 4137 if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) { 4138 assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!"); 4139 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); 4140 Builder.SetInsertPoint( 4141 LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator()); 4142 VectorParts RdxParts(UF); 4143 for (unsigned Part = 0; Part < UF; ++Part) { 4144 RdxParts[Part] = State.get(LoopExitInstDef, Part); 4145 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4146 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy) 4147 : Builder.CreateZExt(Trunc, VecTy); 4148 for (User *U : llvm::make_early_inc_range(RdxParts[Part]->users())) 4149 if (U != Trunc) { 4150 U->replaceUsesOfWith(RdxParts[Part], Extnd); 4151 RdxParts[Part] = Extnd; 4152 } 4153 } 4154 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 4155 for (unsigned Part = 0; Part < UF; ++Part) { 4156 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4157 State.reset(LoopExitInstDef, RdxParts[Part], Part); 4158 } 4159 } 4160 4161 // Reduce all of the unrolled parts into a single vector. 4162 Value *ReducedPartRdx = State.get(LoopExitInstDef, 0); 4163 unsigned Op = RecurrenceDescriptor::getOpcode(RK); 4164 4165 // The middle block terminator has already been assigned a DebugLoc here (the 4166 // OrigLoop's single latch terminator). We want the whole middle block to 4167 // appear to execute on this line because: (a) it is all compiler generated, 4168 // (b) these instructions are always executed after evaluating the latch 4169 // conditional branch, and (c) other passes may add new predecessors which 4170 // terminate on this line. This is the easiest way to ensure we don't 4171 // accidentally cause an extra step back into the loop while debugging. 4172 setDebugLocFromInst(LoopMiddleBlock->getTerminator()); 4173 if (PhiR->isOrdered()) 4174 ReducedPartRdx = State.get(LoopExitInstDef, UF - 1); 4175 else { 4176 // Floating-point operations should have some FMF to enable the reduction. 4177 IRBuilderBase::FastMathFlagGuard FMFG(Builder); 4178 Builder.setFastMathFlags(RdxDesc.getFastMathFlags()); 4179 for (unsigned Part = 1; Part < UF; ++Part) { 4180 Value *RdxPart = State.get(LoopExitInstDef, Part); 4181 if (Op != Instruction::ICmp && Op != Instruction::FCmp) { 4182 ReducedPartRdx = Builder.CreateBinOp( 4183 (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx"); 4184 } else if (RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK)) 4185 ReducedPartRdx = createSelectCmpOp(Builder, ReductionStartValue, RK, 4186 ReducedPartRdx, RdxPart); 4187 else 4188 ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart); 4189 } 4190 } 4191 4192 // Create the reduction after the loop. Note that inloop reductions create the 4193 // target reduction in the loop using a Reduction recipe. 
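// As a rough illustration (shorthand only; assumes an integer add reduction
// with VF = 4 and UF = 2): the part-combining loop above leaves
//   %bin.rdx = add <4 x i32> %rdx.part.1, %rdx.part.0
// and the target reduction created below is then something like
//   %rdx = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %bin.rdx)
// whose scalar result feeds the resume and LCSSA phis fixed up further down.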
4194 if (VF.isVector() && !PhiR->isInLoop()) { 4195 ReducedPartRdx = 4196 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, OrigPhi); 4197 // If the reduction can be performed in a smaller type, we need to extend 4198 // the reduction to the wider type before we branch to the original loop. 4199 if (PhiTy != RdxDesc.getRecurrenceType()) 4200 ReducedPartRdx = RdxDesc.isSigned() 4201 ? Builder.CreateSExt(ReducedPartRdx, PhiTy) 4202 : Builder.CreateZExt(ReducedPartRdx, PhiTy); 4203 } 4204 4205 PHINode *ResumePhi = 4206 dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue()); 4207 4208 // Create a phi node that merges control-flow from the backedge-taken check 4209 // block and the middle block. 4210 PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx", 4211 LoopScalarPreHeader->getTerminator()); 4212 4213 // If we are fixing reductions in the epilogue loop then we should already 4214 // have created a bc.merge.rdx Phi after the main vector body. Ensure that 4215 // we carry over the incoming values correctly. 4216 for (auto *Incoming : predecessors(LoopScalarPreHeader)) { 4217 if (Incoming == LoopMiddleBlock) 4218 BCBlockPhi->addIncoming(ReducedPartRdx, Incoming); 4219 else if (ResumePhi && llvm::is_contained(ResumePhi->blocks(), Incoming)) 4220 BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming), 4221 Incoming); 4222 else 4223 BCBlockPhi->addIncoming(ReductionStartValue, Incoming); 4224 } 4225 4226 // Set the resume value for this reduction 4227 ReductionResumeValues.insert({&RdxDesc, BCBlockPhi}); 4228 4229 // Now, we need to fix the users of the reduction variable 4230 // inside and outside of the scalar remainder loop. 4231 4232 // We know that the loop is in LCSSA form. We need to update the PHI nodes 4233 // in the exit blocks. See comment on analogous loop in 4234 // fixFirstOrderRecurrence for a more complete explaination of the logic. 4235 if (!Cost->requiresScalarEpilogue(VF)) 4236 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) 4237 if (llvm::is_contained(LCSSAPhi.incoming_values(), LoopExitInst)) 4238 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock); 4239 4240 // Fix the scalar loop reduction variable with the incoming reduction sum 4241 // from the vector body and from the backedge value. 4242 int IncomingEdgeBlockIdx = 4243 OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch()); 4244 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); 4245 // Pick the other block. 4246 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); 4247 OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); 4248 OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); 4249 } 4250 4251 void InnerLoopVectorizer::clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc, 4252 VPTransformState &State) { 4253 RecurKind RK = RdxDesc.getRecurrenceKind(); 4254 if (RK != RecurKind::Add && RK != RecurKind::Mul) 4255 return; 4256 4257 Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr(); 4258 assert(LoopExitInstr && "null loop exit instruction"); 4259 SmallVector<Instruction *, 8> Worklist; 4260 SmallPtrSet<Instruction *, 8> Visited; 4261 Worklist.push_back(LoopExitInstr); 4262 Visited.insert(LoopExitInstr); 4263 4264 while (!Worklist.empty()) { 4265 Instruction *Cur = Worklist.pop_back_val(); 4266 if (isa<OverflowingBinaryOperator>(Cur)) 4267 for (unsigned Part = 0; Part < UF; ++Part) { 4268 // FIXME: Should not rely on getVPValue at this point. 
4269 Value *V = State.get(State.Plan->getVPValue(Cur, true), Part); 4270 cast<Instruction>(V)->dropPoisonGeneratingFlags(); 4271 } 4272 4273 for (User *U : Cur->users()) { 4274 Instruction *UI = cast<Instruction>(U); 4275 if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) && 4276 Visited.insert(UI).second) 4277 Worklist.push_back(UI); 4278 } 4279 } 4280 } 4281 4282 void InnerLoopVectorizer::fixLCSSAPHIs(VPTransformState &State) { 4283 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 4284 if (LCSSAPhi.getBasicBlockIndex(LoopMiddleBlock) != -1) 4285 // Some phis were already hand updated by the reduction and recurrence 4286 // code above, leave them alone. 4287 continue; 4288 4289 auto *IncomingValue = LCSSAPhi.getIncomingValue(0); 4290 // Non-instruction incoming values will have only one value. 4291 4292 VPLane Lane = VPLane::getFirstLane(); 4293 if (isa<Instruction>(IncomingValue) && 4294 !Cost->isUniformAfterVectorization(cast<Instruction>(IncomingValue), 4295 VF)) 4296 Lane = VPLane::getLastLaneForVF(VF); 4297 4298 // Can be a loop invariant incoming value or the last scalar value to be 4299 // extracted from the vectorized loop. 4300 // FIXME: Should not rely on getVPValue at this point. 4301 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4302 Value *lastIncomingValue = 4303 OrigLoop->isLoopInvariant(IncomingValue) 4304 ? IncomingValue 4305 : State.get(State.Plan->getVPValue(IncomingValue, true), 4306 VPIteration(UF - 1, Lane)); 4307 LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock); 4308 } 4309 } 4310 4311 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { 4312 // The basic block and loop containing the predicated instruction. 4313 auto *PredBB = PredInst->getParent(); 4314 auto *VectorLoop = LI->getLoopFor(PredBB); 4315 4316 // Initialize a worklist with the operands of the predicated instruction. 4317 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end()); 4318 4319 // Holds instructions that we need to analyze again. An instruction may be 4320 // reanalyzed if we don't yet know if we can sink it or not. 4321 SmallVector<Instruction *, 8> InstsToReanalyze; 4322 4323 // Returns true if a given use occurs in the predicated block. Phi nodes use 4324 // their operands in their corresponding predecessor blocks. 4325 auto isBlockOfUsePredicated = [&](Use &U) -> bool { 4326 auto *I = cast<Instruction>(U.getUser()); 4327 BasicBlock *BB = I->getParent(); 4328 if (auto *Phi = dyn_cast<PHINode>(I)) 4329 BB = Phi->getIncomingBlock( 4330 PHINode::getIncomingValueNumForOperand(U.getOperandNo())); 4331 return BB == PredBB; 4332 }; 4333 4334 // Iteratively sink the scalarized operands of the predicated instruction 4335 // into the block we created for it. When an instruction is sunk, it's 4336 // operands are then added to the worklist. The algorithm ends after one pass 4337 // through the worklist doesn't sink a single instruction. 4338 bool Changed; 4339 do { 4340 // Add the instructions that need to be reanalyzed to the worklist, and 4341 // reset the changed indicator. 4342 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end()); 4343 InstsToReanalyze.clear(); 4344 Changed = false; 4345 4346 while (!Worklist.empty()) { 4347 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val()); 4348 4349 // We can't sink an instruction if it is a phi node, is not in the loop, 4350 // or may have side effects. 
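// (For example, a store or a call that may write memory must stay where it
// is, and values defined outside the vector loop are left untouched.)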
4351 if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) || 4352 I->mayHaveSideEffects()) 4353 continue; 4354 4355 // If the instruction is already in PredBB, check if we can sink its 4356 // operands. In that case, VPlan's sinkScalarOperands() succeeded in 4357 // sinking the scalar instruction I, hence it appears in PredBB; but it 4358 // may have failed to sink I's operands (recursively), which we try 4359 // (again) here. 4360 if (I->getParent() == PredBB) { 4361 Worklist.insert(I->op_begin(), I->op_end()); 4362 continue; 4363 } 4364 4365 // It's legal to sink the instruction if all its uses occur in the 4366 // predicated block. Otherwise, there's nothing to do yet, and we may 4367 // need to reanalyze the instruction. 4368 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) { 4369 InstsToReanalyze.push_back(I); 4370 continue; 4371 } 4372 4373 // Move the instruction to the beginning of the predicated block, and add 4374 // it's operands to the worklist. 4375 I->moveBefore(&*PredBB->getFirstInsertionPt()); 4376 Worklist.insert(I->op_begin(), I->op_end()); 4377 4378 // The sinking may have enabled other instructions to be sunk, so we will 4379 // need to iterate. 4380 Changed = true; 4381 } 4382 } while (Changed); 4383 } 4384 4385 void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) { 4386 for (PHINode *OrigPhi : OrigPHIsToFix) { 4387 VPWidenPHIRecipe *VPPhi = 4388 cast<VPWidenPHIRecipe>(State.Plan->getVPValue(OrigPhi)); 4389 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0)); 4390 // Make sure the builder has a valid insert point. 4391 Builder.SetInsertPoint(NewPhi); 4392 for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) { 4393 VPValue *Inc = VPPhi->getIncomingValue(i); 4394 VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i); 4395 NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]); 4396 } 4397 } 4398 } 4399 4400 bool InnerLoopVectorizer::useOrderedReductions( 4401 const RecurrenceDescriptor &RdxDesc) { 4402 return Cost->useOrderedReductions(RdxDesc); 4403 } 4404 4405 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, 4406 VPWidenPHIRecipe *PhiR, 4407 VPTransformState &State) { 4408 PHINode *P = cast<PHINode>(PN); 4409 if (EnableVPlanNativePath) { 4410 // Currently we enter here in the VPlan-native path for non-induction 4411 // PHIs where all control flow is uniform. We simply widen these PHIs. 4412 // Create a vector phi with no operands - the vector phi operands will be 4413 // set at the end of vector code generation. 4414 Type *VecTy = (State.VF.isScalar()) 4415 ? PN->getType() 4416 : VectorType::get(PN->getType(), State.VF); 4417 Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi"); 4418 State.set(PhiR, VecPhi, 0); 4419 OrigPHIsToFix.push_back(P); 4420 4421 return; 4422 } 4423 4424 assert(PN->getParent() == OrigLoop->getHeader() && 4425 "Non-header phis should have been handled elsewhere"); 4426 4427 // In order to support recurrences we need to be able to vectorize Phi nodes. 4428 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 4429 // stage #1: We create a new vector PHI node with no incoming edges. We'll use 4430 // this value when we vectorize all of the instructions that use the PHI. 4431 4432 assert(!Legal->isReductionVariable(P) && 4433 "reductions should be handled elsewhere"); 4434 4435 setDebugLocFromInst(P); 4436 4437 // This PHINode must be an induction variable. 4438 // Make sure that we know about it. 
4439 assert(Legal->getInductionVars().count(P) && "Not an induction variable"); 4440 4441 InductionDescriptor II = Legal->getInductionVars().lookup(P); 4442 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 4443 4444 auto *IVR = PhiR->getParent()->getPlan()->getCanonicalIV(); 4445 PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0)); 4446 4447 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 4448 // which can be found from the original scalar operations. 4449 switch (II.getKind()) { 4450 case InductionDescriptor::IK_NoInduction: 4451 llvm_unreachable("Unknown induction"); 4452 case InductionDescriptor::IK_IntInduction: 4453 case InductionDescriptor::IK_FpInduction: 4454 llvm_unreachable("Integer/fp induction is handled elsewhere."); 4455 case InductionDescriptor::IK_PtrInduction: { 4456 // Handle the pointer induction variable case. 4457 assert(P->getType()->isPointerTy() && "Unexpected type."); 4458 4459 if (Cost->isScalarAfterVectorization(P, State.VF)) { 4460 // This is the normalized GEP that starts counting at zero. 4461 Value *PtrInd = 4462 Builder.CreateSExtOrTrunc(CanonicalIV, II.getStep()->getType()); 4463 // Determine the number of scalars we need to generate for each unroll 4464 // iteration. If the instruction is uniform, we only need to generate the 4465 // first lane. Otherwise, we generate all VF values. 4466 bool IsUniform = vputils::onlyFirstLaneUsed(PhiR); 4467 assert((IsUniform || !State.VF.isScalable()) && 4468 "Cannot scalarize a scalable VF"); 4469 unsigned Lanes = IsUniform ? 1 : State.VF.getFixedValue(); 4470 4471 for (unsigned Part = 0; Part < UF; ++Part) { 4472 Value *PartStart = 4473 createStepForVF(Builder, PtrInd->getType(), VF, Part); 4474 4475 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 4476 Value *Idx = Builder.CreateAdd( 4477 PartStart, ConstantInt::get(PtrInd->getType(), Lane)); 4478 Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); 4479 4480 Value *Step = CreateStepValue(II.getStep(), *PSE.getSE(), 4481 State.CFG.PrevBB->getTerminator()); 4482 Value *SclrGep = emitTransformedIndex(Builder, GlobalIdx, 4483 II.getStartValue(), Step, II); 4484 SclrGep->setName("next.gep"); 4485 State.set(PhiR, SclrGep, VPIteration(Part, Lane)); 4486 } 4487 } 4488 return; 4489 } 4490 assert(isa<SCEVConstant>(II.getStep()) && 4491 "Induction step not a SCEV constant!"); 4492 Type *PhiType = II.getStep()->getType(); 4493 4494 // Build a pointer phi 4495 Value *ScalarStartValue = PhiR->getStartValue()->getLiveInIRValue(); 4496 Type *ScStValueType = ScalarStartValue->getType(); 4497 PHINode *NewPointerPhi = 4498 PHINode::Create(ScStValueType, 2, "pointer.phi", CanonicalIV); 4499 NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader); 4500 4501 // A pointer induction, performed by using a gep 4502 BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 4503 Instruction *InductionLoc = LoopLatch->getTerminator(); 4504 const SCEV *ScalarStep = II.getStep(); 4505 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 4506 Value *ScalarStepValue = 4507 Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc); 4508 Value *RuntimeVF = getRuntimeVF(Builder, PhiType, VF); 4509 Value *NumUnrolledElems = 4510 Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF)); 4511 Value *InductionGEP = GetElementPtrInst::Create( 4512 II.getElementType(), NewPointerPhi, 4513 Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind", 4514 InductionLoc); 4515 NewPointerPhi->addIncoming(InductionGEP, 
LoopLatch); 4516 4517 // Create UF many actual address geps that use the pointer 4518 // phi as base and a vectorized version of the step value 4519 // (<step*0, ..., step*N>) as offset. 4520 for (unsigned Part = 0; Part < State.UF; ++Part) { 4521 Type *VecPhiType = VectorType::get(PhiType, State.VF); 4522 Value *StartOffsetScalar = 4523 Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part)); 4524 Value *StartOffset = 4525 Builder.CreateVectorSplat(State.VF, StartOffsetScalar); 4526 // Create a vector of consecutive numbers from zero to VF. 4527 StartOffset = 4528 Builder.CreateAdd(StartOffset, Builder.CreateStepVector(VecPhiType)); 4529 4530 Value *GEP = Builder.CreateGEP( 4531 II.getElementType(), NewPointerPhi, 4532 Builder.CreateMul( 4533 StartOffset, Builder.CreateVectorSplat(State.VF, ScalarStepValue), 4534 "vector.gep")); 4535 State.set(PhiR, GEP, Part); 4536 } 4537 } 4538 } 4539 } 4540 4541 /// A helper function for checking whether an integer division-related 4542 /// instruction may divide by zero (in which case it must be predicated if 4543 /// executed conditionally in the scalar code). 4544 /// TODO: It may be worthwhile to generalize and check isKnownNonZero(). 4545 /// Non-zero divisors that are non compile-time constants will not be 4546 /// converted into multiplication, so we will still end up scalarizing 4547 /// the division, but can do so w/o predication. 4548 static bool mayDivideByZero(Instruction &I) { 4549 assert((I.getOpcode() == Instruction::UDiv || 4550 I.getOpcode() == Instruction::SDiv || 4551 I.getOpcode() == Instruction::URem || 4552 I.getOpcode() == Instruction::SRem) && 4553 "Unexpected instruction"); 4554 Value *Divisor = I.getOperand(1); 4555 auto *CInt = dyn_cast<ConstantInt>(Divisor); 4556 return !CInt || CInt->isZero(); 4557 } 4558 4559 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, 4560 VPUser &ArgOperands, 4561 VPTransformState &State) { 4562 assert(!isa<DbgInfoIntrinsic>(I) && 4563 "DbgInfoIntrinsic should have been dropped during VPlan construction"); 4564 setDebugLocFromInst(&I); 4565 4566 Module *M = I.getParent()->getParent()->getParent(); 4567 auto *CI = cast<CallInst>(&I); 4568 4569 SmallVector<Type *, 4> Tys; 4570 for (Value *ArgOperand : CI->args()) 4571 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue())); 4572 4573 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 4574 4575 // The flag shows whether we use Intrinsic or a usual Call for vectorized 4576 // version of the instruction. 4577 // Is it beneficial to perform intrinsic call compared to lib call? 4578 bool NeedToScalarize = false; 4579 InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); 4580 InstructionCost IntrinsicCost = ID ? Cost->getVectorIntrinsicCost(CI, VF) : 0; 4581 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 4582 assert((UseVectorIntrinsic || !NeedToScalarize) && 4583 "Instruction should be scalarized elsewhere."); 4584 assert((IntrinsicCost.isValid() || CallCost.isValid()) && 4585 "Either the intrinsic cost or vector call cost must be valid"); 4586 4587 for (unsigned Part = 0; Part < UF; ++Part) { 4588 SmallVector<Type *, 2> TysForDecl = {CI->getType()}; 4589 SmallVector<Value *, 4> Args; 4590 for (auto &I : enumerate(ArgOperands.operands())) { 4591 // Some intrinsics have a scalar argument - don't replace it with a 4592 // vector. 
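// (For example, the integer exponent of llvm.powi is passed through as a
// scalar operand; hasVectorInstrinsicScalarOpd below reports which operands
// must stay scalar for the chosen intrinsic.)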
4593 Value *Arg; 4594 if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index())) 4595 Arg = State.get(I.value(), Part); 4596 else { 4597 Arg = State.get(I.value(), VPIteration(0, 0)); 4598 if (hasVectorInstrinsicOverloadedScalarOpd(ID, I.index())) 4599 TysForDecl.push_back(Arg->getType()); 4600 } 4601 Args.push_back(Arg); 4602 } 4603 4604 Function *VectorF; 4605 if (UseVectorIntrinsic) { 4606 // Use vector version of the intrinsic. 4607 if (VF.isVector()) 4608 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); 4609 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); 4610 assert(VectorF && "Can't retrieve vector intrinsic."); 4611 } else { 4612 // Use vector version of the function call. 4613 const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 4614 #ifndef NDEBUG 4615 assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr && 4616 "Can't create vector function."); 4617 #endif 4618 VectorF = VFDatabase(*CI).getVectorizedFunction(Shape); 4619 } 4620 SmallVector<OperandBundleDef, 1> OpBundles; 4621 CI->getOperandBundlesAsDefs(OpBundles); 4622 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles); 4623 4624 if (isa<FPMathOperator>(V)) 4625 V->copyFastMathFlags(CI); 4626 4627 State.set(Def, V, Part); 4628 addMetadata(V, &I); 4629 } 4630 } 4631 4632 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { 4633 // We should not collect Scalars more than once per VF. Right now, this 4634 // function is called from collectUniformsAndScalars(), which already does 4635 // this check. Collecting Scalars for VF=1 does not make any sense. 4636 assert(VF.isVector() && Scalars.find(VF) == Scalars.end() && 4637 "This function should not be visited twice for the same VF"); 4638 4639 SmallSetVector<Instruction *, 8> Worklist; 4640 4641 // These sets are used to seed the analysis with pointers used by memory 4642 // accesses that will remain scalar. 4643 SmallSetVector<Instruction *, 8> ScalarPtrs; 4644 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; 4645 auto *Latch = TheLoop->getLoopLatch(); 4646 4647 // A helper that returns true if the use of Ptr by MemAccess will be scalar. 4648 // The pointer operands of loads and stores will be scalar as long as the 4649 // memory access is not a gather or scatter operation. The value operand of a 4650 // store will remain scalar if the store is scalarized. 4651 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) { 4652 InstWidening WideningDecision = getWideningDecision(MemAccess, VF); 4653 assert(WideningDecision != CM_Unknown && 4654 "Widening decision should be ready at this moment"); 4655 if (auto *Store = dyn_cast<StoreInst>(MemAccess)) 4656 if (Ptr == Store->getValueOperand()) 4657 return WideningDecision == CM_Scalarize; 4658 assert(Ptr == getLoadStorePointerOperand(MemAccess) && 4659 "Ptr is neither a value or pointer operand"); 4660 return WideningDecision != CM_GatherScatter; 4661 }; 4662 4663 // A helper that returns true if the given value is a bitcast or 4664 // getelementptr instruction contained in the loop. 4665 auto isLoopVaryingBitCastOrGEP = [&](Value *V) { 4666 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) || 4667 isa<GetElementPtrInst>(V)) && 4668 !TheLoop->isLoopInvariant(V); 4669 }; 4670 4671 // A helper that evaluates a memory access's use of a pointer. If the use will 4672 // be a scalar use and the pointer is only used by memory accesses, we place 4673 // the pointer in ScalarPtrs. 
Otherwise, the pointer is placed in 4674 // PossibleNonScalarPtrs. 4675 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) { 4676 // We only care about bitcast and getelementptr instructions contained in 4677 // the loop. 4678 if (!isLoopVaryingBitCastOrGEP(Ptr)) 4679 return; 4680 4681 // If the pointer has already been identified as scalar (e.g., if it was 4682 // also identified as uniform), there's nothing to do. 4683 auto *I = cast<Instruction>(Ptr); 4684 if (Worklist.count(I)) 4685 return; 4686 4687 // If the use of the pointer will be a scalar use, and all users of the 4688 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise, 4689 // place the pointer in PossibleNonScalarPtrs. 4690 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) { 4691 return isa<LoadInst>(U) || isa<StoreInst>(U); 4692 })) 4693 ScalarPtrs.insert(I); 4694 else 4695 PossibleNonScalarPtrs.insert(I); 4696 }; 4697 4698 // We seed the scalars analysis with three classes of instructions: (1) 4699 // instructions marked uniform-after-vectorization, (2) bitcast, 4700 // getelementptr and (pointer) phi instructions used by memory accesses 4701 // requiring a scalar use, and (3) instructions that the cost model has 4702 // forced to remain scalar. 4703 // (1) Add to the worklist all instructions that have been identified as 4704 // uniform-after-vectorization. 4705 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end()); 4706 4707 // (2) Add to the worklist all bitcast and getelementptr instructions used by 4708 // memory accesses requiring a scalar use. The pointer operands of loads and 4709 // stores will be scalar as long as the memory access is not a gather or 4710 // scatter operation. The value operand of a store will remain scalar if the 4711 // store is scalarized. 4712 for (auto *BB : TheLoop->blocks()) 4713 for (auto &I : *BB) { 4714 if (auto *Load = dyn_cast<LoadInst>(&I)) { 4715 evaluatePtrUse(Load, Load->getPointerOperand()); 4716 } else if (auto *Store = dyn_cast<StoreInst>(&I)) { 4717 evaluatePtrUse(Store, Store->getPointerOperand()); 4718 evaluatePtrUse(Store, Store->getValueOperand()); 4719 } 4720 } 4721 for (auto *I : ScalarPtrs) 4722 if (!PossibleNonScalarPtrs.count(I)) { 4723 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n"); 4724 Worklist.insert(I); 4725 } 4726 4727 // (3) Insert the forced scalars. 4728 // FIXME: Currently widenPHIInstruction() often creates a dead vector 4729 // induction variable when the PHI user is scalarized. 4730 auto ForcedScalar = ForcedScalars.find(VF); 4731 if (ForcedScalar != ForcedScalars.end()) 4732 for (auto *I : ForcedScalar->second) 4733 Worklist.insert(I); 4734 4735 // Expand the worklist by looking through any bitcasts and getelementptr 4736 // instructions we've already identified as scalar. This is similar to the 4737 // expansion step in collectLoopUniforms(); however, here we're only 4738 // expanding to include additional bitcasts and getelementptr instructions.
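// For example (illustrative only): if a getelementptr feeding a scalarized
// store is already in the worklist and its pointer operand is a loop-varying
// bitcast whose only users are such scalar memory accesses (or other worklist
// members), the bitcast is pulled into the worklist as well.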
4739 unsigned Idx = 0; 4740 while (Idx != Worklist.size()) { 4741 Instruction *Dst = Worklist[Idx++]; 4742 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0))) 4743 continue; 4744 auto *Src = cast<Instruction>(Dst->getOperand(0)); 4745 if (llvm::all_of(Src->users(), [&](User *U) -> bool { 4746 auto *J = cast<Instruction>(U); 4747 return !TheLoop->contains(J) || Worklist.count(J) || 4748 ((isa<LoadInst>(J) || isa<StoreInst>(J)) && 4749 isScalarUse(J, Src)); 4750 })) { 4751 Worklist.insert(Src); 4752 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); 4753 } 4754 } 4755 4756 // An induction variable will remain scalar if all users of the induction 4757 // variable and induction variable update remain scalar. 4758 for (auto &Induction : Legal->getInductionVars()) { 4759 auto *Ind = Induction.first; 4760 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4761 4762 // If tail-folding is applied, the primary induction variable will be used 4763 // to feed a vector compare. 4764 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking()) 4765 continue; 4766 4767 // Returns true if \p Indvar is a pointer induction that is used directly by 4768 // load/store instruction \p I. 4769 auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar, 4770 Instruction *I) { 4771 return Induction.second.getKind() == 4772 InductionDescriptor::IK_PtrInduction && 4773 (isa<LoadInst>(I) || isa<StoreInst>(I)) && 4774 Indvar == getLoadStorePointerOperand(I) && isScalarUse(I, Indvar); 4775 }; 4776 4777 // Determine if all users of the induction variable are scalar after 4778 // vectorization. 4779 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 4780 auto *I = cast<Instruction>(U); 4781 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 4782 IsDirectLoadStoreFromPtrIndvar(Ind, I); 4783 }); 4784 if (!ScalarInd) 4785 continue; 4786 4787 // Determine if all users of the induction variable update instruction are 4788 // scalar after vectorization. 4789 auto ScalarIndUpdate = 4790 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 4791 auto *I = cast<Instruction>(U); 4792 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 4793 IsDirectLoadStoreFromPtrIndvar(IndUpdate, I); 4794 }); 4795 if (!ScalarIndUpdate) 4796 continue; 4797 4798 // The induction variable and its update instruction will remain scalar. 4799 Worklist.insert(Ind); 4800 Worklist.insert(IndUpdate); 4801 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 4802 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 4803 << "\n"); 4804 } 4805 4806 Scalars[VF].insert(Worklist.begin(), Worklist.end()); 4807 } 4808 4809 bool LoopVectorizationCostModel::isScalarWithPredication( 4810 Instruction *I, ElementCount VF) const { 4811 if (!blockNeedsPredicationForAnyReason(I->getParent())) 4812 return false; 4813 switch(I->getOpcode()) { 4814 default: 4815 break; 4816 case Instruction::Load: 4817 case Instruction::Store: { 4818 if (!Legal->isMaskRequired(I)) 4819 return false; 4820 auto *Ptr = getLoadStorePointerOperand(I); 4821 auto *Ty = getLoadStoreType(I); 4822 Type *VTy = Ty; 4823 if (VF.isVector()) 4824 VTy = VectorType::get(Ty, VF); 4825 const Align Alignment = getLoadStoreAlignment(I); 4826 return isa<LoadInst>(I) ? 
!(isLegalMaskedLoad(Ty, Ptr, Alignment) || 4827 TTI.isLegalMaskedGather(VTy, Alignment)) 4828 : !(isLegalMaskedStore(Ty, Ptr, Alignment) || 4829 TTI.isLegalMaskedScatter(VTy, Alignment)); 4830 } 4831 case Instruction::UDiv: 4832 case Instruction::SDiv: 4833 case Instruction::SRem: 4834 case Instruction::URem: 4835 return mayDivideByZero(*I); 4836 } 4837 return false; 4838 } 4839 4840 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( 4841 Instruction *I, ElementCount VF) { 4842 assert(isAccessInterleaved(I) && "Expecting interleaved access."); 4843 assert(getWideningDecision(I, VF) == CM_Unknown && 4844 "Decision should not be set yet."); 4845 auto *Group = getInterleavedAccessGroup(I); 4846 assert(Group && "Must have a group."); 4847 4848 // If the instruction's allocated size doesn't equal it's type size, it 4849 // requires padding and will be scalarized. 4850 auto &DL = I->getModule()->getDataLayout(); 4851 auto *ScalarTy = getLoadStoreType(I); 4852 if (hasIrregularType(ScalarTy, DL)) 4853 return false; 4854 4855 // Check if masking is required. 4856 // A Group may need masking for one of two reasons: it resides in a block that 4857 // needs predication, or it was decided to use masking to deal with gaps 4858 // (either a gap at the end of a load-access that may result in a speculative 4859 // load, or any gaps in a store-access). 4860 bool PredicatedAccessRequiresMasking = 4861 blockNeedsPredicationForAnyReason(I->getParent()) && 4862 Legal->isMaskRequired(I); 4863 bool LoadAccessWithGapsRequiresEpilogMasking = 4864 isa<LoadInst>(I) && Group->requiresScalarEpilogue() && 4865 !isScalarEpilogueAllowed(); 4866 bool StoreAccessWithGapsRequiresMasking = 4867 isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()); 4868 if (!PredicatedAccessRequiresMasking && 4869 !LoadAccessWithGapsRequiresEpilogMasking && 4870 !StoreAccessWithGapsRequiresMasking) 4871 return true; 4872 4873 // If masked interleaving is required, we expect that the user/target had 4874 // enabled it, because otherwise it either wouldn't have been created or 4875 // it should have been invalidated by the CostModel. 4876 assert(useMaskedInterleavedAccesses(TTI) && 4877 "Masked interleave-groups for predicated accesses are not enabled."); 4878 4879 if (Group->isReverse()) 4880 return false; 4881 4882 auto *Ty = getLoadStoreType(I); 4883 const Align Alignment = getLoadStoreAlignment(I); 4884 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment) 4885 : TTI.isLegalMaskedStore(Ty, Alignment); 4886 } 4887 4888 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( 4889 Instruction *I, ElementCount VF) { 4890 // Get and ensure we have a valid memory instruction. 4891 assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction"); 4892 4893 auto *Ptr = getLoadStorePointerOperand(I); 4894 auto *ScalarTy = getLoadStoreType(I); 4895 4896 // In order to be widened, the pointer should be consecutive, first of all. 4897 if (!Legal->isConsecutivePtr(ScalarTy, Ptr)) 4898 return false; 4899 4900 // If the instruction is a store located in a predicated block, it will be 4901 // scalarized. 4902 if (isScalarWithPredication(I, VF)) 4903 return false; 4904 4905 // If the instruction's allocated size doesn't equal it's type size, it 4906 // requires padding and will be scalarized. 
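// (A classic example is an element type such as i1 or x86_fp80, whose
// in-memory allocation is padded beyond its nominal bit width; the exact
// sizes are target dependent.)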
4907 auto &DL = I->getModule()->getDataLayout(); 4908 if (hasIrregularType(ScalarTy, DL)) 4909 return false; 4910 4911 return true; 4912 } 4913 4914 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { 4915 // We should not collect Uniforms more than once per VF. Right now, 4916 // this function is called from collectUniformsAndScalars(), which 4917 // already does this check. Collecting Uniforms for VF=1 does not make any 4918 // sense. 4919 4920 assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() && 4921 "This function should not be visited twice for the same VF"); 4922 4923 // Visit the list of Uniforms. If we'll not find any uniform value, we'll 4924 // not analyze again. Uniforms.count(VF) will return 1. 4925 Uniforms[VF].clear(); 4926 4927 // We now know that the loop is vectorizable! 4928 // Collect instructions inside the loop that will remain uniform after 4929 // vectorization. 4930 4931 // Global values, params and instructions outside of current loop are out of 4932 // scope. 4933 auto isOutOfScope = [&](Value *V) -> bool { 4934 Instruction *I = dyn_cast<Instruction>(V); 4935 return (!I || !TheLoop->contains(I)); 4936 }; 4937 4938 // Worklist containing uniform instructions demanding lane 0. 4939 SetVector<Instruction *> Worklist; 4940 BasicBlock *Latch = TheLoop->getLoopLatch(); 4941 4942 // Add uniform instructions demanding lane 0 to the worklist. Instructions 4943 // that are scalar with predication must not be considered uniform after 4944 // vectorization, because that would create an erroneous replicating region 4945 // where only a single instance out of VF should be formed. 4946 // TODO: optimize such seldom cases if found important, see PR40816. 4947 auto addToWorklistIfAllowed = [&](Instruction *I) -> void { 4948 if (isOutOfScope(I)) { 4949 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: " 4950 << *I << "\n"); 4951 return; 4952 } 4953 if (isScalarWithPredication(I, VF)) { 4954 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " 4955 << *I << "\n"); 4956 return; 4957 } 4958 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n"); 4959 Worklist.insert(I); 4960 }; 4961 4962 // Start with the conditional branch. If the branch condition is an 4963 // instruction contained in the loop that is only used by the branch, it is 4964 // uniform. 4965 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 4966 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) 4967 addToWorklistIfAllowed(Cmp); 4968 4969 auto isUniformDecision = [&](Instruction *I, ElementCount VF) { 4970 InstWidening WideningDecision = getWideningDecision(I, VF); 4971 assert(WideningDecision != CM_Unknown && 4972 "Widening decision should be ready at this moment"); 4973 4974 // A uniform memory op is itself uniform. We exclude uniform stores 4975 // here as they demand the last lane, not the first one. 4976 if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) { 4977 assert(WideningDecision == CM_Scalarize); 4978 return true; 4979 } 4980 4981 return (WideningDecision == CM_Widen || 4982 WideningDecision == CM_Widen_Reverse || 4983 WideningDecision == CM_Interleave); 4984 }; 4985 4986 4987 // Returns true if Ptr is the pointer operand of a memory access instruction 4988 // I, and I is known to not require scalarization. 
4989 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { 4990 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); 4991 }; 4992 4993 // Holds a list of values which are known to have at least one uniform use. 4994 // Note that there may be other uses which aren't uniform. A "uniform use" 4995 // here is something which only demands lane 0 of the unrolled iterations; 4996 // it does not imply that all lanes produce the same value (e.g. this is not 4997 // the usual meaning of uniform) 4998 SetVector<Value *> HasUniformUse; 4999 5000 // Scan the loop for instructions which are either a) known to have only 5001 // lane 0 demanded or b) are uses which demand only lane 0 of their operand. 5002 for (auto *BB : TheLoop->blocks()) 5003 for (auto &I : *BB) { 5004 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) { 5005 switch (II->getIntrinsicID()) { 5006 case Intrinsic::sideeffect: 5007 case Intrinsic::experimental_noalias_scope_decl: 5008 case Intrinsic::assume: 5009 case Intrinsic::lifetime_start: 5010 case Intrinsic::lifetime_end: 5011 if (TheLoop->hasLoopInvariantOperands(&I)) 5012 addToWorklistIfAllowed(&I); 5013 break; 5014 default: 5015 break; 5016 } 5017 } 5018 5019 // ExtractValue instructions must be uniform, because the operands are 5020 // known to be loop-invariant. 5021 if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) { 5022 assert(isOutOfScope(EVI->getAggregateOperand()) && 5023 "Expected aggregate value to be loop invariant"); 5024 addToWorklistIfAllowed(EVI); 5025 continue; 5026 } 5027 5028 // If there's no pointer operand, there's nothing to do. 5029 auto *Ptr = getLoadStorePointerOperand(&I); 5030 if (!Ptr) 5031 continue; 5032 5033 // A uniform memory op is itself uniform. We exclude uniform stores 5034 // here as they demand the last lane, not the first one. 5035 if (isa<LoadInst>(I) && Legal->isUniformMemOp(I)) 5036 addToWorklistIfAllowed(&I); 5037 5038 if (isUniformDecision(&I, VF)) { 5039 assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check"); 5040 HasUniformUse.insert(Ptr); 5041 } 5042 } 5043 5044 // Add to the worklist any operands which have *only* uniform (e.g. lane 0 5045 // demanding) users. Since loops are assumed to be in LCSSA form, this 5046 // disallows uses outside the loop as well. 5047 for (auto *V : HasUniformUse) { 5048 if (isOutOfScope(V)) 5049 continue; 5050 auto *I = cast<Instruction>(V); 5051 auto UsersAreMemAccesses = 5052 llvm::all_of(I->users(), [&](User *U) -> bool { 5053 return isVectorizedMemAccessUse(cast<Instruction>(U), V); 5054 }); 5055 if (UsersAreMemAccesses) 5056 addToWorklistIfAllowed(I); 5057 } 5058 5059 // Expand Worklist in topological order: whenever a new instruction 5060 // is added , its users should be already inside Worklist. It ensures 5061 // a uniform instruction will only be used by uniform instructions. 5062 unsigned idx = 0; 5063 while (idx != Worklist.size()) { 5064 Instruction *I = Worklist[idx++]; 5065 5066 for (auto OV : I->operand_values()) { 5067 // isOutOfScope operands cannot be uniform instructions. 5068 if (isOutOfScope(OV)) 5069 continue; 5070 // First order recurrence Phi's should typically be considered 5071 // non-uniform. 5072 auto *OP = dyn_cast<PHINode>(OV); 5073 if (OP && Legal->isFirstOrderRecurrence(OP)) 5074 continue; 5075 // If all the users of the operand are uniform, then add the 5076 // operand into the uniform worklist. 
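// For example (illustrative only): an address computation whose only users
// are accesses that will be widened into single wide loads/stores needs just
// its lane-0 value, so it can be treated as uniform here as well.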
5077 auto *OI = cast<Instruction>(OV); 5078 if (llvm::all_of(OI->users(), [&](User *U) -> bool { 5079 auto *J = cast<Instruction>(U); 5080 return Worklist.count(J) || isVectorizedMemAccessUse(J, OI); 5081 })) 5082 addToWorklistIfAllowed(OI); 5083 } 5084 } 5085 5086 // For an instruction to be added into Worklist above, all its users inside 5087 // the loop should also be in Worklist. However, this condition cannot be 5088 // true for phi nodes that form a cyclic dependence. We must process phi 5089 // nodes separately. An induction variable will remain uniform if all users 5090 // of the induction variable and induction variable update remain uniform. 5091 // The code below handles both pointer and non-pointer induction variables. 5092 for (auto &Induction : Legal->getInductionVars()) { 5093 auto *Ind = Induction.first; 5094 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 5095 5096 // Determine if all users of the induction variable are uniform after 5097 // vectorization. 5098 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 5099 auto *I = cast<Instruction>(U); 5100 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 5101 isVectorizedMemAccessUse(I, Ind); 5102 }); 5103 if (!UniformInd) 5104 continue; 5105 5106 // Determine if all users of the induction variable update instruction are 5107 // uniform after vectorization. 5108 auto UniformIndUpdate = 5109 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 5110 auto *I = cast<Instruction>(U); 5111 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 5112 isVectorizedMemAccessUse(I, IndUpdate); 5113 }); 5114 if (!UniformIndUpdate) 5115 continue; 5116 5117 // The induction variable and its update instruction will remain uniform. 5118 addToWorklistIfAllowed(Ind); 5119 addToWorklistIfAllowed(IndUpdate); 5120 } 5121 5122 Uniforms[VF].insert(Worklist.begin(), Worklist.end()); 5123 } 5124 5125 bool LoopVectorizationCostModel::runtimeChecksRequired() { 5126 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); 5127 5128 if (Legal->getRuntimePointerChecking()->Need) { 5129 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz", 5130 "runtime pointer checks needed. Enable vectorization of this " 5131 "loop with '#pragma clang loop vectorize(enable)' when " 5132 "compiling with -Os/-Oz", 5133 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5134 return true; 5135 } 5136 5137 if (!PSE.getPredicate().isAlwaysTrue()) { 5138 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz", 5139 "runtime SCEV checks needed. Enable vectorization of this " 5140 "loop with '#pragma clang loop vectorize(enable)' when " 5141 "compiling with -Os/-Oz", 5142 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5143 return true; 5144 } 5145 5146 // FIXME: Avoid specializing for stride==1 instead of bailing out. 5147 if (!Legal->getLAI()->getSymbolicStrides().empty()) { 5148 reportVectorizationFailure("Runtime stride check for small trip count", 5149 "runtime stride == 1 checks needed. 
Enable vectorization of " 5150 "this loop without such check by compiling with -Os/-Oz", 5151 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5152 return true; 5153 } 5154 5155 return false; 5156 } 5157 5158 ElementCount 5159 LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) { 5160 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) 5161 return ElementCount::getScalable(0); 5162 5163 if (Hints->isScalableVectorizationDisabled()) { 5164 reportVectorizationInfo("Scalable vectorization is explicitly disabled", 5165 "ScalableVectorizationDisabled", ORE, TheLoop); 5166 return ElementCount::getScalable(0); 5167 } 5168 5169 LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n"); 5170 5171 auto MaxScalableVF = ElementCount::getScalable( 5172 std::numeric_limits<ElementCount::ScalarTy>::max()); 5173 5174 // Test that the loop-vectorizer can legalize all operations for this MaxVF. 5175 // FIXME: While for scalable vectors this is currently sufficient, this should 5176 // be replaced by a more detailed mechanism that filters out specific VFs, 5177 // instead of invalidating vectorization for a whole set of VFs based on the 5178 // MaxVF. 5179 5180 // Disable scalable vectorization if the loop contains unsupported reductions. 5181 if (!canVectorizeReductions(MaxScalableVF)) { 5182 reportVectorizationInfo( 5183 "Scalable vectorization not supported for the reduction " 5184 "operations found in this loop.", 5185 "ScalableVFUnfeasible", ORE, TheLoop); 5186 return ElementCount::getScalable(0); 5187 } 5188 5189 // Disable scalable vectorization if the loop contains any instructions 5190 // with element types not supported for scalable vectors. 5191 if (any_of(ElementTypesInLoop, [&](Type *Ty) { 5192 return !Ty->isVoidTy() && 5193 !this->TTI.isElementTypeLegalForScalableVector(Ty); 5194 })) { 5195 reportVectorizationInfo("Scalable vectorization is not supported " 5196 "for all element types found in this loop.", 5197 "ScalableVFUnfeasible", ORE, TheLoop); 5198 return ElementCount::getScalable(0); 5199 } 5200 5201 if (Legal->isSafeForAnyVectorWidth()) 5202 return MaxScalableVF; 5203 5204 // Limit MaxScalableVF by the maximum safe dependence distance. 5205 Optional<unsigned> MaxVScale = TTI.getMaxVScale(); 5206 if (!MaxVScale && TheFunction->hasFnAttribute(Attribute::VScaleRange)) 5207 MaxVScale = 5208 TheFunction->getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax(); 5209 MaxScalableVF = ElementCount::getScalable( 5210 MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0); 5211 if (!MaxScalableVF) 5212 reportVectorizationInfo( 5213 "Max legal vector width too small, scalable vectorization " 5214 "unfeasible.", 5215 "ScalableVFUnfeasible", ORE, TheLoop); 5216 5217 return MaxScalableVF; 5218 } 5219 5220 FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF( 5221 unsigned ConstTripCount, ElementCount UserVF, bool FoldTailByMasking) { 5222 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); 5223 unsigned SmallestType, WidestType; 5224 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); 5225 5226 // Get the maximum safe dependence distance in bits computed by LAA. 5227 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from 5228 // the memory accesses that is most restrictive (involved in the smallest 5229 // dependence distance). 
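// For example (numbers are made up): with a maximum safe vector width of
// 384 bits and a widest type of 32 bits, 384 / 32 = 12 and the power-of-two
// floor below yields MaxSafeElements = 8.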
5230 unsigned MaxSafeElements = 5231 PowerOf2Floor(Legal->getMaxSafeVectorWidthInBits() / WidestType); 5232 5233 auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements); 5234 auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements); 5235 5236 LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF 5237 << ".\n"); 5238 LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF 5239 << ".\n"); 5240 5241 // First analyze the UserVF, fall back if the UserVF should be ignored. 5242 if (UserVF) { 5243 auto MaxSafeUserVF = 5244 UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF; 5245 5246 if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) { 5247 // If `VF=vscale x N` is safe, then so is `VF=N` 5248 if (UserVF.isScalable()) 5249 return FixedScalableVFPair( 5250 ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF); 5251 else 5252 return UserVF; 5253 } 5254 5255 assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF)); 5256 5257 // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it 5258 // is better to ignore the hint and let the compiler choose a suitable VF. 5259 if (!UserVF.isScalable()) { 5260 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 5261 << " is unsafe, clamping to max safe VF=" 5262 << MaxSafeFixedVF << ".\n"); 5263 ORE->emit([&]() { 5264 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 5265 TheLoop->getStartLoc(), 5266 TheLoop->getHeader()) 5267 << "User-specified vectorization factor " 5268 << ore::NV("UserVectorizationFactor", UserVF) 5269 << " is unsafe, clamping to maximum safe vectorization factor " 5270 << ore::NV("VectorizationFactor", MaxSafeFixedVF); 5271 }); 5272 return MaxSafeFixedVF; 5273 } 5274 5275 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) { 5276 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 5277 << " is ignored because scalable vectors are not " 5278 "available.\n"); 5279 ORE->emit([&]() { 5280 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 5281 TheLoop->getStartLoc(), 5282 TheLoop->getHeader()) 5283 << "User-specified vectorization factor " 5284 << ore::NV("UserVectorizationFactor", UserVF) 5285 << " is ignored because the target does not support scalable " 5286 "vectors. The compiler will pick a more suitable value."; 5287 }); 5288 } else { 5289 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 5290 << " is unsafe. Ignoring scalable UserVF.\n"); 5291 ORE->emit([&]() { 5292 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 5293 TheLoop->getStartLoc(), 5294 TheLoop->getHeader()) 5295 << "User-specified vectorization factor " 5296 << ore::NV("UserVectorizationFactor", UserVF) 5297 << " is unsafe. 
Ignoring the hint to let the compiler pick a " 5298 "more suitable value."; 5299 }); 5300 } 5301 } 5302 5303 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType 5304 << " / " << WidestType << " bits.\n"); 5305 5306 FixedScalableVFPair Result(ElementCount::getFixed(1), 5307 ElementCount::getScalable(0)); 5308 if (auto MaxVF = 5309 getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType, 5310 MaxSafeFixedVF, FoldTailByMasking)) 5311 Result.FixedVF = MaxVF; 5312 5313 if (auto MaxVF = 5314 getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType, 5315 MaxSafeScalableVF, FoldTailByMasking)) 5316 if (MaxVF.isScalable()) { 5317 Result.ScalableVF = MaxVF; 5318 LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF 5319 << "\n"); 5320 } 5321 5322 return Result; 5323 } 5324 5325 FixedScalableVFPair 5326 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { 5327 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { 5328 // TODO: It may by useful to do since it's still likely to be dynamically 5329 // uniform if the target can skip. 5330 reportVectorizationFailure( 5331 "Not inserting runtime ptr check for divergent target", 5332 "runtime pointer checks needed. Not enabled for divergent target", 5333 "CantVersionLoopWithDivergentTarget", ORE, TheLoop); 5334 return FixedScalableVFPair::getNone(); 5335 } 5336 5337 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); 5338 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); 5339 if (TC == 1) { 5340 reportVectorizationFailure("Single iteration (non) loop", 5341 "loop trip count is one, irrelevant for vectorization", 5342 "SingleIterationLoop", ORE, TheLoop); 5343 return FixedScalableVFPair::getNone(); 5344 } 5345 5346 switch (ScalarEpilogueStatus) { 5347 case CM_ScalarEpilogueAllowed: 5348 return computeFeasibleMaxVF(TC, UserVF, false); 5349 case CM_ScalarEpilogueNotAllowedUsePredicate: 5350 LLVM_FALLTHROUGH; 5351 case CM_ScalarEpilogueNotNeededUsePredicate: 5352 LLVM_DEBUG( 5353 dbgs() << "LV: vector predicate hint/switch found.\n" 5354 << "LV: Not allowing scalar epilogue, creating predicated " 5355 << "vector loop.\n"); 5356 break; 5357 case CM_ScalarEpilogueNotAllowedLowTripLoop: 5358 // fallthrough as a special case of OptForSize 5359 case CM_ScalarEpilogueNotAllowedOptSize: 5360 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize) 5361 LLVM_DEBUG( 5362 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n"); 5363 else 5364 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip " 5365 << "count.\n"); 5366 5367 // Bail if runtime checks are required, which are not good when optimising 5368 // for size. 5369 if (runtimeChecksRequired()) 5370 return FixedScalableVFPair::getNone(); 5371 5372 break; 5373 } 5374 5375 // The only loops we can vectorize without a scalar epilogue, are loops with 5376 // a bottom-test and a single exiting block. We'd have to handle the fact 5377 // that not every instruction executes on the last iteration. This will 5378 // require a lane mask which varies through the vector loop body. (TODO) 5379 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) { 5380 // If there was a tail-folding hint/switch, but we can't fold the tail by 5381 // masking, fallback to a vectorization with a scalar epilogue. 
5382 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { 5383 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " 5384 "scalar epilogue instead.\n"); 5385 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 5386 return computeFeasibleMaxVF(TC, UserVF, false); 5387 } 5388 return FixedScalableVFPair::getNone(); 5389 } 5390 5391 // Now try the tail folding. 5392 5393 // Invalidate interleave groups that require an epilogue if we can't mask 5394 // the interleave-group. 5395 if (!useMaskedInterleavedAccesses(TTI)) { 5396 assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() && 5397 "No decisions should have been taken at this point"); 5398 // Note: There is no need to invalidate any cost modeling decisions here, as 5399 // none were taken so far. 5400 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue(); 5401 } 5402 5403 FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF, true); 5404 // Avoid tail folding if the trip count is known to be a multiple of any VF 5405 // we choose. 5406 // FIXME: The condition below pessimises the case for fixed-width vectors, 5407 // when scalable VFs are also candidates for vectorization. 5408 if (MaxFactors.FixedVF.isVector() && !MaxFactors.ScalableVF) { 5409 ElementCount MaxFixedVF = MaxFactors.FixedVF; 5410 assert((UserVF.isNonZero() || isPowerOf2_32(MaxFixedVF.getFixedValue())) && 5411 "MaxFixedVF must be a power of 2"); 5412 unsigned MaxVFtimesIC = UserIC ? MaxFixedVF.getFixedValue() * UserIC 5413 : MaxFixedVF.getFixedValue(); 5414 ScalarEvolution *SE = PSE.getSE(); 5415 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 5416 const SCEV *ExitCount = SE->getAddExpr( 5417 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 5418 const SCEV *Rem = SE->getURemExpr( 5419 SE->applyLoopGuards(ExitCount, TheLoop), 5420 SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC)); 5421 if (Rem->isZero()) { 5422 // Accept MaxFixedVF if we do not have a tail. 5423 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); 5424 return MaxFactors; 5425 } 5426 } 5427 5428 // For scalable vectors don't use tail folding for low trip counts or when 5429 // optimizing for code size. We only permit this if the user has explicitly 5430 // requested it. 5431 if (ScalarEpilogueStatus != CM_ScalarEpilogueNotNeededUsePredicate && 5432 ScalarEpilogueStatus != CM_ScalarEpilogueNotAllowedUsePredicate && 5433 MaxFactors.ScalableVF.isVector()) 5434 MaxFactors.ScalableVF = ElementCount::getScalable(0); 5435 5436 // If we don't know the precise trip count, or if the trip count that we 5437 // found modulo the vectorization factor is not zero, try to fold the tail 5438 // by masking. 5439 // FIXME: look for a smaller MaxVF that does divide TC rather than masking. 5440 if (Legal->prepareToFoldTailByMasking()) { 5441 FoldTailByMasking = true; 5442 return MaxFactors; 5443 } 5444 5445 // If there was a tail-folding hint/switch, but we can't fold the tail by 5446 // masking, fallback to a vectorization with a scalar epilogue.
5447 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { 5448 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " 5449 "scalar epilogue instead.\n"); 5450 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 5451 return MaxFactors; 5452 } 5453 5454 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) { 5455 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n"); 5456 return FixedScalableVFPair::getNone(); 5457 } 5458 5459 if (TC == 0) { 5460 reportVectorizationFailure( 5461 "Unable to calculate the loop count due to complex control flow", 5462 "unable to calculate the loop count due to complex control flow", 5463 "UnknownLoopCountComplexCFG", ORE, TheLoop); 5464 return FixedScalableVFPair::getNone(); 5465 } 5466 5467 reportVectorizationFailure( 5468 "Cannot optimize for size and vectorize at the same time.", 5469 "cannot optimize for size and vectorize at the same time. " 5470 "Enable vectorization of this loop with '#pragma clang loop " 5471 "vectorize(enable)' when compiling with -Os/-Oz", 5472 "NoTailLoopWithOptForSize", ORE, TheLoop); 5473 return FixedScalableVFPair::getNone(); 5474 } 5475 5476 ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( 5477 unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType, 5478 const ElementCount &MaxSafeVF, bool FoldTailByMasking) { 5479 bool ComputeScalableMaxVF = MaxSafeVF.isScalable(); 5480 TypeSize WidestRegister = TTI.getRegisterBitWidth( 5481 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector 5482 : TargetTransformInfo::RGK_FixedWidthVector); 5483 5484 // Convenience function to return the minimum of two ElementCounts. 5485 auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) { 5486 assert((LHS.isScalable() == RHS.isScalable()) && 5487 "Scalable flags must match"); 5488 return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS; 5489 }; 5490 5491 // Ensure MaxVF is a power of 2; the dependence distance bound may not be. 5492 // Note that both WidestRegister and WidestType may not be a powers of 2. 5493 auto MaxVectorElementCount = ElementCount::get( 5494 PowerOf2Floor(WidestRegister.getKnownMinSize() / WidestType), 5495 ComputeScalableMaxVF); 5496 MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF); 5497 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " 5498 << (MaxVectorElementCount * WidestType) << " bits.\n"); 5499 5500 if (!MaxVectorElementCount) { 5501 LLVM_DEBUG(dbgs() << "LV: The target has no " 5502 << (ComputeScalableMaxVF ? "scalable" : "fixed") 5503 << " vector registers.\n"); 5504 return ElementCount::getFixed(1); 5505 } 5506 5507 const auto TripCountEC = ElementCount::getFixed(ConstTripCount); 5508 if (ConstTripCount && 5509 ElementCount::isKnownLE(TripCountEC, MaxVectorElementCount) && 5510 (!FoldTailByMasking || isPowerOf2_32(ConstTripCount))) { 5511 // If loop trip count (TC) is known at compile time there is no point in 5512 // choosing VF greater than TC (as done in the loop below). Select maximum 5513 // power of two which doesn't exceed TC. 5514 // If MaxVectorElementCount is scalable, we only fall back on a fixed VF 5515 // when the TC is less than or equal to the known number of lanes. 
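    // For illustration (hypothetical numbers): with ConstTripCount = 20 and a
    // MaxVectorElementCount of 32, the clamp below yields PowerOf2Floor(20) =
    // 16, so a single VF=16 iteration covers 16 of the 20 iterations and the
    // remaining 4 are left to the scalar epilogue.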
5516 auto ClampedConstTripCount = PowerOf2Floor(ConstTripCount); 5517 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not " 5518 "exceeding the constant trip count: " 5519 << ClampedConstTripCount << "\n"); 5520 return ElementCount::getFixed(ClampedConstTripCount); 5521 } 5522 5523 ElementCount MaxVF = MaxVectorElementCount; 5524 if (TTI.shouldMaximizeVectorBandwidth() || 5525 (MaximizeBandwidth && isScalarEpilogueAllowed())) { 5526 auto MaxVectorElementCountMaxBW = ElementCount::get( 5527 PowerOf2Floor(WidestRegister.getKnownMinSize() / SmallestType), 5528 ComputeScalableMaxVF); 5529 MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF); 5530 5531 // Collect all viable vectorization factors larger than the default MaxVF 5532 // (i.e. MaxVectorElementCount). 5533 SmallVector<ElementCount, 8> VFs; 5534 for (ElementCount VS = MaxVectorElementCount * 2; 5535 ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2) 5536 VFs.push_back(VS); 5537 5538 // For each VF calculate its register usage. 5539 auto RUs = calculateRegisterUsage(VFs); 5540 5541 // Select the largest VF which doesn't require more registers than existing 5542 // ones. 5543 for (int i = RUs.size() - 1; i >= 0; --i) { 5544 bool Selected = true; 5545 for (auto &pair : RUs[i].MaxLocalUsers) { 5546 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5547 if (pair.second > TargetNumRegisters) 5548 Selected = false; 5549 } 5550 if (Selected) { 5551 MaxVF = VFs[i]; 5552 break; 5553 } 5554 } 5555 if (ElementCount MinVF = 5556 TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) { 5557 if (ElementCount::isKnownLT(MaxVF, MinVF)) { 5558 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF 5559 << ") with target's minimum: " << MinVF << '\n'); 5560 MaxVF = MinVF; 5561 } 5562 } 5563 } 5564 return MaxVF; 5565 } 5566 5567 Optional<unsigned> LoopVectorizationCostModel::getVScaleForTuning() const { 5568 if (TheFunction->hasFnAttribute(Attribute::VScaleRange)) { 5569 auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange); 5570 auto Min = Attr.getVScaleRangeMin(); 5571 auto Max = Attr.getVScaleRangeMax(); 5572 if (Max && Min == Max) 5573 return Max; 5574 } 5575 5576 return TTI.getVScaleForTuning(); 5577 } 5578 5579 bool LoopVectorizationCostModel::isMoreProfitable( 5580 const VectorizationFactor &A, const VectorizationFactor &B) const { 5581 InstructionCost CostA = A.Cost; 5582 InstructionCost CostB = B.Cost; 5583 5584 unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop); 5585 5586 if (!A.Width.isScalable() && !B.Width.isScalable() && FoldTailByMasking && 5587 MaxTripCount) { 5588 // If we are folding the tail and the trip count is a known (possibly small) 5589 // constant, the trip count will be rounded up to an integer number of 5590 // iterations. The total cost will be PerIterationCost*ceil(TripCount/VF), 5591 // which we compare directly. When not folding the tail, the total cost will 5592 // be PerIterationCost*floor(TC/VF) + Scalar remainder cost, and so is 5593 // approximated with the per-lane cost below instead of using the tripcount 5594 // as here. 5595 auto RTCostA = CostA * divideCeil(MaxTripCount, A.Width.getFixedValue()); 5596 auto RTCostB = CostB * divideCeil(MaxTripCount, B.Width.getFixedValue()); 5597 return RTCostA < RTCostB; 5598 } 5599 5600 // Improve estimate for the vector width if it is scalable. 
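  // E.g. (hypothetical values) for A.Width = vscale x 4 with a tuning vscale
  // of 2, EstimatedWidthA becomes 8 lanes, which is then compared against the
  // other candidate on a per-lane cost basis below.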
5601 unsigned EstimatedWidthA = A.Width.getKnownMinValue(); 5602 unsigned EstimatedWidthB = B.Width.getKnownMinValue(); 5603 if (Optional<unsigned> VScale = getVScaleForTuning()) { 5604 if (A.Width.isScalable()) 5605 EstimatedWidthA *= VScale.getValue(); 5606 if (B.Width.isScalable()) 5607 EstimatedWidthB *= VScale.getValue(); 5608 } 5609 5610 // Assume vscale may be larger than 1 (or the value being tuned for), 5611 // so that scalable vectorization is slightly favorable over fixed-width 5612 // vectorization. 5613 if (A.Width.isScalable() && !B.Width.isScalable()) 5614 return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA); 5615 5616 // To avoid the need for FP division: 5617 // (CostA / A.Width) < (CostB / B.Width) 5618 // <=> (CostA * B.Width) < (CostB * A.Width) 5619 return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA); 5620 } 5621 5622 VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor( 5623 const ElementCountSet &VFCandidates) { 5624 InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first; 5625 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n"); 5626 assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop"); 5627 assert(VFCandidates.count(ElementCount::getFixed(1)) && 5628 "Expected Scalar VF to be a candidate"); 5629 5630 const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost); 5631 VectorizationFactor ChosenFactor = ScalarCost; 5632 5633 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; 5634 if (ForceVectorization && VFCandidates.size() > 1) { 5635 // Ignore scalar width, because the user explicitly wants vectorization. 5636 // Initialize cost to max so that VF = 2 is, at least, chosen during cost 5637 // evaluation. 5638 ChosenFactor.Cost = InstructionCost::getMax(); 5639 } 5640 5641 SmallVector<InstructionVFPair> InvalidCosts; 5642 for (const auto &i : VFCandidates) { 5643 // The cost for scalar VF=1 is already calculated, so ignore it. 5644 if (i.isScalar()) 5645 continue; 5646 5647 VectorizationCostTy C = expectedCost(i, &InvalidCosts); 5648 VectorizationFactor Candidate(i, C.first); 5649 5650 #ifndef NDEBUG 5651 unsigned AssumedMinimumVscale = 1; 5652 if (Optional<unsigned> VScale = getVScaleForTuning()) 5653 AssumedMinimumVscale = VScale.getValue(); 5654 unsigned Width = 5655 Candidate.Width.isScalable() 5656 ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale 5657 : Candidate.Width.getFixedValue(); 5658 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i 5659 << " costs: " << (Candidate.Cost / Width)); 5660 if (i.isScalable()) 5661 LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of " 5662 << AssumedMinimumVscale << ")"); 5663 LLVM_DEBUG(dbgs() << ".\n"); 5664 #endif 5665 5666 if (!C.second && !ForceVectorization) { 5667 LLVM_DEBUG( 5668 dbgs() << "LV: Not considering vector loop of width " << i 5669 << " because it will not generate any vector instructions.\n"); 5670 continue; 5671 } 5672 5673 // If profitable add it to ProfitableVF list. 5674 if (isMoreProfitable(Candidate, ScalarCost)) 5675 ProfitableVFs.push_back(Candidate); 5676 5677 if (isMoreProfitable(Candidate, ChosenFactor)) 5678 ChosenFactor = Candidate; 5679 } 5680 5681 // Emit a report of VFs with invalid costs in the loop. 5682 if (!InvalidCosts.empty()) { 5683 // Group the remarks per instruction, keeping the instruction order from 5684 // InvalidCosts. 
5685 std::map<Instruction *, unsigned> Numbering; 5686 unsigned I = 0; 5687 for (auto &Pair : InvalidCosts) 5688 if (!Numbering.count(Pair.first)) 5689 Numbering[Pair.first] = I++; 5690 5691 // Sort the list, first on instruction(number) then on VF. 5692 llvm::sort(InvalidCosts, 5693 [&Numbering](InstructionVFPair &A, InstructionVFPair &B) { 5694 if (Numbering[A.first] != Numbering[B.first]) 5695 return Numbering[A.first] < Numbering[B.first]; 5696 ElementCountComparator ECC; 5697 return ECC(A.second, B.second); 5698 }); 5699 5700 // For a list of ordered instruction-vf pairs: 5701 // [(load, vf1), (load, vf2), (store, vf1)] 5702 // Group the instructions together to emit separate remarks for: 5703 // load (vf1, vf2) 5704 // store (vf1) 5705 auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts); 5706 auto Subset = ArrayRef<InstructionVFPair>(); 5707 do { 5708 if (Subset.empty()) 5709 Subset = Tail.take_front(1); 5710 5711 Instruction *I = Subset.front().first; 5712 5713 // If the next instruction is different, or if there are no other pairs, 5714 // emit a remark for the collated subset. e.g. 5715 // [(load, vf1), (load, vf2))] 5716 // to emit: 5717 // remark: invalid costs for 'load' at VF=(vf, vf2) 5718 if (Subset == Tail || Tail[Subset.size()].first != I) { 5719 std::string OutString; 5720 raw_string_ostream OS(OutString); 5721 assert(!Subset.empty() && "Unexpected empty range"); 5722 OS << "Instruction with invalid costs prevented vectorization at VF=("; 5723 for (auto &Pair : Subset) 5724 OS << (Pair.second == Subset.front().second ? "" : ", ") 5725 << Pair.second; 5726 OS << "):"; 5727 if (auto *CI = dyn_cast<CallInst>(I)) 5728 OS << " call to " << CI->getCalledFunction()->getName(); 5729 else 5730 OS << " " << I->getOpcodeName(); 5731 OS.flush(); 5732 reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I); 5733 Tail = Tail.drop_front(Subset.size()); 5734 Subset = {}; 5735 } else 5736 // Grow the subset by one element 5737 Subset = Tail.take_front(Subset.size() + 1); 5738 } while (!Tail.empty()); 5739 } 5740 5741 if (!EnableCondStoresVectorization && NumPredStores) { 5742 reportVectorizationFailure("There are conditional stores.", 5743 "store that is conditionally executed prevents vectorization", 5744 "ConditionalStore", ORE, TheLoop); 5745 ChosenFactor = ScalarCost; 5746 } 5747 5748 LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() && 5749 ChosenFactor.Cost >= ScalarCost.Cost) dbgs() 5750 << "LV: Vectorization seems to be not beneficial, " 5751 << "but was forced by a user.\n"); 5752 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n"); 5753 return ChosenFactor; 5754 } 5755 5756 bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization( 5757 const Loop &L, ElementCount VF) const { 5758 // Cross iteration phis such as reductions need special handling and are 5759 // currently unsupported. 5760 if (any_of(L.getHeader()->phis(), 5761 [&](PHINode &Phi) { return Legal->isFirstOrderRecurrence(&Phi); })) 5762 return false; 5763 5764 // Phis with uses outside of the loop require special handling and are 5765 // currently unsupported. 5766 for (auto &Entry : Legal->getInductionVars()) { 5767 // Look for uses of the value of the induction at the last iteration. 5768 Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch()); 5769 for (User *U : PostInc->users()) 5770 if (!L.contains(cast<Instruction>(U))) 5771 return false; 5772 // Look for uses of penultimate value of the induction. 
    for (User *U : Entry.first->users())
      if (!L.contains(cast<Instruction>(U)))
        return false;
  }

  // Induction variables that are widened require special handling that is
  // currently not supported.
  if (any_of(Legal->getInductionVars(), [&](auto &Entry) {
        return !(this->isScalarAfterVectorization(Entry.first, VF) ||
                 this->isProfitableToScalarize(Entry.first, VF));
      }))
    return false;

  // Epilogue vectorization code has not been audited to ensure it handles
  // non-latch exits properly. It may be fine, but it needs to be audited and
  // tested.
  if (L.getExitingBlock() != L.getLoopLatch())
    return false;

  return true;
}

bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
    const ElementCount VF) const {
  // FIXME: We need a much better cost-model to take different parameters such
  // as register pressure, code size increase and cost of extra branches into
  // account. For now we apply a very crude heuristic and only consider loops
  // with vectorization factors larger than a certain value.
  // We also consider epilogue vectorization unprofitable for targets that
  // don't consider interleaving beneficial (e.g. MVE).
  if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1)
    return false;
  // FIXME: We should consider changing the threshold for scalable
  // vectors to take VScaleForTuning into account.
  if (VF.getKnownMinValue() >= EpilogueVectorizationMinVF)
    return true;
  return false;
}

VectorizationFactor
LoopVectorizationCostModel::selectEpilogueVectorizationFactor(
    const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) {
  VectorizationFactor Result = VectorizationFactor::Disabled();
  if (!EnableEpilogueVectorization) {
    LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";);
    return Result;
  }

  if (!isScalarEpilogueAllowed()) {
    LLVM_DEBUG(
        dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is "
                  "allowed.\n";);
    return Result;
  }

  // Not really a cost consideration, but check for unsupported cases here to
  // simplify the logic.
  if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) {
    LLVM_DEBUG(
        dbgs() << "LEV: Unable to vectorize epilogue because the loop is "
                  "not a supported candidate.\n";);
    return Result;
  }

  if (EpilogueVectorizationForceVF > 1) {
    LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";);
    ElementCount ForcedEC =
        ElementCount::getFixed(EpilogueVectorizationForceVF);
    if (LVP.hasPlanWithVF(ForcedEC))
      return {ForcedEC, 0};
    else {
      LLVM_DEBUG(
          dbgs()
          << "LEV: Epilogue vectorization forced factor is not viable.\n";);
      return Result;
    }
  }

  if (TheLoop->getHeader()->getParent()->hasOptSize() ||
      TheLoop->getHeader()->getParent()->hasMinSize()) {
    LLVM_DEBUG(
        dbgs()
        << "LEV: Epilogue vectorization skipped due to opt for size.\n";);
    return Result;
  }

  if (!isEpilogueVectorizationProfitable(MainLoopVF)) {
    LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
                         "this loop\n");
    return Result;
  }

  // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know
  // the main loop handles 8 lanes per iteration.
We could still benefit from 5866 // vectorizing the epilogue loop with VF=4. 5867 ElementCount EstimatedRuntimeVF = MainLoopVF; 5868 if (MainLoopVF.isScalable()) { 5869 EstimatedRuntimeVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue()); 5870 if (Optional<unsigned> VScale = getVScaleForTuning()) 5871 EstimatedRuntimeVF *= VScale.getValue(); 5872 } 5873 5874 for (auto &NextVF : ProfitableVFs) 5875 if (((!NextVF.Width.isScalable() && MainLoopVF.isScalable() && 5876 ElementCount::isKnownLT(NextVF.Width, EstimatedRuntimeVF)) || 5877 ElementCount::isKnownLT(NextVF.Width, MainLoopVF)) && 5878 (Result.Width.isScalar() || isMoreProfitable(NextVF, Result)) && 5879 LVP.hasPlanWithVF(NextVF.Width)) 5880 Result = NextVF; 5881 5882 if (Result != VectorizationFactor::Disabled()) 5883 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = " 5884 << Result.Width << "\n";); 5885 return Result; 5886 } 5887 5888 std::pair<unsigned, unsigned> 5889 LoopVectorizationCostModel::getSmallestAndWidestTypes() { 5890 unsigned MinWidth = -1U; 5891 unsigned MaxWidth = 8; 5892 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 5893 // For in-loop reductions, no element types are added to ElementTypesInLoop 5894 // if there are no loads/stores in the loop. In this case, check through the 5895 // reduction variables to determine the maximum width. 5896 if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) { 5897 // Reset MaxWidth so that we can find the smallest type used by recurrences 5898 // in the loop. 5899 MaxWidth = -1U; 5900 for (auto &PhiDescriptorPair : Legal->getReductionVars()) { 5901 const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second; 5902 // When finding the min width used by the recurrence we need to account 5903 // for casts on the input operands of the recurrence. 5904 MaxWidth = std::min<unsigned>( 5905 MaxWidth, std::min<unsigned>( 5906 RdxDesc.getMinWidthCastToRecurrenceTypeInBits(), 5907 RdxDesc.getRecurrenceType()->getScalarSizeInBits())); 5908 } 5909 } else { 5910 for (Type *T : ElementTypesInLoop) { 5911 MinWidth = std::min<unsigned>( 5912 MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize()); 5913 MaxWidth = std::max<unsigned>( 5914 MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize()); 5915 } 5916 } 5917 return {MinWidth, MaxWidth}; 5918 } 5919 5920 void LoopVectorizationCostModel::collectElementTypesForWidening() { 5921 ElementTypesInLoop.clear(); 5922 // For each block. 5923 for (BasicBlock *BB : TheLoop->blocks()) { 5924 // For each instruction in the loop. 5925 for (Instruction &I : BB->instructionsWithoutDebug()) { 5926 Type *T = I.getType(); 5927 5928 // Skip ignored values. 5929 if (ValuesToIgnore.count(&I)) 5930 continue; 5931 5932 // Only examine Loads, Stores and PHINodes. 5933 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I)) 5934 continue; 5935 5936 // Examine PHI nodes that are reduction variables. Update the type to 5937 // account for the recurrence type. 5938 if (auto *PN = dyn_cast<PHINode>(&I)) { 5939 if (!Legal->isReductionVariable(PN)) 5940 continue; 5941 const RecurrenceDescriptor &RdxDesc = 5942 Legal->getReductionVars().find(PN)->second; 5943 if (PreferInLoopReductions || useOrderedReductions(RdxDesc) || 5944 TTI.preferInLoopReduction(RdxDesc.getOpcode(), 5945 RdxDesc.getRecurrenceType(), 5946 TargetTransformInfo::ReductionFlags())) 5947 continue; 5948 T = RdxDesc.getRecurrenceType(); 5949 } 5950 5951 // Examine the stored values. 
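      // E.g. for "store i8 %v, i8* %p" (illustrative IR), the element type of
      // interest is i8, the type of the stored value, not the pointer type.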
5952 if (auto *ST = dyn_cast<StoreInst>(&I)) 5953 T = ST->getValueOperand()->getType(); 5954 5955 assert(T->isSized() && 5956 "Expected the load/store/recurrence type to be sized"); 5957 5958 ElementTypesInLoop.insert(T); 5959 } 5960 } 5961 } 5962 5963 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, 5964 unsigned LoopCost) { 5965 // -- The interleave heuristics -- 5966 // We interleave the loop in order to expose ILP and reduce the loop overhead. 5967 // There are many micro-architectural considerations that we can't predict 5968 // at this level. For example, frontend pressure (on decode or fetch) due to 5969 // code size, or the number and capabilities of the execution ports. 5970 // 5971 // We use the following heuristics to select the interleave count: 5972 // 1. If the code has reductions, then we interleave to break the cross 5973 // iteration dependency. 5974 // 2. If the loop is really small, then we interleave to reduce the loop 5975 // overhead. 5976 // 3. We don't interleave if we think that we will spill registers to memory 5977 // due to the increased register pressure. 5978 5979 if (!isScalarEpilogueAllowed()) 5980 return 1; 5981 5982 // We used the distance for the interleave count. 5983 if (Legal->getMaxSafeDepDistBytes() != -1U) 5984 return 1; 5985 5986 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); 5987 const bool HasReductions = !Legal->getReductionVars().empty(); 5988 // Do not interleave loops with a relatively small known or estimated trip 5989 // count. But we will interleave when InterleaveSmallLoopScalarReduction is 5990 // enabled, and the code has scalar reductions(HasReductions && VF = 1), 5991 // because with the above conditions interleaving can expose ILP and break 5992 // cross iteration dependences for reductions. 5993 if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) && 5994 !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar())) 5995 return 1; 5996 5997 RegisterUsage R = calculateRegisterUsage({VF})[0]; 5998 // We divide by these constants so assume that we have at least one 5999 // instruction that uses at least one register. 6000 for (auto& pair : R.MaxLocalUsers) { 6001 pair.second = std::max(pair.second, 1U); 6002 } 6003 6004 // We calculate the interleave count using the following formula. 6005 // Subtract the number of loop invariants from the number of available 6006 // registers. These registers are used by all of the interleaved instances. 6007 // Next, divide the remaining registers by the number of registers that is 6008 // required by the loop, in order to estimate how many parallel instances 6009 // fit without causing spills. All of this is rounded down if necessary to be 6010 // a power of two. We want power of two interleave count to simplify any 6011 // addressing operations or alignment considerations. 6012 // We also want power of two interleave counts to ensure that the induction 6013 // variable of the vector loop wraps to zero, when tail is folded by masking; 6014 // this currently happens when OptForSize, in which case IC is set to 1 above. 
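  // For illustration (hypothetical numbers): with 32 registers in a class, 2
  // of them holding loop invariants and a maximum of 5 values live at once,
  // the estimate below is PowerOf2Floor((32 - 2) / 5) = PowerOf2Floor(6) = 4
  // interleaved instances (the EnableIndVarRegisterHeur path adjusts this
  // slightly).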
6015 unsigned IC = UINT_MAX; 6016 6017 for (auto& pair : R.MaxLocalUsers) { 6018 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 6019 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters 6020 << " registers of " 6021 << TTI.getRegisterClassName(pair.first) << " register class\n"); 6022 if (VF.isScalar()) { 6023 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) 6024 TargetNumRegisters = ForceTargetNumScalarRegs; 6025 } else { 6026 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) 6027 TargetNumRegisters = ForceTargetNumVectorRegs; 6028 } 6029 unsigned MaxLocalUsers = pair.second; 6030 unsigned LoopInvariantRegs = 0; 6031 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) 6032 LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; 6033 6034 unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers); 6035 // Don't count the induction variable as interleaved. 6036 if (EnableIndVarRegisterHeur) { 6037 TmpIC = 6038 PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) / 6039 std::max(1U, (MaxLocalUsers - 1))); 6040 } 6041 6042 IC = std::min(IC, TmpIC); 6043 } 6044 6045 // Clamp the interleave ranges to reasonable counts. 6046 unsigned MaxInterleaveCount = 6047 TTI.getMaxInterleaveFactor(VF.getKnownMinValue()); 6048 6049 // Check if the user has overridden the max. 6050 if (VF.isScalar()) { 6051 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) 6052 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; 6053 } else { 6054 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) 6055 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; 6056 } 6057 6058 // If trip count is known or estimated compile time constant, limit the 6059 // interleave count to be less than the trip count divided by VF, provided it 6060 // is at least 1. 6061 // 6062 // For scalable vectors we can't know if interleaving is beneficial. It may 6063 // not be beneficial for small loops if none of the lanes in the second vector 6064 // iterations is enabled. However, for larger loops, there is likely to be a 6065 // similar benefit as for fixed-width vectors. For now, we choose to leave 6066 // the InterleaveCount as if vscale is '1', although if some information about 6067 // the vector is known (e.g. min vector size), we can make a better decision. 6068 if (BestKnownTC) { 6069 MaxInterleaveCount = 6070 std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount); 6071 // Make sure MaxInterleaveCount is greater than 0. 6072 MaxInterleaveCount = std::max(1u, MaxInterleaveCount); 6073 } 6074 6075 assert(MaxInterleaveCount > 0 && 6076 "Maximum interleave count must be greater than 0"); 6077 6078 // Clamp the calculated IC to be between the 1 and the max interleave count 6079 // that the target and trip count allows. 6080 if (IC > MaxInterleaveCount) 6081 IC = MaxInterleaveCount; 6082 else 6083 // Make sure IC is greater than 0. 6084 IC = std::max(1u, IC); 6085 6086 assert(IC > 0 && "Interleave count must be greater than 0."); 6087 6088 // If we did not calculate the cost for VF (because the user selected the VF) 6089 // then we calculate the cost of VF here. 
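  // (A LoopCost of 0 acts as the "not yet computed" sentinel here; a real loop
  // cost is expected to be non-zero, as asserted below.)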
6090 if (LoopCost == 0) { 6091 InstructionCost C = expectedCost(VF).first; 6092 assert(C.isValid() && "Expected to have chosen a VF with valid cost"); 6093 LoopCost = *C.getValue(); 6094 } 6095 6096 assert(LoopCost && "Non-zero loop cost expected"); 6097 6098 // Interleave if we vectorized this loop and there is a reduction that could 6099 // benefit from interleaving. 6100 if (VF.isVector() && HasReductions) { 6101 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); 6102 return IC; 6103 } 6104 6105 // For any scalar loop that either requires runtime checks or predication we 6106 // are better off leaving this to the unroller. Note that if we've already 6107 // vectorized the loop we will have done the runtime check and so interleaving 6108 // won't require further checks. 6109 bool ScalarInterleavingRequiresPredication = 6110 (VF.isScalar() && any_of(TheLoop->blocks(), [this](BasicBlock *BB) { 6111 return Legal->blockNeedsPredication(BB); 6112 })); 6113 bool ScalarInterleavingRequiresRuntimePointerCheck = 6114 (VF.isScalar() && Legal->getRuntimePointerChecking()->Need); 6115 6116 // We want to interleave small loops in order to reduce the loop overhead and 6117 // potentially expose ILP opportunities. 6118 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n' 6119 << "LV: IC is " << IC << '\n' 6120 << "LV: VF is " << VF << '\n'); 6121 const bool AggressivelyInterleaveReductions = 6122 TTI.enableAggressiveInterleaving(HasReductions); 6123 if (!ScalarInterleavingRequiresRuntimePointerCheck && 6124 !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) { 6125 // We assume that the cost overhead is 1 and we use the cost model 6126 // to estimate the cost of the loop and interleave until the cost of the 6127 // loop overhead is about 5% of the cost of the loop. 6128 unsigned SmallIC = 6129 std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost)); 6130 6131 // Interleave until store/load ports (estimated by max interleave count) are 6132 // saturated. 6133 unsigned NumStores = Legal->getNumStores(); 6134 unsigned NumLoads = Legal->getNumLoads(); 6135 unsigned StoresIC = IC / (NumStores ? NumStores : 1); 6136 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1); 6137 6138 // There is little point in interleaving for reductions containing selects 6139 // and compares when VF=1 since it may just create more overhead than it's 6140 // worth for loops with small trip counts. This is because we still have to 6141 // do the final reduction after the loop. 6142 bool HasSelectCmpReductions = 6143 HasReductions && 6144 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 6145 const RecurrenceDescriptor &RdxDesc = Reduction.second; 6146 return RecurrenceDescriptor::isSelectCmpRecurrenceKind( 6147 RdxDesc.getRecurrenceKind()); 6148 }); 6149 if (HasSelectCmpReductions) { 6150 LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n"); 6151 return 1; 6152 } 6153 6154 // If we have a scalar reduction (vector reductions are already dealt with 6155 // by this point), we can increase the critical path length if the loop 6156 // we're interleaving is inside another loop. For tree-wise reductions 6157 // set the limit to 2, and for ordered reductions it's best to disable 6158 // interleaving entirely. 
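    // (Ordered reductions are detected below and return an interleave count of
    // 1; tree-wise reductions instead fall through to the
    // MaxNestedScalarReductionIC clamp.)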
    if (HasReductions && TheLoop->getLoopDepth() > 1) {
      bool HasOrderedReductions =
          any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
            const RecurrenceDescriptor &RdxDesc = Reduction.second;
            return RdxDesc.isOrdered();
          });
      if (HasOrderedReductions) {
        LLVM_DEBUG(
            dbgs() << "LV: Not interleaving scalar ordered reductions.\n");
        return 1;
      }

      unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
      SmallIC = std::min(SmallIC, F);
      StoresIC = std::min(StoresIC, F);
      LoadsIC = std::min(LoadsIC, F);
    }

    if (EnableLoadStoreRuntimeInterleave &&
        std::max(StoresIC, LoadsIC) > SmallIC) {
      LLVM_DEBUG(
          dbgs() << "LV: Interleaving to saturate store or load ports.\n");
      return std::max(StoresIC, LoadsIC);
    }

    // If there are scalar reductions and TTI has enabled aggressive
    // interleaving for reductions, we will interleave to expose ILP.
    if (InterleaveSmallLoopScalarReduction && VF.isScalar() &&
        AggressivelyInterleaveReductions) {
      LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
      // Interleave no less than SmallIC but not as aggressive as the normal IC
      // to satisfy the rare situation when resources are too limited.
      return std::max(IC / 2, SmallIC);
    } else {
      LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
      return SmallIC;
    }
  }

  // Interleave if this is a large loop (small loops are already dealt with by
  // this point) that could benefit from interleaving.
  if (AggressivelyInterleaveReductions) {
    LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
    return IC;
  }

  LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
  return 1;
}

SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
  // This function calculates the register usage by measuring the highest
  // number of values that are alive at a single location. Obviously, this is a
  // very rough estimation. We scan the loop in topological order and assign a
  // number to each instruction. We use RPO to ensure that defs are met before
  // their users. We assume that each instruction that has in-loop users starts
  // an interval. We record every time that an in-loop value is used, so we
  // have a list of the first and last occurrences of each instruction. Next,
  // we transpose this data structure into a multi map that holds the list of
  // intervals that *end* at a specific location. This multi map allows us to
  // perform a linear search. We scan the instructions linearly and record each
  // time that a new interval starts, by placing it in a set. If we find this
  // value in the multi-map then we remove it from the set. The max register
  // usage is the maximum size of the set. We also search for instructions that
  // are defined outside the loop, but are used inside the loop. We need this
  // number separately from the max-interval usage number because when we
  // unroll, loop-invariant values do not take more registers.
  LoopBlocksDFS DFS(TheLoop);
  DFS.perform(LI);

  RegisterUsage RU;

  // Each 'key' in the map opens a new interval. The values
  // of the map are the index of the 'last seen' usage of the
  // instruction that is the key.
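  // For example, a value defined at instruction #3 and last used at
  // instruction #7 is "open" (i.e. occupies a register) at every point in
  // between; for each register class, the maximum number of simultaneously
  // open values over all points is the pressure estimate.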
6236 using IntervalMap = DenseMap<Instruction *, unsigned>; 6237 6238 // Maps instruction to its index. 6239 SmallVector<Instruction *, 64> IdxToInstr; 6240 // Marks the end of each interval. 6241 IntervalMap EndPoint; 6242 // Saves the list of instruction indices that are used in the loop. 6243 SmallPtrSet<Instruction *, 8> Ends; 6244 // Saves the list of values that are used in the loop but are 6245 // defined outside the loop, such as arguments and constants. 6246 SmallPtrSet<Value *, 8> LoopInvariants; 6247 6248 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 6249 for (Instruction &I : BB->instructionsWithoutDebug()) { 6250 IdxToInstr.push_back(&I); 6251 6252 // Save the end location of each USE. 6253 for (Value *U : I.operands()) { 6254 auto *Instr = dyn_cast<Instruction>(U); 6255 6256 // Ignore non-instruction values such as arguments, constants, etc. 6257 if (!Instr) 6258 continue; 6259 6260 // If this instruction is outside the loop then record it and continue. 6261 if (!TheLoop->contains(Instr)) { 6262 LoopInvariants.insert(Instr); 6263 continue; 6264 } 6265 6266 // Overwrite previous end points. 6267 EndPoint[Instr] = IdxToInstr.size(); 6268 Ends.insert(Instr); 6269 } 6270 } 6271 } 6272 6273 // Saves the list of intervals that end with the index in 'key'. 6274 using InstrList = SmallVector<Instruction *, 2>; 6275 DenseMap<unsigned, InstrList> TransposeEnds; 6276 6277 // Transpose the EndPoints to a list of values that end at each index. 6278 for (auto &Interval : EndPoint) 6279 TransposeEnds[Interval.second].push_back(Interval.first); 6280 6281 SmallPtrSet<Instruction *, 8> OpenIntervals; 6282 SmallVector<RegisterUsage, 8> RUs(VFs.size()); 6283 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size()); 6284 6285 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); 6286 6287 // A lambda that gets the register usage for the given type and VF. 6288 const auto &TTICapture = TTI; 6289 auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned { 6290 if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty)) 6291 return 0; 6292 InstructionCost::CostType RegUsage = 6293 *TTICapture.getRegUsageForType(VectorType::get(Ty, VF)).getValue(); 6294 assert(RegUsage >= 0 && RegUsage <= std::numeric_limits<unsigned>::max() && 6295 "Nonsensical values for register usage."); 6296 return RegUsage; 6297 }; 6298 6299 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { 6300 Instruction *I = IdxToInstr[i]; 6301 6302 // Remove all of the instructions that end at this location. 6303 InstrList &List = TransposeEnds[i]; 6304 for (Instruction *ToRemove : List) 6305 OpenIntervals.erase(ToRemove); 6306 6307 // Ignore instructions that are never used within the loop. 6308 if (!Ends.count(I)) 6309 continue; 6310 6311 // Skip ignored values. 6312 if (ValuesToIgnore.count(I)) 6313 continue; 6314 6315 // For each VF find the maximum usage of registers. 6316 for (unsigned j = 0, e = VFs.size(); j < e; ++j) { 6317 // Count the number of live intervals. 6318 SmallMapVector<unsigned, unsigned, 4> RegUsage; 6319 6320 if (VFs[j].isScalar()) { 6321 for (auto Inst : OpenIntervals) { 6322 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6323 if (RegUsage.find(ClassID) == RegUsage.end()) 6324 RegUsage[ClassID] = 1; 6325 else 6326 RegUsage[ClassID] += 1; 6327 } 6328 } else { 6329 collectUniformsAndScalars(VFs[j]); 6330 for (auto Inst : OpenIntervals) { 6331 // Skip ignored values for VF > 1. 
6332 if (VecValuesToIgnore.count(Inst)) 6333 continue; 6334 if (isScalarAfterVectorization(Inst, VFs[j])) { 6335 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6336 if (RegUsage.find(ClassID) == RegUsage.end()) 6337 RegUsage[ClassID] = 1; 6338 else 6339 RegUsage[ClassID] += 1; 6340 } else { 6341 unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType()); 6342 if (RegUsage.find(ClassID) == RegUsage.end()) 6343 RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]); 6344 else 6345 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); 6346 } 6347 } 6348 } 6349 6350 for (auto& pair : RegUsage) { 6351 if (MaxUsages[j].find(pair.first) != MaxUsages[j].end()) 6352 MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second); 6353 else 6354 MaxUsages[j][pair.first] = pair.second; 6355 } 6356 } 6357 6358 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " 6359 << OpenIntervals.size() << '\n'); 6360 6361 // Add the current instruction to the list of open intervals. 6362 OpenIntervals.insert(I); 6363 } 6364 6365 for (unsigned i = 0, e = VFs.size(); i < e; ++i) { 6366 SmallMapVector<unsigned, unsigned, 4> Invariant; 6367 6368 for (auto Inst : LoopInvariants) { 6369 unsigned Usage = 6370 VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]); 6371 unsigned ClassID = 6372 TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType()); 6373 if (Invariant.find(ClassID) == Invariant.end()) 6374 Invariant[ClassID] = Usage; 6375 else 6376 Invariant[ClassID] += Usage; 6377 } 6378 6379 LLVM_DEBUG({ 6380 dbgs() << "LV(REG): VF = " << VFs[i] << '\n'; 6381 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size() 6382 << " item\n"; 6383 for (const auto &pair : MaxUsages[i]) { 6384 dbgs() << "LV(REG): RegisterClass: " 6385 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6386 << " registers\n"; 6387 } 6388 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size() 6389 << " item\n"; 6390 for (const auto &pair : Invariant) { 6391 dbgs() << "LV(REG): RegisterClass: " 6392 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6393 << " registers\n"; 6394 } 6395 }); 6396 6397 RU.LoopInvariantRegs = Invariant; 6398 RU.MaxLocalUsers = MaxUsages[i]; 6399 RUs[i] = RU; 6400 } 6401 6402 return RUs; 6403 } 6404 6405 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I, 6406 ElementCount VF) { 6407 // TODO: Cost model for emulated masked load/store is completely 6408 // broken. This hack guides the cost model to use an artificially 6409 // high enough value to practically disable vectorization with such 6410 // operations, except where previously deployed legality hack allowed 6411 // using very low cost values. This is to avoid regressions coming simply 6412 // from moving "masked load/store" check from legality to cost model. 6413 // Masked Load/Gather emulation was previously never allowed. 6414 // Limited number of Masked Store/Scatter emulation was allowed. 6415 assert(isPredicatedInst(I, VF) && "Expecting a scalar emulated instruction"); 6416 return isa<LoadInst>(I) || 6417 (isa<StoreInst>(I) && 6418 NumPredStores > NumberOfStoresToPredicate); 6419 } 6420 6421 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) { 6422 // If we aren't vectorizing the loop, or if we've already collected the 6423 // instructions to scalarize, there's nothing to do. 
Collection may already
  // have occurred if we have a user-selected VF and are now computing the
  // expected cost for interleaving.
  if (VF.isScalar() || VF.isZero() ||
      InstsToScalarize.find(VF) != InstsToScalarize.end())
    return;

  // Initialize a mapping for VF in InstsToScalarize. If we find that it's
  // not profitable to scalarize any instructions, the presence of VF in the
  // map will indicate that we've analyzed it already.
  ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];

  // Find all the instructions that are scalar with predication in the loop and
  // determine if it would be better to not if-convert the blocks they are in.
  // If so, we also record the instructions to scalarize.
  for (BasicBlock *BB : TheLoop->blocks()) {
    if (!blockNeedsPredicationForAnyReason(BB))
      continue;
    for (Instruction &I : *BB)
      if (isScalarWithPredication(&I, VF)) {
        ScalarCostsTy ScalarCosts;
        // Do not apply discount if scalable, because that would lead to
        // invalid scalarization costs.
        // Do not apply discount logic if hacked cost is needed
        // for emulated masked memrefs.
        if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I, VF) &&
            computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
          ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
        // Remember that BB will remain after vectorization.
        PredicatedBBsAfterVectorization.insert(BB);
      }
  }
}

int LoopVectorizationCostModel::computePredInstDiscount(
    Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
  assert(!isUniformAfterVectorization(PredInst, VF) &&
         "Instruction marked uniform-after-vectorization will be predicated");

  // Initialize the discount to zero, meaning that the scalar version and the
  // vector version cost the same.
  InstructionCost Discount = 0;

  // Holds instructions to analyze. The instructions we visit are mapped in
  // ScalarCosts. Those instructions are the ones that would be scalarized if
  // we find that the scalar version costs less.
  SmallVector<Instruction *, 8> Worklist;

  // Returns true if the given instruction can be scalarized.
  auto canBeScalarized = [&](Instruction *I) -> bool {
    // We only attempt to scalarize instructions forming a single-use chain
    // from the original predicated block that would otherwise be vectorized.
    // Although not strictly necessary, we give up on instructions we know will
    // already be scalar to avoid traversing chains that are unlikely to be
    // beneficial.
    if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
        isScalarAfterVectorization(I, VF))
      return false;

    // If the instruction is scalar with predication, it will be analyzed
    // separately. We ignore it within the context of PredInst.
    if (isScalarWithPredication(I, VF))
      return false;

    // If any of the instruction's operands are uniform after vectorization,
    // the instruction cannot be scalarized. This prevents, for example, a
    // masked load from being scalarized.
    //
    // We assume we will only emit a value for lane zero of an instruction
    // marked uniform after vectorization, rather than VF identical values.
    // Thus, if we scalarize an instruction that uses a uniform, we would
    // create uses of values corresponding to the lanes we aren't emitting code
    // for.
This behavior can be changed by allowing getScalarValue to clone 6496 // the lane zero values for uniforms rather than asserting. 6497 for (Use &U : I->operands()) 6498 if (auto *J = dyn_cast<Instruction>(U.get())) 6499 if (isUniformAfterVectorization(J, VF)) 6500 return false; 6501 6502 // Otherwise, we can scalarize the instruction. 6503 return true; 6504 }; 6505 6506 // Compute the expected cost discount from scalarizing the entire expression 6507 // feeding the predicated instruction. We currently only consider expressions 6508 // that are single-use instruction chains. 6509 Worklist.push_back(PredInst); 6510 while (!Worklist.empty()) { 6511 Instruction *I = Worklist.pop_back_val(); 6512 6513 // If we've already analyzed the instruction, there's nothing to do. 6514 if (ScalarCosts.find(I) != ScalarCosts.end()) 6515 continue; 6516 6517 // Compute the cost of the vector instruction. Note that this cost already 6518 // includes the scalarization overhead of the predicated instruction. 6519 InstructionCost VectorCost = getInstructionCost(I, VF).first; 6520 6521 // Compute the cost of the scalarized instruction. This cost is the cost of 6522 // the instruction as if it wasn't if-converted and instead remained in the 6523 // predicated block. We will scale this cost by block probability after 6524 // computing the scalarization overhead. 6525 InstructionCost ScalarCost = 6526 VF.getFixedValue() * 6527 getInstructionCost(I, ElementCount::getFixed(1)).first; 6528 6529 // Compute the scalarization overhead of needed insertelement instructions 6530 // and phi nodes. 6531 if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) { 6532 ScalarCost += TTI.getScalarizationOverhead( 6533 cast<VectorType>(ToVectorTy(I->getType(), VF)), 6534 APInt::getAllOnes(VF.getFixedValue()), true, false); 6535 ScalarCost += 6536 VF.getFixedValue() * 6537 TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput); 6538 } 6539 6540 // Compute the scalarization overhead of needed extractelement 6541 // instructions. For each of the instruction's operands, if the operand can 6542 // be scalarized, add it to the worklist; otherwise, account for the 6543 // overhead. 6544 for (Use &U : I->operands()) 6545 if (auto *J = dyn_cast<Instruction>(U.get())) { 6546 assert(VectorType::isValidElementType(J->getType()) && 6547 "Instruction has non-scalar type"); 6548 if (canBeScalarized(J)) 6549 Worklist.push_back(J); 6550 else if (needsExtract(J, VF)) { 6551 ScalarCost += TTI.getScalarizationOverhead( 6552 cast<VectorType>(ToVectorTy(J->getType(), VF)), 6553 APInt::getAllOnes(VF.getFixedValue()), false, true); 6554 } 6555 } 6556 6557 // Scale the total scalar cost by block probability. 6558 ScalarCost /= getReciprocalPredBlockProb(); 6559 6560 // Compute the discount. A non-negative discount means the vector version 6561 // of the instruction costs more, and scalarizing would be beneficial. 6562 Discount += VectorCost - ScalarCost; 6563 ScalarCosts[I] = ScalarCost; 6564 } 6565 6566 return *Discount.getValue(); 6567 } 6568 6569 LoopVectorizationCostModel::VectorizationCostTy 6570 LoopVectorizationCostModel::expectedCost( 6571 ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) { 6572 VectorizationCostTy Cost; 6573 6574 // For each block. 6575 for (BasicBlock *BB : TheLoop->blocks()) { 6576 VectorizationCostTy BlockCost; 6577 6578 // For each instruction in the old loop. 6579 for (Instruction &I : BB->instructionsWithoutDebug()) { 6580 // Skip ignored values. 
6581 if (ValuesToIgnore.count(&I) || 6582 (VF.isVector() && VecValuesToIgnore.count(&I))) 6583 continue; 6584 6585 VectorizationCostTy C = getInstructionCost(&I, VF); 6586 6587 // Check if we should override the cost. 6588 if (C.first.isValid() && 6589 ForceTargetInstructionCost.getNumOccurrences() > 0) 6590 C.first = InstructionCost(ForceTargetInstructionCost); 6591 6592 // Keep a list of instructions with invalid costs. 6593 if (Invalid && !C.first.isValid()) 6594 Invalid->emplace_back(&I, VF); 6595 6596 BlockCost.first += C.first; 6597 BlockCost.second |= C.second; 6598 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first 6599 << " for VF " << VF << " For instruction: " << I 6600 << '\n'); 6601 } 6602 6603 // If we are vectorizing a predicated block, it will have been 6604 // if-converted. This means that the block's instructions (aside from 6605 // stores and instructions that may divide by zero) will now be 6606 // unconditionally executed. For the scalar case, we may not always execute 6607 // the predicated block, if it is an if-else block. Thus, scale the block's 6608 // cost by the probability of executing it. blockNeedsPredication from 6609 // Legal is used so as to not include all blocks in tail folded loops. 6610 if (VF.isScalar() && Legal->blockNeedsPredication(BB)) 6611 BlockCost.first /= getReciprocalPredBlockProb(); 6612 6613 Cost.first += BlockCost.first; 6614 Cost.second |= BlockCost.second; 6615 } 6616 6617 return Cost; 6618 } 6619 6620 /// Gets Address Access SCEV after verifying that the access pattern 6621 /// is loop invariant except the induction variable dependence. 6622 /// 6623 /// This SCEV can be sent to the Target in order to estimate the address 6624 /// calculation cost. 6625 static const SCEV *getAddressAccessSCEV( 6626 Value *Ptr, 6627 LoopVectorizationLegality *Legal, 6628 PredicatedScalarEvolution &PSE, 6629 const Loop *TheLoop) { 6630 6631 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); 6632 if (!Gep) 6633 return nullptr; 6634 6635 // We are looking for a gep with all loop invariant indices except for one 6636 // which should be an induction variable. 6637 auto SE = PSE.getSE(); 6638 unsigned NumOperands = Gep->getNumOperands(); 6639 for (unsigned i = 1; i < NumOperands; ++i) { 6640 Value *Opd = Gep->getOperand(i); 6641 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 6642 !Legal->isInductionVariable(Opd)) 6643 return nullptr; 6644 } 6645 6646 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. 6647 return PSE.getSCEV(Ptr); 6648 } 6649 6650 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { 6651 return Legal->hasStride(I->getOperand(0)) || 6652 Legal->hasStride(I->getOperand(1)); 6653 } 6654 6655 InstructionCost 6656 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, 6657 ElementCount VF) { 6658 assert(VF.isVector() && 6659 "Scalarization cost of instruction implies vectorization."); 6660 if (VF.isScalable()) 6661 return InstructionCost::getInvalid(); 6662 6663 Type *ValTy = getLoadStoreType(I); 6664 auto SE = PSE.getSE(); 6665 6666 unsigned AS = getLoadStoreAddressSpace(I); 6667 Value *Ptr = getLoadStorePointerOperand(I); 6668 Type *PtrTy = ToVectorTy(Ptr->getType(), VF); 6669 // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost` 6670 // that it is being called from this specific place. 
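  // E.g. for a GEP such as "getelementptr float, float* %base, i64 %iv"
  // (illustrative IR) where %base is loop-invariant and %iv is an induction
  // variable, the getAddressAccessSCEV call below returns the pointer SCEV so
  // the target can price the per-lane address computation of the strided
  // access.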
6671 6672 // Figure out whether the access is strided and get the stride value 6673 // if it's known in compile time 6674 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); 6675 6676 // Get the cost of the scalar memory instruction and address computation. 6677 InstructionCost Cost = 6678 VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); 6679 6680 // Don't pass *I here, since it is scalar but will actually be part of a 6681 // vectorized loop where the user of it is a vectorized instruction. 6682 const Align Alignment = getLoadStoreAlignment(I); 6683 Cost += VF.getKnownMinValue() * 6684 TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, 6685 AS, TTI::TCK_RecipThroughput); 6686 6687 // Get the overhead of the extractelement and insertelement instructions 6688 // we might create due to scalarization. 6689 Cost += getScalarizationOverhead(I, VF); 6690 6691 // If we have a predicated load/store, it will need extra i1 extracts and 6692 // conditional branches, but may not be executed for each vector lane. Scale 6693 // the cost by the probability of executing the predicated block. 6694 if (isPredicatedInst(I, VF)) { 6695 Cost /= getReciprocalPredBlockProb(); 6696 6697 // Add the cost of an i1 extract and a branch 6698 auto *Vec_i1Ty = 6699 VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF); 6700 Cost += TTI.getScalarizationOverhead( 6701 Vec_i1Ty, APInt::getAllOnes(VF.getKnownMinValue()), 6702 /*Insert=*/false, /*Extract=*/true); 6703 Cost += TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput); 6704 6705 if (useEmulatedMaskMemRefHack(I, VF)) 6706 // Artificially setting to a high enough value to practically disable 6707 // vectorization with such operations. 6708 Cost = 3000000; 6709 } 6710 6711 return Cost; 6712 } 6713 6714 InstructionCost 6715 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, 6716 ElementCount VF) { 6717 Type *ValTy = getLoadStoreType(I); 6718 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6719 Value *Ptr = getLoadStorePointerOperand(I); 6720 unsigned AS = getLoadStoreAddressSpace(I); 6721 int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr); 6722 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6723 6724 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6725 "Stride should be 1 or -1 for consecutive memory access"); 6726 const Align Alignment = getLoadStoreAlignment(I); 6727 InstructionCost Cost = 0; 6728 if (Legal->isMaskRequired(I)) 6729 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6730 CostKind); 6731 else 6732 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6733 CostKind, I); 6734 6735 bool Reverse = ConsecutiveStride < 0; 6736 if (Reverse) 6737 Cost += 6738 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0); 6739 return Cost; 6740 } 6741 6742 InstructionCost 6743 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, 6744 ElementCount VF) { 6745 assert(Legal->isUniformMemOp(*I)); 6746 6747 Type *ValTy = getLoadStoreType(I); 6748 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6749 const Align Alignment = getLoadStoreAlignment(I); 6750 unsigned AS = getLoadStoreAddressSpace(I); 6751 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6752 if (isa<LoadInst>(I)) { 6753 return TTI.getAddressComputationCost(ValTy) + 6754 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS, 6755 CostKind) + 6756 
TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); 6757 } 6758 StoreInst *SI = cast<StoreInst>(I); 6759 6760 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); 6761 return TTI.getAddressComputationCost(ValTy) + 6762 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, 6763 CostKind) + 6764 (isLoopInvariantStoreValue 6765 ? 0 6766 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, 6767 VF.getKnownMinValue() - 1)); 6768 } 6769 6770 InstructionCost 6771 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, 6772 ElementCount VF) { 6773 Type *ValTy = getLoadStoreType(I); 6774 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6775 const Align Alignment = getLoadStoreAlignment(I); 6776 const Value *Ptr = getLoadStorePointerOperand(I); 6777 6778 return TTI.getAddressComputationCost(VectorTy) + 6779 TTI.getGatherScatterOpCost( 6780 I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment, 6781 TargetTransformInfo::TCK_RecipThroughput, I); 6782 } 6783 6784 InstructionCost 6785 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, 6786 ElementCount VF) { 6787 // TODO: Once we have support for interleaving with scalable vectors 6788 // we can calculate the cost properly here. 6789 if (VF.isScalable()) 6790 return InstructionCost::getInvalid(); 6791 6792 Type *ValTy = getLoadStoreType(I); 6793 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6794 unsigned AS = getLoadStoreAddressSpace(I); 6795 6796 auto Group = getInterleavedAccessGroup(I); 6797 assert(Group && "Fail to get an interleaved access group."); 6798 6799 unsigned InterleaveFactor = Group->getFactor(); 6800 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); 6801 6802 // Holds the indices of existing members in the interleaved group. 6803 SmallVector<unsigned, 4> Indices; 6804 for (unsigned IF = 0; IF < InterleaveFactor; IF++) 6805 if (Group->getMember(IF)) 6806 Indices.push_back(IF); 6807 6808 // Calculate the cost of the whole interleaved group. 6809 bool UseMaskForGaps = 6810 (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) || 6811 (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor())); 6812 InstructionCost Cost = TTI.getInterleavedMemoryOpCost( 6813 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(), 6814 AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps); 6815 6816 if (Group->isReverse()) { 6817 // TODO: Add support for reversed masked interleaved access. 6818 assert(!Legal->isMaskRequired(I) && 6819 "Reverse masked interleaved access not supported."); 6820 Cost += 6821 Group->getNumMembers() * 6822 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0); 6823 } 6824 return Cost; 6825 } 6826 6827 Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost( 6828 Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) { 6829 using namespace llvm::PatternMatch; 6830 // Early exit for no inloop reductions 6831 if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty)) 6832 return None; 6833 auto *VectorTy = cast<VectorType>(Ty); 6834 6835 // We are looking for a pattern of, and finding the minimal acceptable cost: 6836 // reduce(mul(ext(A), ext(B))) or 6837 // reduce(mul(A, B)) or 6838 // reduce(ext(A)) or 6839 // reduce(A). 6840 // The basic idea is that we walk down the tree to do that, finding the root 6841 // reduction instruction in InLoopReductionImmediateChains. 
From there we find 6842 // the pattern of mul/ext and test the cost of the entire pattern vs the cost 6843 // of the components. If the reduction cost is lower, then we return it for the 6844 // reduction instruction and 0 for the other instructions in the pattern. If 6845 // it is not, we return an invalid cost specifying the original cost method 6846 // should be used. 6847 Instruction *RetI = I; 6848 if (match(RetI, m_ZExtOrSExt(m_Value()))) { 6849 if (!RetI->hasOneUser()) 6850 return None; 6851 RetI = RetI->user_back(); 6852 } 6853 if (match(RetI, m_Mul(m_Value(), m_Value())) && 6854 RetI->user_back()->getOpcode() == Instruction::Add) { 6855 if (!RetI->hasOneUser()) 6856 return None; 6857 RetI = RetI->user_back(); 6858 } 6859 6860 // Test if the found instruction is a reduction, and if not return an invalid 6861 // cost specifying the parent to use the original cost modelling. 6862 if (!InLoopReductionImmediateChains.count(RetI)) 6863 return None; 6864 6865 // Find the reduction this chain is a part of and calculate the basic cost of 6866 // the reduction on its own. 6867 Instruction *LastChain = InLoopReductionImmediateChains[RetI]; 6868 Instruction *ReductionPhi = LastChain; 6869 while (!isa<PHINode>(ReductionPhi)) 6870 ReductionPhi = InLoopReductionImmediateChains[ReductionPhi]; 6871 6872 const RecurrenceDescriptor &RdxDesc = 6873 Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second; 6874 6875 InstructionCost BaseCost = TTI.getArithmeticReductionCost( 6876 RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind); 6877 6878 // For a call to the llvm.fmuladd intrinsic we need to add the cost of a 6879 // normal fmul instruction to the cost of the fadd reduction. 6880 if (RdxDesc.getRecurrenceKind() == RecurKind::FMulAdd) 6881 BaseCost += 6882 TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind); 6883 6884 // If we're using ordered reductions then we can just return the base cost 6885 // here, since getArithmeticReductionCost calculates the full ordered 6886 // reduction cost when FP reassociation is not allowed. 6887 if (useOrderedReductions(RdxDesc)) 6888 return BaseCost; 6889 6890 // Get the operand that was not the reduction chain and match it to one of the 6891 // patterns, returning the better cost if it is found. 6892 Instruction *RedOp = RetI->getOperand(1) == LastChain 6893 ? dyn_cast<Instruction>(RetI->getOperand(0)) 6894 : dyn_cast<Instruction>(RetI->getOperand(1)); 6895 6896 VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy); 6897 6898 Instruction *Op0, *Op1; 6899 if (RedOp && 6900 match(RedOp, 6901 m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) && 6902 match(Op0, m_ZExtOrSExt(m_Value())) && 6903 Op0->getOpcode() == Op1->getOpcode() && 6904 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() && 6905 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) && 6906 (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) { 6907 6908 // Matched reduce(ext(mul(ext(A), ext(B)))) 6909 // Note that the extend opcodes need to all match, or if A==B they will have 6910 // been converted to zext(mul(sext(A), sext(A))) as it is known positive, 6911 // which is equally fine.
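    // As an illustrative sketch (types chosen arbitrarily), the matched IR
    // could look like:
    //   %a.ext   = sext i8 %a to i16
    //   %b.ext   = sext i8 %b to i16
    //   %mul     = mul i16 %a.ext, %b.ext
    //   %mul.ext = sext i16 %mul to i32
    //   %red     = add i32 %red.phi, %mul.ext
    // Targets with multiply-accumulate reduction support (e.g. dot-product
    // style instructions) may report an extended-add reduction cost below
    // ExtCost * 2 + MulCost + Ext2Cost + BaseCost, in which case the whole
    // pattern is costed once at the reduction instruction and the remaining
    // instructions in the pattern get a cost of 0.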
6912 bool IsUnsigned = isa<ZExtInst>(Op0); 6913 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy); 6914 auto *MulType = VectorType::get(Op0->getType(), VectorTy); 6915 6916 InstructionCost ExtCost = 6917 TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType, 6918 TTI::CastContextHint::None, CostKind, Op0); 6919 InstructionCost MulCost = 6920 TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind); 6921 InstructionCost Ext2Cost = 6922 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType, 6923 TTI::CastContextHint::None, CostKind, RedOp); 6924 6925 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 6926 /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 6927 CostKind); 6928 6929 if (RedCost.isValid() && 6930 RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost) 6931 return I == RetI ? RedCost : 0; 6932 } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) && 6933 !TheLoop->isLoopInvariant(RedOp)) { 6934 // Matched reduce(ext(A)) 6935 bool IsUnsigned = isa<ZExtInst>(RedOp); 6936 auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy); 6937 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 6938 /*IsMLA=*/false, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 6939 CostKind); 6940 6941 InstructionCost ExtCost = 6942 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType, 6943 TTI::CastContextHint::None, CostKind, RedOp); 6944 if (RedCost.isValid() && RedCost < BaseCost + ExtCost) 6945 return I == RetI ? RedCost : 0; 6946 } else if (RedOp && 6947 match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) { 6948 if (match(Op0, m_ZExtOrSExt(m_Value())) && 6949 Op0->getOpcode() == Op1->getOpcode() && 6950 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) { 6951 bool IsUnsigned = isa<ZExtInst>(Op0); 6952 Type *Op0Ty = Op0->getOperand(0)->getType(); 6953 Type *Op1Ty = Op1->getOperand(0)->getType(); 6954 Type *LargestOpTy = 6955 Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty 6956 : Op0Ty; 6957 auto *ExtType = VectorType::get(LargestOpTy, VectorTy); 6958 6959 // Matched reduce(mul(ext(A), ext(B))), where the two ext may be of 6960 // different sizes. We take the largest type as the ext to reduce, and add 6961 // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))). 6962 InstructionCost ExtCost0 = TTI.getCastInstrCost( 6963 Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy), 6964 TTI::CastContextHint::None, CostKind, Op0); 6965 InstructionCost ExtCost1 = TTI.getCastInstrCost( 6966 Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy), 6967 TTI::CastContextHint::None, CostKind, Op1); 6968 InstructionCost MulCost = 6969 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 6970 6971 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 6972 /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 6973 CostKind); 6974 InstructionCost ExtraExtCost = 0; 6975 if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) { 6976 Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1; 6977 ExtraExtCost = TTI.getCastInstrCost( 6978 ExtraExtOp->getOpcode(), ExtType, 6979 VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy), 6980 TTI::CastContextHint::None, CostKind, ExtraExtOp); 6981 } 6982 6983 if (RedCost.isValid() && 6984 (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost)) 6985 return I == RetI ? 
RedCost : 0; 6986 } else if (!match(I, m_ZExtOrSExt(m_Value()))) { 6987 // Matched reduce(mul()) 6988 InstructionCost MulCost = 6989 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 6990 6991 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 6992 /*IsMLA=*/true, true, RdxDesc.getRecurrenceType(), VectorTy, 6993 CostKind); 6994 6995 if (RedCost.isValid() && RedCost < MulCost + BaseCost) 6996 return I == RetI ? RedCost : 0; 6997 } 6998 } 6999 7000 return I == RetI ? Optional<InstructionCost>(BaseCost) : None; 7001 } 7002 7003 InstructionCost 7004 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, 7005 ElementCount VF) { 7006 // Calculate scalar cost only. Vectorization cost should be ready at this 7007 // moment. 7008 if (VF.isScalar()) { 7009 Type *ValTy = getLoadStoreType(I); 7010 const Align Alignment = getLoadStoreAlignment(I); 7011 unsigned AS = getLoadStoreAddressSpace(I); 7012 7013 return TTI.getAddressComputationCost(ValTy) + 7014 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, 7015 TTI::TCK_RecipThroughput, I); 7016 } 7017 return getWideningCost(I, VF); 7018 } 7019 7020 LoopVectorizationCostModel::VectorizationCostTy 7021 LoopVectorizationCostModel::getInstructionCost(Instruction *I, 7022 ElementCount VF) { 7023 // If we know that this instruction will remain uniform, check the cost of 7024 // the scalar version. 7025 if (isUniformAfterVectorization(I, VF)) 7026 VF = ElementCount::getFixed(1); 7027 7028 if (VF.isVector() && isProfitableToScalarize(I, VF)) 7029 return VectorizationCostTy(InstsToScalarize[VF][I], false); 7030 7031 // Forced scalars do not have any scalarization overhead. 7032 auto ForcedScalar = ForcedScalars.find(VF); 7033 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) { 7034 auto InstSet = ForcedScalar->second; 7035 if (InstSet.count(I)) 7036 return VectorizationCostTy( 7037 (getInstructionCost(I, ElementCount::getFixed(1)).first * 7038 VF.getKnownMinValue()), 7039 false); 7040 } 7041 7042 Type *VectorTy; 7043 InstructionCost C = getInstructionCost(I, VF, VectorTy); 7044 7045 bool TypeNotScalarized = false; 7046 if (VF.isVector() && VectorTy->isVectorTy()) { 7047 unsigned NumParts = TTI.getNumberOfParts(VectorTy); 7048 if (NumParts) 7049 TypeNotScalarized = NumParts < VF.getKnownMinValue(); 7050 else 7051 C = InstructionCost::getInvalid(); 7052 } 7053 return VectorizationCostTy(C, TypeNotScalarized); 7054 } 7055 7056 InstructionCost 7057 LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, 7058 ElementCount VF) const { 7059 7060 // There is no mechanism yet to create a scalable scalarization loop, 7061 // so this is currently Invalid. 7062 if (VF.isScalable()) 7063 return InstructionCost::getInvalid(); 7064 7065 if (VF.isScalar()) 7066 return 0; 7067 7068 InstructionCost Cost = 0; 7069 Type *RetTy = ToVectorTy(I->getType(), VF); 7070 if (!RetTy->isVoidTy() && 7071 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) 7072 Cost += TTI.getScalarizationOverhead( 7073 cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()), true, 7074 false); 7075 7076 // Some targets keep addresses scalar. 7077 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) 7078 return Cost; 7079 7080 // Some targets support efficient element stores. 7081 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore()) 7082 return Cost; 7083 7084 // Collect operands to consider. 7085 CallInst *CI = dyn_cast<CallInst>(I); 7086 Instruction::op_range Ops = CI ? 
CI->args() : I->operands(); 7087 7088 // Skip operands that do not require extraction/scalarization and do not incur 7089 // any overhead. 7090 SmallVector<Type *> Tys; 7091 for (auto *V : filterExtractingOperands(Ops, VF)) 7092 Tys.push_back(MaybeVectorizeType(V->getType(), VF)); 7093 return Cost + TTI.getOperandsScalarizationOverhead( 7094 filterExtractingOperands(Ops, VF), Tys); 7095 } 7096 7097 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { 7098 if (VF.isScalar()) 7099 return; 7100 NumPredStores = 0; 7101 for (BasicBlock *BB : TheLoop->blocks()) { 7102 // For each instruction in the old loop. 7103 for (Instruction &I : *BB) { 7104 Value *Ptr = getLoadStorePointerOperand(&I); 7105 if (!Ptr) 7106 continue; 7107 7108 // TODO: We should generate better code and update the cost model for 7109 // predicated uniform stores. Today they are treated as any other 7110 // predicated store (see added test cases in 7111 // invariant-store-vectorization.ll). 7112 if (isa<StoreInst>(&I) && isScalarWithPredication(&I, VF)) 7113 NumPredStores++; 7114 7115 if (Legal->isUniformMemOp(I)) { 7116 // TODO: Avoid replicating loads and stores instead of 7117 // relying on instcombine to remove them. 7118 // Load: Scalar load + broadcast 7119 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract 7120 InstructionCost Cost; 7121 if (isa<StoreInst>(&I) && VF.isScalable() && 7122 isLegalGatherOrScatter(&I, VF)) { 7123 Cost = getGatherScatterCost(&I, VF); 7124 setWideningDecision(&I, VF, CM_GatherScatter, Cost); 7125 } else { 7126 assert((isa<LoadInst>(&I) || !VF.isScalable()) && 7127 "Cannot yet scalarize uniform stores"); 7128 Cost = getUniformMemOpCost(&I, VF); 7129 setWideningDecision(&I, VF, CM_Scalarize, Cost); 7130 } 7131 continue; 7132 } 7133 7134 // We assume that widening is the best solution when possible. 7135 if (memoryInstructionCanBeWidened(&I, VF)) { 7136 InstructionCost Cost = getConsecutiveMemOpCost(&I, VF); 7137 int ConsecutiveStride = Legal->isConsecutivePtr( 7138 getLoadStoreType(&I), getLoadStorePointerOperand(&I)); 7139 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 7140 "Expected consecutive stride."); 7141 InstWidening Decision = 7142 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; 7143 setWideningDecision(&I, VF, Decision, Cost); 7144 continue; 7145 } 7146 7147 // Choose between Interleaving, Gather/Scatter or Scalarization. 7148 InstructionCost InterleaveCost = InstructionCost::getInvalid(); 7149 unsigned NumAccesses = 1; 7150 if (isAccessInterleaved(&I)) { 7151 auto Group = getInterleavedAccessGroup(&I); 7152 assert(Group && "Fail to get an interleaved access group."); 7153 7154 // Make one decision for the whole group. 7155 if (getWideningDecision(&I, VF) != CM_Unknown) 7156 continue; 7157 7158 NumAccesses = Group->getNumMembers(); 7159 if (interleavedAccessCanBeWidened(&I, VF)) 7160 InterleaveCost = getInterleaveGroupCost(&I, VF); 7161 } 7162 7163 InstructionCost GatherScatterCost = 7164 isLegalGatherOrScatter(&I, VF) 7165 ? getGatherScatterCost(&I, VF) * NumAccesses 7166 : InstructionCost::getInvalid(); 7167 7168 InstructionCost ScalarizationCost = 7169 getMemInstScalarizationCost(&I, VF) * NumAccesses; 7170 7171 // Choose better solution for the current VF, 7172 // write down this decision and use it during vectorization. 
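      // Note that the comparisons below are deliberately asymmetric:
      // interleaving wins ties against gather/scatter, but both must be
      // strictly cheaper than scalarization to be chosen. For example
      // (hypothetical costs), Interleave=8, GatherScatter=8, Scalarize=10
      // picks interleaving, while Interleave=12, GatherScatter=12,
      // Scalarize=10 falls through to scalarization. Invalid costs compare
      // as more expensive than any valid cost, so e.g. an interleave group
      // that cannot be widened will not be picked here.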
7173 InstructionCost Cost; 7174 InstWidening Decision; 7175 if (InterleaveCost <= GatherScatterCost && 7176 InterleaveCost < ScalarizationCost) { 7177 Decision = CM_Interleave; 7178 Cost = InterleaveCost; 7179 } else if (GatherScatterCost < ScalarizationCost) { 7180 Decision = CM_GatherScatter; 7181 Cost = GatherScatterCost; 7182 } else { 7183 Decision = CM_Scalarize; 7184 Cost = ScalarizationCost; 7185 } 7186 // If the instructions belongs to an interleave group, the whole group 7187 // receives the same decision. The whole group receives the cost, but 7188 // the cost will actually be assigned to one instruction. 7189 if (auto Group = getInterleavedAccessGroup(&I)) 7190 setWideningDecision(Group, VF, Decision, Cost); 7191 else 7192 setWideningDecision(&I, VF, Decision, Cost); 7193 } 7194 } 7195 7196 // Make sure that any load of address and any other address computation 7197 // remains scalar unless there is gather/scatter support. This avoids 7198 // inevitable extracts into address registers, and also has the benefit of 7199 // activating LSR more, since that pass can't optimize vectorized 7200 // addresses. 7201 if (TTI.prefersVectorizedAddressing()) 7202 return; 7203 7204 // Start with all scalar pointer uses. 7205 SmallPtrSet<Instruction *, 8> AddrDefs; 7206 for (BasicBlock *BB : TheLoop->blocks()) 7207 for (Instruction &I : *BB) { 7208 Instruction *PtrDef = 7209 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 7210 if (PtrDef && TheLoop->contains(PtrDef) && 7211 getWideningDecision(&I, VF) != CM_GatherScatter) 7212 AddrDefs.insert(PtrDef); 7213 } 7214 7215 // Add all instructions used to generate the addresses. 7216 SmallVector<Instruction *, 4> Worklist; 7217 append_range(Worklist, AddrDefs); 7218 while (!Worklist.empty()) { 7219 Instruction *I = Worklist.pop_back_val(); 7220 for (auto &Op : I->operands()) 7221 if (auto *InstOp = dyn_cast<Instruction>(Op)) 7222 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) && 7223 AddrDefs.insert(InstOp).second) 7224 Worklist.push_back(InstOp); 7225 } 7226 7227 for (auto *I : AddrDefs) { 7228 if (isa<LoadInst>(I)) { 7229 // Setting the desired widening decision should ideally be handled in 7230 // by cost functions, but since this involves the task of finding out 7231 // if the loaded register is involved in an address computation, it is 7232 // instead changed here when we know this is the case. 7233 InstWidening Decision = getWideningDecision(I, VF); 7234 if (Decision == CM_Widen || Decision == CM_Widen_Reverse) 7235 // Scalarize a widened load of address. 7236 setWideningDecision( 7237 I, VF, CM_Scalarize, 7238 (VF.getKnownMinValue() * 7239 getMemoryInstructionCost(I, ElementCount::getFixed(1)))); 7240 else if (auto Group = getInterleavedAccessGroup(I)) { 7241 // Scalarize an interleave group of address loads. 7242 for (unsigned I = 0; I < Group->getFactor(); ++I) { 7243 if (Instruction *Member = Group->getMember(I)) 7244 setWideningDecision( 7245 Member, VF, CM_Scalarize, 7246 (VF.getKnownMinValue() * 7247 getMemoryInstructionCost(Member, ElementCount::getFixed(1)))); 7248 } 7249 } 7250 } else 7251 // Make sure I gets scalarized and a cost estimate without 7252 // scalarization overhead. 
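      // (Instructions recorded in ForcedScalars are later costed in
      // getInstructionCost as VF copies of their scalar cost, with no
      // insert/extract overhead added.)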
7253 ForcedScalars[VF].insert(I); 7254 } 7255 } 7256 7257 InstructionCost 7258 LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, 7259 Type *&VectorTy) { 7260 Type *RetTy = I->getType(); 7261 if (canTruncateToMinimalBitwidth(I, VF)) 7262 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); 7263 auto SE = PSE.getSE(); 7264 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 7265 7266 auto hasSingleCopyAfterVectorization = [this](Instruction *I, 7267 ElementCount VF) -> bool { 7268 if (VF.isScalar()) 7269 return true; 7270 7271 auto Scalarized = InstsToScalarize.find(VF); 7272 assert(Scalarized != InstsToScalarize.end() && 7273 "VF not yet analyzed for scalarization profitability"); 7274 return !Scalarized->second.count(I) && 7275 llvm::all_of(I->users(), [&](User *U) { 7276 auto *UI = cast<Instruction>(U); 7277 return !Scalarized->second.count(UI); 7278 }); 7279 }; 7280 (void) hasSingleCopyAfterVectorization; 7281 7282 if (isScalarAfterVectorization(I, VF)) { 7283 // With the exception of GEPs and PHIs, after scalarization there should 7284 // only be one copy of the instruction generated in the loop. This is 7285 // because the VF is either 1, or any instructions that need scalarizing 7286 // have already been dealt with by the time we get here. As a result, 7287 // we don't have to multiply the instruction cost by VF. 7288 assert(I->getOpcode() == Instruction::GetElementPtr || 7289 I->getOpcode() == Instruction::PHI || 7290 (I->getOpcode() == Instruction::BitCast && 7291 I->getType()->isPointerTy()) || 7292 hasSingleCopyAfterVectorization(I, VF)); 7293 VectorTy = RetTy; 7294 } else 7295 VectorTy = ToVectorTy(RetTy, VF); 7296 7297 // TODO: We need to estimate the cost of intrinsic calls. 7298 switch (I->getOpcode()) { 7299 case Instruction::GetElementPtr: 7300 // We mark this instruction as zero-cost because the cost of GEPs in 7301 // vectorized code depends on whether the corresponding memory instruction 7302 // is scalarized or not. Therefore, we handle GEPs with the memory 7303 // instruction cost. 7304 return 0; 7305 case Instruction::Br: { 7306 // In cases of scalarized and predicated instructions, there will be VF 7307 // predicated blocks in the vectorized loop. Each branch around these 7308 // blocks also requires an extract of its vector compare i1 element. 7309 bool ScalarPredicatedBB = false; 7310 BranchInst *BI = cast<BranchInst>(I); 7311 if (VF.isVector() && BI->isConditional() && 7312 (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) || 7313 PredicatedBBsAfterVectorization.count(BI->getSuccessor(1)))) 7314 ScalarPredicatedBB = true; 7315 7316 if (ScalarPredicatedBB) { 7317 // Not possible to scalarize a scalable vector with predicated instructions. 7318 if (VF.isScalable()) 7319 return InstructionCost::getInvalid(); 7320 // Return cost for branches around scalarized and predicated blocks. 7321 auto *Vec_i1Ty = 7322 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); 7323 return ( 7324 TTI.getScalarizationOverhead( 7325 Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()), false, true) + 7326 (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue())); 7327 } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar()) 7328 // The back-edge branch will remain, as will all scalar branches. 7329 return TTI.getCFInstrCost(Instruction::Br, CostKind); 7330 else 7331 // This branch will be eliminated by if-conversion.
7332 return 0; 7333 // Note: We currently assume zero cost for an unconditional branch inside 7334 // a predicated block since it will become a fall-through, although we 7335 // may decide in the future to call TTI for all branches. 7336 } 7337 case Instruction::PHI: { 7338 auto *Phi = cast<PHINode>(I); 7339 7340 // First-order recurrences are replaced by vector shuffles inside the loop. 7341 // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. 7342 if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi)) 7343 return TTI.getShuffleCost( 7344 TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy), 7345 None, VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1)); 7346 7347 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 7348 // converted into select instructions. We require N - 1 selects per phi 7349 // node, where N is the number of incoming values. 7350 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) 7351 return (Phi->getNumIncomingValues() - 1) * 7352 TTI.getCmpSelInstrCost( 7353 Instruction::Select, ToVectorTy(Phi->getType(), VF), 7354 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF), 7355 CmpInst::BAD_ICMP_PREDICATE, CostKind); 7356 7357 return TTI.getCFInstrCost(Instruction::PHI, CostKind); 7358 } 7359 case Instruction::UDiv: 7360 case Instruction::SDiv: 7361 case Instruction::URem: 7362 case Instruction::SRem: 7363 // If we have a predicated instruction, it may not be executed for each 7364 // vector lane. Get the scalarization cost and scale this amount by the 7365 // probability of executing the predicated block. If the instruction is not 7366 // predicated, we fall through to the next case. 7367 if (VF.isVector() && isScalarWithPredication(I, VF)) { 7368 InstructionCost Cost = 0; 7369 7370 // These instructions have a non-void type, so account for the phi nodes 7371 // that we will create. This cost is likely to be zero. The phi node 7372 // cost, if any, should be scaled by the block probability because it 7373 // models a copy at the end of each predicated block. 7374 Cost += VF.getKnownMinValue() * 7375 TTI.getCFInstrCost(Instruction::PHI, CostKind); 7376 7377 // The cost of the non-predicated instruction. 7378 Cost += VF.getKnownMinValue() * 7379 TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind); 7380 7381 // The cost of insertelement and extractelement instructions needed for 7382 // scalarization. 7383 Cost += getScalarizationOverhead(I, VF); 7384 7385 // Scale the cost by the probability of executing the predicated blocks. 7386 // This assumes the predicated block for each vector lane is equally 7387 // likely. 7388 return Cost / getReciprocalPredBlockProb(); 7389 } 7390 LLVM_FALLTHROUGH; 7391 case Instruction::Add: 7392 case Instruction::FAdd: 7393 case Instruction::Sub: 7394 case Instruction::FSub: 7395 case Instruction::Mul: 7396 case Instruction::FMul: 7397 case Instruction::FDiv: 7398 case Instruction::FRem: 7399 case Instruction::Shl: 7400 case Instruction::LShr: 7401 case Instruction::AShr: 7402 case Instruction::And: 7403 case Instruction::Or: 7404 case Instruction::Xor: { 7405 // Since we will replace the stride by 1 the multiplication should go away. 
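    // For example (sketch), the "i * Stride" multiply feeding an access such
    // as A[i * Stride] becomes "i * 1" once the symbolic stride has been
    // versioned to 1, so it is given a cost of zero here.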
7406 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) 7407 return 0; 7408 7409 // Detect reduction patterns 7410 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7411 return *RedCost; 7412 7413 // Certain instructions can be cheaper to vectorize if they have a constant 7414 // second vector operand. One example of this are shifts on x86. 7415 Value *Op2 = I->getOperand(1); 7416 TargetTransformInfo::OperandValueProperties Op2VP; 7417 TargetTransformInfo::OperandValueKind Op2VK = 7418 TTI.getOperandInfo(Op2, Op2VP); 7419 if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) 7420 Op2VK = TargetTransformInfo::OK_UniformValue; 7421 7422 SmallVector<const Value *, 4> Operands(I->operand_values()); 7423 return TTI.getArithmeticInstrCost( 7424 I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, 7425 Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); 7426 } 7427 case Instruction::FNeg: { 7428 return TTI.getArithmeticInstrCost( 7429 I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, 7430 TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None, 7431 TargetTransformInfo::OP_None, I->getOperand(0), I); 7432 } 7433 case Instruction::Select: { 7434 SelectInst *SI = cast<SelectInst>(I); 7435 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 7436 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 7437 7438 const Value *Op0, *Op1; 7439 using namespace llvm::PatternMatch; 7440 if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) || 7441 match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) { 7442 // select x, y, false --> x & y 7443 // select x, true, y --> x | y 7444 TTI::OperandValueProperties Op1VP = TTI::OP_None; 7445 TTI::OperandValueProperties Op2VP = TTI::OP_None; 7446 TTI::OperandValueKind Op1VK = TTI::getOperandInfo(Op0, Op1VP); 7447 TTI::OperandValueKind Op2VK = TTI::getOperandInfo(Op1, Op2VP); 7448 assert(Op0->getType()->getScalarSizeInBits() == 1 && 7449 Op1->getType()->getScalarSizeInBits() == 1); 7450 7451 SmallVector<const Value *, 2> Operands{Op0, Op1}; 7452 return TTI.getArithmeticInstrCost( 7453 match(I, m_LogicalOr()) ? 
Instruction::Or : Instruction::And, VectorTy, 7454 CostKind, Op1VK, Op2VK, Op1VP, Op2VP, Operands, I); 7455 } 7456 7457 Type *CondTy = SI->getCondition()->getType(); 7458 if (!ScalarCond) 7459 CondTy = VectorType::get(CondTy, VF); 7460 7461 CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE; 7462 if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition())) 7463 Pred = Cmp->getPredicate(); 7464 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred, 7465 CostKind, I); 7466 } 7467 case Instruction::ICmp: 7468 case Instruction::FCmp: { 7469 Type *ValTy = I->getOperand(0)->getType(); 7470 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 7471 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 7472 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); 7473 VectorTy = ToVectorTy(ValTy, VF); 7474 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, 7475 cast<CmpInst>(I)->getPredicate(), CostKind, 7476 I); 7477 } 7478 case Instruction::Store: 7479 case Instruction::Load: { 7480 ElementCount Width = VF; 7481 if (Width.isVector()) { 7482 InstWidening Decision = getWideningDecision(I, Width); 7483 assert(Decision != CM_Unknown && 7484 "CM decision should be taken at this point"); 7485 if (Decision == CM_Scalarize) 7486 Width = ElementCount::getFixed(1); 7487 } 7488 VectorTy = ToVectorTy(getLoadStoreType(I), Width); 7489 return getMemoryInstructionCost(I, VF); 7490 } 7491 case Instruction::BitCast: 7492 if (I->getType()->isPointerTy()) 7493 return 0; 7494 LLVM_FALLTHROUGH; 7495 case Instruction::ZExt: 7496 case Instruction::SExt: 7497 case Instruction::FPToUI: 7498 case Instruction::FPToSI: 7499 case Instruction::FPExt: 7500 case Instruction::PtrToInt: 7501 case Instruction::IntToPtr: 7502 case Instruction::SIToFP: 7503 case Instruction::UIToFP: 7504 case Instruction::Trunc: 7505 case Instruction::FPTrunc: { 7506 // Computes the CastContextHint from a Load/Store instruction. 7507 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint { 7508 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 7509 "Expected a load or a store!"); 7510 7511 if (VF.isScalar() || !TheLoop->contains(I)) 7512 return TTI::CastContextHint::Normal; 7513 7514 switch (getWideningDecision(I, VF)) { 7515 case LoopVectorizationCostModel::CM_GatherScatter: 7516 return TTI::CastContextHint::GatherScatter; 7517 case LoopVectorizationCostModel::CM_Interleave: 7518 return TTI::CastContextHint::Interleave; 7519 case LoopVectorizationCostModel::CM_Scalarize: 7520 case LoopVectorizationCostModel::CM_Widen: 7521 return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked 7522 : TTI::CastContextHint::Normal; 7523 case LoopVectorizationCostModel::CM_Widen_Reverse: 7524 return TTI::CastContextHint::Reversed; 7525 case LoopVectorizationCostModel::CM_Unknown: 7526 llvm_unreachable("Instr did not go through cost modelling?"); 7527 } 7528 7529 llvm_unreachable("Unhandled case!"); 7530 }; 7531 7532 unsigned Opcode = I->getOpcode(); 7533 TTI::CastContextHint CCH = TTI::CastContextHint::None; 7534 // For Trunc, the context is the only user, which must be a StoreInst. 7535 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) { 7536 if (I->hasOneUse()) 7537 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin())) 7538 CCH = ComputeCCH(Store); 7539 } 7540 // For Z/Sext, the context is the operand, which must be a LoadInst. 
7541 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || 7542 Opcode == Instruction::FPExt) { 7543 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0))) 7544 CCH = ComputeCCH(Load); 7545 } 7546 7547 // We optimize the truncation of induction variables having constant 7548 // integer steps. The cost of these truncations is the same as the scalar 7549 // operation. 7550 if (isOptimizableIVTruncate(I, VF)) { 7551 auto *Trunc = cast<TruncInst>(I); 7552 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 7553 Trunc->getSrcTy(), CCH, CostKind, Trunc); 7554 } 7555 7556 // Detect reduction patterns 7557 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7558 return *RedCost; 7559 7560 Type *SrcScalarTy = I->getOperand(0)->getType(); 7561 Type *SrcVecTy = 7562 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; 7563 if (canTruncateToMinimalBitwidth(I, VF)) { 7564 // This cast is going to be shrunk. This may remove the cast or it might 7565 // turn it into slightly different cast. For example, if MinBW == 16, 7566 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". 7567 // 7568 // Calculate the modified src and dest types. 7569 Type *MinVecTy = VectorTy; 7570 if (Opcode == Instruction::Trunc) { 7571 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); 7572 VectorTy = 7573 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7574 } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) { 7575 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); 7576 VectorTy = 7577 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7578 } 7579 } 7580 7581 return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); 7582 } 7583 case Instruction::Call: { 7584 if (RecurrenceDescriptor::isFMulAddIntrinsic(I)) 7585 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7586 return *RedCost; 7587 bool NeedToScalarize; 7588 CallInst *CI = cast<CallInst>(I); 7589 InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize); 7590 if (getVectorIntrinsicIDForCall(CI, TLI)) { 7591 InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF); 7592 return std::min(CallCost, IntrinsicCost); 7593 } 7594 return CallCost; 7595 } 7596 case Instruction::ExtractValue: 7597 return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput); 7598 case Instruction::Alloca: 7599 // We cannot easily widen alloca to a scalable alloca, as 7600 // the result would need to be a vector of pointers. 7601 if (VF.isScalable()) 7602 return InstructionCost::getInvalid(); 7603 LLVM_FALLTHROUGH; 7604 default: 7605 // This opcode is unknown. Assume that it is the same as 'mul'. 7606 return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 7607 } // end of switch. 
7608 } 7609 7610 char LoopVectorize::ID = 0; 7611 7612 static const char lv_name[] = "Loop Vectorization"; 7613 7614 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 7615 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 7616 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) 7617 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 7618 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) 7619 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 7620 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 7621 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 7622 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 7623 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 7624 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) 7625 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) 7626 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 7627 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 7628 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy) 7629 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 7630 7631 namespace llvm { 7632 7633 Pass *createLoopVectorizePass() { return new LoopVectorize(); } 7634 7635 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced, 7636 bool VectorizeOnlyWhenForced) { 7637 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced); 7638 } 7639 7640 } // end namespace llvm 7641 7642 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { 7643 // Check if the pointer operand of a load or store instruction is 7644 // consecutive. 7645 if (auto *Ptr = getLoadStorePointerOperand(Inst)) 7646 return Legal->isConsecutivePtr(getLoadStoreType(Inst), Ptr); 7647 return false; 7648 } 7649 7650 void LoopVectorizationCostModel::collectValuesToIgnore() { 7651 // Ignore ephemeral values. 7652 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); 7653 7654 // Ignore type-promoting instructions we identified during reduction 7655 // detection. 7656 for (auto &Reduction : Legal->getReductionVars()) { 7657 const RecurrenceDescriptor &RedDes = Reduction.second; 7658 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); 7659 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7660 } 7661 // Ignore type-casting instructions we identified during induction 7662 // detection. 7663 for (auto &Induction : Legal->getInductionVars()) { 7664 const InductionDescriptor &IndDes = Induction.second; 7665 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 7666 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7667 } 7668 } 7669 7670 void LoopVectorizationCostModel::collectInLoopReductions() { 7671 for (auto &Reduction : Legal->getReductionVars()) { 7672 PHINode *Phi = Reduction.first; 7673 const RecurrenceDescriptor &RdxDesc = Reduction.second; 7674 7675 // We don't collect reductions that are type promoted (yet). 7676 if (RdxDesc.getRecurrenceType() != Phi->getType()) 7677 continue; 7678 7679 // If the target would prefer this reduction to happen "in-loop", then we 7680 // want to record it as such. 7681 unsigned Opcode = RdxDesc.getOpcode(); 7682 if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) && 7683 !TTI.preferInLoopReduction(Opcode, Phi->getType(), 7684 TargetTransformInfo::ReductionFlags())) 7685 continue; 7686 7687 // Check that we can correctly put the reductions into the loop, by 7688 // finding the chain of operations that leads from the phi to the loop 7689 // exit value. 
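    // For a simple integer add reduction, for instance (names illustrative):
    //   %sum      = phi i32 [ 0, %preheader ], [ %sum.next, %loop ]
    //   %sum.next = add i32 %sum, %val
    // the chain is just { %sum.next }. If no usable chain is found, the
    // returned vector is empty and the reduction stays out-of-loop.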
7690 SmallVector<Instruction *, 4> ReductionOperations = 7691 RdxDesc.getReductionOpChain(Phi, TheLoop); 7692 bool InLoop = !ReductionOperations.empty(); 7693 if (InLoop) { 7694 InLoopReductionChains[Phi] = ReductionOperations; 7695 // Add the elements to InLoopReductionImmediateChains for cost modelling. 7696 Instruction *LastChain = Phi; 7697 for (auto *I : ReductionOperations) { 7698 InLoopReductionImmediateChains[I] = LastChain; 7699 LastChain = I; 7700 } 7701 } 7702 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop") 7703 << " reduction for phi: " << *Phi << "\n"); 7704 } 7705 } 7706 7707 // TODO: we could return a pair of values that specify the max VF and 7708 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of 7709 // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment 7710 // doesn't have a cost model that can choose which plan to execute if 7711 // more than one is generated. 7712 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits, 7713 LoopVectorizationCostModel &CM) { 7714 unsigned WidestType; 7715 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes(); 7716 return WidestVectorRegBits / WidestType; 7717 } 7718 7719 VectorizationFactor 7720 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { 7721 assert(!UserVF.isScalable() && "scalable vectors not yet supported"); 7722 ElementCount VF = UserVF; 7723 // Outer loop handling: They may require CFG and instruction level 7724 // transformations before even evaluating whether vectorization is profitable. 7725 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 7726 // the vectorization pipeline. 7727 if (!OrigLoop->isInnermost()) { 7728 // If the user doesn't provide a vectorization factor, determine a 7729 // reasonable one. 7730 if (UserVF.isZero()) { 7731 VF = ElementCount::getFixed(determineVPlanVF( 7732 TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector) 7733 .getFixedSize(), 7734 CM)); 7735 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n"); 7736 7737 // Make sure we have a VF > 1 for stress testing. 7738 if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) { 7739 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: " 7740 << "overriding computed VF.\n"); 7741 VF = ElementCount::getFixed(4); 7742 } 7743 } 7744 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 7745 assert(isPowerOf2_32(VF.getKnownMinValue()) && 7746 "VF needs to be a power of two"); 7747 LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "") 7748 << "VF " << VF << " to build VPlans.\n"); 7749 buildVPlans(VF, VF); 7750 7751 // For VPlan build stress testing, we bail out after VPlan construction. 7752 if (VPlanBuildStressTest) 7753 return VectorizationFactor::Disabled(); 7754 7755 return {VF, 0 /*Cost*/}; 7756 } 7757 7758 LLVM_DEBUG( 7759 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the " 7760 "VPlan-native path.\n"); 7761 return VectorizationFactor::Disabled(); 7762 } 7763 7764 Optional<VectorizationFactor> 7765 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { 7766 assert(OrigLoop->isInnermost() && "Inner loop expected."); 7767 FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC); 7768 if (!MaxFactors) // Cases that should not to be vectorized nor interleaved. 7769 return None; 7770 7771 // Invalidate interleave groups if all blocks of loop will be predicated. 
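  // (If even the loop header needs predication, e.g. because the tail is
  // folded by masking, then every block of the loop executes under a mask,
  // and interleave groups are only usable when the target supports masked
  // interleaved accesses.)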
7772 if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) && 7773 !useMaskedInterleavedAccesses(*TTI)) { 7774 LLVM_DEBUG( 7775 dbgs() 7776 << "LV: Invalidate all interleaved groups due to fold-tail by masking " 7777 "which requires masked-interleaved support.\n"); 7778 if (CM.InterleaveInfo.invalidateGroups()) 7779 // Invalidating interleave groups also requires invalidating all decisions 7780 // based on them, which includes widening decisions and uniform and scalar 7781 // values. 7782 CM.invalidateCostModelingDecisions(); 7783 } 7784 7785 ElementCount MaxUserVF = 7786 UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF; 7787 bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF); 7788 if (!UserVF.isZero() && UserVFIsLegal) { 7789 assert(isPowerOf2_32(UserVF.getKnownMinValue()) && 7790 "VF needs to be a power of two"); 7791 // Collect the instructions (and their associated costs) that will be more 7792 // profitable to scalarize. 7793 if (CM.selectUserVectorizationFactor(UserVF)) { 7794 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); 7795 CM.collectInLoopReductions(); 7796 buildVPlansWithVPRecipes(UserVF, UserVF); 7797 LLVM_DEBUG(printPlans(dbgs())); 7798 return {{UserVF, 0}}; 7799 } else 7800 reportVectorizationInfo("UserVF ignored because of invalid costs.", 7801 "InvalidCost", ORE, OrigLoop); 7802 } 7803 7804 // Populate the set of Vectorization Factor Candidates. 7805 ElementCountSet VFCandidates; 7806 for (auto VF = ElementCount::getFixed(1); 7807 ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2) 7808 VFCandidates.insert(VF); 7809 for (auto VF = ElementCount::getScalable(1); 7810 ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2) 7811 VFCandidates.insert(VF); 7812 7813 for (const auto &VF : VFCandidates) { 7814 // Collect Uniform and Scalar instructions after vectorization with VF. 7815 CM.collectUniformsAndScalars(VF); 7816 7817 // Collect the instructions (and their associated costs) that will be more 7818 // profitable to scalarize. 7819 if (VF.isVector()) 7820 CM.collectInstsToScalarize(VF); 7821 } 7822 7823 CM.collectInLoopReductions(); 7824 buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF); 7825 buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF); 7826 7827 LLVM_DEBUG(printPlans(dbgs())); 7828 if (!MaxFactors.hasVector()) 7829 return VectorizationFactor::Disabled(); 7830 7831 // Select the optimal vectorization factor. 7832 auto SelectedVF = CM.selectVectorizationFactor(VFCandidates); 7833 7834 // Check if it is profitable to vectorize with runtime checks. 
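  // Two limits apply to the number of runtime pointer checks: exceeding
  // VectorizerParams::RuntimeMemoryCheckThreshold disables vectorization
  // unless the loop hints allow reordering (e.g. vectorization was
  // explicitly forced), while exceeding the (by default larger)
  // PragmaVectorizeMemoryCheckThreshold disables it unconditionally.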
7835 unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks(); 7836 if (SelectedVF.Width.getKnownMinValue() > 1 && NumRuntimePointerChecks) { 7837 bool PragmaThresholdReached = 7838 NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold; 7839 bool ThresholdReached = 7840 NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold; 7841 if ((ThresholdReached && !Hints.allowReordering()) || 7842 PragmaThresholdReached) { 7843 ORE->emit([&]() { 7844 return OptimizationRemarkAnalysisAliasing( 7845 DEBUG_TYPE, "CantReorderMemOps", OrigLoop->getStartLoc(), 7846 OrigLoop->getHeader()) 7847 << "loop not vectorized: cannot prove it is safe to reorder " 7848 "memory operations"; 7849 }); 7850 LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n"); 7851 Hints.emitRemarkWithHints(); 7852 return VectorizationFactor::Disabled(); 7853 } 7854 } 7855 return SelectedVF; 7856 } 7857 7858 VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const { 7859 assert(count_if(VPlans, 7860 [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) == 7861 1 && 7862 "Best VF has not a single VPlan."); 7863 7864 for (const VPlanPtr &Plan : VPlans) { 7865 if (Plan->hasVF(VF)) 7866 return *Plan.get(); 7867 } 7868 llvm_unreachable("No plan found!"); 7869 } 7870 7871 static void AddRuntimeUnrollDisableMetaData(Loop *L) { 7872 SmallVector<Metadata *, 4> MDs; 7873 // Reserve first location for self reference to the LoopID metadata node. 7874 MDs.push_back(nullptr); 7875 bool IsUnrollMetadata = false; 7876 MDNode *LoopID = L->getLoopID(); 7877 if (LoopID) { 7878 // First find existing loop unrolling disable metadata. 7879 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { 7880 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); 7881 if (MD) { 7882 const auto *S = dyn_cast<MDString>(MD->getOperand(0)); 7883 IsUnrollMetadata = 7884 S && S->getString().startswith("llvm.loop.unroll.disable"); 7885 } 7886 MDs.push_back(LoopID->getOperand(i)); 7887 } 7888 } 7889 7890 if (!IsUnrollMetadata) { 7891 // Add runtime unroll disable metadata. 7892 LLVMContext &Context = L->getHeader()->getContext(); 7893 SmallVector<Metadata *, 1> DisableOperands; 7894 DisableOperands.push_back( 7895 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 7896 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 7897 MDs.push_back(DisableNode); 7898 MDNode *NewLoopID = MDNode::get(Context, MDs); 7899 // Set operand 0 to refer to the loop id itself. 7900 NewLoopID->replaceOperandWith(0, NewLoopID); 7901 L->setLoopID(NewLoopID); 7902 } 7903 } 7904 7905 void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF, 7906 VPlan &BestVPlan, 7907 InnerLoopVectorizer &ILV, 7908 DominatorTree *DT) { 7909 LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF << ", UF=" << BestUF 7910 << '\n'); 7911 7912 // Perform the actual loop transformation. 7913 7914 // 1. Create a new empty loop. Unlink the old loop and connect the new one. 7915 VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan}; 7916 Value *CanonicalIVStartValue; 7917 std::tie(State.CFG.PrevBB, CanonicalIVStartValue) = 7918 ILV.createVectorizedLoopSkeleton(); 7919 ILV.collectPoisonGeneratingRecipes(State); 7920 7921 ILV.printDebugTracesAtStart(); 7922 7923 //===------------------------------------------------===// 7924 // 7925 // Notice: any optimization or new instruction that go 7926 // into the code below should also be implemented in 7927 // the cost-model. 
7928 // 7929 //===------------------------------------------------===// 7930 7931 // 2. Copy and widen instructions from the old loop into the new loop. 7932 BestVPlan.prepareToExecute(ILV.getOrCreateTripCount(nullptr), 7933 ILV.getOrCreateVectorTripCount(nullptr), 7934 CanonicalIVStartValue, State); 7935 BestVPlan.execute(&State); 7936 7937 // Keep all loop hints from the original loop on the vector loop (we'll 7938 // replace the vectorizer-specific hints below). 7939 MDNode *OrigLoopID = OrigLoop->getLoopID(); 7940 7941 Optional<MDNode *> VectorizedLoopID = 7942 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 7943 LLVMLoopVectorizeFollowupVectorized}); 7944 7945 Loop *L = LI->getLoopFor(State.CFG.PrevBB); 7946 if (VectorizedLoopID.hasValue()) 7947 L->setLoopID(VectorizedLoopID.getValue()); 7948 else { 7949 // Keep all loop hints from the original loop on the vector loop (we'll 7950 // replace the vectorizer-specific hints below). 7951 if (MDNode *LID = OrigLoop->getLoopID()) 7952 L->setLoopID(LID); 7953 7954 LoopVectorizeHints Hints(L, true, *ORE); 7955 Hints.setAlreadyVectorized(); 7956 } 7957 // Disable runtime unrolling when vectorizing the epilogue loop. 7958 if (CanonicalIVStartValue) 7959 AddRuntimeUnrollDisableMetaData(L); 7960 7961 // 3. Fix the vectorized code: take care of header phi's, live-outs, 7962 // predication, updating analyses. 7963 ILV.fixVectorizedLoop(State); 7964 7965 ILV.printDebugTracesAtEnd(); 7966 } 7967 7968 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 7969 void LoopVectorizationPlanner::printPlans(raw_ostream &O) { 7970 for (const auto &Plan : VPlans) 7971 if (PrintVPlansInDotFormat) 7972 Plan->printDOT(O); 7973 else 7974 Plan->print(O); 7975 } 7976 #endif 7977 7978 void LoopVectorizationPlanner::collectTriviallyDeadInstructions( 7979 SmallPtrSetImpl<Instruction *> &DeadInstructions) { 7980 7981 // We create new control-flow for the vectorized loop, so the original exit 7982 // conditions will be dead after vectorization if it's only used by the 7983 // terminator 7984 SmallVector<BasicBlock*> ExitingBlocks; 7985 OrigLoop->getExitingBlocks(ExitingBlocks); 7986 for (auto *BB : ExitingBlocks) { 7987 auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0)); 7988 if (!Cmp || !Cmp->hasOneUse()) 7989 continue; 7990 7991 // TODO: we should introduce a getUniqueExitingBlocks on Loop 7992 if (!DeadInstructions.insert(Cmp).second) 7993 continue; 7994 7995 // The operands of the icmp is often a dead trunc, used by IndUpdate. 7996 // TODO: can recurse through operands in general 7997 for (Value *Op : Cmp->operands()) { 7998 if (isa<TruncInst>(Op) && Op->hasOneUse()) 7999 DeadInstructions.insert(cast<Instruction>(Op)); 8000 } 8001 } 8002 8003 // We create new "steps" for induction variable updates to which the original 8004 // induction variables map. An original update instruction will be dead if 8005 // all its users except the induction variable are dead. 8006 auto *Latch = OrigLoop->getLoopLatch(); 8007 for (auto &Induction : Legal->getInductionVars()) { 8008 PHINode *Ind = Induction.first; 8009 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 8010 8011 // If the tail is to be folded by masking, the primary induction variable, 8012 // if exists, isn't dead: it will be used for masking. Don't kill it. 
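    // (When folding the tail by masking, the mask is typically computed by
    // comparing the widened primary induction against the trip count, so the
    // IV update is still needed.)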
8013 if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction()) 8014 continue; 8015 8016 if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 8017 return U == Ind || DeadInstructions.count(cast<Instruction>(U)); 8018 })) 8019 DeadInstructions.insert(IndUpdate); 8020 } 8021 } 8022 8023 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } 8024 8025 //===--------------------------------------------------------------------===// 8026 // EpilogueVectorizerMainLoop 8027 //===--------------------------------------------------------------------===// 8028 8029 /// This function is partially responsible for generating the control flow 8030 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 8031 std::pair<BasicBlock *, Value *> 8032 EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { 8033 MDNode *OrigLoopID = OrigLoop->getLoopID(); 8034 Loop *Lp = createVectorLoopSkeleton(""); 8035 8036 // Generate the code to check the minimum iteration count of the vector 8037 // epilogue (see below). 8038 EPI.EpilogueIterationCountCheck = 8039 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, true); 8040 EPI.EpilogueIterationCountCheck->setName("iter.check"); 8041 8042 // Generate the code to check any assumptions that we've made for SCEV 8043 // expressions. 8044 EPI.SCEVSafetyCheck = emitSCEVChecks(Lp, LoopScalarPreHeader); 8045 8046 // Generate the code that checks at runtime if arrays overlap. We put the 8047 // checks into a separate block to make the more common case of few elements 8048 // faster. 8049 EPI.MemSafetyCheck = emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 8050 8051 // Generate the iteration count check for the main loop, *after* the check 8052 // for the epilogue loop, so that the path-length is shorter for the case 8053 // that goes directly through the vector epilogue. The longer-path length for 8054 // the main loop is compensated for, by the gain from vectorizing the larger 8055 // trip count. Note: the branch will get updated later on when we vectorize 8056 // the epilogue. 8057 EPI.MainLoopIterationCountCheck = 8058 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false); 8059 8060 // Generate the induction variable. 8061 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 8062 EPI.VectorTripCount = CountRoundDown; 8063 createHeaderBranch(Lp); 8064 8065 // Skip induction resume value creation here because they will be created in 8066 // the second pass. If we created them here, they wouldn't be used anyway, 8067 // because the vplan in the second pass still contains the inductions from the 8068 // original loop. 
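  // The resume values are instead created by
  // EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton,
  // which passes the main loop's vector trip count as an additional bypass
  // value (see the call to createInductionResumeValues below).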
8069 8070 return {completeLoopSkeleton(Lp, OrigLoopID), nullptr}; 8071 } 8072 8073 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { 8074 LLVM_DEBUG({ 8075 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n" 8076 << "Main Loop VF:" << EPI.MainLoopVF 8077 << ", Main Loop UF:" << EPI.MainLoopUF 8078 << ", Epilogue Loop VF:" << EPI.EpilogueVF 8079 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 8080 }); 8081 } 8082 8083 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() { 8084 DEBUG_WITH_TYPE(VerboseDebug, { 8085 dbgs() << "intermediate fn:\n" 8086 << *OrigLoop->getHeader()->getParent() << "\n"; 8087 }); 8088 } 8089 8090 BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck( 8091 Loop *L, BasicBlock *Bypass, bool ForEpilogue) { 8092 assert(L && "Expected valid Loop."); 8093 assert(Bypass && "Expected valid bypass basic block."); 8094 ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF; 8095 unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF; 8096 Value *Count = getOrCreateTripCount(L); 8097 // Reuse existing vector loop preheader for TC checks. 8098 // Note that new preheader block is generated for vector loop. 8099 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 8100 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 8101 8102 // Generate code to check if the loop's trip count is less than VF * UF of the 8103 // main vector loop. 8104 auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF : VF) ? 8105 ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 8106 8107 Value *CheckMinIters = Builder.CreateICmp( 8108 P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor), 8109 "min.iters.check"); 8110 8111 if (!ForEpilogue) 8112 TCCheckBlock->setName("vector.main.loop.iter.check"); 8113 8114 // Create new preheader for vector loop. 8115 LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), 8116 DT, LI, nullptr, "vector.ph"); 8117 8118 if (ForEpilogue) { 8119 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 8120 DT->getNode(Bypass)->getIDom()) && 8121 "TC check is expected to dominate Bypass"); 8122 8123 // Update dominator for Bypass & LoopExit. 8124 DT->changeImmediateDominator(Bypass, TCCheckBlock); 8125 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF)) 8126 // For loops with multiple exits, there's no edge from the middle block 8127 // to exit blocks (as the epilogue must run) and thus no need to update 8128 // the immediate dominator of the exit blocks. 8129 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 8130 8131 LoopBypassBlocks.push_back(TCCheckBlock); 8132 8133 // Save the trip count so we don't have to regenerate it in the 8134 // vec.epilog.iter.check. This is safe to do because the trip count 8135 // generated here dominates the vector epilog iter check. 8136 EPI.TripCount = Count; 8137 } 8138 8139 ReplaceInstWithInst( 8140 TCCheckBlock->getTerminator(), 8141 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 8142 8143 return TCCheckBlock; 8144 } 8145 8146 //===--------------------------------------------------------------------===// 8147 // EpilogueVectorizerEpilogueLoop 8148 //===--------------------------------------------------------------------===// 8149 8150 /// This function is partially responsible for generating the control flow 8151 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 
8152 std::pair<BasicBlock *, Value *> 8153 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { 8154 MDNode *OrigLoopID = OrigLoop->getLoopID(); 8155 Loop *Lp = createVectorLoopSkeleton("vec.epilog."); 8156 8157 // Now, compare the remaining count and if there aren't enough iterations to 8158 // execute the vectorized epilogue skip to the scalar part. 8159 BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader; 8160 VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check"); 8161 LoopVectorPreHeader = 8162 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 8163 LI, nullptr, "vec.epilog.ph"); 8164 emitMinimumVectorEpilogueIterCountCheck(Lp, LoopScalarPreHeader, 8165 VecEpilogueIterationCountCheck); 8166 8167 // Adjust the control flow taking the state info from the main loop 8168 // vectorization into account. 8169 assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck && 8170 "expected this to be saved from the previous pass."); 8171 EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith( 8172 VecEpilogueIterationCountCheck, LoopVectorPreHeader); 8173 8174 DT->changeImmediateDominator(LoopVectorPreHeader, 8175 EPI.MainLoopIterationCountCheck); 8176 8177 EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith( 8178 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8179 8180 if (EPI.SCEVSafetyCheck) 8181 EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith( 8182 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8183 if (EPI.MemSafetyCheck) 8184 EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith( 8185 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8186 8187 DT->changeImmediateDominator( 8188 VecEpilogueIterationCountCheck, 8189 VecEpilogueIterationCountCheck->getSinglePredecessor()); 8190 8191 DT->changeImmediateDominator(LoopScalarPreHeader, 8192 EPI.EpilogueIterationCountCheck); 8193 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF)) 8194 // If there is an epilogue which must run, there's no edge from the 8195 // middle block to exit blocks and thus no need to update the immediate 8196 // dominator of the exit blocks. 8197 DT->changeImmediateDominator(LoopExitBlock, 8198 EPI.EpilogueIterationCountCheck); 8199 8200 // Keep track of bypass blocks, as they feed start values to the induction 8201 // phis in the scalar loop preheader. 8202 if (EPI.SCEVSafetyCheck) 8203 LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck); 8204 if (EPI.MemSafetyCheck) 8205 LoopBypassBlocks.push_back(EPI.MemSafetyCheck); 8206 LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck); 8207 8208 // The vec.epilog.iter.check block may contain Phi nodes from reductions which 8209 // merge control-flow from the latch block and the middle block. Update the 8210 // incoming values here and move the Phi into the preheader. 
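  // Roughly (a sketch of the rewiring done below): each such phi is moved
  // into the new vec.epilog.ph block, the incoming block that referred to
  // this block's predecessor is remapped to vec.epilog.iter.check itself,
  // and incoming values from bypass blocks that no longer branch here (the
  // epilogue iteration-count check and the SCEV/memory safety checks) are
  // dropped.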
  SmallVector<PHINode *, 4> PhisInBlock;
  for (PHINode &Phi : VecEpilogueIterationCountCheck->phis())
    PhisInBlock.push_back(&Phi);

  for (PHINode *Phi : PhisInBlock) {
    Phi->replaceIncomingBlockWith(
        VecEpilogueIterationCountCheck->getSinglePredecessor(),
        VecEpilogueIterationCountCheck);
    Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck);
    if (EPI.SCEVSafetyCheck)
      Phi->removeIncomingValue(EPI.SCEVSafetyCheck);
    if (EPI.MemSafetyCheck)
      Phi->removeIncomingValue(EPI.MemSafetyCheck);
    Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHI());
  }

  // Generate a resume induction for the vector epilogue and put it in the
  // vector epilogue preheader.
  Type *IdxTy = Legal->getWidestInductionType();
  PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val",
                                         LoopVectorPreHeader->getFirstNonPHI());
  EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
  EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
                           EPI.MainLoopIterationCountCheck);

  // Generate the induction variable.
  createHeaderBranch(Lp);

  // Generate induction resume values. These variables save the new starting
  // indexes for the scalar loop. They are used to test if there are any tail
  // iterations left once the vector loop has completed.
  // Note that when the vectorized epilogue is skipped due to iteration count
  // check, then the resume value for the induction variable comes from
  // the trip count of the main vector loop, hence passing the AdditionalBypass
  // argument.
  createInductionResumeValues(Lp, {VecEpilogueIterationCountCheck,
                                   EPI.VectorTripCount} /* AdditionalBypass */);

  return {completeLoopSkeleton(Lp, OrigLoopID), EPResumeVal};
}

BasicBlock *
EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
    Loop *L, BasicBlock *Bypass, BasicBlock *Insert) {

  assert(EPI.TripCount &&
         "Expected trip count to have been saved in the first pass.");
  assert(
      (!isa<Instruction>(EPI.TripCount) ||
       DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
      "saved trip count does not dominate insertion point.");
  Value *TC = EPI.TripCount;
  IRBuilder<> Builder(Insert->getTerminator());
  Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");

  // Generate code to check if the loop's trip count is less than VF * UF of the
  // vector epilogue loop.
  auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF) ?
8269 ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 8270 8271 Value *CheckMinIters = 8272 Builder.CreateICmp(P, Count, 8273 createStepForVF(Builder, Count->getType(), 8274 EPI.EpilogueVF, EPI.EpilogueUF), 8275 "min.epilog.iters.check"); 8276 8277 ReplaceInstWithInst( 8278 Insert->getTerminator(), 8279 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 8280 8281 LoopBypassBlocks.push_back(Insert); 8282 return Insert; 8283 } 8284 8285 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() { 8286 LLVM_DEBUG({ 8287 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n" 8288 << "Epilogue Loop VF:" << EPI.EpilogueVF 8289 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 8290 }); 8291 } 8292 8293 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() { 8294 DEBUG_WITH_TYPE(VerboseDebug, { 8295 dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n"; 8296 }); 8297 } 8298 8299 bool LoopVectorizationPlanner::getDecisionAndClampRange( 8300 const std::function<bool(ElementCount)> &Predicate, VFRange &Range) { 8301 assert(!Range.isEmpty() && "Trying to test an empty VF range."); 8302 bool PredicateAtRangeStart = Predicate(Range.Start); 8303 8304 for (ElementCount TmpVF = Range.Start * 2; 8305 ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2) 8306 if (Predicate(TmpVF) != PredicateAtRangeStart) { 8307 Range.End = TmpVF; 8308 break; 8309 } 8310 8311 return PredicateAtRangeStart; 8312 } 8313 8314 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, 8315 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range 8316 /// of VF's starting at a given VF and extending it as much as possible. Each 8317 /// vectorization decision can potentially shorten this sub-range during 8318 /// buildVPlan(). 8319 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF, 8320 ElementCount MaxVF) { 8321 auto MaxVFPlusOne = MaxVF.getWithIncrement(1); 8322 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { 8323 VFRange SubRange = {VF, MaxVFPlusOne}; 8324 VPlans.push_back(buildVPlan(SubRange)); 8325 VF = SubRange.End; 8326 } 8327 } 8328 8329 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, 8330 VPlanPtr &Plan) { 8331 assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); 8332 8333 // Look for cached value. 8334 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 8335 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); 8336 if (ECEntryIt != EdgeMaskCache.end()) 8337 return ECEntryIt->second; 8338 8339 VPValue *SrcMask = createBlockInMask(Src, Plan); 8340 8341 // The terminator has to be a branch inst! 8342 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 8343 assert(BI && "Unexpected terminator found"); 8344 8345 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) 8346 return EdgeMaskCache[Edge] = SrcMask; 8347 8348 // If source is an exiting block, we know the exit edge is dynamically dead 8349 // in the vector loop, and thus we don't need to restrict the mask. Avoid 8350 // adding uses of an otherwise potentially dead instruction. 
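  // Illustrative example (made-up value names): for a conditional branch
  //   Src:  br i1 %c, label %Dst, label %Other
  // the mask computed below for the Src->Dst edge is conceptually
  //   %edge.mask = select i1 %src.mask, i1 %c, i1 false
  // and for the Src->%Other edge the condition is negated first.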
8351 if (OrigLoop->isLoopExiting(Src)) 8352 return EdgeMaskCache[Edge] = SrcMask; 8353 8354 VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition()); 8355 assert(EdgeMask && "No Edge Mask found for condition"); 8356 8357 if (BI->getSuccessor(0) != Dst) 8358 EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc()); 8359 8360 if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND. 8361 // The condition is 'SrcMask && EdgeMask', which is equivalent to 8362 // 'select i1 SrcMask, i1 EdgeMask, i1 false'. 8363 // The select version does not introduce new UB if SrcMask is false and 8364 // EdgeMask is poison. Using 'and' here introduces undefined behavior. 8365 VPValue *False = Plan->getOrAddVPValue( 8366 ConstantInt::getFalse(BI->getCondition()->getType())); 8367 EdgeMask = 8368 Builder.createSelect(SrcMask, EdgeMask, False, BI->getDebugLoc()); 8369 } 8370 8371 return EdgeMaskCache[Edge] = EdgeMask; 8372 } 8373 8374 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { 8375 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 8376 8377 // Look for cached value. 8378 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); 8379 if (BCEntryIt != BlockMaskCache.end()) 8380 return BCEntryIt->second; 8381 8382 // All-one mask is modelled as no-mask following the convention for masked 8383 // load/store/gather/scatter. Initialize BlockMask to no-mask. 8384 VPValue *BlockMask = nullptr; 8385 8386 if (OrigLoop->getHeader() == BB) { 8387 if (!CM.blockNeedsPredicationForAnyReason(BB)) 8388 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. 8389 8390 // Introduce the early-exit compare IV <= BTC to form header block mask. 8391 // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by 8392 // constructing the desired canonical IV in the header block as its first 8393 // non-phi instructions. 8394 assert(CM.foldTailByMasking() && "must fold the tail"); 8395 VPBasicBlock *HeaderVPBB = Plan->getEntry()->getEntryBasicBlock(); 8396 auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi(); 8397 auto *IV = new VPWidenCanonicalIVRecipe(Plan->getCanonicalIV()); 8398 HeaderVPBB->insert(IV, HeaderVPBB->getFirstNonPhi()); 8399 8400 VPBuilder::InsertPointGuard Guard(Builder); 8401 Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint); 8402 if (CM.TTI.emitGetActiveLaneMask()) { 8403 VPValue *TC = Plan->getOrCreateTripCount(); 8404 BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC}); 8405 } else { 8406 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); 8407 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); 8408 } 8409 return BlockMaskCache[BB] = BlockMask; 8410 } 8411 8412 // This is the block mask. We OR all incoming edges. 8413 for (auto *Predecessor : predecessors(BB)) { 8414 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); 8415 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too. 8416 return BlockMaskCache[BB] = EdgeMask; 8417 8418 if (!BlockMask) { // BlockMask has its initialized nullptr value. 
8419 BlockMask = EdgeMask; 8420 continue; 8421 } 8422 8423 BlockMask = Builder.createOr(BlockMask, EdgeMask, {}); 8424 } 8425 8426 return BlockMaskCache[BB] = BlockMask; 8427 } 8428 8429 VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, 8430 ArrayRef<VPValue *> Operands, 8431 VFRange &Range, 8432 VPlanPtr &Plan) { 8433 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 8434 "Must be called with either a load or store"); 8435 8436 auto willWiden = [&](ElementCount VF) -> bool { 8437 if (VF.isScalar()) 8438 return false; 8439 LoopVectorizationCostModel::InstWidening Decision = 8440 CM.getWideningDecision(I, VF); 8441 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 8442 "CM decision should be taken at this point."); 8443 if (Decision == LoopVectorizationCostModel::CM_Interleave) 8444 return true; 8445 if (CM.isScalarAfterVectorization(I, VF) || 8446 CM.isProfitableToScalarize(I, VF)) 8447 return false; 8448 return Decision != LoopVectorizationCostModel::CM_Scalarize; 8449 }; 8450 8451 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8452 return nullptr; 8453 8454 VPValue *Mask = nullptr; 8455 if (Legal->isMaskRequired(I)) 8456 Mask = createBlockInMask(I->getParent(), Plan); 8457 8458 // Determine if the pointer operand of the access is either consecutive or 8459 // reverse consecutive. 8460 LoopVectorizationCostModel::InstWidening Decision = 8461 CM.getWideningDecision(I, Range.Start); 8462 bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse; 8463 bool Consecutive = 8464 Reverse || Decision == LoopVectorizationCostModel::CM_Widen; 8465 8466 if (LoadInst *Load = dyn_cast<LoadInst>(I)) 8467 return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask, 8468 Consecutive, Reverse); 8469 8470 StoreInst *Store = cast<StoreInst>(I); 8471 return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0], 8472 Mask, Consecutive, Reverse); 8473 } 8474 8475 static VPWidenIntOrFpInductionRecipe * 8476 createWidenInductionRecipe(PHINode *Phi, Instruction *PhiOrTrunc, 8477 VPValue *Start, const InductionDescriptor &IndDesc, 8478 LoopVectorizationCostModel &CM, Loop &OrigLoop, 8479 VFRange &Range) { 8480 // Returns true if an instruction \p I should be scalarized instead of 8481 // vectorized for the chosen vectorization factor. 8482 auto ShouldScalarizeInstruction = [&CM](Instruction *I, ElementCount VF) { 8483 return CM.isScalarAfterVectorization(I, VF) || 8484 CM.isProfitableToScalarize(I, VF); 8485 }; 8486 8487 bool NeedsScalarIV = LoopVectorizationPlanner::getDecisionAndClampRange( 8488 [&](ElementCount VF) { 8489 // Returns true if we should generate a scalar version of \p IV. 
8490 if (ShouldScalarizeInstruction(PhiOrTrunc, VF)) 8491 return true; 8492 auto isScalarInst = [&](User *U) -> bool { 8493 auto *I = cast<Instruction>(U); 8494 return OrigLoop.contains(I) && ShouldScalarizeInstruction(I, VF); 8495 }; 8496 return any_of(PhiOrTrunc->users(), isScalarInst); 8497 }, 8498 Range); 8499 bool NeedsScalarIVOnly = LoopVectorizationPlanner::getDecisionAndClampRange( 8500 [&](ElementCount VF) { 8501 return ShouldScalarizeInstruction(PhiOrTrunc, VF); 8502 }, 8503 Range); 8504 assert(IndDesc.getStartValue() == 8505 Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader())); 8506 if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) { 8507 return new VPWidenIntOrFpInductionRecipe(Phi, Start, IndDesc, TruncI, 8508 NeedsScalarIV, !NeedsScalarIVOnly); 8509 } 8510 assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here"); 8511 return new VPWidenIntOrFpInductionRecipe(Phi, Start, IndDesc, NeedsScalarIV, 8512 !NeedsScalarIVOnly); 8513 } 8514 8515 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionPHI( 8516 PHINode *Phi, ArrayRef<VPValue *> Operands, VFRange &Range) const { 8517 8518 // Check if this is an integer or fp induction. If so, build the recipe that 8519 // produces its scalar and vector values. 8520 if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi)) 8521 return createWidenInductionRecipe(Phi, Phi, Operands[0], *II, CM, *OrigLoop, 8522 Range); 8523 8524 return nullptr; 8525 } 8526 8527 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate( 8528 TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range, 8529 VPlan &Plan) const { 8530 // Optimize the special case where the source is a constant integer 8531 // induction variable. Notice that we can only optimize the 'trunc' case 8532 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and 8533 // (c) other casts depend on pointer size. 8534 8535 // Determine whether \p K is a truncation based on an induction variable that 8536 // can be optimized. 8537 auto isOptimizableIVTruncate = 8538 [&](Instruction *K) -> std::function<bool(ElementCount)> { 8539 return [=](ElementCount VF) -> bool { 8540 return CM.isOptimizableIVTruncate(K, VF); 8541 }; 8542 }; 8543 8544 if (LoopVectorizationPlanner::getDecisionAndClampRange( 8545 isOptimizableIVTruncate(I), Range)) { 8546 8547 auto *Phi = cast<PHINode>(I->getOperand(0)); 8548 const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi); 8549 VPValue *Start = Plan.getOrAddVPValue(II.getStartValue()); 8550 return createWidenInductionRecipe(Phi, I, Start, II, CM, *OrigLoop, Range); 8551 } 8552 return nullptr; 8553 } 8554 8555 VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi, 8556 ArrayRef<VPValue *> Operands, 8557 VPlanPtr &Plan) { 8558 // If all incoming values are equal, the incoming VPValue can be used directly 8559 // instead of creating a new VPBlendRecipe. 8560 VPValue *FirstIncoming = Operands[0]; 8561 if (all_of(Operands, [FirstIncoming](const VPValue *Inc) { 8562 return FirstIncoming == Inc; 8563 })) { 8564 return Operands[0]; 8565 } 8566 8567 unsigned NumIncoming = Phi->getNumIncomingValues(); 8568 // For in-loop reductions, we do not need to create an additional select. 
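  // (Illustrative: for a conditionally updated in-loop reduction such as
  // 'if (c) sum += a[i]', the blend simply forwards the updated value; lanes
  // where the predicate is false are handled by the masked reduction recipe
  // created in adjustRecipesForReductions, so no extra select is needed here.)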
8569 VPValue *InLoopVal = nullptr; 8570 for (unsigned In = 0; In < NumIncoming; In++) { 8571 PHINode *PhiOp = 8572 dyn_cast_or_null<PHINode>(Operands[In]->getUnderlyingValue()); 8573 if (PhiOp && CM.isInLoopReduction(PhiOp)) { 8574 assert(!InLoopVal && "Found more than one in-loop reduction!"); 8575 InLoopVal = Operands[In]; 8576 } 8577 } 8578 8579 assert((!InLoopVal || NumIncoming == 2) && 8580 "Found an in-loop reduction for PHI with unexpected number of " 8581 "incoming values"); 8582 if (InLoopVal) 8583 return Operands[Operands[0] == InLoopVal ? 1 : 0]; 8584 8585 // We know that all PHIs in non-header blocks are converted into selects, so 8586 // we don't have to worry about the insertion order and we can just use the 8587 // builder. At this point we generate the predication tree. There may be 8588 // duplications since this is a simple recursive scan, but future 8589 // optimizations will clean it up. 8590 SmallVector<VPValue *, 2> OperandsWithMask; 8591 8592 for (unsigned In = 0; In < NumIncoming; In++) { 8593 VPValue *EdgeMask = 8594 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan); 8595 assert((EdgeMask || NumIncoming == 1) && 8596 "Multiple predecessors with one having a full mask"); 8597 OperandsWithMask.push_back(Operands[In]); 8598 if (EdgeMask) 8599 OperandsWithMask.push_back(EdgeMask); 8600 } 8601 return toVPRecipeResult(new VPBlendRecipe(Phi, OperandsWithMask)); 8602 } 8603 8604 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, 8605 ArrayRef<VPValue *> Operands, 8606 VFRange &Range) const { 8607 8608 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8609 [this, CI](ElementCount VF) { 8610 return CM.isScalarWithPredication(CI, VF); 8611 }, 8612 Range); 8613 8614 if (IsPredicated) 8615 return nullptr; 8616 8617 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8618 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || 8619 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect || 8620 ID == Intrinsic::pseudoprobe || 8621 ID == Intrinsic::experimental_noalias_scope_decl)) 8622 return nullptr; 8623 8624 auto willWiden = [&](ElementCount VF) -> bool { 8625 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8626 // The following case may be scalarized depending on the VF. 8627 // The flag shows whether we use Intrinsic or a usual Call for vectorized 8628 // version of the instruction. 8629 // Is it beneficial to perform intrinsic call compared to lib call? 8630 bool NeedToScalarize = false; 8631 InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); 8632 InstructionCost IntrinsicCost = ID ? CM.getVectorIntrinsicCost(CI, VF) : 0; 8633 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 8634 return UseVectorIntrinsic || !NeedToScalarize; 8635 }; 8636 8637 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8638 return nullptr; 8639 8640 ArrayRef<VPValue *> Ops = Operands.take_front(CI->arg_size()); 8641 return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end())); 8642 } 8643 8644 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { 8645 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) && 8646 !isa<StoreInst>(I) && "Instruction should have been handled earlier"); 8647 // Instruction should be widened, unless it is scalar after vectorization, 8648 // scalarization is profitable or it is predicated. 
8649 auto WillScalarize = [this, I](ElementCount VF) -> bool { 8650 return CM.isScalarAfterVectorization(I, VF) || 8651 CM.isProfitableToScalarize(I, VF) || 8652 CM.isScalarWithPredication(I, VF); 8653 }; 8654 return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize, 8655 Range); 8656 } 8657 8658 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, 8659 ArrayRef<VPValue *> Operands) const { 8660 auto IsVectorizableOpcode = [](unsigned Opcode) { 8661 switch (Opcode) { 8662 case Instruction::Add: 8663 case Instruction::And: 8664 case Instruction::AShr: 8665 case Instruction::BitCast: 8666 case Instruction::FAdd: 8667 case Instruction::FCmp: 8668 case Instruction::FDiv: 8669 case Instruction::FMul: 8670 case Instruction::FNeg: 8671 case Instruction::FPExt: 8672 case Instruction::FPToSI: 8673 case Instruction::FPToUI: 8674 case Instruction::FPTrunc: 8675 case Instruction::FRem: 8676 case Instruction::FSub: 8677 case Instruction::ICmp: 8678 case Instruction::IntToPtr: 8679 case Instruction::LShr: 8680 case Instruction::Mul: 8681 case Instruction::Or: 8682 case Instruction::PtrToInt: 8683 case Instruction::SDiv: 8684 case Instruction::Select: 8685 case Instruction::SExt: 8686 case Instruction::Shl: 8687 case Instruction::SIToFP: 8688 case Instruction::SRem: 8689 case Instruction::Sub: 8690 case Instruction::Trunc: 8691 case Instruction::UDiv: 8692 case Instruction::UIToFP: 8693 case Instruction::URem: 8694 case Instruction::Xor: 8695 case Instruction::ZExt: 8696 return true; 8697 } 8698 return false; 8699 }; 8700 8701 if (!IsVectorizableOpcode(I->getOpcode())) 8702 return nullptr; 8703 8704 // Success: widen this instruction. 8705 return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end())); 8706 } 8707 8708 void VPRecipeBuilder::fixHeaderPhis() { 8709 BasicBlock *OrigLatch = OrigLoop->getLoopLatch(); 8710 for (VPHeaderPHIRecipe *R : PhisToFix) { 8711 auto *PN = cast<PHINode>(R->getUnderlyingValue()); 8712 VPRecipeBase *IncR = 8713 getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch))); 8714 R->addOperand(IncR->getVPSingleValue()); 8715 } 8716 } 8717 8718 VPBasicBlock *VPRecipeBuilder::handleReplication( 8719 Instruction *I, VFRange &Range, VPBasicBlock *VPBB, 8720 VPlanPtr &Plan) { 8721 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( 8722 [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); }, 8723 Range); 8724 8725 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8726 [&](ElementCount VF) { return CM.isPredicatedInst(I, VF, IsUniform); }, 8727 Range); 8728 8729 // Even if the instruction is not marked as uniform, there are certain 8730 // intrinsic calls that can be effectively treated as such, so we check for 8731 // them here. Conservatively, we only do this for scalable vectors, since 8732 // for fixed-width VFs we can always fall back on full scalarization. 8733 if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) { 8734 switch (cast<IntrinsicInst>(I)->getIntrinsicID()) { 8735 case Intrinsic::assume: 8736 case Intrinsic::lifetime_start: 8737 case Intrinsic::lifetime_end: 8738 // For scalable vectors if one of the operands is variant then we still 8739 // want to mark as uniform, which will generate one instruction for just 8740 // the first lane of the vector. We can't scalarize the call in the same 8741 // way as for fixed-width vectors because we don't know how many lanes 8742 // there are. 
      //
      // The reasons for doing it this way for scalable vectors are:
      //  1. For the assume intrinsic generating the instruction for the first
      //     lane is still better than not generating any at all. For
      //     example, the input may be a splat across all lanes.
      //  2. For the lifetime start/end intrinsics the pointer operand only
      //     does anything useful when the input comes from a stack object,
      //     which suggests it should always be uniform. For non-stack objects
      //     the effect is to poison the object, which still allows us to
      //     remove the call.
      IsUniform = true;
      break;
    default:
      break;
    }
  }

  auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
                                       IsUniform, IsPredicated);
  setRecipe(I, Recipe);
  Plan->addVPValue(I, Recipe);

  // Find if I uses a predicated instruction. If so, it will use its scalar
  // value. Avoid hoisting the insert-element which packs the scalar value into
  // a vector value, as that happens iff all users use the vector value.
  for (VPValue *Op : Recipe->operands()) {
    auto *PredR = dyn_cast_or_null<VPPredInstPHIRecipe>(Op->getDef());
    if (!PredR)
      continue;
    auto *RepR =
        cast_or_null<VPReplicateRecipe>(PredR->getOperand(0)->getDef());
    assert(RepR->isPredicated() &&
           "expected Replicate recipe to be predicated");
    RepR->setAlsoPack(false);
  }

  // Finalize the recipe for Instr, first if it is not predicated.
  if (!IsPredicated) {
    LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
    VPBB->appendRecipe(Recipe);
    return VPBB;
  }
  LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");

  VPBlockBase *SingleSucc = VPBB->getSingleSuccessor();
  assert(SingleSucc && "VPBB must have a single successor when handling "
                       "predicated replication.");
  VPBlockUtils::disconnectBlocks(VPBB, SingleSucc);
  // Record predicated instructions for above packing optimizations.
  VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
  VPBlockUtils::insertBlockAfter(Region, VPBB);
  auto *RegSucc = new VPBasicBlock();
  VPBlockUtils::insertBlockAfter(RegSucc, Region);
  VPBlockUtils::connectBlocks(RegSucc, SingleSucc);
  return RegSucc;
}

VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
                                                      VPRecipeBase *PredRecipe,
                                                      VPlanPtr &Plan) {
  // Instructions marked for predication are replicated and placed under an
  // if-then construct to prevent side-effects.

  // Generate recipes to compute the block mask for this region.
  VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);

  // Build the triangular if-then region.
  std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
  assert(Instr->getParent() && "Predicated instruction not in any basic block");
  auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
  auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
  auto *PHIRecipe = Instr->getType()->isVoidTy()
                        ?
nullptr 8816 : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr)); 8817 if (PHIRecipe) { 8818 Plan->removeVPValueFor(Instr); 8819 Plan->addVPValue(Instr, PHIRecipe); 8820 } 8821 auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); 8822 auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); 8823 VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true); 8824 8825 // Note: first set Entry as region entry and then connect successors starting 8826 // from it in order, to propagate the "parent" of each VPBasicBlock. 8827 VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry); 8828 VPBlockUtils::connectBlocks(Pred, Exit); 8829 8830 return Region; 8831 } 8832 8833 VPRecipeOrVPValueTy 8834 VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, 8835 ArrayRef<VPValue *> Operands, 8836 VFRange &Range, VPlanPtr &Plan) { 8837 // First, check for specific widening recipes that deal with calls, memory 8838 // operations, inductions and Phi nodes. 8839 if (auto *CI = dyn_cast<CallInst>(Instr)) 8840 return toVPRecipeResult(tryToWidenCall(CI, Operands, Range)); 8841 8842 if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr)) 8843 return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan)); 8844 8845 VPRecipeBase *Recipe; 8846 if (auto Phi = dyn_cast<PHINode>(Instr)) { 8847 if (Phi->getParent() != OrigLoop->getHeader()) 8848 return tryToBlend(Phi, Operands, Plan); 8849 if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, Range))) 8850 return toVPRecipeResult(Recipe); 8851 8852 VPHeaderPHIRecipe *PhiRecipe = nullptr; 8853 if (Legal->isReductionVariable(Phi) || Legal->isFirstOrderRecurrence(Phi)) { 8854 VPValue *StartV = Operands[0]; 8855 if (Legal->isReductionVariable(Phi)) { 8856 const RecurrenceDescriptor &RdxDesc = 8857 Legal->getReductionVars().find(Phi)->second; 8858 assert(RdxDesc.getRecurrenceStartValue() == 8859 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); 8860 PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV, 8861 CM.isInLoopReduction(Phi), 8862 CM.useOrderedReductions(RdxDesc)); 8863 } else { 8864 PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV); 8865 } 8866 8867 // Record the incoming value from the backedge, so we can add the incoming 8868 // value from the backedge after all recipes have been created. 8869 recordRecipeOf(cast<Instruction>( 8870 Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch()))); 8871 PhisToFix.push_back(PhiRecipe); 8872 } else { 8873 // TODO: record backedge value for remaining pointer induction phis. 
      assert(Phi->getType()->isPointerTy() &&
             "only pointer phis should be handled here");
      assert(Legal->getInductionVars().count(Phi) &&
             "Not an induction variable");
      InductionDescriptor II = Legal->getInductionVars().lookup(Phi);
      VPValue *Start = Plan->getOrAddVPValue(II.getStartValue());
      PhiRecipe = new VPWidenPHIRecipe(Phi, Start);
    }

    return toVPRecipeResult(PhiRecipe);
  }

  if (isa<TruncInst>(Instr) &&
      (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Operands,
                                               Range, *Plan)))
    return toVPRecipeResult(Recipe);

  if (!shouldWiden(Instr, Range))
    return nullptr;

  if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
    return toVPRecipeResult(new VPWidenGEPRecipe(
        GEP, make_range(Operands.begin(), Operands.end()), OrigLoop));

  if (auto *SI = dyn_cast<SelectInst>(Instr)) {
    bool InvariantCond =
        PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
    return toVPRecipeResult(new VPWidenSelectRecipe(
        *SI, make_range(Operands.begin(), Operands.end()), InvariantCond));
  }

  return toVPRecipeResult(tryToWiden(Instr, Operands));
}

void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
                                                        ElementCount MaxVF) {
  assert(OrigLoop->isInnermost() && "Inner loop expected.");

  // Collect instructions from the original loop that will become trivially dead
  // in the vectorized loop. We don't need to vectorize these instructions. For
  // example, original induction update instructions can become dead because we
  // separately emit induction "steps" when generating code for the new loop.
  // Similarly, we create a new latch condition when setting up the structure
  // of the new loop, so the old one can become dead.
  SmallPtrSet<Instruction *, 4> DeadInstructions;
  collectTriviallyDeadInstructions(DeadInstructions);

  // Add assume instructions we need to drop to DeadInstructions, to prevent
  // them from being added to the VPlan.
  // TODO: We only need to drop assumes in blocks that get flattened. If the
  // control flow is preserved, we should keep them.
  auto &ConditionalAssumes = Legal->getConditionalAssumes();
  DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());

  MapVector<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
  // Dead instructions do not need sinking. Remove them from SinkAfter.
  for (Instruction *I : DeadInstructions)
    SinkAfter.erase(I);

  // Cannot sink instructions after dead instructions (there won't be any
  // recipes for them). Instead, find the first non-dead previous instruction.
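  // Illustrative example: if SinkAfter maps %use -> %target and %target is in
  // DeadInstructions (e.g. an induction update we re-emit ourselves), the loop
  // below rewrites the entry to point at the closest live instruction that
  // precedes %target in its block.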
8935 for (auto &P : Legal->getSinkAfter()) { 8936 Instruction *SinkTarget = P.second; 8937 Instruction *FirstInst = &*SinkTarget->getParent()->begin(); 8938 (void)FirstInst; 8939 while (DeadInstructions.contains(SinkTarget)) { 8940 assert( 8941 SinkTarget != FirstInst && 8942 "Must find a live instruction (at least the one feeding the " 8943 "first-order recurrence PHI) before reaching beginning of the block"); 8944 SinkTarget = SinkTarget->getPrevNode(); 8945 assert(SinkTarget != P.first && 8946 "sink source equals target, no sinking required"); 8947 } 8948 P.second = SinkTarget; 8949 } 8950 8951 auto MaxVFPlusOne = MaxVF.getWithIncrement(1); 8952 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { 8953 VFRange SubRange = {VF, MaxVFPlusOne}; 8954 VPlans.push_back( 8955 buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter)); 8956 VF = SubRange.End; 8957 } 8958 } 8959 8960 // Add a VPCanonicalIVPHIRecipe starting at 0 to the header, a 8961 // CanonicalIVIncrement{NUW} VPInstruction to increment it by VF * UF and a 8962 // BranchOnCount VPInstruction to the latch. 8963 static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL, 8964 bool HasNUW, bool IsVPlanNative) { 8965 Value *StartIdx = ConstantInt::get(IdxTy, 0); 8966 auto *StartV = Plan.getOrAddVPValue(StartIdx); 8967 8968 auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL); 8969 VPRegionBlock *TopRegion = Plan.getVectorLoopRegion(); 8970 VPBasicBlock *Header = TopRegion->getEntryBasicBlock(); 8971 if (IsVPlanNative) 8972 Header = cast<VPBasicBlock>(Header->getSingleSuccessor()); 8973 Header->insert(CanonicalIVPHI, Header->begin()); 8974 8975 auto *CanonicalIVIncrement = 8976 new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementNUW 8977 : VPInstruction::CanonicalIVIncrement, 8978 {CanonicalIVPHI}, DL); 8979 CanonicalIVPHI->addOperand(CanonicalIVIncrement); 8980 8981 VPBasicBlock *EB = TopRegion->getExitBasicBlock(); 8982 if (IsVPlanNative) { 8983 EB = cast<VPBasicBlock>(EB->getSinglePredecessor()); 8984 EB->setCondBit(nullptr); 8985 } 8986 EB->appendRecipe(CanonicalIVIncrement); 8987 8988 auto *BranchOnCount = 8989 new VPInstruction(VPInstruction::BranchOnCount, 8990 {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL); 8991 EB->appendRecipe(BranchOnCount); 8992 } 8993 8994 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( 8995 VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions, 8996 const MapVector<Instruction *, Instruction *> &SinkAfter) { 8997 8998 SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups; 8999 9000 VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder); 9001 9002 // --------------------------------------------------------------------------- 9003 // Pre-construction: record ingredients whose recipes we'll need to further 9004 // process after constructing the initial VPlan. 9005 // --------------------------------------------------------------------------- 9006 9007 // Mark instructions we'll need to sink later and their targets as 9008 // ingredients whose recipe we'll need to record. 
  for (auto &Entry : SinkAfter) {
    RecipeBuilder.recordRecipeOf(Entry.first);
    RecipeBuilder.recordRecipeOf(Entry.second);
  }
  for (auto &Reduction : CM.getInLoopReductionChains()) {
    PHINode *Phi = Reduction.first;
    RecurKind Kind =
        Legal->getReductionVars().find(Phi)->second.getRecurrenceKind();
    const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;

    RecipeBuilder.recordRecipeOf(Phi);
    for (auto &R : ReductionOperations) {
      RecipeBuilder.recordRecipeOf(R);
      // For min/max reductions, where we have a pair of icmp/select, we also
      // need to record the ICmp recipe, so it can be removed later.
      assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
             "Only min/max recurrences allowed for inloop reductions");
      if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
        RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0)));
    }
  }

  // For each interleave group which is relevant for this (possibly trimmed)
  // Range, add it to the set of groups to be later applied to the VPlan and add
  // placeholders for its members' Recipes which we'll be replacing with a
  // single VPInterleaveRecipe.
  for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
    auto applyIG = [IG, this](ElementCount VF) -> bool {
      return (VF.isVector() && // Query is illegal for VF == 1
              CM.getWideningDecision(IG->getInsertPos(), VF) ==
                  LoopVectorizationCostModel::CM_Interleave);
    };
    if (!getDecisionAndClampRange(applyIG, Range))
      continue;
    InterleaveGroups.insert(IG);
    for (unsigned i = 0; i < IG->getFactor(); i++)
      if (Instruction *Member = IG->getMember(i))
        RecipeBuilder.recordRecipeOf(Member);
  };

  // ---------------------------------------------------------------------------
  // Build initial VPlan: Scan the body of the loop in a topological order to
  // visit each basic block after having visited its predecessor basic blocks.
  // ---------------------------------------------------------------------------

  // Create initial VPlan skeleton, with separate header and latch blocks.
  VPBasicBlock *HeaderVPBB = new VPBasicBlock();
  VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch");
  VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB);
  auto *TopRegion = new VPRegionBlock(HeaderVPBB, LatchVPBB, "vector loop");
  auto Plan = std::make_unique<VPlan>(TopRegion);

  Instruction *DLInst =
      getDebugLocFromInstOrOperands(Legal->getPrimaryInduction());
  addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(),
                        DLInst ? DLInst->getDebugLoc() : DebugLoc(),
                        !CM.foldTailByMasking(), false);

  // Scan the body of the loop in a topological order to visit each basic block
  // after having visited its predecessor basic blocks.
  LoopBlocksDFS DFS(OrigLoop);
  DFS.perform(LI);

  VPBasicBlock *VPBB = HeaderVPBB;
  SmallVector<VPWidenIntOrFpInductionRecipe *> InductionsToMove;
  for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
    // Relevant instructions from basic block BB will be grouped into VPRecipe
    // ingredients and fill a new VPBasicBlock.
    unsigned VPBBsForBB = 0;
    VPBB->setName(BB->getName());
    Builder.setInsertPoint(VPBB);

    // Introduce each ingredient into VPlan.
    // TODO: Model and preserve debug intrinsics in VPlan.
9083 for (Instruction &I : BB->instructionsWithoutDebug()) { 9084 Instruction *Instr = &I; 9085 9086 // First filter out irrelevant instructions, to ensure no recipes are 9087 // built for them. 9088 if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr)) 9089 continue; 9090 9091 SmallVector<VPValue *, 4> Operands; 9092 auto *Phi = dyn_cast<PHINode>(Instr); 9093 if (Phi && Phi->getParent() == OrigLoop->getHeader()) { 9094 Operands.push_back(Plan->getOrAddVPValue( 9095 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()))); 9096 } else { 9097 auto OpRange = Plan->mapToVPValues(Instr->operands()); 9098 Operands = {OpRange.begin(), OpRange.end()}; 9099 } 9100 if (auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe( 9101 Instr, Operands, Range, Plan)) { 9102 // If Instr can be simplified to an existing VPValue, use it. 9103 if (RecipeOrValue.is<VPValue *>()) { 9104 auto *VPV = RecipeOrValue.get<VPValue *>(); 9105 Plan->addVPValue(Instr, VPV); 9106 // If the re-used value is a recipe, register the recipe for the 9107 // instruction, in case the recipe for Instr needs to be recorded. 9108 if (auto *R = dyn_cast_or_null<VPRecipeBase>(VPV->getDef())) 9109 RecipeBuilder.setRecipe(Instr, R); 9110 continue; 9111 } 9112 // Otherwise, add the new recipe. 9113 VPRecipeBase *Recipe = RecipeOrValue.get<VPRecipeBase *>(); 9114 for (auto *Def : Recipe->definedValues()) { 9115 auto *UV = Def->getUnderlyingValue(); 9116 Plan->addVPValue(UV, Def); 9117 } 9118 9119 if (isa<VPWidenIntOrFpInductionRecipe>(Recipe) && 9120 HeaderVPBB->getFirstNonPhi() != VPBB->end()) { 9121 // Keep track of VPWidenIntOrFpInductionRecipes not in the phi section 9122 // of the header block. That can happen for truncates of induction 9123 // variables. Those recipes are moved to the phi section of the header 9124 // block after applying SinkAfter, which relies on the original 9125 // position of the trunc. 9126 assert(isa<TruncInst>(Instr)); 9127 InductionsToMove.push_back( 9128 cast<VPWidenIntOrFpInductionRecipe>(Recipe)); 9129 } 9130 RecipeBuilder.setRecipe(Instr, Recipe); 9131 VPBB->appendRecipe(Recipe); 9132 continue; 9133 } 9134 9135 // Otherwise, if all widening options failed, Instruction is to be 9136 // replicated. This may create a successor for VPBB. 9137 VPBasicBlock *NextVPBB = 9138 RecipeBuilder.handleReplication(Instr, Range, VPBB, Plan); 9139 if (NextVPBB != VPBB) { 9140 VPBB = NextVPBB; 9141 VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++) 9142 : ""); 9143 } 9144 } 9145 9146 VPBlockUtils::insertBlockAfter(new VPBasicBlock(), VPBB); 9147 VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor()); 9148 } 9149 9150 // Fold the last, empty block into its predecessor. 9151 VPBB = VPBlockUtils::tryToMergeBlockIntoPredecessor(VPBB); 9152 assert(VPBB && "expected to fold last (empty) block"); 9153 // After here, VPBB should not be used. 9154 VPBB = nullptr; 9155 9156 assert(isa<VPRegionBlock>(Plan->getEntry()) && 9157 !Plan->getEntry()->getEntryBasicBlock()->empty() && 9158 "entry block must be set to a VPRegionBlock having a non-empty entry " 9159 "VPBasicBlock"); 9160 RecipeBuilder.fixHeaderPhis(); 9161 9162 // --------------------------------------------------------------------------- 9163 // Transform initial VPlan: Apply previously taken decisions, in order, to 9164 // bring the VPlan to its final state. 9165 // --------------------------------------------------------------------------- 9166 9167 // Apply Sink-After legal constraints. 
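  // Roughly (illustrative), for a first-order recurrence the legality phase
  // recorded that recipes using the recurrence phi must be moved after the
  // recipe producing the recurrence's next value, so that the splice
  // introduced further down sees both values. The loop below performs that
  // motion, including the cases where source or target sit inside a predicated
  // replicate region.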
9168 auto GetReplicateRegion = [](VPRecipeBase *R) -> VPRegionBlock * { 9169 auto *Region = dyn_cast_or_null<VPRegionBlock>(R->getParent()->getParent()); 9170 if (Region && Region->isReplicator()) { 9171 assert(Region->getNumSuccessors() == 1 && 9172 Region->getNumPredecessors() == 1 && "Expected SESE region!"); 9173 assert(R->getParent()->size() == 1 && 9174 "A recipe in an original replicator region must be the only " 9175 "recipe in its block"); 9176 return Region; 9177 } 9178 return nullptr; 9179 }; 9180 for (auto &Entry : SinkAfter) { 9181 VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first); 9182 VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second); 9183 9184 auto *TargetRegion = GetReplicateRegion(Target); 9185 auto *SinkRegion = GetReplicateRegion(Sink); 9186 if (!SinkRegion) { 9187 // If the sink source is not a replicate region, sink the recipe directly. 9188 if (TargetRegion) { 9189 // The target is in a replication region, make sure to move Sink to 9190 // the block after it, not into the replication region itself. 9191 VPBasicBlock *NextBlock = 9192 cast<VPBasicBlock>(TargetRegion->getSuccessors().front()); 9193 Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi()); 9194 } else 9195 Sink->moveAfter(Target); 9196 continue; 9197 } 9198 9199 // The sink source is in a replicate region. Unhook the region from the CFG. 9200 auto *SinkPred = SinkRegion->getSinglePredecessor(); 9201 auto *SinkSucc = SinkRegion->getSingleSuccessor(); 9202 VPBlockUtils::disconnectBlocks(SinkPred, SinkRegion); 9203 VPBlockUtils::disconnectBlocks(SinkRegion, SinkSucc); 9204 VPBlockUtils::connectBlocks(SinkPred, SinkSucc); 9205 9206 if (TargetRegion) { 9207 // The target recipe is also in a replicate region, move the sink region 9208 // after the target region. 9209 auto *TargetSucc = TargetRegion->getSingleSuccessor(); 9210 VPBlockUtils::disconnectBlocks(TargetRegion, TargetSucc); 9211 VPBlockUtils::connectBlocks(TargetRegion, SinkRegion); 9212 VPBlockUtils::connectBlocks(SinkRegion, TargetSucc); 9213 } else { 9214 // The sink source is in a replicate region, we need to move the whole 9215 // replicate region, which should only contain a single recipe in the 9216 // main block. 9217 auto *SplitBlock = 9218 Target->getParent()->splitAt(std::next(Target->getIterator())); 9219 9220 auto *SplitPred = SplitBlock->getSinglePredecessor(); 9221 9222 VPBlockUtils::disconnectBlocks(SplitPred, SplitBlock); 9223 VPBlockUtils::connectBlocks(SplitPred, SinkRegion); 9224 VPBlockUtils::connectBlocks(SinkRegion, SplitBlock); 9225 } 9226 } 9227 9228 VPlanTransforms::removeRedundantCanonicalIVs(*Plan); 9229 VPlanTransforms::removeRedundantInductionCasts(*Plan); 9230 9231 // Now that sink-after is done, move induction recipes for optimized truncates 9232 // to the phi section of the header block. 9233 for (VPWidenIntOrFpInductionRecipe *Ind : InductionsToMove) 9234 Ind->moveBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi()); 9235 9236 // Adjust the recipes for any inloop reductions. 9237 adjustRecipesForReductions(cast<VPBasicBlock>(TopRegion->getExit()), Plan, 9238 RecipeBuilder, Range.Start); 9239 9240 // Introduce a recipe to combine the incoming and previous values of a 9241 // first-order recurrence. 
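  // Conceptually (illustrative operand names), for a recurrence phi %for with
  // backedge value %next this emits
  //   %splice = first-order-recurrence-splice %for, %next
  // i.e. the last lane of the previous iteration's %next followed by the
  // first VF-1 lanes of the current %next.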
9242 for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) { 9243 auto *RecurPhi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R); 9244 if (!RecurPhi) 9245 continue; 9246 9247 VPRecipeBase *PrevRecipe = RecurPhi->getBackedgeRecipe(); 9248 VPBasicBlock *InsertBlock = PrevRecipe->getParent(); 9249 auto *Region = GetReplicateRegion(PrevRecipe); 9250 if (Region) 9251 InsertBlock = cast<VPBasicBlock>(Region->getSingleSuccessor()); 9252 if (Region || PrevRecipe->isPhi()) 9253 Builder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi()); 9254 else 9255 Builder.setInsertPoint(InsertBlock, std::next(PrevRecipe->getIterator())); 9256 9257 auto *RecurSplice = cast<VPInstruction>( 9258 Builder.createNaryOp(VPInstruction::FirstOrderRecurrenceSplice, 9259 {RecurPhi, RecurPhi->getBackedgeValue()})); 9260 9261 RecurPhi->replaceAllUsesWith(RecurSplice); 9262 // Set the first operand of RecurSplice to RecurPhi again, after replacing 9263 // all users. 9264 RecurSplice->setOperand(0, RecurPhi); 9265 } 9266 9267 // Interleave memory: for each Interleave Group we marked earlier as relevant 9268 // for this VPlan, replace the Recipes widening its memory instructions with a 9269 // single VPInterleaveRecipe at its insertion point. 9270 for (auto IG : InterleaveGroups) { 9271 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( 9272 RecipeBuilder.getRecipe(IG->getInsertPos())); 9273 SmallVector<VPValue *, 4> StoredValues; 9274 for (unsigned i = 0; i < IG->getFactor(); ++i) 9275 if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) { 9276 auto *StoreR = 9277 cast<VPWidenMemoryInstructionRecipe>(RecipeBuilder.getRecipe(SI)); 9278 StoredValues.push_back(StoreR->getStoredValue()); 9279 } 9280 9281 auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues, 9282 Recipe->getMask()); 9283 VPIG->insertBefore(Recipe); 9284 unsigned J = 0; 9285 for (unsigned i = 0; i < IG->getFactor(); ++i) 9286 if (Instruction *Member = IG->getMember(i)) { 9287 if (!Member->getType()->isVoidTy()) { 9288 VPValue *OriginalV = Plan->getVPValue(Member); 9289 Plan->removeVPValueFor(Member); 9290 Plan->addVPValue(Member, VPIG->getVPValue(J)); 9291 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J)); 9292 J++; 9293 } 9294 RecipeBuilder.getRecipe(Member)->eraseFromParent(); 9295 } 9296 } 9297 9298 // From this point onwards, VPlan-to-VPlan transformations may change the plan 9299 // in ways that accessing values using original IR values is incorrect. 9300 Plan->disableValue2VPValue(); 9301 9302 VPlanTransforms::optimizeInductions(*Plan, *PSE.getSE()); 9303 VPlanTransforms::sinkScalarOperands(*Plan); 9304 VPlanTransforms::mergeReplicateRegions(*Plan); 9305 VPlanTransforms::removeDeadRecipes(*Plan, *OrigLoop); 9306 9307 std::string PlanName; 9308 raw_string_ostream RSO(PlanName); 9309 ElementCount VF = Range.Start; 9310 Plan->addVF(VF); 9311 RSO << "Initial VPlan for VF={" << VF; 9312 for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) { 9313 Plan->addVF(VF); 9314 RSO << "," << VF; 9315 } 9316 RSO << "},UF>=1"; 9317 RSO.flush(); 9318 Plan->setName(PlanName); 9319 9320 // Fold Exit block into its predecessor if possible. 9321 // TODO: Fold block earlier once all VPlan transforms properly maintain a 9322 // VPBasicBlock as exit. 
  VPBlockUtils::tryToMergeBlockIntoPredecessor(TopRegion->getExit());

  assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid");
  return Plan;
}

VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
  // Outer loop handling: They may require CFG and instruction level
  // transformations before even evaluating whether vectorization is profitable.
  // Since we cannot modify the incoming IR, we need to build VPlan upfront in
  // the vectorization pipeline.
  assert(!OrigLoop->isInnermost());
  assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");

  // Create new empty VPlan
  auto Plan = std::make_unique<VPlan>();

  // Build hierarchical CFG
  VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
  HCFGBuilder.buildHierarchicalCFG();

  for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End);
       VF *= 2)
    Plan->addVF(VF);

  if (EnableVPlanPredication) {
    VPlanPredicator VPP(*Plan);
    VPP.predicate();

    // Avoid running transformation to recipes until masked code generation in
    // VPlan-native path is in place.
    return Plan;
  }

  SmallPtrSet<Instruction *, 1> DeadInstructions;
  VPlanTransforms::VPInstructionsToVPRecipes(
      OrigLoop, Plan,
      [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); },
      DeadInstructions, *PSE.getSE());

  addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DebugLoc(),
                        true, true);
  return Plan;
}

// Adjust the recipes for reductions. For in-loop reductions the chain of
// instructions leading from the loop exit instr to the phi needs to be
// converted to reductions, with one operand being vector and the other being
// the scalar reduction chain. For other reductions, a select is introduced
// between the phi and live-out recipes when folding the tail.
void LoopVectorizationPlanner::adjustRecipesForReductions(
    VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder,
    ElementCount MinVF) {
  for (auto &Reduction : CM.getInLoopReductionChains()) {
    PHINode *Phi = Reduction.first;
    const RecurrenceDescriptor &RdxDesc =
        Legal->getReductionVars().find(Phi)->second;
    const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;

    if (MinVF.isScalar() && !CM.useOrderedReductions(RdxDesc))
      continue;

    // ReductionOperations are ordered top-down from the phi's use to the
    // LoopExitValue. We keep track of the previous item (the Chain) to tell
    // which of the two operands will remain scalar and which will be reduced.
    // For minmax the chain will be the select instructions.
    Instruction *Chain = Phi;
    for (Instruction *R : ReductionOperations) {
      VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
      RecurKind Kind = RdxDesc.getRecurrenceKind();

      VPValue *ChainOp = Plan->getVPValue(Chain);
      unsigned FirstOpId;
      assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
             "Only min/max recurrences allowed for inloop reductions");
      // Recognize a call to the llvm.fmuladd intrinsic.
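      // e.g. (illustrative IR) an in-loop reduction step of the form
      //   %sum.next = call float @llvm.fmuladd.f32(float %a, float %b,
      //                                            float %sum)
      // is split below into an FMul recipe that produces the vector operand
      // and an fadd reduction that consumes it.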
9399 bool IsFMulAdd = (Kind == RecurKind::FMulAdd); 9400 assert((!IsFMulAdd || RecurrenceDescriptor::isFMulAddIntrinsic(R)) && 9401 "Expected instruction to be a call to the llvm.fmuladd intrinsic"); 9402 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9403 assert(isa<VPWidenSelectRecipe>(WidenRecipe) && 9404 "Expected to replace a VPWidenSelectSC"); 9405 FirstOpId = 1; 9406 } else { 9407 assert((MinVF.isScalar() || isa<VPWidenRecipe>(WidenRecipe) || 9408 (IsFMulAdd && isa<VPWidenCallRecipe>(WidenRecipe))) && 9409 "Expected to replace a VPWidenSC"); 9410 FirstOpId = 0; 9411 } 9412 unsigned VecOpId = 9413 R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId; 9414 VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId)); 9415 9416 auto *CondOp = CM.blockNeedsPredicationForAnyReason(R->getParent()) 9417 ? RecipeBuilder.createBlockInMask(R->getParent(), Plan) 9418 : nullptr; 9419 9420 if (IsFMulAdd) { 9421 // If the instruction is a call to the llvm.fmuladd intrinsic then we 9422 // need to create an fmul recipe to use as the vector operand for the 9423 // fadd reduction. 9424 VPInstruction *FMulRecipe = new VPInstruction( 9425 Instruction::FMul, {VecOp, Plan->getVPValue(R->getOperand(1))}); 9426 FMulRecipe->setFastMathFlags(R->getFastMathFlags()); 9427 WidenRecipe->getParent()->insert(FMulRecipe, 9428 WidenRecipe->getIterator()); 9429 VecOp = FMulRecipe; 9430 } 9431 VPReductionRecipe *RedRecipe = 9432 new VPReductionRecipe(&RdxDesc, R, ChainOp, VecOp, CondOp, TTI); 9433 WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe); 9434 Plan->removeVPValueFor(R); 9435 Plan->addVPValue(R, RedRecipe); 9436 WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator()); 9437 WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe); 9438 WidenRecipe->eraseFromParent(); 9439 9440 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9441 VPRecipeBase *CompareRecipe = 9442 RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0))); 9443 assert(isa<VPWidenRecipe>(CompareRecipe) && 9444 "Expected to replace a VPWidenSC"); 9445 assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 && 9446 "Expected no remaining users"); 9447 CompareRecipe->eraseFromParent(); 9448 } 9449 Chain = R; 9450 } 9451 } 9452 9453 // If tail is folded by masking, introduce selects between the phi 9454 // and the live-out instruction of each reduction, at the beginning of the 9455 // dedicated latch block. 
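  // Illustrative example (made-up names): in the latch this adds
  //   %rdx.sel = select <VF x i1> %header.mask, <VF x i32> %rdx.next,
  //                     <VF x i32> %rdx.phi
  // so lanes disabled by the fold-tail mask keep the previous partial
  // reduction value.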
9456 if (CM.foldTailByMasking()) { 9457 Builder.setInsertPoint(LatchVPBB, LatchVPBB->begin()); 9458 for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) { 9459 VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R); 9460 if (!PhiR || PhiR->isInLoop()) 9461 continue; 9462 VPValue *Cond = 9463 RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan); 9464 VPValue *Red = PhiR->getBackedgeValue(); 9465 assert(cast<VPRecipeBase>(Red->getDef())->getParent() != LatchVPBB && 9466 "reduction recipe must be defined before latch"); 9467 Builder.createNaryOp(Instruction::Select, {Cond, Red, PhiR}); 9468 } 9469 } 9470 } 9471 9472 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 9473 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, 9474 VPSlotTracker &SlotTracker) const { 9475 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; 9476 IG->getInsertPos()->printAsOperand(O, false); 9477 O << ", "; 9478 getAddr()->printAsOperand(O, SlotTracker); 9479 VPValue *Mask = getMask(); 9480 if (Mask) { 9481 O << ", "; 9482 Mask->printAsOperand(O, SlotTracker); 9483 } 9484 9485 unsigned OpIdx = 0; 9486 for (unsigned i = 0; i < IG->getFactor(); ++i) { 9487 if (!IG->getMember(i)) 9488 continue; 9489 if (getNumStoreOperands() > 0) { 9490 O << "\n" << Indent << " store "; 9491 getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker); 9492 O << " to index " << i; 9493 } else { 9494 O << "\n" << Indent << " "; 9495 getVPValue(OpIdx)->printAsOperand(O, SlotTracker); 9496 O << " = load from index " << i; 9497 } 9498 ++OpIdx; 9499 } 9500 } 9501 #endif 9502 9503 void VPWidenCallRecipe::execute(VPTransformState &State) { 9504 State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this, 9505 *this, State); 9506 } 9507 9508 void VPWidenSelectRecipe::execute(VPTransformState &State) { 9509 auto &I = *cast<SelectInst>(getUnderlyingInstr()); 9510 State.ILV->setDebugLocFromInst(&I); 9511 9512 // The condition can be loop invariant but still defined inside the 9513 // loop. This means that we can't just use the original 'cond' value. 9514 // We have to take the 'vectorized' value and pick the first lane. 9515 // Instcombine will make this a no-op. 9516 auto *InvarCond = 9517 InvariantCond ? State.get(getOperand(0), VPIteration(0, 0)) : nullptr; 9518 9519 for (unsigned Part = 0; Part < State.UF; ++Part) { 9520 Value *Cond = InvarCond ? 
InvarCond : State.get(getOperand(0), Part); 9521 Value *Op0 = State.get(getOperand(1), Part); 9522 Value *Op1 = State.get(getOperand(2), Part); 9523 Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1); 9524 State.set(this, Sel, Part); 9525 State.ILV->addMetadata(Sel, &I); 9526 } 9527 } 9528 9529 void VPWidenRecipe::execute(VPTransformState &State) { 9530 auto &I = *cast<Instruction>(getUnderlyingValue()); 9531 auto &Builder = State.Builder; 9532 switch (I.getOpcode()) { 9533 case Instruction::Call: 9534 case Instruction::Br: 9535 case Instruction::PHI: 9536 case Instruction::GetElementPtr: 9537 case Instruction::Select: 9538 llvm_unreachable("This instruction is handled by a different recipe."); 9539 case Instruction::UDiv: 9540 case Instruction::SDiv: 9541 case Instruction::SRem: 9542 case Instruction::URem: 9543 case Instruction::Add: 9544 case Instruction::FAdd: 9545 case Instruction::Sub: 9546 case Instruction::FSub: 9547 case Instruction::FNeg: 9548 case Instruction::Mul: 9549 case Instruction::FMul: 9550 case Instruction::FDiv: 9551 case Instruction::FRem: 9552 case Instruction::Shl: 9553 case Instruction::LShr: 9554 case Instruction::AShr: 9555 case Instruction::And: 9556 case Instruction::Or: 9557 case Instruction::Xor: { 9558 // Just widen unops and binops. 9559 State.ILV->setDebugLocFromInst(&I); 9560 9561 for (unsigned Part = 0; Part < State.UF; ++Part) { 9562 SmallVector<Value *, 2> Ops; 9563 for (VPValue *VPOp : operands()) 9564 Ops.push_back(State.get(VPOp, Part)); 9565 9566 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); 9567 9568 if (auto *VecOp = dyn_cast<Instruction>(V)) { 9569 VecOp->copyIRFlags(&I); 9570 9571 // If the instruction is vectorized and was in a basic block that needed 9572 // predication, we can't propagate poison-generating flags (nuw/nsw, 9573 // exact, etc.). The control flow has been linearized and the 9574 // instruction is no longer guarded by the predicate, which could make 9575 // the flag properties to no longer hold. 9576 if (State.MayGeneratePoisonRecipes.contains(this)) 9577 VecOp->dropPoisonGeneratingFlags(); 9578 } 9579 9580 // Use this vector value for all users of the original instruction. 9581 State.set(this, V, Part); 9582 State.ILV->addMetadata(V, &I); 9583 } 9584 9585 break; 9586 } 9587 case Instruction::ICmp: 9588 case Instruction::FCmp: { 9589 // Widen compares. Generate vector compares. 9590 bool FCmp = (I.getOpcode() == Instruction::FCmp); 9591 auto *Cmp = cast<CmpInst>(&I); 9592 State.ILV->setDebugLocFromInst(Cmp); 9593 for (unsigned Part = 0; Part < State.UF; ++Part) { 9594 Value *A = State.get(getOperand(0), Part); 9595 Value *B = State.get(getOperand(1), Part); 9596 Value *C = nullptr; 9597 if (FCmp) { 9598 // Propagate fast math flags. 
9599 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 9600 Builder.setFastMathFlags(Cmp->getFastMathFlags()); 9601 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); 9602 } else { 9603 C = Builder.CreateICmp(Cmp->getPredicate(), A, B); 9604 } 9605 State.set(this, C, Part); 9606 State.ILV->addMetadata(C, &I); 9607 } 9608 9609 break; 9610 } 9611 9612 case Instruction::ZExt: 9613 case Instruction::SExt: 9614 case Instruction::FPToUI: 9615 case Instruction::FPToSI: 9616 case Instruction::FPExt: 9617 case Instruction::PtrToInt: 9618 case Instruction::IntToPtr: 9619 case Instruction::SIToFP: 9620 case Instruction::UIToFP: 9621 case Instruction::Trunc: 9622 case Instruction::FPTrunc: 9623 case Instruction::BitCast: { 9624 auto *CI = cast<CastInst>(&I); 9625 State.ILV->setDebugLocFromInst(CI); 9626 9627 /// Vectorize casts. 9628 Type *DestTy = (State.VF.isScalar()) 9629 ? CI->getType() 9630 : VectorType::get(CI->getType(), State.VF); 9631 9632 for (unsigned Part = 0; Part < State.UF; ++Part) { 9633 Value *A = State.get(getOperand(0), Part); 9634 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); 9635 State.set(this, Cast, Part); 9636 State.ILV->addMetadata(Cast, &I); 9637 } 9638 break; 9639 } 9640 default: 9641 // This instruction is not vectorized by simple widening. 9642 LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); 9643 llvm_unreachable("Unhandled instruction!"); 9644 } // end of switch. 9645 } 9646 9647 void VPWidenGEPRecipe::execute(VPTransformState &State) { 9648 auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr()); 9649 // Construct a vector GEP by widening the operands of the scalar GEP as 9650 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP 9651 // results in a vector of pointers when at least one operand of the GEP 9652 // is vector-typed. Thus, to keep the representation compact, we only use 9653 // vector-typed operands for loop-varying values. 9654 9655 if (State.VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { 9656 // If we are vectorizing, but the GEP has only loop-invariant operands, 9657 // the GEP we build (by only using vector-typed operands for 9658 // loop-varying values) would be a scalar pointer. Thus, to ensure we 9659 // produce a vector of pointers, we need to either arbitrarily pick an 9660 // operand to broadcast, or broadcast a clone of the original GEP. 9661 // Here, we broadcast a clone of the original. 9662 // 9663 // TODO: If at some point we decide to scalarize instructions having 9664 // loop-invariant operands, this special case will no longer be 9665 // required. We would add the scalarization decision to 9666 // collectLoopScalars() and teach getVectorValue() to broadcast 9667 // the lane-zero scalar value. 9668 auto *Clone = State.Builder.Insert(GEP->clone()); 9669 for (unsigned Part = 0; Part < State.UF; ++Part) { 9670 Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, Clone); 9671 State.set(this, EntryPart, Part); 9672 State.ILV->addMetadata(EntryPart, GEP); 9673 } 9674 } else { 9675 // If the GEP has at least one loop-varying operand, we are sure to 9676 // produce a vector of pointers. But if we are only unrolling, we want 9677 // to produce a scalar GEP for each unroll part. Thus, the GEP we 9678 // produce with the code below will be scalar (if VF == 1) or vector 9679 // (otherwise). Note that for the unroll-only case, we still maintain 9680 // values in the vector mapping with initVector, as we do for other 9681 // instructions. 
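    // For example (VF = 4, fixed width, illustrative), a scalar GEP with a
    // loop-varying index
    //   %gep = getelementptr inbounds float, float* %base, i64 %iv
    // is widened into a single GEP producing a vector of pointers:
    //   %gep.vec = getelementptr inbounds float, float* %base, <4 x i64> %iv.vec
    // while the loop-invariant base pointer stays scalar.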
9682 for (unsigned Part = 0; Part < State.UF; ++Part) { 9683 // The pointer operand of the new GEP. If it's loop-invariant, we 9684 // won't broadcast it. 9685 auto *Ptr = IsPtrLoopInvariant 9686 ? State.get(getOperand(0), VPIteration(0, 0)) 9687 : State.get(getOperand(0), Part); 9688 9689 // Collect all the indices for the new GEP. If any index is 9690 // loop-invariant, we won't broadcast it. 9691 SmallVector<Value *, 4> Indices; 9692 for (unsigned I = 1, E = getNumOperands(); I < E; I++) { 9693 VPValue *Operand = getOperand(I); 9694 if (IsIndexLoopInvariant[I - 1]) 9695 Indices.push_back(State.get(Operand, VPIteration(0, 0))); 9696 else 9697 Indices.push_back(State.get(Operand, Part)); 9698 } 9699 9700 // If the GEP instruction is vectorized and was in a basic block that 9701 // needed predication, we can't propagate the poison-generating 'inbounds' 9702 // flag. The control flow has been linearized and the GEP is no longer 9703 // guarded by the predicate, which could make the 'inbounds' properties to 9704 // no longer hold. 9705 bool IsInBounds = 9706 GEP->isInBounds() && State.MayGeneratePoisonRecipes.count(this) == 0; 9707 9708 // Create the new GEP. Note that this GEP may be a scalar if VF == 1, 9709 // but it should be a vector, otherwise. 9710 auto *NewGEP = IsInBounds 9711 ? State.Builder.CreateInBoundsGEP( 9712 GEP->getSourceElementType(), Ptr, Indices) 9713 : State.Builder.CreateGEP(GEP->getSourceElementType(), 9714 Ptr, Indices); 9715 assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) && 9716 "NewGEP is not a pointer vector"); 9717 State.set(this, NewGEP, Part); 9718 State.ILV->addMetadata(NewGEP, GEP); 9719 } 9720 } 9721 } 9722 9723 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { 9724 assert(!State.Instance && "Int or FP induction being replicated."); 9725 auto *CanonicalIV = State.get(getParent()->getPlan()->getCanonicalIV(), 0); 9726 State.ILV->widenIntOrFpInduction(IV, this, State, CanonicalIV); 9727 } 9728 9729 void VPScalarIVStepsRecipe::execute(VPTransformState &State) { 9730 assert(!State.Instance && "VPScalarIVStepsRecipe being replicated."); 9731 9732 // Fast-math-flags propagate from the original induction instruction. 9733 IRBuilder<>::FastMathFlagGuard FMFG(State.Builder); 9734 if (IndDesc.getInductionBinOp() && 9735 isa<FPMathOperator>(IndDesc.getInductionBinOp())) 9736 State.Builder.setFastMathFlags( 9737 IndDesc.getInductionBinOp()->getFastMathFlags()); 9738 9739 Value *Step = State.get(getStepValue(), VPIteration(0, 0)); 9740 auto CreateScalarIV = [&](Value *&Step) -> Value * { 9741 Value *ScalarIV = State.get(getCanonicalIV(), VPIteration(0, 0)); 9742 auto *CanonicalIV = State.get(getParent()->getPlan()->getCanonicalIV(), 0); 9743 if (!isCanonical() || CanonicalIV->getType() != Ty) { 9744 ScalarIV = 9745 Ty->isIntegerTy() 9746 ? 
State.Builder.CreateSExtOrTrunc(ScalarIV, Ty) 9747 : State.Builder.CreateCast(Instruction::SIToFP, ScalarIV, Ty); 9748 ScalarIV = emitTransformedIndex(State.Builder, ScalarIV, 9749 getStartValue()->getLiveInIRValue(), Step, 9750 IndDesc); 9751 ScalarIV->setName("offset.idx"); 9752 } 9753 if (TruncToTy) { 9754 assert(Step->getType()->isIntegerTy() && 9755 "Truncation requires an integer step"); 9756 ScalarIV = State.Builder.CreateTrunc(ScalarIV, TruncToTy); 9757 Step = State.Builder.CreateTrunc(Step, TruncToTy); 9758 } 9759 return ScalarIV; 9760 }; 9761 9762 Value *ScalarIV = CreateScalarIV(Step); 9763 if (State.VF.isVector()) { 9764 buildScalarSteps(ScalarIV, Step, IndDesc, this, State); 9765 return; 9766 } 9767 9768 for (unsigned Part = 0; Part < State.UF; ++Part) { 9769 assert(!State.VF.isScalable() && "scalable vectors not yet supported."); 9770 Value *EntryPart; 9771 if (Step->getType()->isFloatingPointTy()) { 9772 Value *StartIdx = 9773 getRuntimeVFAsFloat(State.Builder, Step->getType(), State.VF * Part); 9774 // Floating-point operations inherit FMF via the builder's flags. 9775 Value *MulOp = State.Builder.CreateFMul(StartIdx, Step); 9776 EntryPart = State.Builder.CreateBinOp(IndDesc.getInductionOpcode(), 9777 ScalarIV, MulOp); 9778 } else { 9779 Value *StartIdx = 9780 getRuntimeVF(State.Builder, Step->getType(), State.VF * Part); 9781 EntryPart = State.Builder.CreateAdd( 9782 ScalarIV, State.Builder.CreateMul(StartIdx, Step), "induction"); 9783 } 9784 State.set(this, EntryPart, Part); 9785 } 9786 } 9787 9788 void VPWidenPHIRecipe::execute(VPTransformState &State) { 9789 State.ILV->widenPHIInstruction(cast<PHINode>(getUnderlyingValue()), this, 9790 State); 9791 } 9792 9793 void VPBlendRecipe::execute(VPTransformState &State) { 9794 State.ILV->setDebugLocFromInst(Phi, &State.Builder); 9795 // We know that all PHIs in non-header blocks are converted into 9796 // selects, so we don't have to worry about the insertion order and we 9797 // can just use the builder. 9798 // At this point we generate the predication tree. There may be 9799 // duplications since this is a simple recursive scan, but future 9800 // optimizations will clean it up. 9801 9802 unsigned NumIncoming = getNumIncomingValues(); 9803 9804 // Generate a sequence of selects of the form: 9805 // SELECT(Mask3, In3, 9806 // SELECT(Mask2, In2, 9807 // SELECT(Mask1, In1, 9808 // In0))) 9809 // Note that Mask0 is never used: lanes for which no path reaches this phi and 9810 // are essentially undef are taken from In0. 9811 InnerLoopVectorizer::VectorParts Entry(State.UF); 9812 for (unsigned In = 0; In < NumIncoming; ++In) { 9813 for (unsigned Part = 0; Part < State.UF; ++Part) { 9814 // We might have single edge PHIs (blocks) - use an identity 9815 // 'select' for the first PHI operand. 9816 Value *In0 = State.get(getIncomingValue(In), Part); 9817 if (In == 0) 9818 Entry[Part] = In0; // Initialize with the first incoming value. 9819 else { 9820 // Select between the current value and the previous incoming edge 9821 // based on the incoming mask. 
9822 Value *Cond = State.get(getMask(In), Part); 9823 Entry[Part] = 9824 State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi"); 9825 } 9826 } 9827 } 9828 for (unsigned Part = 0; Part < State.UF; ++Part) 9829 State.set(this, Entry[Part], Part); 9830 } 9831 9832 void VPInterleaveRecipe::execute(VPTransformState &State) { 9833 assert(!State.Instance && "Interleave group being replicated."); 9834 State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(), 9835 getStoredValues(), getMask()); 9836 } 9837 9838 void VPReductionRecipe::execute(VPTransformState &State) { 9839 assert(!State.Instance && "Reduction being replicated."); 9840 Value *PrevInChain = State.get(getChainOp(), 0); 9841 RecurKind Kind = RdxDesc->getRecurrenceKind(); 9842 bool IsOrdered = State.ILV->useOrderedReductions(*RdxDesc); 9843 // Propagate the fast-math flags carried by the underlying instruction. 9844 IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder); 9845 State.Builder.setFastMathFlags(RdxDesc->getFastMathFlags()); 9846 for (unsigned Part = 0; Part < State.UF; ++Part) { 9847 Value *NewVecOp = State.get(getVecOp(), Part); 9848 if (VPValue *Cond = getCondOp()) { 9849 Value *NewCond = State.get(Cond, Part); 9850 VectorType *VecTy = cast<VectorType>(NewVecOp->getType()); 9851 Value *Iden = RdxDesc->getRecurrenceIdentity( 9852 Kind, VecTy->getElementType(), RdxDesc->getFastMathFlags()); 9853 Value *IdenVec = 9854 State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden); 9855 Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec); 9856 NewVecOp = Select; 9857 } 9858 Value *NewRed; 9859 Value *NextInChain; 9860 if (IsOrdered) { 9861 if (State.VF.isVector()) 9862 NewRed = createOrderedReduction(State.Builder, *RdxDesc, NewVecOp, 9863 PrevInChain); 9864 else 9865 NewRed = State.Builder.CreateBinOp( 9866 (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), PrevInChain, 9867 NewVecOp); 9868 PrevInChain = NewRed; 9869 } else { 9870 PrevInChain = State.get(getChainOp(), Part); 9871 NewRed = createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp); 9872 } 9873 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9874 NextInChain = 9875 createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(), 9876 NewRed, PrevInChain); 9877 } else if (IsOrdered) 9878 NextInChain = NewRed; 9879 else 9880 NextInChain = State.Builder.CreateBinOp( 9881 (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), NewRed, 9882 PrevInChain); 9883 State.set(this, NextInChain, Part); 9884 } 9885 } 9886 9887 void VPReplicateRecipe::execute(VPTransformState &State) { 9888 if (State.Instance) { // Generate a single instance. 9889 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector"); 9890 State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *State.Instance, 9891 IsPredicated, State); 9892 // Insert scalar instance packing it into a vector. 9893 if (AlsoPack && State.VF.isVector()) { 9894 // If we're constructing lane 0, initialize to start from poison. 
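      // Packing builds up the vector value lane by lane with insertelement,
      // e.g. (VF = 4, illustrative):
      //   %pack.0 = insertelement <4 x i32> poison, i32 %s0, i32 0
      //   %pack.1 = insertelement <4 x i32> %pack.0, i32 %s1, i32 1
      //   ...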
9895       if (State.Instance->Lane.isFirstLane()) {
9896         assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
9897         Value *Poison = PoisonValue::get(
9898             VectorType::get(getUnderlyingValue()->getType(), State.VF));
9899         State.set(this, Poison, State.Instance->Part);
9900       }
9901       State.ILV->packScalarIntoVectorValue(this, *State.Instance, State);
9902     }
9903     return;
9904   }
9905
9906   // Generate scalar instances for all VF lanes of all UF parts, unless the
9907   // instruction is uniform, in which case generate only the first lane for each
9908   // of the UF parts.
9909   unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();
9910   assert((!State.VF.isScalable() || IsUniform) &&
9911          "Can't scalarize a scalable vector");
9912   for (unsigned Part = 0; Part < State.UF; ++Part)
9913     for (unsigned Lane = 0; Lane < EndLane; ++Lane)
9914       State.ILV->scalarizeInstruction(getUnderlyingInstr(), this,
9915                                       VPIteration(Part, Lane), IsPredicated,
9916                                       State);
9917 }
9918
9919 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
9920   assert(State.Instance && "Branch on Mask works only on single instance.");
9921
9922   unsigned Part = State.Instance->Part;
9923   unsigned Lane = State.Instance->Lane.getKnownLane();
9924
9925   Value *ConditionBit = nullptr;
9926   VPValue *BlockInMask = getMask();
9927   if (BlockInMask) {
9928     ConditionBit = State.get(BlockInMask, Part);
9929     if (ConditionBit->getType()->isVectorTy())
9930       ConditionBit = State.Builder.CreateExtractElement(
9931           ConditionBit, State.Builder.getInt32(Lane));
9932   } else // Block in mask is all-one.
9933     ConditionBit = State.Builder.getTrue();
9934
9935   // Replace the temporary unreachable terminator with a new conditional branch,
9936   // whose two destinations will be set later when they are created.
9937   auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
9938   assert(isa<UnreachableInst>(CurrentTerminator) &&
9939          "Expected to replace unreachable terminator with conditional branch.");
9940   auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
9941   CondBr->setSuccessor(0, nullptr);
9942   ReplaceInstWithInst(CurrentTerminator, CondBr);
9943 }
9944
9945 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
9946   assert(State.Instance && "Predicated instruction PHI works per instance.");
9947   Instruction *ScalarPredInst =
9948       cast<Instruction>(State.get(getOperand(0), *State.Instance));
9949   BasicBlock *PredicatedBB = ScalarPredInst->getParent();
9950   BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
9951   assert(PredicatingBB && "Predicated block has no single predecessor.");
9952   assert(isa<VPReplicateRecipe>(getOperand(0)) &&
9953          "operand must be VPReplicateRecipe");
9954
9955   // By current pack/unpack logic we need to generate only a single phi node: if
9956   // a vector value for the predicated instruction exists at this point it means
9957   // the instruction has vector users only, and a phi for the vector value is
9958   // needed. In this case the recipe of the predicated instruction is marked to
9959   // also do that packing, thereby "hoisting" the insert-element sequence.
9960   // Otherwise, a phi node for the scalar value is needed.
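  // Illustrative shape of the generated code for one predicated lane in the
  // vector-value case (block and value names are examples only):
  //   pred.udiv.if:                          ; guarded by the mask bit
  //     %d = udiv i32 %x, %y
  //     %v.new = insertelement <4 x i32> %v.prev, i32 %d, i32 0
  //     br label %pred.udiv.continue
  //   pred.udiv.continue:
  //     %v.phi = phi <4 x i32> [ %v.prev, %pred.guard ], [ %v.new, %pred.udiv.if ]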
9961 unsigned Part = State.Instance->Part; 9962 if (State.hasVectorValue(getOperand(0), Part)) { 9963 Value *VectorValue = State.get(getOperand(0), Part); 9964 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); 9965 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); 9966 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. 9967 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. 9968 if (State.hasVectorValue(this, Part)) 9969 State.reset(this, VPhi, Part); 9970 else 9971 State.set(this, VPhi, Part); 9972 // NOTE: Currently we need to update the value of the operand, so the next 9973 // predicated iteration inserts its generated value in the correct vector. 9974 State.reset(getOperand(0), VPhi, Part); 9975 } else { 9976 Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType(); 9977 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); 9978 Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()), 9979 PredicatingBB); 9980 Phi->addIncoming(ScalarPredInst, PredicatedBB); 9981 if (State.hasScalarValue(this, *State.Instance)) 9982 State.reset(this, Phi, *State.Instance); 9983 else 9984 State.set(this, Phi, *State.Instance); 9985 // NOTE: Currently we need to update the value of the operand, so the next 9986 // predicated iteration inserts its generated value in the correct vector. 9987 State.reset(getOperand(0), Phi, *State.Instance); 9988 } 9989 } 9990 9991 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { 9992 VPValue *StoredValue = isStore() ? getStoredValue() : nullptr; 9993 9994 // Attempt to issue a wide load. 9995 LoadInst *LI = dyn_cast<LoadInst>(&Ingredient); 9996 StoreInst *SI = dyn_cast<StoreInst>(&Ingredient); 9997 9998 assert((LI || SI) && "Invalid Load/Store instruction"); 9999 assert((!SI || StoredValue) && "No stored value provided for widened store"); 10000 assert((!LI || !StoredValue) && "Stored value provided for widened load"); 10001 10002 Type *ScalarDataTy = getLoadStoreType(&Ingredient); 10003 10004 auto *DataTy = VectorType::get(ScalarDataTy, State.VF); 10005 const Align Alignment = getLoadStoreAlignment(&Ingredient); 10006 bool CreateGatherScatter = !Consecutive; 10007 10008 auto &Builder = State.Builder; 10009 InnerLoopVectorizer::VectorParts BlockInMaskParts(State.UF); 10010 bool isMaskRequired = getMask(); 10011 if (isMaskRequired) 10012 for (unsigned Part = 0; Part < State.UF; ++Part) 10013 BlockInMaskParts[Part] = State.get(getMask(), Part); 10014 10015 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { 10016 // Calculate the pointer for the specific unroll-part. 10017 GetElementPtrInst *PartPtr = nullptr; 10018 10019 bool InBounds = false; 10020 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) 10021 InBounds = gep->isInBounds(); 10022 if (Reverse) { 10023 // If the address is consecutive but reversed, then the 10024 // wide store needs to start at the last vector element. 
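      // For example (fixed-width VF = 4, so RunTimeVF = 4, illustrative), the
      // computation below gives:
      //   Part 0: NumElt = 0,  LastLane = -3  ->  PartPtr = Ptr - 3
      //   Part 1: NumElt = -4, LastLane = -3  ->  PartPtr = Ptr - 7
      // i.e. each part accesses the four elements ending at its original
      // scalar address, and the loaded/stored value (and mask) is then
      // reversed.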
10025 // RunTimeVF = VScale * VF.getKnownMinValue() 10026 // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue() 10027 Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), State.VF); 10028 // NumElt = -Part * RunTimeVF 10029 Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF); 10030 // LastLane = 1 - RunTimeVF 10031 Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF); 10032 PartPtr = 10033 cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt)); 10034 PartPtr->setIsInBounds(InBounds); 10035 PartPtr = cast<GetElementPtrInst>( 10036 Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane)); 10037 PartPtr->setIsInBounds(InBounds); 10038 if (isMaskRequired) // Reverse of a null all-one mask is a null mask. 10039 BlockInMaskParts[Part] = 10040 Builder.CreateVectorReverse(BlockInMaskParts[Part], "reverse"); 10041 } else { 10042 Value *Increment = 10043 createStepForVF(Builder, Builder.getInt32Ty(), State.VF, Part); 10044 PartPtr = cast<GetElementPtrInst>( 10045 Builder.CreateGEP(ScalarDataTy, Ptr, Increment)); 10046 PartPtr->setIsInBounds(InBounds); 10047 } 10048 10049 unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); 10050 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); 10051 }; 10052 10053 // Handle Stores: 10054 if (SI) { 10055 State.ILV->setDebugLocFromInst(SI); 10056 10057 for (unsigned Part = 0; Part < State.UF; ++Part) { 10058 Instruction *NewSI = nullptr; 10059 Value *StoredVal = State.get(StoredValue, Part); 10060 if (CreateGatherScatter) { 10061 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 10062 Value *VectorGep = State.get(getAddr(), Part); 10063 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, 10064 MaskPart); 10065 } else { 10066 if (Reverse) { 10067 // If we store to reverse consecutive memory locations, then we need 10068 // to reverse the order of elements in the stored value. 10069 StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse"); 10070 // We don't want to update the value in the map as it might be used in 10071 // another expression. So don't call resetVectorValue(StoredVal). 10072 } 10073 auto *VecPtr = 10074 CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0))); 10075 if (isMaskRequired) 10076 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, 10077 BlockInMaskParts[Part]); 10078 else 10079 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); 10080 } 10081 State.ILV->addMetadata(NewSI, SI); 10082 } 10083 return; 10084 } 10085 10086 // Handle loads. 10087 assert(LI && "Must have a load instruction"); 10088 State.ILV->setDebugLocFromInst(LI); 10089 for (unsigned Part = 0; Part < State.UF; ++Part) { 10090 Value *NewLI; 10091 if (CreateGatherScatter) { 10092 Value *MaskPart = isMaskRequired ? 
BlockInMaskParts[Part] : nullptr; 10093 Value *VectorGep = State.get(getAddr(), Part); 10094 NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart, 10095 nullptr, "wide.masked.gather"); 10096 State.ILV->addMetadata(NewLI, LI); 10097 } else { 10098 auto *VecPtr = 10099 CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0))); 10100 if (isMaskRequired) 10101 NewLI = Builder.CreateMaskedLoad( 10102 DataTy, VecPtr, Alignment, BlockInMaskParts[Part], 10103 PoisonValue::get(DataTy), "wide.masked.load"); 10104 else 10105 NewLI = 10106 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); 10107 10108 // Add metadata to the load, but setVectorValue to the reverse shuffle. 10109 State.ILV->addMetadata(NewLI, LI); 10110 if (Reverse) 10111 NewLI = Builder.CreateVectorReverse(NewLI, "reverse"); 10112 } 10113 10114 State.set(this, NewLI, Part); 10115 } 10116 } 10117 10118 // Determine how to lower the scalar epilogue, which depends on 1) optimising 10119 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing 10120 // predication, and 4) a TTI hook that analyses whether the loop is suitable 10121 // for predication. 10122 static ScalarEpilogueLowering getScalarEpilogueLowering( 10123 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, 10124 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, 10125 AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, 10126 LoopVectorizationLegality &LVL) { 10127 // 1) OptSize takes precedence over all other options, i.e. if this is set, 10128 // don't look at hints or options, and don't request a scalar epilogue. 10129 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from 10130 // LoopAccessInfo (due to code dependency and not being able to reliably get 10131 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection 10132 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without 10133 // versioning when the vectorization is forced, unlike hasOptSize. So revert 10134 // back to the old way and vectorize with versioning when forced. See D81345.) 10135 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, 10136 PGSOQueryType::IRPass) && 10137 Hints.getForce() != LoopVectorizeHints::FK_Enabled)) 10138 return CM_ScalarEpilogueNotAllowedOptSize; 10139 10140 // 2) If set, obey the directives 10141 if (PreferPredicateOverEpilogue.getNumOccurrences()) { 10142 switch (PreferPredicateOverEpilogue) { 10143 case PreferPredicateTy::ScalarEpilogue: 10144 return CM_ScalarEpilogueAllowed; 10145 case PreferPredicateTy::PredicateElseScalarEpilogue: 10146 return CM_ScalarEpilogueNotNeededUsePredicate; 10147 case PreferPredicateTy::PredicateOrDontVectorize: 10148 return CM_ScalarEpilogueNotAllowedUsePredicate; 10149 }; 10150 } 10151 10152 // 3) If set, obey the hints 10153 switch (Hints.getPredicate()) { 10154 case LoopVectorizeHints::FK_Enabled: 10155 return CM_ScalarEpilogueNotNeededUsePredicate; 10156 case LoopVectorizeHints::FK_Disabled: 10157 return CM_ScalarEpilogueAllowed; 10158 }; 10159 10160 // 4) if the TTI hook indicates this is profitable, request predication. 
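  // For example (illustrative): with -Os and no forcing hint, case 1) already
  // returns CM_ScalarEpilogueNotAllowedOptSize; otherwise an explicit
  // PreferPredicateTy::PredicateOrDontVectorize returns
  // CM_ScalarEpilogueNotAllowedUsePredicate in case 2), without consulting the
  // loop hints or the TTI hook at all.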
10161 if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, 10162 LVL.getLAI())) 10163 return CM_ScalarEpilogueNotNeededUsePredicate; 10164 10165 return CM_ScalarEpilogueAllowed; 10166 } 10167 10168 Value *VPTransformState::get(VPValue *Def, unsigned Part) { 10169 // If Values have been set for this Def return the one relevant for \p Part. 10170 if (hasVectorValue(Def, Part)) 10171 return Data.PerPartOutput[Def][Part]; 10172 10173 if (!hasScalarValue(Def, {Part, 0})) { 10174 Value *IRV = Def->getLiveInIRValue(); 10175 Value *B = ILV->getBroadcastInstrs(IRV); 10176 set(Def, B, Part); 10177 return B; 10178 } 10179 10180 Value *ScalarValue = get(Def, {Part, 0}); 10181 // If we aren't vectorizing, we can just copy the scalar map values over 10182 // to the vector map. 10183 if (VF.isScalar()) { 10184 set(Def, ScalarValue, Part); 10185 return ScalarValue; 10186 } 10187 10188 auto *RepR = dyn_cast<VPReplicateRecipe>(Def); 10189 bool IsUniform = RepR && RepR->isUniform(); 10190 10191 unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1; 10192 // Check if there is a scalar value for the selected lane. 10193 if (!hasScalarValue(Def, {Part, LastLane})) { 10194 // At the moment, VPWidenIntOrFpInductionRecipes can also be uniform. 10195 assert((isa<VPWidenIntOrFpInductionRecipe>(Def->getDef()) || 10196 isa<VPScalarIVStepsRecipe>(Def->getDef())) && 10197 "unexpected recipe found to be invariant"); 10198 IsUniform = true; 10199 LastLane = 0; 10200 } 10201 10202 auto *LastInst = cast<Instruction>(get(Def, {Part, LastLane})); 10203 // Set the insert point after the last scalarized instruction or after the 10204 // last PHI, if LastInst is a PHI. This ensures the insertelement sequence 10205 // will directly follow the scalar definitions. 10206 auto OldIP = Builder.saveIP(); 10207 auto NewIP = 10208 isa<PHINode>(LastInst) 10209 ? BasicBlock::iterator(LastInst->getParent()->getFirstNonPHI()) 10210 : std::next(BasicBlock::iterator(LastInst)); 10211 Builder.SetInsertPoint(&*NewIP); 10212 10213 // However, if we are vectorizing, we need to construct the vector values. 10214 // If the value is known to be uniform after vectorization, we can just 10215 // broadcast the scalar value corresponding to lane zero for each unroll 10216 // iteration. Otherwise, we construct the vector values using 10217 // insertelement instructions. Since the resulting vectors are stored in 10218 // State, we will only generate the insertelements once. 10219 Value *VectorValue = nullptr; 10220 if (IsUniform) { 10221 VectorValue = ILV->getBroadcastInstrs(ScalarValue); 10222 set(Def, VectorValue, Part); 10223 } else { 10224 // Initialize packing with insertelements to start from undef. 10225 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 10226 Value *Undef = PoisonValue::get(VectorType::get(LastInst->getType(), VF)); 10227 set(Def, Undef, Part); 10228 for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane) 10229 ILV->packScalarIntoVectorValue(Def, {Part, Lane}, *this); 10230 VectorValue = get(Def, Part); 10231 } 10232 Builder.restoreIP(OldIP); 10233 return VectorValue; 10234 } 10235 10236 // Process the loop in the VPlan-native vectorization path. This path builds 10237 // VPlan upfront in the vectorization pipeline, which allows to apply 10238 // VPlan-to-VPlan transformations from the very beginning without modifying the 10239 // input LLVM IR. 
10240 static bool processLoopInVPlanNativePath( 10241 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, 10242 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, 10243 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, 10244 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, 10245 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints, 10246 LoopVectorizationRequirements &Requirements) { 10247 10248 if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) { 10249 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n"); 10250 return false; 10251 } 10252 assert(EnableVPlanNativePath && "VPlan-native path is disabled."); 10253 Function *F = L->getHeader()->getParent(); 10254 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); 10255 10256 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 10257 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL); 10258 10259 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, 10260 &Hints, IAI); 10261 // Use the planner for outer loop vectorization. 10262 // TODO: CM is not used at this point inside the planner. Turn CM into an 10263 // optional argument if we don't need it in the future. 10264 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints, 10265 Requirements, ORE); 10266 10267 // Get user vectorization factor. 10268 ElementCount UserVF = Hints.getWidth(); 10269 10270 CM.collectElementTypesForWidening(); 10271 10272 // Plan how to best vectorize, return the best VF and its cost. 10273 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF); 10274 10275 // If we are stress testing VPlan builds, do not attempt to generate vector 10276 // code. Masked vector code generation support will follow soon. 10277 // Also, do not attempt to vectorize if no vector code will be produced. 10278 if (VPlanBuildStressTest || EnableVPlanPredication || 10279 VectorizationFactor::Disabled() == VF) 10280 return false; 10281 10282 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); 10283 10284 { 10285 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, 10286 F->getParent()->getDataLayout()); 10287 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL, 10288 &CM, BFI, PSI, Checks); 10289 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" 10290 << L->getHeader()->getParent()->getName() << "\"\n"); 10291 LVP.executePlan(VF.Width, 1, BestPlan, LB, DT); 10292 } 10293 10294 // Mark the loop as already vectorized to avoid vectorizing again. 10295 Hints.setAlreadyVectorized(); 10296 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 10297 return true; 10298 } 10299 10300 // Emit a remark if there are stores to floats that required a floating point 10301 // extension. If the vectorized loop was generated with floating point there 10302 // will be a performance penalty from the conversion overhead and the change in 10303 // the vector width. 10304 static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) { 10305 SmallVector<Instruction *, 4> Worklist; 10306 for (BasicBlock *BB : L->getBlocks()) { 10307 for (Instruction &Inst : *BB) { 10308 if (auto *S = dyn_cast<StoreInst>(&Inst)) { 10309 if (S->getValueOperand()->getType()->isFloatTy()) 10310 Worklist.push_back(S); 10311 } 10312 } 10313 } 10314 10315 // Traverse the floating point stores upwards searching, for floating point 10316 // conversions. 
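  // The upward walk looks for fpext instructions feeding such stores, e.g.
  // (illustrative) a float computation promoted to double:
  //   %e = fpext float %x to double
  //   %m = fmul double %e, %d
  //   %t = fptrunc double %m to float
  //   store float %t, float* %p
  // Walking up from the store reaches the fpext and triggers the remark below.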
10317 SmallPtrSet<const Instruction *, 4> Visited; 10318 SmallPtrSet<const Instruction *, 4> EmittedRemark; 10319 while (!Worklist.empty()) { 10320 auto *I = Worklist.pop_back_val(); 10321 if (!L->contains(I)) 10322 continue; 10323 if (!Visited.insert(I).second) 10324 continue; 10325 10326 // Emit a remark if the floating point store required a floating 10327 // point conversion. 10328 // TODO: More work could be done to identify the root cause such as a 10329 // constant or a function return type and point the user to it. 10330 if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second) 10331 ORE->emit([&]() { 10332 return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision", 10333 I->getDebugLoc(), L->getHeader()) 10334 << "floating point conversion changes vector width. " 10335 << "Mixed floating point precision requires an up/down " 10336 << "cast that will negatively impact performance."; 10337 }); 10338 10339 for (Use &Op : I->operands()) 10340 if (auto *OpI = dyn_cast<Instruction>(Op)) 10341 Worklist.push_back(OpI); 10342 } 10343 } 10344 10345 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts) 10346 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced || 10347 !EnableLoopInterleaving), 10348 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced || 10349 !EnableLoopVectorization) {} 10350 10351 bool LoopVectorizePass::processLoop(Loop *L) { 10352 assert((EnableVPlanNativePath || L->isInnermost()) && 10353 "VPlan-native path is not enabled. Only process inner loops."); 10354 10355 #ifndef NDEBUG 10356 const std::string DebugLocStr = getDebugLocString(L); 10357 #endif /* NDEBUG */ 10358 10359 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \"" 10360 << L->getHeader()->getParent()->getName() << "\" from " 10361 << DebugLocStr << "\n"); 10362 10363 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI); 10364 10365 LLVM_DEBUG( 10366 dbgs() << "LV: Loop hints:" 10367 << " force=" 10368 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled 10369 ? "disabled" 10370 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled 10371 ? "enabled" 10372 : "?")) 10373 << " width=" << Hints.getWidth() 10374 << " interleave=" << Hints.getInterleave() << "\n"); 10375 10376 // Function containing loop 10377 Function *F = L->getHeader()->getParent(); 10378 10379 // Looking at the diagnostic output is the only way to determine if a loop 10380 // was vectorized (other than looking at the IR or machine code), so it 10381 // is important to generate an optimization remark for each loop. Most of 10382 // these messages are generated as OptimizationRemarkAnalysis. Remarks 10383 // generated as OptimizationRemark and OptimizationRemarkMissed are 10384 // less verbose reporting vectorized loops and unvectorized loops that may 10385 // benefit from vectorization, respectively. 10386 10387 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) { 10388 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); 10389 return false; 10390 } 10391 10392 PredicatedScalarEvolution PSE(*SE, *L); 10393 10394 // Check if it is legal to vectorize the loop. 
10395 LoopVectorizationRequirements Requirements; 10396 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE, 10397 &Requirements, &Hints, DB, AC, BFI, PSI); 10398 if (!LVL.canVectorize(EnableVPlanNativePath)) { 10399 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); 10400 Hints.emitRemarkWithHints(); 10401 return false; 10402 } 10403 10404 // Check the function attributes and profiles to find out if this function 10405 // should be optimized for size. 10406 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 10407 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL); 10408 10409 // Entrance to the VPlan-native vectorization path. Outer loops are processed 10410 // here. They may require CFG and instruction level transformations before 10411 // even evaluating whether vectorization is profitable. Since we cannot modify 10412 // the incoming IR, we need to build VPlan upfront in the vectorization 10413 // pipeline. 10414 if (!L->isInnermost()) 10415 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, 10416 ORE, BFI, PSI, Hints, Requirements); 10417 10418 assert(L->isInnermost() && "Inner loop expected."); 10419 10420 // Check the loop for a trip count threshold: vectorize loops with a tiny trip 10421 // count by optimizing for size, to minimize overheads. 10422 auto ExpectedTC = getSmallBestKnownTC(*SE, L); 10423 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) { 10424 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " 10425 << "This loop is worth vectorizing only if no scalar " 10426 << "iteration overheads are incurred."); 10427 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) 10428 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); 10429 else { 10430 LLVM_DEBUG(dbgs() << "\n"); 10431 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; 10432 } 10433 } 10434 10435 // Check the function attributes to see if implicit floats are allowed. 10436 // FIXME: This check doesn't seem possibly correct -- what if the loop is 10437 // an integer loop and the vector instructions selected are purely integer 10438 // vector instructions? 10439 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { 10440 reportVectorizationFailure( 10441 "Can't vectorize when the NoImplicitFloat attribute is used", 10442 "loop not vectorized due to NoImplicitFloat attribute", 10443 "NoImplicitFloat", ORE, L); 10444 Hints.emitRemarkWithHints(); 10445 return false; 10446 } 10447 10448 // Check if the target supports potentially unsafe FP vectorization. 10449 // FIXME: Add a check for the type of safety issue (denormal, signaling) 10450 // for the target we're vectorizing for, to make sure none of the 10451 // additional fp-math flags can help. 10452 if (Hints.isPotentiallyUnsafe() && 10453 TTI->isFPVectorizationPotentiallyUnsafe()) { 10454 reportVectorizationFailure( 10455 "Potentially unsafe FP op prevents vectorization", 10456 "loop not vectorized due to unsafe FP support.", 10457 "UnsafeFP", ORE, L); 10458 Hints.emitRemarkWithHints(); 10459 return false; 10460 } 10461 10462 bool AllowOrderedReductions; 10463 // If the flag is set, use that instead and override the TTI behaviour. 
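  // (An ordered, i.e. in-order, FP reduction keeps the sequential association
  // of the scalar loop, e.g. by emitting a strict reduction per part
  // (VF = 4, illustrative):
  //   %sum.next = call float @llvm.vector.reduce.fadd.v4f32(float %sum, <4 x float> %v)
  // instead of reassociating into independent partial sums.)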
10464 if (ForceOrderedReductions.getNumOccurrences() > 0) 10465 AllowOrderedReductions = ForceOrderedReductions; 10466 else 10467 AllowOrderedReductions = TTI->enableOrderedReductions(); 10468 if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) { 10469 ORE->emit([&]() { 10470 auto *ExactFPMathInst = Requirements.getExactFPInst(); 10471 return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps", 10472 ExactFPMathInst->getDebugLoc(), 10473 ExactFPMathInst->getParent()) 10474 << "loop not vectorized: cannot prove it is safe to reorder " 10475 "floating-point operations"; 10476 }); 10477 LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to " 10478 "reorder floating-point operations\n"); 10479 Hints.emitRemarkWithHints(); 10480 return false; 10481 } 10482 10483 bool UseInterleaved = TTI->enableInterleavedAccessVectorization(); 10484 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI()); 10485 10486 // If an override option has been passed in for interleaved accesses, use it. 10487 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0) 10488 UseInterleaved = EnableInterleavedMemAccesses; 10489 10490 // Analyze interleaved memory accesses. 10491 if (UseInterleaved) { 10492 IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI)); 10493 } 10494 10495 // Use the cost model. 10496 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, 10497 F, &Hints, IAI); 10498 CM.collectValuesToIgnore(); 10499 CM.collectElementTypesForWidening(); 10500 10501 // Use the planner for vectorization. 10502 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints, 10503 Requirements, ORE); 10504 10505 // Get user vectorization factor and interleave count. 10506 ElementCount UserVF = Hints.getWidth(); 10507 unsigned UserIC = Hints.getInterleave(); 10508 10509 // Plan how to best vectorize, return the best VF and its cost. 10510 Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC); 10511 10512 VectorizationFactor VF = VectorizationFactor::Disabled(); 10513 unsigned IC = 1; 10514 10515 if (MaybeVF) { 10516 VF = *MaybeVF; 10517 // Select the interleave count. 10518 IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue()); 10519 } 10520 10521 // Identify the diagnostic messages that should be produced. 10522 std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg; 10523 bool VectorizeLoop = true, InterleaveLoop = true; 10524 if (VF.Width.isScalar()) { 10525 LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n"); 10526 VecDiagMsg = std::make_pair( 10527 "VectorizationNotBeneficial", 10528 "the cost-model indicates that vectorization is not beneficial"); 10529 VectorizeLoop = false; 10530 } 10531 10532 if (!MaybeVF && UserIC > 1) { 10533 // Tell the user interleaving was avoided up-front, despite being explicitly 10534 // requested. 10535 LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and " 10536 "interleaving should be avoided up front\n"); 10537 IntDiagMsg = std::make_pair( 10538 "InterleavingAvoided", 10539 "Ignoring UserIC, because interleaving was avoided up front"); 10540 InterleaveLoop = false; 10541 } else if (IC == 1 && UserIC <= 1) { 10542 // Tell the user interleaving is not beneficial. 
10543 LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n"); 10544 IntDiagMsg = std::make_pair( 10545 "InterleavingNotBeneficial", 10546 "the cost-model indicates that interleaving is not beneficial"); 10547 InterleaveLoop = false; 10548 if (UserIC == 1) { 10549 IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled"; 10550 IntDiagMsg.second += 10551 " and is explicitly disabled or interleave count is set to 1"; 10552 } 10553 } else if (IC > 1 && UserIC == 1) { 10554 // Tell the user interleaving is beneficial, but it explicitly disabled. 10555 LLVM_DEBUG( 10556 dbgs() << "LV: Interleaving is beneficial but is explicitly disabled."); 10557 IntDiagMsg = std::make_pair( 10558 "InterleavingBeneficialButDisabled", 10559 "the cost-model indicates that interleaving is beneficial " 10560 "but is explicitly disabled or interleave count is set to 1"); 10561 InterleaveLoop = false; 10562 } 10563 10564 // Override IC if user provided an interleave count. 10565 IC = UserIC > 0 ? UserIC : IC; 10566 10567 // Emit diagnostic messages, if any. 10568 const char *VAPassName = Hints.vectorizeAnalysisPassName(); 10569 if (!VectorizeLoop && !InterleaveLoop) { 10570 // Do not vectorize or interleaving the loop. 10571 ORE->emit([&]() { 10572 return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first, 10573 L->getStartLoc(), L->getHeader()) 10574 << VecDiagMsg.second; 10575 }); 10576 ORE->emit([&]() { 10577 return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first, 10578 L->getStartLoc(), L->getHeader()) 10579 << IntDiagMsg.second; 10580 }); 10581 return false; 10582 } else if (!VectorizeLoop && InterleaveLoop) { 10583 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 10584 ORE->emit([&]() { 10585 return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first, 10586 L->getStartLoc(), L->getHeader()) 10587 << VecDiagMsg.second; 10588 }); 10589 } else if (VectorizeLoop && !InterleaveLoop) { 10590 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width 10591 << ") in " << DebugLocStr << '\n'); 10592 ORE->emit([&]() { 10593 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first, 10594 L->getStartLoc(), L->getHeader()) 10595 << IntDiagMsg.second; 10596 }); 10597 } else if (VectorizeLoop && InterleaveLoop) { 10598 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width 10599 << ") in " << DebugLocStr << '\n'); 10600 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 10601 } 10602 10603 bool DisableRuntimeUnroll = false; 10604 MDNode *OrigLoopID = L->getLoopID(); 10605 { 10606 // Optimistically generate runtime checks. Drop them if they turn out to not 10607 // be profitable. Limit the scope of Checks, so the cleanup happens 10608 // immediately after vector codegeneration is done. 10609 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, 10610 F->getParent()->getDataLayout()); 10611 if (!VF.Width.isScalar() || IC > 1) 10612 Checks.Create(L, *LVL.getLAI(), PSE.getPredicate()); 10613 10614 using namespace ore; 10615 if (!VectorizeLoop) { 10616 assert(IC > 1 && "interleave count should not be 1 or 0"); 10617 // If we decided that it is not legal to vectorize the loop, then 10618 // interleave it. 
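      // Interleaving-only keeps the scalar instructions but unrolls the loop
      // and interleaves the iterations, roughly (IC = 2, illustrative):
      //   for (i = 0; i < n; i += 2) { body(i); body(i + 1); }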
10619 InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, 10620 &CM, BFI, PSI, Checks); 10621 10622 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); 10623 LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT); 10624 10625 ORE->emit([&]() { 10626 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(), 10627 L->getHeader()) 10628 << "interleaved loop (interleaved count: " 10629 << NV("InterleaveCount", IC) << ")"; 10630 }); 10631 } else { 10632 // If we decided that it is *legal* to vectorize the loop, then do it. 10633 10634 // Consider vectorizing the epilogue too if it's profitable. 10635 VectorizationFactor EpilogueVF = 10636 CM.selectEpilogueVectorizationFactor(VF.Width, LVP); 10637 if (EpilogueVF.Width.isVector()) { 10638 10639 // The first pass vectorizes the main loop and creates a scalar epilogue 10640 // to be vectorized by executing the plan (potentially with a different 10641 // factor) again shortly afterwards. 10642 EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1); 10643 EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, 10644 EPI, &LVL, &CM, BFI, PSI, Checks); 10645 10646 VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF); 10647 LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV, 10648 DT); 10649 ++LoopsVectorized; 10650 10651 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */); 10652 formLCSSARecursively(*L, *DT, LI, SE); 10653 10654 // Second pass vectorizes the epilogue and adjusts the control flow 10655 // edges from the first pass. 10656 EPI.MainLoopVF = EPI.EpilogueVF; 10657 EPI.MainLoopUF = EPI.EpilogueUF; 10658 EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC, 10659 ORE, EPI, &LVL, &CM, BFI, PSI, 10660 Checks); 10661 10662 VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF); 10663 10664 // Ensure that the start values for any VPReductionPHIRecipes are 10665 // updated before vectorising the epilogue loop. 10666 VPBasicBlock *Header = BestEpiPlan.getEntry()->getEntryBasicBlock(); 10667 for (VPRecipeBase &R : Header->phis()) { 10668 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) { 10669 if (auto *Resume = MainILV.getReductionResumeValue( 10670 ReductionPhi->getRecurrenceDescriptor())) { 10671 VPValue *StartVal = new VPValue(Resume); 10672 BestEpiPlan.addExternalDef(StartVal); 10673 ReductionPhi->setOperand(0, StartVal); 10674 } 10675 } 10676 } 10677 10678 LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV, 10679 DT); 10680 ++LoopsEpilogueVectorized; 10681 10682 if (!MainILV.areSafetyChecksAdded()) 10683 DisableRuntimeUnroll = true; 10684 } else { 10685 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC, 10686 &LVL, &CM, BFI, PSI, Checks); 10687 10688 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); 10689 LVP.executePlan(VF.Width, IC, BestPlan, LB, DT); 10690 ++LoopsVectorized; 10691 10692 // Add metadata to disable runtime unrolling a scalar loop when there 10693 // are no runtime checks about strides and memory. A scalar loop that is 10694 // rarely used is not worth unrolling. 10695 if (!LB.areSafetyChecksAdded()) 10696 DisableRuntimeUnroll = true; 10697 } 10698 // Report the vectorization decision. 
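    // With -Rpass=loop-vectorize this surfaces to the user roughly as
    // (illustrative): "vectorized loop (vectorization width: 4, interleaved
    // count: 2)".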
10699 ORE->emit([&]() { 10700 return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(), 10701 L->getHeader()) 10702 << "vectorized loop (vectorization width: " 10703 << NV("VectorizationFactor", VF.Width) 10704 << ", interleaved count: " << NV("InterleaveCount", IC) << ")"; 10705 }); 10706 } 10707 10708 if (ORE->allowExtraAnalysis(LV_NAME)) 10709 checkMixedPrecision(L, ORE); 10710 } 10711 10712 Optional<MDNode *> RemainderLoopID = 10713 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 10714 LLVMLoopVectorizeFollowupEpilogue}); 10715 if (RemainderLoopID.hasValue()) { 10716 L->setLoopID(RemainderLoopID.getValue()); 10717 } else { 10718 if (DisableRuntimeUnroll) 10719 AddRuntimeUnrollDisableMetaData(L); 10720 10721 // Mark the loop as already vectorized to avoid vectorizing again. 10722 Hints.setAlreadyVectorized(); 10723 } 10724 10725 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 10726 return true; 10727 } 10728 10729 LoopVectorizeResult LoopVectorizePass::runImpl( 10730 Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_, 10731 DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_, 10732 DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_, 10733 std::function<const LoopAccessInfo &(Loop &)> &GetLAA_, 10734 OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) { 10735 SE = &SE_; 10736 LI = &LI_; 10737 TTI = &TTI_; 10738 DT = &DT_; 10739 BFI = &BFI_; 10740 TLI = TLI_; 10741 AA = &AA_; 10742 AC = &AC_; 10743 GetLAA = &GetLAA_; 10744 DB = &DB_; 10745 ORE = &ORE_; 10746 PSI = PSI_; 10747 10748 // Don't attempt if 10749 // 1. the target claims to have no vector registers, and 10750 // 2. interleaving won't help ILP. 10751 // 10752 // The second condition is necessary because, even if the target has no 10753 // vector registers, loop vectorization may still enable scalar 10754 // interleaving. 10755 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) && 10756 TTI->getMaxInterleaveFactor(1) < 2) 10757 return LoopVectorizeResult(false, false); 10758 10759 bool Changed = false, CFGChanged = false; 10760 10761 // The vectorizer requires loops to be in simplified form. 10762 // Since simplification may add new inner loops, it has to run before the 10763 // legality and profitability checks. This means running the loop vectorizer 10764 // will simplify all loops, regardless of whether anything end up being 10765 // vectorized. 10766 for (auto &L : *LI) 10767 Changed |= CFGChanged |= 10768 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */); 10769 10770 // Build up a worklist of inner-loops to vectorize. This is necessary as 10771 // the act of vectorizing or partially unrolling a loop creates new loops 10772 // and can invalidate iterators across the loops. 10773 SmallVector<Loop *, 8> Worklist; 10774 10775 for (Loop *L : *LI) 10776 collectSupportedLoops(*L, LI, ORE, Worklist); 10777 10778 LoopsAnalyzed += Worklist.size(); 10779 10780 // Now walk the identified inner loops. 10781 while (!Worklist.empty()) { 10782 Loop *L = Worklist.pop_back_val(); 10783 10784 // For the inner loops we actually process, form LCSSA to simplify the 10785 // transform. 10786 Changed |= formLCSSARecursively(*L, *DT, LI, SE); 10787 10788 Changed |= CFGChanged |= processLoop(L); 10789 } 10790 10791 // Process each loop nest in the function. 
10792   return LoopVectorizeResult(Changed, CFGChanged);
10793 }
10794
10795 PreservedAnalyses LoopVectorizePass::run(Function &F,
10796                                          FunctionAnalysisManager &AM) {
10797   auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
10798   auto &LI = AM.getResult<LoopAnalysis>(F);
10799   auto &TTI = AM.getResult<TargetIRAnalysis>(F);
10800   auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
10801   auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
10802   auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
10803   auto &AA = AM.getResult<AAManager>(F);
10804   auto &AC = AM.getResult<AssumptionAnalysis>(F);
10805   auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
10806   auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
10807
10808   auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
10809   std::function<const LoopAccessInfo &(Loop &)> GetLAA =
10810       [&](Loop &L) -> const LoopAccessInfo & {
10811     LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE,
10812                                       TLI, TTI, nullptr, nullptr, nullptr};
10813     return LAM.getResult<LoopAccessAnalysis>(L, AR);
10814   };
10815   auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
10816   ProfileSummaryInfo *PSI =
10817       MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
10818   LoopVectorizeResult Result =
10819       runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
10820   if (!Result.MadeAnyChange)
10821     return PreservedAnalyses::all();
10822   PreservedAnalyses PA;
10823
10824   // We currently do not preserve LoopInfo/DominatorTree analyses with outer
10825   // loop vectorization. Until this is addressed, mark these analyses as
10826   // preserved only for the non-VPlan-native path.
10827   // TODO: Preserve Loop and Dominator analyses for the VPlan-native path.
10828   if (!EnableVPlanNativePath) {
10829     PA.preserve<LoopAnalysis>();
10830     PA.preserve<DominatorTreeAnalysis>();
10831   }
10832
10833   if (Result.MadeCFGChange) {
10834     // Making CFG changes likely means a loop got vectorized. Indicate that
10835     // extra simplification passes should be run.
10836     // TODO: MadeCFGChange is not a perfect proxy. Extra passes should only
10837     // be run if runtime checks have been added.
10838     AM.getResult<ShouldRunExtraVectorPasses>(F);
10839     PA.preserve<ShouldRunExtraVectorPasses>();
10840   } else {
10841     PA.preserveSet<CFGAnalyses>();
10842   }
10843   return PA;
10844 }
10845
10846 void LoopVectorizePass::printPipeline(
10847     raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
10848   static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
10849       OS, MapClassName2PassName);
10850
10851   OS << "<";
10852   OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
10853   OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
10854   OS << ">";
10855 }
10856
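// For reference (illustrative), with the default options this pass prints in
// a pipeline description roughly as:
//   loop-vectorize<no-interleave-forced-only;no-vectorize-forced-only>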