//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one (see the illustrative sketch at
// the end of this header comment).
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
//  Data for SIMD
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
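//
// As a simplified conceptual sketch (pseudo-code, not actual LLVM IR, and not
// taken from the references above), vectorizing the following loop with
// VF = 4 and UF = 1:
//
//   for (int i = 0; i < n; ++i)
//     A[i] = B[i] + 42;
//
// produces a 'wide' loop whose induction variable steps by the vector width,
// plus a scalar epilogue that handles the remaining iterations:
//
//   int i = 0;
//   for (; i + 4 <= n; i += 4)
//     A[i:i+3] = B[i:i+3] + <42, 42, 42, 42>;  // one 'wide' iteration
//   for (; i < n; ++i)                         // scalar remainder loop
//     A[i] = B[i] + 42;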
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "LoopVectorizationPlanner.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanHCFGBuilder.h"
#include "VPlanPredicator.h"
#include "VPlanTransforms.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/InstructionCost.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <functional>
#include <iterator>
#include <limits>
#include <memory>
#include <string>
#include <tuple>
#include <utility>

using namespace llvm;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

#ifndef NDEBUG
const char VerboseDebug[] = DEBUG_TYPE "-verbose";
#endif

/// @{
/// Metadata attribute names
const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
const char LLVMLoopVectorizeFollowupVectorized[] =
    "llvm.loop.vectorize.followup_vectorized";
const char LLVMLoopVectorizeFollowupEpilogue[] =
    "llvm.loop.vectorize.followup_epilogue";
/// @}

STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");

static cl::opt<bool> EnableEpilogueVectorization(
    "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
    cl::desc("Enable vectorization of epilogue loops."));

static cl::opt<unsigned> EpilogueVectorizationForceVF(
    "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
    cl::desc("When epilogue vectorization is enabled, and a value greater than "
             "1 is specified, forces the given VF for all applicable epilogue "
             "loops."));

static cl::opt<unsigned> EpilogueVectorizationMinVF(
    "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
    cl::desc("Only loops with vectorization factor equal to or larger than "
             "the specified value are considered for epilogue vectorization."));

/// Loops with a known constant trip count below this number are vectorized
/// only if no scalar iteration overheads are incurred.
static cl::opt<unsigned> TinyTripCountVectorThreshold(
    "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
    cl::desc("Loops with a constant trip count that is smaller than this "
             "value are vectorized only if no scalar iteration overheads "
             "are incurred."));

static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold(
    "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
    cl::desc("The maximum allowed number of runtime memory checks with a "
             "vectorize(enable) pragma."));

// Option prefer-predicate-over-epilogue indicates that an epilogue is
// undesired, that predication is preferred, and this lists all options. I.e.,
// the vectorizer will try to fold the tail-loop (epilogue) into the vector
// body and predicate the instructions accordingly. If tail-folding fails,
// there are different fallback strategies depending on these values:
namespace PreferPredicateTy {
enum Option {
  ScalarEpilogue = 0,
  PredicateElseScalarEpilogue,
  PredicateOrDontVectorize
};
} // namespace PreferPredicateTy

static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
    "prefer-predicate-over-epilogue",
    cl::init(PreferPredicateTy::ScalarEpilogue),
    cl::Hidden,
    cl::desc("Tail-folding and predication preferences over creating a scalar "
             "epilogue loop."),
    cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
                          "scalar-epilogue",
                          "Don't tail-predicate loops, create scalar epilogue"),
               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
                          "predicate-else-scalar-epilogue",
                          "prefer tail-folding, create scalar epilogue if tail "
                          "folding fails."),
               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
                          "predicate-dont-vectorize",
                          "prefers tail-folding, don't attempt vectorization "
                          "if tail-folding fails.")));

static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));

static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
             "below this number"));

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

static cl::opt<bool> ForceTargetSupportsScalableVectors(
    "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
    cl::desc(
        "Pretend that scalable vectors are supported, even if the target does "
        "not support them. This flag should only be used for testing."));

static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// Interleave small loops with scalar reductions.
static cl::opt<bool> InterleaveSmallLoopScalarReduction(
    "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
    cl::desc("Enable interleaving for loops with small iteration counts that "
             "contain scalar reductions to expose ILP."));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

static cl::opt<bool>
    PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
                           cl::Hidden,
                           cl::desc("Prefer in-loop vector reductions, "
                                    "overriding the target's preference."));

static cl::opt<bool> ForceOrderedReductions(
    "force-ordered-reductions", cl::init(false), cl::Hidden,
    cl::desc("Enable the vectorization of loops with in-order (strict) "
             "FP reductions"));

static cl::opt<bool> PreferPredicatedReductionSelect(
    "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
    cl::desc(
        "Prefer predicating a reduction operation over an after loop select."));

cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));

// FIXME: Remove this switch once we have divergence analysis. Currently we
// assume divergent non-backedge branches when this switch is true.
cl::opt<bool> EnableVPlanPredication(
    "enable-vplan-predication", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path predicator with "
             "support for outer loop vectorization."));

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

cl::opt<bool> PrintVPlansInDotFormat(
    "vplan-print-in-dot-format", cl::init(false), cl::Hidden,
    cl::desc("Use dot format instead of plain text when dumping VPlans"));

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type.
static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
  // Determine if an array of N elements of type Ty is "bitcast compatible"
  // with a <N x Ty> vector.
  // This is only true if there is no padding between the array elements.
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}

/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

/// Returns "best known" trip count for the specified loop \p L as defined by
/// the following procedure:
///   1) Returns exact trip count if it is known.
///   2) Returns expected trip count according to profile data if any.
///   3) Returns upper bound estimate if it is known.
///   4) Returns None if all of the above failed.
static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
  // Check if exact trip count is known.
  if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
    return ExpectedTC;

  // Check if there is an expected trip count available from profile data.
  if (LoopVectorizeWithBlockFrequency)
    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
      return EstimatedTC;

  // Check if upper bound estimate is known.
  if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
    return ExpectedTC;

  return None;
}

// Forward declare GeneratedRTChecks.
class GeneratedRTChecks;

namespace llvm {

AnalysisKey ShouldRunExtraVectorPasses::Key;

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found to a given vectorization factor.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                      ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
        PSI(PSI), RTChecks(RTChecks) {
    // Query this against the original loop and save it here because the
    // profile of the original loop header may change as the transformation
    // happens.
    OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
        OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
  }

  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop that will contain vectorized instructions later
  /// on, while the old loop will be used as the scalar remainder. Control flow
  /// is generated around the vectorized (and scalar epilogue) loops consisting
  /// of various checks and bypasses. Return the pre-header block of the new
  /// loop and the start value for the canonical induction, if it is != 0. The
  /// latter is the case when vectorizing the epilogue loop. In the case of
  /// epilogue vectorization, this function is overridden to handle the more
  /// complex control flow around the loops.
  virtual std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton();

  /// Widen a single call instruction within the innermost loop.
  void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
                            VPTransformState &State);

  /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
  void fixVectorizedLoop(VPTransformState &State);

  // Return true if any runtime check is added.
  bool areSafetyChecksAdded() { return AddedSafetyChecks; }

  /// A type for vectorized values in the new loop. Each value from the
  /// original loop, when vectorized, is represented by UF vector values in the
  /// new unrolled loop, where UF is the unroll factor.
  using VectorParts = SmallVector<Value *, 2>;

  /// Vectorize a single first-order recurrence or pointer induction PHINode in
  /// a block. This method handles the induction variable canonicalization. It
  /// supports both VF = 1 for unrolled loops and arbitrary length vectors.
  void widenPHIInstruction(Instruction *PN, VPWidenPHIRecipe *PhiR,
                           VPTransformState &State);

  /// A helper function to scalarize a single Instruction in the innermost
  /// loop. Generates a sequence of scalar instances for each lane between \p
  /// MinLane and \p MaxLane, times each part between \p MinPart and \p
  /// MaxPart, inclusive. Uses the VPValue operands from \p RepRecipe instead
  /// of \p Instr's operands.
  void scalarizeInstruction(Instruction *Instr, VPReplicateRecipe *RepRecipe,
                            const VPIteration &Instance, bool IfPredicateInstr,
                            VPTransformState &State);

  /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
  /// is provided, the integer induction variable will first be truncated to
  /// the corresponding type. \p CanonicalIV is the scalar value generated for
  /// the canonical induction variable.
  void widenIntOrFpInduction(PHINode *IV, VPWidenIntOrFpInductionRecipe *Def,
                             VPTransformState &State, Value *CanonicalIV);

  /// Construct the vector value of a scalarized value \p V one lane at a time.
  void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance,
                                 VPTransformState &State);

  /// Try to vectorize interleaved access group \p Group with the base address
  /// given in \p Addr, optionally masking the vector operations if \p
  /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
  /// values in the vectorized loop.
  void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
                                ArrayRef<VPValue *> VPDefs,
                                VPTransformState &State, VPValue *Addr,
                                ArrayRef<VPValue *> StoredValues,
                                VPValue *BlockInMask = nullptr);

  /// Set the debug location in the builder \p Ptr using the debug location in
  /// \p V. If \p Ptr is None then it uses the class member's Builder.
  void setDebugLocFromInst(const Value *V,
                           Optional<IRBuilderBase *> CustomBuilder = None);

  /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
  void fixNonInductionPHIs(VPTransformState &State);

  /// Returns true if the reordering of FP operations is not allowed, but we
  /// are able to vectorize with strict in-order reductions for the given
  /// RdxDesc.
  bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc);

  /// Create a broadcast instruction. This method generates a broadcast
  /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, ...
  /// this is needed because each iteration in the loop corresponds to a SIMD
  /// element.
  virtual Value *getBroadcastInstrs(Value *V);

  /// Add metadata from one instruction to another.
  ///
  /// This includes both the original MDs from \p From and additional ones (\see
  /// addNewMetadata). Use this for *newly created* instructions in the vector
  /// loop.
  void addMetadata(Instruction *To, Instruction *From);

  /// Similar to the previous function but it adds the metadata to a
  /// vector of instructions.
  void addMetadata(ArrayRef<Value *> To, Instruction *From);

  // Returns the resume value (bc.merge.rdx) for a reduction as
  // generated by fixReduction.
  PHINode *getReductionResumeValue(const RecurrenceDescriptor &RdxDesc);

protected:
  friend class LoopVectorizationPlanner;

  /// A small list of PHINodes.
  using PhiVector = SmallVector<PHINode *, 4>;

  /// A type for scalarized values in the new loop. Each value from the
  /// original loop, when scalarized, is represented by UF x VF scalar values
  /// in the new unrolled loop, where UF is the unroll factor and VF is the
  /// vectorization factor.
  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;

  /// Set up the values of the IVs correctly when exiting the vector loop.
  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                    Value *CountRoundDown, Value *EndValue,
                    BasicBlock *MiddleBlock);

  /// Introduce a conditional branch (on true, condition to be set later) at the
  /// end of the header=latch connecting it to itself (across the backedge) and
  /// to the exit block of \p L.
  void createHeaderBranch(Loop *L);

  /// Handle all cross-iteration phis in the header.
  void fixCrossIterationPHIs(VPTransformState &State);

  /// Create the exit value of first order recurrences in the middle block and
  /// update their users.
  void fixFirstOrderRecurrence(VPFirstOrderRecurrencePHIRecipe *PhiR,
                               VPTransformState &State);

  /// Create code for the loop exit value of the reduction.
  void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State);

  /// Clear NSW/NUW flags from reduction instructions if necessary.
  void clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc,
                               VPTransformState &State);

  /// Fixup the LCSSA phi nodes in the unique exit block. This simply
  /// means we need to add the appropriate incoming value from the middle
  /// block as exiting edges from the scalar epilogue loop (if present) are
  /// already in place, and we exit the vector loop exclusively to the middle
  /// block.
  void fixLCSSAPHIs(VPTransformState &State);

  /// Iteratively sink the scalarized operands of a predicated instruction into
  /// the block that was created for it.
  void sinkScalarOperands(Instruction *PredInst);

  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
  /// represented as.
  void truncateToMinimalBitwidths(VPTransformState &State);

  /// Create a vector induction phi node based on an existing scalar one. \p
  /// EntryVal is the value from the original loop that maps to the vector phi
  /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
  /// truncate instruction, instead of widening the original IV, we widen a
  /// version of the IV truncated to \p EntryVal's type.
  void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
                                       Value *Step, Value *Start,
                                       Instruction *EntryVal, VPValue *Def,
                                       VPTransformState &State);

  /// Returns (and creates if needed) the original loop trip count.
  Value *getOrCreateTripCount(Loop *NewLoop);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(Loop *NewLoop);

  /// Returns a bitcasted value to the requested vector type.
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                const DataLayout &DL);

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct. Returns the block containing the checks or
  /// nullptr if no checks have been added.
  BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  /// Returns the block containing the checks or nullptr if no checks have been
  /// added.
  BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);

  /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
  /// vector loop preheader, middle block and scalar preheader. Also
  /// allocate a loop object for the new vector loop and return it.
  Loop *createVectorLoopSkeleton(StringRef Prefix);

  /// Create new phi nodes for the induction variables to resume iteration
  /// count in the scalar epilogue, from where the vectorized loop left off.
  /// In cases where the loop skeleton is more complicated (eg. epilogue
  /// vectorization) and the resume values can come from an additional bypass
  /// block, the \p AdditionalBypass pair provides information about the bypass
  /// block and the end value on the edge from bypass to this loop.
  void createInductionResumeValues(
      Loop *L,
      std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});

  /// Complete the loop skeleton by adding debug MDs, creating appropriate
  /// conditional branches in the middle block, preparing the builder and
  /// running the verifier. Take in the vector loop \p L as argument, and
  /// return the preheader of the completed vector loop.
  BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);

  /// Add additional metadata to \p To that was not present on \p Orig.
  ///
  /// Currently this is used to add the noalias annotations based on the
  /// inserted memchecks. Use this for instructions that are *cloned* into the
  /// vector loop.
  void addNewMetadata(Instruction *To, const Instruction *Orig);

  /// Collect poison-generating recipes that may generate a poison value that
  /// is used after vectorization, even when their operands are not poison.
  /// Those recipes meet the following conditions:
  /// * Contribute to the address computation of a recipe generating a widen
  ///   memory load/store (VPWidenMemoryInstructionRecipe or
  ///   VPInterleaveRecipe).
  /// * Such a widen memory load/store has at least one underlying Instruction
  ///   that is in a basic block that needs predication and after vectorization
  ///   the generated instruction won't be predicated.
  void collectPoisonGeneratingRecipes(VPTransformState &State);

  /// Allow subclasses to override and print debug traces before/after vplan
  /// execution, when trace information is requested.
  virtual void printDebugTracesAtStart(){};
  virtual void printDebugTracesAtEnd(){};

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Alias Analysis.
  AAResults *AA;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// LoopVersioning. It's only set up (non-null) if memchecks were
  /// used.
  ///
  /// This is currently only used to add no-alias metadata based on the
  /// memchecks. The actual versioning is performed manually.
  std::unique_ptr<LoopVersioning> LVer;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  ElementCount VF;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;

  /// The unique ExitBlock of the scalar loop if one exists. Note that
  /// there can be multiple exiting edges reaching this block.
  BasicBlock *LoopExitBlock;

  /// The vector loop body.
  BasicBlock *LoopVectorBody;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
  // fixed up at the end of vector code generation.
  SmallVector<PHINode *, 8> OrigPHIsToFix;

  /// BFI and PSI are used to check for profile guided size optimizations.
  BlockFrequencyInfo *BFI;
  ProfileSummaryInfo *PSI;

  // Whether this loop should be optimized for size based on profile guided
  // size optimizations.
  bool OptForSizeBasedOnProfile;

  /// Structure to hold information about generated runtime checks, responsible
  /// for cleaning the checks, if vectorization turns out unprofitable.
  GeneratedRTChecks &RTChecks;

  // Holds the resume values for reductions in the loops, used to set the
  // correct start value of reduction PHIs when vectorizing the epilogue.
  SmallMapVector<const RecurrenceDescriptor *, PHINode *, 4>
      ReductionResumeValues;
};

class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                    ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            ElementCount::getFixed(1), UnrollFactor, LVL, CM,
                            BFI, PSI, Check) {}

private:
  Value *getBroadcastInstrs(Value *V) override;
};

/// Encapsulate information regarding vectorization of a loop and its epilogue.
/// This information is meant to be updated and used across two stages of
/// epilogue vectorization.
struct EpilogueLoopVectorizationInfo {
  ElementCount MainLoopVF = ElementCount::getFixed(0);
  unsigned MainLoopUF = 0;
  ElementCount EpilogueVF = ElementCount::getFixed(0);
  unsigned EpilogueUF = 0;
  BasicBlock *MainLoopIterationCountCheck = nullptr;
  BasicBlock *EpilogueIterationCountCheck = nullptr;
  BasicBlock *SCEVSafetyCheck = nullptr;
  BasicBlock *MemSafetyCheck = nullptr;
  Value *TripCount = nullptr;
  Value *VectorTripCount = nullptr;

  EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF,
                                ElementCount EVF, unsigned EUF)
      : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) {
    assert(EUF == 1 &&
           "A high UF for the epilogue loop is likely not beneficial.");
  }
};

/// An extension of the inner loop vectorizer that creates a skeleton for a
/// vectorized loop that has its epilogue (residual) also vectorized.
/// The idea is to run the vplan on a given loop twice, first to set up the
/// skeleton and vectorize the main loop, and second to complete the skeleton
/// from the first step and vectorize the epilogue. This is achieved by
/// deriving two concrete strategy classes from this base class and invoking
/// them in succession from the loop vectorizer planner.
class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
public:
  InnerLoopAndEpilogueVectorizer(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI,
                            Checks),
        EPI(EPI) {}

  // Override this function to handle the more complex control flow around the
  // three loops.
  std::pair<BasicBlock *, Value *>
  createVectorizedLoopSkeleton() final override {
    return createEpilogueVectorizedLoopSkeleton();
  }

  /// The interface for creating a vectorized skeleton using one of two
  /// different strategies, each corresponding to one execution of the vplan
  /// as described above.
  virtual std::pair<BasicBlock *, Value *>
  createEpilogueVectorizedLoopSkeleton() = 0;

  /// Holds and updates state information required to vectorize the main loop
  /// and its epilogue in two separate passes. This setup helps us avoid
  /// regenerating and recomputing runtime safety checks. It also helps us to
  /// shorten the iteration-count-check path length for the cases where the
  /// iteration count of the loop is so small that the main vector loop is
  /// completely skipped.
  EpilogueLoopVectorizationInfo &EPI;
};

/// A specialized derived class of inner loop vectorizer that performs
/// vectorization of *main* loops in the process of vectorizing loops and their
/// epilogues.
class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerMainLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Check)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Check) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *main loop* strategy (i.e., the first pass of vplan execution).
  std::pair<BasicBlock *, Value *>
  createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check once for the main loop (when \p
  /// ForEpilogue is false) and once for the epilogue loop (when \p
  /// ForEpilogue is true).
  BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass,
                                             bool ForEpilogue);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};

// A specialized derived class of inner loop vectorizer that performs
// vectorization of *epilogue* loops in the process of vectorizing loops and
// their epilogues.
class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerEpilogueLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Checks) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *epilogue loop* strategy (i.e., the second pass of vplan execution).
  std::pair<BasicBlock *, Value *>
  createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check after the main vector loop has
  /// finished to see if there are any iterations left to execute by either
  /// the vector epilogue or the scalar epilogue.
  BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L,
                                                      BasicBlock *Bypass,
                                                      BasicBlock *Insert);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};
} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its
/// operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return I;

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I;

  for (Use &Op : I->operands()) {
    if (Instruction *OpInst = dyn_cast<Instruction>(Op))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }

  return I;
}

void InnerLoopVectorizer::setDebugLocFromInst(
    const Value *V, Optional<IRBuilderBase *> CustomBuilder) {
  IRBuilderBase *B = (CustomBuilder == None) ? &Builder : *CustomBuilder;
  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(V)) {
    const DILocation *DIL = Inst->getDebugLoc();

    // When a FSDiscriminator is enabled, we don't need to add the multiply
    // factors to the discriminators.
    if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
        !isa<DbgInfoIntrinsic>(Inst) && !EnableFSDiscriminator) {
      // FIXME: For scalable vectors, assume vscale=1.
      auto NewDIL =
          DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
      if (NewDIL)
        B->SetCurrentDebugLocation(NewDIL.getValue());
      else
        LLVM_DEBUG(dbgs()
                   << "Failed to create new discriminator: "
                   << DIL->getFilename() << " Line: " << DIL->getLine());
    } else
      B->SetCurrentDebugLocation(DIL);
  } else
    B->SetCurrentDebugLocation(DebugLoc());
}

/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
/// is passed, the message relates to that particular instruction.
#ifndef NDEBUG
static void debugVectorizationMessage(const StringRef Prefix,
                                      const StringRef DebugMsg,
                                      Instruction *I) {
  dbgs() << "LV: " << Prefix << DebugMsg;
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}
#endif

/// Create an analysis remark that explains why vectorization failed
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
/// RemarkName is the identifier for the remark. If \p I is passed it is an
/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
/// the location of the remark. \return the remark object that can be
/// streamed to.
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
                                                   StringRef RemarkName,
                                                   Loop *TheLoop,
                                                   Instruction *I) {
  Value *CodeRegion = TheLoop->getHeader();
  DebugLoc DL = TheLoop->getStartLoc();

  if (I) {
    CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, fall back to
    // using the loop's.
    if (I->getDebugLoc())
      DL = I->getDebugLoc();
  }

  return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
}

namespace llvm {

/// Return a value for Step multiplied by VF.
Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
                       int64_t Step) {
  assert(Ty->isIntegerTy() && "Expected an integer step");
  Constant *StepVal = ConstantInt::get(Ty, Step * VF.getKnownMinValue());
  return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
}

/// Return the runtime value for VF.
Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) {
  Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue());
  return VF.isScalable() ? B.CreateVScale(EC) : EC;
}

static Value *getRuntimeVFAsFloat(IRBuilderBase &B, Type *FTy,
                                  ElementCount VF) {
  assert(FTy->isFloatingPointTy() && "Expected floating point type!");
  Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits());
  Value *RuntimeVF = getRuntimeVF(B, IntTy, VF);
  return B.CreateUIToFP(RuntimeVF, FTy);
}

void reportVectorizationFailure(const StringRef DebugMsg,
                                const StringRef OREMsg, const StringRef ORETag,
                                OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                                Instruction *I) {
  LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(
      createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
      << "loop not vectorized: " << OREMsg);
}

void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
                             OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                             Instruction *I) {
  LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(
      createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
      << Msg);
}

} // end namespace llvm

#ifndef NDEBUG
/// \return string containing a file name and a line # for the given loop.
static std::string getDebugLocString(const Loop *L) {
  std::string Result;
  if (L) {
    raw_string_ostream OS(Result);
    if (const DebugLoc LoopDbgLoc = L->getStartLoc())
      LoopDbgLoc.print(OS);
    else
      // Just print the module name.
      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
    OS.flush();
  }
  return Result;
}
#endif

void InnerLoopVectorizer::addNewMetadata(Instruction *To,
                                         const Instruction *Orig) {
  // If the loop was versioned with memchecks, add the corresponding no-alias
  // metadata.
  if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
    LVer->annotateInstWithNoAlias(To, Orig);
}

void InnerLoopVectorizer::collectPoisonGeneratingRecipes(
    VPTransformState &State) {

  // Collect recipes in the backward slice of `Root` that may generate a poison
  // value that is used after vectorization.
  SmallPtrSet<VPRecipeBase *, 16> Visited;
  auto collectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
    SmallVector<VPRecipeBase *, 16> Worklist;
    Worklist.push_back(Root);

    // Traverse the backward slice of Root through its use-def chain.
    while (!Worklist.empty()) {
      VPRecipeBase *CurRec = Worklist.back();
      Worklist.pop_back();

      if (!Visited.insert(CurRec).second)
        continue;

      // Prune search if we find another recipe generating a widen memory
      // instruction. Widen memory instructions involved in address computation
      // will lead to gather/scatter instructions, which don't need to be
      // handled.
      if (isa<VPWidenMemoryInstructionRecipe>(CurRec) ||
          isa<VPInterleaveRecipe>(CurRec) ||
          isa<VPCanonicalIVPHIRecipe>(CurRec))
        continue;

      // This recipe contributes to the address computation of a widen
      // load/store. Collect recipe if its underlying instruction has
      // poison-generating flags.
      Instruction *Instr = CurRec->getUnderlyingInstr();
      if (Instr && Instr->hasPoisonGeneratingFlags())
        State.MayGeneratePoisonRecipes.insert(CurRec);

      // Add new definitions to the worklist.
      for (VPValue *operand : CurRec->operands())
        if (VPDef *OpDef = operand->getDef())
          Worklist.push_back(cast<VPRecipeBase>(OpDef));
    }
  });

  // Traverse all the recipes in the VPlan and collect the poison-generating
  // recipes in the backward slice starting at the address of a VPWidenRecipe
  // or VPInterleaveRecipe.
  auto Iter = depth_first(
      VPBlockRecursiveTraversalWrapper<VPBlockBase *>(State.Plan->getEntry()));
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
    for (VPRecipeBase &Recipe : *VPBB) {
      if (auto *WidenRec = dyn_cast<VPWidenMemoryInstructionRecipe>(&Recipe)) {
        Instruction *UnderlyingInstr = WidenRec->getUnderlyingInstr();
        VPDef *AddrDef = WidenRec->getAddr()->getDef();
        if (AddrDef && WidenRec->isConsecutive() && UnderlyingInstr &&
            Legal->blockNeedsPredication(UnderlyingInstr->getParent()))
          collectPoisonGeneratingInstrsInBackwardSlice(
              cast<VPRecipeBase>(AddrDef));
      } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
        VPDef *AddrDef = InterleaveRec->getAddr()->getDef();
        if (AddrDef) {
          // Check if any member of the interleave group needs predication.
          const InterleaveGroup<Instruction> *InterGroup =
              InterleaveRec->getInterleaveGroup();
          bool NeedPredication = false;
          for (int I = 0, NumMembers = InterGroup->getNumMembers();
               I < NumMembers; ++I) {
            Instruction *Member = InterGroup->getMember(I);
            if (Member)
              NeedPredication |=
                  Legal->blockNeedsPredication(Member->getParent());
          }

          if (NeedPredication)
            collectPoisonGeneratingInstrsInBackwardSlice(
                cast<VPRecipeBase>(AddrDef));
        }
      }
    }
  }
}

void InnerLoopVectorizer::addMetadata(Instruction *To,
                                      Instruction *From) {
  propagateMetadata(To, From);
  addNewMetadata(To, From);
}

void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
                                      Instruction *From) {
  for (Value *V : To) {
    if (Instruction *I = dyn_cast<Instruction>(V))
      addMetadata(I, From);
  }
}

PHINode *InnerLoopVectorizer::getReductionResumeValue(
    const RecurrenceDescriptor &RdxDesc) {
  auto It = ReductionResumeValues.find(&RdxDesc);
  assert(It != ReductionResumeValues.end() &&
         "Expected to find a resume value for the reduction.");
  return It->second;
}

namespace llvm {

// Loop vectorization cost-model hints how the scalar epilogue loop should be
// lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorization with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate,

  // Directive indicating we must either tail fold or not vectorize
  CM_ScalarEpilogueNotAllowedUsePredicate
};

/// ElementCountComparator creates a total ordering for ElementCount
/// for the purposes of using it in a set structure.
struct ElementCountComparator {
  bool operator()(const ElementCount &LHS, const ElementCount &RHS) const {
    return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) <
           std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue());
  }
};
using ElementCountSet = SmallSet<ElementCount, 16, ElementCountComparator>;

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because of
/// a number of reasons. In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
class LoopVectorizationCostModel {
public:
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {}

  /// \return An upper bound for the vectorization factors (both fixed and
  /// scalable). If the factors are 0, vectorization and interleaving should be
  /// avoided up front.
  FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
  bool runtimeChecksRequired();

  /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every VF in \p CandidateVFs. If UserVF is not ZERO
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
  VectorizationFactor
  selectVectorizationFactor(const ElementCountSet &CandidateVFs);

  VectorizationFactor
  selectEpilogueVectorizationFactor(const ElementCount MaxVF,
                                    const LoopVectorizationPlanner &LVP);

  /// Set up cost-based decisions for the user vectorization factor.
  /// \return true if the UserVF is a feasible VF to be chosen.
  bool selectUserVectorizationFactor(ElementCount UserVF) {
    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);
    return expectedCost(UserVF).first.isValid();
  }

  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// \return The desired interleave count.
  /// If interleave count has been specified by metadata it will be returned.
  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
  /// are the selected vectorization factor and the cost of the selected VF.
  unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);

  /// A memory access instruction may be vectorized in more than one way; the
  /// form of the instruction after vectorization depends on its cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decision map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with the widening decision in order to
  /// avoid redundant calculations.
  void setCostBasedWideningDecision(ElementCount VF);

  /// A struct that represents some properties of the register usage
  /// of a loop.
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
    /// Holds the maximum number of concurrent live intervals in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
  };

  /// \return Returns information about the register usages of the loop for the
  /// given vectorization factors.
  SmallVector<RegisterUsage, 8>
  calculateRegisterUsage(ArrayRef<ElementCount> VFs);

  /// Collect values we want to ignore in the cost model.
  void collectValuesToIgnore();

  /// Collect all element types in the loop for which widening is needed.
  void collectElementTypesForWidening();

  /// Split reductions into those that happen in the loop, and those that
  /// happen outside. In-loop reductions are collected into
  /// InLoopReductionChains.
  void collectInLoopReductions();

  /// Returns true if we should use strict in-order reductions for the given
  /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
  /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
  /// of FP operations.
  bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) {
    return !Hints->allowReordering() && RdxDesc.isOrdered();
  }

  /// \returns The smallest bitwidth each instruction can be represented with.
  /// The vector equivalents of these instructions should be truncated to this
  /// type.
  const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
    return MinBWs;
  }

  /// \returns True if it is more profitable to scalarize instruction \p I for
  /// vectorization factor \p VF.
  bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
    assert(VF.isVector() &&
           "Profitable to scalarize relevant only for VF > 1.");

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto Scalars = InstsToScalarize.find(VF);
    assert(Scalars != InstsToScalarize.end() &&
           "VF not yet analyzed for scalarization profitability");
    return Scalars->second.find(I) != Scalars->second.end();
  }

  /// Returns true if \p I is known to be uniform after vectorization.
  bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
    if (VF.isScalar())
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto UniformsPerVF = Uniforms.find(VF);
    assert(UniformsPerVF != Uniforms.end() &&
           "VF not yet analyzed for uniformity");
    return UniformsPerVF->second.count(I);
  }

  /// Returns true if \p I is known to be scalar after vectorization.
  bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
    if (VF.isScalar())
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto ScalarsPerVF = Scalars.find(VF);
    assert(ScalarsPerVF != Scalars.end() &&
           "Scalar values are not calculated for VF");
    return ScalarsPerVF->second.count(I);
  }

  /// \returns True if instruction \p I can be truncated to a smaller bitwidth
  /// for vectorization factor \p VF.
  bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
    return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
           !isProfitableToScalarize(I, VF) &&
           !isScalarAfterVectorization(I, VF);
  }

  /// Decision that was taken during cost calculation for memory instruction.
  enum InstWidening {
    CM_Unknown,
    CM_Widen,         // For consecutive accesses with stride +1.
    CM_Widen_Reverse, // For consecutive accesses with stride -1.
    CM_Interleave,
    CM_GatherScatter,
    CM_Scalarize
  };

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// instruction \p I and vector width \p VF.
  void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
                           InstructionCost Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
  }

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// interleaving group \p Grp and vector width \p VF.
  void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
                           ElementCount VF, InstWidening W,
                           InstructionCost Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    // Broadcast this decision to all instructions inside the group.
    // But the cost will be assigned to one instruction only.
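    // For example (a sketch): for a group {A, B, C} whose insert position is
    // B, deciding CM_Interleave with cost 5 records (CM_Interleave, 5) for B
    // and (CM_Interleave, 0) for A and C, so the group's cost is counted once.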
1441 for (unsigned i = 0; i < Grp->getFactor(); ++i) { 1442 if (auto *I = Grp->getMember(i)) { 1443 if (Grp->getInsertPos() == I) 1444 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost); 1445 else 1446 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0); 1447 } 1448 } 1449 } 1450 1451 /// Return the cost model decision for the given instruction \p I and vector 1452 /// width \p VF. Return CM_Unknown if this instruction did not pass 1453 /// through the cost modeling. 1454 InstWidening getWideningDecision(Instruction *I, ElementCount VF) const { 1455 assert(VF.isVector() && "Expected VF to be a vector VF"); 1456 // Cost model is not run in the VPlan-native path - return conservative 1457 // result until this changes. 1458 if (EnableVPlanNativePath) 1459 return CM_GatherScatter; 1460 1461 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF); 1462 auto Itr = WideningDecisions.find(InstOnVF); 1463 if (Itr == WideningDecisions.end()) 1464 return CM_Unknown; 1465 return Itr->second.first; 1466 } 1467 1468 /// Return the vectorization cost for the given instruction \p I and vector 1469 /// width \p VF. 1470 InstructionCost getWideningCost(Instruction *I, ElementCount VF) { 1471 assert(VF.isVector() && "Expected VF >=2"); 1472 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF); 1473 assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() && 1474 "The cost is not calculated"); 1475 return WideningDecisions[InstOnVF].second; 1476 } 1477 1478 /// Return True if instruction \p I is an optimizable truncate whose operand 1479 /// is an induction variable. Such a truncate will be removed by adding a new 1480 /// induction variable with the destination type. 1481 bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) { 1482 // If the instruction is not a truncate, return false. 1483 auto *Trunc = dyn_cast<TruncInst>(I); 1484 if (!Trunc) 1485 return false; 1486 1487 // Get the source and destination types of the truncate. 1488 Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF); 1489 Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF); 1490 1491 // If the truncate is free for the given types, return false. Replacing a 1492 // free truncate with an induction variable would add an induction variable 1493 // update instruction to each iteration of the loop. We exclude from this 1494 // check the primary induction variable since it will need an update 1495 // instruction regardless. 1496 Value *Op = Trunc->getOperand(0); 1497 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy)) 1498 return false; 1499 1500 // If the truncated value is not an induction variable, return false. 1501 return Legal->isInductionPhi(Op); 1502 } 1503 1504 /// Collects the instructions to scalarize for each predicated instruction in 1505 /// the loop. 1506 void collectInstsToScalarize(ElementCount VF); 1507 1508 /// Collect Uniform and Scalar values for the given \p VF. 1509 /// The sets depend on CM decision for Load/Store instructions 1510 /// that may be vectorized as interleave, gather-scatter or scalarized. 1511 void collectUniformsAndScalars(ElementCount VF) { 1512 // Do the analysis once. 
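    // (Here Uniforms serves as the "already analyzed" marker: if an entry for
    // this VF exists, the widening decisions, uniforms and scalars for it
    // have already been collected.)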
1513 if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end()) 1514 return; 1515 setCostBasedWideningDecision(VF); 1516 collectLoopUniforms(VF); 1517 collectLoopScalars(VF); 1518 } 1519 1520 /// Returns true if the target machine supports masked store operation 1521 /// for the given \p DataType and kind of access to \p Ptr. 1522 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const { 1523 return Legal->isConsecutivePtr(DataType, Ptr) && 1524 TTI.isLegalMaskedStore(DataType, Alignment); 1525 } 1526 1527 /// Returns true if the target machine supports masked load operation 1528 /// for the given \p DataType and kind of access to \p Ptr. 1529 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const { 1530 return Legal->isConsecutivePtr(DataType, Ptr) && 1531 TTI.isLegalMaskedLoad(DataType, Alignment); 1532 } 1533 1534 /// Returns true if the target machine can represent \p V as a masked gather 1535 /// or scatter operation. 1536 bool isLegalGatherOrScatter(Value *V, 1537 ElementCount VF = ElementCount::getFixed(1)) { 1538 bool LI = isa<LoadInst>(V); 1539 bool SI = isa<StoreInst>(V); 1540 if (!LI && !SI) 1541 return false; 1542 auto *Ty = getLoadStoreType(V); 1543 Align Align = getLoadStoreAlignment(V); 1544 if (VF.isVector()) 1545 Ty = VectorType::get(Ty, VF); 1546 return (LI && TTI.isLegalMaskedGather(Ty, Align)) || 1547 (SI && TTI.isLegalMaskedScatter(Ty, Align)); 1548 } 1549 1550 /// Returns true if the target machine supports all of the reduction 1551 /// variables found for the given VF. 1552 bool canVectorizeReductions(ElementCount VF) const { 1553 return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 1554 const RecurrenceDescriptor &RdxDesc = Reduction.second; 1555 return TTI.isLegalToVectorizeReduction(RdxDesc, VF); 1556 })); 1557 } 1558 1559 /// Returns true if \p I is an instruction that will be scalarized with 1560 /// predication when vectorizing \p I with vectorization factor \p VF. Such 1561 /// instructions include conditional stores and instructions that may divide 1562 /// by zero. 1563 bool isScalarWithPredication(Instruction *I, ElementCount VF) const; 1564 1565 // Returns true if \p I is an instruction that will be predicated either 1566 // through scalar predication or masked load/store or masked gather/scatter. 1567 // \p VF is the vectorization factor that will be used to vectorize \p I. 1568 // Superset of instructions that return true for isScalarWithPredication. 1569 bool isPredicatedInst(Instruction *I, ElementCount VF, 1570 bool IsKnownUniform = false) { 1571 // When we know the load is uniform and the original scalar loop was not 1572 // predicated we don't need to mark it as a predicated instruction. Any 1573 // vectorised blocks created when tail-folding are something artificial we 1574 // have introduced and we know there is always at least one active lane. 1575 // That's why we call Legal->blockNeedsPredication here because it doesn't 1576 // query tail-folding. 1577 if (IsKnownUniform && isa<LoadInst>(I) && 1578 !Legal->blockNeedsPredication(I->getParent())) 1579 return false; 1580 if (!blockNeedsPredicationForAnyReason(I->getParent())) 1581 return false; 1582 // Loads and stores that need some form of masked operation are predicated 1583 // instructions. 
    if (isa<LoadInst>(I) || isa<StoreInst>(I))
      return Legal->isMaskRequired(I);
    return isScalarWithPredication(I, VF);
  }

  /// Returns true if \p I is a memory instruction with consecutive memory
  /// access that can be widened.
  bool
  memoryInstructionCanBeWidened(Instruction *I,
                                ElementCount VF = ElementCount::getFixed(1));

  /// Returns true if \p I is a memory instruction in an interleaved-group
  /// of memory accesses that can be vectorized with wide vector loads/stores
  /// and shuffles.
  bool
  interleavedAccessCanBeWidened(Instruction *I,
                                ElementCount VF = ElementCount::getFixed(1));

  /// Check if \p Instr belongs to any interleaved access group.
  bool isAccessInterleaved(Instruction *Instr) {
    return InterleaveInfo.isInterleaved(Instr);
  }

  /// Get the interleaved access group that \p Instr belongs to.
  const InterleaveGroup<Instruction> *
  getInterleavedAccessGroup(Instruction *Instr) {
    return InterleaveInfo.getInterleaveGroup(Instr);
  }

  /// Returns true if we're required to use a scalar epilogue for at least
  /// the final iteration of the original loop.
  bool requiresScalarEpilogue(ElementCount VF) const {
    if (!isScalarEpilogueAllowed())
      return false;
    // If we might exit from anywhere but the latch, we must run the exiting
    // iteration in scalar form.
    if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
      return true;
    return VF.isVector() && InterleaveInfo.requiresScalarEpilogue();
  }

  /// Returns true if a scalar epilogue is allowed, i.e. not ruled out by
  /// optsize or a loop hint annotation.
  bool isScalarEpilogueAllowed() const {
    return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
  }

  /// Returns true if all loop blocks should be masked to fold the tail of the
  /// loop.
  bool foldTailByMasking() const { return FoldTailByMasking; }

  /// Returns true if the instructions in this block require predication
  /// for any reason, e.g. because tail folding now requires a predicate
  /// or because the block in the original loop was predicated.
  bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const {
    return foldTailByMasking() || Legal->blockNeedsPredication(BB);
  }

  /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
  /// nodes to the chain of instructions representing the reductions. Uses a
  /// MapVector to ensure deterministic iteration order.
  using ReductionChainMap =
      SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;

  /// Return the chain of instructions representing an inloop reduction.
  const ReductionChainMap &getInLoopReductionChains() const {
    return InLoopReductionChains;
  }

  /// Returns true if the Phi is part of an inloop reduction.
  bool isInLoopReduction(PHINode *Phi) const {
    return InLoopReductionChains.count(Phi);
  }

  /// Estimate cost of an intrinsic call instruction CI if it were vectorized
  /// with factor VF. Return the cost of the instruction, including
  /// scalarization overhead if it's needed.
  InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;

  /// Estimate cost of a call instruction CI if it were vectorized with factor
  /// VF. Return the cost of the instruction, including scalarization overhead
  /// if it's needed. The flag NeedToScalarize shows if the call needs to be
  /// scalarized, i.e. either a vector version isn't available or it is too
  /// expensive.
  InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF,
                                    bool &NeedToScalarize) const;

  /// Returns true if the per-lane cost of VectorizationFactor A is lower than
  /// that of B.
  bool isMoreProfitable(const VectorizationFactor &A,
                        const VectorizationFactor &B) const;

  /// Invalidates decisions already taken by the cost model.
  void invalidateCostModelingDecisions() {
    WideningDecisions.clear();
    Uniforms.clear();
    Scalars.clear();
  }

private:
  unsigned NumPredStores = 0;

  /// Convenience function that returns the value of vscale_range iff
  /// vscale_range.min == vscale_range.max or otherwise returns the value
  /// returned by the corresponding TLI method.
  Optional<unsigned> getVScaleForTuning() const;

  /// \return An upper bound for the vectorization factors for both
  /// fixed and scalable vectorization, where the minimum-known number of
  /// elements is a power-of-2 larger than zero. If scalable vectorization is
  /// disabled or unsupported, then the scalable part will be equal to
  /// ElementCount::getScalable(0).
  FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount,
                                           ElementCount UserVF,
                                           bool FoldTailByMasking);

  /// \return the maximized element count based on the target's vector
  /// registers and the loop trip-count, but limited to a maximum safe VF.
  /// This is a helper function of computeFeasibleMaxVF.
  /// FIXME: MaxSafeVF is currently passed by reference to avoid some obscure
  /// issue that occurred on one of the buildbots which cannot be reproduced
  /// without having access to the proprietary compiler (see comments on
  /// D98509). The issue is currently under investigation and this workaround
  /// will be removed as soon as possible.
  ElementCount getMaximizedVFForTarget(unsigned ConstTripCount,
                                       unsigned SmallestType,
                                       unsigned WidestType,
                                       const ElementCount &MaxSafeVF,
                                       bool FoldTailByMasking);

  /// \return the maximum legal scalable VF, based on the safe max number
  /// of elements.
  ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);

  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
  using VectorizationCostTy = std::pair<InstructionCost, bool>;

  /// Returns the expected execution cost. The unit of the cost does
  /// not matter because we use the 'cost' units to compare different
  /// vector widths. The cost that is returned is *not* normalized by
  /// the factor width. If \p Invalid is not nullptr, this function
  /// will add a pair(Instruction*, ElementCount) to \p Invalid for
  /// each instruction that has an Invalid cost for the given VF.
  using InstructionVFPair = std::pair<Instruction *, ElementCount>;
  VectorizationCostTy
  expectedCost(ElementCount VF,
               SmallVectorImpl<InstructionVFPair> *Invalid = nullptr);

  /// Returns the execution time cost of an instruction for a given vector
  /// width. Vector width of one means scalar.
  VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);

  /// The cost-computation logic from getInstructionCost which provides
  /// the vector type as an output parameter.
  InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
                                     Type *&VectorTy);

  /// Return the cost of instructions in an inloop reduction pattern, if I is
  /// part of that pattern.
  Optional<InstructionCost>
  getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
                          TTI::TargetCostKind CostKind);

  /// Calculate vectorization cost of memory instruction \p I.
  InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);

  /// The cost computation for a scalarized memory instruction.
  InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);

  /// The cost computation for an interleaving group of memory instructions.
  InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);

  /// The cost computation for a Gather/Scatter instruction.
  InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);

  /// The cost computation for widening instruction \p I with consecutive
  /// memory access.
  InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);

  /// The cost calculation for Load/Store instruction \p I with uniform pointer -
  /// Load: scalar load + broadcast.
  /// Store: scalar store + (loop invariant value stored? 0 : extract of last
  /// element)
  InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);

  /// Estimate the overhead of scalarizing an instruction. This is a
  /// convenience wrapper for the type-based getScalarizationOverhead API.
  InstructionCost getScalarizationOverhead(Instruction *I,
                                           ElementCount VF) const;

  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
  bool isConsecutiveLoadOrStore(Instruction *I);

  /// Returns true if an artificially high cost for emulated masked memrefs
  /// should be used.
  bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);

  /// Map of scalar integer values to the smallest bitwidth they can be legally
  /// represented as. The vector equivalents of these values should be truncated
  /// to this type.
  MapVector<Instruction *, uint64_t> MinBWs;

  /// A type representing the costs for instructions if they were to be
  /// scalarized rather than vectorized. The entries are Instruction-Cost
  /// pairs.
  using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;

  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as predicated blocks.
  SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;

  /// Records whether it is allowed to have the original scalar loop execute at
  /// least once. This may be needed as a fallback loop in case runtime
  /// aliasing/dependence checks fail, or to handle the tail/remainder
  /// iterations when the trip count is unknown or doesn't divide by the VF,
  /// or as a peel-loop to handle gaps in interleave-groups.
  /// Under optsize and when the trip count is very small we don't allow any
  /// iterations to execute in the scalar loop.
1806 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 1807 1808 /// All blocks of loop are to be masked to fold tail of scalar iterations. 1809 bool FoldTailByMasking = false; 1810 1811 /// A map holding scalar costs for different vectorization factors. The 1812 /// presence of a cost for an instruction in the mapping indicates that the 1813 /// instruction will be scalarized when vectorizing with the associated 1814 /// vectorization factor. The entries are VF-ScalarCostTy pairs. 1815 DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize; 1816 1817 /// Holds the instructions known to be uniform after vectorization. 1818 /// The data is collected per VF. 1819 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms; 1820 1821 /// Holds the instructions known to be scalar after vectorization. 1822 /// The data is collected per VF. 1823 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars; 1824 1825 /// Holds the instructions (address computations) that are forced to be 1826 /// scalarized. 1827 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars; 1828 1829 /// PHINodes of the reductions that should be expanded in-loop along with 1830 /// their associated chains of reduction operations, in program order from top 1831 /// (PHI) to bottom 1832 ReductionChainMap InLoopReductionChains; 1833 1834 /// A Map of inloop reduction operations and their immediate chain operand. 1835 /// FIXME: This can be removed once reductions can be costed correctly in 1836 /// vplan. This was added to allow quick lookup to the inloop operations, 1837 /// without having to loop through InLoopReductionChains. 1838 DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains; 1839 1840 /// Returns the expected difference in cost from scalarizing the expression 1841 /// feeding a predicated instruction \p PredInst. The instructions to 1842 /// scalarize and their scalar costs are collected in \p ScalarCosts. A 1843 /// non-negative return value implies the expression will be scalarized. 1844 /// Currently, only single-use chains are considered for scalarization. 1845 int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts, 1846 ElementCount VF); 1847 1848 /// Collect the instructions that are uniform after vectorization. An 1849 /// instruction is uniform if we represent it with a single scalar value in 1850 /// the vectorized loop corresponding to each vector iteration. Examples of 1851 /// uniform instructions include pointer operands of consecutive or 1852 /// interleaved memory accesses. Note that although uniformity implies an 1853 /// instruction will be scalar, the reverse is not true. In general, a 1854 /// scalarized instruction will be represented by VF scalar values in the 1855 /// vectorized loop, each corresponding to an iteration of the original 1856 /// scalar loop. 1857 void collectLoopUniforms(ElementCount VF); 1858 1859 /// Collect the instructions that are scalar after vectorization. An 1860 /// instruction is scalar if it is known to be uniform or will be scalarized 1861 /// during vectorization. collectLoopScalars should only add non-uniform nodes 1862 /// to the list if they are used by a load/store instruction that is marked as 1863 /// CM_Scalarize. Non-uniform scalarized instructions will be represented by 1864 /// VF values in the vectorized loop, each corresponding to an iteration of 1865 /// the original scalar loop. 
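  /// For example (a sketch): a GEP feeding only a load that the cost model
  /// decided to scalarize (CM_Scalarize) becomes scalar and is replicated VF
  /// times, while the pointer operand of a consecutive (CM_Widen) access stays
  /// uniform and needs just one scalar copy per vector iteration.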
1866 void collectLoopScalars(ElementCount VF); 1867 1868 /// Keeps cost model vectorization decision and cost for instructions. 1869 /// Right now it is used for memory instructions only. 1870 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>, 1871 std::pair<InstWidening, InstructionCost>>; 1872 1873 DecisionList WideningDecisions; 1874 1875 /// Returns true if \p V is expected to be vectorized and it needs to be 1876 /// extracted. 1877 bool needsExtract(Value *V, ElementCount VF) const { 1878 Instruction *I = dyn_cast<Instruction>(V); 1879 if (VF.isScalar() || !I || !TheLoop->contains(I) || 1880 TheLoop->isLoopInvariant(I)) 1881 return false; 1882 1883 // Assume we can vectorize V (and hence we need extraction) if the 1884 // scalars are not computed yet. This can happen, because it is called 1885 // via getScalarizationOverhead from setCostBasedWideningDecision, before 1886 // the scalars are collected. That should be a safe assumption in most 1887 // cases, because we check if the operands have vectorizable types 1888 // beforehand in LoopVectorizationLegality. 1889 return Scalars.find(VF) == Scalars.end() || 1890 !isScalarAfterVectorization(I, VF); 1891 }; 1892 1893 /// Returns a range containing only operands needing to be extracted. 1894 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, 1895 ElementCount VF) const { 1896 return SmallVector<Value *, 4>(make_filter_range( 1897 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); 1898 } 1899 1900 /// Determines if we have the infrastructure to vectorize loop \p L and its 1901 /// epilogue, assuming the main loop is vectorized by \p VF. 1902 bool isCandidateForEpilogueVectorization(const Loop &L, 1903 const ElementCount VF) const; 1904 1905 /// Returns true if epilogue vectorization is considered profitable, and 1906 /// false otherwise. 1907 /// \p VF is the vectorization factor chosen for the original loop. 1908 bool isEpilogueVectorizationProfitable(const ElementCount VF) const; 1909 1910 public: 1911 /// The loop that we evaluate. 1912 Loop *TheLoop; 1913 1914 /// Predicated scalar evolution analysis. 1915 PredicatedScalarEvolution &PSE; 1916 1917 /// Loop Info analysis. 1918 LoopInfo *LI; 1919 1920 /// Vectorization legality. 1921 LoopVectorizationLegality *Legal; 1922 1923 /// Vector target information. 1924 const TargetTransformInfo &TTI; 1925 1926 /// Target Library Info. 1927 const TargetLibraryInfo *TLI; 1928 1929 /// Demanded bits analysis. 1930 DemandedBits *DB; 1931 1932 /// Assumption cache. 1933 AssumptionCache *AC; 1934 1935 /// Interface to emit optimization remarks. 1936 OptimizationRemarkEmitter *ORE; 1937 1938 const Function *TheFunction; 1939 1940 /// Loop Vectorize Hint. 1941 const LoopVectorizeHints *Hints; 1942 1943 /// The interleave access information contains groups of interleaved accesses 1944 /// with the same stride and close to each other. 1945 InterleavedAccessInfo &InterleaveInfo; 1946 1947 /// Values to ignore in the cost model. 1948 SmallPtrSet<const Value *, 16> ValuesToIgnore; 1949 1950 /// Values to ignore in the cost model when VF > 1. 1951 SmallPtrSet<const Value *, 16> VecValuesToIgnore; 1952 1953 /// All element types found in the loop. 1954 SmallPtrSet<Type *, 16> ElementTypesInLoop; 1955 1956 /// Profitable vector factors. 1957 SmallVector<VectorizationFactor, 8> ProfitableVFs; 1958 }; 1959 } // end namespace llvm 1960 1961 /// Helper struct to manage generating runtime checks for vectorization. 
///
/// The runtime checks are created up-front in temporary blocks, un-linked
/// from the existing IR, to allow estimating their cost more accurately.
/// After deciding to vectorize, the checks are moved back. If deciding not to
/// vectorize, the temporary blocks are completely removed.
class GeneratedRTChecks {
  /// Basic block which contains the generated SCEV checks, if any.
  BasicBlock *SCEVCheckBlock = nullptr;

  /// The value representing the result of the generated SCEV checks. If it is
  /// nullptr, either no SCEV checks have been generated or they have been used.
  Value *SCEVCheckCond = nullptr;

  /// Basic block which contains the generated memory runtime checks, if any.
  BasicBlock *MemCheckBlock = nullptr;

  /// The value representing the result of the generated memory runtime checks.
  /// If it is nullptr, either no memory runtime checks have been generated or
  /// they have been used.
  Value *MemRuntimeCheckCond = nullptr;

  DominatorTree *DT;
  LoopInfo *LI;

  SCEVExpander SCEVExp;
  SCEVExpander MemCheckExp;

public:
  GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
                    const DataLayout &DL)
      : DT(DT), LI(LI), SCEVExp(SE, DL, "scev.check"),
        MemCheckExp(SE, DL, "scev.check") {}

  /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
  /// accurately estimate the cost of the runtime checks. The blocks are
  /// un-linked from the IR and added back during vector code generation. If
  /// there is no vector code generation, the check blocks are removed
  /// completely.
  void Create(Loop *L, const LoopAccessInfo &LAI,
              const SCEVPredicate &Pred) {

    BasicBlock *LoopHeader = L->getHeader();
    BasicBlock *Preheader = L->getLoopPreheader();

    // Use SplitBlock to create blocks for SCEV & memory runtime checks to
    // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
    // may be used by SCEVExpander. The blocks will be un-linked from their
    // predecessors and removed from LI & DT at the end of the function.
    if (!Pred.isAlwaysTrue()) {
      SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
                                  nullptr, "vector.scevcheck");

      SCEVCheckCond = SCEVExp.expandCodeForPredicate(
          &Pred, SCEVCheckBlock->getTerminator());
    }

    const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
    if (RtPtrChecking.Need) {
      auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
      MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
                                 "vector.memcheck");

      MemRuntimeCheckCond =
          addRuntimeChecks(MemCheckBlock->getTerminator(), L,
                           RtPtrChecking.getChecks(), MemCheckExp);
      assert(MemRuntimeCheckCond &&
             "no RT checks generated although RtPtrChecking "
             "claimed checks are required");
    }

    if (!MemCheckBlock && !SCEVCheckBlock)
      return;

    // Unhook the temporary blocks with the checks and update various places
    // accordingly.
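    // (Roughly: each check block's terminator is moved into the preheader,
    // the block itself gets an 'unreachable' terminator and is detached from
    // DT/LI, until emitSCEVChecks/emitMemRuntimeChecks link it back in.)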
2037 if (SCEVCheckBlock) 2038 SCEVCheckBlock->replaceAllUsesWith(Preheader); 2039 if (MemCheckBlock) 2040 MemCheckBlock->replaceAllUsesWith(Preheader); 2041 2042 if (SCEVCheckBlock) { 2043 SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 2044 new UnreachableInst(Preheader->getContext(), SCEVCheckBlock); 2045 Preheader->getTerminator()->eraseFromParent(); 2046 } 2047 if (MemCheckBlock) { 2048 MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 2049 new UnreachableInst(Preheader->getContext(), MemCheckBlock); 2050 Preheader->getTerminator()->eraseFromParent(); 2051 } 2052 2053 DT->changeImmediateDominator(LoopHeader, Preheader); 2054 if (MemCheckBlock) { 2055 DT->eraseNode(MemCheckBlock); 2056 LI->removeBlock(MemCheckBlock); 2057 } 2058 if (SCEVCheckBlock) { 2059 DT->eraseNode(SCEVCheckBlock); 2060 LI->removeBlock(SCEVCheckBlock); 2061 } 2062 } 2063 2064 /// Remove the created SCEV & memory runtime check blocks & instructions, if 2065 /// unused. 2066 ~GeneratedRTChecks() { 2067 SCEVExpanderCleaner SCEVCleaner(SCEVExp); 2068 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp); 2069 if (!SCEVCheckCond) 2070 SCEVCleaner.markResultUsed(); 2071 2072 if (!MemRuntimeCheckCond) 2073 MemCheckCleaner.markResultUsed(); 2074 2075 if (MemRuntimeCheckCond) { 2076 auto &SE = *MemCheckExp.getSE(); 2077 // Memory runtime check generation creates compares that use expanded 2078 // values. Remove them before running the SCEVExpanderCleaners. 2079 for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) { 2080 if (MemCheckExp.isInsertedInstruction(&I)) 2081 continue; 2082 SE.forgetValue(&I); 2083 I.eraseFromParent(); 2084 } 2085 } 2086 MemCheckCleaner.cleanup(); 2087 SCEVCleaner.cleanup(); 2088 2089 if (SCEVCheckCond) 2090 SCEVCheckBlock->eraseFromParent(); 2091 if (MemRuntimeCheckCond) 2092 MemCheckBlock->eraseFromParent(); 2093 } 2094 2095 /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and 2096 /// adjusts the branches to branch to the vector preheader or \p Bypass, 2097 /// depending on the generated condition. 2098 BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass, 2099 BasicBlock *LoopVectorPreHeader, 2100 BasicBlock *LoopExitBlock) { 2101 if (!SCEVCheckCond) 2102 return nullptr; 2103 if (auto *C = dyn_cast<ConstantInt>(SCEVCheckCond)) 2104 if (C->isZero()) 2105 return nullptr; 2106 2107 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2108 2109 BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock); 2110 // Create new preheader for vector loop. 2111 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 2112 PL->addBasicBlockToLoop(SCEVCheckBlock, *LI); 2113 2114 SCEVCheckBlock->getTerminator()->eraseFromParent(); 2115 SCEVCheckBlock->moveBefore(LoopVectorPreHeader); 2116 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2117 SCEVCheckBlock); 2118 2119 DT->addNewBlock(SCEVCheckBlock, Pred); 2120 DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock); 2121 2122 ReplaceInstWithInst( 2123 SCEVCheckBlock->getTerminator(), 2124 BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheckCond)); 2125 // Mark the check as used, to prevent it from being removed during cleanup. 2126 SCEVCheckCond = nullptr; 2127 return SCEVCheckBlock; 2128 } 2129 2130 /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts 2131 /// the branches to branch to the vector preheader or \p Bypass, depending on 2132 /// the generated condition. 
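  /// For example (roughly): the single edge Pred -> LoopVectorPreHeader is
  /// rewired to Pred -> MemCheckBlock, and MemCheckBlock branches to \p Bypass
  /// when the generated memory check fails and to the vector preheader
  /// otherwise.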
2133 BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass, 2134 BasicBlock *LoopVectorPreHeader) { 2135 // Check if we generated code that checks in runtime if arrays overlap. 2136 if (!MemRuntimeCheckCond) 2137 return nullptr; 2138 2139 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2140 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2141 MemCheckBlock); 2142 2143 DT->addNewBlock(MemCheckBlock, Pred); 2144 DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock); 2145 MemCheckBlock->moveBefore(LoopVectorPreHeader); 2146 2147 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 2148 PL->addBasicBlockToLoop(MemCheckBlock, *LI); 2149 2150 ReplaceInstWithInst( 2151 MemCheckBlock->getTerminator(), 2152 BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond)); 2153 MemCheckBlock->getTerminator()->setDebugLoc( 2154 Pred->getTerminator()->getDebugLoc()); 2155 2156 // Mark the check as used, to prevent it from being removed during cleanup. 2157 MemRuntimeCheckCond = nullptr; 2158 return MemCheckBlock; 2159 } 2160 }; 2161 2162 // Return true if \p OuterLp is an outer loop annotated with hints for explicit 2163 // vectorization. The loop needs to be annotated with #pragma omp simd 2164 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the 2165 // vector length information is not provided, vectorization is not considered 2166 // explicit. Interleave hints are not allowed either. These limitations will be 2167 // relaxed in the future. 2168 // Please, note that we are currently forced to abuse the pragma 'clang 2169 // vectorize' semantics. This pragma provides *auto-vectorization hints* 2170 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' 2171 // provides *explicit vectorization hints* (LV can bypass legal checks and 2172 // assume that vectorization is legal). However, both hints are implemented 2173 // using the same metadata (llvm.loop.vectorize, processed by 2174 // LoopVectorizeHints). This will be fixed in the future when the native IR 2175 // representation for pragma 'omp simd' is introduced. 2176 static bool isExplicitVecOuterLoop(Loop *OuterLp, 2177 OptimizationRemarkEmitter *ORE) { 2178 assert(!OuterLp->isInnermost() && "This is not an outer loop"); 2179 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); 2180 2181 // Only outer loops with an explicit vectorization hint are supported. 2182 // Unannotated outer loops are ignored. 2183 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) 2184 return false; 2185 2186 Function *Fn = OuterLp->getHeader()->getParent(); 2187 if (!Hints.allowVectorization(Fn, OuterLp, 2188 true /*VectorizeOnlyWhenForced*/)) { 2189 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); 2190 return false; 2191 } 2192 2193 if (Hints.getInterleave() > 1) { 2194 // TODO: Interleave support is future work. 2195 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " 2196 "outer loops.\n"); 2197 Hints.emitRemarkWithHints(); 2198 return false; 2199 } 2200 2201 return true; 2202 } 2203 2204 static void collectSupportedLoops(Loop &L, LoopInfo *LI, 2205 OptimizationRemarkEmitter *ORE, 2206 SmallVectorImpl<Loop *> &V) { 2207 // Collect inner loops and outer loops without irreducible control flow. For 2208 // now, only collect outer loops that have explicit vectorization hints. If we 2209 // are stress testing the VPlan H-CFG construction, we collect the outermost 2210 // loop of every loop nest. 
2211 if (L.isInnermost() || VPlanBuildStressTest || 2212 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { 2213 LoopBlocksRPO RPOT(&L); 2214 RPOT.perform(LI); 2215 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { 2216 V.push_back(&L); 2217 // TODO: Collect inner loops inside marked outer loops in case 2218 // vectorization fails for the outer loop. Do not invoke 2219 // 'containsIrreducibleCFG' again for inner loops when the outer loop is 2220 // already known to be reducible. We can use an inherited attribute for 2221 // that. 2222 return; 2223 } 2224 } 2225 for (Loop *InnerL : L) 2226 collectSupportedLoops(*InnerL, LI, ORE, V); 2227 } 2228 2229 namespace { 2230 2231 /// The LoopVectorize Pass. 2232 struct LoopVectorize : public FunctionPass { 2233 /// Pass identification, replacement for typeid 2234 static char ID; 2235 2236 LoopVectorizePass Impl; 2237 2238 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false, 2239 bool VectorizeOnlyWhenForced = false) 2240 : FunctionPass(ID), 2241 Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) { 2242 initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); 2243 } 2244 2245 bool runOnFunction(Function &F) override { 2246 if (skipFunction(F)) 2247 return false; 2248 2249 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 2250 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 2251 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 2252 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 2253 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); 2254 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); 2255 auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr; 2256 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); 2257 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 2258 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); 2259 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); 2260 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 2261 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 2262 2263 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 2264 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; 2265 2266 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC, 2267 GetLAA, *ORE, PSI).MadeAnyChange; 2268 } 2269 2270 void getAnalysisUsage(AnalysisUsage &AU) const override { 2271 AU.addRequired<AssumptionCacheTracker>(); 2272 AU.addRequired<BlockFrequencyInfoWrapperPass>(); 2273 AU.addRequired<DominatorTreeWrapperPass>(); 2274 AU.addRequired<LoopInfoWrapperPass>(); 2275 AU.addRequired<ScalarEvolutionWrapperPass>(); 2276 AU.addRequired<TargetTransformInfoWrapperPass>(); 2277 AU.addRequired<AAResultsWrapperPass>(); 2278 AU.addRequired<LoopAccessLegacyAnalysis>(); 2279 AU.addRequired<DemandedBitsWrapperPass>(); 2280 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 2281 AU.addRequired<InjectTLIMappingsLegacy>(); 2282 2283 // We currently do not preserve loopinfo/dominator analyses with outer loop 2284 // vectorization. Until this is addressed, mark these analyses as preserved 2285 // only for non-VPlan-native path. 2286 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 
2287 if (!EnableVPlanNativePath) { 2288 AU.addPreserved<LoopInfoWrapperPass>(); 2289 AU.addPreserved<DominatorTreeWrapperPass>(); 2290 } 2291 2292 AU.addPreserved<BasicAAWrapperPass>(); 2293 AU.addPreserved<GlobalsAAWrapperPass>(); 2294 AU.addRequired<ProfileSummaryInfoWrapperPass>(); 2295 } 2296 }; 2297 2298 } // end anonymous namespace 2299 2300 //===----------------------------------------------------------------------===// 2301 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 2302 // LoopVectorizationCostModel and LoopVectorizationPlanner. 2303 //===----------------------------------------------------------------------===// 2304 2305 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { 2306 // We need to place the broadcast of invariant variables outside the loop, 2307 // but only if it's proven safe to do so. Else, broadcast will be inside 2308 // vector loop body. 2309 Instruction *Instr = dyn_cast<Instruction>(V); 2310 bool SafeToHoist = OrigLoop->isLoopInvariant(V) && 2311 (!Instr || 2312 DT->dominates(Instr->getParent(), LoopVectorPreHeader)); 2313 // Place the code for broadcasting invariant variables in the new preheader. 2314 IRBuilder<>::InsertPointGuard Guard(Builder); 2315 if (SafeToHoist) 2316 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2317 2318 // Broadcast the scalar into all locations in the vector. 2319 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); 2320 2321 return Shuf; 2322 } 2323 2324 /// This function adds 2325 /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...) 2326 /// to each vector element of Val. The sequence starts at StartIndex. 2327 /// \p Opcode is relevant for FP induction variable. 2328 static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step, 2329 Instruction::BinaryOps BinOp, ElementCount VF, 2330 IRBuilderBase &Builder) { 2331 assert(VF.isVector() && "only vector VFs are supported"); 2332 2333 // Create and check the types. 2334 auto *ValVTy = cast<VectorType>(Val->getType()); 2335 ElementCount VLen = ValVTy->getElementCount(); 2336 2337 Type *STy = Val->getType()->getScalarType(); 2338 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && 2339 "Induction Step must be an integer or FP"); 2340 assert(Step->getType() == STy && "Step has wrong type"); 2341 2342 SmallVector<Constant *, 8> Indices; 2343 2344 // Create a vector of consecutive numbers from zero to VF. 2345 VectorType *InitVecValVTy = ValVTy; 2346 if (STy->isFloatingPointTy()) { 2347 Type *InitVecValSTy = 2348 IntegerType::get(STy->getContext(), STy->getScalarSizeInBits()); 2349 InitVecValVTy = VectorType::get(InitVecValSTy, VLen); 2350 } 2351 Value *InitVec = Builder.CreateStepVector(InitVecValVTy); 2352 2353 // Splat the StartIdx 2354 Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx); 2355 2356 if (STy->isIntegerTy()) { 2357 InitVec = Builder.CreateAdd(InitVec, StartIdxSplat); 2358 Step = Builder.CreateVectorSplat(VLen, Step); 2359 assert(Step->getType() == Val->getType() && "Invalid step vec"); 2360 // FIXME: The newly created binary instructions should contain nsw/nuw 2361 // flags, which can be found from the original scalar operations. 2362 Step = Builder.CreateMul(InitVec, Step); 2363 return Builder.CreateAdd(Val, Step, "induction"); 2364 } 2365 2366 // Floating point induction. 
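  // For example (a sketch): with VLen = 4, StartIdx = 0 and step S, the FP
  // path below computes Val BinOp <0*S, 1*S, 2*S, 3*S>, mirroring the integer
  // path above.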
2367 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && 2368 "Binary Opcode should be specified for FP induction"); 2369 InitVec = Builder.CreateUIToFP(InitVec, ValVTy); 2370 InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat); 2371 2372 Step = Builder.CreateVectorSplat(VLen, Step); 2373 Value *MulOp = Builder.CreateFMul(InitVec, Step); 2374 return Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); 2375 } 2376 2377 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( 2378 const InductionDescriptor &II, Value *Step, Value *Start, 2379 Instruction *EntryVal, VPValue *Def, VPTransformState &State) { 2380 IRBuilderBase &Builder = State.Builder; 2381 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 2382 "Expected either an induction phi-node or a truncate of it!"); 2383 2384 // Construct the initial value of the vector IV in the vector loop preheader 2385 auto CurrIP = Builder.saveIP(); 2386 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2387 if (isa<TruncInst>(EntryVal)) { 2388 assert(Start->getType()->isIntegerTy() && 2389 "Truncation requires an integer type"); 2390 auto *TruncType = cast<IntegerType>(EntryVal->getType()); 2391 Step = Builder.CreateTrunc(Step, TruncType); 2392 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); 2393 } 2394 2395 Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0); 2396 Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start); 2397 Value *SteppedStart = getStepVector( 2398 SplatStart, Zero, Step, II.getInductionOpcode(), State.VF, State.Builder); 2399 2400 // We create vector phi nodes for both integer and floating-point induction 2401 // variables. Here, we determine the kind of arithmetic we will perform. 2402 Instruction::BinaryOps AddOp; 2403 Instruction::BinaryOps MulOp; 2404 if (Step->getType()->isIntegerTy()) { 2405 AddOp = Instruction::Add; 2406 MulOp = Instruction::Mul; 2407 } else { 2408 AddOp = II.getInductionOpcode(); 2409 MulOp = Instruction::FMul; 2410 } 2411 2412 // Multiply the vectorization factor by the step using integer or 2413 // floating-point arithmetic as appropriate. 2414 Type *StepType = Step->getType(); 2415 Value *RuntimeVF; 2416 if (Step->getType()->isFloatingPointTy()) 2417 RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF); 2418 else 2419 RuntimeVF = getRuntimeVF(Builder, StepType, State.VF); 2420 Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF); 2421 2422 // Create a vector splat to use in the induction update. 2423 // 2424 // FIXME: If the step is non-constant, we create the vector splat with 2425 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't 2426 // handle a constant vector splat. 2427 Value *SplatVF = isa<Constant>(Mul) 2428 ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul)) 2429 : Builder.CreateVectorSplat(State.VF, Mul); 2430 Builder.restoreIP(CurrIP); 2431 2432 // We may need to add the step a number of times, depending on the unroll 2433 // factor. The last of those goes into the PHI. 
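  // For example (roughly): with UF = 2, part 0 uses the phi 'vec.ind' itself
  // and part 1 uses 'vec.ind + VF * Step' ('step.add'); the value after the
  // final part is fed back into the phi as 'vec.ind.next'.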
2434 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", 2435 &*LoopVectorBody->getFirstInsertionPt()); 2436 VecInd->setDebugLoc(EntryVal->getDebugLoc()); 2437 Instruction *LastInduction = VecInd; 2438 for (unsigned Part = 0; Part < UF; ++Part) { 2439 State.set(Def, LastInduction, Part); 2440 2441 if (isa<TruncInst>(EntryVal)) 2442 addMetadata(LastInduction, EntryVal); 2443 2444 LastInduction = cast<Instruction>( 2445 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")); 2446 LastInduction->setDebugLoc(EntryVal->getDebugLoc()); 2447 } 2448 2449 // Move the last step to the end of the latch block. This ensures consistent 2450 // placement of all induction updates. 2451 auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 2452 auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator()); 2453 LastInduction->moveBefore(Br); 2454 LastInduction->setName("vec.ind.next"); 2455 2456 VecInd->addIncoming(SteppedStart, LoopVectorPreHeader); 2457 VecInd->addIncoming(LastInduction, LoopVectorLatch); 2458 } 2459 2460 /// Compute scalar induction steps. \p ScalarIV is the scalar induction 2461 /// variable on which to base the steps, \p Step is the size of the step, and 2462 /// \p EntryVal is the value from the original loop that maps to the steps. 2463 /// Note that \p EntryVal doesn't have to be an induction variable - it 2464 /// can also be a truncate instruction. 2465 static void buildScalarSteps(Value *ScalarIV, Value *Step, 2466 Instruction *EntryVal, 2467 const InductionDescriptor &ID, VPValue *Def, 2468 VPTransformState &State) { 2469 IRBuilderBase &Builder = State.Builder; 2470 // We shouldn't have to build scalar steps if we aren't vectorizing. 2471 assert(State.VF.isVector() && "VF should be greater than one"); 2472 // Get the value type and ensure it and the step have the same integer type. 2473 Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); 2474 assert(ScalarIVTy == Step->getType() && 2475 "Val and Step should have the same type"); 2476 2477 // We build scalar steps for both integer and floating-point induction 2478 // variables. Here, we determine the kind of arithmetic we will perform. 2479 Instruction::BinaryOps AddOp; 2480 Instruction::BinaryOps MulOp; 2481 if (ScalarIVTy->isIntegerTy()) { 2482 AddOp = Instruction::Add; 2483 MulOp = Instruction::Mul; 2484 } else { 2485 AddOp = ID.getInductionOpcode(); 2486 MulOp = Instruction::FMul; 2487 } 2488 2489 // Determine the number of scalars we need to generate for each unroll 2490 // iteration. 2491 bool FirstLaneOnly = vputils::onlyFirstLaneUsed(Def); 2492 unsigned Lanes = FirstLaneOnly ? 1 : State.VF.getKnownMinValue(); 2493 // Compute the scalar steps and save the results in State. 
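  // For example (a sketch): for an integer IV with step S, VF = 4 and UF = 2,
  // part 0 yields ScalarIV + {0,1,2,3} * S and part 1 yields
  // ScalarIV + {4,5,6,7} * S, one value per lane (or only lane 0 when just
  // the first lane is demanded).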
2494 Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(), 2495 ScalarIVTy->getScalarSizeInBits()); 2496 Type *VecIVTy = nullptr; 2497 Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr; 2498 if (!FirstLaneOnly && State.VF.isScalable()) { 2499 VecIVTy = VectorType::get(ScalarIVTy, State.VF); 2500 UnitStepVec = 2501 Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF)); 2502 SplatStep = Builder.CreateVectorSplat(State.VF, Step); 2503 SplatIV = Builder.CreateVectorSplat(State.VF, ScalarIV); 2504 } 2505 2506 for (unsigned Part = 0; Part < State.UF; ++Part) { 2507 Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part); 2508 2509 if (!FirstLaneOnly && State.VF.isScalable()) { 2510 auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0); 2511 auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec); 2512 if (ScalarIVTy->isFloatingPointTy()) 2513 InitVec = Builder.CreateSIToFP(InitVec, VecIVTy); 2514 auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep); 2515 auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul); 2516 State.set(Def, Add, Part); 2517 // It's useful to record the lane values too for the known minimum number 2518 // of elements so we do those below. This improves the code quality when 2519 // trying to extract the first element, for example. 2520 } 2521 2522 if (ScalarIVTy->isFloatingPointTy()) 2523 StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy); 2524 2525 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 2526 Value *StartIdx = Builder.CreateBinOp( 2527 AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane)); 2528 // The step returned by `createStepForVF` is a runtime-evaluated value 2529 // when VF is scalable. Otherwise, it should be folded into a Constant. 2530 assert((State.VF.isScalable() || isa<Constant>(StartIdx)) && 2531 "Expected StartIdx to be folded to a constant when VF is not " 2532 "scalable"); 2533 auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step); 2534 auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul); 2535 State.set(Def, Add, VPIteration(Part, Lane)); 2536 } 2537 } 2538 } 2539 2540 // Generate code for the induction step. Note that induction steps are 2541 // required to be loop-invariant 2542 static Value *CreateStepValue(const SCEV *Step, ScalarEvolution &SE, 2543 Instruction *InsertBefore, 2544 Loop *OrigLoop = nullptr) { 2545 const DataLayout &DL = SE.getDataLayout(); 2546 assert((!OrigLoop || SE.isLoopInvariant(Step, OrigLoop)) && 2547 "Induction step should be loop invariant"); 2548 if (auto *E = dyn_cast<SCEVUnknown>(Step)) 2549 return E->getValue(); 2550 2551 SCEVExpander Exp(SE, DL, "induction"); 2552 return Exp.expandCodeFor(Step, Step->getType(), InsertBefore); 2553 } 2554 2555 /// Compute the transformed value of Index at offset StartValue using step 2556 /// StepValue. 2557 /// For integer induction, returns StartValue + Index * StepValue. 2558 /// For pointer induction, returns StartValue[Index * StepValue]. 2559 /// FIXME: The newly created binary instructions should contain nsw/nuw 2560 /// flags, which can be found from the original scalar operations. 2561 static Value *emitTransformedIndex(IRBuilderBase &B, Value *Index, 2562 Value *StartValue, Value *Step, 2563 const InductionDescriptor &ID) { 2564 assert(Index->getType()->getScalarType() == Step->getType() && 2565 "Index scalar type does not match StepValue type"); 2566 2567 // Note: the IR at this point is broken. 
We cannot use SE to create any new 2568 // SCEV and then expand it, hoping that SCEV's simplification will give us 2569 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may 2570 // lead to various SCEV crashes. So all we can do is to use builder and rely 2571 // on InstCombine for future simplifications. Here we handle some trivial 2572 // cases only. 2573 auto CreateAdd = [&B](Value *X, Value *Y) { 2574 assert(X->getType() == Y->getType() && "Types don't match!"); 2575 if (auto *CX = dyn_cast<ConstantInt>(X)) 2576 if (CX->isZero()) 2577 return Y; 2578 if (auto *CY = dyn_cast<ConstantInt>(Y)) 2579 if (CY->isZero()) 2580 return X; 2581 return B.CreateAdd(X, Y); 2582 }; 2583 2584 // We allow X to be a vector type, in which case Y will potentially be 2585 // splatted into a vector with the same element count. 2586 auto CreateMul = [&B](Value *X, Value *Y) { 2587 assert(X->getType()->getScalarType() == Y->getType() && 2588 "Types don't match!"); 2589 if (auto *CX = dyn_cast<ConstantInt>(X)) 2590 if (CX->isOne()) 2591 return Y; 2592 if (auto *CY = dyn_cast<ConstantInt>(Y)) 2593 if (CY->isOne()) 2594 return X; 2595 VectorType *XVTy = dyn_cast<VectorType>(X->getType()); 2596 if (XVTy && !isa<VectorType>(Y->getType())) 2597 Y = B.CreateVectorSplat(XVTy->getElementCount(), Y); 2598 return B.CreateMul(X, Y); 2599 }; 2600 2601 switch (ID.getKind()) { 2602 case InductionDescriptor::IK_IntInduction: { 2603 assert(!isa<VectorType>(Index->getType()) && 2604 "Vector indices not supported for integer inductions yet"); 2605 assert(Index->getType() == StartValue->getType() && 2606 "Index type does not match StartValue type"); 2607 if (isa<ConstantInt>(Step) && cast<ConstantInt>(Step)->isMinusOne()) 2608 return B.CreateSub(StartValue, Index); 2609 auto *Offset = CreateMul(Index, Step); 2610 return CreateAdd(StartValue, Offset); 2611 } 2612 case InductionDescriptor::IK_PtrInduction: { 2613 assert(isa<Constant>(Step) && 2614 "Expected constant step for pointer induction"); 2615 return B.CreateGEP(ID.getElementType(), StartValue, CreateMul(Index, Step)); 2616 } 2617 case InductionDescriptor::IK_FpInduction: { 2618 assert(!isa<VectorType>(Index->getType()) && 2619 "Vector indices not supported for FP inductions yet"); 2620 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); 2621 auto InductionBinOp = ID.getInductionBinOp(); 2622 assert(InductionBinOp && 2623 (InductionBinOp->getOpcode() == Instruction::FAdd || 2624 InductionBinOp->getOpcode() == Instruction::FSub) && 2625 "Original bin op should be defined for FP induction"); 2626 2627 Value *MulExp = B.CreateFMul(Step, Index); 2628 return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, 2629 "induction"); 2630 } 2631 case InductionDescriptor::IK_NoInduction: 2632 return nullptr; 2633 } 2634 llvm_unreachable("invalid enum"); 2635 } 2636 2637 void InnerLoopVectorizer::widenIntOrFpInduction( 2638 PHINode *IV, VPWidenIntOrFpInductionRecipe *Def, VPTransformState &State, 2639 Value *CanonicalIV) { 2640 Value *Start = Def->getStartValue()->getLiveInIRValue(); 2641 const InductionDescriptor &ID = Def->getInductionDescriptor(); 2642 TruncInst *Trunc = Def->getTruncInst(); 2643 IRBuilderBase &Builder = State.Builder; 2644 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); 2645 assert(State.VF.isVector() && "must have vector VF"); 2646 2647 // The value from the original loop to which we are mapping the new induction 2648 // variable. 2649 Instruction *EntryVal = Trunc ? 
cast<Instruction>(Trunc) : IV; 2650 2651 auto &DL = EntryVal->getModule()->getDataLayout(); 2652 2653 // Generate code for the induction step. Note that induction steps are 2654 // required to be loop-invariant 2655 auto CreateStepValue = [&](const SCEV *Step) -> Value * { 2656 assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) && 2657 "Induction step should be loop invariant"); 2658 if (PSE.getSE()->isSCEVable(IV->getType())) { 2659 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 2660 return Exp.expandCodeFor(Step, Step->getType(), 2661 State.CFG.VectorPreHeader->getTerminator()); 2662 } 2663 return cast<SCEVUnknown>(Step)->getValue(); 2664 }; 2665 2666 // The scalar value to broadcast. This is derived from the canonical 2667 // induction variable. If a truncation type is given, truncate the canonical 2668 // induction variable and step. Otherwise, derive these values from the 2669 // induction descriptor. 2670 auto CreateScalarIV = [&](Value *&Step) -> Value * { 2671 Value *ScalarIV = CanonicalIV; 2672 Type *NeededType = IV->getType(); 2673 if (!Def->isCanonical() || ScalarIV->getType() != NeededType) { 2674 ScalarIV = 2675 NeededType->isIntegerTy() 2676 ? Builder.CreateSExtOrTrunc(ScalarIV, NeededType) 2677 : Builder.CreateCast(Instruction::SIToFP, ScalarIV, NeededType); 2678 ScalarIV = emitTransformedIndex(Builder, ScalarIV, Start, Step, ID); 2679 ScalarIV->setName("offset.idx"); 2680 } 2681 if (Trunc) { 2682 auto *TruncType = cast<IntegerType>(Trunc->getType()); 2683 assert(Step->getType()->isIntegerTy() && 2684 "Truncation requires an integer step"); 2685 ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType); 2686 Step = Builder.CreateTrunc(Step, TruncType); 2687 } 2688 return ScalarIV; 2689 }; 2690 2691 // Fast-math-flags propagate from the original induction instruction. 2692 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 2693 if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp())) 2694 Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags()); 2695 2696 // Now do the actual transformations, and start with creating the step value. 2697 Value *Step = CreateStepValue(ID.getStep()); 2698 2699 // Create a new independent vector induction variable. Later VPlan2VPlan 2700 // optimizations will remove it, if it won't be needed, e.g. because all users 2701 // of it access scalar values. 2702 createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, State); 2703 2704 if (Def->needsScalarIV()) { 2705 // Create scalar steps that can be used by instructions we will later 2706 // scalarize. Note that the addition of the scalar steps will not increase 2707 // the number of instructions in the loop in the common case prior to 2708 // InstCombine. We will be trading one vector extract for each scalar step. 2709 Value *ScalarIV = CreateScalarIV(Step); 2710 buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, State); 2711 } 2712 } 2713 2714 void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def, 2715 const VPIteration &Instance, 2716 VPTransformState &State) { 2717 Value *ScalarInst = State.get(Def, Instance); 2718 Value *VectorValue = State.get(Def, Instance.Part); 2719 VectorValue = Builder.CreateInsertElement( 2720 VectorValue, ScalarInst, 2721 Instance.Lane.getAsRuntimeExpr(State.Builder, VF)); 2722 State.set(Def, VectorValue, Instance.Part); 2723 } 2724 2725 // Return whether we allow using masked interleave-groups (for dealing with 2726 // strided loads/stores that reside in predicated blocks, or for dealing 2727 // with gaps). 
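// A rough illustration of the "gaps" case (assuming a factor-3 group):
//
//   for (i = 0; i < n; i += 3) {
//     sum += A[i];     // member of index 0
//     sum += A[i + 1]; // member of index 1; index 2 is a gap
//   }
//
// Loading this group as a single wide vector also reads the unused A[i + 2]
// elements and, in the last iteration, may read past the end of A. Masking
// off the gap lanes (see the MaskForGaps handling in vectorizeInterleaveGroup
// below) makes the wide load safe without requiring a scalar epilogue.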
2728 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { 2729 // If an override option has been passed in for interleaved accesses, use it. 2730 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) 2731 return EnableMaskedInterleavedMemAccesses; 2732 2733 return TTI.enableMaskedInterleavedAccessVectorization(); 2734 } 2735 2736 // Try to vectorize the interleave group that \p Instr belongs to. 2737 // 2738 // E.g. Translate following interleaved load group (factor = 3): 2739 // for (i = 0; i < N; i+=3) { 2740 // R = Pic[i]; // Member of index 0 2741 // G = Pic[i+1]; // Member of index 1 2742 // B = Pic[i+2]; // Member of index 2 2743 // ... // do something to R, G, B 2744 // } 2745 // To: 2746 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2747 // %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements 2748 // %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements 2749 // %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements 2750 // 2751 // Or translate following interleaved store group (factor = 3): 2752 // for (i = 0; i < N; i+=3) { 2753 // ... do something to R, G, B 2754 // Pic[i] = R; // Member of index 0 2755 // Pic[i+1] = G; // Member of index 1 2756 // Pic[i+2] = B; // Member of index 2 2757 // } 2758 // To: 2759 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> 2760 // %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u> 2761 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, 2762 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements 2763 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B 2764 void InnerLoopVectorizer::vectorizeInterleaveGroup( 2765 const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs, 2766 VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues, 2767 VPValue *BlockInMask) { 2768 Instruction *Instr = Group->getInsertPos(); 2769 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2770 2771 // Prepare for the vector type of the interleaved load/store. 2772 Type *ScalarTy = getLoadStoreType(Instr); 2773 unsigned InterleaveFactor = Group->getFactor(); 2774 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2775 auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor); 2776 2777 // Prepare for the new pointers. 2778 SmallVector<Value *, 2> AddrParts; 2779 unsigned Index = Group->getIndex(Instr); 2780 2781 // TODO: extend the masked interleaved-group support to reversed access. 2782 assert((!BlockInMask || !Group->isReverse()) && 2783 "Reversed masked interleave-group not supported."); 2784 2785 // If the group is reverse, adjust the index to refer to the last vector lane 2786 // instead of the first. We adjust the index from the first vector lane, 2787 // rather than directly getting the pointer for lane VF - 1, because the 2788 // pointer operand of the interleaved access is supposed to be uniform. For 2789 // uniform instructions, we're only required to generate a value for the 2790 // first vector lane in each unroll iteration. 2791 if (Group->isReverse()) 2792 Index += (VF.getKnownMinValue() - 1) * Group->getFactor(); 2793 2794 for (unsigned Part = 0; Part < UF; Part++) { 2795 Value *AddrPart = State.get(Addr, VPIteration(Part, 0)); 2796 setDebugLocFromInst(AddrPart); 2797 2798 // Notice current instruction could be any index. Need to adjust the address 2799 // to the member of index 0. 2800 // 2801 // E.g. 
a = A[i+1]; // Member of index 1 (Current instruction) 2802 // b = A[i]; // Member of index 0 2803 // Current pointer is pointed to A[i+1], adjust it to A[i]. 2804 // 2805 // E.g. A[i+1] = a; // Member of index 1 2806 // A[i] = b; // Member of index 0 2807 // A[i+2] = c; // Member of index 2 (Current instruction) 2808 // Current pointer is pointed to A[i+2], adjust it to A[i]. 2809 2810 bool InBounds = false; 2811 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts())) 2812 InBounds = gep->isInBounds(); 2813 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index)); 2814 cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds); 2815 2816 // Cast to the vector pointer type. 2817 unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace(); 2818 Type *PtrTy = VecTy->getPointerTo(AddressSpace); 2819 AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy)); 2820 } 2821 2822 setDebugLocFromInst(Instr); 2823 Value *PoisonVec = PoisonValue::get(VecTy); 2824 2825 Value *MaskForGaps = nullptr; 2826 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) { 2827 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); 2828 assert(MaskForGaps && "Mask for Gaps is required but it is null"); 2829 } 2830 2831 // Vectorize the interleaved load group. 2832 if (isa<LoadInst>(Instr)) { 2833 // For each unroll part, create a wide load for the group. 2834 SmallVector<Value *, 2> NewLoads; 2835 for (unsigned Part = 0; Part < UF; Part++) { 2836 Instruction *NewLoad; 2837 if (BlockInMask || MaskForGaps) { 2838 assert(useMaskedInterleavedAccesses(*TTI) && 2839 "masked interleaved groups are not allowed."); 2840 Value *GroupMask = MaskForGaps; 2841 if (BlockInMask) { 2842 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2843 Value *ShuffledMask = Builder.CreateShuffleVector( 2844 BlockInMaskPart, 2845 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2846 "interleaved.mask"); 2847 GroupMask = MaskForGaps 2848 ? Builder.CreateBinOp(Instruction::And, ShuffledMask, 2849 MaskForGaps) 2850 : ShuffledMask; 2851 } 2852 NewLoad = 2853 Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(), 2854 GroupMask, PoisonVec, "wide.masked.vec"); 2855 } 2856 else 2857 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], 2858 Group->getAlign(), "wide.vec"); 2859 Group->addMetadata(NewLoad); 2860 NewLoads.push_back(NewLoad); 2861 } 2862 2863 // For each member in the group, shuffle out the appropriate data from the 2864 // wide loads. 2865 unsigned J = 0; 2866 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2867 Instruction *Member = Group->getMember(I); 2868 2869 // Skip the gaps in the group. 2870 if (!Member) 2871 continue; 2872 2873 auto StrideMask = 2874 createStrideMask(I, InterleaveFactor, VF.getKnownMinValue()); 2875 for (unsigned Part = 0; Part < UF; Part++) { 2876 Value *StridedVec = Builder.CreateShuffleVector( 2877 NewLoads[Part], StrideMask, "strided.vec"); 2878 2879 // If this member has different type, cast the result type. 
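        // Illustrative example: in a factor-2 group where the insert position
        // loads i32 and the other member loads float, the wide load above is
        // <8 x i32> for VF = 4; the float member's strided slice is therefore
        // produced as <4 x i32> and bitcast to <4 x float> here. This is only
        // valid because both element types have the same bit width.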
2880 if (Member->getType() != ScalarTy) { 2881 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2882 VectorType *OtherVTy = VectorType::get(Member->getType(), VF); 2883 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); 2884 } 2885 2886 if (Group->isReverse()) 2887 StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse"); 2888 2889 State.set(VPDefs[J], StridedVec, Part); 2890 } 2891 ++J; 2892 } 2893 return; 2894 } 2895 2896 // The sub vector type for current instruction. 2897 auto *SubVT = VectorType::get(ScalarTy, VF); 2898 2899 // Vectorize the interleaved store group. 2900 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); 2901 assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) && 2902 "masked interleaved groups are not allowed."); 2903 assert((!MaskForGaps || !VF.isScalable()) && 2904 "masking gaps for scalable vectors is not yet supported."); 2905 for (unsigned Part = 0; Part < UF; Part++) { 2906 // Collect the stored vector from each member. 2907 SmallVector<Value *, 4> StoredVecs; 2908 for (unsigned i = 0; i < InterleaveFactor; i++) { 2909 assert((Group->getMember(i) || MaskForGaps) && 2910 "Fail to get a member from an interleaved store group"); 2911 Instruction *Member = Group->getMember(i); 2912 2913 // Skip the gaps in the group. 2914 if (!Member) { 2915 Value *Undef = PoisonValue::get(SubVT); 2916 StoredVecs.push_back(Undef); 2917 continue; 2918 } 2919 2920 Value *StoredVec = State.get(StoredValues[i], Part); 2921 2922 if (Group->isReverse()) 2923 StoredVec = Builder.CreateVectorReverse(StoredVec, "reverse"); 2924 2925 // If this member has different type, cast it to a unified type. 2926 2927 if (StoredVec->getType() != SubVT) 2928 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL); 2929 2930 StoredVecs.push_back(StoredVec); 2931 } 2932 2933 // Concatenate all vectors into a wide vector. 2934 Value *WideVec = concatenateVectors(Builder, StoredVecs); 2935 2936 // Interleave the elements in the wide vector. 2937 Value *IVec = Builder.CreateShuffleVector( 2938 WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor), 2939 "interleaved.vec"); 2940 2941 Instruction *NewStoreInstr; 2942 if (BlockInMask || MaskForGaps) { 2943 Value *GroupMask = MaskForGaps; 2944 if (BlockInMask) { 2945 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2946 Value *ShuffledMask = Builder.CreateShuffleVector( 2947 BlockInMaskPart, 2948 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2949 "interleaved.mask"); 2950 GroupMask = MaskForGaps ? Builder.CreateBinOp(Instruction::And, 2951 ShuffledMask, MaskForGaps) 2952 : ShuffledMask; 2953 } 2954 NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part], 2955 Group->getAlign(), GroupMask); 2956 } else 2957 NewStoreInstr = 2958 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign()); 2959 2960 Group->addMetadata(NewStoreInstr); 2961 } 2962 } 2963 2964 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, 2965 VPReplicateRecipe *RepRecipe, 2966 const VPIteration &Instance, 2967 bool IfPredicateInstr, 2968 VPTransformState &State) { 2969 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 2970 2971 // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for 2972 // the first lane and part. 2973 if (isa<NoAliasScopeDeclInst>(Instr)) 2974 if (!Instance.isFirstIteration()) 2975 return; 2976 2977 setDebugLocFromInst(Instr); 2978 2979 // Does this instruction return a value ? 
2980 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 2981 2982 Instruction *Cloned = Instr->clone(); 2983 if (!IsVoidRetTy) 2984 Cloned->setName(Instr->getName() + ".cloned"); 2985 2986 // If the scalarized instruction contributes to the address computation of a 2987 // widen masked load/store which was in a basic block that needed predication 2988 // and is not predicated after vectorization, we can't propagate 2989 // poison-generating flags (nuw/nsw, exact, inbounds, etc.). The scalarized 2990 // instruction could feed a poison value to the base address of the widen 2991 // load/store. 2992 if (State.MayGeneratePoisonRecipes.contains(RepRecipe)) 2993 Cloned->dropPoisonGeneratingFlags(); 2994 2995 State.Builder.SetInsertPoint(Builder.GetInsertBlock(), 2996 Builder.GetInsertPoint()); 2997 // Replace the operands of the cloned instructions with their scalar 2998 // equivalents in the new loop. 2999 for (auto &I : enumerate(RepRecipe->operands())) { 3000 auto InputInstance = Instance; 3001 VPValue *Operand = I.value(); 3002 VPReplicateRecipe *OperandR = dyn_cast<VPReplicateRecipe>(Operand); 3003 if (OperandR && OperandR->isUniform()) 3004 InputInstance.Lane = VPLane::getFirstLane(); 3005 Cloned->setOperand(I.index(), State.get(Operand, InputInstance)); 3006 } 3007 addNewMetadata(Cloned, Instr); 3008 3009 // Place the cloned scalar in the new loop. 3010 Builder.Insert(Cloned); 3011 3012 State.set(RepRecipe, Cloned, Instance); 3013 3014 // If we just cloned a new assumption, add it the assumption cache. 3015 if (auto *II = dyn_cast<AssumeInst>(Cloned)) 3016 AC->registerAssumption(II); 3017 3018 // End if-block. 3019 if (IfPredicateInstr) 3020 PredicatedInstructions.push_back(Cloned); 3021 } 3022 3023 void InnerLoopVectorizer::createHeaderBranch(Loop *L) { 3024 BasicBlock *Header = L->getHeader(); 3025 assert(!L->getLoopLatch() && "loop should not have a latch at this point"); 3026 3027 IRBuilder<> B(Header->getTerminator()); 3028 Instruction *OldInst = 3029 getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()); 3030 setDebugLocFromInst(OldInst, &B); 3031 3032 // Connect the header to the exit and header blocks and replace the old 3033 // terminator. 3034 B.CreateCondBr(B.getTrue(), L->getUniqueExitBlock(), Header); 3035 3036 // Now we have two terminators. Remove the old one from the block. 3037 Header->getTerminator()->eraseFromParent(); 3038 } 3039 3040 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { 3041 if (TripCount) 3042 return TripCount; 3043 3044 assert(L && "Create Trip Count for null loop."); 3045 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 3046 // Find the loop boundaries. 3047 ScalarEvolution *SE = PSE.getSE(); 3048 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 3049 assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && 3050 "Invalid loop count"); 3051 3052 Type *IdxTy = Legal->getWidestInductionType(); 3053 assert(IdxTy && "No type for induction"); 3054 3055 // The exit count might have the type of i64 while the phi is i32. This can 3056 // happen if we have an induction variable that is sign extended before the 3057 // compare. The only way that we get a backedge taken count is that the 3058 // induction variable was signed and as such will not overflow. In such a case 3059 // truncation is legal. 
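  // A rough example of the situation described above: with an i32 induction
  // that is sign-extended before the compare,
  //   for (i32 %i = 0; sext(%i) slt %n.i64; ++%i) { ... }
  // the backedge-taken count is an i64 expression while the widest induction
  // type is i32; since the signed induction cannot wrap, truncating the count
  // back to i32 does not change its value.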
3060 if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) > 3061 IdxTy->getPrimitiveSizeInBits()) 3062 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy); 3063 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); 3064 3065 // Get the total trip count from the count by adding 1. 3066 const SCEV *ExitCount = SE->getAddExpr( 3067 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 3068 3069 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); 3070 3071 // Expand the trip count and place the new instructions in the preheader. 3072 // Notice that the pre-header does not change, only the loop body. 3073 SCEVExpander Exp(*SE, DL, "induction"); 3074 3075 // Count holds the overall loop count (N). 3076 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), 3077 L->getLoopPreheader()->getTerminator()); 3078 3079 if (TripCount->getType()->isPointerTy()) 3080 TripCount = 3081 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", 3082 L->getLoopPreheader()->getTerminator()); 3083 3084 return TripCount; 3085 } 3086 3087 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { 3088 if (VectorTripCount) 3089 return VectorTripCount; 3090 3091 Value *TC = getOrCreateTripCount(L); 3092 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 3093 3094 Type *Ty = TC->getType(); 3095 // This is where we can make the step a runtime constant. 3096 Value *Step = createStepForVF(Builder, Ty, VF, UF); 3097 3098 // If the tail is to be folded by masking, round the number of iterations N 3099 // up to a multiple of Step instead of rounding down. This is done by first 3100 // adding Step-1 and then rounding down. Note that it's ok if this addition 3101 // overflows: the vector induction variable will eventually wrap to zero given 3102 // that it starts at zero and its Step is a power of two; the loop will then 3103 // exit, with the last early-exit vector comparison also producing all-true. 3104 if (Cost->foldTailByMasking()) { 3105 assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && 3106 "VF*UF must be a power of 2 when folding tail by masking"); 3107 Value *NumLanes = getRuntimeVF(Builder, Ty, VF * UF); 3108 TC = Builder.CreateAdd( 3109 TC, Builder.CreateSub(NumLanes, ConstantInt::get(Ty, 1)), "n.rnd.up"); 3110 } 3111 3112 // Now we need to generate the expression for the part of the loop that the 3113 // vectorized body will execute. This is equal to N - (N % Step) if scalar 3114 // iterations are not required for correctness, or N - Step, otherwise. Step 3115 // is equal to the vectorization factor (number of SIMD elements) times the 3116 // unroll factor (number of SIMD instructions). 3117 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); 3118 3119 // There are cases where we *must* run at least one iteration in the remainder 3120 // loop. See the cost model for when this can happen. If the step evenly 3121 // divides the trip count, we set the remainder to be equal to the step. If 3122 // the step does not evenly divide the trip count, no adjustment is necessary 3123 // since there will already be scalar iterations. Note that the minimum 3124 // iterations check ensures that N >= Step. 
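  // Worked example (illustrative): with trip count N = 16 and Step = VF * UF
  // = 8, R = N urem Step = 0, so when a scalar epilogue is required R is
  // bumped to 8 and n.vec = 16 - 8 = 8, leaving 8 iterations for the scalar
  // loop. With N = 17 instead, R = 1 already guarantees a scalar iteration,
  // so no adjustment is made and n.vec = 16.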
3125 if (Cost->requiresScalarEpilogue(VF)) { 3126 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); 3127 R = Builder.CreateSelect(IsZero, Step, R); 3128 } 3129 3130 VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); 3131 3132 return VectorTripCount; 3133 } 3134 3135 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, 3136 const DataLayout &DL) { 3137 // Verify that V is a vector type with same number of elements as DstVTy. 3138 auto *DstFVTy = cast<FixedVectorType>(DstVTy); 3139 unsigned VF = DstFVTy->getNumElements(); 3140 auto *SrcVecTy = cast<FixedVectorType>(V->getType()); 3141 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); 3142 Type *SrcElemTy = SrcVecTy->getElementType(); 3143 Type *DstElemTy = DstFVTy->getElementType(); 3144 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 3145 "Vector elements must have same size"); 3146 3147 // Do a direct cast if element types are castable. 3148 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 3149 return Builder.CreateBitOrPointerCast(V, DstFVTy); 3150 } 3151 // V cannot be directly casted to desired vector type. 3152 // May happen when V is a floating point vector but DstVTy is a vector of 3153 // pointers or vice-versa. Handle this using a two-step bitcast using an 3154 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 3155 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 3156 "Only one type should be a pointer type"); 3157 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 3158 "Only one type should be a floating point type"); 3159 Type *IntTy = 3160 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 3161 auto *VecIntTy = FixedVectorType::get(IntTy, VF); 3162 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 3163 return Builder.CreateBitOrPointerCast(CastVal, DstFVTy); 3164 } 3165 3166 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, 3167 BasicBlock *Bypass) { 3168 Value *Count = getOrCreateTripCount(L); 3169 // Reuse existing vector loop preheader for TC checks. 3170 // Note that new preheader block is generated for vector loop. 3171 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 3172 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 3173 3174 // Generate code to check if the loop's trip count is less than VF * UF, or 3175 // equal to it in case a scalar epilogue is required; this implies that the 3176 // vector trip count is zero. This check also covers the case where adding one 3177 // to the backedge-taken count overflowed leading to an incorrect trip count 3178 // of zero. In this case we will also jump to the scalar loop. 3179 auto P = Cost->requiresScalarEpilogue(VF) ? ICmpInst::ICMP_ULE 3180 : ICmpInst::ICMP_ULT; 3181 3182 // If tail is to be folded, vector loop takes care of all iterations. 3183 Value *CheckMinIters = Builder.getFalse(); 3184 if (!Cost->foldTailByMasking()) { 3185 Value *Step = createStepForVF(Builder, Count->getType(), VF, UF); 3186 CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check"); 3187 } 3188 // Create new preheader for vector loop. 
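  // Roughly, after this split and the ReplaceInstWithInst below, the old
  // preheader (TCCheckBlock) ends with
  //   br i1 %min.iters.check, label %scalar.ph, label %vector.ph
  // (names illustrative), and the new vector.ph branches on to the vector
  // loop body.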
3189 LoopVectorPreHeader = 3190 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, 3191 "vector.ph"); 3192 3193 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 3194 DT->getNode(Bypass)->getIDom()) && 3195 "TC check is expected to dominate Bypass"); 3196 3197 // Update dominator for Bypass & LoopExit (if needed). 3198 DT->changeImmediateDominator(Bypass, TCCheckBlock); 3199 if (!Cost->requiresScalarEpilogue(VF)) 3200 // If there is an epilogue which must run, there's no edge from the 3201 // middle block to exit blocks and thus no need to update the immediate 3202 // dominator of the exit blocks. 3203 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 3204 3205 ReplaceInstWithInst( 3206 TCCheckBlock->getTerminator(), 3207 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 3208 LoopBypassBlocks.push_back(TCCheckBlock); 3209 } 3210 3211 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { 3212 3213 BasicBlock *const SCEVCheckBlock = 3214 RTChecks.emitSCEVChecks(L, Bypass, LoopVectorPreHeader, LoopExitBlock); 3215 if (!SCEVCheckBlock) 3216 return nullptr; 3217 3218 assert(!(SCEVCheckBlock->getParent()->hasOptSize() || 3219 (OptForSizeBasedOnProfile && 3220 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) && 3221 "Cannot SCEV check stride or overflow when optimizing for size"); 3222 3223 3224 // Update dominator only if this is first RT check. 3225 if (LoopBypassBlocks.empty()) { 3226 DT->changeImmediateDominator(Bypass, SCEVCheckBlock); 3227 if (!Cost->requiresScalarEpilogue(VF)) 3228 // If there is an epilogue which must run, there's no edge from the 3229 // middle block to exit blocks and thus no need to update the immediate 3230 // dominator of the exit blocks. 3231 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock); 3232 } 3233 3234 LoopBypassBlocks.push_back(SCEVCheckBlock); 3235 AddedSafetyChecks = true; 3236 return SCEVCheckBlock; 3237 } 3238 3239 BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, 3240 BasicBlock *Bypass) { 3241 // VPlan-native path does not do any analysis for runtime checks currently. 3242 if (EnableVPlanNativePath) 3243 return nullptr; 3244 3245 BasicBlock *const MemCheckBlock = 3246 RTChecks.emitMemRuntimeChecks(L, Bypass, LoopVectorPreHeader); 3247 3248 // Check if we generated code that checks in runtime if arrays overlap. We put 3249 // the checks into a separate block to make the more common case of few 3250 // elements faster. 3251 if (!MemCheckBlock) 3252 return nullptr; 3253 3254 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) { 3255 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && 3256 "Cannot emit memory checks when optimizing for size, unless forced " 3257 "to vectorize."); 3258 ORE->emit([&]() { 3259 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize", 3260 L->getStartLoc(), L->getHeader()) 3261 << "Code-size may be reduced by not forcing " 3262 "vectorization, or by source-code modifications " 3263 "eliminating the need for runtime checks " 3264 "(e.g., adding 'restrict')."; 3265 }); 3266 } 3267 3268 LoopBypassBlocks.push_back(MemCheckBlock); 3269 3270 AddedSafetyChecks = true; 3271 3272 // We currently don't use LoopVersioning for the actual loop cloning but we 3273 // still use it to add the noalias metadata. 
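  // A sketch of the intended effect (assuming LoopVersioning's usual metadata
  // handling): memory accesses in the vector loop get !alias.scope/!noalias
  // metadata derived from the runtime pointer-check groups, so later passes
  // may assume that the pointers proven disjoint by the checks do not alias.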
  LVer = std::make_unique<LoopVersioning>(
      *Legal->getLAI(),
      Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI,
      DT, PSE.getSE());
  LVer->prepareNoAliasMetadata();
  return MemCheckBlock;
}

Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
  LoopScalarBody = OrigLoop->getHeader();
  LoopVectorPreHeader = OrigLoop->getLoopPreheader();
  assert(LoopVectorPreHeader && "Invalid loop structure");
  LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
  assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF)) &&
         "multiple exit loop without required epilogue?");

  LoopMiddleBlock =
      SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
                 LI, nullptr, Twine(Prefix) + "middle.block");
  LoopScalarPreHeader =
      SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
                 nullptr, Twine(Prefix) + "scalar.ph");

  auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();

  // Set up the middle block terminator. Two cases:
  // 1) If we know that we must execute the scalar epilogue, emit an
  //    unconditional branch.
  // 2) Otherwise, we must have a single unique exit block (due to how we
  //    implement the multiple exit case). In this case, set up a conditional
  //    branch from the middle block to the loop scalar preheader, and the
  //    exit block. completeLoopSkeleton will update the condition to use an
  //    iteration check, if required to decide whether to execute the remainder.
  BranchInst *BrInst =
      Cost->requiresScalarEpilogue(VF)
          ? BranchInst::Create(LoopScalarPreHeader)
          : BranchInst::Create(LoopExitBlock, LoopScalarPreHeader,
                               Builder.getTrue());
  BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
  ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);

  // We intentionally don't let SplitBlock update LoopInfo since
  // LoopVectorBody should belong to a different loop than LoopVectorPreHeader.
  // LoopVectorBody is explicitly added to the correct place a few lines later.
  LoopVectorBody =
      SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
                 nullptr, nullptr, Twine(Prefix) + "vector.body");

  // Update dominator for loop exit.
  if (!Cost->requiresScalarEpilogue(VF))
    // If there is an epilogue which must run, there's no edge from the
    // middle block to exit blocks and thus no need to update the immediate
    // dominator of the exit blocks.
    DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);

  // Create and register the new vector loop.
  Loop *Lp = LI->AllocateLoop();
  Loop *ParentLoop = OrigLoop->getParentLoop();

  // Insert the new loop into the loop nest and register the new basic blocks
  // before calling any utilities such as SCEV that require valid LoopInfo.
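  // For instance, if OrigLoop was nested inside a parent loop P, the nest
  // becomes (sketch):
  //   P
  //   |- OrigLoop (scalar remainder loop)
  //   |- Lp       (new vector loop, containing only LoopVectorBody for now)
  // Otherwise Lp becomes a new top-level loop alongside OrigLoop.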
3334 if (ParentLoop) { 3335 ParentLoop->addChildLoop(Lp); 3336 } else { 3337 LI->addTopLevelLoop(Lp); 3338 } 3339 Lp->addBasicBlockToLoop(LoopVectorBody, *LI); 3340 return Lp; 3341 } 3342 3343 void InnerLoopVectorizer::createInductionResumeValues( 3344 Loop *L, std::pair<BasicBlock *, Value *> AdditionalBypass) { 3345 assert(((AdditionalBypass.first && AdditionalBypass.second) || 3346 (!AdditionalBypass.first && !AdditionalBypass.second)) && 3347 "Inconsistent information about additional bypass."); 3348 3349 Value *VectorTripCount = getOrCreateVectorTripCount(L); 3350 assert(VectorTripCount && L && "Expected valid arguments"); 3351 // We are going to resume the execution of the scalar loop. 3352 // Go over all of the induction variables that we found and fix the 3353 // PHIs that are left in the scalar version of the loop. 3354 // The starting values of PHI nodes depend on the counter of the last 3355 // iteration in the vectorized loop. 3356 // If we come from a bypass edge then we need to start from the original 3357 // start value. 3358 Instruction *OldInduction = Legal->getPrimaryInduction(); 3359 for (auto &InductionEntry : Legal->getInductionVars()) { 3360 PHINode *OrigPhi = InductionEntry.first; 3361 InductionDescriptor II = InductionEntry.second; 3362 3363 // Create phi nodes to merge from the backedge-taken check block. 3364 PHINode *BCResumeVal = 3365 PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", 3366 LoopScalarPreHeader->getTerminator()); 3367 // Copy original phi DL over to the new one. 3368 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); 3369 Value *&EndValue = IVEndValues[OrigPhi]; 3370 Value *EndValueFromAdditionalBypass = AdditionalBypass.second; 3371 if (OrigPhi == OldInduction) { 3372 // We know what the end value is. 3373 EndValue = VectorTripCount; 3374 } else { 3375 IRBuilder<> B(L->getLoopPreheader()->getTerminator()); 3376 3377 // Fast-math-flags propagate from the original induction instruction. 3378 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) 3379 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); 3380 3381 Type *StepType = II.getStep()->getType(); 3382 Instruction::CastOps CastOp = 3383 CastInst::getCastOpcode(VectorTripCount, true, StepType, true); 3384 Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd"); 3385 Value *Step = 3386 CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint()); 3387 EndValue = emitTransformedIndex(B, CRD, II.getStartValue(), Step, II); 3388 EndValue->setName("ind.end"); 3389 3390 // Compute the end value for the additional bypass (if applicable). 3391 if (AdditionalBypass.first) { 3392 B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt())); 3393 CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true, 3394 StepType, true); 3395 Value *Step = 3396 CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint()); 3397 CRD = 3398 B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd"); 3399 EndValueFromAdditionalBypass = 3400 emitTransformedIndex(B, CRD, II.getStartValue(), Step, II); 3401 EndValueFromAdditionalBypass->setName("ind.end"); 3402 } 3403 } 3404 // The new PHI merges the original incoming value, in case of a bypass, 3405 // or the value at the end of the vectorized loop. 3406 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); 3407 3408 // Fix the scalar body counter (PHI node). 3409 // The old induction's phi node in the scalar body needs the truncated 3410 // value. 
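    // The phi being built here looks roughly like (illustrative names):
    //   scalar.ph:
    //     %bc.resume.val = phi [ %ind.end, %middle.block ],
    //                          [ %start,   <each bypass block> ]
    // i.e. the scalar loop resumes from the end of the vector iterations when
    // reached via the middle block, or from the original start value when the
    // vector loop was bypassed.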
3411 for (BasicBlock *BB : LoopBypassBlocks) 3412 BCResumeVal->addIncoming(II.getStartValue(), BB); 3413 3414 if (AdditionalBypass.first) 3415 BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first, 3416 EndValueFromAdditionalBypass); 3417 3418 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); 3419 } 3420 } 3421 3422 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L, 3423 MDNode *OrigLoopID) { 3424 assert(L && "Expected valid loop."); 3425 3426 // The trip counts should be cached by now. 3427 Value *Count = getOrCreateTripCount(L); 3428 Value *VectorTripCount = getOrCreateVectorTripCount(L); 3429 3430 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3431 3432 // Add a check in the middle block to see if we have completed 3433 // all of the iterations in the first vector loop. Three cases: 3434 // 1) If we require a scalar epilogue, there is no conditional branch as 3435 // we unconditionally branch to the scalar preheader. Do nothing. 3436 // 2) If (N - N%VF) == N, then we *don't* need to run the remainder. 3437 // Thus if tail is to be folded, we know we don't need to run the 3438 // remainder and we can use the previous value for the condition (true). 3439 // 3) Otherwise, construct a runtime check. 3440 if (!Cost->requiresScalarEpilogue(VF) && !Cost->foldTailByMasking()) { 3441 Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, 3442 Count, VectorTripCount, "cmp.n", 3443 LoopMiddleBlock->getTerminator()); 3444 3445 // Here we use the same DebugLoc as the scalar loop latch terminator instead 3446 // of the corresponding compare because they may have ended up with 3447 // different line numbers and we want to avoid awkward line stepping while 3448 // debugging. Eg. if the compare has got a line number inside the loop. 3449 CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3450 cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN); 3451 } 3452 3453 // Get ready to start creating new instructions into the vectorized body. 3454 assert(LoopVectorPreHeader == L->getLoopPreheader() && 3455 "Inconsistent vector loop preheader"); 3456 Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); 3457 3458 #ifdef EXPENSIVE_CHECKS 3459 assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 3460 LI->verify(*DT); 3461 #endif 3462 3463 return LoopVectorPreHeader; 3464 } 3465 3466 std::pair<BasicBlock *, Value *> 3467 InnerLoopVectorizer::createVectorizedLoopSkeleton() { 3468 /* 3469 In this function we generate a new loop. The new loop will contain 3470 the vectorized instructions while the old loop will continue to run the 3471 scalar remainder. 3472 3473 [ ] <-- loop iteration number check. 3474 / | 3475 / v 3476 | [ ] <-- vector loop bypass (may consist of multiple blocks). 3477 | / | 3478 | / v 3479 || [ ] <-- vector pre header. 3480 |/ | 3481 | v 3482 | [ ] \ 3483 | [ ]_| <-- vector loop. 3484 | | 3485 | v 3486 \ -[ ] <--- middle-block. 3487 \/ | 3488 /\ v 3489 | ->[ ] <--- new preheader. 3490 | | 3491 (opt) v <-- edge from middle to exit iff epilogue is not required. 3492 | [ ] \ 3493 | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue). 3494 \ | 3495 \ v 3496 >[ ] <-- exit block(s). 3497 ... 3498 */ 3499 3500 // Get the metadata of the original loop before it gets modified. 3501 MDNode *OrigLoopID = OrigLoop->getLoopID(); 3502 3503 // Workaround! Compute the trip count of the original loop and cache it 3504 // before we start modifying the CFG. 
This code has a systemic problem 3505 // wherein it tries to run analysis over partially constructed IR; this is 3506 // wrong, and not simply for SCEV. The trip count of the original loop 3507 // simply happens to be prone to hitting this in practice. In theory, we 3508 // can hit the same issue for any SCEV, or ValueTracking query done during 3509 // mutation. See PR49900. 3510 getOrCreateTripCount(OrigLoop); 3511 3512 // Create an empty vector loop, and prepare basic blocks for the runtime 3513 // checks. 3514 Loop *Lp = createVectorLoopSkeleton(""); 3515 3516 // Now, compare the new count to zero. If it is zero skip the vector loop and 3517 // jump to the scalar loop. This check also covers the case where the 3518 // backedge-taken count is uint##_max: adding one to it will overflow leading 3519 // to an incorrect trip count of zero. In this (rare) case we will also jump 3520 // to the scalar loop. 3521 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader); 3522 3523 // Generate the code to check any assumptions that we've made for SCEV 3524 // expressions. 3525 emitSCEVChecks(Lp, LoopScalarPreHeader); 3526 3527 // Generate the code that checks in runtime if arrays overlap. We put the 3528 // checks into a separate block to make the more common case of few elements 3529 // faster. 3530 emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 3531 3532 createHeaderBranch(Lp); 3533 3534 // Emit phis for the new starting index of the scalar loop. 3535 createInductionResumeValues(Lp); 3536 3537 return {completeLoopSkeleton(Lp, OrigLoopID), nullptr}; 3538 } 3539 3540 // Fix up external users of the induction variable. At this point, we are 3541 // in LCSSA form, with all external PHIs that use the IV having one input value, 3542 // coming from the remainder loop. We need those PHIs to also have a correct 3543 // value for the IV when arriving directly from the middle block. 3544 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, 3545 const InductionDescriptor &II, 3546 Value *CountRoundDown, Value *EndValue, 3547 BasicBlock *MiddleBlock) { 3548 // There are two kinds of external IV usages - those that use the value 3549 // computed in the last iteration (the PHI) and those that use the penultimate 3550 // value (the value that feeds into the phi from the loop latch). 3551 // We allow both, but they, obviously, have different values. 3552 3553 assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block"); 3554 3555 DenseMap<Value *, Value *> MissingVals; 3556 3557 // An external user of the last iteration's value should see the value that 3558 // the remainder loop uses to initialize its own IV. 3559 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch()); 3560 for (User *U : PostInc->users()) { 3561 Instruction *UI = cast<Instruction>(U); 3562 if (!OrigLoop->contains(UI)) { 3563 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3564 MissingVals[UI] = EndValue; 3565 } 3566 } 3567 3568 // An external user of the penultimate value need to see EndValue - Step. 3569 // The simplest way to get this is to recompute it from the constituent SCEVs, 3570 // that is Start + (Step * (CRD - 1)). 3571 for (User *U : OrigPhi->users()) { 3572 auto *UI = cast<Instruction>(U); 3573 if (!OrigLoop->contains(UI)) { 3574 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3575 3576 IRBuilder<> B(MiddleBlock->getTerminator()); 3577 3578 // Fast-math-flags propagate from the original induction instruction. 
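      // Hypothetical example of the computation below: for an induction with
      // Start = 0, Step = 2 and CountRoundDown = 8 executed iterations, a
      // user of the phi itself (the penultimate value) must see
      // Start + Step * (CountRoundDown - 1) = 14, which emitTransformedIndex
      // reconstructs from CMO = CountRoundDown - 1.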
3579 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) 3580 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); 3581 3582 Value *CountMinusOne = B.CreateSub( 3583 CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1)); 3584 Value *CMO = 3585 !II.getStep()->getType()->isIntegerTy() 3586 ? B.CreateCast(Instruction::SIToFP, CountMinusOne, 3587 II.getStep()->getType()) 3588 : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType()); 3589 CMO->setName("cast.cmo"); 3590 3591 Value *Step = CreateStepValue(II.getStep(), *PSE.getSE(), 3592 LoopVectorBody->getTerminator()); 3593 Value *Escape = 3594 emitTransformedIndex(B, CMO, II.getStartValue(), Step, II); 3595 Escape->setName("ind.escape"); 3596 MissingVals[UI] = Escape; 3597 } 3598 } 3599 3600 for (auto &I : MissingVals) { 3601 PHINode *PHI = cast<PHINode>(I.first); 3602 // One corner case we have to handle is two IVs "chasing" each-other, 3603 // that is %IV2 = phi [...], [ %IV1, %latch ] 3604 // In this case, if IV1 has an external use, we need to avoid adding both 3605 // "last value of IV1" and "penultimate value of IV2". So, verify that we 3606 // don't already have an incoming value for the middle block. 3607 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) 3608 PHI->addIncoming(I.second, MiddleBlock); 3609 } 3610 } 3611 3612 namespace { 3613 3614 struct CSEDenseMapInfo { 3615 static bool canHandle(const Instruction *I) { 3616 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || 3617 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); 3618 } 3619 3620 static inline Instruction *getEmptyKey() { 3621 return DenseMapInfo<Instruction *>::getEmptyKey(); 3622 } 3623 3624 static inline Instruction *getTombstoneKey() { 3625 return DenseMapInfo<Instruction *>::getTombstoneKey(); 3626 } 3627 3628 static unsigned getHashValue(const Instruction *I) { 3629 assert(canHandle(I) && "Unknown instruction!"); 3630 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), 3631 I->value_op_end())); 3632 } 3633 3634 static bool isEqual(const Instruction *LHS, const Instruction *RHS) { 3635 if (LHS == getEmptyKey() || RHS == getEmptyKey() || 3636 LHS == getTombstoneKey() || RHS == getTombstoneKey()) 3637 return LHS == RHS; 3638 return LHS->isIdenticalTo(RHS); 3639 } 3640 }; 3641 3642 } // end anonymous namespace 3643 3644 ///Perform cse of induction variable instructions. 3645 static void cse(BasicBlock *BB) { 3646 // Perform simple cse. 3647 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 3648 for (Instruction &In : llvm::make_early_inc_range(*BB)) { 3649 if (!CSEDenseMapInfo::canHandle(&In)) 3650 continue; 3651 3652 // Check if we can replace this instruction with any of the 3653 // visited instructions. 3654 if (Instruction *V = CSEMap.lookup(&In)) { 3655 In.replaceAllUsesWith(V); 3656 In.eraseFromParent(); 3657 continue; 3658 } 3659 3660 CSEMap[&In] = &In; 3661 } 3662 } 3663 3664 InstructionCost 3665 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF, 3666 bool &NeedToScalarize) const { 3667 Function *F = CI->getCalledFunction(); 3668 Type *ScalarRetTy = CI->getType(); 3669 SmallVector<Type *, 4> Tys, ScalarTys; 3670 for (auto &ArgOp : CI->args()) 3671 ScalarTys.push_back(ArgOp->getType()); 3672 3673 // Estimate cost of scalarized vector call. 
The source operands are assumed 3674 // to be vectors, so we need to extract individual elements from there, 3675 // execute VF scalar calls, and then gather the result into the vector return 3676 // value. 3677 InstructionCost ScalarCallCost = 3678 TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput); 3679 if (VF.isScalar()) 3680 return ScalarCallCost; 3681 3682 // Compute corresponding vector type for return value and arguments. 3683 Type *RetTy = ToVectorTy(ScalarRetTy, VF); 3684 for (Type *ScalarTy : ScalarTys) 3685 Tys.push_back(ToVectorTy(ScalarTy, VF)); 3686 3687 // Compute costs of unpacking argument values for the scalar calls and 3688 // packing the return values to a vector. 3689 InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF); 3690 3691 InstructionCost Cost = 3692 ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost; 3693 3694 // If we can't emit a vector call for this function, then the currently found 3695 // cost is the cost we need to return. 3696 NeedToScalarize = true; 3697 VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 3698 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); 3699 3700 if (!TLI || CI->isNoBuiltin() || !VecFunc) 3701 return Cost; 3702 3703 // If the corresponding vector cost is cheaper, return its cost. 3704 InstructionCost VectorCallCost = 3705 TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput); 3706 if (VectorCallCost < Cost) { 3707 NeedToScalarize = false; 3708 Cost = VectorCallCost; 3709 } 3710 return Cost; 3711 } 3712 3713 static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) { 3714 if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy())) 3715 return Elt; 3716 return VectorType::get(Elt, VF); 3717 } 3718 3719 InstructionCost 3720 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, 3721 ElementCount VF) const { 3722 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 3723 assert(ID && "Expected intrinsic call!"); 3724 Type *RetTy = MaybeVectorizeType(CI->getType(), VF); 3725 FastMathFlags FMF; 3726 if (auto *FPMO = dyn_cast<FPMathOperator>(CI)) 3727 FMF = FPMO->getFastMathFlags(); 3728 3729 SmallVector<const Value *> Arguments(CI->args()); 3730 FunctionType *FTy = CI->getCalledFunction()->getFunctionType(); 3731 SmallVector<Type *> ParamTys; 3732 std::transform(FTy->param_begin(), FTy->param_end(), 3733 std::back_inserter(ParamTys), 3734 [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); }); 3735 3736 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF, 3737 dyn_cast<IntrinsicInst>(CI)); 3738 return TTI.getIntrinsicInstrCost(CostAttrs, 3739 TargetTransformInfo::TCK_RecipThroughput); 3740 } 3741 3742 static Type *smallestIntegerVectorType(Type *T1, Type *T2) { 3743 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3744 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3745 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; 3746 } 3747 3748 static Type *largestIntegerVectorType(Type *T1, Type *T2) { 3749 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3750 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3751 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; 3752 } 3753 3754 void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) { 3755 // For every instruction `I` in MinBWs, truncate the operands, create a 3756 // truncated version of `I` and reextend its result. 
InstCombine runs 3757 // later and will remove any ext/trunc pairs. 3758 SmallPtrSet<Value *, 4> Erased; 3759 for (const auto &KV : Cost->getMinimalBitwidths()) { 3760 // If the value wasn't vectorized, we must maintain the original scalar 3761 // type. The absence of the value from State indicates that it 3762 // wasn't vectorized. 3763 // FIXME: Should not rely on getVPValue at this point. 3764 VPValue *Def = State.Plan->getVPValue(KV.first, true); 3765 if (!State.hasAnyVectorValue(Def)) 3766 continue; 3767 for (unsigned Part = 0; Part < UF; ++Part) { 3768 Value *I = State.get(Def, Part); 3769 if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I)) 3770 continue; 3771 Type *OriginalTy = I->getType(); 3772 Type *ScalarTruncatedTy = 3773 IntegerType::get(OriginalTy->getContext(), KV.second); 3774 auto *TruncatedTy = VectorType::get( 3775 ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getElementCount()); 3776 if (TruncatedTy == OriginalTy) 3777 continue; 3778 3779 IRBuilder<> B(cast<Instruction>(I)); 3780 auto ShrinkOperand = [&](Value *V) -> Value * { 3781 if (auto *ZI = dyn_cast<ZExtInst>(V)) 3782 if (ZI->getSrcTy() == TruncatedTy) 3783 return ZI->getOperand(0); 3784 return B.CreateZExtOrTrunc(V, TruncatedTy); 3785 }; 3786 3787 // The actual instruction modification depends on the instruction type, 3788 // unfortunately. 3789 Value *NewI = nullptr; 3790 if (auto *BO = dyn_cast<BinaryOperator>(I)) { 3791 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)), 3792 ShrinkOperand(BO->getOperand(1))); 3793 3794 // Any wrapping introduced by shrinking this operation shouldn't be 3795 // considered undefined behavior. So, we can't unconditionally copy 3796 // arithmetic wrapping flags to NewI. 3797 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false); 3798 } else if (auto *CI = dyn_cast<ICmpInst>(I)) { 3799 NewI = 3800 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)), 3801 ShrinkOperand(CI->getOperand(1))); 3802 } else if (auto *SI = dyn_cast<SelectInst>(I)) { 3803 NewI = B.CreateSelect(SI->getCondition(), 3804 ShrinkOperand(SI->getTrueValue()), 3805 ShrinkOperand(SI->getFalseValue())); 3806 } else if (auto *CI = dyn_cast<CastInst>(I)) { 3807 switch (CI->getOpcode()) { 3808 default: 3809 llvm_unreachable("Unhandled cast!"); 3810 case Instruction::Trunc: 3811 NewI = ShrinkOperand(CI->getOperand(0)); 3812 break; 3813 case Instruction::SExt: 3814 NewI = B.CreateSExtOrTrunc( 3815 CI->getOperand(0), 3816 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3817 break; 3818 case Instruction::ZExt: 3819 NewI = B.CreateZExtOrTrunc( 3820 CI->getOperand(0), 3821 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3822 break; 3823 } 3824 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { 3825 auto Elements0 = 3826 cast<VectorType>(SI->getOperand(0)->getType())->getElementCount(); 3827 auto *O0 = B.CreateZExtOrTrunc( 3828 SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0)); 3829 auto Elements1 = 3830 cast<VectorType>(SI->getOperand(1)->getType())->getElementCount(); 3831 auto *O1 = B.CreateZExtOrTrunc( 3832 SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1)); 3833 3834 NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask()); 3835 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) { 3836 // Don't do anything with the operands, just extend the result. 
3837 continue; 3838 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { 3839 auto Elements = 3840 cast<VectorType>(IE->getOperand(0)->getType())->getElementCount(); 3841 auto *O0 = B.CreateZExtOrTrunc( 3842 IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 3843 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); 3844 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); 3845 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { 3846 auto Elements = 3847 cast<VectorType>(EE->getOperand(0)->getType())->getElementCount(); 3848 auto *O0 = B.CreateZExtOrTrunc( 3849 EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 3850 NewI = B.CreateExtractElement(O0, EE->getOperand(2)); 3851 } else { 3852 // If we don't know what to do, be conservative and don't do anything. 3853 continue; 3854 } 3855 3856 // Lastly, extend the result. 3857 NewI->takeName(cast<Instruction>(I)); 3858 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); 3859 I->replaceAllUsesWith(Res); 3860 cast<Instruction>(I)->eraseFromParent(); 3861 Erased.insert(I); 3862 State.reset(Def, Res, Part); 3863 } 3864 } 3865 3866 // We'll have created a bunch of ZExts that are now parentless. Clean up. 3867 for (const auto &KV : Cost->getMinimalBitwidths()) { 3868 // If the value wasn't vectorized, we must maintain the original scalar 3869 // type. The absence of the value from State indicates that it 3870 // wasn't vectorized. 3871 // FIXME: Should not rely on getVPValue at this point. 3872 VPValue *Def = State.Plan->getVPValue(KV.first, true); 3873 if (!State.hasAnyVectorValue(Def)) 3874 continue; 3875 for (unsigned Part = 0; Part < UF; ++Part) { 3876 Value *I = State.get(Def, Part); 3877 ZExtInst *Inst = dyn_cast<ZExtInst>(I); 3878 if (Inst && Inst->use_empty()) { 3879 Value *NewI = Inst->getOperand(0); 3880 Inst->eraseFromParent(); 3881 State.reset(Def, NewI, Part); 3882 } 3883 } 3884 } 3885 } 3886 3887 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) { 3888 // Insert truncates and extends for any truncated instructions as hints to 3889 // InstCombine. 3890 if (VF.isVector()) 3891 truncateToMinimalBitwidths(State); 3892 3893 // Fix widened non-induction PHIs by setting up the PHI operands. 3894 if (OrigPHIsToFix.size()) { 3895 assert(EnableVPlanNativePath && 3896 "Unexpected non-induction PHIs for fixup in non VPlan-native path"); 3897 fixNonInductionPHIs(State); 3898 } 3899 3900 // At this point every instruction in the original loop is widened to a 3901 // vector form. Now we need to fix the recurrences in the loop. These PHI 3902 // nodes are currently empty because we did not want to introduce cycles. 3903 // This is the second stage of vectorizing recurrences. 3904 fixCrossIterationPHIs(State); 3905 3906 // Forget the original basic block. 3907 PSE.getSE()->forgetLoop(OrigLoop); 3908 3909 // If we inserted an edge from the middle block to the unique exit block, 3910 // update uses outside the loop (phis) to account for the newly inserted 3911 // edge. 3912 if (!Cost->requiresScalarEpilogue(VF)) { 3913 // Fix-up external users of the induction variables. 3914 for (auto &Entry : Legal->getInductionVars()) 3915 fixupIVUsers(Entry.first, Entry.second, 3916 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)), 3917 IVEndValues[Entry.first], LoopMiddleBlock); 3918 3919 fixLCSSAPHIs(State); 3920 } 3921 3922 for (Instruction *PI : PredicatedInstructions) 3923 sinkScalarOperands(&*PI); 3924 3925 // Remove redundant induction instructions. 
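  // For example (illustrative), unrolling and scalarization can leave several
  // identical getelementptr or extractelement instructions that compute the
  // same address or lane; the simple CSE below folds such duplicates into one.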
  cse(LoopVectorBody);

  // Set/update profile weights for the vector and remainder loops as original
  // loop iterations are now distributed among them. Note that the original
  // loop, represented by LoopScalarBody, becomes the remainder loop after
  // vectorization.
  //
  // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
  // end up with a slightly less accurate result, but that should be OK since
  // the profile is not inherently precise anyway. Note also that a possible
  // bypass of the vector code caused by legality checks is ignored,
  // optimistically assigning all the weight to the vector loop.
  //
  // For scalable vectorization we can't know at compile time how many
  // iterations of the loop are handled in one vector iteration, so instead
  // assume a pessimistic vscale of '1'.
  setProfileInfoAfterUnrolling(
      LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody),
      LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF);
}

void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) {
  // In order to support recurrences we need to be able to vectorize Phi nodes.
  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
  // stage #2: We now need to fix the recurrences by adding incoming edges to
  // the currently empty PHI nodes. At this point every instruction in the
  // original loop is widened to a vector form, so we can use the widened
  // values to construct the incoming edges.
  VPBasicBlock *Header = State.Plan->getEntry()->getEntryBasicBlock();
  for (VPRecipeBase &R : Header->phis()) {
    if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R))
      fixReduction(ReductionPhi, State);
    else if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
      fixFirstOrderRecurrence(FOR, State);
  }
}

void InnerLoopVectorizer::fixFirstOrderRecurrence(
    VPFirstOrderRecurrencePHIRecipe *PhiR, VPTransformState &State) {
  // This is the second phase of vectorizing first-order recurrences. An
  // overview of the transformation is described below. Suppose we have the
  // following loop.
  //
  //   for (int i = 0; i < n; ++i)
  //     b[i] = a[i] - a[i - 1];
  //
  // There is a first-order recurrence on "a". For this loop, the shorthand
  // scalar IR looks like:
  //
  //   scalar.ph:
  //     s_init = a[-1]
  //     br scalar.body
  //
  //   scalar.body:
  //     i = phi [0, scalar.ph], [i+1, scalar.body]
  //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
  //     s2 = a[i]
  //     b[i] = s2 - s1
  //     br cond, scalar.body, ...
  //
  // In this example, s1 is a recurrence because its value depends on the
  // previous iteration. In the first phase of vectorization, we created a
  // vector phi v1 for s1. We now complete the vectorization and produce the
  // shorthand vector IR shown below (for VF = 4, UF = 1).
3989 // 3990 // vector.ph: 3991 // v_init = vector(..., ..., ..., a[-1]) 3992 // br vector.body 3993 // 3994 // vector.body 3995 // i = phi [0, vector.ph], [i+4, vector.body] 3996 // v1 = phi [v_init, vector.ph], [v2, vector.body] 3997 // v2 = a[i, i+1, i+2, i+3]; 3998 // v3 = vector(v1(3), v2(0, 1, 2)) 3999 // b[i, i+1, i+2, i+3] = v2 - v3 4000 // br cond, vector.body, middle.block 4001 // 4002 // middle.block: 4003 // x = v2(3) 4004 // br scalar.ph 4005 // 4006 // scalar.ph: 4007 // s_init = phi [x, middle.block], [a[-1], otherwise] 4008 // br scalar.body 4009 // 4010 // After execution completes the vector loop, we extract the next value of 4011 // the recurrence (x) to use as the initial value in the scalar loop. 4012 4013 // Extract the last vector element in the middle block. This will be the 4014 // initial value for the recurrence when jumping to the scalar loop. 4015 VPValue *PreviousDef = PhiR->getBackedgeValue(); 4016 Value *Incoming = State.get(PreviousDef, UF - 1); 4017 auto *ExtractForScalar = Incoming; 4018 auto *IdxTy = Builder.getInt32Ty(); 4019 if (VF.isVector()) { 4020 auto *One = ConstantInt::get(IdxTy, 1); 4021 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4022 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF); 4023 auto *LastIdx = Builder.CreateSub(RuntimeVF, One); 4024 ExtractForScalar = Builder.CreateExtractElement(ExtractForScalar, LastIdx, 4025 "vector.recur.extract"); 4026 } 4027 // Extract the second last element in the middle block if the 4028 // Phi is used outside the loop. We need to extract the phi itself 4029 // and not the last element (the phi update in the current iteration). This 4030 // will be the value when jumping to the exit block from the LoopMiddleBlock, 4031 // when the scalar loop is not run at all. 4032 Value *ExtractForPhiUsedOutsideLoop = nullptr; 4033 if (VF.isVector()) { 4034 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF); 4035 auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2)); 4036 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement( 4037 Incoming, Idx, "vector.recur.extract.for.phi"); 4038 } else if (UF > 1) 4039 // When loop is unrolled without vectorizing, initialize 4040 // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value 4041 // of `Incoming`. This is analogous to the vectorized case above: extracting 4042 // the second last element when VF > 1. 4043 ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2); 4044 4045 // Fix the initial value of the original recurrence in the scalar loop. 4046 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin()); 4047 PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue()); 4048 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init"); 4049 auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue(); 4050 for (auto *BB : predecessors(LoopScalarPreHeader)) { 4051 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit; 4052 Start->addIncoming(Incoming, BB); 4053 } 4054 4055 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start); 4056 Phi->setName("scalar.recur"); 4057 4058 // Finally, fix users of the recurrence outside the loop. The users will need 4059 // either the last value of the scalar recurrence or the last value of the 4060 // vector recurrence we extracted in the middle block. Since the loop is in 4061 // LCSSA form, we just need to find all the phi nodes for the original scalar 4062 // recurrence in the exit block, and then add an edge for the middle block. 
4063   // Note that LCSSA does not imply single entry when the original scalar loop
4064   // had multiple exiting edges (as we always run the last iteration in the
4065   // scalar epilogue); in that case, there is no edge from the middle block to
4066   // the exit block, and thus no phis need to be updated.
4067   if (!Cost->requiresScalarEpilogue(VF))
4068     for (PHINode &LCSSAPhi : LoopExitBlock->phis())
4069       if (llvm::is_contained(LCSSAPhi.incoming_values(), Phi))
4070         LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
4071 }
4072
4073 void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
4074                                        VPTransformState &State) {
4075   PHINode *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
4076   // Get its reduction variable descriptor.
4077   assert(Legal->isReductionVariable(OrigPhi) &&
4078          "Unable to find the reduction variable");
4079   const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
4080
4081   RecurKind RK = RdxDesc.getRecurrenceKind();
4082   TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
4083   Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
4084   setDebugLocFromInst(ReductionStartValue);
4085
4086   VPValue *LoopExitInstDef = PhiR->getBackedgeValue();
4087   // This is the vector-clone of the value that leaves the loop.
4088   Type *VecTy = State.get(LoopExitInstDef, 0)->getType();
4089
4090   // Wrap flags are in general invalid after vectorization, clear them.
4091   clearReductionWrapFlags(RdxDesc, State);
4092
4093   // Before each round, move the insertion point right between
4094   // the PHIs and the values we are going to write.
4095   // This allows us to write both PHINodes and the extractelement
4096   // instructions.
4097   Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
4098
4099   setDebugLocFromInst(LoopExitInst);
4100
4101   Type *PhiTy = OrigPhi->getType();
4102   // If the tail is folded by masking, the vector value to leave the loop should
4103   // be a Select choosing between the vectorized LoopExitInst and the vectorized
4104   // Phi, instead of the former. For an inloop reduction the reduction will
4105   // already be predicated, and does not need to be handled here.
4106   if (Cost->foldTailByMasking() && !PhiR->isInLoop()) {
4107     for (unsigned Part = 0; Part < UF; ++Part) {
4108       Value *VecLoopExitInst = State.get(LoopExitInstDef, Part);
4109       Value *Sel = nullptr;
4110       for (User *U : VecLoopExitInst->users()) {
4111         if (isa<SelectInst>(U)) {
4112           assert(!Sel && "Reduction exit feeding two selects");
4113           Sel = U;
4114         } else
4115           assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
4116       }
4117       assert(Sel && "Reduction exit feeds no select");
4118       State.reset(LoopExitInstDef, Sel, Part);
4119
4120       // If the target can create a predicated operator for the reduction at no
4121       // extra cost in the loop (for example a predicated vadd), it can be
4122       // cheaper for the select to remain in the loop than be sunk out of it,
4123       // and so use the select value for the phi instead of the old
4124       // LoopExitValue.
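      // Illustrative shorthand of that case (assumed IR, VF = 4 used purely as
      // an example): the mask select stays in the loop and feeds the phi,
      //   %vec.phi = phi <4 x i32> [ <init>, %ph ], [ %select, %latch ]
      //   %add     = add <4 x i32> %vec.phi, %load
      //   %select  = select <4 x i1> %mask, <4 x i32> %add, <4 x i32> %vec.phi
      // so a target with predicated vector adds can fold the mask into the add
      // instead of paying for a separate select sunk out of the loop.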
4125 if (PreferPredicatedReductionSelect || 4126 TTI->preferPredicatedReductionSelect( 4127 RdxDesc.getOpcode(), PhiTy, 4128 TargetTransformInfo::ReductionFlags())) { 4129 auto *VecRdxPhi = 4130 cast<PHINode>(State.get(PhiR, Part)); 4131 VecRdxPhi->setIncomingValueForBlock( 4132 LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel); 4133 } 4134 } 4135 } 4136 4137 // If the vector reduction can be performed in a smaller type, we truncate 4138 // then extend the loop exit value to enable InstCombine to evaluate the 4139 // entire expression in the smaller type. 4140 if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) { 4141 assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!"); 4142 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); 4143 Builder.SetInsertPoint( 4144 LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator()); 4145 VectorParts RdxParts(UF); 4146 for (unsigned Part = 0; Part < UF; ++Part) { 4147 RdxParts[Part] = State.get(LoopExitInstDef, Part); 4148 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4149 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy) 4150 : Builder.CreateZExt(Trunc, VecTy); 4151 for (User *U : llvm::make_early_inc_range(RdxParts[Part]->users())) 4152 if (U != Trunc) { 4153 U->replaceUsesOfWith(RdxParts[Part], Extnd); 4154 RdxParts[Part] = Extnd; 4155 } 4156 } 4157 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 4158 for (unsigned Part = 0; Part < UF; ++Part) { 4159 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4160 State.reset(LoopExitInstDef, RdxParts[Part], Part); 4161 } 4162 } 4163 4164 // Reduce all of the unrolled parts into a single vector. 4165 Value *ReducedPartRdx = State.get(LoopExitInstDef, 0); 4166 unsigned Op = RecurrenceDescriptor::getOpcode(RK); 4167 4168 // The middle block terminator has already been assigned a DebugLoc here (the 4169 // OrigLoop's single latch terminator). We want the whole middle block to 4170 // appear to execute on this line because: (a) it is all compiler generated, 4171 // (b) these instructions are always executed after evaluating the latch 4172 // conditional branch, and (c) other passes may add new predecessors which 4173 // terminate on this line. This is the easiest way to ensure we don't 4174 // accidentally cause an extra step back into the loop while debugging. 4175 setDebugLocFromInst(LoopMiddleBlock->getTerminator()); 4176 if (PhiR->isOrdered()) 4177 ReducedPartRdx = State.get(LoopExitInstDef, UF - 1); 4178 else { 4179 // Floating-point operations should have some FMF to enable the reduction. 4180 IRBuilderBase::FastMathFlagGuard FMFG(Builder); 4181 Builder.setFastMathFlags(RdxDesc.getFastMathFlags()); 4182 for (unsigned Part = 1; Part < UF; ++Part) { 4183 Value *RdxPart = State.get(LoopExitInstDef, Part); 4184 if (Op != Instruction::ICmp && Op != Instruction::FCmp) { 4185 ReducedPartRdx = Builder.CreateBinOp( 4186 (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx"); 4187 } else if (RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK)) 4188 ReducedPartRdx = createSelectCmpOp(Builder, ReductionStartValue, RK, 4189 ReducedPartRdx, RdxPart); 4190 else 4191 ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart); 4192 } 4193 } 4194 4195 // Create the reduction after the loop. Note that inloop reductions create the 4196 // target reduction in the loop using a Reduction recipe. 
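  // Shorthand sketch of the overall result (illustrative only): for an integer
  // add reduction with UF = 2 and VF = 4, the unrolled parts were combined
  // above and createTargetReduction() below produces the scalar value, roughly:
  //   %bin.rdx = add <4 x i32> %part.1, %part.0
  //   %rdx     = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %bin.rdx)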
4197   if (VF.isVector() && !PhiR->isInLoop()) {
4198     ReducedPartRdx =
4199         createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, OrigPhi);
4200     // If the reduction can be performed in a smaller type, we need to extend
4201     // the reduction to the wider type before we branch to the original loop.
4202     if (PhiTy != RdxDesc.getRecurrenceType())
4203       ReducedPartRdx = RdxDesc.isSigned()
4204                            ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
4205                            : Builder.CreateZExt(ReducedPartRdx, PhiTy);
4206   }
4207
4208   PHINode *ResumePhi =
4209       dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue());
4210
4211   // Create a phi node that merges control-flow from the backedge-taken check
4212   // block and the middle block.
4213   PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx",
4214                                         LoopScalarPreHeader->getTerminator());
4215
4216   // If we are fixing reductions in the epilogue loop then we should already
4217   // have created a bc.merge.rdx Phi after the main vector body. Ensure that
4218   // we carry over the incoming values correctly.
4219   for (auto *Incoming : predecessors(LoopScalarPreHeader)) {
4220     if (Incoming == LoopMiddleBlock)
4221       BCBlockPhi->addIncoming(ReducedPartRdx, Incoming);
4222     else if (ResumePhi && llvm::is_contained(ResumePhi->blocks(), Incoming))
4223       BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming),
4224                               Incoming);
4225     else
4226       BCBlockPhi->addIncoming(ReductionStartValue, Incoming);
4227   }
4228
4229   // Set the resume value for this reduction.
4230   ReductionResumeValues.insert({&RdxDesc, BCBlockPhi});
4231
4232   // Now, we need to fix the users of the reduction variable
4233   // inside and outside of the scalar remainder loop.
4234
4235   // We know that the loop is in LCSSA form. We need to update the PHI nodes
4236   // in the exit blocks. See the comment on the analogous loop in
4237   // fixFirstOrderRecurrence for a more complete explanation of the logic.
4238   if (!Cost->requiresScalarEpilogue(VF))
4239     for (PHINode &LCSSAPhi : LoopExitBlock->phis())
4240       if (llvm::is_contained(LCSSAPhi.incoming_values(), LoopExitInst))
4241         LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
4242
4243   // Fix the scalar loop reduction variable with the incoming reduction sum
4244   // from the vector body and from the backedge value.
4245   int IncomingEdgeBlockIdx =
4246       OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch());
4247   assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
4248   // Pick the other block.
4249   int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
4250   OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
4251   OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
4252 }
4253
4254 void InnerLoopVectorizer::clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc,
4255                                                   VPTransformState &State) {
4256   RecurKind RK = RdxDesc.getRecurrenceKind();
4257   if (RK != RecurKind::Add && RK != RecurKind::Mul)
4258     return;
4259
4260   Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr();
4261   assert(LoopExitInstr && "null loop exit instruction");
4262   SmallVector<Instruction *, 8> Worklist;
4263   SmallPtrSet<Instruction *, 8> Visited;
4264   Worklist.push_back(LoopExitInstr);
4265   Visited.insert(LoopExitInstr);
4266
4267   while (!Worklist.empty()) {
4268     Instruction *Cur = Worklist.pop_back_val();
4269     if (isa<OverflowingBinaryOperator>(Cur))
4270       for (unsigned Part = 0; Part < UF; ++Part) {
4271         // FIXME: Should not rely on getVPValue at this point.
4272         Value *V = State.get(State.Plan->getVPValue(Cur, true), Part);
4273         cast<Instruction>(V)->dropPoisonGeneratingFlags();
4274       }
4275
4276     for (User *U : Cur->users()) {
4277       Instruction *UI = cast<Instruction>(U);
4278       if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) &&
4279           Visited.insert(UI).second)
4280         Worklist.push_back(UI);
4281     }
4282   }
4283 }
4284
4285 void InnerLoopVectorizer::fixLCSSAPHIs(VPTransformState &State) {
4286   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
4287     if (LCSSAPhi.getBasicBlockIndex(LoopMiddleBlock) != -1)
4288       // Some phis were already hand updated by the reduction and recurrence
4289       // code above, leave them alone.
4290       continue;
4291
4292     auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
4293     // Non-instruction incoming values will have only one value.
4294
4295     VPLane Lane = VPLane::getFirstLane();
4296     if (isa<Instruction>(IncomingValue) &&
4297         !Cost->isUniformAfterVectorization(cast<Instruction>(IncomingValue),
4298                                            VF))
4299       Lane = VPLane::getLastLaneForVF(VF);
4300
4301     // Can be a loop invariant incoming value or the last scalar value to be
4302     // extracted from the vectorized loop.
4303     // FIXME: Should not rely on getVPValue at this point.
4304     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
4305     Value *lastIncomingValue =
4306         OrigLoop->isLoopInvariant(IncomingValue)
4307             ? IncomingValue
4308             : State.get(State.Plan->getVPValue(IncomingValue, true),
4309                         VPIteration(UF - 1, Lane));
4310     LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
4311   }
4312 }
4313
4314 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
4315   // The basic block and loop containing the predicated instruction.
4316   auto *PredBB = PredInst->getParent();
4317   auto *VectorLoop = LI->getLoopFor(PredBB);
4318
4319   // Initialize a worklist with the operands of the predicated instruction.
4320   SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
4321
4322   // Holds instructions that we need to analyze again. An instruction may be
4323   // reanalyzed if we don't yet know if we can sink it or not.
4324   SmallVector<Instruction *, 8> InstsToReanalyze;
4325
4326   // Returns true if a given use occurs in the predicated block. Phi nodes use
4327   // their operands in their corresponding predecessor blocks.
4328   auto isBlockOfUsePredicated = [&](Use &U) -> bool {
4329     auto *I = cast<Instruction>(U.getUser());
4330     BasicBlock *BB = I->getParent();
4331     if (auto *Phi = dyn_cast<PHINode>(I))
4332       BB = Phi->getIncomingBlock(
4333           PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
4334     return BB == PredBB;
4335   };
4336
4337   // Iteratively sink the scalarized operands of the predicated instruction
4338   // into the block we created for it. When an instruction is sunk, its
4339   // operands are then added to the worklist. The algorithm ends when a full
4340   // pass through the worklist sinks no further instruction.
4341   bool Changed;
4342   do {
4343     // Add the instructions that need to be reanalyzed to the worklist, and
4344     // reset the changed indicator.
4345     Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
4346     InstsToReanalyze.clear();
4347     Changed = false;
4348
4349     while (!Worklist.empty()) {
4350       auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
4351
4352       // We can't sink an instruction if it is a phi node, is not in the loop,
4353       // or may have side effects.
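      // (For example, a store or a call that may write memory must stay put:
      // moving it into the predicated block would make an unconditionally
      // executed effect conditional on the block's mask.)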
4354       if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) ||
4355           I->mayHaveSideEffects())
4356         continue;
4357
4358       // If the instruction is already in PredBB, check if we can sink its
4359       // operands. In that case, VPlan's sinkScalarOperands() succeeded in
4360       // sinking the scalar instruction I, hence it appears in PredBB; but it
4361       // may have failed to sink I's operands (recursively), which we try
4362       // (again) here.
4363       if (I->getParent() == PredBB) {
4364         Worklist.insert(I->op_begin(), I->op_end());
4365         continue;
4366       }
4367
4368       // It's legal to sink the instruction if all its uses occur in the
4369       // predicated block. Otherwise, there's nothing to do yet, and we may
4370       // need to reanalyze the instruction.
4371       if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
4372         InstsToReanalyze.push_back(I);
4373         continue;
4374       }
4375
4376       // Move the instruction to the beginning of the predicated block, and add
4377       // its operands to the worklist.
4378       I->moveBefore(&*PredBB->getFirstInsertionPt());
4379       Worklist.insert(I->op_begin(), I->op_end());
4380
4381       // The sinking may have enabled other instructions to be sunk, so we will
4382       // need to iterate.
4383       Changed = true;
4384     }
4385   } while (Changed);
4386 }
4387
4388 void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) {
4389   for (PHINode *OrigPhi : OrigPHIsToFix) {
4390     VPWidenPHIRecipe *VPPhi =
4391         cast<VPWidenPHIRecipe>(State.Plan->getVPValue(OrigPhi));
4392     PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0));
4393     // Make sure the builder has a valid insert point.
4394     Builder.SetInsertPoint(NewPhi);
4395     for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) {
4396       VPValue *Inc = VPPhi->getIncomingValue(i);
4397       VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i);
4398       NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]);
4399     }
4400   }
4401 }
4402
4403 bool InnerLoopVectorizer::useOrderedReductions(
4404     const RecurrenceDescriptor &RdxDesc) {
4405   return Cost->useOrderedReductions(RdxDesc);
4406 }
4407
4408 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
4409                                               VPWidenPHIRecipe *PhiR,
4410                                               VPTransformState &State) {
4411   PHINode *P = cast<PHINode>(PN);
4412   if (EnableVPlanNativePath) {
4413     // Currently we enter here in the VPlan-native path for non-induction
4414     // PHIs where all control flow is uniform. We simply widen these PHIs.
4415     // Create a vector phi with no operands - the vector phi operands will be
4416     // set at the end of vector code generation.
4417     Type *VecTy = (State.VF.isScalar())
4418                       ? PN->getType()
4419                       : VectorType::get(PN->getType(), State.VF);
4420     Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
4421     State.set(PhiR, VecPhi, 0);
4422     OrigPHIsToFix.push_back(P);
4423
4424     return;
4425   }
4426
4427   assert(PN->getParent() == OrigLoop->getHeader() &&
4428          "Non-header phis should have been handled elsewhere");
4429
4430   // In order to support recurrences we need to be able to vectorize Phi nodes.
4431   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4432   // stage #1: We create a new vector PHI node with no incoming edges. We'll use
4433   // this value when we vectorize all of the instructions that use the PHI.
4434
4435   assert(!Legal->isReductionVariable(P) &&
4436          "reductions should be handled elsewhere");
4437
4438   setDebugLocFromInst(P);
4439
4440   // This PHINode must be an induction variable.
4441   // Make sure that we know about it.
4442 assert(Legal->getInductionVars().count(P) && "Not an induction variable"); 4443 4444 InductionDescriptor II = Legal->getInductionVars().lookup(P); 4445 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 4446 4447 auto *IVR = PhiR->getParent()->getPlan()->getCanonicalIV(); 4448 PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0)); 4449 4450 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 4451 // which can be found from the original scalar operations. 4452 switch (II.getKind()) { 4453 case InductionDescriptor::IK_NoInduction: 4454 llvm_unreachable("Unknown induction"); 4455 case InductionDescriptor::IK_IntInduction: 4456 case InductionDescriptor::IK_FpInduction: 4457 llvm_unreachable("Integer/fp induction is handled elsewhere."); 4458 case InductionDescriptor::IK_PtrInduction: { 4459 // Handle the pointer induction variable case. 4460 assert(P->getType()->isPointerTy() && "Unexpected type."); 4461 4462 if (Cost->isScalarAfterVectorization(P, State.VF)) { 4463 // This is the normalized GEP that starts counting at zero. 4464 Value *PtrInd = 4465 Builder.CreateSExtOrTrunc(CanonicalIV, II.getStep()->getType()); 4466 // Determine the number of scalars we need to generate for each unroll 4467 // iteration. If the instruction is uniform, we only need to generate the 4468 // first lane. Otherwise, we generate all VF values. 4469 bool IsUniform = vputils::onlyFirstLaneUsed(PhiR); 4470 assert((IsUniform || !State.VF.isScalable()) && 4471 "Cannot scalarize a scalable VF"); 4472 unsigned Lanes = IsUniform ? 1 : State.VF.getFixedValue(); 4473 4474 for (unsigned Part = 0; Part < UF; ++Part) { 4475 Value *PartStart = 4476 createStepForVF(Builder, PtrInd->getType(), VF, Part); 4477 4478 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 4479 Value *Idx = Builder.CreateAdd( 4480 PartStart, ConstantInt::get(PtrInd->getType(), Lane)); 4481 Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); 4482 4483 Value *Step = CreateStepValue(II.getStep(), *PSE.getSE(), 4484 State.CFG.PrevBB->getTerminator()); 4485 Value *SclrGep = emitTransformedIndex(Builder, GlobalIdx, 4486 II.getStartValue(), Step, II); 4487 SclrGep->setName("next.gep"); 4488 State.set(PhiR, SclrGep, VPIteration(Part, Lane)); 4489 } 4490 } 4491 return; 4492 } 4493 assert(isa<SCEVConstant>(II.getStep()) && 4494 "Induction step not a SCEV constant!"); 4495 Type *PhiType = II.getStep()->getType(); 4496 4497 // Build a pointer phi 4498 Value *ScalarStartValue = PhiR->getStartValue()->getLiveInIRValue(); 4499 Type *ScStValueType = ScalarStartValue->getType(); 4500 PHINode *NewPointerPhi = 4501 PHINode::Create(ScStValueType, 2, "pointer.phi", CanonicalIV); 4502 NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader); 4503 4504 // A pointer induction, performed by using a gep 4505 BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 4506 Instruction *InductionLoc = LoopLatch->getTerminator(); 4507 const SCEV *ScalarStep = II.getStep(); 4508 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 4509 Value *ScalarStepValue = 4510 Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc); 4511 Value *RuntimeVF = getRuntimeVF(Builder, PhiType, VF); 4512 Value *NumUnrolledElems = 4513 Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF)); 4514 Value *InductionGEP = GetElementPtrInst::Create( 4515 II.getElementType(), NewPointerPhi, 4516 Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind", 4517 InductionLoc); 4518 NewPointerPhi->addIncoming(InductionGEP, 
LoopLatch); 4519 4520 // Create UF many actual address geps that use the pointer 4521 // phi as base and a vectorized version of the step value 4522 // (<step*0, ..., step*N>) as offset. 4523 for (unsigned Part = 0; Part < State.UF; ++Part) { 4524 Type *VecPhiType = VectorType::get(PhiType, State.VF); 4525 Value *StartOffsetScalar = 4526 Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part)); 4527 Value *StartOffset = 4528 Builder.CreateVectorSplat(State.VF, StartOffsetScalar); 4529 // Create a vector of consecutive numbers from zero to VF. 4530 StartOffset = 4531 Builder.CreateAdd(StartOffset, Builder.CreateStepVector(VecPhiType)); 4532 4533 Value *GEP = Builder.CreateGEP( 4534 II.getElementType(), NewPointerPhi, 4535 Builder.CreateMul( 4536 StartOffset, Builder.CreateVectorSplat(State.VF, ScalarStepValue), 4537 "vector.gep")); 4538 State.set(PhiR, GEP, Part); 4539 } 4540 } 4541 } 4542 } 4543 4544 /// A helper function for checking whether an integer division-related 4545 /// instruction may divide by zero (in which case it must be predicated if 4546 /// executed conditionally in the scalar code). 4547 /// TODO: It may be worthwhile to generalize and check isKnownNonZero(). 4548 /// Non-zero divisors that are non compile-time constants will not be 4549 /// converted into multiplication, so we will still end up scalarizing 4550 /// the division, but can do so w/o predication. 4551 static bool mayDivideByZero(Instruction &I) { 4552 assert((I.getOpcode() == Instruction::UDiv || 4553 I.getOpcode() == Instruction::SDiv || 4554 I.getOpcode() == Instruction::URem || 4555 I.getOpcode() == Instruction::SRem) && 4556 "Unexpected instruction"); 4557 Value *Divisor = I.getOperand(1); 4558 auto *CInt = dyn_cast<ConstantInt>(Divisor); 4559 return !CInt || CInt->isZero(); 4560 } 4561 4562 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, 4563 VPUser &ArgOperands, 4564 VPTransformState &State) { 4565 assert(!isa<DbgInfoIntrinsic>(I) && 4566 "DbgInfoIntrinsic should have been dropped during VPlan construction"); 4567 setDebugLocFromInst(&I); 4568 4569 Module *M = I.getParent()->getParent()->getParent(); 4570 auto *CI = cast<CallInst>(&I); 4571 4572 SmallVector<Type *, 4> Tys; 4573 for (Value *ArgOperand : CI->args()) 4574 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue())); 4575 4576 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 4577 4578 // The flag shows whether we use Intrinsic or a usual Call for vectorized 4579 // version of the instruction. 4580 // Is it beneficial to perform intrinsic call compared to lib call? 4581 bool NeedToScalarize = false; 4582 InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); 4583 InstructionCost IntrinsicCost = ID ? Cost->getVectorIntrinsicCost(CI, VF) : 0; 4584 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 4585 assert((UseVectorIntrinsic || !NeedToScalarize) && 4586 "Instruction should be scalarized elsewhere."); 4587 assert((IntrinsicCost.isValid() || CallCost.isValid()) && 4588 "Either the intrinsic cost or vector call cost must be valid"); 4589 4590 for (unsigned Part = 0; Part < UF; ++Part) { 4591 SmallVector<Type *, 2> TysForDecl = {CI->getType()}; 4592 SmallVector<Value *, 4> Args; 4593 for (auto &I : enumerate(ArgOperands.operands())) { 4594 // Some intrinsics have a scalar argument - don't replace it with a 4595 // vector. 
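      // (For instance, the integer exponent operand of llvm.powi stays scalar
      // even when the call itself is widened; hasVectorInstrinsicScalarOpd
      // below reports such operands.)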
4596 Value *Arg; 4597 if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index())) 4598 Arg = State.get(I.value(), Part); 4599 else { 4600 Arg = State.get(I.value(), VPIteration(0, 0)); 4601 if (hasVectorInstrinsicOverloadedScalarOpd(ID, I.index())) 4602 TysForDecl.push_back(Arg->getType()); 4603 } 4604 Args.push_back(Arg); 4605 } 4606 4607 Function *VectorF; 4608 if (UseVectorIntrinsic) { 4609 // Use vector version of the intrinsic. 4610 if (VF.isVector()) 4611 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); 4612 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); 4613 assert(VectorF && "Can't retrieve vector intrinsic."); 4614 } else { 4615 // Use vector version of the function call. 4616 const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 4617 #ifndef NDEBUG 4618 assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr && 4619 "Can't create vector function."); 4620 #endif 4621 VectorF = VFDatabase(*CI).getVectorizedFunction(Shape); 4622 } 4623 SmallVector<OperandBundleDef, 1> OpBundles; 4624 CI->getOperandBundlesAsDefs(OpBundles); 4625 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles); 4626 4627 if (isa<FPMathOperator>(V)) 4628 V->copyFastMathFlags(CI); 4629 4630 State.set(Def, V, Part); 4631 addMetadata(V, &I); 4632 } 4633 } 4634 4635 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { 4636 // We should not collect Scalars more than once per VF. Right now, this 4637 // function is called from collectUniformsAndScalars(), which already does 4638 // this check. Collecting Scalars for VF=1 does not make any sense. 4639 assert(VF.isVector() && Scalars.find(VF) == Scalars.end() && 4640 "This function should not be visited twice for the same VF"); 4641 4642 SmallSetVector<Instruction *, 8> Worklist; 4643 4644 // These sets are used to seed the analysis with pointers used by memory 4645 // accesses that will remain scalar. 4646 SmallSetVector<Instruction *, 8> ScalarPtrs; 4647 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; 4648 auto *Latch = TheLoop->getLoopLatch(); 4649 4650 // A helper that returns true if the use of Ptr by MemAccess will be scalar. 4651 // The pointer operands of loads and stores will be scalar as long as the 4652 // memory access is not a gather or scatter operation. The value operand of a 4653 // store will remain scalar if the store is scalarized. 4654 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) { 4655 InstWidening WideningDecision = getWideningDecision(MemAccess, VF); 4656 assert(WideningDecision != CM_Unknown && 4657 "Widening decision should be ready at this moment"); 4658 if (auto *Store = dyn_cast<StoreInst>(MemAccess)) 4659 if (Ptr == Store->getValueOperand()) 4660 return WideningDecision == CM_Scalarize; 4661 assert(Ptr == getLoadStorePointerOperand(MemAccess) && 4662 "Ptr is neither a value or pointer operand"); 4663 return WideningDecision != CM_GatherScatter; 4664 }; 4665 4666 // A helper that returns true if the given value is a bitcast or 4667 // getelementptr instruction contained in the loop. 4668 auto isLoopVaryingBitCastOrGEP = [&](Value *V) { 4669 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) || 4670 isa<GetElementPtrInst>(V)) && 4671 !TheLoop->isLoopInvariant(V); 4672 }; 4673 4674 // A helper that evaluates a memory access's use of a pointer. If the use will 4675 // be a scalar use and the pointer is only used by memory accesses, we place 4676 // the pointer in ScalarPtrs. 
Otherwise, the pointer is placed in
4677   // PossibleNonScalarPtrs.
4678   auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
4679     // We only care about bitcast and getelementptr instructions contained in
4680     // the loop.
4681     if (!isLoopVaryingBitCastOrGEP(Ptr))
4682       return;
4683
4684     // If the pointer has already been identified as scalar (e.g., if it was
4685     // also identified as uniform), there's nothing to do.
4686     auto *I = cast<Instruction>(Ptr);
4687     if (Worklist.count(I))
4688       return;
4689
4690     // If the use of the pointer will be a scalar use, and all users of the
4691     // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
4692     // place the pointer in PossibleNonScalarPtrs.
4693     if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
4694           return isa<LoadInst>(U) || isa<StoreInst>(U);
4695         }))
4696       ScalarPtrs.insert(I);
4697     else
4698       PossibleNonScalarPtrs.insert(I);
4699   };
4700
4701   // We seed the scalars analysis with two classes of instructions: (1)
4702   // instructions marked uniform-after-vectorization and (2) bitcast,
4703   // getelementptr and (pointer) phi instructions used by memory accesses
4704   // requiring a scalar use.
4705   //
4706   // (1) Add to the worklist all instructions that have been identified as
4707   // uniform-after-vectorization.
4708   Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
4709
4710   // (2) Add to the worklist all bitcast and getelementptr instructions used by
4711   // memory accesses requiring a scalar use. The pointer operands of loads and
4712   // stores will be scalar as long as the memory access is not a gather or
4713   // scatter operation. The value operand of a store will remain scalar if the
4714   // store is scalarized.
4715   for (auto *BB : TheLoop->blocks())
4716     for (auto &I : *BB) {
4717       if (auto *Load = dyn_cast<LoadInst>(&I)) {
4718         evaluatePtrUse(Load, Load->getPointerOperand());
4719       } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
4720         evaluatePtrUse(Store, Store->getPointerOperand());
4721         evaluatePtrUse(Store, Store->getValueOperand());
4722       }
4723     }
4724   for (auto *I : ScalarPtrs)
4725     if (!PossibleNonScalarPtrs.count(I)) {
4726       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
4727       Worklist.insert(I);
4728     }
4729
4730   // Insert the forced scalars.
4731   // FIXME: Currently widenPHIInstruction() often creates a dead vector
4732   // induction variable when the PHI user is scalarized.
4733   auto ForcedScalar = ForcedScalars.find(VF);
4734   if (ForcedScalar != ForcedScalars.end())
4735     for (auto *I : ForcedScalar->second)
4736       Worklist.insert(I);
4737
4738   // Expand the worklist by looking through any bitcasts and getelementptr
4739   // instructions we've already identified as scalar. This is similar to the
4740   // expansion step in collectLoopUniforms(); however, here we're only
4741   // expanding to include additional bitcasts and getelementptr instructions.
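  // Illustrative sketch (hypothetical IR): given a scalarized store
  //   %gep.base = getelementptr i32, i32* %p, i64 %i
  //   %gep      = getelementptr i32, i32* %gep.base, i64 1
  //   store i32 %v, i32* %gep
  // once %gep is known to be scalar, %gep.base is also added to the worklist,
  // provided all of its users are scalar uses such as this one.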
4742 unsigned Idx = 0; 4743 while (Idx != Worklist.size()) { 4744 Instruction *Dst = Worklist[Idx++]; 4745 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0))) 4746 continue; 4747 auto *Src = cast<Instruction>(Dst->getOperand(0)); 4748 if (llvm::all_of(Src->users(), [&](User *U) -> bool { 4749 auto *J = cast<Instruction>(U); 4750 return !TheLoop->contains(J) || Worklist.count(J) || 4751 ((isa<LoadInst>(J) || isa<StoreInst>(J)) && 4752 isScalarUse(J, Src)); 4753 })) { 4754 Worklist.insert(Src); 4755 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); 4756 } 4757 } 4758 4759 // An induction variable will remain scalar if all users of the induction 4760 // variable and induction variable update remain scalar. 4761 for (auto &Induction : Legal->getInductionVars()) { 4762 auto *Ind = Induction.first; 4763 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4764 4765 // If tail-folding is applied, the primary induction variable will be used 4766 // to feed a vector compare. 4767 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking()) 4768 continue; 4769 4770 // Returns true if \p Indvar is a pointer induction that is used directly by 4771 // load/store instruction \p I. 4772 auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar, 4773 Instruction *I) { 4774 return Induction.second.getKind() == 4775 InductionDescriptor::IK_PtrInduction && 4776 (isa<LoadInst>(I) || isa<StoreInst>(I)) && 4777 Indvar == getLoadStorePointerOperand(I) && isScalarUse(I, Indvar); 4778 }; 4779 4780 // Determine if all users of the induction variable are scalar after 4781 // vectorization. 4782 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 4783 auto *I = cast<Instruction>(U); 4784 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 4785 IsDirectLoadStoreFromPtrIndvar(Ind, I); 4786 }); 4787 if (!ScalarInd) 4788 continue; 4789 4790 // Determine if all users of the induction variable update instruction are 4791 // scalar after vectorization. 4792 auto ScalarIndUpdate = 4793 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 4794 auto *I = cast<Instruction>(U); 4795 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 4796 IsDirectLoadStoreFromPtrIndvar(IndUpdate, I); 4797 }); 4798 if (!ScalarIndUpdate) 4799 continue; 4800 4801 // The induction variable and its update instruction will remain scalar. 4802 Worklist.insert(Ind); 4803 Worklist.insert(IndUpdate); 4804 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 4805 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 4806 << "\n"); 4807 } 4808 4809 Scalars[VF].insert(Worklist.begin(), Worklist.end()); 4810 } 4811 4812 bool LoopVectorizationCostModel::isScalarWithPredication( 4813 Instruction *I, ElementCount VF) const { 4814 if (!blockNeedsPredicationForAnyReason(I->getParent())) 4815 return false; 4816 switch(I->getOpcode()) { 4817 default: 4818 break; 4819 case Instruction::Load: 4820 case Instruction::Store: { 4821 if (!Legal->isMaskRequired(I)) 4822 return false; 4823 auto *Ptr = getLoadStorePointerOperand(I); 4824 auto *Ty = getLoadStoreType(I); 4825 Type *VTy = Ty; 4826 if (VF.isVector()) 4827 VTy = VectorType::get(Ty, VF); 4828 const Align Alignment = getLoadStoreAlignment(I); 4829 return isa<LoadInst>(I) ? 
!(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
4830                                 TTI.isLegalMaskedGather(VTy, Alignment))
4831                             : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
4832                                 TTI.isLegalMaskedScatter(VTy, Alignment));
4833   }
4834   case Instruction::UDiv:
4835   case Instruction::SDiv:
4836   case Instruction::SRem:
4837   case Instruction::URem:
4838     return mayDivideByZero(*I);
4839   }
4840   return false;
4841 }
4842
4843 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
4844     Instruction *I, ElementCount VF) {
4845   assert(isAccessInterleaved(I) && "Expecting interleaved access.");
4846   assert(getWideningDecision(I, VF) == CM_Unknown &&
4847          "Decision should not be set yet.");
4848   auto *Group = getInterleavedAccessGroup(I);
4849   assert(Group && "Must have a group.");
4850
4851   // If the instruction's allocated size doesn't equal its type size, it
4852   // requires padding and will be scalarized.
4853   auto &DL = I->getModule()->getDataLayout();
4854   auto *ScalarTy = getLoadStoreType(I);
4855   if (hasIrregularType(ScalarTy, DL))
4856     return false;
4857
4858   // Check if masking is required.
4859   // A Group may need masking for one of two reasons: it resides in a block that
4860   // needs predication, or it was decided to use masking to deal with gaps
4861   // (either a gap at the end of a load-access that may result in a speculative
4862   // load, or any gaps in a store-access).
4863   bool PredicatedAccessRequiresMasking =
4864       blockNeedsPredicationForAnyReason(I->getParent()) &&
4865       Legal->isMaskRequired(I);
4866   bool LoadAccessWithGapsRequiresEpilogMasking =
4867       isa<LoadInst>(I) && Group->requiresScalarEpilogue() &&
4868       !isScalarEpilogueAllowed();
4869   bool StoreAccessWithGapsRequiresMasking =
4870       isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor());
4871   if (!PredicatedAccessRequiresMasking &&
4872       !LoadAccessWithGapsRequiresEpilogMasking &&
4873       !StoreAccessWithGapsRequiresMasking)
4874     return true;
4875
4876   // If masked interleaving is required, we expect that the user/target had
4877   // enabled it, because otherwise it either wouldn't have been created or
4878   // it should have been invalidated by the CostModel.
4879   assert(useMaskedInterleavedAccesses(TTI) &&
4880          "Masked interleave-groups for predicated accesses are not enabled.");
4881
4882   if (Group->isReverse())
4883     return false;
4884
4885   auto *Ty = getLoadStoreType(I);
4886   const Align Alignment = getLoadStoreAlignment(I);
4887   return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
4888                           : TTI.isLegalMaskedStore(Ty, Alignment);
4889 }
4890
4891 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
4892     Instruction *I, ElementCount VF) {
4893   // Get and ensure we have a valid memory instruction.
4894   assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
4895
4896   auto *Ptr = getLoadStorePointerOperand(I);
4897   auto *ScalarTy = getLoadStoreType(I);
4898
4899   // In order to be widened, the pointer should be consecutive, first of all.
4900   if (!Legal->isConsecutivePtr(ScalarTy, Ptr))
4901     return false;
4902
4903   // If the instruction is a store located in a predicated block, it will be
4904   // scalarized.
4905   if (isScalarWithPredication(I, VF))
4906     return false;
4907
4908   // If the instruction's allocated size doesn't equal its type size, it
4909   // requires padding and will be scalarized.
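  // (A hedged example: on common targets x86_fp80 has a type size of 80 bits
  // but an alloc size of 96 or 128 bits, so widening it would require padding
  // between vector elements.)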
4910   auto &DL = I->getModule()->getDataLayout();
4911   if (hasIrregularType(ScalarTy, DL))
4912     return false;
4913
4914   return true;
4915 }
4916
4917 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
4918   // We should not collect Uniforms more than once per VF. Right now,
4919   // this function is called from collectUniformsAndScalars(), which
4920   // already does this check. Collecting Uniforms for VF=1 does not make any
4921   // sense.
4922
4923   assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() &&
4924          "This function should not be visited twice for the same VF");
4925
4926   // Create the entry for this VF up front: even if no uniform value is found,
4927   // Uniforms.count(VF) will return 1 and we will not analyze this VF again.
4928   Uniforms[VF].clear();
4929
4930   // We now know that the loop is vectorizable!
4931   // Collect instructions inside the loop that will remain uniform after
4932   // vectorization.
4933
4934   // Global values, params and instructions outside of the current loop are out
4935   // of scope.
4936   auto isOutOfScope = [&](Value *V) -> bool {
4937     Instruction *I = dyn_cast<Instruction>(V);
4938     return (!I || !TheLoop->contains(I));
4939   };
4940
4941   // Worklist containing uniform instructions demanding lane 0.
4942   SetVector<Instruction *> Worklist;
4943   BasicBlock *Latch = TheLoop->getLoopLatch();
4944
4945   // Add uniform instructions demanding lane 0 to the worklist. Instructions
4946   // that are scalar with predication must not be considered uniform after
4947   // vectorization, because that would create an erroneous replicating region
4948   // where only a single instance out of VF should be formed.
4949   // TODO: optimize such seldom cases if found important, see PR40816.
4950   auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
4951     if (isOutOfScope(I)) {
4952       LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
4953                         << *I << "\n");
4954       return;
4955     }
4956     if (isScalarWithPredication(I, VF)) {
4957       LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
4958                         << *I << "\n");
4959       return;
4960     }
4961     LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
4962     Worklist.insert(I);
4963   };
4964
4965   // Start with the conditional branch. If the branch condition is an
4966   // instruction contained in the loop that is only used by the branch, it is
4967   // uniform.
4968   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
4969   if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
4970     addToWorklistIfAllowed(Cmp);
4971
4972   auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
4973     InstWidening WideningDecision = getWideningDecision(I, VF);
4974     assert(WideningDecision != CM_Unknown &&
4975            "Widening decision should be ready at this moment");
4976
4977     // A uniform memory op is itself uniform. We exclude uniform stores
4978     // here as they demand the last lane, not the first one.
4979     if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) {
4980       assert(WideningDecision == CM_Scalarize);
4981       return true;
4982     }
4983
4984     return (WideningDecision == CM_Widen ||
4985             WideningDecision == CM_Widen_Reverse ||
4986             WideningDecision == CM_Interleave);
4987   };
4988
4989
4990   // Returns true if Ptr is the pointer operand of a memory access instruction
4991   // I, and I is known to not require scalarization.
4992 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { 4993 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); 4994 }; 4995 4996 // Holds a list of values which are known to have at least one uniform use. 4997 // Note that there may be other uses which aren't uniform. A "uniform use" 4998 // here is something which only demands lane 0 of the unrolled iterations; 4999 // it does not imply that all lanes produce the same value (e.g. this is not 5000 // the usual meaning of uniform) 5001 SetVector<Value *> HasUniformUse; 5002 5003 // Scan the loop for instructions which are either a) known to have only 5004 // lane 0 demanded or b) are uses which demand only lane 0 of their operand. 5005 for (auto *BB : TheLoop->blocks()) 5006 for (auto &I : *BB) { 5007 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) { 5008 switch (II->getIntrinsicID()) { 5009 case Intrinsic::sideeffect: 5010 case Intrinsic::experimental_noalias_scope_decl: 5011 case Intrinsic::assume: 5012 case Intrinsic::lifetime_start: 5013 case Intrinsic::lifetime_end: 5014 if (TheLoop->hasLoopInvariantOperands(&I)) 5015 addToWorklistIfAllowed(&I); 5016 break; 5017 default: 5018 break; 5019 } 5020 } 5021 5022 // ExtractValue instructions must be uniform, because the operands are 5023 // known to be loop-invariant. 5024 if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) { 5025 assert(isOutOfScope(EVI->getAggregateOperand()) && 5026 "Expected aggregate value to be loop invariant"); 5027 addToWorklistIfAllowed(EVI); 5028 continue; 5029 } 5030 5031 // If there's no pointer operand, there's nothing to do. 5032 auto *Ptr = getLoadStorePointerOperand(&I); 5033 if (!Ptr) 5034 continue; 5035 5036 // A uniform memory op is itself uniform. We exclude uniform stores 5037 // here as they demand the last lane, not the first one. 5038 if (isa<LoadInst>(I) && Legal->isUniformMemOp(I)) 5039 addToWorklistIfAllowed(&I); 5040 5041 if (isUniformDecision(&I, VF)) { 5042 assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check"); 5043 HasUniformUse.insert(Ptr); 5044 } 5045 } 5046 5047 // Add to the worklist any operands which have *only* uniform (e.g. lane 0 5048 // demanding) users. Since loops are assumed to be in LCSSA form, this 5049 // disallows uses outside the loop as well. 5050 for (auto *V : HasUniformUse) { 5051 if (isOutOfScope(V)) 5052 continue; 5053 auto *I = cast<Instruction>(V); 5054 auto UsersAreMemAccesses = 5055 llvm::all_of(I->users(), [&](User *U) -> bool { 5056 return isVectorizedMemAccessUse(cast<Instruction>(U), V); 5057 }); 5058 if (UsersAreMemAccesses) 5059 addToWorklistIfAllowed(I); 5060 } 5061 5062 // Expand Worklist in topological order: whenever a new instruction 5063 // is added , its users should be already inside Worklist. It ensures 5064 // a uniform instruction will only be used by uniform instructions. 5065 unsigned idx = 0; 5066 while (idx != Worklist.size()) { 5067 Instruction *I = Worklist[idx++]; 5068 5069 for (auto OV : I->operand_values()) { 5070 // isOutOfScope operands cannot be uniform instructions. 5071 if (isOutOfScope(OV)) 5072 continue; 5073 // First order recurrence Phi's should typically be considered 5074 // non-uniform. 5075 auto *OP = dyn_cast<PHINode>(OV); 5076 if (OP && Legal->isFirstOrderRecurrence(OP)) 5077 continue; 5078 // If all the users of the operand are uniform, then add the 5079 // operand into the uniform worklist. 
5080 auto *OI = cast<Instruction>(OV); 5081 if (llvm::all_of(OI->users(), [&](User *U) -> bool { 5082 auto *J = cast<Instruction>(U); 5083 return Worklist.count(J) || isVectorizedMemAccessUse(J, OI); 5084 })) 5085 addToWorklistIfAllowed(OI); 5086 } 5087 } 5088 5089 // For an instruction to be added into Worklist above, all its users inside 5090 // the loop should also be in Worklist. However, this condition cannot be 5091 // true for phi nodes that form a cyclic dependence. We must process phi 5092 // nodes separately. An induction variable will remain uniform if all users 5093 // of the induction variable and induction variable update remain uniform. 5094 // The code below handles both pointer and non-pointer induction variables. 5095 for (auto &Induction : Legal->getInductionVars()) { 5096 auto *Ind = Induction.first; 5097 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 5098 5099 // Determine if all users of the induction variable are uniform after 5100 // vectorization. 5101 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 5102 auto *I = cast<Instruction>(U); 5103 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 5104 isVectorizedMemAccessUse(I, Ind); 5105 }); 5106 if (!UniformInd) 5107 continue; 5108 5109 // Determine if all users of the induction variable update instruction are 5110 // uniform after vectorization. 5111 auto UniformIndUpdate = 5112 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 5113 auto *I = cast<Instruction>(U); 5114 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 5115 isVectorizedMemAccessUse(I, IndUpdate); 5116 }); 5117 if (!UniformIndUpdate) 5118 continue; 5119 5120 // The induction variable and its update instruction will remain uniform. 5121 addToWorklistIfAllowed(Ind); 5122 addToWorklistIfAllowed(IndUpdate); 5123 } 5124 5125 Uniforms[VF].insert(Worklist.begin(), Worklist.end()); 5126 } 5127 5128 bool LoopVectorizationCostModel::runtimeChecksRequired() { 5129 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); 5130 5131 if (Legal->getRuntimePointerChecking()->Need) { 5132 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz", 5133 "runtime pointer checks needed. Enable vectorization of this " 5134 "loop with '#pragma clang loop vectorize(enable)' when " 5135 "compiling with -Os/-Oz", 5136 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5137 return true; 5138 } 5139 5140 if (!PSE.getPredicate().isAlwaysTrue()) { 5141 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz", 5142 "runtime SCEV checks needed. Enable vectorization of this " 5143 "loop with '#pragma clang loop vectorize(enable)' when " 5144 "compiling with -Os/-Oz", 5145 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5146 return true; 5147 } 5148 5149 // FIXME: Avoid specializing for stride==1 instead of bailing out. 5150 if (!Legal->getLAI()->getSymbolicStrides().empty()) { 5151 reportVectorizationFailure("Runtime stride check for small trip count", 5152 "runtime stride == 1 checks needed. 
Enable vectorization of " 5153 "this loop without such check by compiling with -Os/-Oz", 5154 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5155 return true; 5156 } 5157 5158 return false; 5159 } 5160 5161 ElementCount 5162 LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) { 5163 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) 5164 return ElementCount::getScalable(0); 5165 5166 if (Hints->isScalableVectorizationDisabled()) { 5167 reportVectorizationInfo("Scalable vectorization is explicitly disabled", 5168 "ScalableVectorizationDisabled", ORE, TheLoop); 5169 return ElementCount::getScalable(0); 5170 } 5171 5172 LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n"); 5173 5174 auto MaxScalableVF = ElementCount::getScalable( 5175 std::numeric_limits<ElementCount::ScalarTy>::max()); 5176 5177 // Test that the loop-vectorizer can legalize all operations for this MaxVF. 5178 // FIXME: While for scalable vectors this is currently sufficient, this should 5179 // be replaced by a more detailed mechanism that filters out specific VFs, 5180 // instead of invalidating vectorization for a whole set of VFs based on the 5181 // MaxVF. 5182 5183 // Disable scalable vectorization if the loop contains unsupported reductions. 5184 if (!canVectorizeReductions(MaxScalableVF)) { 5185 reportVectorizationInfo( 5186 "Scalable vectorization not supported for the reduction " 5187 "operations found in this loop.", 5188 "ScalableVFUnfeasible", ORE, TheLoop); 5189 return ElementCount::getScalable(0); 5190 } 5191 5192 // Disable scalable vectorization if the loop contains any instructions 5193 // with element types not supported for scalable vectors. 5194 if (any_of(ElementTypesInLoop, [&](Type *Ty) { 5195 return !Ty->isVoidTy() && 5196 !this->TTI.isElementTypeLegalForScalableVector(Ty); 5197 })) { 5198 reportVectorizationInfo("Scalable vectorization is not supported " 5199 "for all element types found in this loop.", 5200 "ScalableVFUnfeasible", ORE, TheLoop); 5201 return ElementCount::getScalable(0); 5202 } 5203 5204 if (Legal->isSafeForAnyVectorWidth()) 5205 return MaxScalableVF; 5206 5207 // Limit MaxScalableVF by the maximum safe dependence distance. 5208 Optional<unsigned> MaxVScale = TTI.getMaxVScale(); 5209 if (!MaxVScale && TheFunction->hasFnAttribute(Attribute::VScaleRange)) 5210 MaxVScale = 5211 TheFunction->getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax(); 5212 MaxScalableVF = ElementCount::getScalable( 5213 MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0); 5214 if (!MaxScalableVF) 5215 reportVectorizationInfo( 5216 "Max legal vector width too small, scalable vectorization " 5217 "unfeasible.", 5218 "ScalableVFUnfeasible", ORE, TheLoop); 5219 5220 return MaxScalableVF; 5221 } 5222 5223 FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF( 5224 unsigned ConstTripCount, ElementCount UserVF, bool FoldTailByMasking) { 5225 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); 5226 unsigned SmallestType, WidestType; 5227 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); 5228 5229 // Get the maximum safe dependence distance in bits computed by LAA. 5230 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from 5231 // the memory accesses that is most restrictive (involved in the smallest 5232 // dependence distance). 
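  // Worked example with illustrative numbers: a maximum safe dependence
  // distance of 256 bits and a widest type of i32 give
  // MaxSafeElements = PowerOf2Floor(256 / 32) = 8 lanes.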
5233 unsigned MaxSafeElements = 5234 PowerOf2Floor(Legal->getMaxSafeVectorWidthInBits() / WidestType); 5235 5236 auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements); 5237 auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements); 5238 5239 LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF 5240 << ".\n"); 5241 LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF 5242 << ".\n"); 5243 5244 // First analyze the UserVF, fall back if the UserVF should be ignored. 5245 if (UserVF) { 5246 auto MaxSafeUserVF = 5247 UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF; 5248 5249 if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) { 5250 // If `VF=vscale x N` is safe, then so is `VF=N` 5251 if (UserVF.isScalable()) 5252 return FixedScalableVFPair( 5253 ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF); 5254 else 5255 return UserVF; 5256 } 5257 5258 assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF)); 5259 5260 // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it 5261 // is better to ignore the hint and let the compiler choose a suitable VF. 5262 if (!UserVF.isScalable()) { 5263 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 5264 << " is unsafe, clamping to max safe VF=" 5265 << MaxSafeFixedVF << ".\n"); 5266 ORE->emit([&]() { 5267 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 5268 TheLoop->getStartLoc(), 5269 TheLoop->getHeader()) 5270 << "User-specified vectorization factor " 5271 << ore::NV("UserVectorizationFactor", UserVF) 5272 << " is unsafe, clamping to maximum safe vectorization factor " 5273 << ore::NV("VectorizationFactor", MaxSafeFixedVF); 5274 }); 5275 return MaxSafeFixedVF; 5276 } 5277 5278 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) { 5279 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 5280 << " is ignored because scalable vectors are not " 5281 "available.\n"); 5282 ORE->emit([&]() { 5283 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 5284 TheLoop->getStartLoc(), 5285 TheLoop->getHeader()) 5286 << "User-specified vectorization factor " 5287 << ore::NV("UserVectorizationFactor", UserVF) 5288 << " is ignored because the target does not support scalable " 5289 "vectors. The compiler will pick a more suitable value."; 5290 }); 5291 } else { 5292 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 5293 << " is unsafe. Ignoring scalable UserVF.\n"); 5294 ORE->emit([&]() { 5295 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 5296 TheLoop->getStartLoc(), 5297 TheLoop->getHeader()) 5298 << "User-specified vectorization factor " 5299 << ore::NV("UserVectorizationFactor", UserVF) 5300 << " is unsafe. 
Ignoring the hint to let the compiler pick a "
5301                   "more suitable value.";
5302       });
5303     }
5304   }
5305
5306   LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
5307                     << " / " << WidestType << " bits.\n");
5308
5309   FixedScalableVFPair Result(ElementCount::getFixed(1),
5310                              ElementCount::getScalable(0));
5311   if (auto MaxVF =
5312           getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType,
5313                                   MaxSafeFixedVF, FoldTailByMasking))
5314     Result.FixedVF = MaxVF;
5315
5316   if (auto MaxVF =
5317           getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType,
5318                                   MaxSafeScalableVF, FoldTailByMasking))
5319     if (MaxVF.isScalable()) {
5320       Result.ScalableVF = MaxVF;
5321       LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
5322                         << "\n");
5323     }
5324
5325   return Result;
5326 }
5327
5328 FixedScalableVFPair
5329 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
5330   if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
5331     // TODO: It may be useful to do this, since the condition is still likely
5332     // to be dynamically uniform if the target can skip.
5333     reportVectorizationFailure(
5334         "Not inserting runtime ptr check for divergent target",
5335         "runtime pointer checks needed. Not enabled for divergent target",
5336         "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
5337     return FixedScalableVFPair::getNone();
5338   }
5339
5340   unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5341   LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
5342   if (TC == 1) {
5343     reportVectorizationFailure("Single iteration (non) loop",
5344         "loop trip count is one, irrelevant for vectorization",
5345         "SingleIterationLoop", ORE, TheLoop);
5346     return FixedScalableVFPair::getNone();
5347   }
5348
5349   switch (ScalarEpilogueStatus) {
5350   case CM_ScalarEpilogueAllowed:
5351     return computeFeasibleMaxVF(TC, UserVF, false);
5352   case CM_ScalarEpilogueNotAllowedUsePredicate:
5353     LLVM_FALLTHROUGH;
5354   case CM_ScalarEpilogueNotNeededUsePredicate:
5355     LLVM_DEBUG(
5356         dbgs() << "LV: vector predicate hint/switch found.\n"
5357                << "LV: Not allowing scalar epilogue, creating predicated "
5358                << "vector loop.\n");
5359     break;
5360   case CM_ScalarEpilogueNotAllowedLowTripLoop:
5361     // fallthrough as a special case of OptForSize
5362   case CM_ScalarEpilogueNotAllowedOptSize:
5363     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
5364       LLVM_DEBUG(
5365           dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
5366     else
5367       LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
5368                         << "count.\n");
5369
5370     // Bail if runtime checks are required, which are not good when optimizing
5371     // for size.
5372     if (runtimeChecksRequired())
5373       return FixedScalableVFPair::getNone();
5374
5375     break;
5376   }
5377
5378   // The only loops we can vectorize without a scalar epilogue are loops with
5379   // a bottom-test and a single exiting block. We'd have to handle the fact
5380   // that not every instruction executes on the last iteration. This will
5381   // require a lane mask which varies through the vector loop body. (TODO)
5382   if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
5383     // If there was a tail-folding hint/switch, but we can't fold the tail by
5384     // masking, fall back to a vectorization with a scalar epilogue.
    if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
      LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
                           "scalar epilogue instead.\n");
      ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
      return computeFeasibleMaxVF(TC, UserVF, false);
    }
    return FixedScalableVFPair::getNone();
  }

  // Now try the tail folding.

  // Invalidate interleave groups that require an epilogue if we can't mask
  // the interleave-group.
  if (!useMaskedInterleavedAccesses(TTI)) {
    assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
           "No decisions should have been taken at this point");
    // Note: There is no need to invalidate any cost modeling decisions here,
    // as none were taken so far.
    InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
  }

  FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF, true);
  // Avoid tail folding if the trip count is known to be a multiple of any VF
  // we chose.
  // FIXME: The condition below pessimises the case for fixed-width vectors,
  // when scalable VFs are also candidates for vectorization.
  if (MaxFactors.FixedVF.isVector() && !MaxFactors.ScalableVF) {
    ElementCount MaxFixedVF = MaxFactors.FixedVF;
    assert((UserVF.isNonZero() || isPowerOf2_32(MaxFixedVF.getFixedValue())) &&
           "MaxFixedVF must be a power of 2");
    unsigned MaxVFtimesIC = UserIC ? MaxFixedVF.getFixedValue() * UserIC
                                   : MaxFixedVF.getFixedValue();
    ScalarEvolution *SE = PSE.getSE();
    const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
    const SCEV *ExitCount = SE->getAddExpr(
        BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
    const SCEV *Rem = SE->getURemExpr(
        SE->applyLoopGuards(ExitCount, TheLoop),
        SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
    if (Rem->isZero()) {
      // Accept MaxFixedVF if we do not have a tail.
      LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
      return MaxFactors;
    }
  }

  // For scalable vectors, don't use tail folding for low trip counts or when
  // optimizing for code size. We only permit this if the user has explicitly
  // requested it.
  if (ScalarEpilogueStatus != CM_ScalarEpilogueNotNeededUsePredicate &&
      ScalarEpilogueStatus != CM_ScalarEpilogueNotAllowedUsePredicate &&
      MaxFactors.ScalableVF.isVector())
    MaxFactors.ScalableVF = ElementCount::getScalable(0);

  // If we don't know the precise trip count, or if the trip count that we
  // found modulo the vectorization factor is not zero, try to fold the tail
  // by masking.
  // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
  if (Legal->prepareToFoldTailByMasking()) {
    FoldTailByMasking = true;
    return MaxFactors;
  }

  // If there was a tail-folding hint/switch, but we can't fold the tail by
  // masking, fall back to vectorization with a scalar epilogue.
  if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
    LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
                         "scalar epilogue instead.\n");
    ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
    return MaxFactors;
  }

  if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
    LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
    return FixedScalableVFPair::getNone();
  }

  if (TC == 0) {
    reportVectorizationFailure(
        "Unable to calculate the loop count due to complex control flow",
        "unable to calculate the loop count due to complex control flow",
        "UnknownLoopCountComplexCFG", ORE, TheLoop);
    return FixedScalableVFPair::getNone();
  }

  reportVectorizationFailure(
      "Cannot optimize for size and vectorize at the same time.",
      "cannot optimize for size and vectorize at the same time. "
      "Enable vectorization of this loop with '#pragma clang loop "
      "vectorize(enable)' when compiling with -Os/-Oz",
      "NoTailLoopWithOptForSize", ORE, TheLoop);
  return FixedScalableVFPair::getNone();
}

ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
    unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType,
    const ElementCount &MaxSafeVF, bool FoldTailByMasking) {
  bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
  TypeSize WidestRegister = TTI.getRegisterBitWidth(
      ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
                           : TargetTransformInfo::RGK_FixedWidthVector);

  // Convenience function to return the minimum of two ElementCounts.
  auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
    assert((LHS.isScalable() == RHS.isScalable()) &&
           "Scalable flags must match");
    return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS;
  };

  // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
  // Note that both WidestRegister and WidestType may not be powers of 2.
  auto MaxVectorElementCount = ElementCount::get(
      PowerOf2Floor(WidestRegister.getKnownMinSize() / WidestType),
      ComputeScalableMaxVF);
  MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
  LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
                    << (MaxVectorElementCount * WidestType) << " bits.\n");

  if (!MaxVectorElementCount) {
    LLVM_DEBUG(dbgs() << "LV: The target has no "
                      << (ComputeScalableMaxVF ? "scalable" : "fixed")
                      << " vector registers.\n");
    return ElementCount::getFixed(1);
  }

  const auto TripCountEC = ElementCount::getFixed(ConstTripCount);
  if (ConstTripCount &&
      ElementCount::isKnownLE(TripCountEC, MaxVectorElementCount) &&
      (!FoldTailByMasking || isPowerOf2_32(ConstTripCount))) {
    // If the loop trip count (TC) is known at compile time, there is no point
    // in choosing a VF greater than TC (as done in the loop below). Select the
    // maximum power of two which doesn't exceed TC.
    // If MaxVectorElementCount is scalable, we only fall back on a fixed VF
    // when the TC is less than or equal to the known number of lanes.
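    // As an illustration of the clamping below (numbers are hypothetical):
    // with a constant trip count of 20, a MaxVectorElementCount of 32 fixed
    // lanes and no tail folding, PowerOf2Floor(20) yields 16, so a VF of 16
    // is returned rather than 32 and no vector lanes are wasted past the
    // trip count.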
5519 auto ClampedConstTripCount = PowerOf2Floor(ConstTripCount); 5520 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not " 5521 "exceeding the constant trip count: " 5522 << ClampedConstTripCount << "\n"); 5523 return ElementCount::getFixed(ClampedConstTripCount); 5524 } 5525 5526 ElementCount MaxVF = MaxVectorElementCount; 5527 if (TTI.shouldMaximizeVectorBandwidth() || 5528 (MaximizeBandwidth && isScalarEpilogueAllowed())) { 5529 auto MaxVectorElementCountMaxBW = ElementCount::get( 5530 PowerOf2Floor(WidestRegister.getKnownMinSize() / SmallestType), 5531 ComputeScalableMaxVF); 5532 MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF); 5533 5534 // Collect all viable vectorization factors larger than the default MaxVF 5535 // (i.e. MaxVectorElementCount). 5536 SmallVector<ElementCount, 8> VFs; 5537 for (ElementCount VS = MaxVectorElementCount * 2; 5538 ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2) 5539 VFs.push_back(VS); 5540 5541 // For each VF calculate its register usage. 5542 auto RUs = calculateRegisterUsage(VFs); 5543 5544 // Select the largest VF which doesn't require more registers than existing 5545 // ones. 5546 for (int i = RUs.size() - 1; i >= 0; --i) { 5547 bool Selected = true; 5548 for (auto &pair : RUs[i].MaxLocalUsers) { 5549 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5550 if (pair.second > TargetNumRegisters) 5551 Selected = false; 5552 } 5553 if (Selected) { 5554 MaxVF = VFs[i]; 5555 break; 5556 } 5557 } 5558 if (ElementCount MinVF = 5559 TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) { 5560 if (ElementCount::isKnownLT(MaxVF, MinVF)) { 5561 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF 5562 << ") with target's minimum: " << MinVF << '\n'); 5563 MaxVF = MinVF; 5564 } 5565 } 5566 } 5567 return MaxVF; 5568 } 5569 5570 Optional<unsigned> LoopVectorizationCostModel::getVScaleForTuning() const { 5571 if (TheFunction->hasFnAttribute(Attribute::VScaleRange)) { 5572 auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange); 5573 auto Min = Attr.getVScaleRangeMin(); 5574 auto Max = Attr.getVScaleRangeMax(); 5575 if (Max && Min == Max) 5576 return Max; 5577 } 5578 5579 return TTI.getVScaleForTuning(); 5580 } 5581 5582 bool LoopVectorizationCostModel::isMoreProfitable( 5583 const VectorizationFactor &A, const VectorizationFactor &B) const { 5584 InstructionCost CostA = A.Cost; 5585 InstructionCost CostB = B.Cost; 5586 5587 unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop); 5588 5589 if (!A.Width.isScalable() && !B.Width.isScalable() && FoldTailByMasking && 5590 MaxTripCount) { 5591 // If we are folding the tail and the trip count is a known (possibly small) 5592 // constant, the trip count will be rounded up to an integer number of 5593 // iterations. The total cost will be PerIterationCost*ceil(TripCount/VF), 5594 // which we compare directly. When not folding the tail, the total cost will 5595 // be PerIterationCost*floor(TC/VF) + Scalar remainder cost, and so is 5596 // approximated with the per-lane cost below instead of using the tripcount 5597 // as here. 5598 auto RTCostA = CostA * divideCeil(MaxTripCount, A.Width.getFixedValue()); 5599 auto RTCostB = CostB * divideCeil(MaxTripCount, B.Width.getFixedValue()); 5600 return RTCostA < RTCostB; 5601 } 5602 5603 // Improve estimate for the vector width if it is scalable. 
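  // For example (hypothetical numbers): a candidate with Width = vscale x 4
  // on a target whose vscale-for-tuning is 2 is treated below as an estimated
  // width of 8 lanes when it is compared against a fixed-width candidate.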
5604 unsigned EstimatedWidthA = A.Width.getKnownMinValue(); 5605 unsigned EstimatedWidthB = B.Width.getKnownMinValue(); 5606 if (Optional<unsigned> VScale = getVScaleForTuning()) { 5607 if (A.Width.isScalable()) 5608 EstimatedWidthA *= VScale.getValue(); 5609 if (B.Width.isScalable()) 5610 EstimatedWidthB *= VScale.getValue(); 5611 } 5612 5613 // Assume vscale may be larger than 1 (or the value being tuned for), 5614 // so that scalable vectorization is slightly favorable over fixed-width 5615 // vectorization. 5616 if (A.Width.isScalable() && !B.Width.isScalable()) 5617 return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA); 5618 5619 // To avoid the need for FP division: 5620 // (CostA / A.Width) < (CostB / B.Width) 5621 // <=> (CostA * B.Width) < (CostB * A.Width) 5622 return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA); 5623 } 5624 5625 VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor( 5626 const ElementCountSet &VFCandidates) { 5627 InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first; 5628 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n"); 5629 assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop"); 5630 assert(VFCandidates.count(ElementCount::getFixed(1)) && 5631 "Expected Scalar VF to be a candidate"); 5632 5633 const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost); 5634 VectorizationFactor ChosenFactor = ScalarCost; 5635 5636 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; 5637 if (ForceVectorization && VFCandidates.size() > 1) { 5638 // Ignore scalar width, because the user explicitly wants vectorization. 5639 // Initialize cost to max so that VF = 2 is, at least, chosen during cost 5640 // evaluation. 5641 ChosenFactor.Cost = InstructionCost::getMax(); 5642 } 5643 5644 SmallVector<InstructionVFPair> InvalidCosts; 5645 for (const auto &i : VFCandidates) { 5646 // The cost for scalar VF=1 is already calculated, so ignore it. 5647 if (i.isScalar()) 5648 continue; 5649 5650 VectorizationCostTy C = expectedCost(i, &InvalidCosts); 5651 VectorizationFactor Candidate(i, C.first); 5652 5653 #ifndef NDEBUG 5654 unsigned AssumedMinimumVscale = 1; 5655 if (Optional<unsigned> VScale = getVScaleForTuning()) 5656 AssumedMinimumVscale = VScale.getValue(); 5657 unsigned Width = 5658 Candidate.Width.isScalable() 5659 ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale 5660 : Candidate.Width.getFixedValue(); 5661 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i 5662 << " costs: " << (Candidate.Cost / Width)); 5663 if (i.isScalable()) 5664 LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of " 5665 << AssumedMinimumVscale << ")"); 5666 LLVM_DEBUG(dbgs() << ".\n"); 5667 #endif 5668 5669 if (!C.second && !ForceVectorization) { 5670 LLVM_DEBUG( 5671 dbgs() << "LV: Not considering vector loop of width " << i 5672 << " because it will not generate any vector instructions.\n"); 5673 continue; 5674 } 5675 5676 // If profitable add it to ProfitableVF list. 5677 if (isMoreProfitable(Candidate, ScalarCost)) 5678 ProfitableVFs.push_back(Candidate); 5679 5680 if (isMoreProfitable(Candidate, ChosenFactor)) 5681 ChosenFactor = Candidate; 5682 } 5683 5684 // Emit a report of VFs with invalid costs in the loop. 5685 if (!InvalidCosts.empty()) { 5686 // Group the remarks per instruction, keeping the instruction order from 5687 // InvalidCosts. 
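    // Illustrative example (names are hypothetical): if InvalidCosts is
    //   [(%load1, VF 4), (%store1, VF 4), (%load1, VF 8)]
    // the numbering below assigns %load1 -> 0 and %store1 -> 1, so after the
    // sort the list becomes [(%load1, 4), (%load1, 8), (%store1, 4)] and one
    // remark is emitted per instruction.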
5688 std::map<Instruction *, unsigned> Numbering; 5689 unsigned I = 0; 5690 for (auto &Pair : InvalidCosts) 5691 if (!Numbering.count(Pair.first)) 5692 Numbering[Pair.first] = I++; 5693 5694 // Sort the list, first on instruction(number) then on VF. 5695 llvm::sort(InvalidCosts, 5696 [&Numbering](InstructionVFPair &A, InstructionVFPair &B) { 5697 if (Numbering[A.first] != Numbering[B.first]) 5698 return Numbering[A.first] < Numbering[B.first]; 5699 ElementCountComparator ECC; 5700 return ECC(A.second, B.second); 5701 }); 5702 5703 // For a list of ordered instruction-vf pairs: 5704 // [(load, vf1), (load, vf2), (store, vf1)] 5705 // Group the instructions together to emit separate remarks for: 5706 // load (vf1, vf2) 5707 // store (vf1) 5708 auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts); 5709 auto Subset = ArrayRef<InstructionVFPair>(); 5710 do { 5711 if (Subset.empty()) 5712 Subset = Tail.take_front(1); 5713 5714 Instruction *I = Subset.front().first; 5715 5716 // If the next instruction is different, or if there are no other pairs, 5717 // emit a remark for the collated subset. e.g. 5718 // [(load, vf1), (load, vf2))] 5719 // to emit: 5720 // remark: invalid costs for 'load' at VF=(vf, vf2) 5721 if (Subset == Tail || Tail[Subset.size()].first != I) { 5722 std::string OutString; 5723 raw_string_ostream OS(OutString); 5724 assert(!Subset.empty() && "Unexpected empty range"); 5725 OS << "Instruction with invalid costs prevented vectorization at VF=("; 5726 for (auto &Pair : Subset) 5727 OS << (Pair.second == Subset.front().second ? "" : ", ") 5728 << Pair.second; 5729 OS << "):"; 5730 if (auto *CI = dyn_cast<CallInst>(I)) 5731 OS << " call to " << CI->getCalledFunction()->getName(); 5732 else 5733 OS << " " << I->getOpcodeName(); 5734 OS.flush(); 5735 reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I); 5736 Tail = Tail.drop_front(Subset.size()); 5737 Subset = {}; 5738 } else 5739 // Grow the subset by one element 5740 Subset = Tail.take_front(Subset.size() + 1); 5741 } while (!Tail.empty()); 5742 } 5743 5744 if (!EnableCondStoresVectorization && NumPredStores) { 5745 reportVectorizationFailure("There are conditional stores.", 5746 "store that is conditionally executed prevents vectorization", 5747 "ConditionalStore", ORE, TheLoop); 5748 ChosenFactor = ScalarCost; 5749 } 5750 5751 LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() && 5752 ChosenFactor.Cost >= ScalarCost.Cost) dbgs() 5753 << "LV: Vectorization seems to be not beneficial, " 5754 << "but was forced by a user.\n"); 5755 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n"); 5756 return ChosenFactor; 5757 } 5758 5759 bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization( 5760 const Loop &L, ElementCount VF) const { 5761 // Cross iteration phis such as reductions need special handling and are 5762 // currently unsupported. 5763 if (any_of(L.getHeader()->phis(), 5764 [&](PHINode &Phi) { return Legal->isFirstOrderRecurrence(&Phi); })) 5765 return false; 5766 5767 // Phis with uses outside of the loop require special handling and are 5768 // currently unsupported. 5769 for (auto &Entry : Legal->getInductionVars()) { 5770 // Look for uses of the value of the induction at the last iteration. 5771 Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch()); 5772 for (User *U : PostInc->users()) 5773 if (!L.contains(cast<Instruction>(U))) 5774 return false; 5775 // Look for uses of penultimate value of the induction. 
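    // For instance (hypothetical IR): if the induction phi itself (the
    // penultimate value) feeds an LCSSA phi in the exit block, that use is
    // outside the loop and the loop is rejected as an epilogue-vectorization
    // candidate.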
5776 for (User *U : Entry.first->users()) 5777 if (!L.contains(cast<Instruction>(U))) 5778 return false; 5779 } 5780 5781 // Induction variables that are widened require special handling that is 5782 // currently not supported. 5783 if (any_of(Legal->getInductionVars(), [&](auto &Entry) { 5784 return !(this->isScalarAfterVectorization(Entry.first, VF) || 5785 this->isProfitableToScalarize(Entry.first, VF)); 5786 })) 5787 return false; 5788 5789 // Epilogue vectorization code has not been auditted to ensure it handles 5790 // non-latch exits properly. It may be fine, but it needs auditted and 5791 // tested. 5792 if (L.getExitingBlock() != L.getLoopLatch()) 5793 return false; 5794 5795 return true; 5796 } 5797 5798 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable( 5799 const ElementCount VF) const { 5800 // FIXME: We need a much better cost-model to take different parameters such 5801 // as register pressure, code size increase and cost of extra branches into 5802 // account. For now we apply a very crude heuristic and only consider loops 5803 // with vectorization factors larger than a certain value. 5804 // We also consider epilogue vectorization unprofitable for targets that don't 5805 // consider interleaving beneficial (eg. MVE). 5806 if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1) 5807 return false; 5808 // FIXME: We should consider changing the threshold for scalable 5809 // vectors to take VScaleForTuning into account. 5810 if (VF.getKnownMinValue() >= EpilogueVectorizationMinVF) 5811 return true; 5812 return false; 5813 } 5814 5815 VectorizationFactor 5816 LoopVectorizationCostModel::selectEpilogueVectorizationFactor( 5817 const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) { 5818 VectorizationFactor Result = VectorizationFactor::Disabled(); 5819 if (!EnableEpilogueVectorization) { 5820 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";); 5821 return Result; 5822 } 5823 5824 if (!isScalarEpilogueAllowed()) { 5825 LLVM_DEBUG( 5826 dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is " 5827 "allowed.\n";); 5828 return Result; 5829 } 5830 5831 // Not really a cost consideration, but check for unsupported cases here to 5832 // simplify the logic. 5833 if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) { 5834 LLVM_DEBUG( 5835 dbgs() << "LEV: Unable to vectorize epilogue because the loop is " 5836 "not a supported candidate.\n";); 5837 return Result; 5838 } 5839 5840 if (EpilogueVectorizationForceVF > 1) { 5841 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";); 5842 ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF); 5843 if (LVP.hasPlanWithVF(ForcedEC)) 5844 return {ForcedEC, 0}; 5845 else { 5846 LLVM_DEBUG( 5847 dbgs() 5848 << "LEV: Epilogue vectorization forced factor is not viable.\n";); 5849 return Result; 5850 } 5851 } 5852 5853 if (TheLoop->getHeader()->getParent()->hasOptSize() || 5854 TheLoop->getHeader()->getParent()->hasMinSize()) { 5855 LLVM_DEBUG( 5856 dbgs() 5857 << "LEV: Epilogue vectorization skipped due to opt for size.\n";); 5858 return Result; 5859 } 5860 5861 if (!isEpilogueVectorizationProfitable(MainLoopVF)) { 5862 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for " 5863 "this loop\n"); 5864 return Result; 5865 } 5866 5867 // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know 5868 // the main loop handles 8 lanes per iteration. 
We could still benefit from 5869 // vectorizing the epilogue loop with VF=4. 5870 ElementCount EstimatedRuntimeVF = MainLoopVF; 5871 if (MainLoopVF.isScalable()) { 5872 EstimatedRuntimeVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue()); 5873 if (Optional<unsigned> VScale = getVScaleForTuning()) 5874 EstimatedRuntimeVF *= VScale.getValue(); 5875 } 5876 5877 for (auto &NextVF : ProfitableVFs) 5878 if (((!NextVF.Width.isScalable() && MainLoopVF.isScalable() && 5879 ElementCount::isKnownLT(NextVF.Width, EstimatedRuntimeVF)) || 5880 ElementCount::isKnownLT(NextVF.Width, MainLoopVF)) && 5881 (Result.Width.isScalar() || isMoreProfitable(NextVF, Result)) && 5882 LVP.hasPlanWithVF(NextVF.Width)) 5883 Result = NextVF; 5884 5885 if (Result != VectorizationFactor::Disabled()) 5886 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = " 5887 << Result.Width << "\n";); 5888 return Result; 5889 } 5890 5891 std::pair<unsigned, unsigned> 5892 LoopVectorizationCostModel::getSmallestAndWidestTypes() { 5893 unsigned MinWidth = -1U; 5894 unsigned MaxWidth = 8; 5895 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 5896 // For in-loop reductions, no element types are added to ElementTypesInLoop 5897 // if there are no loads/stores in the loop. In this case, check through the 5898 // reduction variables to determine the maximum width. 5899 if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) { 5900 // Reset MaxWidth so that we can find the smallest type used by recurrences 5901 // in the loop. 5902 MaxWidth = -1U; 5903 for (auto &PhiDescriptorPair : Legal->getReductionVars()) { 5904 const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second; 5905 // When finding the min width used by the recurrence we need to account 5906 // for casts on the input operands of the recurrence. 5907 MaxWidth = std::min<unsigned>( 5908 MaxWidth, std::min<unsigned>( 5909 RdxDesc.getMinWidthCastToRecurrenceTypeInBits(), 5910 RdxDesc.getRecurrenceType()->getScalarSizeInBits())); 5911 } 5912 } else { 5913 for (Type *T : ElementTypesInLoop) { 5914 MinWidth = std::min<unsigned>( 5915 MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize()); 5916 MaxWidth = std::max<unsigned>( 5917 MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize()); 5918 } 5919 } 5920 return {MinWidth, MaxWidth}; 5921 } 5922 5923 void LoopVectorizationCostModel::collectElementTypesForWidening() { 5924 ElementTypesInLoop.clear(); 5925 // For each block. 5926 for (BasicBlock *BB : TheLoop->blocks()) { 5927 // For each instruction in the loop. 5928 for (Instruction &I : BB->instructionsWithoutDebug()) { 5929 Type *T = I.getType(); 5930 5931 // Skip ignored values. 5932 if (ValuesToIgnore.count(&I)) 5933 continue; 5934 5935 // Only examine Loads, Stores and PHINodes. 5936 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I)) 5937 continue; 5938 5939 // Examine PHI nodes that are reduction variables. Update the type to 5940 // account for the recurrence type. 5941 if (auto *PN = dyn_cast<PHINode>(&I)) { 5942 if (!Legal->isReductionVariable(PN)) 5943 continue; 5944 const RecurrenceDescriptor &RdxDesc = 5945 Legal->getReductionVars().find(PN)->second; 5946 if (PreferInLoopReductions || useOrderedReductions(RdxDesc) || 5947 TTI.preferInLoopReduction(RdxDesc.getOpcode(), 5948 RdxDesc.getRecurrenceType(), 5949 TargetTransformInfo::ReductionFlags())) 5950 continue; 5951 T = RdxDesc.getRecurrenceType(); 5952 } 5953 5954 // Examine the stored values. 
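      // For example (hypothetical IR): for `store i16 %v, i16* %p` the
      // element type recorded below is i16, the type of the stored value,
      // rather than the store's own (void) result type.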
5955 if (auto *ST = dyn_cast<StoreInst>(&I)) 5956 T = ST->getValueOperand()->getType(); 5957 5958 assert(T->isSized() && 5959 "Expected the load/store/recurrence type to be sized"); 5960 5961 ElementTypesInLoop.insert(T); 5962 } 5963 } 5964 } 5965 5966 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, 5967 unsigned LoopCost) { 5968 // -- The interleave heuristics -- 5969 // We interleave the loop in order to expose ILP and reduce the loop overhead. 5970 // There are many micro-architectural considerations that we can't predict 5971 // at this level. For example, frontend pressure (on decode or fetch) due to 5972 // code size, or the number and capabilities of the execution ports. 5973 // 5974 // We use the following heuristics to select the interleave count: 5975 // 1. If the code has reductions, then we interleave to break the cross 5976 // iteration dependency. 5977 // 2. If the loop is really small, then we interleave to reduce the loop 5978 // overhead. 5979 // 3. We don't interleave if we think that we will spill registers to memory 5980 // due to the increased register pressure. 5981 5982 if (!isScalarEpilogueAllowed()) 5983 return 1; 5984 5985 // We used the distance for the interleave count. 5986 if (Legal->getMaxSafeDepDistBytes() != -1U) 5987 return 1; 5988 5989 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); 5990 const bool HasReductions = !Legal->getReductionVars().empty(); 5991 // Do not interleave loops with a relatively small known or estimated trip 5992 // count. But we will interleave when InterleaveSmallLoopScalarReduction is 5993 // enabled, and the code has scalar reductions(HasReductions && VF = 1), 5994 // because with the above conditions interleaving can expose ILP and break 5995 // cross iteration dependences for reductions. 5996 if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) && 5997 !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar())) 5998 return 1; 5999 6000 RegisterUsage R = calculateRegisterUsage({VF})[0]; 6001 // We divide by these constants so assume that we have at least one 6002 // instruction that uses at least one register. 6003 for (auto& pair : R.MaxLocalUsers) { 6004 pair.second = std::max(pair.second, 1U); 6005 } 6006 6007 // We calculate the interleave count using the following formula. 6008 // Subtract the number of loop invariants from the number of available 6009 // registers. These registers are used by all of the interleaved instances. 6010 // Next, divide the remaining registers by the number of registers that is 6011 // required by the loop, in order to estimate how many parallel instances 6012 // fit without causing spills. All of this is rounded down if necessary to be 6013 // a power of two. We want power of two interleave count to simplify any 6014 // addressing operations or alignment considerations. 6015 // We also want power of two interleave counts to ensure that the induction 6016 // variable of the vector loop wraps to zero, when tail is folded by masking; 6017 // this currently happens when OptForSize, in which case IC is set to 1 above. 
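  // A worked example of the formula below (register counts are hypothetical):
  // with 32 vector registers, 2 of them tied up by loop-invariant values and
  // a maximum local usage of 6 registers, the base estimate is
  // PowerOf2Floor((32 - 2) / 6) = PowerOf2Floor(5) = 4 interleaved instances;
  // the induction-variable heuristic below discounts one register and the
  // induction variable itself in the same way.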
6018 unsigned IC = UINT_MAX; 6019 6020 for (auto& pair : R.MaxLocalUsers) { 6021 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 6022 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters 6023 << " registers of " 6024 << TTI.getRegisterClassName(pair.first) << " register class\n"); 6025 if (VF.isScalar()) { 6026 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) 6027 TargetNumRegisters = ForceTargetNumScalarRegs; 6028 } else { 6029 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) 6030 TargetNumRegisters = ForceTargetNumVectorRegs; 6031 } 6032 unsigned MaxLocalUsers = pair.second; 6033 unsigned LoopInvariantRegs = 0; 6034 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) 6035 LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; 6036 6037 unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers); 6038 // Don't count the induction variable as interleaved. 6039 if (EnableIndVarRegisterHeur) { 6040 TmpIC = 6041 PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) / 6042 std::max(1U, (MaxLocalUsers - 1))); 6043 } 6044 6045 IC = std::min(IC, TmpIC); 6046 } 6047 6048 // Clamp the interleave ranges to reasonable counts. 6049 unsigned MaxInterleaveCount = 6050 TTI.getMaxInterleaveFactor(VF.getKnownMinValue()); 6051 6052 // Check if the user has overridden the max. 6053 if (VF.isScalar()) { 6054 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) 6055 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; 6056 } else { 6057 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) 6058 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; 6059 } 6060 6061 // If trip count is known or estimated compile time constant, limit the 6062 // interleave count to be less than the trip count divided by VF, provided it 6063 // is at least 1. 6064 // 6065 // For scalable vectors we can't know if interleaving is beneficial. It may 6066 // not be beneficial for small loops if none of the lanes in the second vector 6067 // iterations is enabled. However, for larger loops, there is likely to be a 6068 // similar benefit as for fixed-width vectors. For now, we choose to leave 6069 // the InterleaveCount as if vscale is '1', although if some information about 6070 // the vector is known (e.g. min vector size), we can make a better decision. 6071 if (BestKnownTC) { 6072 MaxInterleaveCount = 6073 std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount); 6074 // Make sure MaxInterleaveCount is greater than 0. 6075 MaxInterleaveCount = std::max(1u, MaxInterleaveCount); 6076 } 6077 6078 assert(MaxInterleaveCount > 0 && 6079 "Maximum interleave count must be greater than 0"); 6080 6081 // Clamp the calculated IC to be between the 1 and the max interleave count 6082 // that the target and trip count allows. 6083 if (IC > MaxInterleaveCount) 6084 IC = MaxInterleaveCount; 6085 else 6086 // Make sure IC is greater than 0. 6087 IC = std::max(1u, IC); 6088 6089 assert(IC > 0 && "Interleave count must be greater than 0."); 6090 6091 // If we did not calculate the cost for VF (because the user selected the VF) 6092 // then we calculate the cost of VF here. 
6093 if (LoopCost == 0) { 6094 InstructionCost C = expectedCost(VF).first; 6095 assert(C.isValid() && "Expected to have chosen a VF with valid cost"); 6096 LoopCost = *C.getValue(); 6097 } 6098 6099 assert(LoopCost && "Non-zero loop cost expected"); 6100 6101 // Interleave if we vectorized this loop and there is a reduction that could 6102 // benefit from interleaving. 6103 if (VF.isVector() && HasReductions) { 6104 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); 6105 return IC; 6106 } 6107 6108 // For any scalar loop that either requires runtime checks or predication we 6109 // are better off leaving this to the unroller. Note that if we've already 6110 // vectorized the loop we will have done the runtime check and so interleaving 6111 // won't require further checks. 6112 bool ScalarInterleavingRequiresPredication = 6113 (VF.isScalar() && any_of(TheLoop->blocks(), [this](BasicBlock *BB) { 6114 return Legal->blockNeedsPredication(BB); 6115 })); 6116 bool ScalarInterleavingRequiresRuntimePointerCheck = 6117 (VF.isScalar() && Legal->getRuntimePointerChecking()->Need); 6118 6119 // We want to interleave small loops in order to reduce the loop overhead and 6120 // potentially expose ILP opportunities. 6121 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n' 6122 << "LV: IC is " << IC << '\n' 6123 << "LV: VF is " << VF << '\n'); 6124 const bool AggressivelyInterleaveReductions = 6125 TTI.enableAggressiveInterleaving(HasReductions); 6126 if (!ScalarInterleavingRequiresRuntimePointerCheck && 6127 !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) { 6128 // We assume that the cost overhead is 1 and we use the cost model 6129 // to estimate the cost of the loop and interleave until the cost of the 6130 // loop overhead is about 5% of the cost of the loop. 6131 unsigned SmallIC = 6132 std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost)); 6133 6134 // Interleave until store/load ports (estimated by max interleave count) are 6135 // saturated. 6136 unsigned NumStores = Legal->getNumStores(); 6137 unsigned NumLoads = Legal->getNumLoads(); 6138 unsigned StoresIC = IC / (NumStores ? NumStores : 1); 6139 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1); 6140 6141 // There is little point in interleaving for reductions containing selects 6142 // and compares when VF=1 since it may just create more overhead than it's 6143 // worth for loops with small trip counts. This is because we still have to 6144 // do the final reduction after the loop. 6145 bool HasSelectCmpReductions = 6146 HasReductions && 6147 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 6148 const RecurrenceDescriptor &RdxDesc = Reduction.second; 6149 return RecurrenceDescriptor::isSelectCmpRecurrenceKind( 6150 RdxDesc.getRecurrenceKind()); 6151 }); 6152 if (HasSelectCmpReductions) { 6153 LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n"); 6154 return 1; 6155 } 6156 6157 // If we have a scalar reduction (vector reductions are already dealt with 6158 // by this point), we can increase the critical path length if the loop 6159 // we're interleaving is inside another loop. For tree-wise reductions 6160 // set the limit to 2, and for ordered reductions it's best to disable 6161 // interleaving entirely. 
6162 if (HasReductions && TheLoop->getLoopDepth() > 1) { 6163 bool HasOrderedReductions = 6164 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 6165 const RecurrenceDescriptor &RdxDesc = Reduction.second; 6166 return RdxDesc.isOrdered(); 6167 }); 6168 if (HasOrderedReductions) { 6169 LLVM_DEBUG( 6170 dbgs() << "LV: Not interleaving scalar ordered reductions.\n"); 6171 return 1; 6172 } 6173 6174 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC); 6175 SmallIC = std::min(SmallIC, F); 6176 StoresIC = std::min(StoresIC, F); 6177 LoadsIC = std::min(LoadsIC, F); 6178 } 6179 6180 if (EnableLoadStoreRuntimeInterleave && 6181 std::max(StoresIC, LoadsIC) > SmallIC) { 6182 LLVM_DEBUG( 6183 dbgs() << "LV: Interleaving to saturate store or load ports.\n"); 6184 return std::max(StoresIC, LoadsIC); 6185 } 6186 6187 // If there are scalar reductions and TTI has enabled aggressive 6188 // interleaving for reductions, we will interleave to expose ILP. 6189 if (InterleaveSmallLoopScalarReduction && VF.isScalar() && 6190 AggressivelyInterleaveReductions) { 6191 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 6192 // Interleave no less than SmallIC but not as aggressive as the normal IC 6193 // to satisfy the rare situation when resources are too limited. 6194 return std::max(IC / 2, SmallIC); 6195 } else { 6196 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n"); 6197 return SmallIC; 6198 } 6199 } 6200 6201 // Interleave if this is a large loop (small loops are already dealt with by 6202 // this point) that could benefit from interleaving. 6203 if (AggressivelyInterleaveReductions) { 6204 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 6205 return IC; 6206 } 6207 6208 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n"); 6209 return 1; 6210 } 6211 6212 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8> 6213 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) { 6214 // This function calculates the register usage by measuring the highest number 6215 // of values that are alive at a single location. Obviously, this is a very 6216 // rough estimation. We scan the loop in a topological order in order and 6217 // assign a number to each instruction. We use RPO to ensure that defs are 6218 // met before their users. We assume that each instruction that has in-loop 6219 // users starts an interval. We record every time that an in-loop value is 6220 // used, so we have a list of the first and last occurrences of each 6221 // instruction. Next, we transpose this data structure into a multi map that 6222 // holds the list of intervals that *end* at a specific location. This multi 6223 // map allows us to perform a linear search. We scan the instructions linearly 6224 // and record each time that a new interval starts, by placing it in a set. 6225 // If we find this value in the multi-map then we remove it from the set. 6226 // The max register usage is the maximum size of the set. 6227 // We also search for instructions that are defined outside the loop, but are 6228 // used inside the loop. We need this number separately from the max-interval 6229 // usage number because when we unroll, loop-invariant values do not take 6230 // more register. 6231 LoopBlocksDFS DFS(TheLoop); 6232 DFS.perform(LI); 6233 6234 RegisterUsage RU; 6235 6236 // Each 'key' in the map opens a new interval. The values 6237 // of the map are the index of the 'last seen' usage of the 6238 // instruction that is the key. 
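  // For instance (hypothetical indices): if the instruction at index 2 is
  // last used by the instruction at index 7, EndPoint records 7 for it and
  // its value is treated as live across the whole range [2, 7].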
6239 using IntervalMap = DenseMap<Instruction *, unsigned>; 6240 6241 // Maps instruction to its index. 6242 SmallVector<Instruction *, 64> IdxToInstr; 6243 // Marks the end of each interval. 6244 IntervalMap EndPoint; 6245 // Saves the list of instruction indices that are used in the loop. 6246 SmallPtrSet<Instruction *, 8> Ends; 6247 // Saves the list of values that are used in the loop but are 6248 // defined outside the loop, such as arguments and constants. 6249 SmallPtrSet<Value *, 8> LoopInvariants; 6250 6251 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 6252 for (Instruction &I : BB->instructionsWithoutDebug()) { 6253 IdxToInstr.push_back(&I); 6254 6255 // Save the end location of each USE. 6256 for (Value *U : I.operands()) { 6257 auto *Instr = dyn_cast<Instruction>(U); 6258 6259 // Ignore non-instruction values such as arguments, constants, etc. 6260 if (!Instr) 6261 continue; 6262 6263 // If this instruction is outside the loop then record it and continue. 6264 if (!TheLoop->contains(Instr)) { 6265 LoopInvariants.insert(Instr); 6266 continue; 6267 } 6268 6269 // Overwrite previous end points. 6270 EndPoint[Instr] = IdxToInstr.size(); 6271 Ends.insert(Instr); 6272 } 6273 } 6274 } 6275 6276 // Saves the list of intervals that end with the index in 'key'. 6277 using InstrList = SmallVector<Instruction *, 2>; 6278 DenseMap<unsigned, InstrList> TransposeEnds; 6279 6280 // Transpose the EndPoints to a list of values that end at each index. 6281 for (auto &Interval : EndPoint) 6282 TransposeEnds[Interval.second].push_back(Interval.first); 6283 6284 SmallPtrSet<Instruction *, 8> OpenIntervals; 6285 SmallVector<RegisterUsage, 8> RUs(VFs.size()); 6286 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size()); 6287 6288 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); 6289 6290 // A lambda that gets the register usage for the given type and VF. 6291 const auto &TTICapture = TTI; 6292 auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned { 6293 if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty)) 6294 return 0; 6295 InstructionCost::CostType RegUsage = 6296 *TTICapture.getRegUsageForType(VectorType::get(Ty, VF)).getValue(); 6297 assert(RegUsage >= 0 && RegUsage <= std::numeric_limits<unsigned>::max() && 6298 "Nonsensical values for register usage."); 6299 return RegUsage; 6300 }; 6301 6302 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { 6303 Instruction *I = IdxToInstr[i]; 6304 6305 // Remove all of the instructions that end at this location. 6306 InstrList &List = TransposeEnds[i]; 6307 for (Instruction *ToRemove : List) 6308 OpenIntervals.erase(ToRemove); 6309 6310 // Ignore instructions that are never used within the loop. 6311 if (!Ends.count(I)) 6312 continue; 6313 6314 // Skip ignored values. 6315 if (ValuesToIgnore.count(I)) 6316 continue; 6317 6318 // For each VF find the maximum usage of registers. 6319 for (unsigned j = 0, e = VFs.size(); j < e; ++j) { 6320 // Count the number of live intervals. 6321 SmallMapVector<unsigned, unsigned, 4> RegUsage; 6322 6323 if (VFs[j].isScalar()) { 6324 for (auto Inst : OpenIntervals) { 6325 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6326 if (RegUsage.find(ClassID) == RegUsage.end()) 6327 RegUsage[ClassID] = 1; 6328 else 6329 RegUsage[ClassID] += 1; 6330 } 6331 } else { 6332 collectUniformsAndScalars(VFs[j]); 6333 for (auto Inst : OpenIntervals) { 6334 // Skip ignored values for VF > 1. 
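          // For the counting below (hypothetical target): with 128-bit vector
          // registers, a value widened to <8 x i32> reports a usage of 2
          // registers for its class, while a value that remains scalar after
          // vectorization contributes a single register to the scalar class.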
6335 if (VecValuesToIgnore.count(Inst)) 6336 continue; 6337 if (isScalarAfterVectorization(Inst, VFs[j])) { 6338 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6339 if (RegUsage.find(ClassID) == RegUsage.end()) 6340 RegUsage[ClassID] = 1; 6341 else 6342 RegUsage[ClassID] += 1; 6343 } else { 6344 unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType()); 6345 if (RegUsage.find(ClassID) == RegUsage.end()) 6346 RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]); 6347 else 6348 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); 6349 } 6350 } 6351 } 6352 6353 for (auto& pair : RegUsage) { 6354 if (MaxUsages[j].find(pair.first) != MaxUsages[j].end()) 6355 MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second); 6356 else 6357 MaxUsages[j][pair.first] = pair.second; 6358 } 6359 } 6360 6361 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " 6362 << OpenIntervals.size() << '\n'); 6363 6364 // Add the current instruction to the list of open intervals. 6365 OpenIntervals.insert(I); 6366 } 6367 6368 for (unsigned i = 0, e = VFs.size(); i < e; ++i) { 6369 SmallMapVector<unsigned, unsigned, 4> Invariant; 6370 6371 for (auto Inst : LoopInvariants) { 6372 unsigned Usage = 6373 VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]); 6374 unsigned ClassID = 6375 TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType()); 6376 if (Invariant.find(ClassID) == Invariant.end()) 6377 Invariant[ClassID] = Usage; 6378 else 6379 Invariant[ClassID] += Usage; 6380 } 6381 6382 LLVM_DEBUG({ 6383 dbgs() << "LV(REG): VF = " << VFs[i] << '\n'; 6384 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size() 6385 << " item\n"; 6386 for (const auto &pair : MaxUsages[i]) { 6387 dbgs() << "LV(REG): RegisterClass: " 6388 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6389 << " registers\n"; 6390 } 6391 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size() 6392 << " item\n"; 6393 for (const auto &pair : Invariant) { 6394 dbgs() << "LV(REG): RegisterClass: " 6395 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6396 << " registers\n"; 6397 } 6398 }); 6399 6400 RU.LoopInvariantRegs = Invariant; 6401 RU.MaxLocalUsers = MaxUsages[i]; 6402 RUs[i] = RU; 6403 } 6404 6405 return RUs; 6406 } 6407 6408 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I, 6409 ElementCount VF) { 6410 // TODO: Cost model for emulated masked load/store is completely 6411 // broken. This hack guides the cost model to use an artificially 6412 // high enough value to practically disable vectorization with such 6413 // operations, except where previously deployed legality hack allowed 6414 // using very low cost values. This is to avoid regressions coming simply 6415 // from moving "masked load/store" check from legality to cost model. 6416 // Masked Load/Gather emulation was previously never allowed. 6417 // Limited number of Masked Store/Scatter emulation was allowed. 6418 assert(isPredicatedInst(I, VF) && "Expecting a scalar emulated instruction"); 6419 return isa<LoadInst>(I) || 6420 (isa<StoreInst>(I) && 6421 NumPredStores > NumberOfStoresToPredicate); 6422 } 6423 6424 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) { 6425 // If we aren't vectorizing the loop, or if we've already collected the 6426 // instructions to scalarize, there's nothing to do. 
  // Collection may already have occurred if we have a user-selected VF and we
  // are now computing the expected cost for interleaving.
  if (VF.isScalar() || VF.isZero() ||
      InstsToScalarize.find(VF) != InstsToScalarize.end())
    return;

  // Initialize a mapping for VF in InstsToScalarize. If we find that it's
  // not profitable to scalarize any instructions, the presence of VF in the
  // map will indicate that we've analyzed it already.
  ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];

  // Find all the instructions that are scalar with predication in the loop and
  // determine if it would be better to not if-convert the blocks they are in.
  // If so, we also record the instructions to scalarize.
  for (BasicBlock *BB : TheLoop->blocks()) {
    if (!blockNeedsPredicationForAnyReason(BB))
      continue;
    for (Instruction &I : *BB)
      if (isScalarWithPredication(&I, VF)) {
        ScalarCostsTy ScalarCosts;
        // Do not apply the discount if the VF is scalable, because that would
        // lead to invalid scalarization costs.
        // Do not apply the discount logic if the hacked cost is needed
        // for emulated masked memrefs.
        if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I, VF) &&
            computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
          ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
        // Remember that BB will remain after vectorization.
        PredicatedBBsAfterVectorization.insert(BB);
      }
  }
}

int LoopVectorizationCostModel::computePredInstDiscount(
    Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
  assert(!isUniformAfterVectorization(PredInst, VF) &&
         "Instruction marked uniform-after-vectorization will be predicated");

  // Initialize the discount to zero, meaning that the scalar version and the
  // vector version cost the same.
  InstructionCost Discount = 0;

  // Holds instructions to analyze. The instructions we visit are mapped in
  // ScalarCosts. Those instructions are the ones that would be scalarized if
  // we find that the scalar version costs less.
  SmallVector<Instruction *, 8> Worklist;

  // Returns true if the given instruction can be scalarized.
  auto canBeScalarized = [&](Instruction *I) -> bool {
    // We only attempt to scalarize instructions forming a single-use chain
    // from the original predicated block that would otherwise be vectorized.
    // Although not strictly necessary, we give up on instructions we know will
    // already be scalar to avoid traversing chains that are unlikely to be
    // beneficial.
    if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
        isScalarAfterVectorization(I, VF))
      return false;

    // If the instruction is scalar with predication, it will be analyzed
    // separately. We ignore it within the context of PredInst.
    if (isScalarWithPredication(I, VF))
      return false;

    // If any of the instruction's operands are uniform after vectorization,
    // the instruction cannot be scalarized. This prevents, for example, a
    // masked load from being scalarized.
    //
    // We assume we will only emit a value for lane zero of an instruction
    // marked uniform after vectorization, rather than VF identical values.
    // Thus, if we scalarize an instruction that uses a uniform, we would
    // create uses of values corresponding to the lanes we aren't emitting code
    // for.
This behavior can be changed by allowing getScalarValue to clone 6499 // the lane zero values for uniforms rather than asserting. 6500 for (Use &U : I->operands()) 6501 if (auto *J = dyn_cast<Instruction>(U.get())) 6502 if (isUniformAfterVectorization(J, VF)) 6503 return false; 6504 6505 // Otherwise, we can scalarize the instruction. 6506 return true; 6507 }; 6508 6509 // Compute the expected cost discount from scalarizing the entire expression 6510 // feeding the predicated instruction. We currently only consider expressions 6511 // that are single-use instruction chains. 6512 Worklist.push_back(PredInst); 6513 while (!Worklist.empty()) { 6514 Instruction *I = Worklist.pop_back_val(); 6515 6516 // If we've already analyzed the instruction, there's nothing to do. 6517 if (ScalarCosts.find(I) != ScalarCosts.end()) 6518 continue; 6519 6520 // Compute the cost of the vector instruction. Note that this cost already 6521 // includes the scalarization overhead of the predicated instruction. 6522 InstructionCost VectorCost = getInstructionCost(I, VF).first; 6523 6524 // Compute the cost of the scalarized instruction. This cost is the cost of 6525 // the instruction as if it wasn't if-converted and instead remained in the 6526 // predicated block. We will scale this cost by block probability after 6527 // computing the scalarization overhead. 6528 InstructionCost ScalarCost = 6529 VF.getFixedValue() * 6530 getInstructionCost(I, ElementCount::getFixed(1)).first; 6531 6532 // Compute the scalarization overhead of needed insertelement instructions 6533 // and phi nodes. 6534 if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) { 6535 ScalarCost += TTI.getScalarizationOverhead( 6536 cast<VectorType>(ToVectorTy(I->getType(), VF)), 6537 APInt::getAllOnes(VF.getFixedValue()), true, false); 6538 ScalarCost += 6539 VF.getFixedValue() * 6540 TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput); 6541 } 6542 6543 // Compute the scalarization overhead of needed extractelement 6544 // instructions. For each of the instruction's operands, if the operand can 6545 // be scalarized, add it to the worklist; otherwise, account for the 6546 // overhead. 6547 for (Use &U : I->operands()) 6548 if (auto *J = dyn_cast<Instruction>(U.get())) { 6549 assert(VectorType::isValidElementType(J->getType()) && 6550 "Instruction has non-scalar type"); 6551 if (canBeScalarized(J)) 6552 Worklist.push_back(J); 6553 else if (needsExtract(J, VF)) { 6554 ScalarCost += TTI.getScalarizationOverhead( 6555 cast<VectorType>(ToVectorTy(J->getType(), VF)), 6556 APInt::getAllOnes(VF.getFixedValue()), false, true); 6557 } 6558 } 6559 6560 // Scale the total scalar cost by block probability. 6561 ScalarCost /= getReciprocalPredBlockProb(); 6562 6563 // Compute the discount. A non-negative discount means the vector version 6564 // of the instruction costs more, and scalarizing would be beneficial. 6565 Discount += VectorCost - ScalarCost; 6566 ScalarCosts[I] = ScalarCost; 6567 } 6568 6569 return *Discount.getValue(); 6570 } 6571 6572 LoopVectorizationCostModel::VectorizationCostTy 6573 LoopVectorizationCostModel::expectedCost( 6574 ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) { 6575 VectorizationCostTy Cost; 6576 6577 // For each block. 6578 for (BasicBlock *BB : TheLoop->blocks()) { 6579 VectorizationCostTy BlockCost; 6580 6581 // For each instruction in the old loop. 6582 for (Instruction &I : BB->instructionsWithoutDebug()) { 6583 // Skip ignored values. 
6584 if (ValuesToIgnore.count(&I) || 6585 (VF.isVector() && VecValuesToIgnore.count(&I))) 6586 continue; 6587 6588 VectorizationCostTy C = getInstructionCost(&I, VF); 6589 6590 // Check if we should override the cost. 6591 if (C.first.isValid() && 6592 ForceTargetInstructionCost.getNumOccurrences() > 0) 6593 C.first = InstructionCost(ForceTargetInstructionCost); 6594 6595 // Keep a list of instructions with invalid costs. 6596 if (Invalid && !C.first.isValid()) 6597 Invalid->emplace_back(&I, VF); 6598 6599 BlockCost.first += C.first; 6600 BlockCost.second |= C.second; 6601 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first 6602 << " for VF " << VF << " For instruction: " << I 6603 << '\n'); 6604 } 6605 6606 // If we are vectorizing a predicated block, it will have been 6607 // if-converted. This means that the block's instructions (aside from 6608 // stores and instructions that may divide by zero) will now be 6609 // unconditionally executed. For the scalar case, we may not always execute 6610 // the predicated block, if it is an if-else block. Thus, scale the block's 6611 // cost by the probability of executing it. blockNeedsPredication from 6612 // Legal is used so as to not include all blocks in tail folded loops. 6613 if (VF.isScalar() && Legal->blockNeedsPredication(BB)) 6614 BlockCost.first /= getReciprocalPredBlockProb(); 6615 6616 Cost.first += BlockCost.first; 6617 Cost.second |= BlockCost.second; 6618 } 6619 6620 return Cost; 6621 } 6622 6623 /// Gets Address Access SCEV after verifying that the access pattern 6624 /// is loop invariant except the induction variable dependence. 6625 /// 6626 /// This SCEV can be sent to the Target in order to estimate the address 6627 /// calculation cost. 6628 static const SCEV *getAddressAccessSCEV( 6629 Value *Ptr, 6630 LoopVectorizationLegality *Legal, 6631 PredicatedScalarEvolution &PSE, 6632 const Loop *TheLoop) { 6633 6634 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); 6635 if (!Gep) 6636 return nullptr; 6637 6638 // We are looking for a gep with all loop invariant indices except for one 6639 // which should be an induction variable. 6640 auto SE = PSE.getSE(); 6641 unsigned NumOperands = Gep->getNumOperands(); 6642 for (unsigned i = 1; i < NumOperands; ++i) { 6643 Value *Opd = Gep->getOperand(i); 6644 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 6645 !Legal->isInductionVariable(Opd)) 6646 return nullptr; 6647 } 6648 6649 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. 6650 return PSE.getSCEV(Ptr); 6651 } 6652 6653 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { 6654 return Legal->hasStride(I->getOperand(0)) || 6655 Legal->hasStride(I->getOperand(1)); 6656 } 6657 6658 InstructionCost 6659 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, 6660 ElementCount VF) { 6661 assert(VF.isVector() && 6662 "Scalarization cost of instruction implies vectorization."); 6663 if (VF.isScalable()) 6664 return InstructionCost::getInvalid(); 6665 6666 Type *ValTy = getLoadStoreType(I); 6667 auto SE = PSE.getSE(); 6668 6669 unsigned AS = getLoadStoreAddressSpace(I); 6670 Value *Ptr = getLoadStorePointerOperand(I); 6671 Type *PtrTy = ToVectorTy(Ptr->getType(), VF); 6672 // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost` 6673 // that it is being called from this specific place. 
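  // Rough illustration of the sums below (costs are hypothetical): at VF=4 a
  // scalarized i32 load pays 4 address computations plus 4 scalar load costs
  // plus the extract/insert overhead; if the access is predicated, that total
  // is scaled down by the probability of executing the predicated block and
  // an i1 extract per lane plus a branch cost are added on top.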
6674 6675 // Figure out whether the access is strided and get the stride value 6676 // if it's known in compile time 6677 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); 6678 6679 // Get the cost of the scalar memory instruction and address computation. 6680 InstructionCost Cost = 6681 VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); 6682 6683 // Don't pass *I here, since it is scalar but will actually be part of a 6684 // vectorized loop where the user of it is a vectorized instruction. 6685 const Align Alignment = getLoadStoreAlignment(I); 6686 Cost += VF.getKnownMinValue() * 6687 TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, 6688 AS, TTI::TCK_RecipThroughput); 6689 6690 // Get the overhead of the extractelement and insertelement instructions 6691 // we might create due to scalarization. 6692 Cost += getScalarizationOverhead(I, VF); 6693 6694 // If we have a predicated load/store, it will need extra i1 extracts and 6695 // conditional branches, but may not be executed for each vector lane. Scale 6696 // the cost by the probability of executing the predicated block. 6697 if (isPredicatedInst(I, VF)) { 6698 Cost /= getReciprocalPredBlockProb(); 6699 6700 // Add the cost of an i1 extract and a branch 6701 auto *Vec_i1Ty = 6702 VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF); 6703 Cost += TTI.getScalarizationOverhead( 6704 Vec_i1Ty, APInt::getAllOnes(VF.getKnownMinValue()), 6705 /*Insert=*/false, /*Extract=*/true); 6706 Cost += TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput); 6707 6708 if (useEmulatedMaskMemRefHack(I, VF)) 6709 // Artificially setting to a high enough value to practically disable 6710 // vectorization with such operations. 6711 Cost = 3000000; 6712 } 6713 6714 return Cost; 6715 } 6716 6717 InstructionCost 6718 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, 6719 ElementCount VF) { 6720 Type *ValTy = getLoadStoreType(I); 6721 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6722 Value *Ptr = getLoadStorePointerOperand(I); 6723 unsigned AS = getLoadStoreAddressSpace(I); 6724 int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr); 6725 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6726 6727 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6728 "Stride should be 1 or -1 for consecutive memory access"); 6729 const Align Alignment = getLoadStoreAlignment(I); 6730 InstructionCost Cost = 0; 6731 if (Legal->isMaskRequired(I)) 6732 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6733 CostKind); 6734 else 6735 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6736 CostKind, I); 6737 6738 bool Reverse = ConsecutiveStride < 0; 6739 if (Reverse) 6740 Cost += 6741 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0); 6742 return Cost; 6743 } 6744 6745 InstructionCost 6746 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, 6747 ElementCount VF) { 6748 assert(Legal->isUniformMemOp(*I)); 6749 6750 Type *ValTy = getLoadStoreType(I); 6751 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6752 const Align Alignment = getLoadStoreAlignment(I); 6753 unsigned AS = getLoadStoreAddressSpace(I); 6754 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6755 if (isa<LoadInst>(I)) { 6756 return TTI.getAddressComputationCost(ValTy) + 6757 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS, 6758 CostKind) + 6759 
TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); 6760 } 6761 StoreInst *SI = cast<StoreInst>(I); 6762 6763 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); 6764 return TTI.getAddressComputationCost(ValTy) + 6765 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, 6766 CostKind) + 6767 (isLoopInvariantStoreValue 6768 ? 0 6769 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, 6770 VF.getKnownMinValue() - 1)); 6771 } 6772 6773 InstructionCost 6774 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, 6775 ElementCount VF) { 6776 Type *ValTy = getLoadStoreType(I); 6777 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6778 const Align Alignment = getLoadStoreAlignment(I); 6779 const Value *Ptr = getLoadStorePointerOperand(I); 6780 6781 return TTI.getAddressComputationCost(VectorTy) + 6782 TTI.getGatherScatterOpCost( 6783 I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment, 6784 TargetTransformInfo::TCK_RecipThroughput, I); 6785 } 6786 6787 InstructionCost 6788 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, 6789 ElementCount VF) { 6790 // TODO: Once we have support for interleaving with scalable vectors 6791 // we can calculate the cost properly here. 6792 if (VF.isScalable()) 6793 return InstructionCost::getInvalid(); 6794 6795 Type *ValTy = getLoadStoreType(I); 6796 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6797 unsigned AS = getLoadStoreAddressSpace(I); 6798 6799 auto Group = getInterleavedAccessGroup(I); 6800 assert(Group && "Fail to get an interleaved access group."); 6801 6802 unsigned InterleaveFactor = Group->getFactor(); 6803 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); 6804 6805 // Holds the indices of existing members in the interleaved group. 6806 SmallVector<unsigned, 4> Indices; 6807 for (unsigned IF = 0; IF < InterleaveFactor; IF++) 6808 if (Group->getMember(IF)) 6809 Indices.push_back(IF); 6810 6811 // Calculate the cost of the whole interleaved group. 6812 bool UseMaskForGaps = 6813 (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) || 6814 (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor())); 6815 InstructionCost Cost = TTI.getInterleavedMemoryOpCost( 6816 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(), 6817 AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps); 6818 6819 if (Group->isReverse()) { 6820 // TODO: Add support for reversed masked interleaved access. 6821 assert(!Legal->isMaskRequired(I) && 6822 "Reverse masked interleaved access not supported."); 6823 Cost += 6824 Group->getNumMembers() * 6825 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0); 6826 } 6827 return Cost; 6828 } 6829 6830 Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost( 6831 Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) { 6832 using namespace llvm::PatternMatch; 6833 // Early exit for no inloop reductions 6834 if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty)) 6835 return None; 6836 auto *VectorTy = cast<VectorType>(Ty); 6837 6838 // We are looking for a pattern of, and finding the minimal acceptable cost: 6839 // reduce(mul(ext(A), ext(B))) or 6840 // reduce(mul(A, B)) or 6841 // reduce(ext(A)) or 6842 // reduce(A). 6843 // The basic idea is that we walk down the tree to do that, finding the root 6844 // reduction instruction in InLoopReductionImmediateChains. 
// From there we find the pattern of mul/ext and test the cost of the entire
// pattern vs the cost of the components. If the reduction cost is lower, then
// we return it for the reduction instruction and 0 for the other instructions
// in the pattern. If it is not, we return an invalid cost specifying the
// original cost method should be used.
  Instruction *RetI = I;
  if (match(RetI, m_ZExtOrSExt(m_Value()))) {
    if (!RetI->hasOneUser())
      return None;
    RetI = RetI->user_back();
  }
  if (match(RetI, m_Mul(m_Value(), m_Value())) &&
      RetI->user_back()->getOpcode() == Instruction::Add) {
    if (!RetI->hasOneUser())
      return None;
    RetI = RetI->user_back();
  }

  // Test if the found instruction is a reduction, and if not return an invalid
  // cost specifying the parent to use the original cost modelling.
  if (!InLoopReductionImmediateChains.count(RetI))
    return None;

  // Find the reduction this chain is a part of and calculate the basic cost of
  // the reduction on its own.
  Instruction *LastChain = InLoopReductionImmediateChains[RetI];
  Instruction *ReductionPhi = LastChain;
  while (!isa<PHINode>(ReductionPhi))
    ReductionPhi = InLoopReductionImmediateChains[ReductionPhi];

  const RecurrenceDescriptor &RdxDesc =
      Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second;

  InstructionCost BaseCost = TTI.getArithmeticReductionCost(
      RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind);

  // For a call to the llvm.fmuladd intrinsic we need to add the cost of a
  // normal fmul instruction to the cost of the fadd reduction.
  if (RdxDesc.getRecurrenceKind() == RecurKind::FMulAdd)
    BaseCost +=
        TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind);

  // If we're using ordered reductions then we can just return the base cost
  // here, since getArithmeticReductionCost calculates the full ordered
  // reduction cost when FP reassociation is not allowed.
  if (useOrderedReductions(RdxDesc))
    return BaseCost;

  // Get the operand that was not the reduction chain and match it to one of
  // the patterns, returning the better cost if it is found.
  Instruction *RedOp = RetI->getOperand(1) == LastChain
                           ? dyn_cast<Instruction>(RetI->getOperand(0))
                           : dyn_cast<Instruction>(RetI->getOperand(1));

  VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy);

  Instruction *Op0, *Op1;
  if (RedOp &&
      match(RedOp,
            m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) &&
      match(Op0, m_ZExtOrSExt(m_Value())) &&
      Op0->getOpcode() == Op1->getOpcode() &&
      Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
      !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) &&
      (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {

    // Matched reduce(ext(mul(ext(A), ext(B))))
    // Note that the extend opcodes need to all match, or if A==B they will
    // have been converted to zext(mul(sext(A), sext(A))) as it is known
    // positive, which is equally fine.
6915 bool IsUnsigned = isa<ZExtInst>(Op0); 6916 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy); 6917 auto *MulType = VectorType::get(Op0->getType(), VectorTy); 6918 6919 InstructionCost ExtCost = 6920 TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType, 6921 TTI::CastContextHint::None, CostKind, Op0); 6922 InstructionCost MulCost = 6923 TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind); 6924 InstructionCost Ext2Cost = 6925 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType, 6926 TTI::CastContextHint::None, CostKind, RedOp); 6927 6928 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 6929 /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 6930 CostKind); 6931 6932 if (RedCost.isValid() && 6933 RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost) 6934 return I == RetI ? RedCost : 0; 6935 } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) && 6936 !TheLoop->isLoopInvariant(RedOp)) { 6937 // Matched reduce(ext(A)) 6938 bool IsUnsigned = isa<ZExtInst>(RedOp); 6939 auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy); 6940 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 6941 /*IsMLA=*/false, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 6942 CostKind); 6943 6944 InstructionCost ExtCost = 6945 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType, 6946 TTI::CastContextHint::None, CostKind, RedOp); 6947 if (RedCost.isValid() && RedCost < BaseCost + ExtCost) 6948 return I == RetI ? RedCost : 0; 6949 } else if (RedOp && 6950 match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) { 6951 if (match(Op0, m_ZExtOrSExt(m_Value())) && 6952 Op0->getOpcode() == Op1->getOpcode() && 6953 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) { 6954 bool IsUnsigned = isa<ZExtInst>(Op0); 6955 Type *Op0Ty = Op0->getOperand(0)->getType(); 6956 Type *Op1Ty = Op1->getOperand(0)->getType(); 6957 Type *LargestOpTy = 6958 Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty 6959 : Op0Ty; 6960 auto *ExtType = VectorType::get(LargestOpTy, VectorTy); 6961 6962 // Matched reduce(mul(ext(A), ext(B))), where the two ext may be of 6963 // different sizes. We take the largest type as the ext to reduce, and add 6964 // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))). 6965 InstructionCost ExtCost0 = TTI.getCastInstrCost( 6966 Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy), 6967 TTI::CastContextHint::None, CostKind, Op0); 6968 InstructionCost ExtCost1 = TTI.getCastInstrCost( 6969 Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy), 6970 TTI::CastContextHint::None, CostKind, Op1); 6971 InstructionCost MulCost = 6972 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 6973 6974 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 6975 /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 6976 CostKind); 6977 InstructionCost ExtraExtCost = 0; 6978 if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) { 6979 Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1; 6980 ExtraExtCost = TTI.getCastInstrCost( 6981 ExtraExtOp->getOpcode(), ExtType, 6982 VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy), 6983 TTI::CastContextHint::None, CostKind, ExtraExtOp); 6984 } 6985 6986 if (RedCost.isValid() && 6987 (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost)) 6988 return I == RetI ? 
RedCost : 0; 6989 } else if (!match(I, m_ZExtOrSExt(m_Value()))) { 6990 // Matched reduce(mul()) 6991 InstructionCost MulCost = 6992 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 6993 6994 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 6995 /*IsMLA=*/true, true, RdxDesc.getRecurrenceType(), VectorTy, 6996 CostKind); 6997 6998 if (RedCost.isValid() && RedCost < MulCost + BaseCost) 6999 return I == RetI ? RedCost : 0; 7000 } 7001 } 7002 7003 return I == RetI ? Optional<InstructionCost>(BaseCost) : None; 7004 } 7005 7006 InstructionCost 7007 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, 7008 ElementCount VF) { 7009 // Calculate scalar cost only. Vectorization cost should be ready at this 7010 // moment. 7011 if (VF.isScalar()) { 7012 Type *ValTy = getLoadStoreType(I); 7013 const Align Alignment = getLoadStoreAlignment(I); 7014 unsigned AS = getLoadStoreAddressSpace(I); 7015 7016 return TTI.getAddressComputationCost(ValTy) + 7017 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, 7018 TTI::TCK_RecipThroughput, I); 7019 } 7020 return getWideningCost(I, VF); 7021 } 7022 7023 LoopVectorizationCostModel::VectorizationCostTy 7024 LoopVectorizationCostModel::getInstructionCost(Instruction *I, 7025 ElementCount VF) { 7026 // If we know that this instruction will remain uniform, check the cost of 7027 // the scalar version. 7028 if (isUniformAfterVectorization(I, VF)) 7029 VF = ElementCount::getFixed(1); 7030 7031 if (VF.isVector() && isProfitableToScalarize(I, VF)) 7032 return VectorizationCostTy(InstsToScalarize[VF][I], false); 7033 7034 // Forced scalars do not have any scalarization overhead. 7035 auto ForcedScalar = ForcedScalars.find(VF); 7036 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) { 7037 auto InstSet = ForcedScalar->second; 7038 if (InstSet.count(I)) 7039 return VectorizationCostTy( 7040 (getInstructionCost(I, ElementCount::getFixed(1)).first * 7041 VF.getKnownMinValue()), 7042 false); 7043 } 7044 7045 Type *VectorTy; 7046 InstructionCost C = getInstructionCost(I, VF, VectorTy); 7047 7048 bool TypeNotScalarized = false; 7049 if (VF.isVector() && VectorTy->isVectorTy()) { 7050 unsigned NumParts = TTI.getNumberOfParts(VectorTy); 7051 if (NumParts) 7052 TypeNotScalarized = NumParts < VF.getKnownMinValue(); 7053 else 7054 C = InstructionCost::getInvalid(); 7055 } 7056 return VectorizationCostTy(C, TypeNotScalarized); 7057 } 7058 7059 InstructionCost 7060 LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, 7061 ElementCount VF) const { 7062 7063 // There is no mechanism yet to create a scalable scalarization loop, 7064 // so this is currently Invalid. 7065 if (VF.isScalable()) 7066 return InstructionCost::getInvalid(); 7067 7068 if (VF.isScalar()) 7069 return 0; 7070 7071 InstructionCost Cost = 0; 7072 Type *RetTy = ToVectorTy(I->getType(), VF); 7073 if (!RetTy->isVoidTy() && 7074 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) 7075 Cost += TTI.getScalarizationOverhead( 7076 cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()), true, 7077 false); 7078 7079 // Some targets keep addresses scalar. 7080 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) 7081 return Cost; 7082 7083 // Some targets support efficient element stores. 7084 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore()) 7085 return Cost; 7086 7087 // Collect operands to consider. 7088 CallInst *CI = dyn_cast<CallInst>(I); 7089 Instruction::op_range Ops = CI ? 
CI->args() : I->operands(); 7090 7091 // Skip operands that do not require extraction/scalarization and do not incur 7092 // any overhead. 7093 SmallVector<Type *> Tys; 7094 for (auto *V : filterExtractingOperands(Ops, VF)) 7095 Tys.push_back(MaybeVectorizeType(V->getType(), VF)); 7096 return Cost + TTI.getOperandsScalarizationOverhead( 7097 filterExtractingOperands(Ops, VF), Tys); 7098 } 7099 7100 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { 7101 if (VF.isScalar()) 7102 return; 7103 NumPredStores = 0; 7104 for (BasicBlock *BB : TheLoop->blocks()) { 7105 // For each instruction in the old loop. 7106 for (Instruction &I : *BB) { 7107 Value *Ptr = getLoadStorePointerOperand(&I); 7108 if (!Ptr) 7109 continue; 7110 7111 // TODO: We should generate better code and update the cost model for 7112 // predicated uniform stores. Today they are treated as any other 7113 // predicated store (see added test cases in 7114 // invariant-store-vectorization.ll). 7115 if (isa<StoreInst>(&I) && isScalarWithPredication(&I, VF)) 7116 NumPredStores++; 7117 7118 if (Legal->isUniformMemOp(I)) { 7119 // TODO: Avoid replicating loads and stores instead of 7120 // relying on instcombine to remove them. 7121 // Load: Scalar load + broadcast 7122 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract 7123 InstructionCost Cost; 7124 if (isa<StoreInst>(&I) && VF.isScalable() && 7125 isLegalGatherOrScatter(&I, VF)) { 7126 Cost = getGatherScatterCost(&I, VF); 7127 setWideningDecision(&I, VF, CM_GatherScatter, Cost); 7128 } else { 7129 assert((isa<LoadInst>(&I) || !VF.isScalable()) && 7130 "Cannot yet scalarize uniform stores"); 7131 Cost = getUniformMemOpCost(&I, VF); 7132 setWideningDecision(&I, VF, CM_Scalarize, Cost); 7133 } 7134 continue; 7135 } 7136 7137 // We assume that widening is the best solution when possible. 7138 if (memoryInstructionCanBeWidened(&I, VF)) { 7139 InstructionCost Cost = getConsecutiveMemOpCost(&I, VF); 7140 int ConsecutiveStride = Legal->isConsecutivePtr( 7141 getLoadStoreType(&I), getLoadStorePointerOperand(&I)); 7142 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 7143 "Expected consecutive stride."); 7144 InstWidening Decision = 7145 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; 7146 setWideningDecision(&I, VF, Decision, Cost); 7147 continue; 7148 } 7149 7150 // Choose between Interleaving, Gather/Scatter or Scalarization. 7151 InstructionCost InterleaveCost = InstructionCost::getInvalid(); 7152 unsigned NumAccesses = 1; 7153 if (isAccessInterleaved(&I)) { 7154 auto Group = getInterleavedAccessGroup(&I); 7155 assert(Group && "Fail to get an interleaved access group."); 7156 7157 // Make one decision for the whole group. 7158 if (getWideningDecision(&I, VF) != CM_Unknown) 7159 continue; 7160 7161 NumAccesses = Group->getNumMembers(); 7162 if (interleavedAccessCanBeWidened(&I, VF)) 7163 InterleaveCost = getInterleaveGroupCost(&I, VF); 7164 } 7165 7166 InstructionCost GatherScatterCost = 7167 isLegalGatherOrScatter(&I, VF) 7168 ? getGatherScatterCost(&I, VF) * NumAccesses 7169 : InstructionCost::getInvalid(); 7170 7171 InstructionCost ScalarizationCost = 7172 getMemInstScalarizationCost(&I, VF) * NumAccesses; 7173 7174 // Choose better solution for the current VF, 7175 // write down this decision and use it during vectorization. 
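      // (As implemented below: interleaving is chosen when it is no more
      // expensive than a gather/scatter and strictly cheaper than
      // scalarization; otherwise a gather/scatter is chosen if it is cheaper
      // than scalarization; scalarization is the fallback.)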
      InstructionCost Cost;
      InstWidening Decision;
      if (InterleaveCost <= GatherScatterCost &&
          InterleaveCost < ScalarizationCost) {
        Decision = CM_Interleave;
        Cost = InterleaveCost;
      } else if (GatherScatterCost < ScalarizationCost) {
        Decision = CM_GatherScatter;
        Cost = GatherScatterCost;
      } else {
        Decision = CM_Scalarize;
        Cost = ScalarizationCost;
      }
      // If the instruction belongs to an interleave group, the whole group
      // receives the same decision. The whole group receives the cost, but
      // the cost will actually be assigned to one instruction.
      if (auto Group = getInterleavedAccessGroup(&I))
        setWideningDecision(Group, VF, Decision, Cost);
      else
        setWideningDecision(&I, VF, Decision, Cost);
    }
  }

  // Make sure that any load of address and any other address computation
  // remains scalar unless there is gather/scatter support. This avoids
  // inevitable extracts into address registers, and also has the benefit of
  // activating LSR more, since that pass can't optimize vectorized
  // addresses.
  if (TTI.prefersVectorizedAddressing())
    return;

  // Start with all scalar pointer uses.
  SmallPtrSet<Instruction *, 8> AddrDefs;
  for (BasicBlock *BB : TheLoop->blocks())
    for (Instruction &I : *BB) {
      Instruction *PtrDef =
          dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
      if (PtrDef && TheLoop->contains(PtrDef) &&
          getWideningDecision(&I, VF) != CM_GatherScatter)
        AddrDefs.insert(PtrDef);
    }

  // Add all instructions used to generate the addresses.
  SmallVector<Instruction *, 4> Worklist;
  append_range(Worklist, AddrDefs);
  while (!Worklist.empty()) {
    Instruction *I = Worklist.pop_back_val();
    for (auto &Op : I->operands())
      if (auto *InstOp = dyn_cast<Instruction>(Op))
        if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
            AddrDefs.insert(InstOp).second)
          Worklist.push_back(InstOp);
  }

  for (auto *I : AddrDefs) {
    if (isa<LoadInst>(I)) {
      // Setting the desired widening decision should ideally be handled by
      // cost functions, but since this involves the task of finding out
      // if the loaded register is involved in an address computation, it is
      // instead changed here when we know this is the case.
      InstWidening Decision = getWideningDecision(I, VF);
      if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
        // Scalarize a widened load of address.
        setWideningDecision(
            I, VF, CM_Scalarize,
            (VF.getKnownMinValue() *
             getMemoryInstructionCost(I, ElementCount::getFixed(1))));
      else if (auto Group = getInterleavedAccessGroup(I)) {
        // Scalarize an interleave group of address loads.
        for (unsigned I = 0; I < Group->getFactor(); ++I) {
          if (Instruction *Member = Group->getMember(I))
            setWideningDecision(
                Member, VF, CM_Scalarize,
                (VF.getKnownMinValue() *
                 getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
        }
      }
    } else
      // Make sure I gets scalarized and a cost estimate without
      // scalarization overhead.
      ForcedScalars[VF].insert(I);
  }
}

InstructionCost
LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
                                               Type *&VectorTy) {
  Type *RetTy = I->getType();
  if (canTruncateToMinimalBitwidth(I, VF))
    RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
  auto SE = PSE.getSE();
  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;

  auto hasSingleCopyAfterVectorization = [this](Instruction *I,
                                                ElementCount VF) -> bool {
    if (VF.isScalar())
      return true;

    auto Scalarized = InstsToScalarize.find(VF);
    assert(Scalarized != InstsToScalarize.end() &&
           "VF not yet analyzed for scalarization profitability");
    return !Scalarized->second.count(I) &&
           llvm::all_of(I->users(), [&](User *U) {
             auto *UI = cast<Instruction>(U);
             return !Scalarized->second.count(UI);
           });
  };
  (void) hasSingleCopyAfterVectorization;

  if (isScalarAfterVectorization(I, VF)) {
    // With the exception of GEPs and PHIs, after scalarization there should
    // only be one copy of the instruction generated in the loop. This is
    // because the VF is either 1, or any instructions that need scalarizing
    // have already been dealt with by the time we get here. As a result,
    // it means we don't have to multiply the instruction cost by VF.
    assert(I->getOpcode() == Instruction::GetElementPtr ||
           I->getOpcode() == Instruction::PHI ||
           (I->getOpcode() == Instruction::BitCast &&
            I->getType()->isPointerTy()) ||
           hasSingleCopyAfterVectorization(I, VF));
    VectorTy = RetTy;
  } else
    VectorTy = ToVectorTy(RetTy, VF);

  // TODO: We need to estimate the cost of intrinsic calls.
  switch (I->getOpcode()) {
  case Instruction::GetElementPtr:
    // We mark this instruction as zero-cost because the cost of GEPs in
    // vectorized code depends on whether the corresponding memory instruction
    // is scalarized or not. Therefore, we handle GEPs with the memory
    // instruction cost.
    return 0;
  case Instruction::Br: {
    // In cases of scalarized and predicated instructions, there will be VF
    // predicated blocks in the vectorized loop. Each branch around these
    // blocks also requires an extract of its vector compare i1 element.
    bool ScalarPredicatedBB = false;
    BranchInst *BI = cast<BranchInst>(I);
    if (VF.isVector() && BI->isConditional() &&
        (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) ||
         PredicatedBBsAfterVectorization.count(BI->getSuccessor(1))))
      ScalarPredicatedBB = true;

    if (ScalarPredicatedBB) {
      // Not possible to scalarize scalable vectors with predicated
      // instructions.
      if (VF.isScalable())
        return InstructionCost::getInvalid();
      // Return cost for branches around scalarized and predicated blocks.
      auto *Vec_i1Ty =
          VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
      return (
          TTI.getScalarizationOverhead(
              Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()), false, true) +
          (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue()));
    } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
      // The back-edge branch will remain, as will all scalar branches.
      return TTI.getCFInstrCost(Instruction::Br, CostKind);
    else
      // This branch will be eliminated by if-conversion.
7335 return 0; 7336 // Note: We currently assume zero cost for an unconditional branch inside 7337 // a predicated block since it will become a fall-through, although we 7338 // may decide in the future to call TTI for all branches. 7339 } 7340 case Instruction::PHI: { 7341 auto *Phi = cast<PHINode>(I); 7342 7343 // First-order recurrences are replaced by vector shuffles inside the loop. 7344 // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. 7345 if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi)) 7346 return TTI.getShuffleCost( 7347 TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy), 7348 None, VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1)); 7349 7350 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 7351 // converted into select instructions. We require N - 1 selects per phi 7352 // node, where N is the number of incoming values. 7353 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) 7354 return (Phi->getNumIncomingValues() - 1) * 7355 TTI.getCmpSelInstrCost( 7356 Instruction::Select, ToVectorTy(Phi->getType(), VF), 7357 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF), 7358 CmpInst::BAD_ICMP_PREDICATE, CostKind); 7359 7360 return TTI.getCFInstrCost(Instruction::PHI, CostKind); 7361 } 7362 case Instruction::UDiv: 7363 case Instruction::SDiv: 7364 case Instruction::URem: 7365 case Instruction::SRem: 7366 // If we have a predicated instruction, it may not be executed for each 7367 // vector lane. Get the scalarization cost and scale this amount by the 7368 // probability of executing the predicated block. If the instruction is not 7369 // predicated, we fall through to the next case. 7370 if (VF.isVector() && isScalarWithPredication(I, VF)) { 7371 InstructionCost Cost = 0; 7372 7373 // These instructions have a non-void type, so account for the phi nodes 7374 // that we will create. This cost is likely to be zero. The phi node 7375 // cost, if any, should be scaled by the block probability because it 7376 // models a copy at the end of each predicated block. 7377 Cost += VF.getKnownMinValue() * 7378 TTI.getCFInstrCost(Instruction::PHI, CostKind); 7379 7380 // The cost of the non-predicated instruction. 7381 Cost += VF.getKnownMinValue() * 7382 TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind); 7383 7384 // The cost of insertelement and extractelement instructions needed for 7385 // scalarization. 7386 Cost += getScalarizationOverhead(I, VF); 7387 7388 // Scale the cost by the probability of executing the predicated blocks. 7389 // This assumes the predicated block for each vector lane is equally 7390 // likely. 7391 return Cost / getReciprocalPredBlockProb(); 7392 } 7393 LLVM_FALLTHROUGH; 7394 case Instruction::Add: 7395 case Instruction::FAdd: 7396 case Instruction::Sub: 7397 case Instruction::FSub: 7398 case Instruction::Mul: 7399 case Instruction::FMul: 7400 case Instruction::FDiv: 7401 case Instruction::FRem: 7402 case Instruction::Shl: 7403 case Instruction::LShr: 7404 case Instruction::AShr: 7405 case Instruction::And: 7406 case Instruction::Or: 7407 case Instruction::Xor: { 7408 // Since we will replace the stride by 1 the multiplication should go away. 
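    // (Illustrative note, relying on the symbolic-stride versioning that
    // Legal->hasStride refers to: once the loop is versioned for a stride
    // known to be 1, an address computation like "%off = mul i64 %i, %stride"
    // folds to "%i", so the multiply contributes no cost to the vector loop.)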
7409 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) 7410 return 0; 7411 7412 // Detect reduction patterns 7413 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7414 return *RedCost; 7415 7416 // Certain instructions can be cheaper to vectorize if they have a constant 7417 // second vector operand. One example of this are shifts on x86. 7418 Value *Op2 = I->getOperand(1); 7419 TargetTransformInfo::OperandValueProperties Op2VP; 7420 TargetTransformInfo::OperandValueKind Op2VK = 7421 TTI.getOperandInfo(Op2, Op2VP); 7422 if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) 7423 Op2VK = TargetTransformInfo::OK_UniformValue; 7424 7425 SmallVector<const Value *, 4> Operands(I->operand_values()); 7426 return TTI.getArithmeticInstrCost( 7427 I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, 7428 Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); 7429 } 7430 case Instruction::FNeg: { 7431 return TTI.getArithmeticInstrCost( 7432 I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, 7433 TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None, 7434 TargetTransformInfo::OP_None, I->getOperand(0), I); 7435 } 7436 case Instruction::Select: { 7437 SelectInst *SI = cast<SelectInst>(I); 7438 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 7439 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 7440 7441 const Value *Op0, *Op1; 7442 using namespace llvm::PatternMatch; 7443 if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) || 7444 match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) { 7445 // select x, y, false --> x & y 7446 // select x, true, y --> x | y 7447 TTI::OperandValueProperties Op1VP = TTI::OP_None; 7448 TTI::OperandValueProperties Op2VP = TTI::OP_None; 7449 TTI::OperandValueKind Op1VK = TTI::getOperandInfo(Op0, Op1VP); 7450 TTI::OperandValueKind Op2VK = TTI::getOperandInfo(Op1, Op2VP); 7451 assert(Op0->getType()->getScalarSizeInBits() == 1 && 7452 Op1->getType()->getScalarSizeInBits() == 1); 7453 7454 SmallVector<const Value *, 2> Operands{Op0, Op1}; 7455 return TTI.getArithmeticInstrCost( 7456 match(I, m_LogicalOr()) ? 
Instruction::Or : Instruction::And, VectorTy, 7457 CostKind, Op1VK, Op2VK, Op1VP, Op2VP, Operands, I); 7458 } 7459 7460 Type *CondTy = SI->getCondition()->getType(); 7461 if (!ScalarCond) 7462 CondTy = VectorType::get(CondTy, VF); 7463 7464 CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE; 7465 if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition())) 7466 Pred = Cmp->getPredicate(); 7467 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred, 7468 CostKind, I); 7469 } 7470 case Instruction::ICmp: 7471 case Instruction::FCmp: { 7472 Type *ValTy = I->getOperand(0)->getType(); 7473 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 7474 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 7475 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); 7476 VectorTy = ToVectorTy(ValTy, VF); 7477 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, 7478 cast<CmpInst>(I)->getPredicate(), CostKind, 7479 I); 7480 } 7481 case Instruction::Store: 7482 case Instruction::Load: { 7483 ElementCount Width = VF; 7484 if (Width.isVector()) { 7485 InstWidening Decision = getWideningDecision(I, Width); 7486 assert(Decision != CM_Unknown && 7487 "CM decision should be taken at this point"); 7488 if (Decision == CM_Scalarize) 7489 Width = ElementCount::getFixed(1); 7490 } 7491 VectorTy = ToVectorTy(getLoadStoreType(I), Width); 7492 return getMemoryInstructionCost(I, VF); 7493 } 7494 case Instruction::BitCast: 7495 if (I->getType()->isPointerTy()) 7496 return 0; 7497 LLVM_FALLTHROUGH; 7498 case Instruction::ZExt: 7499 case Instruction::SExt: 7500 case Instruction::FPToUI: 7501 case Instruction::FPToSI: 7502 case Instruction::FPExt: 7503 case Instruction::PtrToInt: 7504 case Instruction::IntToPtr: 7505 case Instruction::SIToFP: 7506 case Instruction::UIToFP: 7507 case Instruction::Trunc: 7508 case Instruction::FPTrunc: { 7509 // Computes the CastContextHint from a Load/Store instruction. 7510 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint { 7511 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 7512 "Expected a load or a store!"); 7513 7514 if (VF.isScalar() || !TheLoop->contains(I)) 7515 return TTI::CastContextHint::Normal; 7516 7517 switch (getWideningDecision(I, VF)) { 7518 case LoopVectorizationCostModel::CM_GatherScatter: 7519 return TTI::CastContextHint::GatherScatter; 7520 case LoopVectorizationCostModel::CM_Interleave: 7521 return TTI::CastContextHint::Interleave; 7522 case LoopVectorizationCostModel::CM_Scalarize: 7523 case LoopVectorizationCostModel::CM_Widen: 7524 return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked 7525 : TTI::CastContextHint::Normal; 7526 case LoopVectorizationCostModel::CM_Widen_Reverse: 7527 return TTI::CastContextHint::Reversed; 7528 case LoopVectorizationCostModel::CM_Unknown: 7529 llvm_unreachable("Instr did not go through cost modelling?"); 7530 } 7531 7532 llvm_unreachable("Unhandled case!"); 7533 }; 7534 7535 unsigned Opcode = I->getOpcode(); 7536 TTI::CastContextHint CCH = TTI::CastContextHint::None; 7537 // For Trunc, the context is the only user, which must be a StoreInst. 7538 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) { 7539 if (I->hasOneUse()) 7540 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin())) 7541 CCH = ComputeCCH(Store); 7542 } 7543 // For Z/Sext, the context is the operand, which must be a LoadInst. 
7544 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || 7545 Opcode == Instruction::FPExt) { 7546 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0))) 7547 CCH = ComputeCCH(Load); 7548 } 7549 7550 // We optimize the truncation of induction variables having constant 7551 // integer steps. The cost of these truncations is the same as the scalar 7552 // operation. 7553 if (isOptimizableIVTruncate(I, VF)) { 7554 auto *Trunc = cast<TruncInst>(I); 7555 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 7556 Trunc->getSrcTy(), CCH, CostKind, Trunc); 7557 } 7558 7559 // Detect reduction patterns 7560 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7561 return *RedCost; 7562 7563 Type *SrcScalarTy = I->getOperand(0)->getType(); 7564 Type *SrcVecTy = 7565 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; 7566 if (canTruncateToMinimalBitwidth(I, VF)) { 7567 // This cast is going to be shrunk. This may remove the cast or it might 7568 // turn it into slightly different cast. For example, if MinBW == 16, 7569 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". 7570 // 7571 // Calculate the modified src and dest types. 7572 Type *MinVecTy = VectorTy; 7573 if (Opcode == Instruction::Trunc) { 7574 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); 7575 VectorTy = 7576 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7577 } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) { 7578 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); 7579 VectorTy = 7580 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7581 } 7582 } 7583 7584 return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); 7585 } 7586 case Instruction::Call: { 7587 if (RecurrenceDescriptor::isFMulAddIntrinsic(I)) 7588 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7589 return *RedCost; 7590 bool NeedToScalarize; 7591 CallInst *CI = cast<CallInst>(I); 7592 InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize); 7593 if (getVectorIntrinsicIDForCall(CI, TLI)) { 7594 InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF); 7595 return std::min(CallCost, IntrinsicCost); 7596 } 7597 return CallCost; 7598 } 7599 case Instruction::ExtractValue: 7600 return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput); 7601 case Instruction::Alloca: 7602 // We cannot easily widen alloca to a scalable alloca, as 7603 // the result would need to be a vector of pointers. 7604 if (VF.isScalable()) 7605 return InstructionCost::getInvalid(); 7606 LLVM_FALLTHROUGH; 7607 default: 7608 // This opcode is unknown. Assume that it is the same as 'mul'. 7609 return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 7610 } // end of switch. 
7611 } 7612 7613 char LoopVectorize::ID = 0; 7614 7615 static const char lv_name[] = "Loop Vectorization"; 7616 7617 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 7618 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 7619 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) 7620 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 7621 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) 7622 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 7623 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 7624 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 7625 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 7626 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 7627 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) 7628 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) 7629 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 7630 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 7631 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy) 7632 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 7633 7634 namespace llvm { 7635 7636 Pass *createLoopVectorizePass() { return new LoopVectorize(); } 7637 7638 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced, 7639 bool VectorizeOnlyWhenForced) { 7640 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced); 7641 } 7642 7643 } // end namespace llvm 7644 7645 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { 7646 // Check if the pointer operand of a load or store instruction is 7647 // consecutive. 7648 if (auto *Ptr = getLoadStorePointerOperand(Inst)) 7649 return Legal->isConsecutivePtr(getLoadStoreType(Inst), Ptr); 7650 return false; 7651 } 7652 7653 void LoopVectorizationCostModel::collectValuesToIgnore() { 7654 // Ignore ephemeral values. 7655 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); 7656 7657 // Ignore type-promoting instructions we identified during reduction 7658 // detection. 7659 for (auto &Reduction : Legal->getReductionVars()) { 7660 const RecurrenceDescriptor &RedDes = Reduction.second; 7661 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); 7662 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7663 } 7664 // Ignore type-casting instructions we identified during induction 7665 // detection. 7666 for (auto &Induction : Legal->getInductionVars()) { 7667 const InductionDescriptor &IndDes = Induction.second; 7668 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 7669 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7670 } 7671 } 7672 7673 void LoopVectorizationCostModel::collectInLoopReductions() { 7674 for (auto &Reduction : Legal->getReductionVars()) { 7675 PHINode *Phi = Reduction.first; 7676 const RecurrenceDescriptor &RdxDesc = Reduction.second; 7677 7678 // We don't collect reductions that are type promoted (yet). 7679 if (RdxDesc.getRecurrenceType() != Phi->getType()) 7680 continue; 7681 7682 // If the target would prefer this reduction to happen "in-loop", then we 7683 // want to record it as such. 7684 unsigned Opcode = RdxDesc.getOpcode(); 7685 if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) && 7686 !TTI.preferInLoopReduction(Opcode, Phi->getType(), 7687 TargetTransformInfo::ReductionFlags())) 7688 continue; 7689 7690 // Check that we can correctly put the reductions into the loop, by 7691 // finding the chain of operations that leads from the phi to the loop 7692 // exit value. 
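    // For example (illustrative only), for a simple integer add reduction
    //   %sum = phi i32 [ 0, %preheader ], [ %sum.next, %loop ]
    //   %sum.next = add i32 %sum, %val
    // the chain would be { %sum.next }; an empty chain below means the
    // reduction cannot be performed in-loop.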
    SmallVector<Instruction *, 4> ReductionOperations =
        RdxDesc.getReductionOpChain(Phi, TheLoop);
    bool InLoop = !ReductionOperations.empty();
    if (InLoop) {
      InLoopReductionChains[Phi] = ReductionOperations;
      // Add the elements to InLoopReductionImmediateChains for cost modelling.
      Instruction *LastChain = Phi;
      for (auto *I : ReductionOperations) {
        InLoopReductionImmediateChains[I] = LastChain;
        LastChain = I;
      }
    }
    LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
                      << " reduction for phi: " << *Phi << "\n");
  }
}

// TODO: we could return a pair of values that specify the max VF and
// min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
// `buildVPlans(VF, VF)`. We cannot do it because VPlan at the moment
// doesn't have a cost model that can choose which plan to execute if
// more than one is generated.
static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
                                 LoopVectorizationCostModel &CM) {
  unsigned WidestType;
  std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
  return WidestVectorRegBits / WidestType;
}

VectorizationFactor
LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
  assert(!UserVF.isScalable() && "scalable vectors not yet supported");
  ElementCount VF = UserVF;
  // Outer loop handling: outer loops may require CFG and instruction level
  // transformations before even evaluating whether vectorization is
  // profitable. Since we cannot modify the incoming IR, we need to build
  // VPlan upfront in the vectorization pipeline.
  if (!OrigLoop->isInnermost()) {
    // If the user doesn't provide a vectorization factor, determine a
    // reasonable one.
    if (UserVF.isZero()) {
      VF = ElementCount::getFixed(determineVPlanVF(
          TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
              .getFixedSize(),
          CM));
      LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");

      // Make sure we have a VF > 1 for stress testing.
      if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
        LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
                          << "overriding computed VF.\n");
        VF = ElementCount::getFixed(4);
      }
    }
    assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
    assert(isPowerOf2_32(VF.getKnownMinValue()) &&
           "VF needs to be a power of two");
    LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
                      << "VF " << VF << " to build VPlans.\n");
    buildVPlans(VF, VF);

    // For VPlan build stress testing, we bail out after VPlan construction.
    if (VPlanBuildStressTest)
      return VectorizationFactor::Disabled();

    return {VF, 0 /*Cost*/};
  }

  LLVM_DEBUG(
      dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
                "VPlan-native path.\n");
  return VectorizationFactor::Disabled();
}

Optional<VectorizationFactor>
LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
  assert(OrigLoop->isInnermost() && "Inner loop expected.");
  FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
  if (!MaxFactors) // Cases that should not be vectorized or interleaved.
    return None;

  // Invalidate interleave groups if all blocks of the loop will be predicated.
7775 if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) && 7776 !useMaskedInterleavedAccesses(*TTI)) { 7777 LLVM_DEBUG( 7778 dbgs() 7779 << "LV: Invalidate all interleaved groups due to fold-tail by masking " 7780 "which requires masked-interleaved support.\n"); 7781 if (CM.InterleaveInfo.invalidateGroups()) 7782 // Invalidating interleave groups also requires invalidating all decisions 7783 // based on them, which includes widening decisions and uniform and scalar 7784 // values. 7785 CM.invalidateCostModelingDecisions(); 7786 } 7787 7788 ElementCount MaxUserVF = 7789 UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF; 7790 bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF); 7791 if (!UserVF.isZero() && UserVFIsLegal) { 7792 assert(isPowerOf2_32(UserVF.getKnownMinValue()) && 7793 "VF needs to be a power of two"); 7794 // Collect the instructions (and their associated costs) that will be more 7795 // profitable to scalarize. 7796 if (CM.selectUserVectorizationFactor(UserVF)) { 7797 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); 7798 CM.collectInLoopReductions(); 7799 buildVPlansWithVPRecipes(UserVF, UserVF); 7800 LLVM_DEBUG(printPlans(dbgs())); 7801 return {{UserVF, 0}}; 7802 } else 7803 reportVectorizationInfo("UserVF ignored because of invalid costs.", 7804 "InvalidCost", ORE, OrigLoop); 7805 } 7806 7807 // Populate the set of Vectorization Factor Candidates. 7808 ElementCountSet VFCandidates; 7809 for (auto VF = ElementCount::getFixed(1); 7810 ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2) 7811 VFCandidates.insert(VF); 7812 for (auto VF = ElementCount::getScalable(1); 7813 ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2) 7814 VFCandidates.insert(VF); 7815 7816 for (const auto &VF : VFCandidates) { 7817 // Collect Uniform and Scalar instructions after vectorization with VF. 7818 CM.collectUniformsAndScalars(VF); 7819 7820 // Collect the instructions (and their associated costs) that will be more 7821 // profitable to scalarize. 7822 if (VF.isVector()) 7823 CM.collectInstsToScalarize(VF); 7824 } 7825 7826 CM.collectInLoopReductions(); 7827 buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF); 7828 buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF); 7829 7830 LLVM_DEBUG(printPlans(dbgs())); 7831 if (!MaxFactors.hasVector()) 7832 return VectorizationFactor::Disabled(); 7833 7834 // Select the optimal vectorization factor. 7835 auto SelectedVF = CM.selectVectorizationFactor(VFCandidates); 7836 7837 // Check if it is profitable to vectorize with runtime checks. 
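  // (As implemented below: vectorization is abandoned when the number of
  // runtime pointer checks exceeds the pragma threshold, or exceeds the
  // default threshold while reordering of memory operations is not allowed.)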
7838 unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks(); 7839 if (SelectedVF.Width.getKnownMinValue() > 1 && NumRuntimePointerChecks) { 7840 bool PragmaThresholdReached = 7841 NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold; 7842 bool ThresholdReached = 7843 NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold; 7844 if ((ThresholdReached && !Hints.allowReordering()) || 7845 PragmaThresholdReached) { 7846 ORE->emit([&]() { 7847 return OptimizationRemarkAnalysisAliasing( 7848 DEBUG_TYPE, "CantReorderMemOps", OrigLoop->getStartLoc(), 7849 OrigLoop->getHeader()) 7850 << "loop not vectorized: cannot prove it is safe to reorder " 7851 "memory operations"; 7852 }); 7853 LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n"); 7854 Hints.emitRemarkWithHints(); 7855 return VectorizationFactor::Disabled(); 7856 } 7857 } 7858 return SelectedVF; 7859 } 7860 7861 VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const { 7862 assert(count_if(VPlans, 7863 [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) == 7864 1 && 7865 "Best VF has not a single VPlan."); 7866 7867 for (const VPlanPtr &Plan : VPlans) { 7868 if (Plan->hasVF(VF)) 7869 return *Plan.get(); 7870 } 7871 llvm_unreachable("No plan found!"); 7872 } 7873 7874 static void AddRuntimeUnrollDisableMetaData(Loop *L) { 7875 SmallVector<Metadata *, 4> MDs; 7876 // Reserve first location for self reference to the LoopID metadata node. 7877 MDs.push_back(nullptr); 7878 bool IsUnrollMetadata = false; 7879 MDNode *LoopID = L->getLoopID(); 7880 if (LoopID) { 7881 // First find existing loop unrolling disable metadata. 7882 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { 7883 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); 7884 if (MD) { 7885 const auto *S = dyn_cast<MDString>(MD->getOperand(0)); 7886 IsUnrollMetadata = 7887 S && S->getString().startswith("llvm.loop.unroll.disable"); 7888 } 7889 MDs.push_back(LoopID->getOperand(i)); 7890 } 7891 } 7892 7893 if (!IsUnrollMetadata) { 7894 // Add runtime unroll disable metadata. 7895 LLVMContext &Context = L->getHeader()->getContext(); 7896 SmallVector<Metadata *, 1> DisableOperands; 7897 DisableOperands.push_back( 7898 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 7899 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 7900 MDs.push_back(DisableNode); 7901 MDNode *NewLoopID = MDNode::get(Context, MDs); 7902 // Set operand 0 to refer to the loop id itself. 7903 NewLoopID->replaceOperandWith(0, NewLoopID); 7904 L->setLoopID(NewLoopID); 7905 } 7906 } 7907 7908 void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF, 7909 VPlan &BestVPlan, 7910 InnerLoopVectorizer &ILV, 7911 DominatorTree *DT) { 7912 LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF << ", UF=" << BestUF 7913 << '\n'); 7914 7915 // Perform the actual loop transformation. 7916 7917 // 1. Create a new empty loop. Unlink the old loop and connect the new one. 7918 VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan}; 7919 Value *CanonicalIVStartValue; 7920 std::tie(State.CFG.PrevBB, CanonicalIVStartValue) = 7921 ILV.createVectorizedLoopSkeleton(); 7922 ILV.collectPoisonGeneratingRecipes(State); 7923 7924 ILV.printDebugTracesAtStart(); 7925 7926 //===------------------------------------------------===// 7927 // 7928 // Notice: any optimization or new instruction that go 7929 // into the code below should also be implemented in 7930 // the cost-model. 
  //
  //===------------------------------------------------===//

  // 2. Copy and widen instructions from the old loop into the new loop.
  BestVPlan.prepareToExecute(ILV.getOrCreateTripCount(nullptr),
                             ILV.getOrCreateVectorTripCount(nullptr),
                             CanonicalIVStartValue, State);
  BestVPlan.execute(&State);

  // Keep all loop hints from the original loop on the vector loop (we'll
  // replace the vectorizer-specific hints below).
  MDNode *OrigLoopID = OrigLoop->getLoopID();

  Optional<MDNode *> VectorizedLoopID =
      makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
                                      LLVMLoopVectorizeFollowupVectorized});

  Loop *L = LI->getLoopFor(State.CFG.PrevBB);
  if (VectorizedLoopID.hasValue())
    L->setLoopID(VectorizedLoopID.getValue());
  else {
    // Keep all loop hints from the original loop on the vector loop (we'll
    // replace the vectorizer-specific hints below).
    if (MDNode *LID = OrigLoop->getLoopID())
      L->setLoopID(LID);

    LoopVectorizeHints Hints(L, true, *ORE);
    Hints.setAlreadyVectorized();
  }
  // Disable runtime unrolling when vectorizing the epilogue loop.
  if (CanonicalIVStartValue)
    AddRuntimeUnrollDisableMetaData(L);

  // 3. Fix the vectorized code: take care of header phi's, live-outs,
  // predication, updating analyses.
  ILV.fixVectorizedLoop(State);

  ILV.printDebugTracesAtEnd();
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void LoopVectorizationPlanner::printPlans(raw_ostream &O) {
  for (const auto &Plan : VPlans)
    if (PrintVPlansInDotFormat)
      Plan->printDOT(O);
    else
      Plan->print(O);
}
#endif

void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
    SmallPtrSetImpl<Instruction *> &DeadInstructions) {

  // We create new control-flow for the vectorized loop, so the original exit
  // conditions will be dead after vectorization if they are only used by
  // their terminators.
  SmallVector<BasicBlock*> ExitingBlocks;
  OrigLoop->getExitingBlocks(ExitingBlocks);
  for (auto *BB : ExitingBlocks) {
    auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0));
    if (!Cmp || !Cmp->hasOneUse())
      continue;

    // TODO: we should introduce a getUniqueExitingBlocks on Loop
    if (!DeadInstructions.insert(Cmp).second)
      continue;

    // An operand of the icmp is often a dead trunc, used by IndUpdate.
    // TODO: can recurse through operands in general
    for (Value *Op : Cmp->operands()) {
      if (isa<TruncInst>(Op) && Op->hasOneUse())
        DeadInstructions.insert(cast<Instruction>(Op));
    }
  }

  // We create new "steps" for induction variable updates to which the original
  // induction variables map. An original update instruction will be dead if
  // all its users except the induction variable are dead.
  auto *Latch = OrigLoop->getLoopLatch();
  for (auto &Induction : Legal->getInductionVars()) {
    PHINode *Ind = Induction.first;
    auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));

    // If the tail is to be folded by masking, the primary induction variable,
    // if it exists, isn't dead: it will be used for masking. Don't kill it.
8016 if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction()) 8017 continue; 8018 8019 if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 8020 return U == Ind || DeadInstructions.count(cast<Instruction>(U)); 8021 })) 8022 DeadInstructions.insert(IndUpdate); 8023 } 8024 } 8025 8026 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } 8027 8028 //===--------------------------------------------------------------------===// 8029 // EpilogueVectorizerMainLoop 8030 //===--------------------------------------------------------------------===// 8031 8032 /// This function is partially responsible for generating the control flow 8033 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 8034 std::pair<BasicBlock *, Value *> 8035 EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { 8036 MDNode *OrigLoopID = OrigLoop->getLoopID(); 8037 Loop *Lp = createVectorLoopSkeleton(""); 8038 8039 // Generate the code to check the minimum iteration count of the vector 8040 // epilogue (see below). 8041 EPI.EpilogueIterationCountCheck = 8042 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, true); 8043 EPI.EpilogueIterationCountCheck->setName("iter.check"); 8044 8045 // Generate the code to check any assumptions that we've made for SCEV 8046 // expressions. 8047 EPI.SCEVSafetyCheck = emitSCEVChecks(Lp, LoopScalarPreHeader); 8048 8049 // Generate the code that checks at runtime if arrays overlap. We put the 8050 // checks into a separate block to make the more common case of few elements 8051 // faster. 8052 EPI.MemSafetyCheck = emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 8053 8054 // Generate the iteration count check for the main loop, *after* the check 8055 // for the epilogue loop, so that the path-length is shorter for the case 8056 // that goes directly through the vector epilogue. The longer-path length for 8057 // the main loop is compensated for, by the gain from vectorizing the larger 8058 // trip count. Note: the branch will get updated later on when we vectorize 8059 // the epilogue. 8060 EPI.MainLoopIterationCountCheck = 8061 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false); 8062 8063 // Generate the induction variable. 8064 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 8065 EPI.VectorTripCount = CountRoundDown; 8066 createHeaderBranch(Lp); 8067 8068 // Skip induction resume value creation here because they will be created in 8069 // the second pass. If we created them here, they wouldn't be used anyway, 8070 // because the vplan in the second pass still contains the inductions from the 8071 // original loop. 
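  // (Hence the nullptr returned below: unlike the epilogue pass, this first
  // pass does not hand back a resume-value PHI.)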
8072 8073 return {completeLoopSkeleton(Lp, OrigLoopID), nullptr}; 8074 } 8075 8076 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { 8077 LLVM_DEBUG({ 8078 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n" 8079 << "Main Loop VF:" << EPI.MainLoopVF 8080 << ", Main Loop UF:" << EPI.MainLoopUF 8081 << ", Epilogue Loop VF:" << EPI.EpilogueVF 8082 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 8083 }); 8084 } 8085 8086 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() { 8087 DEBUG_WITH_TYPE(VerboseDebug, { 8088 dbgs() << "intermediate fn:\n" 8089 << *OrigLoop->getHeader()->getParent() << "\n"; 8090 }); 8091 } 8092 8093 BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck( 8094 Loop *L, BasicBlock *Bypass, bool ForEpilogue) { 8095 assert(L && "Expected valid Loop."); 8096 assert(Bypass && "Expected valid bypass basic block."); 8097 ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF; 8098 unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF; 8099 Value *Count = getOrCreateTripCount(L); 8100 // Reuse existing vector loop preheader for TC checks. 8101 // Note that new preheader block is generated for vector loop. 8102 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 8103 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 8104 8105 // Generate code to check if the loop's trip count is less than VF * UF of the 8106 // main vector loop. 8107 auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF : VF) ? 8108 ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 8109 8110 Value *CheckMinIters = Builder.CreateICmp( 8111 P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor), 8112 "min.iters.check"); 8113 8114 if (!ForEpilogue) 8115 TCCheckBlock->setName("vector.main.loop.iter.check"); 8116 8117 // Create new preheader for vector loop. 8118 LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), 8119 DT, LI, nullptr, "vector.ph"); 8120 8121 if (ForEpilogue) { 8122 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 8123 DT->getNode(Bypass)->getIDom()) && 8124 "TC check is expected to dominate Bypass"); 8125 8126 // Update dominator for Bypass & LoopExit. 8127 DT->changeImmediateDominator(Bypass, TCCheckBlock); 8128 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF)) 8129 // For loops with multiple exits, there's no edge from the middle block 8130 // to exit blocks (as the epilogue must run) and thus no need to update 8131 // the immediate dominator of the exit blocks. 8132 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 8133 8134 LoopBypassBlocks.push_back(TCCheckBlock); 8135 8136 // Save the trip count so we don't have to regenerate it in the 8137 // vec.epilog.iter.check. This is safe to do because the trip count 8138 // generated here dominates the vector epilog iter check. 8139 EPI.TripCount = Count; 8140 } 8141 8142 ReplaceInstWithInst( 8143 TCCheckBlock->getTerminator(), 8144 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 8145 8146 return TCCheckBlock; 8147 } 8148 8149 //===--------------------------------------------------------------------===// 8150 // EpilogueVectorizerEpilogueLoop 8151 //===--------------------------------------------------------------------===// 8152 8153 /// This function is partially responsible for generating the control flow 8154 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 
8155 std::pair<BasicBlock *, Value *> 8156 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { 8157 MDNode *OrigLoopID = OrigLoop->getLoopID(); 8158 Loop *Lp = createVectorLoopSkeleton("vec.epilog."); 8159 8160 // Now, compare the remaining count and if there aren't enough iterations to 8161 // execute the vectorized epilogue skip to the scalar part. 8162 BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader; 8163 VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check"); 8164 LoopVectorPreHeader = 8165 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 8166 LI, nullptr, "vec.epilog.ph"); 8167 emitMinimumVectorEpilogueIterCountCheck(Lp, LoopScalarPreHeader, 8168 VecEpilogueIterationCountCheck); 8169 8170 // Adjust the control flow taking the state info from the main loop 8171 // vectorization into account. 8172 assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck && 8173 "expected this to be saved from the previous pass."); 8174 EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith( 8175 VecEpilogueIterationCountCheck, LoopVectorPreHeader); 8176 8177 DT->changeImmediateDominator(LoopVectorPreHeader, 8178 EPI.MainLoopIterationCountCheck); 8179 8180 EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith( 8181 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8182 8183 if (EPI.SCEVSafetyCheck) 8184 EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith( 8185 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8186 if (EPI.MemSafetyCheck) 8187 EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith( 8188 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8189 8190 DT->changeImmediateDominator( 8191 VecEpilogueIterationCountCheck, 8192 VecEpilogueIterationCountCheck->getSinglePredecessor()); 8193 8194 DT->changeImmediateDominator(LoopScalarPreHeader, 8195 EPI.EpilogueIterationCountCheck); 8196 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF)) 8197 // If there is an epilogue which must run, there's no edge from the 8198 // middle block to exit blocks and thus no need to update the immediate 8199 // dominator of the exit blocks. 8200 DT->changeImmediateDominator(LoopExitBlock, 8201 EPI.EpilogueIterationCountCheck); 8202 8203 // Keep track of bypass blocks, as they feed start values to the induction 8204 // phis in the scalar loop preheader. 8205 if (EPI.SCEVSafetyCheck) 8206 LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck); 8207 if (EPI.MemSafetyCheck) 8208 LoopBypassBlocks.push_back(EPI.MemSafetyCheck); 8209 LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck); 8210 8211 // The vec.epilog.iter.check block may contain Phi nodes from reductions which 8212 // merge control-flow from the latch block and the middle block. Update the 8213 // incoming values here and move the Phi into the preheader. 
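  // Illustratively, such a phi merges the reduction result from the main
  // vector loop's middle block with the original start value from the bypass
  // checks, e.g. (names hypothetical):
  //   %merge.rdx = phi i32 [ %rdx.result, %middle.block ], [ %start, %iter.check ]
  // The loop below retargets its incoming block, drops incoming values for
  // check blocks that now branch to the scalar preheader instead, and moves
  // the phi into the new vec.epilog.ph.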
8214 SmallVector<PHINode *, 4> PhisInBlock; 8215 for (PHINode &Phi : VecEpilogueIterationCountCheck->phis()) 8216 PhisInBlock.push_back(&Phi); 8217 8218 for (PHINode *Phi : PhisInBlock) { 8219 Phi->replaceIncomingBlockWith( 8220 VecEpilogueIterationCountCheck->getSinglePredecessor(), 8221 VecEpilogueIterationCountCheck); 8222 Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck); 8223 if (EPI.SCEVSafetyCheck) 8224 Phi->removeIncomingValue(EPI.SCEVSafetyCheck); 8225 if (EPI.MemSafetyCheck) 8226 Phi->removeIncomingValue(EPI.MemSafetyCheck); 8227 Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHI()); 8228 } 8229 8230 // Generate a resume induction for the vector epilogue and put it in the 8231 // vector epilogue preheader 8232 Type *IdxTy = Legal->getWidestInductionType(); 8233 PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val", 8234 LoopVectorPreHeader->getFirstNonPHI()); 8235 EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck); 8236 EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0), 8237 EPI.MainLoopIterationCountCheck); 8238 8239 // Generate the induction variable. 8240 createHeaderBranch(Lp); 8241 8242 // Generate induction resume values. These variables save the new starting 8243 // indexes for the scalar loop. They are used to test if there are any tail 8244 // iterations left once the vector loop has completed. 8245 // Note that when the vectorized epilogue is skipped due to iteration count 8246 // check, then the resume value for the induction variable comes from 8247 // the trip count of the main vector loop, hence passing the AdditionalBypass 8248 // argument. 8249 createInductionResumeValues(Lp, {VecEpilogueIterationCountCheck, 8250 EPI.VectorTripCount} /* AdditionalBypass */); 8251 8252 return {completeLoopSkeleton(Lp, OrigLoopID), EPResumeVal}; 8253 } 8254 8255 BasicBlock * 8256 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck( 8257 Loop *L, BasicBlock *Bypass, BasicBlock *Insert) { 8258 8259 assert(EPI.TripCount && 8260 "Expected trip count to have been saved in the first pass."); 8261 assert( 8262 (!isa<Instruction>(EPI.TripCount) || 8263 DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) && 8264 "saved trip count does not dominate insertion point."); 8265 Value *TC = EPI.TripCount; 8266 IRBuilder<> Builder(Insert->getTerminator()); 8267 Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining"); 8268 8269 // Generate code to check if the loop's trip count is less than VF * UF of the 8270 // vector epilogue loop. 8271 auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF) ?
8272 ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 8273 8274 Value *CheckMinIters = 8275 Builder.CreateICmp(P, Count, 8276 createStepForVF(Builder, Count->getType(), 8277 EPI.EpilogueVF, EPI.EpilogueUF), 8278 "min.epilog.iters.check"); 8279 8280 ReplaceInstWithInst( 8281 Insert->getTerminator(), 8282 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 8283 8284 LoopBypassBlocks.push_back(Insert); 8285 return Insert; 8286 } 8287 8288 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() { 8289 LLVM_DEBUG({ 8290 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n" 8291 << "Epilogue Loop VF:" << EPI.EpilogueVF 8292 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 8293 }); 8294 } 8295 8296 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() { 8297 DEBUG_WITH_TYPE(VerboseDebug, { 8298 dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n"; 8299 }); 8300 } 8301 8302 bool LoopVectorizationPlanner::getDecisionAndClampRange( 8303 const std::function<bool(ElementCount)> &Predicate, VFRange &Range) { 8304 assert(!Range.isEmpty() && "Trying to test an empty VF range."); 8305 bool PredicateAtRangeStart = Predicate(Range.Start); 8306 8307 for (ElementCount TmpVF = Range.Start * 2; 8308 ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2) 8309 if (Predicate(TmpVF) != PredicateAtRangeStart) { 8310 Range.End = TmpVF; 8311 break; 8312 } 8313 8314 return PredicateAtRangeStart; 8315 } 8316 8317 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, 8318 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range 8319 /// of VF's starting at a given VF and extending it as much as possible. Each 8320 /// vectorization decision can potentially shorten this sub-range during 8321 /// buildVPlan(). 8322 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF, 8323 ElementCount MaxVF) { 8324 auto MaxVFPlusOne = MaxVF.getWithIncrement(1); 8325 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { 8326 VFRange SubRange = {VF, MaxVFPlusOne}; 8327 VPlans.push_back(buildVPlan(SubRange)); 8328 VF = SubRange.End; 8329 } 8330 } 8331 8332 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, 8333 VPlanPtr &Plan) { 8334 assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); 8335 8336 // Look for cached value. 8337 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 8338 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); 8339 if (ECEntryIt != EdgeMaskCache.end()) 8340 return ECEntryIt->second; 8341 8342 VPValue *SrcMask = createBlockInMask(Src, Plan); 8343 8344 // The terminator has to be a branch inst! 8345 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 8346 assert(BI && "Unexpected terminator found"); 8347 8348 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) 8349 return EdgeMaskCache[Edge] = SrcMask; 8350 8351 // If source is an exiting block, we know the exit edge is dynamically dead 8352 // in the vector loop, and thus we don't need to restrict the mask. Avoid 8353 // adding uses of an otherwise potentially dead instruction. 
8354 if (OrigLoop->isLoopExiting(Src)) 8355 return EdgeMaskCache[Edge] = SrcMask; 8356 8357 VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition()); 8358 assert(EdgeMask && "No Edge Mask found for condition"); 8359 8360 if (BI->getSuccessor(0) != Dst) 8361 EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc()); 8362 8363 if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND. 8364 // The condition is 'SrcMask && EdgeMask', which is equivalent to 8365 // 'select i1 SrcMask, i1 EdgeMask, i1 false'. 8366 // The select version does not introduce new UB if SrcMask is false and 8367 // EdgeMask is poison. Using 'and' here introduces undefined behavior. 8368 VPValue *False = Plan->getOrAddVPValue( 8369 ConstantInt::getFalse(BI->getCondition()->getType())); 8370 EdgeMask = 8371 Builder.createSelect(SrcMask, EdgeMask, False, BI->getDebugLoc()); 8372 } 8373 8374 return EdgeMaskCache[Edge] = EdgeMask; 8375 } 8376 8377 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { 8378 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 8379 8380 // Look for cached value. 8381 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); 8382 if (BCEntryIt != BlockMaskCache.end()) 8383 return BCEntryIt->second; 8384 8385 // All-one mask is modelled as no-mask following the convention for masked 8386 // load/store/gather/scatter. Initialize BlockMask to no-mask. 8387 VPValue *BlockMask = nullptr; 8388 8389 if (OrigLoop->getHeader() == BB) { 8390 if (!CM.blockNeedsPredicationForAnyReason(BB)) 8391 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. 8392 8393 // Introduce the early-exit compare IV <= BTC to form header block mask. 8394 // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by 8395 // constructing the desired canonical IV in the header block as its first 8396 // non-phi instructions. 8397 assert(CM.foldTailByMasking() && "must fold the tail"); 8398 VPBasicBlock *HeaderVPBB = Plan->getEntry()->getEntryBasicBlock(); 8399 auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi(); 8400 auto *IV = new VPWidenCanonicalIVRecipe(Plan->getCanonicalIV()); 8401 HeaderVPBB->insert(IV, HeaderVPBB->getFirstNonPhi()); 8402 8403 VPBuilder::InsertPointGuard Guard(Builder); 8404 Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint); 8405 if (CM.TTI.emitGetActiveLaneMask()) { 8406 VPValue *TC = Plan->getOrCreateTripCount(); 8407 BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC}); 8408 } else { 8409 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); 8410 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); 8411 } 8412 return BlockMaskCache[BB] = BlockMask; 8413 } 8414 8415 // This is the block mask. We OR all incoming edges. 8416 for (auto *Predecessor : predecessors(BB)) { 8417 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); 8418 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too. 8419 return BlockMaskCache[BB] = EdgeMask; 8420 8421 if (!BlockMask) { // BlockMask has its initialized nullptr value. 
8422 BlockMask = EdgeMask; 8423 continue; 8424 } 8425 8426 BlockMask = Builder.createOr(BlockMask, EdgeMask, {}); 8427 } 8428 8429 return BlockMaskCache[BB] = BlockMask; 8430 } 8431 8432 VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, 8433 ArrayRef<VPValue *> Operands, 8434 VFRange &Range, 8435 VPlanPtr &Plan) { 8436 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 8437 "Must be called with either a load or store"); 8438 8439 auto willWiden = [&](ElementCount VF) -> bool { 8440 if (VF.isScalar()) 8441 return false; 8442 LoopVectorizationCostModel::InstWidening Decision = 8443 CM.getWideningDecision(I, VF); 8444 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 8445 "CM decision should be taken at this point."); 8446 if (Decision == LoopVectorizationCostModel::CM_Interleave) 8447 return true; 8448 if (CM.isScalarAfterVectorization(I, VF) || 8449 CM.isProfitableToScalarize(I, VF)) 8450 return false; 8451 return Decision != LoopVectorizationCostModel::CM_Scalarize; 8452 }; 8453 8454 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8455 return nullptr; 8456 8457 VPValue *Mask = nullptr; 8458 if (Legal->isMaskRequired(I)) 8459 Mask = createBlockInMask(I->getParent(), Plan); 8460 8461 // Determine if the pointer operand of the access is either consecutive or 8462 // reverse consecutive. 8463 LoopVectorizationCostModel::InstWidening Decision = 8464 CM.getWideningDecision(I, Range.Start); 8465 bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse; 8466 bool Consecutive = 8467 Reverse || Decision == LoopVectorizationCostModel::CM_Widen; 8468 8469 if (LoadInst *Load = dyn_cast<LoadInst>(I)) 8470 return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask, 8471 Consecutive, Reverse); 8472 8473 StoreInst *Store = cast<StoreInst>(I); 8474 return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0], 8475 Mask, Consecutive, Reverse); 8476 } 8477 8478 static VPWidenIntOrFpInductionRecipe * 8479 createWidenInductionRecipe(PHINode *Phi, Instruction *PhiOrTrunc, 8480 VPValue *Start, const InductionDescriptor &IndDesc, 8481 LoopVectorizationCostModel &CM, Loop &OrigLoop, 8482 VFRange &Range) { 8483 // Returns true if an instruction \p I should be scalarized instead of 8484 // vectorized for the chosen vectorization factor. 8485 auto ShouldScalarizeInstruction = [&CM](Instruction *I, ElementCount VF) { 8486 return CM.isScalarAfterVectorization(I, VF) || 8487 CM.isProfitableToScalarize(I, VF); 8488 }; 8489 8490 bool NeedsScalarIV = LoopVectorizationPlanner::getDecisionAndClampRange( 8491 [&](ElementCount VF) { 8492 // Returns true if we should generate a scalar version of \p IV. 
8493 if (ShouldScalarizeInstruction(PhiOrTrunc, VF)) 8494 return true; 8495 auto isScalarInst = [&](User *U) -> bool { 8496 auto *I = cast<Instruction>(U); 8497 return OrigLoop.contains(I) && ShouldScalarizeInstruction(I, VF); 8498 }; 8499 return any_of(PhiOrTrunc->users(), isScalarInst); 8500 }, 8501 Range); 8502 bool NeedsScalarIVOnly = LoopVectorizationPlanner::getDecisionAndClampRange( 8503 [&](ElementCount VF) { 8504 return ShouldScalarizeInstruction(PhiOrTrunc, VF); 8505 }, 8506 Range); 8507 assert(IndDesc.getStartValue() == 8508 Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader())); 8509 if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) { 8510 return new VPWidenIntOrFpInductionRecipe(Phi, Start, IndDesc, TruncI, 8511 NeedsScalarIV, !NeedsScalarIVOnly); 8512 } 8513 assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here"); 8514 return new VPWidenIntOrFpInductionRecipe(Phi, Start, IndDesc, NeedsScalarIV, 8515 !NeedsScalarIVOnly); 8516 } 8517 8518 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionPHI( 8519 PHINode *Phi, ArrayRef<VPValue *> Operands, VFRange &Range) const { 8520 8521 // Check if this is an integer or fp induction. If so, build the recipe that 8522 // produces its scalar and vector values. 8523 if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi)) 8524 return createWidenInductionRecipe(Phi, Phi, Operands[0], *II, CM, *OrigLoop, 8525 Range); 8526 8527 return nullptr; 8528 } 8529 8530 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate( 8531 TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range, 8532 VPlan &Plan) const { 8533 // Optimize the special case where the source is a constant integer 8534 // induction variable. Notice that we can only optimize the 'trunc' case 8535 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and 8536 // (c) other casts depend on pointer size. 8537 8538 // Determine whether \p K is a truncation based on an induction variable that 8539 // can be optimized. 8540 auto isOptimizableIVTruncate = 8541 [&](Instruction *K) -> std::function<bool(ElementCount)> { 8542 return [=](ElementCount VF) -> bool { 8543 return CM.isOptimizableIVTruncate(K, VF); 8544 }; 8545 }; 8546 8547 if (LoopVectorizationPlanner::getDecisionAndClampRange( 8548 isOptimizableIVTruncate(I), Range)) { 8549 8550 auto *Phi = cast<PHINode>(I->getOperand(0)); 8551 const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi); 8552 VPValue *Start = Plan.getOrAddVPValue(II.getStartValue()); 8553 return createWidenInductionRecipe(Phi, I, Start, II, CM, *OrigLoop, Range); 8554 } 8555 return nullptr; 8556 } 8557 8558 VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi, 8559 ArrayRef<VPValue *> Operands, 8560 VPlanPtr &Plan) { 8561 // If all incoming values are equal, the incoming VPValue can be used directly 8562 // instead of creating a new VPBlendRecipe. 8563 VPValue *FirstIncoming = Operands[0]; 8564 if (all_of(Operands, [FirstIncoming](const VPValue *Inc) { 8565 return FirstIncoming == Inc; 8566 })) { 8567 return Operands[0]; 8568 } 8569 8570 unsigned NumIncoming = Phi->getNumIncomingValues(); 8571 // For in-loop reductions, we do not need to create an additional select. 
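  // (The reduction itself is computed by an in-loop reduction recipe, so the
  // blend can simply forward the other incoming value; the scan below only
  // identifies which operand that is.)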
8572 VPValue *InLoopVal = nullptr; 8573 for (unsigned In = 0; In < NumIncoming; In++) { 8574 PHINode *PhiOp = 8575 dyn_cast_or_null<PHINode>(Operands[In]->getUnderlyingValue()); 8576 if (PhiOp && CM.isInLoopReduction(PhiOp)) { 8577 assert(!InLoopVal && "Found more than one in-loop reduction!"); 8578 InLoopVal = Operands[In]; 8579 } 8580 } 8581 8582 assert((!InLoopVal || NumIncoming == 2) && 8583 "Found an in-loop reduction for PHI with unexpected number of " 8584 "incoming values"); 8585 if (InLoopVal) 8586 return Operands[Operands[0] == InLoopVal ? 1 : 0]; 8587 8588 // We know that all PHIs in non-header blocks are converted into selects, so 8589 // we don't have to worry about the insertion order and we can just use the 8590 // builder. At this point we generate the predication tree. There may be 8591 // duplications since this is a simple recursive scan, but future 8592 // optimizations will clean it up. 8593 SmallVector<VPValue *, 2> OperandsWithMask; 8594 8595 for (unsigned In = 0; In < NumIncoming; In++) { 8596 VPValue *EdgeMask = 8597 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan); 8598 assert((EdgeMask || NumIncoming == 1) && 8599 "Multiple predecessors with one having a full mask"); 8600 OperandsWithMask.push_back(Operands[In]); 8601 if (EdgeMask) 8602 OperandsWithMask.push_back(EdgeMask); 8603 } 8604 return toVPRecipeResult(new VPBlendRecipe(Phi, OperandsWithMask)); 8605 } 8606 8607 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, 8608 ArrayRef<VPValue *> Operands, 8609 VFRange &Range) const { 8610 8611 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8612 [this, CI](ElementCount VF) { 8613 return CM.isScalarWithPredication(CI, VF); 8614 }, 8615 Range); 8616 8617 if (IsPredicated) 8618 return nullptr; 8619 8620 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8621 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || 8622 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect || 8623 ID == Intrinsic::pseudoprobe || 8624 ID == Intrinsic::experimental_noalias_scope_decl)) 8625 return nullptr; 8626 8627 auto willWiden = [&](ElementCount VF) -> bool { 8628 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8629 // The following case may be scalarized depending on the VF. 8630 // The flag shows whether we use Intrinsic or a usual Call for vectorized 8631 // version of the instruction. 8632 // Is it beneficial to perform intrinsic call compared to lib call? 8633 bool NeedToScalarize = false; 8634 InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); 8635 InstructionCost IntrinsicCost = ID ? CM.getVectorIntrinsicCost(CI, VF) : 0; 8636 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 8637 return UseVectorIntrinsic || !NeedToScalarize; 8638 }; 8639 8640 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8641 return nullptr; 8642 8643 ArrayRef<VPValue *> Ops = Operands.take_front(CI->arg_size()); 8644 return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end())); 8645 } 8646 8647 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { 8648 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) && 8649 !isa<StoreInst>(I) && "Instruction should have been handled earlier"); 8650 // Instruction should be widened, unless it is scalar after vectorization, 8651 // scalarization is profitable or it is predicated. 
8652 auto WillScalarize = [this, I](ElementCount VF) -> bool { 8653 return CM.isScalarAfterVectorization(I, VF) || 8654 CM.isProfitableToScalarize(I, VF) || 8655 CM.isScalarWithPredication(I, VF); 8656 }; 8657 return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize, 8658 Range); 8659 } 8660 8661 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, 8662 ArrayRef<VPValue *> Operands) const { 8663 auto IsVectorizableOpcode = [](unsigned Opcode) { 8664 switch (Opcode) { 8665 case Instruction::Add: 8666 case Instruction::And: 8667 case Instruction::AShr: 8668 case Instruction::BitCast: 8669 case Instruction::FAdd: 8670 case Instruction::FCmp: 8671 case Instruction::FDiv: 8672 case Instruction::FMul: 8673 case Instruction::FNeg: 8674 case Instruction::FPExt: 8675 case Instruction::FPToSI: 8676 case Instruction::FPToUI: 8677 case Instruction::FPTrunc: 8678 case Instruction::FRem: 8679 case Instruction::FSub: 8680 case Instruction::ICmp: 8681 case Instruction::IntToPtr: 8682 case Instruction::LShr: 8683 case Instruction::Mul: 8684 case Instruction::Or: 8685 case Instruction::PtrToInt: 8686 case Instruction::SDiv: 8687 case Instruction::Select: 8688 case Instruction::SExt: 8689 case Instruction::Shl: 8690 case Instruction::SIToFP: 8691 case Instruction::SRem: 8692 case Instruction::Sub: 8693 case Instruction::Trunc: 8694 case Instruction::UDiv: 8695 case Instruction::UIToFP: 8696 case Instruction::URem: 8697 case Instruction::Xor: 8698 case Instruction::ZExt: 8699 return true; 8700 } 8701 return false; 8702 }; 8703 8704 if (!IsVectorizableOpcode(I->getOpcode())) 8705 return nullptr; 8706 8707 // Success: widen this instruction. 8708 return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end())); 8709 } 8710 8711 void VPRecipeBuilder::fixHeaderPhis() { 8712 BasicBlock *OrigLatch = OrigLoop->getLoopLatch(); 8713 for (VPHeaderPHIRecipe *R : PhisToFix) { 8714 auto *PN = cast<PHINode>(R->getUnderlyingValue()); 8715 VPRecipeBase *IncR = 8716 getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch))); 8717 R->addOperand(IncR->getVPSingleValue()); 8718 } 8719 } 8720 8721 VPBasicBlock *VPRecipeBuilder::handleReplication( 8722 Instruction *I, VFRange &Range, VPBasicBlock *VPBB, 8723 VPlanPtr &Plan) { 8724 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( 8725 [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); }, 8726 Range); 8727 8728 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8729 [&](ElementCount VF) { return CM.isPredicatedInst(I, VF, IsUniform); }, 8730 Range); 8731 8732 // Even if the instruction is not marked as uniform, there are certain 8733 // intrinsic calls that can be effectively treated as such, so we check for 8734 // them here. Conservatively, we only do this for scalable vectors, since 8735 // for fixed-width VFs we can always fall back on full scalarization. 8736 if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) { 8737 switch (cast<IntrinsicInst>(I)->getIntrinsicID()) { 8738 case Intrinsic::assume: 8739 case Intrinsic::lifetime_start: 8740 case Intrinsic::lifetime_end: 8741 // For scalable vectors if one of the operands is variant then we still 8742 // want to mark as uniform, which will generate one instruction for just 8743 // the first lane of the vector. We can't scalarize the call in the same 8744 // way as for fixed-width vectors because we don't know how many lanes 8745 // there are. 
8746 // 8747 // The reasons for doing it this way for scalable vectors are: 8748 // 1. For the assume intrinsic, generating the instruction for the first 8749 // lane is still better than not generating any at all. For 8750 // example, the input may be a splat across all lanes. 8751 // 2. For the lifetime start/end intrinsics the pointer operand only 8752 // does anything useful when the input comes from a stack object, 8753 // which suggests it should always be uniform. For non-stack objects 8754 // the effect is to poison the object, which still allows us to 8755 // remove the call. 8756 IsUniform = true; 8757 break; 8758 default: 8759 break; 8760 } 8761 } 8762 8763 auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()), 8764 IsUniform, IsPredicated); 8765 setRecipe(I, Recipe); 8766 Plan->addVPValue(I, Recipe); 8767 8768 // Find if I uses a predicated instruction. If so, it will use its scalar 8769 // value. Avoid hoisting the insert-element which packs the scalar value into 8770 // a vector value, as that happens iff all users use the vector value. 8771 for (VPValue *Op : Recipe->operands()) { 8772 auto *PredR = dyn_cast_or_null<VPPredInstPHIRecipe>(Op->getDef()); 8773 if (!PredR) 8774 continue; 8775 auto *RepR = 8776 cast_or_null<VPReplicateRecipe>(PredR->getOperand(0)->getDef()); 8777 assert(RepR->isPredicated() && 8778 "expected Replicate recipe to be predicated"); 8779 RepR->setAlsoPack(false); 8780 } 8781 8782 // Finalize the recipe for Instr, first if it is not predicated. 8783 if (!IsPredicated) { 8784 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n"); 8785 VPBB->appendRecipe(Recipe); 8786 return VPBB; 8787 } 8788 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); 8789 8790 VPBlockBase *SingleSucc = VPBB->getSingleSuccessor(); 8791 assert(SingleSucc && "VPBB must have a single successor when handling " 8792 "predicated replication."); 8793 VPBlockUtils::disconnectBlocks(VPBB, SingleSucc); 8794 // Record predicated instructions for above packing optimizations. 8795 VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan); 8796 VPBlockUtils::insertBlockAfter(Region, VPBB); 8797 auto *RegSucc = new VPBasicBlock(); 8798 VPBlockUtils::insertBlockAfter(RegSucc, Region); 8799 VPBlockUtils::connectBlocks(RegSucc, SingleSucc); 8800 return RegSucc; 8801 } 8802 8803 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr, 8804 VPRecipeBase *PredRecipe, 8805 VPlanPtr &Plan) { 8806 // Instructions marked for predication are replicated and placed under an 8807 // if-then construct to prevent side-effects. 8808 8809 // Generate recipes to compute the block mask for this region. 8810 VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan); 8811 8812 // Build the triangular if-then region. 8813 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str(); 8814 assert(Instr->getParent() && "Predicated instruction not in any basic block"); 8815 auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask); 8816 auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); 8817 auto *PHIRecipe = Instr->getType()->isVoidTy() 8818 ?
nullptr 8819 : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr)); 8820 if (PHIRecipe) { 8821 Plan->removeVPValueFor(Instr); 8822 Plan->addVPValue(Instr, PHIRecipe); 8823 } 8824 auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); 8825 auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); 8826 VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true); 8827 8828 // Note: first set Entry as region entry and then connect successors starting 8829 // from it in order, to propagate the "parent" of each VPBasicBlock. 8830 VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry); 8831 VPBlockUtils::connectBlocks(Pred, Exit); 8832 8833 return Region; 8834 } 8835 8836 VPRecipeOrVPValueTy 8837 VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, 8838 ArrayRef<VPValue *> Operands, 8839 VFRange &Range, VPlanPtr &Plan) { 8840 // First, check for specific widening recipes that deal with calls, memory 8841 // operations, inductions and Phi nodes. 8842 if (auto *CI = dyn_cast<CallInst>(Instr)) 8843 return toVPRecipeResult(tryToWidenCall(CI, Operands, Range)); 8844 8845 if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr)) 8846 return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan)); 8847 8848 VPRecipeBase *Recipe; 8849 if (auto Phi = dyn_cast<PHINode>(Instr)) { 8850 if (Phi->getParent() != OrigLoop->getHeader()) 8851 return tryToBlend(Phi, Operands, Plan); 8852 if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, Range))) 8853 return toVPRecipeResult(Recipe); 8854 8855 VPHeaderPHIRecipe *PhiRecipe = nullptr; 8856 if (Legal->isReductionVariable(Phi) || Legal->isFirstOrderRecurrence(Phi)) { 8857 VPValue *StartV = Operands[0]; 8858 if (Legal->isReductionVariable(Phi)) { 8859 const RecurrenceDescriptor &RdxDesc = 8860 Legal->getReductionVars().find(Phi)->second; 8861 assert(RdxDesc.getRecurrenceStartValue() == 8862 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); 8863 PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV, 8864 CM.isInLoopReduction(Phi), 8865 CM.useOrderedReductions(RdxDesc)); 8866 } else { 8867 PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV); 8868 } 8869 8870 // Record the incoming value from the backedge, so we can add the incoming 8871 // value from the backedge after all recipes have been created. 8872 recordRecipeOf(cast<Instruction>( 8873 Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch()))); 8874 PhisToFix.push_back(PhiRecipe); 8875 } else { 8876 // TODO: record backedge value for remaining pointer induction phis. 
8877 assert(Phi->getType()->isPointerTy() && 8878 "only pointer phis should be handled here"); 8879 assert(Legal->getInductionVars().count(Phi) && 8880 "Not an induction variable"); 8881 InductionDescriptor II = Legal->getInductionVars().lookup(Phi); 8882 VPValue *Start = Plan->getOrAddVPValue(II.getStartValue()); 8883 PhiRecipe = new VPWidenPHIRecipe(Phi, Start); 8884 } 8885 8886 return toVPRecipeResult(PhiRecipe); 8887 } 8888 8889 if (isa<TruncInst>(Instr) && 8890 (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Operands, 8891 Range, *Plan))) 8892 return toVPRecipeResult(Recipe); 8893 8894 if (!shouldWiden(Instr, Range)) 8895 return nullptr; 8896 8897 if (auto GEP = dyn_cast<GetElementPtrInst>(Instr)) 8898 return toVPRecipeResult(new VPWidenGEPRecipe( 8899 GEP, make_range(Operands.begin(), Operands.end()), OrigLoop)); 8900 8901 if (auto *SI = dyn_cast<SelectInst>(Instr)) { 8902 bool InvariantCond = 8903 PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop); 8904 return toVPRecipeResult(new VPWidenSelectRecipe( 8905 *SI, make_range(Operands.begin(), Operands.end()), InvariantCond)); 8906 } 8907 8908 return toVPRecipeResult(tryToWiden(Instr, Operands)); 8909 } 8910 8911 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, 8912 ElementCount MaxVF) { 8913 assert(OrigLoop->isInnermost() && "Inner loop expected."); 8914 8915 // Collect instructions from the original loop that will become trivially dead 8916 // in the vectorized loop. We don't need to vectorize these instructions. For 8917 // example, original induction update instructions can become dead because we 8918 // separately emit induction "steps" when generating code for the new loop. 8919 // Similarly, we create a new latch condition when setting up the structure 8920 // of the new loop, so the old one can become dead. 8921 SmallPtrSet<Instruction *, 4> DeadInstructions; 8922 collectTriviallyDeadInstructions(DeadInstructions); 8923 8924 // Add assume instructions we need to drop to DeadInstructions, to prevent 8925 // them from being added to the VPlan. 8926 // TODO: We only need to drop assumes in blocks that get flattened. If the 8927 // control flow is preserved, we should keep them. 8928 auto &ConditionalAssumes = Legal->getConditionalAssumes(); 8929 DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end()); 8930 8931 MapVector<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter(); 8932 // Dead instructions do not need sinking. Remove them from SinkAfter. 8933 for (Instruction *I : DeadInstructions) 8934 SinkAfter.erase(I); 8935 8936 // Cannot sink instructions after dead instructions (there won't be any 8937 // recipes for them). Instead, find the first non-dead previous instruction.
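  // Illustratively: if a sink-after target (e.g. an induction update that
  // became trivially dead) has no recipe, the loop below walks backwards from
  // it to the closest live instruction in the same block and uses that as the
  // new sink target.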
8938 for (auto &P : Legal->getSinkAfter()) { 8939 Instruction *SinkTarget = P.second; 8940 Instruction *FirstInst = &*SinkTarget->getParent()->begin(); 8941 (void)FirstInst; 8942 while (DeadInstructions.contains(SinkTarget)) { 8943 assert( 8944 SinkTarget != FirstInst && 8945 "Must find a live instruction (at least the one feeding the " 8946 "first-order recurrence PHI) before reaching beginning of the block"); 8947 SinkTarget = SinkTarget->getPrevNode(); 8948 assert(SinkTarget != P.first && 8949 "sink source equals target, no sinking required"); 8950 } 8951 P.second = SinkTarget; 8952 } 8953 8954 auto MaxVFPlusOne = MaxVF.getWithIncrement(1); 8955 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { 8956 VFRange SubRange = {VF, MaxVFPlusOne}; 8957 VPlans.push_back( 8958 buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter)); 8959 VF = SubRange.End; 8960 } 8961 } 8962 8963 // Add a VPCanonicalIVPHIRecipe starting at 0 to the header, a 8964 // CanonicalIVIncrement{NUW} VPInstruction to increment it by VF * UF and a 8965 // BranchOnCount VPInstruction to the latch. 8966 static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL, 8967 bool HasNUW, bool IsVPlanNative) { 8968 Value *StartIdx = ConstantInt::get(IdxTy, 0); 8969 auto *StartV = Plan.getOrAddVPValue(StartIdx); 8970 8971 auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL); 8972 VPRegionBlock *TopRegion = Plan.getVectorLoopRegion(); 8973 VPBasicBlock *Header = TopRegion->getEntryBasicBlock(); 8974 if (IsVPlanNative) 8975 Header = cast<VPBasicBlock>(Header->getSingleSuccessor()); 8976 Header->insert(CanonicalIVPHI, Header->begin()); 8977 8978 auto *CanonicalIVIncrement = 8979 new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementNUW 8980 : VPInstruction::CanonicalIVIncrement, 8981 {CanonicalIVPHI}, DL); 8982 CanonicalIVPHI->addOperand(CanonicalIVIncrement); 8983 8984 VPBasicBlock *EB = TopRegion->getExitBasicBlock(); 8985 if (IsVPlanNative) { 8986 EB = cast<VPBasicBlock>(EB->getSinglePredecessor()); 8987 EB->setCondBit(nullptr); 8988 } 8989 EB->appendRecipe(CanonicalIVIncrement); 8990 8991 auto *BranchOnCount = 8992 new VPInstruction(VPInstruction::BranchOnCount, 8993 {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL); 8994 EB->appendRecipe(BranchOnCount); 8995 } 8996 8997 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( 8998 VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions, 8999 const MapVector<Instruction *, Instruction *> &SinkAfter) { 9000 9001 SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups; 9002 9003 VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder); 9004 9005 // --------------------------------------------------------------------------- 9006 // Pre-construction: record ingredients whose recipes we'll need to further 9007 // process after constructing the initial VPlan. 9008 // --------------------------------------------------------------------------- 9009 9010 // Mark instructions we'll need to sink later and their targets as 9011 // ingredients whose recipe we'll need to record. 
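  // Recording an ingredient ensures RecipeBuilder keeps an
  // instruction-to-recipe mapping for it, so the recipe can later be looked up
  // via getRecipe() when the sink-after and reduction adjustments are applied.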
9012 for (auto &Entry : SinkAfter) { 9013 RecipeBuilder.recordRecipeOf(Entry.first); 9014 RecipeBuilder.recordRecipeOf(Entry.second); 9015 } 9016 for (auto &Reduction : CM.getInLoopReductionChains()) { 9017 PHINode *Phi = Reduction.first; 9018 RecurKind Kind = 9019 Legal->getReductionVars().find(Phi)->second.getRecurrenceKind(); 9020 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second; 9021 9022 RecipeBuilder.recordRecipeOf(Phi); 9023 for (auto &R : ReductionOperations) { 9024 RecipeBuilder.recordRecipeOf(R); 9025 // For min/max reductions, where we have a pair of icmp/select, we also 9026 // need to record the ICmp recipe, so it can be removed later. 9027 assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) && 9028 "Only min/max recurrences allowed for inloop reductions"); 9029 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) 9030 RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0))); 9031 } 9032 } 9033 9034 // For each interleave group which is relevant for this (possibly trimmed) 9035 // Range, add it to the set of groups to be later applied to the VPlan and add 9036 // placeholders for its members' Recipes which we'll be replacing with a 9037 // single VPInterleaveRecipe. 9038 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) { 9039 auto applyIG = [IG, this](ElementCount VF) -> bool { 9040 return (VF.isVector() && // Query is illegal for VF == 1 9041 CM.getWideningDecision(IG->getInsertPos(), VF) == 9042 LoopVectorizationCostModel::CM_Interleave); 9043 }; 9044 if (!getDecisionAndClampRange(applyIG, Range)) 9045 continue; 9046 InterleaveGroups.insert(IG); 9047 for (unsigned i = 0; i < IG->getFactor(); i++) 9048 if (Instruction *Member = IG->getMember(i)) 9049 RecipeBuilder.recordRecipeOf(Member); 9050 }; 9051 9052 // --------------------------------------------------------------------------- 9053 // Build initial VPlan: Scan the body of the loop in a topological order to 9054 // visit each basic block after having visited its predecessor basic blocks. 9055 // --------------------------------------------------------------------------- 9056 9057 // Create initial VPlan skeleton, with separate header and latch blocks. 9058 VPBasicBlock *HeaderVPBB = new VPBasicBlock(); 9059 VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch"); 9060 VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB); 9061 auto *TopRegion = new VPRegionBlock(HeaderVPBB, LatchVPBB, "vector loop"); 9062 auto Plan = std::make_unique<VPlan>(TopRegion); 9063 9064 Instruction *DLInst = 9065 getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()); 9066 addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), 9067 DLInst ? DLInst->getDebugLoc() : DebugLoc(), 9068 !CM.foldTailByMasking(), false); 9069 9070 // Scan the body of the loop in a topological order to visit each basic block 9071 // after having visited its predecessor basic blocks. 9072 LoopBlocksDFS DFS(OrigLoop); 9073 DFS.perform(LI); 9074 9075 VPBasicBlock *VPBB = HeaderVPBB; 9076 SmallVector<VPWidenIntOrFpInductionRecipe *> InductionsToMove; 9077 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 9078 // Relevant instructions from basic block BB will be grouped into VPRecipe 9079 // ingredients and fill a new VPBasicBlock. 9080 unsigned VPBBsForBB = 0; 9081 VPBB->setName(BB->getName()); 9082 Builder.setInsertPoint(VPBB); 9083 9084 // Introduce each ingredient into VPlan. 9085 // TODO: Model and preserve debug intrinsics in VPlan.
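    // Each remaining instruction is first offered to tryToCreateWidenRecipe();
    // only if no widening recipe applies does handleReplication() fall back to
    // scalar replication, possibly wrapped in a predicated region.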
9086 for (Instruction &I : BB->instructionsWithoutDebug()) { 9087 Instruction *Instr = &I; 9088 9089 // First filter out irrelevant instructions, to ensure no recipes are 9090 // built for them. 9091 if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr)) 9092 continue; 9093 9094 SmallVector<VPValue *, 4> Operands; 9095 auto *Phi = dyn_cast<PHINode>(Instr); 9096 if (Phi && Phi->getParent() == OrigLoop->getHeader()) { 9097 Operands.push_back(Plan->getOrAddVPValue( 9098 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()))); 9099 } else { 9100 auto OpRange = Plan->mapToVPValues(Instr->operands()); 9101 Operands = {OpRange.begin(), OpRange.end()}; 9102 } 9103 if (auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe( 9104 Instr, Operands, Range, Plan)) { 9105 // If Instr can be simplified to an existing VPValue, use it. 9106 if (RecipeOrValue.is<VPValue *>()) { 9107 auto *VPV = RecipeOrValue.get<VPValue *>(); 9108 Plan->addVPValue(Instr, VPV); 9109 // If the re-used value is a recipe, register the recipe for the 9110 // instruction, in case the recipe for Instr needs to be recorded. 9111 if (auto *R = dyn_cast_or_null<VPRecipeBase>(VPV->getDef())) 9112 RecipeBuilder.setRecipe(Instr, R); 9113 continue; 9114 } 9115 // Otherwise, add the new recipe. 9116 VPRecipeBase *Recipe = RecipeOrValue.get<VPRecipeBase *>(); 9117 for (auto *Def : Recipe->definedValues()) { 9118 auto *UV = Def->getUnderlyingValue(); 9119 Plan->addVPValue(UV, Def); 9120 } 9121 9122 if (isa<VPWidenIntOrFpInductionRecipe>(Recipe) && 9123 HeaderVPBB->getFirstNonPhi() != VPBB->end()) { 9124 // Keep track of VPWidenIntOrFpInductionRecipes not in the phi section 9125 // of the header block. That can happen for truncates of induction 9126 // variables. Those recipes are moved to the phi section of the header 9127 // block after applying SinkAfter, which relies on the original 9128 // position of the trunc. 9129 assert(isa<TruncInst>(Instr)); 9130 InductionsToMove.push_back( 9131 cast<VPWidenIntOrFpInductionRecipe>(Recipe)); 9132 } 9133 RecipeBuilder.setRecipe(Instr, Recipe); 9134 VPBB->appendRecipe(Recipe); 9135 continue; 9136 } 9137 9138 // Otherwise, if all widening options failed, Instruction is to be 9139 // replicated. This may create a successor for VPBB. 9140 VPBasicBlock *NextVPBB = 9141 RecipeBuilder.handleReplication(Instr, Range, VPBB, Plan); 9142 if (NextVPBB != VPBB) { 9143 VPBB = NextVPBB; 9144 VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++) 9145 : ""); 9146 } 9147 } 9148 9149 VPBlockUtils::insertBlockAfter(new VPBasicBlock(), VPBB); 9150 VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor()); 9151 } 9152 9153 // Fold the last, empty block into its predecessor. 9154 VPBB = VPBlockUtils::tryToMergeBlockIntoPredecessor(VPBB); 9155 assert(VPBB && "expected to fold last (empty) block"); 9156 // After here, VPBB should not be used. 9157 VPBB = nullptr; 9158 9159 assert(isa<VPRegionBlock>(Plan->getEntry()) && 9160 !Plan->getEntry()->getEntryBasicBlock()->empty() && 9161 "entry block must be set to a VPRegionBlock having a non-empty entry " 9162 "VPBasicBlock"); 9163 RecipeBuilder.fixHeaderPhis(); 9164 9165 // --------------------------------------------------------------------------- 9166 // Transform initial VPlan: Apply previously taken decisions, in order, to 9167 // bring the VPlan to its final state. 9168 // --------------------------------------------------------------------------- 9169 9170 // Apply Sink-After legal constraints. 
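  // Three cases are handled below: (1) the sink recipe is not in a replicate
  // region and is moved directly; (2) in addition, the target is in a
  // replicate region, so the sink is placed in the block following that
  // region; (3) the sink itself is in a replicate region, in which case the
  // whole single-recipe region is unhooked and reconnected after the target
  // (or after the target's region).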
9171 auto GetReplicateRegion = [](VPRecipeBase *R) -> VPRegionBlock * { 9172 auto *Region = dyn_cast_or_null<VPRegionBlock>(R->getParent()->getParent()); 9173 if (Region && Region->isReplicator()) { 9174 assert(Region->getNumSuccessors() == 1 && 9175 Region->getNumPredecessors() == 1 && "Expected SESE region!"); 9176 assert(R->getParent()->size() == 1 && 9177 "A recipe in an original replicator region must be the only " 9178 "recipe in its block"); 9179 return Region; 9180 } 9181 return nullptr; 9182 }; 9183 for (auto &Entry : SinkAfter) { 9184 VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first); 9185 VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second); 9186 9187 auto *TargetRegion = GetReplicateRegion(Target); 9188 auto *SinkRegion = GetReplicateRegion(Sink); 9189 if (!SinkRegion) { 9190 // If the sink source is not a replicate region, sink the recipe directly. 9191 if (TargetRegion) { 9192 // The target is in a replication region, make sure to move Sink to 9193 // the block after it, not into the replication region itself. 9194 VPBasicBlock *NextBlock = 9195 cast<VPBasicBlock>(TargetRegion->getSuccessors().front()); 9196 Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi()); 9197 } else 9198 Sink->moveAfter(Target); 9199 continue; 9200 } 9201 9202 // The sink source is in a replicate region. Unhook the region from the CFG. 9203 auto *SinkPred = SinkRegion->getSinglePredecessor(); 9204 auto *SinkSucc = SinkRegion->getSingleSuccessor(); 9205 VPBlockUtils::disconnectBlocks(SinkPred, SinkRegion); 9206 VPBlockUtils::disconnectBlocks(SinkRegion, SinkSucc); 9207 VPBlockUtils::connectBlocks(SinkPred, SinkSucc); 9208 9209 if (TargetRegion) { 9210 // The target recipe is also in a replicate region, move the sink region 9211 // after the target region. 9212 auto *TargetSucc = TargetRegion->getSingleSuccessor(); 9213 VPBlockUtils::disconnectBlocks(TargetRegion, TargetSucc); 9214 VPBlockUtils::connectBlocks(TargetRegion, SinkRegion); 9215 VPBlockUtils::connectBlocks(SinkRegion, TargetSucc); 9216 } else { 9217 // The sink source is in a replicate region, we need to move the whole 9218 // replicate region, which should only contain a single recipe in the 9219 // main block. 9220 auto *SplitBlock = 9221 Target->getParent()->splitAt(std::next(Target->getIterator())); 9222 9223 auto *SplitPred = SplitBlock->getSinglePredecessor(); 9224 9225 VPBlockUtils::disconnectBlocks(SplitPred, SplitBlock); 9226 VPBlockUtils::connectBlocks(SplitPred, SinkRegion); 9227 VPBlockUtils::connectBlocks(SinkRegion, SplitBlock); 9228 } 9229 } 9230 9231 VPlanTransforms::removeRedundantCanonicalIVs(*Plan); 9232 VPlanTransforms::removeRedundantInductionCasts(*Plan); 9233 9234 // Now that sink-after is done, move induction recipes for optimized truncates 9235 // to the phi section of the header block. 9236 for (VPWidenIntOrFpInductionRecipe *Ind : InductionsToMove) 9237 Ind->moveBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi()); 9238 9239 // Adjust the recipes for any inloop reductions. 9240 adjustRecipesForReductions(cast<VPBasicBlock>(TopRegion->getExit()), Plan, 9241 RecipeBuilder, Range.Start); 9242 9243 // Introduce a recipe to combine the incoming and previous values of a 9244 // first-order recurrence. 
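  // Illustratively, the splice concatenates the last lane of the previous
  // value with the first VF-1 lanes of the current value; for VF=4 the
  // generated code is roughly (a sketch, not the exact lowering):
  //   %splice = shufflevector <4 x T> %prev, <4 x T> %cur, <i32 3, i32 4, i32 5, i32 6>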
9245 for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) { 9246 auto *RecurPhi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R); 9247 if (!RecurPhi) 9248 continue; 9249 9250 VPRecipeBase *PrevRecipe = RecurPhi->getBackedgeRecipe(); 9251 VPBasicBlock *InsertBlock = PrevRecipe->getParent(); 9252 auto *Region = GetReplicateRegion(PrevRecipe); 9253 if (Region) 9254 InsertBlock = cast<VPBasicBlock>(Region->getSingleSuccessor()); 9255 if (Region || PrevRecipe->isPhi()) 9256 Builder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi()); 9257 else 9258 Builder.setInsertPoint(InsertBlock, std::next(PrevRecipe->getIterator())); 9259 9260 auto *RecurSplice = cast<VPInstruction>( 9261 Builder.createNaryOp(VPInstruction::FirstOrderRecurrenceSplice, 9262 {RecurPhi, RecurPhi->getBackedgeValue()})); 9263 9264 RecurPhi->replaceAllUsesWith(RecurSplice); 9265 // Set the first operand of RecurSplice to RecurPhi again, after replacing 9266 // all users. 9267 RecurSplice->setOperand(0, RecurPhi); 9268 } 9269 9270 // Interleave memory: for each Interleave Group we marked earlier as relevant 9271 // for this VPlan, replace the Recipes widening its memory instructions with a 9272 // single VPInterleaveRecipe at its insertion point. 9273 for (auto IG : InterleaveGroups) { 9274 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( 9275 RecipeBuilder.getRecipe(IG->getInsertPos())); 9276 SmallVector<VPValue *, 4> StoredValues; 9277 for (unsigned i = 0; i < IG->getFactor(); ++i) 9278 if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) { 9279 auto *StoreR = 9280 cast<VPWidenMemoryInstructionRecipe>(RecipeBuilder.getRecipe(SI)); 9281 StoredValues.push_back(StoreR->getStoredValue()); 9282 } 9283 9284 auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues, 9285 Recipe->getMask()); 9286 VPIG->insertBefore(Recipe); 9287 unsigned J = 0; 9288 for (unsigned i = 0; i < IG->getFactor(); ++i) 9289 if (Instruction *Member = IG->getMember(i)) { 9290 if (!Member->getType()->isVoidTy()) { 9291 VPValue *OriginalV = Plan->getVPValue(Member); 9292 Plan->removeVPValueFor(Member); 9293 Plan->addVPValue(Member, VPIG->getVPValue(J)); 9294 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J)); 9295 J++; 9296 } 9297 RecipeBuilder.getRecipe(Member)->eraseFromParent(); 9298 } 9299 } 9300 9301 // From this point onwards, VPlan-to-VPlan transformations may change the plan 9302 // in ways that accessing values using original IR values is incorrect. 9303 Plan->disableValue2VPValue(); 9304 9305 VPlanTransforms::optimizeInductions(*Plan, *PSE.getSE()); 9306 VPlanTransforms::sinkScalarOperands(*Plan); 9307 VPlanTransforms::mergeReplicateRegions(*Plan); 9308 VPlanTransforms::removeDeadRecipes(*Plan, *OrigLoop); 9309 9310 std::string PlanName; 9311 raw_string_ostream RSO(PlanName); 9312 ElementCount VF = Range.Start; 9313 Plan->addVF(VF); 9314 RSO << "Initial VPlan for VF={" << VF; 9315 for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) { 9316 Plan->addVF(VF); 9317 RSO << "," << VF; 9318 } 9319 RSO << "},UF>=1"; 9320 RSO.flush(); 9321 Plan->setName(PlanName); 9322 9323 // Fold Exit block into its predecessor if possible. 9324 // TODO: Fold block earlier once all VPlan transforms properly maintain a 9325 // VPBasicBlock as exit. 
9326 VPBlockUtils::tryToMergeBlockIntoPredecessor(TopRegion->getExit()); 9327 9328 assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid"); 9329 return Plan; 9330 } 9331 9332 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { 9333 // Outer loop handling: They may require CFG and instruction level 9334 // transformations before even evaluating whether vectorization is profitable. 9335 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 9336 // the vectorization pipeline. 9337 assert(!OrigLoop->isInnermost()); 9338 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 9339 9340 // Create new empty VPlan 9341 auto Plan = std::make_unique<VPlan>(); 9342 9343 // Build hierarchical CFG 9344 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan); 9345 HCFGBuilder.buildHierarchicalCFG(); 9346 9347 for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End); 9348 VF *= 2) 9349 Plan->addVF(VF); 9350 9351 if (EnableVPlanPredication) { 9352 VPlanPredicator VPP(*Plan); 9353 VPP.predicate(); 9354 9355 // Avoid running transformation to recipes until masked code generation in 9356 // VPlan-native path is in place. 9357 return Plan; 9358 } 9359 9360 SmallPtrSet<Instruction *, 1> DeadInstructions; 9361 VPlanTransforms::VPInstructionsToVPRecipes( 9362 OrigLoop, Plan, 9363 [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); }, 9364 DeadInstructions, *PSE.getSE()); 9365 9366 addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DebugLoc(), 9367 true, true); 9368 return Plan; 9369 } 9370 9371 // Adjust the recipes for reductions. For in-loop reductions the chain of 9372 // instructions leading from the loop exit instr to the phi needs to be converted 9373 // to reductions, with one operand being vector and the other being the scalar 9374 // reduction chain. For other reductions, a select is introduced between the phi 9375 // and live-out recipes when folding the tail. 9376 void LoopVectorizationPlanner::adjustRecipesForReductions( 9377 VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, 9378 ElementCount MinVF) { 9379 for (auto &Reduction : CM.getInLoopReductionChains()) { 9380 PHINode *Phi = Reduction.first; 9381 const RecurrenceDescriptor &RdxDesc = 9382 Legal->getReductionVars().find(Phi)->second; 9383 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second; 9384 9385 if (MinVF.isScalar() && !CM.useOrderedReductions(RdxDesc)) 9386 continue; 9387 9388 // ReductionOperations are ordered top-down from the phi's use to the 9389 // LoopExitValue. We keep track of the previous item (the Chain) to tell 9390 // which of the two operands will remain scalar and which will be reduced. 9391 // For minmax the chain will be the select instructions. 9392 Instruction *Chain = Phi; 9393 for (Instruction *R : ReductionOperations) { 9394 VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R); 9395 RecurKind Kind = RdxDesc.getRecurrenceKind(); 9396 9397 VPValue *ChainOp = Plan->getVPValue(Chain); 9398 unsigned FirstOpId; 9399 assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) && 9400 "Only min/max recurrences allowed for inloop reductions"); 9401 // Recognize a call to the llvm.fmuladd intrinsic.
9402 bool IsFMulAdd = (Kind == RecurKind::FMulAdd); 9403 assert((!IsFMulAdd || RecurrenceDescriptor::isFMulAddIntrinsic(R)) && 9404 "Expected instruction to be a call to the llvm.fmuladd intrinsic"); 9405 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9406 assert(isa<VPWidenSelectRecipe>(WidenRecipe) && 9407 "Expected to replace a VPWidenSelectSC"); 9408 FirstOpId = 1; 9409 } else { 9410 assert((MinVF.isScalar() || isa<VPWidenRecipe>(WidenRecipe) || 9411 (IsFMulAdd && isa<VPWidenCallRecipe>(WidenRecipe))) && 9412 "Expected to replace a VPWidenSC"); 9413 FirstOpId = 0; 9414 } 9415 unsigned VecOpId = 9416 R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId; 9417 VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId)); 9418 9419 auto *CondOp = CM.blockNeedsPredicationForAnyReason(R->getParent()) 9420 ? RecipeBuilder.createBlockInMask(R->getParent(), Plan) 9421 : nullptr; 9422 9423 if (IsFMulAdd) { 9424 // If the instruction is a call to the llvm.fmuladd intrinsic then we 9425 // need to create an fmul recipe to use as the vector operand for the 9426 // fadd reduction. 9427 VPInstruction *FMulRecipe = new VPInstruction( 9428 Instruction::FMul, {VecOp, Plan->getVPValue(R->getOperand(1))}); 9429 FMulRecipe->setFastMathFlags(R->getFastMathFlags()); 9430 WidenRecipe->getParent()->insert(FMulRecipe, 9431 WidenRecipe->getIterator()); 9432 VecOp = FMulRecipe; 9433 } 9434 VPReductionRecipe *RedRecipe = 9435 new VPReductionRecipe(&RdxDesc, R, ChainOp, VecOp, CondOp, TTI); 9436 WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe); 9437 Plan->removeVPValueFor(R); 9438 Plan->addVPValue(R, RedRecipe); 9439 WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator()); 9440 WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe); 9441 WidenRecipe->eraseFromParent(); 9442 9443 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9444 VPRecipeBase *CompareRecipe = 9445 RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0))); 9446 assert(isa<VPWidenRecipe>(CompareRecipe) && 9447 "Expected to replace a VPWidenSC"); 9448 assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 && 9449 "Expected no remaining users"); 9450 CompareRecipe->eraseFromParent(); 9451 } 9452 Chain = R; 9453 } 9454 } 9455 9456 // If tail is folded by masking, introduce selects between the phi 9457 // and the live-out instruction of each reduction, at the beginning of the 9458 // dedicated latch block. 
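  // The select keeps the newly computed reduction value for lanes where the
  // header mask is active and reuses the phi (the previous value) for the
  // masked-out tail lanes, roughly:
  //   %rdx.select = select <VF x i1> %header.mask, <VF x T> %rdx.next, <VF x T> %rdx.phi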
9459 if (CM.foldTailByMasking()) { 9460 Builder.setInsertPoint(LatchVPBB, LatchVPBB->begin()); 9461 for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) { 9462 VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R); 9463 if (!PhiR || PhiR->isInLoop()) 9464 continue; 9465 VPValue *Cond = 9466 RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan); 9467 VPValue *Red = PhiR->getBackedgeValue(); 9468 assert(cast<VPRecipeBase>(Red->getDef())->getParent() != LatchVPBB && 9469 "reduction recipe must be defined before latch"); 9470 Builder.createNaryOp(Instruction::Select, {Cond, Red, PhiR}); 9471 } 9472 } 9473 } 9474 9475 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 9476 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, 9477 VPSlotTracker &SlotTracker) const { 9478 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; 9479 IG->getInsertPos()->printAsOperand(O, false); 9480 O << ", "; 9481 getAddr()->printAsOperand(O, SlotTracker); 9482 VPValue *Mask = getMask(); 9483 if (Mask) { 9484 O << ", "; 9485 Mask->printAsOperand(O, SlotTracker); 9486 } 9487 9488 unsigned OpIdx = 0; 9489 for (unsigned i = 0; i < IG->getFactor(); ++i) { 9490 if (!IG->getMember(i)) 9491 continue; 9492 if (getNumStoreOperands() > 0) { 9493 O << "\n" << Indent << " store "; 9494 getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker); 9495 O << " to index " << i; 9496 } else { 9497 O << "\n" << Indent << " "; 9498 getVPValue(OpIdx)->printAsOperand(O, SlotTracker); 9499 O << " = load from index " << i; 9500 } 9501 ++OpIdx; 9502 } 9503 } 9504 #endif 9505 9506 void VPWidenCallRecipe::execute(VPTransformState &State) { 9507 State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this, 9508 *this, State); 9509 } 9510 9511 void VPWidenSelectRecipe::execute(VPTransformState &State) { 9512 auto &I = *cast<SelectInst>(getUnderlyingInstr()); 9513 State.ILV->setDebugLocFromInst(&I); 9514 9515 // The condition can be loop invariant but still defined inside the 9516 // loop. This means that we can't just use the original 'cond' value. 9517 // We have to take the 'vectorized' value and pick the first lane. 9518 // Instcombine will make this a no-op. 9519 auto *InvarCond = 9520 InvariantCond ? State.get(getOperand(0), VPIteration(0, 0)) : nullptr; 9521 9522 for (unsigned Part = 0; Part < State.UF; ++Part) { 9523 Value *Cond = InvarCond ? 
InvarCond : State.get(getOperand(0), Part);
9524 Value *Op0 = State.get(getOperand(1), Part);
9525 Value *Op1 = State.get(getOperand(2), Part);
9526 Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1);
9527 State.set(this, Sel, Part);
9528 State.ILV->addMetadata(Sel, &I);
9529 }
9530 }
9531
9532 void VPWidenRecipe::execute(VPTransformState &State) {
9533 auto &I = *cast<Instruction>(getUnderlyingValue());
9534 auto &Builder = State.Builder;
9535 switch (I.getOpcode()) {
9536 case Instruction::Call:
9537 case Instruction::Br:
9538 case Instruction::PHI:
9539 case Instruction::GetElementPtr:
9540 case Instruction::Select:
9541 llvm_unreachable("This instruction is handled by a different recipe.");
9542 case Instruction::UDiv:
9543 case Instruction::SDiv:
9544 case Instruction::SRem:
9545 case Instruction::URem:
9546 case Instruction::Add:
9547 case Instruction::FAdd:
9548 case Instruction::Sub:
9549 case Instruction::FSub:
9550 case Instruction::FNeg:
9551 case Instruction::Mul:
9552 case Instruction::FMul:
9553 case Instruction::FDiv:
9554 case Instruction::FRem:
9555 case Instruction::Shl:
9556 case Instruction::LShr:
9557 case Instruction::AShr:
9558 case Instruction::And:
9559 case Instruction::Or:
9560 case Instruction::Xor: {
9561 // Just widen unops and binops.
9562 State.ILV->setDebugLocFromInst(&I);
9563
9564 for (unsigned Part = 0; Part < State.UF; ++Part) {
9565 SmallVector<Value *, 2> Ops;
9566 for (VPValue *VPOp : operands())
9567 Ops.push_back(State.get(VPOp, Part));
9568
9569 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
9570
9571 if (auto *VecOp = dyn_cast<Instruction>(V)) {
9572 VecOp->copyIRFlags(&I);
9573
9574 // If the instruction is vectorized and was in a basic block that needed
9575 // predication, we can't propagate poison-generating flags (nuw/nsw,
9576 // exact, etc.). The control flow has been linearized and the
9577 // instruction is no longer guarded by the predicate, which could cause
9578 // the flag properties to no longer hold.
9579 if (State.MayGeneratePoisonRecipes.contains(this))
9580 VecOp->dropPoisonGeneratingFlags();
9581 }
9582
9583 // Use this vector value for all users of the original instruction.
9584 State.set(this, V, Part);
9585 State.ILV->addMetadata(V, &I);
9586 }
9587
9588 break;
9589 }
9590 case Instruction::ICmp:
9591 case Instruction::FCmp: {
9592 // Widen compares. Generate vector compares.
9593 bool FCmp = (I.getOpcode() == Instruction::FCmp);
9594 auto *Cmp = cast<CmpInst>(&I);
9595 State.ILV->setDebugLocFromInst(Cmp);
9596 for (unsigned Part = 0; Part < State.UF; ++Part) {
9597 Value *A = State.get(getOperand(0), Part);
9598 Value *B = State.get(getOperand(1), Part);
9599 Value *C = nullptr;
9600 if (FCmp) {
9601 // Propagate fast math flags.
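        // (Illustrative note: FastMathFlagGuard is an RAII helper; it saves the
        // builder's current fast-math flags and restores them when it goes out
        // of scope, so the flags copied from the original fcmp only affect the
        // CreateFCmp call below.)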
9602 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 9603 Builder.setFastMathFlags(Cmp->getFastMathFlags()); 9604 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); 9605 } else { 9606 C = Builder.CreateICmp(Cmp->getPredicate(), A, B); 9607 } 9608 State.set(this, C, Part); 9609 State.ILV->addMetadata(C, &I); 9610 } 9611 9612 break; 9613 } 9614 9615 case Instruction::ZExt: 9616 case Instruction::SExt: 9617 case Instruction::FPToUI: 9618 case Instruction::FPToSI: 9619 case Instruction::FPExt: 9620 case Instruction::PtrToInt: 9621 case Instruction::IntToPtr: 9622 case Instruction::SIToFP: 9623 case Instruction::UIToFP: 9624 case Instruction::Trunc: 9625 case Instruction::FPTrunc: 9626 case Instruction::BitCast: { 9627 auto *CI = cast<CastInst>(&I); 9628 State.ILV->setDebugLocFromInst(CI); 9629 9630 /// Vectorize casts. 9631 Type *DestTy = (State.VF.isScalar()) 9632 ? CI->getType() 9633 : VectorType::get(CI->getType(), State.VF); 9634 9635 for (unsigned Part = 0; Part < State.UF; ++Part) { 9636 Value *A = State.get(getOperand(0), Part); 9637 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); 9638 State.set(this, Cast, Part); 9639 State.ILV->addMetadata(Cast, &I); 9640 } 9641 break; 9642 } 9643 default: 9644 // This instruction is not vectorized by simple widening. 9645 LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); 9646 llvm_unreachable("Unhandled instruction!"); 9647 } // end of switch. 9648 } 9649 9650 void VPWidenGEPRecipe::execute(VPTransformState &State) { 9651 auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr()); 9652 // Construct a vector GEP by widening the operands of the scalar GEP as 9653 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP 9654 // results in a vector of pointers when at least one operand of the GEP 9655 // is vector-typed. Thus, to keep the representation compact, we only use 9656 // vector-typed operands for loop-varying values. 9657 9658 if (State.VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { 9659 // If we are vectorizing, but the GEP has only loop-invariant operands, 9660 // the GEP we build (by only using vector-typed operands for 9661 // loop-varying values) would be a scalar pointer. Thus, to ensure we 9662 // produce a vector of pointers, we need to either arbitrarily pick an 9663 // operand to broadcast, or broadcast a clone of the original GEP. 9664 // Here, we broadcast a clone of the original. 9665 // 9666 // TODO: If at some point we decide to scalarize instructions having 9667 // loop-invariant operands, this special case will no longer be 9668 // required. We would add the scalarization decision to 9669 // collectLoopScalars() and teach getVectorValue() to broadcast 9670 // the lane-zero scalar value. 9671 auto *Clone = State.Builder.Insert(GEP->clone()); 9672 for (unsigned Part = 0; Part < State.UF; ++Part) { 9673 Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, Clone); 9674 State.set(this, EntryPart, Part); 9675 State.ILV->addMetadata(EntryPart, GEP); 9676 } 9677 } else { 9678 // If the GEP has at least one loop-varying operand, we are sure to 9679 // produce a vector of pointers. But if we are only unrolling, we want 9680 // to produce a scalar GEP for each unroll part. Thus, the GEP we 9681 // produce with the code below will be scalar (if VF == 1) or vector 9682 // (otherwise). Note that for the unroll-only case, we still maintain 9683 // values in the vector mapping with initVector, as we do for other 9684 // instructions. 
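    // Illustrative example (not taken from real output, names invented): for a
    // scalar GEP
    //   %g = getelementptr inbounds i32, i32* %base, i64 %iv
    // with a loop-invariant %base and loop-varying %iv, and VF = 4, the widened
    // form keeps the scalar base and uses a vector index, yielding a vector of
    // pointers:
    //   %wide.g = getelementptr inbounds i32, i32* %base, <4 x i64> %wide.iv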
9685 for (unsigned Part = 0; Part < State.UF; ++Part) {
9686 // The pointer operand of the new GEP. If it's loop-invariant, we
9687 // won't broadcast it.
9688 auto *Ptr = IsPtrLoopInvariant
9689 ? State.get(getOperand(0), VPIteration(0, 0))
9690 : State.get(getOperand(0), Part);
9691
9692 // Collect all the indices for the new GEP. If any index is
9693 // loop-invariant, we won't broadcast it.
9694 SmallVector<Value *, 4> Indices;
9695 for (unsigned I = 1, E = getNumOperands(); I < E; I++) {
9696 VPValue *Operand = getOperand(I);
9697 if (IsIndexLoopInvariant[I - 1])
9698 Indices.push_back(State.get(Operand, VPIteration(0, 0)));
9699 else
9700 Indices.push_back(State.get(Operand, Part));
9701 }
9702
9703 // If the GEP instruction is vectorized and was in a basic block that
9704 // needed predication, we can't propagate the poison-generating 'inbounds'
9705 // flag. The control flow has been linearized and the GEP is no longer
9706 // guarded by the predicate, which could cause the 'inbounds' property to
9707 // no longer hold.
9708 bool IsInBounds =
9709 GEP->isInBounds() && State.MayGeneratePoisonRecipes.count(this) == 0;
9710
9711 // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
9712 // but it should be a vector, otherwise.
9713 auto *NewGEP = IsInBounds
9714 ? State.Builder.CreateInBoundsGEP(
9715 GEP->getSourceElementType(), Ptr, Indices)
9716 : State.Builder.CreateGEP(GEP->getSourceElementType(),
9717 Ptr, Indices);
9718 assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
9719 "NewGEP is not a pointer vector");
9720 State.set(this, NewGEP, Part);
9721 State.ILV->addMetadata(NewGEP, GEP);
9722 }
9723 }
9724 }
9725
9726 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
9727 assert(!State.Instance && "Int or FP induction being replicated.");
9728 auto *CanonicalIV = State.get(getParent()->getPlan()->getCanonicalIV(), 0);
9729 State.ILV->widenIntOrFpInduction(IV, this, State, CanonicalIV);
9730 }
9731
9732 void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
9733 assert(!State.Instance && "VPScalarIVStepsRecipe being replicated.");
9734
9735 // Fast-math-flags propagate from the original induction instruction.
9736 IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
9737 if (IndDesc.getInductionBinOp() &&
9738 isa<FPMathOperator>(IndDesc.getInductionBinOp()))
9739 State.Builder.setFastMathFlags(
9740 IndDesc.getInductionBinOp()->getFastMathFlags());
9741
9742 Value *Step = State.get(getStepValue(), VPIteration(0, 0));
9743 auto *Trunc = dyn_cast<TruncInst>(getUnderlyingValue());
9744 auto CreateScalarIV = [&](Value *&Step) -> Value * {
9745 Value *ScalarIV = State.get(getCanonicalIV(), VPIteration(0, 0));
9746 auto *CanonicalIV = State.get(getParent()->getPlan()->getCanonicalIV(), 0);
9747 if (!isCanonical() || CanonicalIV->getType() != IV->getType()) {
9748 ScalarIV = IV->getType()->isIntegerTy()
9749 ?
State.Builder.CreateSExtOrTrunc(ScalarIV, IV->getType()) 9750 : State.Builder.CreateCast(Instruction::SIToFP, ScalarIV, 9751 IV->getType()); 9752 ScalarIV = emitTransformedIndex(State.Builder, ScalarIV, 9753 getStartValue()->getLiveInIRValue(), Step, 9754 IndDesc); 9755 ScalarIV->setName("offset.idx"); 9756 } 9757 if (Trunc) { 9758 auto *TruncType = cast<IntegerType>(Trunc->getType()); 9759 assert(Step->getType()->isIntegerTy() && 9760 "Truncation requires an integer step"); 9761 ScalarIV = State.Builder.CreateTrunc(ScalarIV, TruncType); 9762 Step = State.Builder.CreateTrunc(Step, TruncType); 9763 } 9764 return ScalarIV; 9765 }; 9766 9767 Value *ScalarIV = CreateScalarIV(Step); 9768 if (State.VF.isVector()) { 9769 buildScalarSteps(ScalarIV, Step, IV, IndDesc, this, State); 9770 return; 9771 } 9772 9773 for (unsigned Part = 0; Part < State.UF; ++Part) { 9774 assert(!State.VF.isScalable() && "scalable vectors not yet supported."); 9775 Value *EntryPart; 9776 if (Step->getType()->isFloatingPointTy()) { 9777 Value *StartIdx = 9778 getRuntimeVFAsFloat(State.Builder, Step->getType(), State.VF * Part); 9779 // Floating-point operations inherit FMF via the builder's flags. 9780 Value *MulOp = State.Builder.CreateFMul(StartIdx, Step); 9781 EntryPart = State.Builder.CreateBinOp(IndDesc.getInductionOpcode(), 9782 ScalarIV, MulOp); 9783 } else { 9784 Value *StartIdx = 9785 getRuntimeVF(State.Builder, Step->getType(), State.VF * Part); 9786 EntryPart = State.Builder.CreateAdd( 9787 ScalarIV, State.Builder.CreateMul(StartIdx, Step), "induction"); 9788 } 9789 State.set(this, EntryPart, Part); 9790 if (Trunc) 9791 State.ILV->addMetadata(EntryPart, Trunc); 9792 } 9793 } 9794 9795 void VPWidenPHIRecipe::execute(VPTransformState &State) { 9796 State.ILV->widenPHIInstruction(cast<PHINode>(getUnderlyingValue()), this, 9797 State); 9798 } 9799 9800 void VPBlendRecipe::execute(VPTransformState &State) { 9801 State.ILV->setDebugLocFromInst(Phi, &State.Builder); 9802 // We know that all PHIs in non-header blocks are converted into 9803 // selects, so we don't have to worry about the insertion order and we 9804 // can just use the builder. 9805 // At this point we generate the predication tree. There may be 9806 // duplications since this is a simple recursive scan, but future 9807 // optimizations will clean it up. 9808 9809 unsigned NumIncoming = getNumIncomingValues(); 9810 9811 // Generate a sequence of selects of the form: 9812 // SELECT(Mask3, In3, 9813 // SELECT(Mask2, In2, 9814 // SELECT(Mask1, In1, 9815 // In0))) 9816 // Note that Mask0 is never used: lanes for which no path reaches this phi and 9817 // are essentially undef are taken from In0. 9818 InnerLoopVectorizer::VectorParts Entry(State.UF); 9819 for (unsigned In = 0; In < NumIncoming; ++In) { 9820 for (unsigned Part = 0; Part < State.UF; ++Part) { 9821 // We might have single edge PHIs (blocks) - use an identity 9822 // 'select' for the first PHI operand. 9823 Value *In0 = State.get(getIncomingValue(In), Part); 9824 if (In == 0) 9825 Entry[Part] = In0; // Initialize with the first incoming value. 9826 else { 9827 // Select between the current value and the previous incoming edge 9828 // based on the incoming mask. 
9829 Value *Cond = State.get(getMask(In), Part); 9830 Entry[Part] = 9831 State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi"); 9832 } 9833 } 9834 } 9835 for (unsigned Part = 0; Part < State.UF; ++Part) 9836 State.set(this, Entry[Part], Part); 9837 } 9838 9839 void VPInterleaveRecipe::execute(VPTransformState &State) { 9840 assert(!State.Instance && "Interleave group being replicated."); 9841 State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(), 9842 getStoredValues(), getMask()); 9843 } 9844 9845 void VPReductionRecipe::execute(VPTransformState &State) { 9846 assert(!State.Instance && "Reduction being replicated."); 9847 Value *PrevInChain = State.get(getChainOp(), 0); 9848 RecurKind Kind = RdxDesc->getRecurrenceKind(); 9849 bool IsOrdered = State.ILV->useOrderedReductions(*RdxDesc); 9850 // Propagate the fast-math flags carried by the underlying instruction. 9851 IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder); 9852 State.Builder.setFastMathFlags(RdxDesc->getFastMathFlags()); 9853 for (unsigned Part = 0; Part < State.UF; ++Part) { 9854 Value *NewVecOp = State.get(getVecOp(), Part); 9855 if (VPValue *Cond = getCondOp()) { 9856 Value *NewCond = State.get(Cond, Part); 9857 VectorType *VecTy = cast<VectorType>(NewVecOp->getType()); 9858 Value *Iden = RdxDesc->getRecurrenceIdentity( 9859 Kind, VecTy->getElementType(), RdxDesc->getFastMathFlags()); 9860 Value *IdenVec = 9861 State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden); 9862 Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec); 9863 NewVecOp = Select; 9864 } 9865 Value *NewRed; 9866 Value *NextInChain; 9867 if (IsOrdered) { 9868 if (State.VF.isVector()) 9869 NewRed = createOrderedReduction(State.Builder, *RdxDesc, NewVecOp, 9870 PrevInChain); 9871 else 9872 NewRed = State.Builder.CreateBinOp( 9873 (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), PrevInChain, 9874 NewVecOp); 9875 PrevInChain = NewRed; 9876 } else { 9877 PrevInChain = State.get(getChainOp(), Part); 9878 NewRed = createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp); 9879 } 9880 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9881 NextInChain = 9882 createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(), 9883 NewRed, PrevInChain); 9884 } else if (IsOrdered) 9885 NextInChain = NewRed; 9886 else 9887 NextInChain = State.Builder.CreateBinOp( 9888 (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), NewRed, 9889 PrevInChain); 9890 State.set(this, NextInChain, Part); 9891 } 9892 } 9893 9894 void VPReplicateRecipe::execute(VPTransformState &State) { 9895 if (State.Instance) { // Generate a single instance. 9896 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector"); 9897 State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *State.Instance, 9898 IsPredicated, State); 9899 // Insert scalar instance packing it into a vector. 9900 if (AlsoPack && State.VF.isVector()) { 9901 // If we're constructing lane 0, initialize to start from poison. 
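      // (Illustrative sketch, with invented names: packing lane L of part P
      // amounts to
      //   %vec.P = insertelement <VF x T> %vec.P, T %scalar.P.L, i32 L
      // where %vec.P starts out as a poison vector when lane 0 is built.)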
9902 if (State.Instance->Lane.isFirstLane()) {
9903 assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
9904 Value *Poison = PoisonValue::get(
9905 VectorType::get(getUnderlyingValue()->getType(), State.VF));
9906 State.set(this, Poison, State.Instance->Part);
9907 }
9908 State.ILV->packScalarIntoVectorValue(this, *State.Instance, State);
9909 }
9910 return;
9911 }
9912
9913 // Generate scalar instances for all VF lanes of all UF parts, unless the
9914 // instruction is uniform, in which case generate only the first lane for each
9915 // of the UF parts.
9916 unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();
9917 assert((!State.VF.isScalable() || IsUniform) &&
9918 "Can't scalarize a scalable vector");
9919 for (unsigned Part = 0; Part < State.UF; ++Part)
9920 for (unsigned Lane = 0; Lane < EndLane; ++Lane)
9921 State.ILV->scalarizeInstruction(getUnderlyingInstr(), this,
9922 VPIteration(Part, Lane), IsPredicated,
9923 State);
9924 }
9925
9926 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
9927 assert(State.Instance && "Branch on Mask works only on single instance.");
9928
9929 unsigned Part = State.Instance->Part;
9930 unsigned Lane = State.Instance->Lane.getKnownLane();
9931
9932 Value *ConditionBit = nullptr;
9933 VPValue *BlockInMask = getMask();
9934 if (BlockInMask) {
9935 ConditionBit = State.get(BlockInMask, Part);
9936 if (ConditionBit->getType()->isVectorTy())
9937 ConditionBit = State.Builder.CreateExtractElement(
9938 ConditionBit, State.Builder.getInt32(Lane));
9939 } else // Block in mask is all-one.
9940 ConditionBit = State.Builder.getTrue();
9941
9942 // Replace the temporary unreachable terminator with a new conditional branch,
9943 // whose two destinations will be set later when they are created.
9944 auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
9945 assert(isa<UnreachableInst>(CurrentTerminator) &&
9946 "Expected to replace unreachable terminator with conditional branch.");
9947 auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
9948 CondBr->setSuccessor(0, nullptr);
9949 ReplaceInstWithInst(CurrentTerminator, CondBr);
9950 }
9951
9952 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
9953 assert(State.Instance && "Predicated instruction PHI works per instance.");
9954 Instruction *ScalarPredInst =
9955 cast<Instruction>(State.get(getOperand(0), *State.Instance));
9956 BasicBlock *PredicatedBB = ScalarPredInst->getParent();
9957 BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
9958 assert(PredicatingBB && "Predicated block has no single predecessor.");
9959 assert(isa<VPReplicateRecipe>(getOperand(0)) &&
9960 "operand must be VPReplicateRecipe");
9961
9962 // By current pack/unpack logic we need to generate only a single phi node: if
9963 // a vector value for the predicated instruction exists at this point it means
9964 // the instruction has vector users only, and a phi for the vector value is
9965 // needed. In this case the recipe of the predicated instruction is marked to
9966 // also do that packing, thereby "hoisting" the insert-element sequence.
9967 // Otherwise, a phi node for the scalar value is needed.
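  // Rough shape of the emitted IR for the vector (packing) case (illustrative
  // only, names invented):
  //   pred.bb:       %v.ins = insertelement <VF x T> %v, T %s, i32 <lane>
  //   continue.bb:   %v.phi = phi <VF x T> [ %v, %predicating.bb ],
  //                                        [ %v.ins, %pred.bb ]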
9968 unsigned Part = State.Instance->Part; 9969 if (State.hasVectorValue(getOperand(0), Part)) { 9970 Value *VectorValue = State.get(getOperand(0), Part); 9971 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); 9972 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); 9973 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. 9974 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. 9975 if (State.hasVectorValue(this, Part)) 9976 State.reset(this, VPhi, Part); 9977 else 9978 State.set(this, VPhi, Part); 9979 // NOTE: Currently we need to update the value of the operand, so the next 9980 // predicated iteration inserts its generated value in the correct vector. 9981 State.reset(getOperand(0), VPhi, Part); 9982 } else { 9983 Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType(); 9984 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); 9985 Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()), 9986 PredicatingBB); 9987 Phi->addIncoming(ScalarPredInst, PredicatedBB); 9988 if (State.hasScalarValue(this, *State.Instance)) 9989 State.reset(this, Phi, *State.Instance); 9990 else 9991 State.set(this, Phi, *State.Instance); 9992 // NOTE: Currently we need to update the value of the operand, so the next 9993 // predicated iteration inserts its generated value in the correct vector. 9994 State.reset(getOperand(0), Phi, *State.Instance); 9995 } 9996 } 9997 9998 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { 9999 VPValue *StoredValue = isStore() ? getStoredValue() : nullptr; 10000 10001 // Attempt to issue a wide load. 10002 LoadInst *LI = dyn_cast<LoadInst>(&Ingredient); 10003 StoreInst *SI = dyn_cast<StoreInst>(&Ingredient); 10004 10005 assert((LI || SI) && "Invalid Load/Store instruction"); 10006 assert((!SI || StoredValue) && "No stored value provided for widened store"); 10007 assert((!LI || !StoredValue) && "Stored value provided for widened load"); 10008 10009 Type *ScalarDataTy = getLoadStoreType(&Ingredient); 10010 10011 auto *DataTy = VectorType::get(ScalarDataTy, State.VF); 10012 const Align Alignment = getLoadStoreAlignment(&Ingredient); 10013 bool CreateGatherScatter = !Consecutive; 10014 10015 auto &Builder = State.Builder; 10016 InnerLoopVectorizer::VectorParts BlockInMaskParts(State.UF); 10017 bool isMaskRequired = getMask(); 10018 if (isMaskRequired) 10019 for (unsigned Part = 0; Part < State.UF; ++Part) 10020 BlockInMaskParts[Part] = State.get(getMask(), Part); 10021 10022 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { 10023 // Calculate the pointer for the specific unroll-part. 10024 GetElementPtrInst *PartPtr = nullptr; 10025 10026 bool InBounds = false; 10027 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) 10028 InBounds = gep->isInBounds(); 10029 if (Reverse) { 10030 // If the address is consecutive but reversed, then the 10031 // wide store needs to start at the last vector element. 
10032 // RunTimeVF = VScale * VF.getKnownMinValue() 10033 // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue() 10034 Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), State.VF); 10035 // NumElt = -Part * RunTimeVF 10036 Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF); 10037 // LastLane = 1 - RunTimeVF 10038 Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF); 10039 PartPtr = 10040 cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt)); 10041 PartPtr->setIsInBounds(InBounds); 10042 PartPtr = cast<GetElementPtrInst>( 10043 Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane)); 10044 PartPtr->setIsInBounds(InBounds); 10045 if (isMaskRequired) // Reverse of a null all-one mask is a null mask. 10046 BlockInMaskParts[Part] = 10047 Builder.CreateVectorReverse(BlockInMaskParts[Part], "reverse"); 10048 } else { 10049 Value *Increment = 10050 createStepForVF(Builder, Builder.getInt32Ty(), State.VF, Part); 10051 PartPtr = cast<GetElementPtrInst>( 10052 Builder.CreateGEP(ScalarDataTy, Ptr, Increment)); 10053 PartPtr->setIsInBounds(InBounds); 10054 } 10055 10056 unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); 10057 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); 10058 }; 10059 10060 // Handle Stores: 10061 if (SI) { 10062 State.ILV->setDebugLocFromInst(SI); 10063 10064 for (unsigned Part = 0; Part < State.UF; ++Part) { 10065 Instruction *NewSI = nullptr; 10066 Value *StoredVal = State.get(StoredValue, Part); 10067 if (CreateGatherScatter) { 10068 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 10069 Value *VectorGep = State.get(getAddr(), Part); 10070 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, 10071 MaskPart); 10072 } else { 10073 if (Reverse) { 10074 // If we store to reverse consecutive memory locations, then we need 10075 // to reverse the order of elements in the stored value. 10076 StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse"); 10077 // We don't want to update the value in the map as it might be used in 10078 // another expression. So don't call resetVectorValue(StoredVal). 10079 } 10080 auto *VecPtr = 10081 CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0))); 10082 if (isMaskRequired) 10083 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, 10084 BlockInMaskParts[Part]); 10085 else 10086 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); 10087 } 10088 State.ILV->addMetadata(NewSI, SI); 10089 } 10090 return; 10091 } 10092 10093 // Handle loads. 10094 assert(LI && "Must have a load instruction"); 10095 State.ILV->setDebugLocFromInst(LI); 10096 for (unsigned Part = 0; Part < State.UF; ++Part) { 10097 Value *NewLI; 10098 if (CreateGatherScatter) { 10099 Value *MaskPart = isMaskRequired ? 
BlockInMaskParts[Part] : nullptr; 10100 Value *VectorGep = State.get(getAddr(), Part); 10101 NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart, 10102 nullptr, "wide.masked.gather"); 10103 State.ILV->addMetadata(NewLI, LI); 10104 } else { 10105 auto *VecPtr = 10106 CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0))); 10107 if (isMaskRequired) 10108 NewLI = Builder.CreateMaskedLoad( 10109 DataTy, VecPtr, Alignment, BlockInMaskParts[Part], 10110 PoisonValue::get(DataTy), "wide.masked.load"); 10111 else 10112 NewLI = 10113 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); 10114 10115 // Add metadata to the load, but setVectorValue to the reverse shuffle. 10116 State.ILV->addMetadata(NewLI, LI); 10117 if (Reverse) 10118 NewLI = Builder.CreateVectorReverse(NewLI, "reverse"); 10119 } 10120 10121 State.set(this, NewLI, Part); 10122 } 10123 } 10124 10125 // Determine how to lower the scalar epilogue, which depends on 1) optimising 10126 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing 10127 // predication, and 4) a TTI hook that analyses whether the loop is suitable 10128 // for predication. 10129 static ScalarEpilogueLowering getScalarEpilogueLowering( 10130 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, 10131 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, 10132 AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, 10133 LoopVectorizationLegality &LVL) { 10134 // 1) OptSize takes precedence over all other options, i.e. if this is set, 10135 // don't look at hints or options, and don't request a scalar epilogue. 10136 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from 10137 // LoopAccessInfo (due to code dependency and not being able to reliably get 10138 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection 10139 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without 10140 // versioning when the vectorization is forced, unlike hasOptSize. So revert 10141 // back to the old way and vectorize with versioning when forced. See D81345.) 10142 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, 10143 PGSOQueryType::IRPass) && 10144 Hints.getForce() != LoopVectorizeHints::FK_Enabled)) 10145 return CM_ScalarEpilogueNotAllowedOptSize; 10146 10147 // 2) If set, obey the directives 10148 if (PreferPredicateOverEpilogue.getNumOccurrences()) { 10149 switch (PreferPredicateOverEpilogue) { 10150 case PreferPredicateTy::ScalarEpilogue: 10151 return CM_ScalarEpilogueAllowed; 10152 case PreferPredicateTy::PredicateElseScalarEpilogue: 10153 return CM_ScalarEpilogueNotNeededUsePredicate; 10154 case PreferPredicateTy::PredicateOrDontVectorize: 10155 return CM_ScalarEpilogueNotAllowedUsePredicate; 10156 }; 10157 } 10158 10159 // 3) If set, obey the hints 10160 switch (Hints.getPredicate()) { 10161 case LoopVectorizeHints::FK_Enabled: 10162 return CM_ScalarEpilogueNotNeededUsePredicate; 10163 case LoopVectorizeHints::FK_Disabled: 10164 return CM_ScalarEpilogueAllowed; 10165 }; 10166 10167 // 4) if the TTI hook indicates this is profitable, request predication. 
10168 if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
10169 LVL.getLAI()))
10170 return CM_ScalarEpilogueNotNeededUsePredicate;
10171
10172 return CM_ScalarEpilogueAllowed;
10173 }
10174
10175 Value *VPTransformState::get(VPValue *Def, unsigned Part) {
10176 // If Values have been set for this Def, return the one relevant for \p Part.
10177 if (hasVectorValue(Def, Part))
10178 return Data.PerPartOutput[Def][Part];
10179
10180 if (!hasScalarValue(Def, {Part, 0})) {
10181 Value *IRV = Def->getLiveInIRValue();
10182 Value *B = ILV->getBroadcastInstrs(IRV);
10183 set(Def, B, Part);
10184 return B;
10185 }
10186
10187 Value *ScalarValue = get(Def, {Part, 0});
10188 // If we aren't vectorizing, we can just copy the scalar map values over
10189 // to the vector map.
10190 if (VF.isScalar()) {
10191 set(Def, ScalarValue, Part);
10192 return ScalarValue;
10193 }
10194
10195 auto *RepR = dyn_cast<VPReplicateRecipe>(Def);
10196 bool IsUniform = RepR && RepR->isUniform();
10197
10198 unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1;
10199 // Check if there is a scalar value for the selected lane.
10200 if (!hasScalarValue(Def, {Part, LastLane})) {
10201 // At the moment, VPWidenIntOrFpInductionRecipes can also be uniform.
10202 assert((isa<VPWidenIntOrFpInductionRecipe>(Def->getDef()) ||
10203 isa<VPScalarIVStepsRecipe>(Def->getDef())) &&
10204 "unexpected recipe found to be invariant");
10205 IsUniform = true;
10206 LastLane = 0;
10207 }
10208
10209 auto *LastInst = cast<Instruction>(get(Def, {Part, LastLane}));
10210 // Set the insert point after the last scalarized instruction or after the
10211 // last PHI, if LastInst is a PHI. This ensures the insertelement sequence
10212 // will directly follow the scalar definitions.
10213 auto OldIP = Builder.saveIP();
10214 auto NewIP =
10215 isa<PHINode>(LastInst)
10216 ? BasicBlock::iterator(LastInst->getParent()->getFirstNonPHI())
10217 : std::next(BasicBlock::iterator(LastInst));
10218 Builder.SetInsertPoint(&*NewIP);
10219
10220 // However, if we are vectorizing, we need to construct the vector values.
10221 // If the value is known to be uniform after vectorization, we can just
10222 // broadcast the scalar value corresponding to lane zero for each unroll
10223 // iteration. Otherwise, we construct the vector values using
10224 // insertelement instructions. Since the resulting vectors are stored in
10225 // State, we will only generate the insertelements once.
10226 Value *VectorValue = nullptr;
10227 if (IsUniform) {
10228 VectorValue = ILV->getBroadcastInstrs(ScalarValue);
10229 set(Def, VectorValue, Part);
10230 } else {
10231 // Initialize packing with insertelements to start from poison.
10232 assert(!VF.isScalable() && "VF is assumed to be non scalable.");
10233 Value *Poison = PoisonValue::get(VectorType::get(LastInst->getType(), VF));
10234 set(Def, Poison, Part);
10235 for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
10236 ILV->packScalarIntoVectorValue(Def, {Part, Lane}, *this);
10237 VectorValue = get(Def, Part);
10238 }
10239 Builder.restoreIP(OldIP);
10240 return VectorValue;
10241 }
10242
10243 // Process the loop in the VPlan-native vectorization path. This path builds
10244 // VPlan upfront in the vectorization pipeline, which allows applying
10245 // VPlan-to-VPlan transformations from the very beginning without modifying the
10246 // input LLVM IR.
10247 static bool processLoopInVPlanNativePath(
10248 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
10249 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
10250 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
10251 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
10252 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints,
10253 LoopVectorizationRequirements &Requirements) {
10254
10255 if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
10256 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
10257 return false;
10258 }
10259 assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
10260 Function *F = L->getHeader()->getParent();
10261 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
10262
10263 ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
10264 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);
10265
10266 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
10267 &Hints, IAI);
10268 // Use the planner for outer loop vectorization.
10269 // TODO: CM is not used at this point inside the planner. Turn CM into an
10270 // optional argument if we don't need it in the future.
10271 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints,
10272 Requirements, ORE);
10273
10274 // Get user vectorization factor.
10275 ElementCount UserVF = Hints.getWidth();
10276
10277 CM.collectElementTypesForWidening();
10278
10279 // Plan how to best vectorize, return the best VF and its cost.
10280 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
10281
10282 // If we are stress testing VPlan builds, do not attempt to generate vector
10283 // code. Masked vector code generation support will follow soon.
10284 // Also, do not attempt to vectorize if no vector code will be produced.
10285 if (VPlanBuildStressTest || EnableVPlanPredication ||
10286 VectorizationFactor::Disabled() == VF)
10287 return false;
10288
10289 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
10290
10291 {
10292 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
10293 F->getParent()->getDataLayout());
10294 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
10295 &CM, BFI, PSI, Checks);
10296 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
10297 << L->getHeader()->getParent()->getName() << "\"\n");
10298 LVP.executePlan(VF.Width, 1, BestPlan, LB, DT);
10299 }
10300
10301 // Mark the loop as already vectorized to avoid vectorizing again.
10302 Hints.setAlreadyVectorized();
10303 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
10304 return true;
10305 }
10306
10307 // Emit a remark if there are stores to floats that required a floating point
10308 // extension. If the vectorized loop was generated with mixed floating-point
10309 // precision, there will be a performance penalty from the conversion overhead
10310 // and the change in the vector width.
10311 static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
10312 SmallVector<Instruction *, 4> Worklist;
10313 for (BasicBlock *BB : L->getBlocks()) {
10314 for (Instruction &Inst : *BB) {
10315 if (auto *S = dyn_cast<StoreInst>(&Inst)) {
10316 if (S->getValueOperand()->getType()->isFloatTy())
10317 Worklist.push_back(S);
10318 }
10319 }
10320 }
10321
10322 // Traverse the floating point stores upwards searching for floating point
10323 // conversions.
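  // An illustrative instance of the pattern this flags (names invented):
  //   %w = fpext float %x to double
  //   %m = fmul double %w, %y
  //   %t = fptrunc double %m to float
  //   store float %t, float* %p
  // The float store is fed, through its operands, by an fpext, so the loop
  // mixes float and double and the vectorized code pays for the up/down casts.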
10324 SmallPtrSet<const Instruction *, 4> Visited; 10325 SmallPtrSet<const Instruction *, 4> EmittedRemark; 10326 while (!Worklist.empty()) { 10327 auto *I = Worklist.pop_back_val(); 10328 if (!L->contains(I)) 10329 continue; 10330 if (!Visited.insert(I).second) 10331 continue; 10332 10333 // Emit a remark if the floating point store required a floating 10334 // point conversion. 10335 // TODO: More work could be done to identify the root cause such as a 10336 // constant or a function return type and point the user to it. 10337 if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second) 10338 ORE->emit([&]() { 10339 return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision", 10340 I->getDebugLoc(), L->getHeader()) 10341 << "floating point conversion changes vector width. " 10342 << "Mixed floating point precision requires an up/down " 10343 << "cast that will negatively impact performance."; 10344 }); 10345 10346 for (Use &Op : I->operands()) 10347 if (auto *OpI = dyn_cast<Instruction>(Op)) 10348 Worklist.push_back(OpI); 10349 } 10350 } 10351 10352 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts) 10353 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced || 10354 !EnableLoopInterleaving), 10355 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced || 10356 !EnableLoopVectorization) {} 10357 10358 bool LoopVectorizePass::processLoop(Loop *L) { 10359 assert((EnableVPlanNativePath || L->isInnermost()) && 10360 "VPlan-native path is not enabled. Only process inner loops."); 10361 10362 #ifndef NDEBUG 10363 const std::string DebugLocStr = getDebugLocString(L); 10364 #endif /* NDEBUG */ 10365 10366 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \"" 10367 << L->getHeader()->getParent()->getName() << "\" from " 10368 << DebugLocStr << "\n"); 10369 10370 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI); 10371 10372 LLVM_DEBUG( 10373 dbgs() << "LV: Loop hints:" 10374 << " force=" 10375 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled 10376 ? "disabled" 10377 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled 10378 ? "enabled" 10379 : "?")) 10380 << " width=" << Hints.getWidth() 10381 << " interleave=" << Hints.getInterleave() << "\n"); 10382 10383 // Function containing loop 10384 Function *F = L->getHeader()->getParent(); 10385 10386 // Looking at the diagnostic output is the only way to determine if a loop 10387 // was vectorized (other than looking at the IR or machine code), so it 10388 // is important to generate an optimization remark for each loop. Most of 10389 // these messages are generated as OptimizationRemarkAnalysis. Remarks 10390 // generated as OptimizationRemark and OptimizationRemarkMissed are 10391 // less verbose reporting vectorized loops and unvectorized loops that may 10392 // benefit from vectorization, respectively. 10393 10394 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) { 10395 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); 10396 return false; 10397 } 10398 10399 PredicatedScalarEvolution PSE(*SE, *L); 10400 10401 // Check if it is legal to vectorize the loop. 
10402 LoopVectorizationRequirements Requirements; 10403 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE, 10404 &Requirements, &Hints, DB, AC, BFI, PSI); 10405 if (!LVL.canVectorize(EnableVPlanNativePath)) { 10406 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); 10407 Hints.emitRemarkWithHints(); 10408 return false; 10409 } 10410 10411 // Check the function attributes and profiles to find out if this function 10412 // should be optimized for size. 10413 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 10414 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL); 10415 10416 // Entrance to the VPlan-native vectorization path. Outer loops are processed 10417 // here. They may require CFG and instruction level transformations before 10418 // even evaluating whether vectorization is profitable. Since we cannot modify 10419 // the incoming IR, we need to build VPlan upfront in the vectorization 10420 // pipeline. 10421 if (!L->isInnermost()) 10422 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, 10423 ORE, BFI, PSI, Hints, Requirements); 10424 10425 assert(L->isInnermost() && "Inner loop expected."); 10426 10427 // Check the loop for a trip count threshold: vectorize loops with a tiny trip 10428 // count by optimizing for size, to minimize overheads. 10429 auto ExpectedTC = getSmallBestKnownTC(*SE, L); 10430 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) { 10431 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " 10432 << "This loop is worth vectorizing only if no scalar " 10433 << "iteration overheads are incurred."); 10434 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) 10435 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); 10436 else { 10437 LLVM_DEBUG(dbgs() << "\n"); 10438 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; 10439 } 10440 } 10441 10442 // Check the function attributes to see if implicit floats are allowed. 10443 // FIXME: This check doesn't seem possibly correct -- what if the loop is 10444 // an integer loop and the vector instructions selected are purely integer 10445 // vector instructions? 10446 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { 10447 reportVectorizationFailure( 10448 "Can't vectorize when the NoImplicitFloat attribute is used", 10449 "loop not vectorized due to NoImplicitFloat attribute", 10450 "NoImplicitFloat", ORE, L); 10451 Hints.emitRemarkWithHints(); 10452 return false; 10453 } 10454 10455 // Check if the target supports potentially unsafe FP vectorization. 10456 // FIXME: Add a check for the type of safety issue (denormal, signaling) 10457 // for the target we're vectorizing for, to make sure none of the 10458 // additional fp-math flags can help. 10459 if (Hints.isPotentiallyUnsafe() && 10460 TTI->isFPVectorizationPotentiallyUnsafe()) { 10461 reportVectorizationFailure( 10462 "Potentially unsafe FP op prevents vectorization", 10463 "loop not vectorized due to unsafe FP support.", 10464 "UnsafeFP", ORE, L); 10465 Hints.emitRemarkWithHints(); 10466 return false; 10467 } 10468 10469 bool AllowOrderedReductions; 10470 // If the flag is set, use that instead and override the TTI behaviour. 
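  // (Illustrative note: an "ordered" reduction preserves the original
  // sequential floating-point evaluation order, e.g. by feeding each vector
  // part into llvm.vector.reduce.fadd with the running scalar as start value,
  // instead of building a reassociated tree reduction. That is what allows
  // strict in-order FP reductions to be vectorized without fast-math.)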
10471 if (ForceOrderedReductions.getNumOccurrences() > 0) 10472 AllowOrderedReductions = ForceOrderedReductions; 10473 else 10474 AllowOrderedReductions = TTI->enableOrderedReductions(); 10475 if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) { 10476 ORE->emit([&]() { 10477 auto *ExactFPMathInst = Requirements.getExactFPInst(); 10478 return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps", 10479 ExactFPMathInst->getDebugLoc(), 10480 ExactFPMathInst->getParent()) 10481 << "loop not vectorized: cannot prove it is safe to reorder " 10482 "floating-point operations"; 10483 }); 10484 LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to " 10485 "reorder floating-point operations\n"); 10486 Hints.emitRemarkWithHints(); 10487 return false; 10488 } 10489 10490 bool UseInterleaved = TTI->enableInterleavedAccessVectorization(); 10491 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI()); 10492 10493 // If an override option has been passed in for interleaved accesses, use it. 10494 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0) 10495 UseInterleaved = EnableInterleavedMemAccesses; 10496 10497 // Analyze interleaved memory accesses. 10498 if (UseInterleaved) { 10499 IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI)); 10500 } 10501 10502 // Use the cost model. 10503 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, 10504 F, &Hints, IAI); 10505 CM.collectValuesToIgnore(); 10506 CM.collectElementTypesForWidening(); 10507 10508 // Use the planner for vectorization. 10509 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints, 10510 Requirements, ORE); 10511 10512 // Get user vectorization factor and interleave count. 10513 ElementCount UserVF = Hints.getWidth(); 10514 unsigned UserIC = Hints.getInterleave(); 10515 10516 // Plan how to best vectorize, return the best VF and its cost. 10517 Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC); 10518 10519 VectorizationFactor VF = VectorizationFactor::Disabled(); 10520 unsigned IC = 1; 10521 10522 if (MaybeVF) { 10523 VF = *MaybeVF; 10524 // Select the interleave count. 10525 IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue()); 10526 } 10527 10528 // Identify the diagnostic messages that should be produced. 10529 std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg; 10530 bool VectorizeLoop = true, InterleaveLoop = true; 10531 if (VF.Width.isScalar()) { 10532 LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n"); 10533 VecDiagMsg = std::make_pair( 10534 "VectorizationNotBeneficial", 10535 "the cost-model indicates that vectorization is not beneficial"); 10536 VectorizeLoop = false; 10537 } 10538 10539 if (!MaybeVF && UserIC > 1) { 10540 // Tell the user interleaving was avoided up-front, despite being explicitly 10541 // requested. 10542 LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and " 10543 "interleaving should be avoided up front\n"); 10544 IntDiagMsg = std::make_pair( 10545 "InterleavingAvoided", 10546 "Ignoring UserIC, because interleaving was avoided up front"); 10547 InterleaveLoop = false; 10548 } else if (IC == 1 && UserIC <= 1) { 10549 // Tell the user interleaving is not beneficial. 
10550 LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
10551 IntDiagMsg = std::make_pair(
10552 "InterleavingNotBeneficial",
10553 "the cost-model indicates that interleaving is not beneficial");
10554 InterleaveLoop = false;
10555 if (UserIC == 1) {
10556 IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
10557 IntDiagMsg.second +=
10558 " and is explicitly disabled or interleave count is set to 1";
10559 }
10560 } else if (IC > 1 && UserIC == 1) {
10561 // Tell the user interleaving is beneficial, but it is explicitly disabled.
10562 LLVM_DEBUG(
10563 dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
10564 IntDiagMsg = std::make_pair(
10565 "InterleavingBeneficialButDisabled",
10566 "the cost-model indicates that interleaving is beneficial "
10567 "but is explicitly disabled or interleave count is set to 1");
10568 InterleaveLoop = false;
10569 }
10570
10571 // Override IC if user provided an interleave count.
10572 IC = UserIC > 0 ? UserIC : IC;
10573
10574 // Emit diagnostic messages, if any.
10575 const char *VAPassName = Hints.vectorizeAnalysisPassName();
10576 if (!VectorizeLoop && !InterleaveLoop) {
10577 // Do not vectorize or interleave the loop.
10578 ORE->emit([&]() {
10579 return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
10580 L->getStartLoc(), L->getHeader())
10581 << VecDiagMsg.second;
10582 });
10583 ORE->emit([&]() {
10584 return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
10585 L->getStartLoc(), L->getHeader())
10586 << IntDiagMsg.second;
10587 });
10588 return false;
10589 } else if (!VectorizeLoop && InterleaveLoop) {
10590 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10591 ORE->emit([&]() {
10592 return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
10593 L->getStartLoc(), L->getHeader())
10594 << VecDiagMsg.second;
10595 });
10596 } else if (VectorizeLoop && !InterleaveLoop) {
10597 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10598 << ") in " << DebugLocStr << '\n');
10599 ORE->emit([&]() {
10600 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
10601 L->getStartLoc(), L->getHeader())
10602 << IntDiagMsg.second;
10603 });
10604 } else if (VectorizeLoop && InterleaveLoop) {
10605 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10606 << ") in " << DebugLocStr << '\n');
10607 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10608 }
10609
10610 bool DisableRuntimeUnroll = false;
10611 MDNode *OrigLoopID = L->getLoopID();
10612 {
10613 // Optimistically generate runtime checks. Drop them if they turn out to not
10614 // be profitable. Limit the scope of Checks, so the cleanup happens
10615 // immediately after vector code generation is done.
10616 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
10617 F->getParent()->getDataLayout());
10618 if (!VF.Width.isScalar() || IC > 1)
10619 Checks.Create(L, *LVL.getLAI(), PSE.getPredicate());
10620
10621 using namespace ore;
10622 if (!VectorizeLoop) {
10623 assert(IC > 1 && "interleave count should not be 1 or 0");
10624 // If we decided that it is not legal to vectorize the loop, then
10625 // interleave it.
10626 InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, 10627 &CM, BFI, PSI, Checks); 10628 10629 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); 10630 LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT); 10631 10632 ORE->emit([&]() { 10633 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(), 10634 L->getHeader()) 10635 << "interleaved loop (interleaved count: " 10636 << NV("InterleaveCount", IC) << ")"; 10637 }); 10638 } else { 10639 // If we decided that it is *legal* to vectorize the loop, then do it. 10640 10641 // Consider vectorizing the epilogue too if it's profitable. 10642 VectorizationFactor EpilogueVF = 10643 CM.selectEpilogueVectorizationFactor(VF.Width, LVP); 10644 if (EpilogueVF.Width.isVector()) { 10645 10646 // The first pass vectorizes the main loop and creates a scalar epilogue 10647 // to be vectorized by executing the plan (potentially with a different 10648 // factor) again shortly afterwards. 10649 EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1); 10650 EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, 10651 EPI, &LVL, &CM, BFI, PSI, Checks); 10652 10653 VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF); 10654 LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV, 10655 DT); 10656 ++LoopsVectorized; 10657 10658 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */); 10659 formLCSSARecursively(*L, *DT, LI, SE); 10660 10661 // Second pass vectorizes the epilogue and adjusts the control flow 10662 // edges from the first pass. 10663 EPI.MainLoopVF = EPI.EpilogueVF; 10664 EPI.MainLoopUF = EPI.EpilogueUF; 10665 EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC, 10666 ORE, EPI, &LVL, &CM, BFI, PSI, 10667 Checks); 10668 10669 VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF); 10670 10671 // Ensure that the start values for any VPReductionPHIRecipes are 10672 // updated before vectorising the epilogue loop. 10673 VPBasicBlock *Header = BestEpiPlan.getEntry()->getEntryBasicBlock(); 10674 for (VPRecipeBase &R : Header->phis()) { 10675 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) { 10676 if (auto *Resume = MainILV.getReductionResumeValue( 10677 ReductionPhi->getRecurrenceDescriptor())) { 10678 VPValue *StartVal = new VPValue(Resume); 10679 BestEpiPlan.addExternalDef(StartVal); 10680 ReductionPhi->setOperand(0, StartVal); 10681 } 10682 } 10683 } 10684 10685 LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV, 10686 DT); 10687 ++LoopsEpilogueVectorized; 10688 10689 if (!MainILV.areSafetyChecksAdded()) 10690 DisableRuntimeUnroll = true; 10691 } else { 10692 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC, 10693 &LVL, &CM, BFI, PSI, Checks); 10694 10695 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); 10696 LVP.executePlan(VF.Width, IC, BestPlan, LB, DT); 10697 ++LoopsVectorized; 10698 10699 // Add metadata to disable runtime unrolling a scalar loop when there 10700 // are no runtime checks about strides and memory. A scalar loop that is 10701 // rarely used is not worth unrolling. 10702 if (!LB.areSafetyChecksAdded()) 10703 DisableRuntimeUnroll = true; 10704 } 10705 // Report the vectorization decision. 
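      // For illustration, the remark emitted below reads along the lines of:
      //   remark: <loc>: vectorized loop (vectorization width: 4, interleaved
      //   count: 2)
      // (the actual width and count depend on the chosen VF and IC).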
10706 ORE->emit([&]() {
10707 return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
10708 L->getHeader())
10709 << "vectorized loop (vectorization width: "
10710 << NV("VectorizationFactor", VF.Width)
10711 << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
10712 });
10713 }
10714
10715 if (ORE->allowExtraAnalysis(LV_NAME))
10716 checkMixedPrecision(L, ORE);
10717 }
10718
10719 Optional<MDNode *> RemainderLoopID =
10720 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
10721 LLVMLoopVectorizeFollowupEpilogue});
10722 if (RemainderLoopID.hasValue()) {
10723 L->setLoopID(RemainderLoopID.getValue());
10724 } else {
10725 if (DisableRuntimeUnroll)
10726 AddRuntimeUnrollDisableMetaData(L);
10727
10728 // Mark the loop as already vectorized to avoid vectorizing again.
10729 Hints.setAlreadyVectorized();
10730 }
10731
10732 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
10733 return true;
10734 }
10735
10736 LoopVectorizeResult LoopVectorizePass::runImpl(
10737 Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
10738 DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
10739 DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_,
10740 std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
10741 OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
10742 SE = &SE_;
10743 LI = &LI_;
10744 TTI = &TTI_;
10745 DT = &DT_;
10746 BFI = &BFI_;
10747 TLI = TLI_;
10748 AA = &AA_;
10749 AC = &AC_;
10750 GetLAA = &GetLAA_;
10751 DB = &DB_;
10752 ORE = &ORE_;
10753 PSI = PSI_;
10754
10755 // Don't attempt if
10756 // 1. the target claims to have no vector registers, and
10757 // 2. interleaving won't help ILP.
10758 //
10759 // The second condition is necessary because, even if the target has no
10760 // vector registers, loop vectorization may still enable scalar
10761 // interleaving.
10762 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
10763 TTI->getMaxInterleaveFactor(1) < 2)
10764 return LoopVectorizeResult(false, false);
10765
10766 bool Changed = false, CFGChanged = false;
10767
10768 // The vectorizer requires loops to be in simplified form.
10769 // Since simplification may add new inner loops, it has to run before the
10770 // legality and profitability checks. This means running the loop vectorizer
10771 // will simplify all loops, regardless of whether anything ends up being
10772 // vectorized.
10773 for (auto &L : *LI)
10774 Changed |= CFGChanged |=
10775 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
10776
10777 // Build up a worklist of inner-loops to vectorize. This is necessary as
10778 // the act of vectorizing or partially unrolling a loop creates new loops
10779 // and can invalidate iterators across the loops.
10780 SmallVector<Loop *, 8> Worklist;
10781
10782 for (Loop *L : *LI)
10783 collectSupportedLoops(*L, LI, ORE, Worklist);
10784
10785 LoopsAnalyzed += Worklist.size();
10786
10787 // Now walk the identified inner loops.
10788 while (!Worklist.empty()) {
10789 Loop *L = Worklist.pop_back_val();
10790
10791 // For the inner loops we actually process, form LCSSA to simplify the
10792 // transform.
10793 Changed |= formLCSSARecursively(*L, *DT, LI, SE);
10794
10795 Changed |= CFGChanged |= processLoop(L);
10796 }
10797
10798 // Process each loop nest in the function.
10799 return LoopVectorizeResult(Changed, CFGChanged);
10800 }
10801
10802 PreservedAnalyses LoopVectorizePass::run(Function &F,
10803 FunctionAnalysisManager &AM) {
10804 auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
10805 auto &LI = AM.getResult<LoopAnalysis>(F);
10806 auto &TTI = AM.getResult<TargetIRAnalysis>(F);
10807 auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
10808 auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
10809 auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
10810 auto &AA = AM.getResult<AAManager>(F);
10811 auto &AC = AM.getResult<AssumptionAnalysis>(F);
10812 auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
10813 auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
10814
10815 auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
10816 std::function<const LoopAccessInfo &(Loop &)> GetLAA =
10817 [&](Loop &L) -> const LoopAccessInfo & {
10818 LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE,
10819 TLI, TTI, nullptr, nullptr, nullptr};
10820 return LAM.getResult<LoopAccessAnalysis>(L, AR);
10821 };
10822 auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
10823 ProfileSummaryInfo *PSI =
10824 MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
10825 LoopVectorizeResult Result =
10826 runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
10827 if (!Result.MadeAnyChange)
10828 return PreservedAnalyses::all();
10829 PreservedAnalyses PA;
10830
10831 // We currently do not preserve loopinfo/dominator analyses with outer loop
10832 // vectorization. Until this is addressed, mark these analyses as preserved
10833 // only for non-VPlan-native path.
10834 // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
10835 if (!EnableVPlanNativePath) {
10836 PA.preserve<LoopAnalysis>();
10837 PA.preserve<DominatorTreeAnalysis>();
10838 }
10839
10840 if (Result.MadeCFGChange) {
10841 // Making CFG changes likely means a loop got vectorized. Indicate that
10842 // extra simplification passes should be run.
10843 // TODO: MadeCFGChange is not a perfect proxy. Extra passes should only
10844 // be run if runtime checks have been added.
10845 AM.getResult<ShouldRunExtraVectorPasses>(F);
10846 PA.preserve<ShouldRunExtraVectorPasses>();
10847 } else {
10848 PA.preserveSet<CFGAnalyses>();
10849 }
10850 return PA;
10851 }
10852
10853 void LoopVectorizePass::printPipeline(
10854 raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
10855 static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
10856 OS, MapClassName2PassName);
10857
10858 OS << "<";
10859 OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
10860 OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
10861 OS << ">";
10862 }
10863