//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
//  Data for SIMD
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
//
//===----------------------------------------------------------------------===//

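// Illustrative sketch (editorial example, not part of the pass): for a scalar
// loop such as
//
//   for (int i = 0; i < n; ++i)
//     a[i] = b[i] + 42;
//
// vectorizing with VF=4 conceptually rewrites the body to operate on
// <4 x i32> values, so each iteration of the new loop handles a[i..i+3] and
// the induction variable is incremented by 4 (times the unroll factor, if
// interleaving is applied). Iterations left over when n is not a multiple of
// 4 run in a scalar epilogue loop unless the tail is folded by predication.
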
#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "LoopVectorizationPlanner.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanHCFGBuilder.h"
#include "VPlanPredicator.h"
#include "VPlanTransforms.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/InstructionCost.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
"llvm/Transforms/Vectorize/LoopVectorizationLegality.h" 144 #include <algorithm> 145 #include <cassert> 146 #include <cstdint> 147 #include <cstdlib> 148 #include <functional> 149 #include <iterator> 150 #include <limits> 151 #include <memory> 152 #include <string> 153 #include <tuple> 154 #include <utility> 155 156 using namespace llvm; 157 158 #define LV_NAME "loop-vectorize" 159 #define DEBUG_TYPE LV_NAME 160 161 #ifndef NDEBUG 162 const char VerboseDebug[] = DEBUG_TYPE "-verbose"; 163 #endif 164 165 /// @{ 166 /// Metadata attribute names 167 const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all"; 168 const char LLVMLoopVectorizeFollowupVectorized[] = 169 "llvm.loop.vectorize.followup_vectorized"; 170 const char LLVMLoopVectorizeFollowupEpilogue[] = 171 "llvm.loop.vectorize.followup_epilogue"; 172 /// @} 173 174 STATISTIC(LoopsVectorized, "Number of loops vectorized"); 175 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization"); 176 STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized"); 177 178 static cl::opt<bool> EnableEpilogueVectorization( 179 "enable-epilogue-vectorization", cl::init(true), cl::Hidden, 180 cl::desc("Enable vectorization of epilogue loops.")); 181 182 static cl::opt<unsigned> EpilogueVectorizationForceVF( 183 "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden, 184 cl::desc("When epilogue vectorization is enabled, and a value greater than " 185 "1 is specified, forces the given VF for all applicable epilogue " 186 "loops.")); 187 188 static cl::opt<unsigned> EpilogueVectorizationMinVF( 189 "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden, 190 cl::desc("Only loops with vectorization factor equal to or larger than " 191 "the specified value are considered for epilogue vectorization.")); 192 193 /// Loops with a known constant trip count below this number are vectorized only 194 /// if no scalar iteration overheads are incurred. 195 static cl::opt<unsigned> TinyTripCountVectorThreshold( 196 "vectorizer-min-trip-count", cl::init(16), cl::Hidden, 197 cl::desc("Loops with a constant trip count that is smaller than this " 198 "value are vectorized only if no scalar iteration overheads " 199 "are incurred.")); 200 201 static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold( 202 "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden, 203 cl::desc("The maximum allowed number of runtime memory checks with a " 204 "vectorize(enable) pragma.")); 205 206 // Option prefer-predicate-over-epilogue indicates that an epilogue is undesired, 207 // that predication is preferred, and this lists all options. I.e., the 208 // vectorizer will try to fold the tail-loop (epilogue) into the vector body 209 // and predicate the instructions accordingly. 
// Option prefer-predicate-over-epilogue indicates that an epilogue is
// undesired and predication is preferred; it lists all options. I.e., the
// vectorizer will try to fold the tail loop (epilogue) into the vector body
// and predicate the instructions accordingly. If tail-folding fails, there
// are different fallback strategies depending on these values:
namespace PreferPredicateTy {
enum Option {
  ScalarEpilogue = 0,
  PredicateElseScalarEpilogue,
  PredicateOrDontVectorize
};
} // namespace PreferPredicateTy

static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
    "prefer-predicate-over-epilogue",
    cl::init(PreferPredicateTy::ScalarEpilogue),
    cl::Hidden,
    cl::desc("Tail-folding and predication preferences over creating a scalar "
             "epilogue loop."),
    cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
                          "scalar-epilogue",
                          "Don't tail-predicate loops, create scalar epilogue"),
               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
                          "predicate-else-scalar-epilogue",
                          "prefer tail-folding, create scalar epilogue if tail "
                          "folding fails."),
               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
                          "predicate-dont-vectorize",
                          "prefer tail-folding, don't attempt vectorization if "
                          "tail-folding fails.")));

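// Illustrative usage (assumed command line, shown for clarity only):
//   opt -passes=loop-vectorize \
//       -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue ...
// asks the vectorizer to fold the tail into a predicated vector body and to
// fall back to a scalar epilogue only when tail-folding is not possible.
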
static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a "
             "loop"));

static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
             "below this number"));

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

static cl::opt<bool> ForceTargetSupportsScalableVectors(
    "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
    cl::desc(
        "Pretend that scalable vectors are supported, even if the target does "
        "not support them. This flag should only be used for testing."));

static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Interleave loops at runtime for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// Interleave small loops with scalar reductions.
static cl::opt<bool> InterleaveSmallLoopScalarReduction(
    "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
    cl::desc("Enable interleaving for loops with small iteration counts that "
             "contain scalar reductions to expose ILP."));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if-predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

static cl::opt<bool>
    PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
                           cl::Hidden,
                           cl::desc("Prefer in-loop vector reductions, "
                                    "overriding the target's preference."));

static cl::opt<bool> ForceOrderedReductions(
    "force-ordered-reductions", cl::init(false), cl::Hidden,
    cl::desc("Enable the vectorization of loops with in-order (strict) "
             "FP reductions"));

static cl::opt<bool> PreferPredicatedReductionSelect(
    "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
    cl::desc(
        "Prefer predicating a reduction operation over an after-loop select."));

cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));

// FIXME: Remove this switch once we have divergence analysis. Currently we
// assume divergent non-backedge branches when this switch is true.
cl::opt<bool> EnableVPlanPredication(
    "enable-vplan-predication", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path predicator with "
             "support for outer loop vectorization."));

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

cl::opt<bool> PrintVPlansInDotFormat(
    "vplan-print-in-dot-format", cl::init(false), cl::Hidden,
    cl::desc("Use dot format instead of plain text when dumping VPlans"));

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type.
static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
  // Determine if an array of N elements of type Ty is "bitcast compatible"
  // with a <N x Ty> vector.
  // This is only true if there is no padding between the array elements.
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}

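// For example (editorial note, assuming a typical x86 data layout): x86_fp80
// has a type size of 80 bits but is usually allocated in 96 or 128 bits, so
// an array of it is not bitcast-compatible with a vector and the type is
// "irregular"; types such as i32, i64, float and double have matching sizes
// and are regular.
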
/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

/// Returns "best known" trip count for the specified loop \p L as defined by
/// the following procedure:
///   1) Returns exact trip count if it is known.
///   2) Returns expected trip count according to profile data if any.
///   3) Returns upper bound estimate if it is known.
///   4) Returns None if all of the above failed.
static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
  // Check if exact trip count is known.
  if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
    return ExpectedTC;

  // Check if there is an expected trip count available from profile data.
  if (LoopVectorizeWithBlockFrequency)
    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
      return EstimatedTC;

  // Check if upper bound estimate is known.
  if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
    return ExpectedTC;

  return None;
}

// Forward declare GeneratedRTChecks.
class GeneratedRTChecks;

namespace llvm {

AnalysisKey ShouldRunExtraVectorPasses::Key;

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found for a given vectorization factor.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                      ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
        PSI(PSI), RTChecks(RTChecks) {
    // Query this against the original loop and save it here because the
    // profile of the original loop header may change as the transformation
    // happens.
    OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
        OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
  }

  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop that will contain vectorized instructions later
  /// on, while the old loop will be used as the scalar remainder. Control flow
  /// is generated around the vectorized (and scalar epilogue) loops consisting
  /// of various checks and bypasses. Return the pre-header block of the new
  /// loop and the start value for the canonical induction, if it is != 0. The
  /// latter is the case when vectorizing the epilogue loop. In the case of
  /// epilogue vectorization, this function is overridden to handle the more
  /// complex control flow around the loops.
  virtual std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton();

  /// Widen a single call instruction within the innermost loop.
  void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
                            VPTransformState &State);

  /// Fix the vectorized code, taking care of header phis, live-outs, and more.
  void fixVectorizedLoop(VPTransformState &State);

  // Return true if any runtime check is added.
  bool areSafetyChecksAdded() { return AddedSafetyChecks; }

  /// A type for vectorized values in the new loop. Each value from the
  /// original loop, when vectorized, is represented by UF vector values in the
  /// new unrolled loop, where UF is the unroll factor.
  using VectorParts = SmallVector<Value *, 2>;

  /// Vectorize a single first-order recurrence or pointer induction PHINode in
  /// a block. This method handles the induction variable canonicalization. It
  /// supports both VF = 1 for unrolled loops and arbitrary length vectors.
  void widenPHIInstruction(Instruction *PN, VPWidenPHIRecipe *PhiR,
                           VPTransformState &State);

  /// A helper function to scalarize a single Instruction in the innermost
  /// loop. Generates a sequence of scalar instances for each lane between \p
  /// MinLane and \p MaxLane, times each part between \p MinPart and \p
  /// MaxPart, inclusive. Uses the VPValue operands from \p RepRecipe instead
  /// of \p Instr's operands.
  void scalarizeInstruction(Instruction *Instr, VPReplicateRecipe *RepRecipe,
                            const VPIteration &Instance, bool IfPredicateInstr,
                            VPTransformState &State);

  /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
  /// is provided, the integer induction variable will first be truncated to
  /// the corresponding type. \p CanonicalIV is the scalar value generated for
  /// the canonical induction variable.
  void widenIntOrFpInduction(PHINode *IV, VPWidenIntOrFpInductionRecipe *Def,
                             VPTransformState &State, Value *CanonicalIV);

  /// Construct the vector value of a scalarized value \p V one lane at a time.
  void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance,
                                 VPTransformState &State);

  /// Try to vectorize interleaved access group \p Group with the base address
  /// given in \p Addr, optionally masking the vector operations if \p
  /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
  /// values in the vectorized loop.
  void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
                                ArrayRef<VPValue *> VPDefs,
                                VPTransformState &State, VPValue *Addr,
                                ArrayRef<VPValue *> StoredValues,
                                VPValue *BlockInMask = nullptr);

  /// Set the debug location in the builder \p CustomBuilder using the debug
  /// location in \p V. If \p CustomBuilder is None then it uses the class
  /// member's Builder.
  void setDebugLocFromInst(const Value *V,
                           Optional<IRBuilder<> *> CustomBuilder = None);

  /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
  void fixNonInductionPHIs(VPTransformState &State);

  /// Returns true if the reordering of FP operations is not allowed, but we
  /// are able to vectorize with strict in-order reductions for the given
  /// RdxDesc.
  bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc);

  /// Create a broadcast instruction. This method generates a broadcast
  /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, ...
  /// this is needed because each iteration in the loop corresponds to a SIMD
  /// element.
  virtual Value *getBroadcastInstrs(Value *V);

  /// Add metadata from one instruction to another.
  ///
  /// This includes both the original MDs from \p From and additional ones
  /// (\see addNewMetadata). Use this for *newly created* instructions in the
  /// vector loop.
  void addMetadata(Instruction *To, Instruction *From);

  /// Similar to the previous function but it adds the metadata to a
  /// vector of instructions.
  void addMetadata(ArrayRef<Value *> To, Instruction *From);

protected:
  friend class LoopVectorizationPlanner;

  /// A small list of PHINodes.
  using PhiVector = SmallVector<PHINode *, 4>;

  /// A type for scalarized values in the new loop. Each value from the
  /// original loop, when scalarized, is represented by UF x VF scalar values
  /// in the new unrolled loop, where UF is the unroll factor and VF is the
  /// vectorization factor.
  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;

  /// Set up the values of the IVs correctly when exiting the vector loop.
  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                    Value *CountRoundDown, Value *EndValue,
                    BasicBlock *MiddleBlock);

  /// Introduce a conditional branch (on true, condition to be set later) at
  /// the end of the header=latch connecting it to itself (across the backedge)
  /// and to the exit block of \p L.
  void createHeaderBranch(Loop *L);

  /// Handle all cross-iteration phis in the header.
  void fixCrossIterationPHIs(VPTransformState &State);

  /// Create the exit value of first order recurrences in the middle block and
  /// update their users.
  void fixFirstOrderRecurrence(VPFirstOrderRecurrencePHIRecipe *PhiR,
                               VPTransformState &State);

  /// Create code for the loop exit value of the reduction.
  void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State);

  /// Clear NSW/NUW flags from reduction instructions if necessary.
  void clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc,
                               VPTransformState &State);

  /// Fixup the LCSSA phi nodes in the unique exit block. This simply
  /// means we need to add the appropriate incoming value from the middle
  /// block as exiting edges from the scalar epilogue loop (if present) are
  /// already in place, and we exit the vector loop exclusively to the middle
  /// block.
  void fixLCSSAPHIs(VPTransformState &State);

  /// Iteratively sink the scalarized operands of a predicated instruction into
  /// the block that was created for it.
  void sinkScalarOperands(Instruction *PredInst);

  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
  /// represented as.
  void truncateToMinimalBitwidths(VPTransformState &State);

  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
  /// variable on which to base the steps, \p Step is the size of the step, and
  /// \p EntryVal is the value from the original loop that maps to the steps.
  /// Note that \p EntryVal doesn't have to be an induction variable - it
  /// can also be a truncate instruction.
  void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
                        const InductionDescriptor &ID, VPValue *Def,
                        VPTransformState &State);

  /// Create a vector induction phi node based on an existing scalar one. \p
  /// EntryVal is the value from the original loop that maps to the vector phi
  /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
  /// truncate instruction, instead of widening the original IV, we widen a
  /// version of the IV truncated to \p EntryVal's type.
  void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
                                       Value *Step, Value *Start,
                                       Instruction *EntryVal, VPValue *Def,
                                       VPTransformState &State);

  /// Returns true if an instruction \p I should be scalarized instead of
  /// vectorized for the chosen vectorization factor.
  bool shouldScalarizeInstruction(Instruction *I) const;

  /// Returns true if we should generate a scalar version of \p IV.
  bool needsScalarInduction(Instruction *IV) const;

  /// Returns (and creates if needed) the original loop trip count.
  Value *getOrCreateTripCount(Loop *NewLoop);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(Loop *NewLoop);

  /// Returns a bitcasted value to the requested vector type.
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                const DataLayout &DL);

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct. Returns the block containing the checks or
  /// nullptr if no checks have been added.
  BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  /// Returns the block containing the checks or nullptr if no checks have been
  /// added.
  BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);

  /// Compute the transformed value of Index at offset StartValue using step
  /// StepValue.
  /// For integer induction, returns StartValue + Index * StepValue.
  /// For pointer induction, returns StartValue[Index * StepValue].
  /// FIXME: The newly created binary instructions should contain nsw/nuw
  /// flags, which can be found from the original scalar operations.
  Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
                              const DataLayout &DL,
                              const InductionDescriptor &ID,
                              BasicBlock *VectorHeader) const;

  /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
  /// vector loop preheader, middle block and scalar preheader. Also
  /// allocate a loop object for the new vector loop and return it.
  Loop *createVectorLoopSkeleton(StringRef Prefix);

  /// Create new phi nodes for the induction variables to resume iteration
  /// count in the scalar epilogue, from where the vectorized loop left off.
  /// In cases where the loop skeleton is more complicated (e.g. epilogue
  /// vectorization) and the resume values can come from an additional bypass
  /// block, the \p AdditionalBypass pair provides information about the bypass
  /// block and the end value on the edge from bypass to this loop.
  void createInductionResumeValues(
      Loop *L,
      std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});

  /// Complete the loop skeleton by adding debug MDs, creating appropriate
  /// conditional branches in the middle block, preparing the builder and
  /// running the verifier. Take in the vector loop \p L as argument, and
  /// return the preheader of the completed vector loop.
  BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);

  /// Add additional metadata to \p To that was not present on \p Orig.
  ///
  /// Currently this is used to add the noalias annotations based on the
  /// inserted memchecks. Use this for instructions that are *cloned* into the
  /// vector loop.
  void addNewMetadata(Instruction *To, const Instruction *Orig);

  /// Collect poison-generating recipes that may generate a poison value that
  /// is used after vectorization, even when their operands are not poison.
  /// Those recipes meet the following conditions:
  /// * Contribute to the address computation of a recipe generating a widen
  ///   memory load/store (VPWidenMemoryInstructionRecipe or
  ///   VPInterleaveRecipe).
  /// * Such a widen memory load/store has at least one underlying Instruction
  ///   that is in a basic block that needs predication and after vectorization
  ///   the generated instruction won't be predicated.
  void collectPoisonGeneratingRecipes(VPTransformState &State);

  /// Allow subclasses to override and print debug traces before/after vplan
  /// execution, when trace information is requested.
  virtual void printDebugTracesAtStart(){};
  virtual void printDebugTracesAtEnd(){};

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Alias Analysis.
  AAResults *AA;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// LoopVersioning. It's only set up (non-null) if memchecks were
  /// used.
  ///
  /// This is currently only used to add no-alias metadata based on the
  /// memchecks. The actual versioning is performed manually.
  std::unique_ptr<LoopVersioning> LVer;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  ElementCount VF;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use.
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;

  /// The unique ExitBlock of the scalar loop if one exists. Note that
  /// there can be multiple exiting edges reaching this block.
  BasicBlock *LoopExitBlock;

  /// The vector loop body.
  BasicBlock *LoopVectorBody;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
  // fixed up at the end of vector code generation.
  SmallVector<PHINode *, 8> OrigPHIsToFix;

  /// BFI and PSI are used to check for profile-guided size optimizations.
  BlockFrequencyInfo *BFI;
  ProfileSummaryInfo *PSI;

  // Whether this loop should be optimized for size based on profile-guided
  // size optimizations.
  bool OptForSizeBasedOnProfile;

  /// Structure to hold information about generated runtime checks, responsible
  /// for cleaning the checks, if vectorization turns out unprofitable.
  GeneratedRTChecks &RTChecks;
};

class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                    ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            ElementCount::getFixed(1), UnrollFactor, LVL, CM,
                            BFI, PSI, Check) {}

private:
  Value *getBroadcastInstrs(Value *V) override;
};

/// Encapsulate information regarding vectorization of a loop and its epilogue.
/// This information is meant to be updated and used across two stages of
/// epilogue vectorization.
struct EpilogueLoopVectorizationInfo {
  ElementCount MainLoopVF = ElementCount::getFixed(0);
  unsigned MainLoopUF = 0;
  ElementCount EpilogueVF = ElementCount::getFixed(0);
  unsigned EpilogueUF = 0;
  BasicBlock *MainLoopIterationCountCheck = nullptr;
  BasicBlock *EpilogueIterationCountCheck = nullptr;
  BasicBlock *SCEVSafetyCheck = nullptr;
  BasicBlock *MemSafetyCheck = nullptr;
  Value *TripCount = nullptr;
  Value *VectorTripCount = nullptr;

  EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF,
                                ElementCount EVF, unsigned EUF)
      : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) {
    assert(EUF == 1 &&
           "A high UF for the epilogue loop is likely not beneficial.");
  }
};

/// An extension of the inner loop vectorizer that creates a skeleton for a
/// vectorized loop that has its epilogue (residual) also vectorized.
/// The idea is to run the vplan on a given loop twice, firstly to set up the
/// skeleton and vectorize the main loop, and secondly to complete the skeleton
/// from the first step and vectorize the epilogue. This is achieved by
/// deriving two concrete strategy classes from this base class and invoking
/// them in succession from the loop vectorizer planner.
class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
public:
  InnerLoopAndEpilogueVectorizer(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI,
                            Checks),
        EPI(EPI) {}

  // Override this function to handle the more complex control flow around the
  // three loops.
  std::pair<BasicBlock *, Value *>
  createVectorizedLoopSkeleton() final override {
    return createEpilogueVectorizedLoopSkeleton();
  }

  /// The interface for creating a vectorized skeleton using one of two
  /// different strategies, each corresponding to one execution of the vplan
  /// as described above.
  virtual std::pair<BasicBlock *, Value *>
  createEpilogueVectorizedLoopSkeleton() = 0;

  /// Holds and updates state information required to vectorize the main loop
  /// and its epilogue in two separate passes. This setup helps us avoid
  /// regenerating and recomputing runtime safety checks. It also helps us to
  /// shorten the iteration-count-check path length for the cases where the
  /// iteration count of the loop is so small that the main vector loop is
  /// completely skipped.
  EpilogueLoopVectorizationInfo &EPI;
};

/// A specialized derived class of inner loop vectorizer that performs
/// vectorization of *main* loops in the process of vectorizing loops and their
/// epilogues.
class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerMainLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Check)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Check) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *main loop* strategy (i.e. the first pass of vplan execution).
  std::pair<BasicBlock *, Value *>
  createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check once for the main loop (when \p
  /// ForEpilogue is false) and once for the epilogue loop (when \p
  /// ForEpilogue is true).
  BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass,
                                             bool ForEpilogue);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};

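// Rough editorial summary of the two skeleton-creation passes implemented by
// the classes above and below (details live in the respective
// createEpilogueVectorizedLoopSkeleton overrides): the first pass emits the
// minimum-iteration-count checks and the main vector loop; the second pass
// emits a check on the remaining iteration count and the epilogue vector
// loop, with any leftover iterations handled by the scalar loop. The runtime
// SCEV and memory checks are created once and shared by both passes via
// EpilogueLoopVectorizationInfo.
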
// A specialized derived class of inner loop vectorizer that performs
// vectorization of *epilogue* loops in the process of vectorizing loops and
// their epilogues.
class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerEpilogueLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Checks) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *epilogue loop* strategy (i.e. the second pass of vplan execution).
  std::pair<BasicBlock *, Value *>
  createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check after the main vector loop has
  /// finished to see if there are any iterations left to execute by either
  /// the vector epilogue or the scalar epilogue.
  BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L,
                                                      BasicBlock *Bypass,
                                                      BasicBlock *Insert);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};
} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its
/// operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return I;

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I;

  for (Use &Op : I->operands()) {
    if (Instruction *OpInst = dyn_cast<Instruction>(Op))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }

  return I;
}

void InnerLoopVectorizer::setDebugLocFromInst(
    const Value *V, Optional<IRBuilder<> *> CustomBuilder) {
  IRBuilder<> *B = (CustomBuilder == None) ? &Builder : *CustomBuilder;
  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(V)) {
    const DILocation *DIL = Inst->getDebugLoc();

    // When an FSDiscriminator is enabled, we don't need to add the multiply
    // factors to the discriminators.
    if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
        !isa<DbgInfoIntrinsic>(Inst) && !EnableFSDiscriminator) {
      // FIXME: For scalable vectors, assume vscale=1.
      auto NewDIL =
          DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
      if (NewDIL)
        B->SetCurrentDebugLocation(NewDIL.getValue());
      else
        LLVM_DEBUG(dbgs()
                   << "Failed to create new discriminator: "
                   << DIL->getFilename() << " Line: " << DIL->getLine());
    } else
      B->SetCurrentDebugLocation(DIL);
  } else
    B->SetCurrentDebugLocation(DebugLoc());
}

/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
/// is passed, the message relates to that particular instruction.
#ifndef NDEBUG
static void debugVectorizationMessage(const StringRef Prefix,
                                      const StringRef DebugMsg,
                                      Instruction *I) {
  dbgs() << "LV: " << Prefix << DebugMsg;
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}
#endif

/// Create an analysis remark that explains why vectorization failed.
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
/// RemarkName is the identifier for the remark. If \p I is passed it is an
/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
/// the location of the remark. \return the remark object that can be
/// streamed to.
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
                                                   StringRef RemarkName,
                                                   Loop *TheLoop,
                                                   Instruction *I) {
  Value *CodeRegion = TheLoop->getHeader();
  DebugLoc DL = TheLoop->getStartLoc();

  if (I) {
    CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, fall back to
    // using the loop's.
    if (I->getDebugLoc())
      DL = I->getDebugLoc();
  }

  return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
}

namespace llvm {

/// Return a value for Step multiplied by VF.
Value *createStepForVF(IRBuilder<> &B, Type *Ty, ElementCount VF,
                       int64_t Step) {
  assert(Ty->isIntegerTy() && "Expected an integer step");
  Constant *StepVal = ConstantInt::get(Ty, Step * VF.getKnownMinValue());
  return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
}

/// Return the runtime value for VF.
Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF) {
  Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue());
  return VF.isScalable() ? B.CreateVScale(EC) : EC;
}

static Value *getRuntimeVFAsFloat(IRBuilder<> &B, Type *FTy, ElementCount VF) {
  assert(FTy->isFloatingPointTy() && "Expected floating point type!");
  Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits());
  Value *RuntimeVF = getRuntimeVF(B, IntTy, VF);
  return B.CreateUIToFP(RuntimeVF, FTy);
}

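// Example of the values produced by createStepForVF/getRuntimeVF above
// (editorial note): with Ty = i64 and Step = 2, createStepForVF returns the
// constant 8 for a fixed VF of 4 (<4 x i32>), and "8 * vscale" for a scalable
// VF of vscale x 4 (<vscale x 4 x i32>), where vscale is obtained from the
// llvm.vscale intrinsic at runtime.
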
void reportVectorizationFailure(const StringRef DebugMsg,
                                const StringRef OREMsg, const StringRef ORETag,
                                OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                                Instruction *I) {
  LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(
      createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
      << "loop not vectorized: " << OREMsg);
}

void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
                             OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                             Instruction *I) {
  LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(
      createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
      << Msg);
}

} // end namespace llvm

#ifndef NDEBUG
/// \return string containing a file name and a line # for the given loop.
static std::string getDebugLocString(const Loop *L) {
  std::string Result;
  if (L) {
    raw_string_ostream OS(Result);
    if (const DebugLoc LoopDbgLoc = L->getStartLoc())
      LoopDbgLoc.print(OS);
    else
      // Just print the module name.
      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
    OS.flush();
  }
  return Result;
}
#endif

void InnerLoopVectorizer::addNewMetadata(Instruction *To,
                                         const Instruction *Orig) {
  // If the loop was versioned with memchecks, add the corresponding no-alias
  // metadata.
  if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
    LVer->annotateInstWithNoAlias(To, Orig);
}

void InnerLoopVectorizer::collectPoisonGeneratingRecipes(
    VPTransformState &State) {

  // Collect recipes in the backward slice of `Root` that may generate a poison
  // value that is used after vectorization.
  SmallPtrSet<VPRecipeBase *, 16> Visited;
  auto collectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
    SmallVector<VPRecipeBase *, 16> Worklist;
    Worklist.push_back(Root);

    // Traverse the backward slice of Root through its use-def chain.
    while (!Worklist.empty()) {
      VPRecipeBase *CurRec = Worklist.back();
      Worklist.pop_back();

      if (!Visited.insert(CurRec).second)
        continue;

      // Prune search if we find another recipe generating a widen memory
      // instruction. Widen memory instructions involved in address computation
      // will lead to gather/scatter instructions, which don't need to be
      // handled.
      if (isa<VPWidenMemoryInstructionRecipe>(CurRec) ||
          isa<VPInterleaveRecipe>(CurRec) ||
          isa<VPCanonicalIVPHIRecipe>(CurRec))
        continue;

      // This recipe contributes to the address computation of a widen
      // load/store. Collect recipe if its underlying instruction has
      // poison-generating flags.
      Instruction *Instr = CurRec->getUnderlyingInstr();
      if (Instr && Instr->hasPoisonGeneratingFlags())
        State.MayGeneratePoisonRecipes.insert(CurRec);

      // Add new definitions to the worklist.
      for (VPValue *operand : CurRec->operands())
        if (VPDef *OpDef = operand->getDef())
          Worklist.push_back(cast<VPRecipeBase>(OpDef));
    }
  });

  // Traverse all the recipes in the VPlan and collect the poison-generating
  // recipes in the backward slice starting at the address of a VPWidenRecipe
  // or VPInterleaveRecipe.
  auto Iter = depth_first(
      VPBlockRecursiveTraversalWrapper<VPBlockBase *>(State.Plan->getEntry()));
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
    for (VPRecipeBase &Recipe : *VPBB) {
      if (auto *WidenRec = dyn_cast<VPWidenMemoryInstructionRecipe>(&Recipe)) {
        Instruction *UnderlyingInstr = WidenRec->getUnderlyingInstr();
        VPDef *AddrDef = WidenRec->getAddr()->getDef();
        if (AddrDef && WidenRec->isConsecutive() && UnderlyingInstr &&
            Legal->blockNeedsPredication(UnderlyingInstr->getParent()))
          collectPoisonGeneratingInstrsInBackwardSlice(
              cast<VPRecipeBase>(AddrDef));
      } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
        VPDef *AddrDef = InterleaveRec->getAddr()->getDef();
        if (AddrDef) {
          // Check if any member of the interleave group needs predication.
          const InterleaveGroup<Instruction> *InterGroup =
              InterleaveRec->getInterleaveGroup();
          bool NeedPredication = false;
          for (int I = 0, NumMembers = InterGroup->getNumMembers();
               I < NumMembers; ++I) {
            Instruction *Member = InterGroup->getMember(I);
            if (Member)
              NeedPredication |=
                  Legal->blockNeedsPredication(Member->getParent());
          }

          if (NeedPredication)
            collectPoisonGeneratingInstrsInBackwardSlice(
                cast<VPRecipeBase>(AddrDef));
        }
      }
    }
  }
}

void InnerLoopVectorizer::addMetadata(Instruction *To,
                                      Instruction *From) {
  propagateMetadata(To, From);
  addNewMetadata(To, From);
}

void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
                                      Instruction *From) {
  for (Value *V : To) {
    if (Instruction *I = dyn_cast<Instruction>(V))
      addMetadata(I, From);
  }
}

namespace llvm {

// Loop vectorization cost-model hints how the scalar epilogue loop should be
// lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorization with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate,

  // Directive indicating we must either tail fold or not vectorize.
  CM_ScalarEpilogueNotAllowedUsePredicate
};

/// ElementCountComparator creates a total ordering for ElementCount
/// for the purposes of using it in a set structure.
struct ElementCountComparator {
  bool operator()(const ElementCount &LHS, const ElementCount &RHS) const {
    return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) <
           std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue());
  }
};
using ElementCountSet = SmallSet<ElementCount, 16, ElementCountComparator>;

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen for a
/// number of reasons. In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
class LoopVectorizationCostModel {
public:
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {}

  /// \return An upper bound for the vectorization factors (both fixed and
  /// scalable). If the factors are 0, vectorization and interleaving should be
  /// avoided up front.
  FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
  bool runtimeChecksRequired();

  /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every VF in \p CandidateVFs. If UserVF is not ZERO
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
  VectorizationFactor
  selectVectorizationFactor(const ElementCountSet &CandidateVFs);

  VectorizationFactor
  selectEpilogueVectorizationFactor(const ElementCount MaxVF,
                                    const LoopVectorizationPlanner &LVP);

  /// Set up cost-based decisions for the user vectorization factor.
  /// \return true if the UserVF is a feasible VF to be chosen.
  bool selectUserVectorizationFactor(ElementCount UserVF) {
    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);
    return expectedCost(UserVF).first.isValid();
  }

  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// \return The desired interleave count.
  /// If interleave count has been specified by metadata it will be returned.
  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
  /// are the selected vectorization factor and the cost of the selected VF.
  unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);

  /// A memory access instruction may be vectorized in more than one way; the
  /// form it takes after vectorization depends on cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decision map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with the widening decision in order to
  /// avoid redundant calculations.
  void setCostBasedWideningDecision(ElementCount VF);

  /// A struct that represents some properties of the register usage
  /// of a loop.
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
    /// Holds the maximum number of concurrent live intervals in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
  };

  /// \return Returns information about the register usages of the loop for the
  /// given vectorization factors.
  SmallVector<RegisterUsage, 8>
  calculateRegisterUsage(ArrayRef<ElementCount> VFs);

  /// Collect values we want to ignore in the cost model.
  void collectValuesToIgnore();

  /// Collect all element types in the loop for which widening is needed.
  void collectElementTypesForWidening();

  /// Split reductions into those that happen in the loop, and those that
  /// happen outside. In-loop reductions are collected into
  /// InLoopReductionChains.
1350 void collectInLoopReductions(); 1351 1352 /// Returns true if we should use strict in-order reductions for the given 1353 /// RdxDesc. This is true if the -enable-strict-reductions flag is passed, 1354 /// the IsOrdered flag of RdxDesc is set and we do not allow reordering 1355 /// of FP operations. 1356 bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) { 1357 return !Hints->allowReordering() && RdxDesc.isOrdered(); 1358 } 1359 1360 /// \returns The smallest bitwidth each instruction can be represented with. 1361 /// The vector equivalents of these instructions should be truncated to this 1362 /// type. 1363 const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const { 1364 return MinBWs; 1365 } 1366 1367 /// \returns True if it is more profitable to scalarize instruction \p I for 1368 /// vectorization factor \p VF. 1369 bool isProfitableToScalarize(Instruction *I, ElementCount VF) const { 1370 assert(VF.isVector() && 1371 "Profitable to scalarize relevant only for VF > 1."); 1372 1373 // Cost model is not run in the VPlan-native path - return conservative 1374 // result until this changes. 1375 if (EnableVPlanNativePath) 1376 return false; 1377 1378 auto Scalars = InstsToScalarize.find(VF); 1379 assert(Scalars != InstsToScalarize.end() && 1380 "VF not yet analyzed for scalarization profitability"); 1381 return Scalars->second.find(I) != Scalars->second.end(); 1382 } 1383 1384 /// Returns true if \p I is known to be uniform after vectorization. 1385 bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const { 1386 if (VF.isScalar()) 1387 return true; 1388 1389 // Cost model is not run in the VPlan-native path - return conservative 1390 // result until this changes. 1391 if (EnableVPlanNativePath) 1392 return false; 1393 1394 auto UniformsPerVF = Uniforms.find(VF); 1395 assert(UniformsPerVF != Uniforms.end() && 1396 "VF not yet analyzed for uniformity"); 1397 return UniformsPerVF->second.count(I); 1398 } 1399 1400 /// Returns true if \p I is known to be scalar after vectorization. 1401 bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const { 1402 if (VF.isScalar()) 1403 return true; 1404 1405 // Cost model is not run in the VPlan-native path - return conservative 1406 // result until this changes. 1407 if (EnableVPlanNativePath) 1408 return false; 1409 1410 auto ScalarsPerVF = Scalars.find(VF); 1411 assert(ScalarsPerVF != Scalars.end() && 1412 "Scalar values are not calculated for VF"); 1413 return ScalarsPerVF->second.count(I); 1414 } 1415 1416 /// \returns True if instruction \p I can be truncated to a smaller bitwidth 1417 /// for vectorization factor \p VF. 1418 bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const { 1419 return VF.isVector() && MinBWs.find(I) != MinBWs.end() && 1420 !isProfitableToScalarize(I, VF) && 1421 !isScalarAfterVectorization(I, VF); 1422 } 1423 1424 /// Decision that was taken during cost calculation for memory instruction. 1425 enum InstWidening { 1426 CM_Unknown, 1427 CM_Widen, // For consecutive accesses with stride +1. 1428 CM_Widen_Reverse, // For consecutive accesses with stride -1. 1429 CM_Interleave, 1430 CM_GatherScatter, 1431 CM_Scalarize 1432 }; 1433 1434 /// Save vectorization decision \p W and \p Cost taken by the cost model for 1435 /// instruction \p I and vector width \p VF. 
  void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
                           InstructionCost Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
  }

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// interleaving group \p Grp and vector width \p VF.
  void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
                           ElementCount VF, InstWidening W,
                           InstructionCost Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    // Broadcast this decision to all instructions inside the group.
    // But the cost will be assigned to one instruction only.
    for (unsigned i = 0; i < Grp->getFactor(); ++i) {
      if (auto *I = Grp->getMember(i)) {
        if (Grp->getInsertPos() == I)
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
        else
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
      }
    }
  }

  /// Return the cost model decision for the given instruction \p I and vector
  /// width \p VF. Return CM_Unknown if this instruction did not pass
  /// through the cost modeling.
  InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
    assert(VF.isVector() && "Expected VF to be a vector VF");
    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return CM_GatherScatter;

    std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
    auto Itr = WideningDecisions.find(InstOnVF);
    if (Itr == WideningDecisions.end())
      return CM_Unknown;
    return Itr->second.first;
  }

  /// Return the vectorization cost for the given instruction \p I and vector
  /// width \p VF.
  InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
    assert(VF.isVector() && "Expected VF >=2");
    std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
    assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
           "The cost is not calculated");
    return WideningDecisions[InstOnVF].second;
  }

  /// Return True if instruction \p I is an optimizable truncate whose operand
  /// is an induction variable. Such a truncate will be removed by adding a new
  /// induction variable with the destination type.
  bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
    // If the instruction is not a truncate, return false.
    auto *Trunc = dyn_cast<TruncInst>(I);
    if (!Trunc)
      return false;

    // Get the source and destination types of the truncate.
    Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
    Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);

    // If the truncate is free for the given types, return false. Replacing a
    // free truncate with an induction variable would add an induction variable
    // update instruction to each iteration of the loop. We exclude from this
    // check the primary induction variable since it will need an update
    // instruction regardless.
    Value *Op = Trunc->getOperand(0);
    if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
      return false;

    // If the truncated value is not an induction variable, return false.
1510 return Legal->isInductionPhi(Op); 1511 } 1512 1513 /// Collects the instructions to scalarize for each predicated instruction in 1514 /// the loop. 1515 void collectInstsToScalarize(ElementCount VF); 1516 1517 /// Collect Uniform and Scalar values for the given \p VF. 1518 /// The sets depend on CM decision for Load/Store instructions 1519 /// that may be vectorized as interleave, gather-scatter or scalarized. 1520 void collectUniformsAndScalars(ElementCount VF) { 1521 // Do the analysis once. 1522 if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end()) 1523 return; 1524 setCostBasedWideningDecision(VF); 1525 collectLoopUniforms(VF); 1526 collectLoopScalars(VF); 1527 } 1528 1529 /// Returns true if the target machine supports masked store operation 1530 /// for the given \p DataType and kind of access to \p Ptr. 1531 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const { 1532 return Legal->isConsecutivePtr(DataType, Ptr) && 1533 TTI.isLegalMaskedStore(DataType, Alignment); 1534 } 1535 1536 /// Returns true if the target machine supports masked load operation 1537 /// for the given \p DataType and kind of access to \p Ptr. 1538 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const { 1539 return Legal->isConsecutivePtr(DataType, Ptr) && 1540 TTI.isLegalMaskedLoad(DataType, Alignment); 1541 } 1542 1543 /// Returns true if the target machine can represent \p V as a masked gather 1544 /// or scatter operation. 1545 bool isLegalGatherOrScatter(Value *V, 1546 ElementCount VF = ElementCount::getFixed(1)) { 1547 bool LI = isa<LoadInst>(V); 1548 bool SI = isa<StoreInst>(V); 1549 if (!LI && !SI) 1550 return false; 1551 auto *Ty = getLoadStoreType(V); 1552 Align Align = getLoadStoreAlignment(V); 1553 if (VF.isVector()) 1554 Ty = VectorType::get(Ty, VF); 1555 return (LI && TTI.isLegalMaskedGather(Ty, Align)) || 1556 (SI && TTI.isLegalMaskedScatter(Ty, Align)); 1557 } 1558 1559 /// Returns true if the target machine supports all of the reduction 1560 /// variables found for the given VF. 1561 bool canVectorizeReductions(ElementCount VF) const { 1562 return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 1563 const RecurrenceDescriptor &RdxDesc = Reduction.second; 1564 return TTI.isLegalToVectorizeReduction(RdxDesc, VF); 1565 })); 1566 } 1567 1568 /// Returns true if \p I is an instruction that will be scalarized with 1569 /// predication when vectorizing \p I with vectorization factor \p VF. Such 1570 /// instructions include conditional stores and instructions that may divide 1571 /// by zero. 1572 bool isScalarWithPredication(Instruction *I, ElementCount VF) const; 1573 1574 // Returns true if \p I is an instruction that will be predicated either 1575 // through scalar predication or masked load/store or masked gather/scatter. 1576 // \p VF is the vectorization factor that will be used to vectorize \p I. 1577 // Superset of instructions that return true for isScalarWithPredication. 1578 bool isPredicatedInst(Instruction *I, ElementCount VF, 1579 bool IsKnownUniform = false) { 1580 // When we know the load is uniform and the original scalar loop was not 1581 // predicated we don't need to mark it as a predicated instruction. Any 1582 // vectorised blocks created when tail-folding are something artificial we 1583 // have introduced and we know there is always at least one active lane. 1584 // That's why we call Legal->blockNeedsPredication here because it doesn't 1585 // query tail-folding. 
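    // E.g. a load that is known to be uniform and whose block did not need
    // predication in the original loop is not treated as predicated here,
    // even when the tail is folded by masking.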
1586 if (IsKnownUniform && isa<LoadInst>(I) && 1587 !Legal->blockNeedsPredication(I->getParent())) 1588 return false; 1589 if (!blockNeedsPredicationForAnyReason(I->getParent())) 1590 return false; 1591 // Loads and stores that need some form of masked operation are predicated 1592 // instructions. 1593 if (isa<LoadInst>(I) || isa<StoreInst>(I)) 1594 return Legal->isMaskRequired(I); 1595 return isScalarWithPredication(I, VF); 1596 } 1597 1598 /// Returns true if \p I is a memory instruction with consecutive memory 1599 /// access that can be widened. 1600 bool 1601 memoryInstructionCanBeWidened(Instruction *I, 1602 ElementCount VF = ElementCount::getFixed(1)); 1603 1604 /// Returns true if \p I is a memory instruction in an interleaved-group 1605 /// of memory accesses that can be vectorized with wide vector loads/stores 1606 /// and shuffles. 1607 bool 1608 interleavedAccessCanBeWidened(Instruction *I, 1609 ElementCount VF = ElementCount::getFixed(1)); 1610 1611 /// Check if \p Instr belongs to any interleaved access group. 1612 bool isAccessInterleaved(Instruction *Instr) { 1613 return InterleaveInfo.isInterleaved(Instr); 1614 } 1615 1616 /// Get the interleaved access group that \p Instr belongs to. 1617 const InterleaveGroup<Instruction> * 1618 getInterleavedAccessGroup(Instruction *Instr) { 1619 return InterleaveInfo.getInterleaveGroup(Instr); 1620 } 1621 1622 /// Returns true if we're required to use a scalar epilogue for at least 1623 /// the final iteration of the original loop. 1624 bool requiresScalarEpilogue(ElementCount VF) const { 1625 if (!isScalarEpilogueAllowed()) 1626 return false; 1627 // If we might exit from anywhere but the latch, must run the exiting 1628 // iteration in scalar form. 1629 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) 1630 return true; 1631 return VF.isVector() && InterleaveInfo.requiresScalarEpilogue(); 1632 } 1633 1634 /// Returns true if a scalar epilogue is not allowed due to optsize or a 1635 /// loop hint annotation. 1636 bool isScalarEpilogueAllowed() const { 1637 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed; 1638 } 1639 1640 /// Returns true if all loop blocks should be masked to fold tail loop. 1641 bool foldTailByMasking() const { return FoldTailByMasking; } 1642 1643 /// Returns true if the instructions in this block requires predication 1644 /// for any reason, e.g. because tail folding now requires a predicate 1645 /// or because the block in the original loop was predicated. 1646 bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const { 1647 return foldTailByMasking() || Legal->blockNeedsPredication(BB); 1648 } 1649 1650 /// A SmallMapVector to store the InLoop reduction op chains, mapping phi 1651 /// nodes to the chain of instructions representing the reductions. Uses a 1652 /// MapVector to ensure deterministic iteration order. 1653 using ReductionChainMap = 1654 SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>; 1655 1656 /// Return the chain of instructions representing an inloop reduction. 1657 const ReductionChainMap &getInLoopReductionChains() const { 1658 return InLoopReductionChains; 1659 } 1660 1661 /// Returns true if the Phi is part of an inloop reduction. 1662 bool isInLoopReduction(PHINode *Phi) const { 1663 return InLoopReductionChains.count(Phi); 1664 } 1665 1666 /// Estimate cost of an intrinsic call instruction CI if it were vectorized 1667 /// with factor VF. Return the cost of the instruction, including 1668 /// scalarization overhead if it's needed. 
  InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;

  /// Estimate cost of a call instruction CI if it were vectorized with factor
  /// VF. Return the cost of the instruction, including scalarization overhead
  /// if it's needed. The flag NeedToScalarize shows if the call needs to be
  /// scalarized, i.e. either a vector version isn't available or it is too
  /// expensive.
  InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF,
                                    bool &NeedToScalarize) const;

  /// Returns true if the per-lane cost of VectorizationFactor A is lower than
  /// that of B.
  bool isMoreProfitable(const VectorizationFactor &A,
                        const VectorizationFactor &B) const;

  /// Invalidates decisions already taken by the cost model.
  void invalidateCostModelingDecisions() {
    WideningDecisions.clear();
    Uniforms.clear();
    Scalars.clear();
  }

private:
  unsigned NumPredStores = 0;

  /// \return An upper bound for the vectorization factors for both
  /// fixed and scalable vectorization, where the minimum-known number of
  /// elements is a power-of-2 larger than zero. If scalable vectorization is
  /// disabled or unsupported, then the scalable part will be equal to
  /// ElementCount::getScalable(0).
  FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount,
                                           ElementCount UserVF,
                                           bool FoldTailByMasking);

  /// \return the maximized element count based on the target's vector
  /// registers and the loop trip-count, but limited to a maximum safe VF.
  /// This is a helper function of computeFeasibleMaxVF.
  /// FIXME: MaxSafeVF is currently passed by reference to avoid some obscure
  /// issue that occurred on one of the buildbots which cannot be reproduced
  /// without having access to the proprietary compiler (see comments on
  /// D98509). The issue is currently under investigation and this workaround
  /// will be removed as soon as possible.
  ElementCount getMaximizedVFForTarget(unsigned ConstTripCount,
                                       unsigned SmallestType,
                                       unsigned WidestType,
                                       const ElementCount &MaxSafeVF,
                                       bool FoldTailByMasking);

  /// \return the maximum legal scalable VF, based on the safe max number
  /// of elements.
  ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);

  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
  using VectorizationCostTy = std::pair<InstructionCost, bool>;

  /// Returns the expected execution cost. The unit of the cost does
  /// not matter because we use the 'cost' units to compare different
  /// vector widths. The cost that is returned is *not* normalized by
  /// the factor width. If \p Invalid is not nullptr, this function
  /// will add a pair(Instruction*, ElementCount) to \p Invalid for
  /// each instruction that has an Invalid cost for the given VF.
  using InstructionVFPair = std::pair<Instruction *, ElementCount>;
  VectorizationCostTy
  expectedCost(ElementCount VF,
               SmallVectorImpl<InstructionVFPair> *Invalid = nullptr);

  /// Returns the execution time cost of an instruction for a given vector
  /// width. Vector width of one means scalar.
  VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);

  /// The cost-computation logic from getInstructionCost which provides
  /// the vector type as an output parameter.
  InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
                                     Type *&VectorTy);

  /// Return the cost of instructions in an inloop reduction pattern, if I is
  /// part of that pattern.
  Optional<InstructionCost>
  getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
                          TTI::TargetCostKind CostKind);

  /// Calculate vectorization cost of memory instruction \p I.
  InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);

  /// The cost computation for a scalarized memory instruction.
  InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);

  /// The cost computation for an interleaving group of memory instructions.
  InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);

  /// The cost computation for a Gather/Scatter instruction.
  InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);

  /// The cost computation for widening instruction \p I with consecutive
  /// memory access.
  InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);

  /// The cost calculation for Load/Store instruction \p I with uniform
  /// pointer:
  /// Load: scalar load + broadcast.
  /// Store: scalar store + (loop invariant value stored? 0 : extract of last
  /// element).
  InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);

  /// Estimate the overhead of scalarizing an instruction. This is a
  /// convenience wrapper for the type-based getScalarizationOverhead API.
  InstructionCost getScalarizationOverhead(Instruction *I,
                                           ElementCount VF) const;

  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
  bool isConsecutiveLoadOrStore(Instruction *I);

  /// Returns true if an artificially high cost for emulated masked memrefs
  /// should be used.
  bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);

  /// Map of scalar integer values to the smallest bitwidth they can be legally
  /// represented as. The vector equivalents of these values should be truncated
  /// to this type.
  MapVector<Instruction *, uint64_t> MinBWs;

  /// A type representing the costs for instructions if they were to be
  /// scalarized rather than vectorized. The entries are Instruction-Cost
  /// pairs.
  using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;

  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as predicated blocks.
  SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;

  /// Records whether it is allowed to have the original scalar loop execute at
  /// least once.
This may be needed as a fallback loop in case runtime 1805 /// aliasing/dependence checks fail, or to handle the tail/remainder 1806 /// iterations when the trip count is unknown or doesn't divide by the VF, 1807 /// or as a peel-loop to handle gaps in interleave-groups. 1808 /// Under optsize and when the trip count is very small we don't allow any 1809 /// iterations to execute in the scalar loop. 1810 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 1811 1812 /// All blocks of loop are to be masked to fold tail of scalar iterations. 1813 bool FoldTailByMasking = false; 1814 1815 /// A map holding scalar costs for different vectorization factors. The 1816 /// presence of a cost for an instruction in the mapping indicates that the 1817 /// instruction will be scalarized when vectorizing with the associated 1818 /// vectorization factor. The entries are VF-ScalarCostTy pairs. 1819 DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize; 1820 1821 /// Holds the instructions known to be uniform after vectorization. 1822 /// The data is collected per VF. 1823 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms; 1824 1825 /// Holds the instructions known to be scalar after vectorization. 1826 /// The data is collected per VF. 1827 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars; 1828 1829 /// Holds the instructions (address computations) that are forced to be 1830 /// scalarized. 1831 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars; 1832 1833 /// PHINodes of the reductions that should be expanded in-loop along with 1834 /// their associated chains of reduction operations, in program order from top 1835 /// (PHI) to bottom 1836 ReductionChainMap InLoopReductionChains; 1837 1838 /// A Map of inloop reduction operations and their immediate chain operand. 1839 /// FIXME: This can be removed once reductions can be costed correctly in 1840 /// vplan. This was added to allow quick lookup to the inloop operations, 1841 /// without having to loop through InLoopReductionChains. 1842 DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains; 1843 1844 /// Returns the expected difference in cost from scalarizing the expression 1845 /// feeding a predicated instruction \p PredInst. The instructions to 1846 /// scalarize and their scalar costs are collected in \p ScalarCosts. A 1847 /// non-negative return value implies the expression will be scalarized. 1848 /// Currently, only single-use chains are considered for scalarization. 1849 int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts, 1850 ElementCount VF); 1851 1852 /// Collect the instructions that are uniform after vectorization. An 1853 /// instruction is uniform if we represent it with a single scalar value in 1854 /// the vectorized loop corresponding to each vector iteration. Examples of 1855 /// uniform instructions include pointer operands of consecutive or 1856 /// interleaved memory accesses. Note that although uniformity implies an 1857 /// instruction will be scalar, the reverse is not true. In general, a 1858 /// scalarized instruction will be represented by VF scalar values in the 1859 /// vectorized loop, each corresponding to an iteration of the original 1860 /// scalar loop. 1861 void collectLoopUniforms(ElementCount VF); 1862 1863 /// Collect the instructions that are scalar after vectorization. An 1864 /// instruction is scalar if it is known to be uniform or will be scalarized 1865 /// during vectorization. 
collectLoopScalars should only add non-uniform nodes 1866 /// to the list if they are used by a load/store instruction that is marked as 1867 /// CM_Scalarize. Non-uniform scalarized instructions will be represented by 1868 /// VF values in the vectorized loop, each corresponding to an iteration of 1869 /// the original scalar loop. 1870 void collectLoopScalars(ElementCount VF); 1871 1872 /// Keeps cost model vectorization decision and cost for instructions. 1873 /// Right now it is used for memory instructions only. 1874 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>, 1875 std::pair<InstWidening, InstructionCost>>; 1876 1877 DecisionList WideningDecisions; 1878 1879 /// Returns true if \p V is expected to be vectorized and it needs to be 1880 /// extracted. 1881 bool needsExtract(Value *V, ElementCount VF) const { 1882 Instruction *I = dyn_cast<Instruction>(V); 1883 if (VF.isScalar() || !I || !TheLoop->contains(I) || 1884 TheLoop->isLoopInvariant(I)) 1885 return false; 1886 1887 // Assume we can vectorize V (and hence we need extraction) if the 1888 // scalars are not computed yet. This can happen, because it is called 1889 // via getScalarizationOverhead from setCostBasedWideningDecision, before 1890 // the scalars are collected. That should be a safe assumption in most 1891 // cases, because we check if the operands have vectorizable types 1892 // beforehand in LoopVectorizationLegality. 1893 return Scalars.find(VF) == Scalars.end() || 1894 !isScalarAfterVectorization(I, VF); 1895 }; 1896 1897 /// Returns a range containing only operands needing to be extracted. 1898 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, 1899 ElementCount VF) const { 1900 return SmallVector<Value *, 4>(make_filter_range( 1901 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); 1902 } 1903 1904 /// Determines if we have the infrastructure to vectorize loop \p L and its 1905 /// epilogue, assuming the main loop is vectorized by \p VF. 1906 bool isCandidateForEpilogueVectorization(const Loop &L, 1907 const ElementCount VF) const; 1908 1909 /// Returns true if epilogue vectorization is considered profitable, and 1910 /// false otherwise. 1911 /// \p VF is the vectorization factor chosen for the original loop. 1912 bool isEpilogueVectorizationProfitable(const ElementCount VF) const; 1913 1914 public: 1915 /// The loop that we evaluate. 1916 Loop *TheLoop; 1917 1918 /// Predicated scalar evolution analysis. 1919 PredicatedScalarEvolution &PSE; 1920 1921 /// Loop Info analysis. 1922 LoopInfo *LI; 1923 1924 /// Vectorization legality. 1925 LoopVectorizationLegality *Legal; 1926 1927 /// Vector target information. 1928 const TargetTransformInfo &TTI; 1929 1930 /// Target Library Info. 1931 const TargetLibraryInfo *TLI; 1932 1933 /// Demanded bits analysis. 1934 DemandedBits *DB; 1935 1936 /// Assumption cache. 1937 AssumptionCache *AC; 1938 1939 /// Interface to emit optimization remarks. 1940 OptimizationRemarkEmitter *ORE; 1941 1942 const Function *TheFunction; 1943 1944 /// Loop Vectorize Hint. 1945 const LoopVectorizeHints *Hints; 1946 1947 /// The interleave access information contains groups of interleaved accesses 1948 /// with the same stride and close to each other. 1949 InterleavedAccessInfo &InterleaveInfo; 1950 1951 /// Values to ignore in the cost model. 1952 SmallPtrSet<const Value *, 16> ValuesToIgnore; 1953 1954 /// Values to ignore in the cost model when VF > 1. 
1955 SmallPtrSet<const Value *, 16> VecValuesToIgnore; 1956 1957 /// All element types found in the loop. 1958 SmallPtrSet<Type *, 16> ElementTypesInLoop; 1959 1960 /// Profitable vector factors. 1961 SmallVector<VectorizationFactor, 8> ProfitableVFs; 1962 }; 1963 } // end namespace llvm 1964 1965 /// Helper struct to manage generating runtime checks for vectorization. 1966 /// 1967 /// The runtime checks are created up-front in temporary blocks to allow better 1968 /// estimating the cost and un-linked from the existing IR. After deciding to 1969 /// vectorize, the checks are moved back. If deciding not to vectorize, the 1970 /// temporary blocks are completely removed. 1971 class GeneratedRTChecks { 1972 /// Basic block which contains the generated SCEV checks, if any. 1973 BasicBlock *SCEVCheckBlock = nullptr; 1974 1975 /// The value representing the result of the generated SCEV checks. If it is 1976 /// nullptr, either no SCEV checks have been generated or they have been used. 1977 Value *SCEVCheckCond = nullptr; 1978 1979 /// Basic block which contains the generated memory runtime checks, if any. 1980 BasicBlock *MemCheckBlock = nullptr; 1981 1982 /// The value representing the result of the generated memory runtime checks. 1983 /// If it is nullptr, either no memory runtime checks have been generated or 1984 /// they have been used. 1985 Value *MemRuntimeCheckCond = nullptr; 1986 1987 DominatorTree *DT; 1988 LoopInfo *LI; 1989 1990 SCEVExpander SCEVExp; 1991 SCEVExpander MemCheckExp; 1992 1993 public: 1994 GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI, 1995 const DataLayout &DL) 1996 : DT(DT), LI(LI), SCEVExp(SE, DL, "scev.check"), 1997 MemCheckExp(SE, DL, "scev.check") {} 1998 1999 /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can 2000 /// accurately estimate the cost of the runtime checks. The blocks are 2001 /// un-linked from the IR and is added back during vector code generation. If 2002 /// there is no vector code generation, the check blocks are removed 2003 /// completely. 2004 void Create(Loop *L, const LoopAccessInfo &LAI, 2005 const SCEVUnionPredicate &UnionPred) { 2006 2007 BasicBlock *LoopHeader = L->getHeader(); 2008 BasicBlock *Preheader = L->getLoopPreheader(); 2009 2010 // Use SplitBlock to create blocks for SCEV & memory runtime checks to 2011 // ensure the blocks are properly added to LoopInfo & DominatorTree. Those 2012 // may be used by SCEVExpander. The blocks will be un-linked from their 2013 // predecessors and removed from LI & DT at the end of the function. 2014 if (!UnionPred.isAlwaysTrue()) { 2015 SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI, 2016 nullptr, "vector.scevcheck"); 2017 2018 SCEVCheckCond = SCEVExp.expandCodeForPredicate( 2019 &UnionPred, SCEVCheckBlock->getTerminator()); 2020 } 2021 2022 const auto &RtPtrChecking = *LAI.getRuntimePointerChecking(); 2023 if (RtPtrChecking.Need) { 2024 auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader; 2025 MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr, 2026 "vector.memcheck"); 2027 2028 MemRuntimeCheckCond = 2029 addRuntimeChecks(MemCheckBlock->getTerminator(), L, 2030 RtPtrChecking.getChecks(), MemCheckExp); 2031 assert(MemRuntimeCheckCond && 2032 "no RT checks generated although RtPtrChecking " 2033 "claimed checks are required"); 2034 } 2035 2036 if (!MemCheckBlock && !SCEVCheckBlock) 2037 return; 2038 2039 // Unhook the temporary block with the checks, update various places 2040 // accordingly. 
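    // Concretely (matching the code below): branches into the check blocks are
    // redirected to the preheader, the check blocks' terminators are moved
    // back into the preheader and replaced with 'unreachable', and the blocks
    // are removed from the dominator tree and loop info. They are re-linked
    // later by emitSCEVChecks / emitMemRuntimeChecks if we vectorize.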
2041 if (SCEVCheckBlock) 2042 SCEVCheckBlock->replaceAllUsesWith(Preheader); 2043 if (MemCheckBlock) 2044 MemCheckBlock->replaceAllUsesWith(Preheader); 2045 2046 if (SCEVCheckBlock) { 2047 SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 2048 new UnreachableInst(Preheader->getContext(), SCEVCheckBlock); 2049 Preheader->getTerminator()->eraseFromParent(); 2050 } 2051 if (MemCheckBlock) { 2052 MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 2053 new UnreachableInst(Preheader->getContext(), MemCheckBlock); 2054 Preheader->getTerminator()->eraseFromParent(); 2055 } 2056 2057 DT->changeImmediateDominator(LoopHeader, Preheader); 2058 if (MemCheckBlock) { 2059 DT->eraseNode(MemCheckBlock); 2060 LI->removeBlock(MemCheckBlock); 2061 } 2062 if (SCEVCheckBlock) { 2063 DT->eraseNode(SCEVCheckBlock); 2064 LI->removeBlock(SCEVCheckBlock); 2065 } 2066 } 2067 2068 /// Remove the created SCEV & memory runtime check blocks & instructions, if 2069 /// unused. 2070 ~GeneratedRTChecks() { 2071 SCEVExpanderCleaner SCEVCleaner(SCEVExp); 2072 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp); 2073 if (!SCEVCheckCond) 2074 SCEVCleaner.markResultUsed(); 2075 2076 if (!MemRuntimeCheckCond) 2077 MemCheckCleaner.markResultUsed(); 2078 2079 if (MemRuntimeCheckCond) { 2080 auto &SE = *MemCheckExp.getSE(); 2081 // Memory runtime check generation creates compares that use expanded 2082 // values. Remove them before running the SCEVExpanderCleaners. 2083 for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) { 2084 if (MemCheckExp.isInsertedInstruction(&I)) 2085 continue; 2086 SE.forgetValue(&I); 2087 I.eraseFromParent(); 2088 } 2089 } 2090 MemCheckCleaner.cleanup(); 2091 SCEVCleaner.cleanup(); 2092 2093 if (SCEVCheckCond) 2094 SCEVCheckBlock->eraseFromParent(); 2095 if (MemRuntimeCheckCond) 2096 MemCheckBlock->eraseFromParent(); 2097 } 2098 2099 /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and 2100 /// adjusts the branches to branch to the vector preheader or \p Bypass, 2101 /// depending on the generated condition. 2102 BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass, 2103 BasicBlock *LoopVectorPreHeader, 2104 BasicBlock *LoopExitBlock) { 2105 if (!SCEVCheckCond) 2106 return nullptr; 2107 if (auto *C = dyn_cast<ConstantInt>(SCEVCheckCond)) 2108 if (C->isZero()) 2109 return nullptr; 2110 2111 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2112 2113 BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock); 2114 // Create new preheader for vector loop. 2115 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 2116 PL->addBasicBlockToLoop(SCEVCheckBlock, *LI); 2117 2118 SCEVCheckBlock->getTerminator()->eraseFromParent(); 2119 SCEVCheckBlock->moveBefore(LoopVectorPreHeader); 2120 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2121 SCEVCheckBlock); 2122 2123 DT->addNewBlock(SCEVCheckBlock, Pred); 2124 DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock); 2125 2126 ReplaceInstWithInst( 2127 SCEVCheckBlock->getTerminator(), 2128 BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheckCond)); 2129 // Mark the check as used, to prevent it from being removed during cleanup. 2130 SCEVCheckCond = nullptr; 2131 return SCEVCheckBlock; 2132 } 2133 2134 /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts 2135 /// the branches to branch to the vector preheader or \p Bypass, depending on 2136 /// the generated condition. 
2137 BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass, 2138 BasicBlock *LoopVectorPreHeader) { 2139 // Check if we generated code that checks in runtime if arrays overlap. 2140 if (!MemRuntimeCheckCond) 2141 return nullptr; 2142 2143 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2144 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2145 MemCheckBlock); 2146 2147 DT->addNewBlock(MemCheckBlock, Pred); 2148 DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock); 2149 MemCheckBlock->moveBefore(LoopVectorPreHeader); 2150 2151 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 2152 PL->addBasicBlockToLoop(MemCheckBlock, *LI); 2153 2154 ReplaceInstWithInst( 2155 MemCheckBlock->getTerminator(), 2156 BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond)); 2157 MemCheckBlock->getTerminator()->setDebugLoc( 2158 Pred->getTerminator()->getDebugLoc()); 2159 2160 // Mark the check as used, to prevent it from being removed during cleanup. 2161 MemRuntimeCheckCond = nullptr; 2162 return MemCheckBlock; 2163 } 2164 }; 2165 2166 // Return true if \p OuterLp is an outer loop annotated with hints for explicit 2167 // vectorization. The loop needs to be annotated with #pragma omp simd 2168 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the 2169 // vector length information is not provided, vectorization is not considered 2170 // explicit. Interleave hints are not allowed either. These limitations will be 2171 // relaxed in the future. 2172 // Please, note that we are currently forced to abuse the pragma 'clang 2173 // vectorize' semantics. This pragma provides *auto-vectorization hints* 2174 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' 2175 // provides *explicit vectorization hints* (LV can bypass legal checks and 2176 // assume that vectorization is legal). However, both hints are implemented 2177 // using the same metadata (llvm.loop.vectorize, processed by 2178 // LoopVectorizeHints). This will be fixed in the future when the native IR 2179 // representation for pragma 'omp simd' is introduced. 2180 static bool isExplicitVecOuterLoop(Loop *OuterLp, 2181 OptimizationRemarkEmitter *ORE) { 2182 assert(!OuterLp->isInnermost() && "This is not an outer loop"); 2183 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); 2184 2185 // Only outer loops with an explicit vectorization hint are supported. 2186 // Unannotated outer loops are ignored. 2187 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) 2188 return false; 2189 2190 Function *Fn = OuterLp->getHeader()->getParent(); 2191 if (!Hints.allowVectorization(Fn, OuterLp, 2192 true /*VectorizeOnlyWhenForced*/)) { 2193 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); 2194 return false; 2195 } 2196 2197 if (Hints.getInterleave() > 1) { 2198 // TODO: Interleave support is future work. 2199 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " 2200 "outer loops.\n"); 2201 Hints.emitRemarkWithHints(); 2202 return false; 2203 } 2204 2205 return true; 2206 } 2207 2208 static void collectSupportedLoops(Loop &L, LoopInfo *LI, 2209 OptimizationRemarkEmitter *ORE, 2210 SmallVectorImpl<Loop *> &V) { 2211 // Collect inner loops and outer loops without irreducible control flow. For 2212 // now, only collect outer loops that have explicit vectorization hints. If we 2213 // are stress testing the VPlan H-CFG construction, we collect the outermost 2214 // loop of every loop nest. 
2215 if (L.isInnermost() || VPlanBuildStressTest || 2216 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { 2217 LoopBlocksRPO RPOT(&L); 2218 RPOT.perform(LI); 2219 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { 2220 V.push_back(&L); 2221 // TODO: Collect inner loops inside marked outer loops in case 2222 // vectorization fails for the outer loop. Do not invoke 2223 // 'containsIrreducibleCFG' again for inner loops when the outer loop is 2224 // already known to be reducible. We can use an inherited attribute for 2225 // that. 2226 return; 2227 } 2228 } 2229 for (Loop *InnerL : L) 2230 collectSupportedLoops(*InnerL, LI, ORE, V); 2231 } 2232 2233 namespace { 2234 2235 /// The LoopVectorize Pass. 2236 struct LoopVectorize : public FunctionPass { 2237 /// Pass identification, replacement for typeid 2238 static char ID; 2239 2240 LoopVectorizePass Impl; 2241 2242 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false, 2243 bool VectorizeOnlyWhenForced = false) 2244 : FunctionPass(ID), 2245 Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) { 2246 initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); 2247 } 2248 2249 bool runOnFunction(Function &F) override { 2250 if (skipFunction(F)) 2251 return false; 2252 2253 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 2254 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 2255 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 2256 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 2257 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); 2258 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); 2259 auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr; 2260 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); 2261 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 2262 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); 2263 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); 2264 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 2265 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 2266 2267 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 2268 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; 2269 2270 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC, 2271 GetLAA, *ORE, PSI).MadeAnyChange; 2272 } 2273 2274 void getAnalysisUsage(AnalysisUsage &AU) const override { 2275 AU.addRequired<AssumptionCacheTracker>(); 2276 AU.addRequired<BlockFrequencyInfoWrapperPass>(); 2277 AU.addRequired<DominatorTreeWrapperPass>(); 2278 AU.addRequired<LoopInfoWrapperPass>(); 2279 AU.addRequired<ScalarEvolutionWrapperPass>(); 2280 AU.addRequired<TargetTransformInfoWrapperPass>(); 2281 AU.addRequired<AAResultsWrapperPass>(); 2282 AU.addRequired<LoopAccessLegacyAnalysis>(); 2283 AU.addRequired<DemandedBitsWrapperPass>(); 2284 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 2285 AU.addRequired<InjectTLIMappingsLegacy>(); 2286 2287 // We currently do not preserve loopinfo/dominator analyses with outer loop 2288 // vectorization. Until this is addressed, mark these analyses as preserved 2289 // only for non-VPlan-native path. 2290 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 
2291 if (!EnableVPlanNativePath) { 2292 AU.addPreserved<LoopInfoWrapperPass>(); 2293 AU.addPreserved<DominatorTreeWrapperPass>(); 2294 } 2295 2296 AU.addPreserved<BasicAAWrapperPass>(); 2297 AU.addPreserved<GlobalsAAWrapperPass>(); 2298 AU.addRequired<ProfileSummaryInfoWrapperPass>(); 2299 } 2300 }; 2301 2302 } // end anonymous namespace 2303 2304 //===----------------------------------------------------------------------===// 2305 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 2306 // LoopVectorizationCostModel and LoopVectorizationPlanner. 2307 //===----------------------------------------------------------------------===// 2308 2309 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { 2310 // We need to place the broadcast of invariant variables outside the loop, 2311 // but only if it's proven safe to do so. Else, broadcast will be inside 2312 // vector loop body. 2313 Instruction *Instr = dyn_cast<Instruction>(V); 2314 bool SafeToHoist = OrigLoop->isLoopInvariant(V) && 2315 (!Instr || 2316 DT->dominates(Instr->getParent(), LoopVectorPreHeader)); 2317 // Place the code for broadcasting invariant variables in the new preheader. 2318 IRBuilder<>::InsertPointGuard Guard(Builder); 2319 if (SafeToHoist) 2320 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2321 2322 // Broadcast the scalar into all locations in the vector. 2323 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); 2324 2325 return Shuf; 2326 } 2327 2328 /// This function adds 2329 /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...) 2330 /// to each vector element of Val. The sequence starts at StartIndex. 2331 /// \p Opcode is relevant for FP induction variable. 2332 static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step, 2333 Instruction::BinaryOps BinOp, ElementCount VF, 2334 IRBuilder<> &Builder) { 2335 assert(VF.isVector() && "only vector VFs are supported"); 2336 2337 // Create and check the types. 2338 auto *ValVTy = cast<VectorType>(Val->getType()); 2339 ElementCount VLen = ValVTy->getElementCount(); 2340 2341 Type *STy = Val->getType()->getScalarType(); 2342 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && 2343 "Induction Step must be an integer or FP"); 2344 assert(Step->getType() == STy && "Step has wrong type"); 2345 2346 SmallVector<Constant *, 8> Indices; 2347 2348 // Create a vector of consecutive numbers from zero to VF. 2349 VectorType *InitVecValVTy = ValVTy; 2350 Type *InitVecValSTy = STy; 2351 if (STy->isFloatingPointTy()) { 2352 InitVecValSTy = 2353 IntegerType::get(STy->getContext(), STy->getScalarSizeInBits()); 2354 InitVecValVTy = VectorType::get(InitVecValSTy, VLen); 2355 } 2356 Value *InitVec = Builder.CreateStepVector(InitVecValVTy); 2357 2358 // Splat the StartIdx 2359 Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx); 2360 2361 if (STy->isIntegerTy()) { 2362 InitVec = Builder.CreateAdd(InitVec, StartIdxSplat); 2363 Step = Builder.CreateVectorSplat(VLen, Step); 2364 assert(Step->getType() == Val->getType() && "Invalid step vec"); 2365 // FIXME: The newly created binary instructions should contain nsw/nuw 2366 // flags, which can be found from the original scalar operations. 2367 Step = Builder.CreateMul(InitVec, Step); 2368 return Builder.CreateAdd(Val, Step, "induction"); 2369 } 2370 2371 // Floating point induction. 
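  // E.g. (assumed values) with VF = 4, StartIdx = 0 and Step = 2.0, InitVec
  // becomes <0.0, 1.0, 2.0, 3.0>, MulOp is <0.0, 2.0, 4.0, 6.0>, and each
  // lane of Val is combined with it via FAdd (or FSub).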
2372 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && 2373 "Binary Opcode should be specified for FP induction"); 2374 InitVec = Builder.CreateUIToFP(InitVec, ValVTy); 2375 InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat); 2376 2377 Step = Builder.CreateVectorSplat(VLen, Step); 2378 Value *MulOp = Builder.CreateFMul(InitVec, Step); 2379 return Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); 2380 } 2381 2382 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( 2383 const InductionDescriptor &II, Value *Step, Value *Start, 2384 Instruction *EntryVal, VPValue *Def, VPTransformState &State) { 2385 IRBuilder<> &Builder = State.Builder; 2386 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 2387 "Expected either an induction phi-node or a truncate of it!"); 2388 2389 // Construct the initial value of the vector IV in the vector loop preheader 2390 auto CurrIP = Builder.saveIP(); 2391 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2392 if (isa<TruncInst>(EntryVal)) { 2393 assert(Start->getType()->isIntegerTy() && 2394 "Truncation requires an integer type"); 2395 auto *TruncType = cast<IntegerType>(EntryVal->getType()); 2396 Step = Builder.CreateTrunc(Step, TruncType); 2397 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); 2398 } 2399 2400 Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0); 2401 Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start); 2402 Value *SteppedStart = getStepVector( 2403 SplatStart, Zero, Step, II.getInductionOpcode(), State.VF, State.Builder); 2404 2405 // We create vector phi nodes for both integer and floating-point induction 2406 // variables. Here, we determine the kind of arithmetic we will perform. 2407 Instruction::BinaryOps AddOp; 2408 Instruction::BinaryOps MulOp; 2409 if (Step->getType()->isIntegerTy()) { 2410 AddOp = Instruction::Add; 2411 MulOp = Instruction::Mul; 2412 } else { 2413 AddOp = II.getInductionOpcode(); 2414 MulOp = Instruction::FMul; 2415 } 2416 2417 // Multiply the vectorization factor by the step using integer or 2418 // floating-point arithmetic as appropriate. 2419 Type *StepType = Step->getType(); 2420 Value *RuntimeVF; 2421 if (Step->getType()->isFloatingPointTy()) 2422 RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF); 2423 else 2424 RuntimeVF = getRuntimeVF(Builder, StepType, State.VF); 2425 Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF); 2426 2427 // Create a vector splat to use in the induction update. 2428 // 2429 // FIXME: If the step is non-constant, we create the vector splat with 2430 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't 2431 // handle a constant vector splat. 2432 Value *SplatVF = isa<Constant>(Mul) 2433 ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul)) 2434 : Builder.CreateVectorSplat(State.VF, Mul); 2435 Builder.restoreIP(CurrIP); 2436 2437 // We may need to add the step a number of times, depending on the unroll 2438 // factor. The last of those goes into the PHI. 
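  // E.g. (assumed values) with UF = 2, part 0 uses the phi value
  // <i, i+S, .., i+(VF-1)*S>, part 1 uses that value plus VF*S, and the value
  // plus 2*VF*S feeds back into the phi as "vec.ind.next".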
2439 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", 2440 &*LoopVectorBody->getFirstInsertionPt()); 2441 VecInd->setDebugLoc(EntryVal->getDebugLoc()); 2442 Instruction *LastInduction = VecInd; 2443 for (unsigned Part = 0; Part < UF; ++Part) { 2444 State.set(Def, LastInduction, Part); 2445 2446 if (isa<TruncInst>(EntryVal)) 2447 addMetadata(LastInduction, EntryVal); 2448 2449 LastInduction = cast<Instruction>( 2450 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")); 2451 LastInduction->setDebugLoc(EntryVal->getDebugLoc()); 2452 } 2453 2454 // Move the last step to the end of the latch block. This ensures consistent 2455 // placement of all induction updates. 2456 auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 2457 auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator()); 2458 LastInduction->moveBefore(Br); 2459 LastInduction->setName("vec.ind.next"); 2460 2461 VecInd->addIncoming(SteppedStart, LoopVectorPreHeader); 2462 VecInd->addIncoming(LastInduction, LoopVectorLatch); 2463 } 2464 2465 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const { 2466 return Cost->isScalarAfterVectorization(I, VF) || 2467 Cost->isProfitableToScalarize(I, VF); 2468 } 2469 2470 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const { 2471 if (shouldScalarizeInstruction(IV)) 2472 return true; 2473 auto isScalarInst = [&](User *U) -> bool { 2474 auto *I = cast<Instruction>(U); 2475 return (OrigLoop->contains(I) && shouldScalarizeInstruction(I)); 2476 }; 2477 return llvm::any_of(IV->users(), isScalarInst); 2478 } 2479 2480 void InnerLoopVectorizer::widenIntOrFpInduction( 2481 PHINode *IV, VPWidenIntOrFpInductionRecipe *Def, VPTransformState &State, 2482 Value *CanonicalIV) { 2483 Value *Start = Def->getStartValue()->getLiveInIRValue(); 2484 const InductionDescriptor &ID = Def->getInductionDescriptor(); 2485 TruncInst *Trunc = Def->getTruncInst(); 2486 IRBuilder<> &Builder = State.Builder; 2487 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); 2488 assert(!State.VF.isZero() && "VF must be non-zero"); 2489 2490 // The value from the original loop to which we are mapping the new induction 2491 // variable. 2492 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; 2493 2494 auto &DL = EntryVal->getModule()->getDataLayout(); 2495 2496 // Generate code for the induction step. Note that induction steps are 2497 // required to be loop-invariant 2498 auto CreateStepValue = [&](const SCEV *Step) -> Value * { 2499 assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) && 2500 "Induction step should be loop invariant"); 2501 if (PSE.getSE()->isSCEVable(IV->getType())) { 2502 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 2503 return Exp.expandCodeFor(Step, Step->getType(), 2504 State.CFG.VectorPreHeader->getTerminator()); 2505 } 2506 return cast<SCEVUnknown>(Step)->getValue(); 2507 }; 2508 2509 // The scalar value to broadcast. This is derived from the canonical 2510 // induction variable. If a truncation type is given, truncate the canonical 2511 // induction variable and step. Otherwise, derive these values from the 2512 // induction descriptor. 2513 auto CreateScalarIV = [&](Value *&Step) -> Value * { 2514 Value *ScalarIV = CanonicalIV; 2515 Type *NeededType = IV->getType(); 2516 if (!Def->isCanonical() || ScalarIV->getType() != NeededType) { 2517 ScalarIV = 2518 NeededType->isIntegerTy() 2519 ? 
Builder.CreateSExtOrTrunc(ScalarIV, NeededType) 2520 : Builder.CreateCast(Instruction::SIToFP, ScalarIV, NeededType); 2521 ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID, 2522 State.CFG.PrevBB); 2523 ScalarIV->setName("offset.idx"); 2524 } 2525 if (Trunc) { 2526 auto *TruncType = cast<IntegerType>(Trunc->getType()); 2527 assert(Step->getType()->isIntegerTy() && 2528 "Truncation requires an integer step"); 2529 ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType); 2530 Step = Builder.CreateTrunc(Step, TruncType); 2531 } 2532 return ScalarIV; 2533 }; 2534 2535 // Create the vector values from the scalar IV, in the absence of creating a 2536 // vector IV. 2537 auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) { 2538 Value *Broadcasted = getBroadcastInstrs(ScalarIV); 2539 for (unsigned Part = 0; Part < UF; ++Part) { 2540 Value *StartIdx; 2541 if (Step->getType()->isFloatingPointTy()) 2542 StartIdx = 2543 getRuntimeVFAsFloat(Builder, Step->getType(), State.VF * Part); 2544 else 2545 StartIdx = getRuntimeVF(Builder, Step->getType(), State.VF * Part); 2546 2547 Value *EntryPart = 2548 getStepVector(Broadcasted, StartIdx, Step, ID.getInductionOpcode(), 2549 State.VF, State.Builder); 2550 State.set(Def, EntryPart, Part); 2551 if (Trunc) 2552 addMetadata(EntryPart, Trunc); 2553 } 2554 }; 2555 2556 // Fast-math-flags propagate from the original induction instruction. 2557 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 2558 if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp())) 2559 Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags()); 2560 2561 // Now do the actual transformations, and start with creating the step value. 2562 Value *Step = CreateStepValue(ID.getStep()); 2563 if (State.VF.isScalar()) { 2564 Value *ScalarIV = CreateScalarIV(Step); 2565 Type *ScalarTy = IntegerType::get(ScalarIV->getContext(), 2566 Step->getType()->getScalarSizeInBits()); 2567 2568 Instruction::BinaryOps IncOp = ID.getInductionOpcode(); 2569 if (IncOp == Instruction::BinaryOpsEnd) 2570 IncOp = Instruction::Add; 2571 for (unsigned Part = 0; Part < UF; ++Part) { 2572 Value *StartIdx = ConstantInt::get(ScalarTy, Part); 2573 Instruction::BinaryOps MulOp = Instruction::Mul; 2574 if (Step->getType()->isFloatingPointTy()) { 2575 StartIdx = Builder.CreateUIToFP(StartIdx, Step->getType()); 2576 MulOp = Instruction::FMul; 2577 } 2578 2579 Value *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step); 2580 Value *EntryPart = Builder.CreateBinOp(IncOp, ScalarIV, Mul, "induction"); 2581 State.set(Def, EntryPart, Part); 2582 if (Trunc) { 2583 assert(!Step->getType()->isFloatingPointTy() && 2584 "fp inductions shouldn't be truncated"); 2585 addMetadata(EntryPart, Trunc); 2586 } 2587 } 2588 return; 2589 } 2590 2591 // Determine if we want a scalar version of the induction variable. This is 2592 // true if the induction variable itself is not widened, or if it has at 2593 // least one user in the loop that is not widened. 2594 auto NeedsScalarIV = needsScalarInduction(EntryVal); 2595 if (!NeedsScalarIV) { 2596 createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, State); 2597 return; 2598 } 2599 2600 // Try to create a new independent vector induction variable. If we can't 2601 // create the phi node, we will splat the scalar induction variable in each 2602 // loop iteration. 
2603 if (!shouldScalarizeInstruction(EntryVal)) { 2604 createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, State); 2605 Value *ScalarIV = CreateScalarIV(Step); 2606 // Create scalar steps that can be used by instructions we will later 2607 // scalarize. Note that the addition of the scalar steps will not increase 2608 // the number of instructions in the loop in the common case prior to 2609 // InstCombine. We will be trading one vector extract for each scalar step. 2610 buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, State); 2611 return; 2612 } 2613 2614 // All IV users are scalar instructions, so only emit a scalar IV, not a 2615 // vectorised IV. Except when we tail-fold, then the splat IV feeds the 2616 // predicate used by the masked loads/stores. 2617 Value *ScalarIV = CreateScalarIV(Step); 2618 if (!Cost->isScalarEpilogueAllowed()) 2619 CreateSplatIV(ScalarIV, Step); 2620 buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, State); 2621 } 2622 2623 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, 2624 Instruction *EntryVal, 2625 const InductionDescriptor &ID, 2626 VPValue *Def, 2627 VPTransformState &State) { 2628 IRBuilder<> &Builder = State.Builder; 2629 // We shouldn't have to build scalar steps if we aren't vectorizing. 2630 assert(State.VF.isVector() && "VF should be greater than one"); 2631 // Get the value type and ensure it and the step have the same integer type. 2632 Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); 2633 assert(ScalarIVTy == Step->getType() && 2634 "Val and Step should have the same type"); 2635 2636 // We build scalar steps for both integer and floating-point induction 2637 // variables. Here, we determine the kind of arithmetic we will perform. 2638 Instruction::BinaryOps AddOp; 2639 Instruction::BinaryOps MulOp; 2640 if (ScalarIVTy->isIntegerTy()) { 2641 AddOp = Instruction::Add; 2642 MulOp = Instruction::Mul; 2643 } else { 2644 AddOp = ID.getInductionOpcode(); 2645 MulOp = Instruction::FMul; 2646 } 2647 2648 // Determine the number of scalars we need to generate for each unroll 2649 // iteration. If EntryVal is uniform, we only need to generate the first 2650 // lane. Otherwise, we generate all VF values. 2651 bool IsUniform = 2652 Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), State.VF); 2653 unsigned Lanes = IsUniform ? 1 : State.VF.getKnownMinValue(); 2654 // Compute the scalar steps and save the results in State. 
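  // E.g. (assumed values) for a fixed VF = 4, UF = 2 and Step = 1, part 0
  // produces the lane values {IV, IV+1, IV+2, IV+3} and part 1 produces
  // {IV+4, .., IV+7}, where IV is the scalar induction value for this vector
  // iteration.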
2655 Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(), 2656 ScalarIVTy->getScalarSizeInBits()); 2657 Type *VecIVTy = nullptr; 2658 Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr; 2659 if (!IsUniform && State.VF.isScalable()) { 2660 VecIVTy = VectorType::get(ScalarIVTy, State.VF); 2661 UnitStepVec = 2662 Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF)); 2663 SplatStep = Builder.CreateVectorSplat(State.VF, Step); 2664 SplatIV = Builder.CreateVectorSplat(State.VF, ScalarIV); 2665 } 2666 2667 for (unsigned Part = 0; Part < State.UF; ++Part) { 2668 Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part); 2669 2670 if (!IsUniform && State.VF.isScalable()) { 2671 auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0); 2672 auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec); 2673 if (ScalarIVTy->isFloatingPointTy()) 2674 InitVec = Builder.CreateSIToFP(InitVec, VecIVTy); 2675 auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep); 2676 auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul); 2677 State.set(Def, Add, Part); 2678 // It's useful to record the lane values too for the known minimum number 2679 // of elements so we do those below. This improves the code quality when 2680 // trying to extract the first element, for example. 2681 } 2682 2683 if (ScalarIVTy->isFloatingPointTy()) 2684 StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy); 2685 2686 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 2687 Value *StartIdx = Builder.CreateBinOp( 2688 AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane)); 2689 // The step returned by `createStepForVF` is a runtime-evaluated value 2690 // when VF is scalable. Otherwise, it should be folded into a Constant. 2691 assert((State.VF.isScalable() || isa<Constant>(StartIdx)) && 2692 "Expected StartIdx to be folded to a constant when VF is not " 2693 "scalable"); 2694 auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step); 2695 auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul); 2696 State.set(Def, Add, VPIteration(Part, Lane)); 2697 } 2698 } 2699 } 2700 2701 void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def, 2702 const VPIteration &Instance, 2703 VPTransformState &State) { 2704 Value *ScalarInst = State.get(Def, Instance); 2705 Value *VectorValue = State.get(Def, Instance.Part); 2706 VectorValue = Builder.CreateInsertElement( 2707 VectorValue, ScalarInst, 2708 Instance.Lane.getAsRuntimeExpr(State.Builder, VF)); 2709 State.set(Def, VectorValue, Instance.Part); 2710 } 2711 2712 // Return whether we allow using masked interleave-groups (for dealing with 2713 // strided loads/stores that reside in predicated blocks, or for dealing 2714 // with gaps). 2715 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { 2716 // If an override option has been passed in for interleaved accesses, use it. 2717 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) 2718 return EnableMaskedInterleavedMemAccesses; 2719 2720 return TTI.enableMaskedInterleavedAccessVectorization(); 2721 } 2722 2723 // Try to vectorize the interleave group that \p Instr belongs to. 2724 // 2725 // E.g. Translate following interleaved load group (factor = 3): 2726 // for (i = 0; i < N; i+=3) { 2727 // R = Pic[i]; // Member of index 0 2728 // G = Pic[i+1]; // Member of index 1 2729 // B = Pic[i+2]; // Member of index 2 2730 // ... 
// do something to R, G, B 2731 // } 2732 // To: 2733 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2734 // %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements 2735 // %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements 2736 // %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements 2737 // 2738 // Or translate following interleaved store group (factor = 3): 2739 // for (i = 0; i < N; i+=3) { 2740 // ... do something to R, G, B 2741 // Pic[i] = R; // Member of index 0 2742 // Pic[i+1] = G; // Member of index 1 2743 // Pic[i+2] = B; // Member of index 2 2744 // } 2745 // To: 2746 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> 2747 // %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u> 2748 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, 2749 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements 2750 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B 2751 void InnerLoopVectorizer::vectorizeInterleaveGroup( 2752 const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs, 2753 VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues, 2754 VPValue *BlockInMask) { 2755 Instruction *Instr = Group->getInsertPos(); 2756 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2757 2758 // Prepare for the vector type of the interleaved load/store. 2759 Type *ScalarTy = getLoadStoreType(Instr); 2760 unsigned InterleaveFactor = Group->getFactor(); 2761 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2762 auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor); 2763 2764 // Prepare for the new pointers. 2765 SmallVector<Value *, 2> AddrParts; 2766 unsigned Index = Group->getIndex(Instr); 2767 2768 // TODO: extend the masked interleaved-group support to reversed access. 2769 assert((!BlockInMask || !Group->isReverse()) && 2770 "Reversed masked interleave-group not supported."); 2771 2772 // If the group is reverse, adjust the index to refer to the last vector lane 2773 // instead of the first. We adjust the index from the first vector lane, 2774 // rather than directly getting the pointer for lane VF - 1, because the 2775 // pointer operand of the interleaved access is supposed to be uniform. For 2776 // uniform instructions, we're only required to generate a value for the 2777 // first vector lane in each unroll iteration. 2778 if (Group->isReverse()) 2779 Index += (VF.getKnownMinValue() - 1) * Group->getFactor(); 2780 2781 for (unsigned Part = 0; Part < UF; Part++) { 2782 Value *AddrPart = State.get(Addr, VPIteration(Part, 0)); 2783 setDebugLocFromInst(AddrPart); 2784 2785 // Notice current instruction could be any index. Need to adjust the address 2786 // to the member of index 0. 2787 // 2788 // E.g. a = A[i+1]; // Member of index 1 (Current instruction) 2789 // b = A[i]; // Member of index 0 2790 // Current pointer is pointed to A[i+1], adjust it to A[i]. 2791 // 2792 // E.g. A[i+1] = a; // Member of index 1 2793 // A[i] = b; // Member of index 0 2794 // A[i+2] = c; // Member of index 2 (Current instruction) 2795 // Current pointer is pointed to A[i+2], adjust it to A[i]. 2796 2797 bool InBounds = false; 2798 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts())) 2799 InBounds = gep->isInBounds(); 2800 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index)); 2801 cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds); 2802 2803 // Cast to the vector pointer type. 
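// For illustration (assumed factor 3, VF 4, i32 members; not from the
// original): each part accesses the group through a single <12 x i32>*, so
// the member-0 pointer computed above is bitcast to that wide vector pointer
// type below.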
2804 unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace(); 2805 Type *PtrTy = VecTy->getPointerTo(AddressSpace); 2806 AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy)); 2807 } 2808 2809 setDebugLocFromInst(Instr); 2810 Value *PoisonVec = PoisonValue::get(VecTy); 2811 2812 Value *MaskForGaps = nullptr; 2813 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) { 2814 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); 2815 assert(MaskForGaps && "Mask for Gaps is required but it is null"); 2816 } 2817 2818 // Vectorize the interleaved load group. 2819 if (isa<LoadInst>(Instr)) { 2820 // For each unroll part, create a wide load for the group. 2821 SmallVector<Value *, 2> NewLoads; 2822 for (unsigned Part = 0; Part < UF; Part++) { 2823 Instruction *NewLoad; 2824 if (BlockInMask || MaskForGaps) { 2825 assert(useMaskedInterleavedAccesses(*TTI) && 2826 "masked interleaved groups are not allowed."); 2827 Value *GroupMask = MaskForGaps; 2828 if (BlockInMask) { 2829 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2830 Value *ShuffledMask = Builder.CreateShuffleVector( 2831 BlockInMaskPart, 2832 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2833 "interleaved.mask"); 2834 GroupMask = MaskForGaps 2835 ? Builder.CreateBinOp(Instruction::And, ShuffledMask, 2836 MaskForGaps) 2837 : ShuffledMask; 2838 } 2839 NewLoad = 2840 Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(), 2841 GroupMask, PoisonVec, "wide.masked.vec"); 2842 } 2843 else 2844 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], 2845 Group->getAlign(), "wide.vec"); 2846 Group->addMetadata(NewLoad); 2847 NewLoads.push_back(NewLoad); 2848 } 2849 2850 // For each member in the group, shuffle out the appropriate data from the 2851 // wide loads. 2852 unsigned J = 0; 2853 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2854 Instruction *Member = Group->getMember(I); 2855 2856 // Skip the gaps in the group. 2857 if (!Member) 2858 continue; 2859 2860 auto StrideMask = 2861 createStrideMask(I, InterleaveFactor, VF.getKnownMinValue()); 2862 for (unsigned Part = 0; Part < UF; Part++) { 2863 Value *StridedVec = Builder.CreateShuffleVector( 2864 NewLoads[Part], StrideMask, "strided.vec"); 2865 2866 // If this member has different type, cast the result type. 2867 if (Member->getType() != ScalarTy) { 2868 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2869 VectorType *OtherVTy = VectorType::get(Member->getType(), VF); 2870 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); 2871 } 2872 2873 if (Group->isReverse()) 2874 StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse"); 2875 2876 State.set(VPDefs[J], StridedVec, Part); 2877 } 2878 ++J; 2879 } 2880 return; 2881 } 2882 2883 // The sub vector type for current instruction. 2884 auto *SubVT = VectorType::get(ScalarTy, VF); 2885 2886 // Vectorize the interleaved store group. 2887 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); 2888 assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) && 2889 "masked interleaved groups are not allowed."); 2890 assert((!MaskForGaps || !VF.isScalable()) && 2891 "masking gaps for scalable vectors is not yet supported."); 2892 for (unsigned Part = 0; Part < UF; Part++) { 2893 // Collect the stored vector from each member. 
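// Illustrative sketch (assumed factor 3, VF 4): the member vectors collected
// below (e.g. %R.vec, %G.vec, %B.vec) are concatenated into one wide vector
// and shuffled with the interleave mask <0,4,8, 1,5,9, 2,6,10, 3,7,11>
// before the (possibly masked) wide store.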
2894 SmallVector<Value *, 4> StoredVecs; 2895 for (unsigned i = 0; i < InterleaveFactor; i++) { 2896 assert((Group->getMember(i) || MaskForGaps) && 2897 "Fail to get a member from an interleaved store group"); 2898 Instruction *Member = Group->getMember(i); 2899 2900 // Skip the gaps in the group. 2901 if (!Member) { 2902 Value *Undef = PoisonValue::get(SubVT); 2903 StoredVecs.push_back(Undef); 2904 continue; 2905 } 2906 2907 Value *StoredVec = State.get(StoredValues[i], Part); 2908 2909 if (Group->isReverse()) 2910 StoredVec = Builder.CreateVectorReverse(StoredVec, "reverse"); 2911 2912 // If this member has different type, cast it to a unified type. 2913 2914 if (StoredVec->getType() != SubVT) 2915 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL); 2916 2917 StoredVecs.push_back(StoredVec); 2918 } 2919 2920 // Concatenate all vectors into a wide vector. 2921 Value *WideVec = concatenateVectors(Builder, StoredVecs); 2922 2923 // Interleave the elements in the wide vector. 2924 Value *IVec = Builder.CreateShuffleVector( 2925 WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor), 2926 "interleaved.vec"); 2927 2928 Instruction *NewStoreInstr; 2929 if (BlockInMask || MaskForGaps) { 2930 Value *GroupMask = MaskForGaps; 2931 if (BlockInMask) { 2932 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2933 Value *ShuffledMask = Builder.CreateShuffleVector( 2934 BlockInMaskPart, 2935 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2936 "interleaved.mask"); 2937 GroupMask = MaskForGaps ? Builder.CreateBinOp(Instruction::And, 2938 ShuffledMask, MaskForGaps) 2939 : ShuffledMask; 2940 } 2941 NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part], 2942 Group->getAlign(), GroupMask); 2943 } else 2944 NewStoreInstr = 2945 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign()); 2946 2947 Group->addMetadata(NewStoreInstr); 2948 } 2949 } 2950 2951 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, 2952 VPReplicateRecipe *RepRecipe, 2953 const VPIteration &Instance, 2954 bool IfPredicateInstr, 2955 VPTransformState &State) { 2956 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 2957 2958 // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for 2959 // the first lane and part. 2960 if (isa<NoAliasScopeDeclInst>(Instr)) 2961 if (!Instance.isFirstIteration()) 2962 return; 2963 2964 setDebugLocFromInst(Instr); 2965 2966 // Does this instruction return a value ? 2967 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 2968 2969 Instruction *Cloned = Instr->clone(); 2970 if (!IsVoidRetTy) 2971 Cloned->setName(Instr->getName() + ".cloned"); 2972 2973 // If the scalarized instruction contributes to the address computation of a 2974 // widen masked load/store which was in a basic block that needed predication 2975 // and is not predicated after vectorization, we can't propagate 2976 // poison-generating flags (nuw/nsw, exact, inbounds, etc.). The scalarized 2977 // instruction could feed a poison value to the base address of the widen 2978 // load/store. 2979 if (State.MayGeneratePoisonRecipes.contains(RepRecipe)) 2980 Cloned->dropPoisonGeneratingFlags(); 2981 2982 State.Builder.SetInsertPoint(Builder.GetInsertBlock(), 2983 Builder.GetInsertPoint()); 2984 // Replace the operands of the cloned instructions with their scalar 2985 // equivalents in the new loop. 
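// Illustrative sketch (hypothetical %x, %d; not from the original): for a
// replicated "udiv %x, %d" in a predicated block, %x is taken from the
// current (Part, Lane), while an operand known to be uniform after
// vectorization, such as a loop-invariant %d, is always read from lane 0.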
2986 for (auto &I : enumerate(RepRecipe->operands())) { 2987 auto InputInstance = Instance; 2988 VPValue *Operand = I.value(); 2989 if (State.Plan->isUniformAfterVectorization(Operand)) 2990 InputInstance.Lane = VPLane::getFirstLane(); 2991 Cloned->setOperand(I.index(), State.get(Operand, InputInstance)); 2992 } 2993 addNewMetadata(Cloned, Instr); 2994 2995 // Place the cloned scalar in the new loop. 2996 Builder.Insert(Cloned); 2997 2998 State.set(RepRecipe, Cloned, Instance); 2999 3000 // If we just cloned a new assumption, add it the assumption cache. 3001 if (auto *II = dyn_cast<AssumeInst>(Cloned)) 3002 AC->registerAssumption(II); 3003 3004 // End if-block. 3005 if (IfPredicateInstr) 3006 PredicatedInstructions.push_back(Cloned); 3007 } 3008 3009 void InnerLoopVectorizer::createHeaderBranch(Loop *L) { 3010 BasicBlock *Header = L->getHeader(); 3011 assert(!L->getLoopLatch() && "loop should not have a latch at this point"); 3012 3013 IRBuilder<> B(Header->getTerminator()); 3014 Instruction *OldInst = 3015 getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()); 3016 setDebugLocFromInst(OldInst, &B); 3017 3018 // Connect the header to the exit and header blocks and replace the old 3019 // terminator. 3020 B.CreateCondBr(B.getTrue(), L->getUniqueExitBlock(), Header); 3021 3022 // Now we have two terminators. Remove the old one from the block. 3023 Header->getTerminator()->eraseFromParent(); 3024 } 3025 3026 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { 3027 if (TripCount) 3028 return TripCount; 3029 3030 assert(L && "Create Trip Count for null loop."); 3031 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 3032 // Find the loop boundaries. 3033 ScalarEvolution *SE = PSE.getSE(); 3034 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 3035 assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && 3036 "Invalid loop count"); 3037 3038 Type *IdxTy = Legal->getWidestInductionType(); 3039 assert(IdxTy && "No type for induction"); 3040 3041 // The exit count might have the type of i64 while the phi is i32. This can 3042 // happen if we have an induction variable that is sign extended before the 3043 // compare. The only way that we get a backedge taken count is that the 3044 // induction variable was signed and as such will not overflow. In such a case 3045 // truncation is legal. 3046 if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) > 3047 IdxTy->getPrimitiveSizeInBits()) 3048 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy); 3049 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); 3050 3051 // Get the total trip count from the count by adding 1. 3052 const SCEV *ExitCount = SE->getAddExpr( 3053 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 3054 3055 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); 3056 3057 // Expand the trip count and place the new instructions in the preheader. 3058 // Notice that the pre-header does not change, only the loop body. 3059 SCEVExpander Exp(*SE, DL, "induction"); 3060 3061 // Count holds the overall loop count (N). 
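// For illustration (assumed canonical loop "for (i = 0; i < n; ++i)"): the
// backedge-taken count is n - 1, so the trip count expanded below is
// (n - 1) + 1 = n, after being truncated or zero-extended to the widest
// induction type above.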
3062 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), 3063 L->getLoopPreheader()->getTerminator()); 3064 3065 if (TripCount->getType()->isPointerTy()) 3066 TripCount = 3067 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", 3068 L->getLoopPreheader()->getTerminator()); 3069 3070 return TripCount; 3071 } 3072 3073 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { 3074 if (VectorTripCount) 3075 return VectorTripCount; 3076 3077 Value *TC = getOrCreateTripCount(L); 3078 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 3079 3080 Type *Ty = TC->getType(); 3081 // This is where we can make the step a runtime constant. 3082 Value *Step = createStepForVF(Builder, Ty, VF, UF); 3083 3084 // If the tail is to be folded by masking, round the number of iterations N 3085 // up to a multiple of Step instead of rounding down. This is done by first 3086 // adding Step-1 and then rounding down. Note that it's ok if this addition 3087 // overflows: the vector induction variable will eventually wrap to zero given 3088 // that it starts at zero and its Step is a power of two; the loop will then 3089 // exit, with the last early-exit vector comparison also producing all-true. 3090 if (Cost->foldTailByMasking()) { 3091 assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && 3092 "VF*UF must be a power of 2 when folding tail by masking"); 3093 Value *NumLanes = getRuntimeVF(Builder, Ty, VF * UF); 3094 TC = Builder.CreateAdd( 3095 TC, Builder.CreateSub(NumLanes, ConstantInt::get(Ty, 1)), "n.rnd.up"); 3096 } 3097 3098 // Now we need to generate the expression for the part of the loop that the 3099 // vectorized body will execute. This is equal to N - (N % Step) if scalar 3100 // iterations are not required for correctness, or N - Step, otherwise. Step 3101 // is equal to the vectorization factor (number of SIMD elements) times the 3102 // unroll factor (number of SIMD instructions). 3103 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); 3104 3105 // There are cases where we *must* run at least one iteration in the remainder 3106 // loop. See the cost model for when this can happen. If the step evenly 3107 // divides the trip count, we set the remainder to be equal to the step. If 3108 // the step does not evenly divide the trip count, no adjustment is necessary 3109 // since there will already be scalar iterations. Note that the minimum 3110 // iterations check ensures that N >= Step. 3111 if (Cost->requiresScalarEpilogue(VF)) { 3112 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); 3113 R = Builder.CreateSelect(IsZero, Step, R); 3114 } 3115 3116 VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); 3117 3118 return VectorTripCount; 3119 } 3120 3121 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, 3122 const DataLayout &DL) { 3123 // Verify that V is a vector type with same number of elements as DstVTy. 3124 auto *DstFVTy = cast<FixedVectorType>(DstVTy); 3125 unsigned VF = DstFVTy->getNumElements(); 3126 auto *SrcVecTy = cast<FixedVectorType>(V->getType()); 3127 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); 3128 Type *SrcElemTy = SrcVecTy->getElementType(); 3129 Type *DstElemTy = DstFVTy->getElementType(); 3130 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 3131 "Vector elements must have same size"); 3132 3133 // Do a direct cast if element types are castable. 
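// E.g. (illustrative, assumed types): <4 x i32> <-> <4 x float> can be
// bitcast directly, whereas a float vector and a pointer vector cannot, and
// take the two-step Float <-> Int <-> Ptr path further below.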
3134 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 3135 return Builder.CreateBitOrPointerCast(V, DstFVTy); 3136 } 3137 // V cannot be directly casted to desired vector type. 3138 // May happen when V is a floating point vector but DstVTy is a vector of 3139 // pointers or vice-versa. Handle this using a two-step bitcast using an 3140 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 3141 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 3142 "Only one type should be a pointer type"); 3143 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 3144 "Only one type should be a floating point type"); 3145 Type *IntTy = 3146 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 3147 auto *VecIntTy = FixedVectorType::get(IntTy, VF); 3148 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 3149 return Builder.CreateBitOrPointerCast(CastVal, DstFVTy); 3150 } 3151 3152 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, 3153 BasicBlock *Bypass) { 3154 Value *Count = getOrCreateTripCount(L); 3155 // Reuse existing vector loop preheader for TC checks. 3156 // Note that new preheader block is generated for vector loop. 3157 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 3158 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 3159 3160 // Generate code to check if the loop's trip count is less than VF * UF, or 3161 // equal to it in case a scalar epilogue is required; this implies that the 3162 // vector trip count is zero. This check also covers the case where adding one 3163 // to the backedge-taken count overflowed leading to an incorrect trip count 3164 // of zero. In this case we will also jump to the scalar loop. 3165 auto P = Cost->requiresScalarEpilogue(VF) ? ICmpInst::ICMP_ULE 3166 : ICmpInst::ICMP_ULT; 3167 3168 // If tail is to be folded, vector loop takes care of all iterations. 3169 Value *CheckMinIters = Builder.getFalse(); 3170 if (!Cost->foldTailByMasking()) { 3171 Value *Step = createStepForVF(Builder, Count->getType(), VF, UF); 3172 CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check"); 3173 } 3174 // Create new preheader for vector loop. 3175 LoopVectorPreHeader = 3176 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, 3177 "vector.ph"); 3178 3179 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 3180 DT->getNode(Bypass)->getIDom()) && 3181 "TC check is expected to dominate Bypass"); 3182 3183 // Update dominator for Bypass & LoopExit (if needed). 3184 DT->changeImmediateDominator(Bypass, TCCheckBlock); 3185 if (!Cost->requiresScalarEpilogue(VF)) 3186 // If there is an epilogue which must run, there's no edge from the 3187 // middle block to exit blocks and thus no need to update the immediate 3188 // dominator of the exit blocks. 
3189 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 3190 3191 ReplaceInstWithInst( 3192 TCCheckBlock->getTerminator(), 3193 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 3194 LoopBypassBlocks.push_back(TCCheckBlock); 3195 } 3196 3197 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { 3198 3199 BasicBlock *const SCEVCheckBlock = 3200 RTChecks.emitSCEVChecks(L, Bypass, LoopVectorPreHeader, LoopExitBlock); 3201 if (!SCEVCheckBlock) 3202 return nullptr; 3203 3204 assert(!(SCEVCheckBlock->getParent()->hasOptSize() || 3205 (OptForSizeBasedOnProfile && 3206 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) && 3207 "Cannot SCEV check stride or overflow when optimizing for size"); 3208 3209 3210 // Update dominator only if this is first RT check. 3211 if (LoopBypassBlocks.empty()) { 3212 DT->changeImmediateDominator(Bypass, SCEVCheckBlock); 3213 if (!Cost->requiresScalarEpilogue(VF)) 3214 // If there is an epilogue which must run, there's no edge from the 3215 // middle block to exit blocks and thus no need to update the immediate 3216 // dominator of the exit blocks. 3217 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock); 3218 } 3219 3220 LoopBypassBlocks.push_back(SCEVCheckBlock); 3221 AddedSafetyChecks = true; 3222 return SCEVCheckBlock; 3223 } 3224 3225 BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, 3226 BasicBlock *Bypass) { 3227 // VPlan-native path does not do any analysis for runtime checks currently. 3228 if (EnableVPlanNativePath) 3229 return nullptr; 3230 3231 BasicBlock *const MemCheckBlock = 3232 RTChecks.emitMemRuntimeChecks(L, Bypass, LoopVectorPreHeader); 3233 3234 // Check if we generated code that checks in runtime if arrays overlap. We put 3235 // the checks into a separate block to make the more common case of few 3236 // elements faster. 3237 if (!MemCheckBlock) 3238 return nullptr; 3239 3240 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) { 3241 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && 3242 "Cannot emit memory checks when optimizing for size, unless forced " 3243 "to vectorize."); 3244 ORE->emit([&]() { 3245 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize", 3246 L->getStartLoc(), L->getHeader()) 3247 << "Code-size may be reduced by not forcing " 3248 "vectorization, or by source-code modifications " 3249 "eliminating the need for runtime checks " 3250 "(e.g., adding 'restrict')."; 3251 }); 3252 } 3253 3254 LoopBypassBlocks.push_back(MemCheckBlock); 3255 3256 AddedSafetyChecks = true; 3257 3258 // We currently don't use LoopVersioning for the actual loop cloning but we 3259 // still use it to add the noalias metadata. 3260 LVer = std::make_unique<LoopVersioning>( 3261 *Legal->getLAI(), 3262 Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, 3263 DT, PSE.getSE()); 3264 LVer->prepareNoAliasMetadata(); 3265 return MemCheckBlock; 3266 } 3267 3268 Value *InnerLoopVectorizer::emitTransformedIndex( 3269 IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL, 3270 const InductionDescriptor &ID, BasicBlock *VectorHeader) const { 3271 3272 SCEVExpander Exp(*SE, DL, "induction"); 3273 auto Step = ID.getStep(); 3274 auto StartValue = ID.getStartValue(); 3275 assert(Index->getType()->getScalarType() == Step->getType() && 3276 "Index scalar type does not match StepValue type"); 3277 3278 // Note: the IR at this point is broken. 
We cannot use SE to create any new 3279 // SCEV and then expand it, hoping that SCEV's simplification will give us 3280 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may 3281 // lead to various SCEV crashes. So all we can do is to use builder and rely 3282 // on InstCombine for future simplifications. Here we handle some trivial 3283 // cases only. 3284 auto CreateAdd = [&B](Value *X, Value *Y) { 3285 assert(X->getType() == Y->getType() && "Types don't match!"); 3286 if (auto *CX = dyn_cast<ConstantInt>(X)) 3287 if (CX->isZero()) 3288 return Y; 3289 if (auto *CY = dyn_cast<ConstantInt>(Y)) 3290 if (CY->isZero()) 3291 return X; 3292 return B.CreateAdd(X, Y); 3293 }; 3294 3295 // We allow X to be a vector type, in which case Y will potentially be 3296 // splatted into a vector with the same element count. 3297 auto CreateMul = [&B](Value *X, Value *Y) { 3298 assert(X->getType()->getScalarType() == Y->getType() && 3299 "Types don't match!"); 3300 if (auto *CX = dyn_cast<ConstantInt>(X)) 3301 if (CX->isOne()) 3302 return Y; 3303 if (auto *CY = dyn_cast<ConstantInt>(Y)) 3304 if (CY->isOne()) 3305 return X; 3306 VectorType *XVTy = dyn_cast<VectorType>(X->getType()); 3307 if (XVTy && !isa<VectorType>(Y->getType())) 3308 Y = B.CreateVectorSplat(XVTy->getElementCount(), Y); 3309 return B.CreateMul(X, Y); 3310 }; 3311 3312 // Get a suitable insert point for SCEV expansion. For blocks in the vector 3313 // loop, choose the end of the vector loop header (=VectorHeader), because 3314 // the DomTree is not kept up-to-date for additional blocks generated in the 3315 // vector loop. By using the header as insertion point, we guarantee that the 3316 // expanded instructions dominate all their uses. 3317 auto GetInsertPoint = [this, &B, VectorHeader]() { 3318 BasicBlock *InsertBB = B.GetInsertPoint()->getParent(); 3319 if (InsertBB != LoopVectorBody && 3320 LI->getLoopFor(VectorHeader) == LI->getLoopFor(InsertBB)) 3321 return VectorHeader->getTerminator(); 3322 return &*B.GetInsertPoint(); 3323 }; 3324 3325 switch (ID.getKind()) { 3326 case InductionDescriptor::IK_IntInduction: { 3327 assert(!isa<VectorType>(Index->getType()) && 3328 "Vector indices not supported for integer inductions yet"); 3329 assert(Index->getType() == StartValue->getType() && 3330 "Index type does not match StartValue type"); 3331 if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne()) 3332 return B.CreateSub(StartValue, Index); 3333 auto *Offset = CreateMul( 3334 Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint())); 3335 return CreateAdd(StartValue, Offset); 3336 } 3337 case InductionDescriptor::IK_PtrInduction: { 3338 assert(isa<SCEVConstant>(Step) && 3339 "Expected constant step for pointer induction"); 3340 return B.CreateGEP( 3341 ID.getElementType(), StartValue, 3342 CreateMul(Index, 3343 Exp.expandCodeFor(Step, Index->getType()->getScalarType(), 3344 GetInsertPoint()))); 3345 } 3346 case InductionDescriptor::IK_FpInduction: { 3347 assert(!isa<VectorType>(Index->getType()) && 3348 "Vector indices not supported for FP inductions yet"); 3349 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); 3350 auto InductionBinOp = ID.getInductionBinOp(); 3351 assert(InductionBinOp && 3352 (InductionBinOp->getOpcode() == Instruction::FAdd || 3353 InductionBinOp->getOpcode() == Instruction::FSub) && 3354 "Original bin op should be defined for FP induction"); 3355 3356 Value *StepValue = cast<SCEVUnknown>(Step)->getValue(); 3357 Value *MulExp = 
B.CreateFMul(StepValue, Index);
3358 return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
3359 "induction");
3360 }
3361 case InductionDescriptor::IK_NoInduction:
3362 return nullptr;
3363 }
3364 llvm_unreachable("invalid enum");
3365 }
3366
3367 Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
3368 LoopScalarBody = OrigLoop->getHeader();
3369 LoopVectorPreHeader = OrigLoop->getLoopPreheader();
3370 assert(LoopVectorPreHeader && "Invalid loop structure");
3371 LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
3372 assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF)) &&
3373 "multiple exit loop without required epilogue?");
3374
3375 LoopMiddleBlock =
3376 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3377 LI, nullptr, Twine(Prefix) + "middle.block");
3378 LoopScalarPreHeader =
3379 SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
3380 nullptr, Twine(Prefix) + "scalar.ph");
3381
3382 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3383
3384 // Set up the middle block terminator. Two cases:
3385 // 1) If we know that we must execute the scalar epilogue, emit an
3386 // unconditional branch.
3387 // 2) Otherwise, we must have a single unique exit block (due to how we
3388 // implement the multiple exit case). In this case, set up a conditional
3389 // branch from the middle block to the loop scalar preheader, and the
3390 // exit block. completeLoopSkeleton will update the condition to use an
3391 // iteration check, if required, to decide whether to execute the remainder.
3392 BranchInst *BrInst = Cost->requiresScalarEpilogue(VF) ?
3393 BranchInst::Create(LoopScalarPreHeader) :
3394 BranchInst::Create(LoopExitBlock, LoopScalarPreHeader,
3395 Builder.getTrue());
3396 BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3397 ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
3398
3399 // We intentionally don't let SplitBlock update LoopInfo, since
3400 // LoopVectorBody should belong to a different loop than LoopVectorPreHeader.
3401 // LoopVectorBody is explicitly added to the correct place a few lines later.
3402 LoopVectorBody =
3403 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3404 nullptr, nullptr, Twine(Prefix) + "vector.body");
3405
3406 // Update the dominator for the loop exit.
3407 if (!Cost->requiresScalarEpilogue(VF))
3408 // If there is an epilogue which must run, there's no edge from the
3409 // middle block to exit blocks and thus no need to update the immediate
3410 // dominator of the exit blocks.
3411 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
3412
3413 // Create and register the new vector loop.
3414 Loop *Lp = LI->AllocateLoop();
3415 Loop *ParentLoop = OrigLoop->getParentLoop();
3416
3417 // Insert the new loop into the loop nest and register the new basic blocks
3418 // before calling any utilities such as SCEV that require valid LoopInfo.
3419 if (ParentLoop) { 3420 ParentLoop->addChildLoop(Lp); 3421 } else { 3422 LI->addTopLevelLoop(Lp); 3423 } 3424 Lp->addBasicBlockToLoop(LoopVectorBody, *LI); 3425 return Lp; 3426 } 3427 3428 void InnerLoopVectorizer::createInductionResumeValues( 3429 Loop *L, std::pair<BasicBlock *, Value *> AdditionalBypass) { 3430 assert(((AdditionalBypass.first && AdditionalBypass.second) || 3431 (!AdditionalBypass.first && !AdditionalBypass.second)) && 3432 "Inconsistent information about additional bypass."); 3433 3434 Value *VectorTripCount = getOrCreateVectorTripCount(L); 3435 assert(VectorTripCount && L && "Expected valid arguments"); 3436 // We are going to resume the execution of the scalar loop. 3437 // Go over all of the induction variables that we found and fix the 3438 // PHIs that are left in the scalar version of the loop. 3439 // The starting values of PHI nodes depend on the counter of the last 3440 // iteration in the vectorized loop. 3441 // If we come from a bypass edge then we need to start from the original 3442 // start value. 3443 Instruction *OldInduction = Legal->getPrimaryInduction(); 3444 for (auto &InductionEntry : Legal->getInductionVars()) { 3445 PHINode *OrigPhi = InductionEntry.first; 3446 InductionDescriptor II = InductionEntry.second; 3447 3448 // Create phi nodes to merge from the backedge-taken check block. 3449 PHINode *BCResumeVal = 3450 PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", 3451 LoopScalarPreHeader->getTerminator()); 3452 // Copy original phi DL over to the new one. 3453 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); 3454 Value *&EndValue = IVEndValues[OrigPhi]; 3455 Value *EndValueFromAdditionalBypass = AdditionalBypass.second; 3456 if (OrigPhi == OldInduction) { 3457 // We know what the end value is. 3458 EndValue = VectorTripCount; 3459 } else { 3460 IRBuilder<> B(L->getLoopPreheader()->getTerminator()); 3461 3462 // Fast-math-flags propagate from the original induction instruction. 3463 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) 3464 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); 3465 3466 Type *StepType = II.getStep()->getType(); 3467 Instruction::CastOps CastOp = 3468 CastInst::getCastOpcode(VectorTripCount, true, StepType, true); 3469 Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd"); 3470 const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout(); 3471 EndValue = 3472 emitTransformedIndex(B, CRD, PSE.getSE(), DL, II, LoopVectorBody); 3473 EndValue->setName("ind.end"); 3474 3475 // Compute the end value for the additional bypass (if applicable). 3476 if (AdditionalBypass.first) { 3477 B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt())); 3478 CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true, 3479 StepType, true); 3480 CRD = 3481 B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd"); 3482 EndValueFromAdditionalBypass = 3483 emitTransformedIndex(B, CRD, PSE.getSE(), DL, II, LoopVectorBody); 3484 EndValueFromAdditionalBypass->setName("ind.end"); 3485 } 3486 } 3487 // The new PHI merges the original incoming value, in case of a bypass, 3488 // or the value at the end of the vectorized loop. 3489 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); 3490 3491 // Fix the scalar body counter (PHI node). 3492 // The old induction's phi node in the scalar body needs the truncated 3493 // value. 
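// Illustrative sketch (assumed primary IV starting at 0 with step 1; not from
// the original): bc.resume.val is the vector trip count (n.vec) when entered
// from the middle block and the original start value (0) when entered from a
// bypass block, so the scalar remainder loop resumes where the vector loop
// stopped.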
3494 for (BasicBlock *BB : LoopBypassBlocks) 3495 BCResumeVal->addIncoming(II.getStartValue(), BB); 3496 3497 if (AdditionalBypass.first) 3498 BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first, 3499 EndValueFromAdditionalBypass); 3500 3501 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); 3502 } 3503 } 3504 3505 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L, 3506 MDNode *OrigLoopID) { 3507 assert(L && "Expected valid loop."); 3508 3509 // The trip counts should be cached by now. 3510 Value *Count = getOrCreateTripCount(L); 3511 Value *VectorTripCount = getOrCreateVectorTripCount(L); 3512 3513 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3514 3515 // Add a check in the middle block to see if we have completed 3516 // all of the iterations in the first vector loop. Three cases: 3517 // 1) If we require a scalar epilogue, there is no conditional branch as 3518 // we unconditionally branch to the scalar preheader. Do nothing. 3519 // 2) If (N - N%VF) == N, then we *don't* need to run the remainder. 3520 // Thus if tail is to be folded, we know we don't need to run the 3521 // remainder and we can use the previous value for the condition (true). 3522 // 3) Otherwise, construct a runtime check. 3523 if (!Cost->requiresScalarEpilogue(VF) && !Cost->foldTailByMasking()) { 3524 Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, 3525 Count, VectorTripCount, "cmp.n", 3526 LoopMiddleBlock->getTerminator()); 3527 3528 // Here we use the same DebugLoc as the scalar loop latch terminator instead 3529 // of the corresponding compare because they may have ended up with 3530 // different line numbers and we want to avoid awkward line stepping while 3531 // debugging. Eg. if the compare has got a line number inside the loop. 3532 CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3533 cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN); 3534 } 3535 3536 // Get ready to start creating new instructions into the vectorized body. 3537 assert(LoopVectorPreHeader == L->getLoopPreheader() && 3538 "Inconsistent vector loop preheader"); 3539 Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); 3540 3541 #ifdef EXPENSIVE_CHECKS 3542 assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 3543 LI->verify(*DT); 3544 #endif 3545 3546 return LoopVectorPreHeader; 3547 } 3548 3549 std::pair<BasicBlock *, Value *> 3550 InnerLoopVectorizer::createVectorizedLoopSkeleton() { 3551 /* 3552 In this function we generate a new loop. The new loop will contain 3553 the vectorized instructions while the old loop will continue to run the 3554 scalar remainder. 3555 3556 [ ] <-- loop iteration number check. 3557 / | 3558 / v 3559 | [ ] <-- vector loop bypass (may consist of multiple blocks). 3560 | / | 3561 | / v 3562 || [ ] <-- vector pre header. 3563 |/ | 3564 | v 3565 | [ ] \ 3566 | [ ]_| <-- vector loop. 3567 | | 3568 | v 3569 \ -[ ] <--- middle-block. 3570 \/ | 3571 /\ v 3572 | ->[ ] <--- new preheader. 3573 | | 3574 (opt) v <-- edge from middle to exit iff epilogue is not required. 3575 | [ ] \ 3576 | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue). 3577 \ | 3578 \ v 3579 >[ ] <-- exit block(s). 3580 ... 3581 */ 3582 3583 // Get the metadata of the original loop before it gets modified. 3584 MDNode *OrigLoopID = OrigLoop->getLoopID(); 3585 3586 // Workaround! Compute the trip count of the original loop and cache it 3587 // before we start modifying the CFG. 
This code has a systemic problem 3588 // wherein it tries to run analysis over partially constructed IR; this is 3589 // wrong, and not simply for SCEV. The trip count of the original loop 3590 // simply happens to be prone to hitting this in practice. In theory, we 3591 // can hit the same issue for any SCEV, or ValueTracking query done during 3592 // mutation. See PR49900. 3593 getOrCreateTripCount(OrigLoop); 3594 3595 // Create an empty vector loop, and prepare basic blocks for the runtime 3596 // checks. 3597 Loop *Lp = createVectorLoopSkeleton(""); 3598 3599 // Now, compare the new count to zero. If it is zero skip the vector loop and 3600 // jump to the scalar loop. This check also covers the case where the 3601 // backedge-taken count is uint##_max: adding one to it will overflow leading 3602 // to an incorrect trip count of zero. In this (rare) case we will also jump 3603 // to the scalar loop. 3604 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader); 3605 3606 // Generate the code to check any assumptions that we've made for SCEV 3607 // expressions. 3608 emitSCEVChecks(Lp, LoopScalarPreHeader); 3609 3610 // Generate the code that checks in runtime if arrays overlap. We put the 3611 // checks into a separate block to make the more common case of few elements 3612 // faster. 3613 emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 3614 3615 createHeaderBranch(Lp); 3616 3617 // Emit phis for the new starting index of the scalar loop. 3618 createInductionResumeValues(Lp); 3619 3620 return {completeLoopSkeleton(Lp, OrigLoopID), nullptr}; 3621 } 3622 3623 // Fix up external users of the induction variable. At this point, we are 3624 // in LCSSA form, with all external PHIs that use the IV having one input value, 3625 // coming from the remainder loop. We need those PHIs to also have a correct 3626 // value for the IV when arriving directly from the middle block. 3627 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, 3628 const InductionDescriptor &II, 3629 Value *CountRoundDown, Value *EndValue, 3630 BasicBlock *MiddleBlock) { 3631 // There are two kinds of external IV usages - those that use the value 3632 // computed in the last iteration (the PHI) and those that use the penultimate 3633 // value (the value that feeds into the phi from the loop latch). 3634 // We allow both, but they, obviously, have different values. 3635 3636 assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block"); 3637 3638 DenseMap<Value *, Value *> MissingVals; 3639 3640 // An external user of the last iteration's value should see the value that 3641 // the remainder loop uses to initialize its own IV. 3642 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch()); 3643 for (User *U : PostInc->users()) { 3644 Instruction *UI = cast<Instruction>(U); 3645 if (!OrigLoop->contains(UI)) { 3646 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3647 MissingVals[UI] = EndValue; 3648 } 3649 } 3650 3651 // An external user of the penultimate value need to see EndValue - Step. 3652 // The simplest way to get this is to recompute it from the constituent SCEVs, 3653 // that is Start + (Step * (CRD - 1)). 
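// Worked example (assumed Start = 0, Step = 1, CRD = n.vec; illustrative
// only): a user of the latch value sees EndValue = n.vec, while a user of
// the phi itself sees the penultimate value n.vec - 1, recomputed below as
// Start + Step * (CRD - 1).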
3654 for (User *U : OrigPhi->users()) { 3655 auto *UI = cast<Instruction>(U); 3656 if (!OrigLoop->contains(UI)) { 3657 const DataLayout &DL = 3658 OrigLoop->getHeader()->getModule()->getDataLayout(); 3659 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3660 3661 IRBuilder<> B(MiddleBlock->getTerminator()); 3662 3663 // Fast-math-flags propagate from the original induction instruction. 3664 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) 3665 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); 3666 3667 Value *CountMinusOne = B.CreateSub( 3668 CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1)); 3669 Value *CMO = 3670 !II.getStep()->getType()->isIntegerTy() 3671 ? B.CreateCast(Instruction::SIToFP, CountMinusOne, 3672 II.getStep()->getType()) 3673 : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType()); 3674 CMO->setName("cast.cmo"); 3675 Value *Escape = 3676 emitTransformedIndex(B, CMO, PSE.getSE(), DL, II, LoopVectorBody); 3677 Escape->setName("ind.escape"); 3678 MissingVals[UI] = Escape; 3679 } 3680 } 3681 3682 for (auto &I : MissingVals) { 3683 PHINode *PHI = cast<PHINode>(I.first); 3684 // One corner case we have to handle is two IVs "chasing" each-other, 3685 // that is %IV2 = phi [...], [ %IV1, %latch ] 3686 // In this case, if IV1 has an external use, we need to avoid adding both 3687 // "last value of IV1" and "penultimate value of IV2". So, verify that we 3688 // don't already have an incoming value for the middle block. 3689 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) 3690 PHI->addIncoming(I.second, MiddleBlock); 3691 } 3692 } 3693 3694 namespace { 3695 3696 struct CSEDenseMapInfo { 3697 static bool canHandle(const Instruction *I) { 3698 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || 3699 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); 3700 } 3701 3702 static inline Instruction *getEmptyKey() { 3703 return DenseMapInfo<Instruction *>::getEmptyKey(); 3704 } 3705 3706 static inline Instruction *getTombstoneKey() { 3707 return DenseMapInfo<Instruction *>::getTombstoneKey(); 3708 } 3709 3710 static unsigned getHashValue(const Instruction *I) { 3711 assert(canHandle(I) && "Unknown instruction!"); 3712 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), 3713 I->value_op_end())); 3714 } 3715 3716 static bool isEqual(const Instruction *LHS, const Instruction *RHS) { 3717 if (LHS == getEmptyKey() || RHS == getEmptyKey() || 3718 LHS == getTombstoneKey() || RHS == getTombstoneKey()) 3719 return LHS == RHS; 3720 return LHS->isIdenticalTo(RHS); 3721 } 3722 }; 3723 3724 } // end anonymous namespace 3725 3726 ///Perform cse of induction variable instructions. 3727 static void cse(BasicBlock *BB) { 3728 // Perform simple cse. 3729 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 3730 for (Instruction &In : llvm::make_early_inc_range(*BB)) { 3731 if (!CSEDenseMapInfo::canHandle(&In)) 3732 continue; 3733 3734 // Check if we can replace this instruction with any of the 3735 // visited instructions. 
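// E.g. (illustrative): two identical "extractelement <4 x i64> %vec.iv, i32 0"
// instructions created for different scalar users hash to the same key, and
// the second one is folded into the first here.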
3736 if (Instruction *V = CSEMap.lookup(&In)) { 3737 In.replaceAllUsesWith(V); 3738 In.eraseFromParent(); 3739 continue; 3740 } 3741 3742 CSEMap[&In] = &In; 3743 } 3744 } 3745 3746 InstructionCost 3747 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF, 3748 bool &NeedToScalarize) const { 3749 Function *F = CI->getCalledFunction(); 3750 Type *ScalarRetTy = CI->getType(); 3751 SmallVector<Type *, 4> Tys, ScalarTys; 3752 for (auto &ArgOp : CI->args()) 3753 ScalarTys.push_back(ArgOp->getType()); 3754 3755 // Estimate cost of scalarized vector call. The source operands are assumed 3756 // to be vectors, so we need to extract individual elements from there, 3757 // execute VF scalar calls, and then gather the result into the vector return 3758 // value. 3759 InstructionCost ScalarCallCost = 3760 TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput); 3761 if (VF.isScalar()) 3762 return ScalarCallCost; 3763 3764 // Compute corresponding vector type for return value and arguments. 3765 Type *RetTy = ToVectorTy(ScalarRetTy, VF); 3766 for (Type *ScalarTy : ScalarTys) 3767 Tys.push_back(ToVectorTy(ScalarTy, VF)); 3768 3769 // Compute costs of unpacking argument values for the scalar calls and 3770 // packing the return values to a vector. 3771 InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF); 3772 3773 InstructionCost Cost = 3774 ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost; 3775 3776 // If we can't emit a vector call for this function, then the currently found 3777 // cost is the cost we need to return. 3778 NeedToScalarize = true; 3779 VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 3780 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); 3781 3782 if (!TLI || CI->isNoBuiltin() || !VecFunc) 3783 return Cost; 3784 3785 // If the corresponding vector cost is cheaper, return its cost. 
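// Worked example (all costs assumed for illustration): with VF = 4, a scalar
// call cost of 10 and a scalarization overhead of 6, the scalarized cost is
// 4 * 10 + 6 = 46; if a vector variant exists with cost 20, NeedToScalarize
// is cleared and 20 is returned instead.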
3786 InstructionCost VectorCallCost = 3787 TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput); 3788 if (VectorCallCost < Cost) { 3789 NeedToScalarize = false; 3790 Cost = VectorCallCost; 3791 } 3792 return Cost; 3793 } 3794 3795 static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) { 3796 if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy())) 3797 return Elt; 3798 return VectorType::get(Elt, VF); 3799 } 3800 3801 InstructionCost 3802 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, 3803 ElementCount VF) const { 3804 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 3805 assert(ID && "Expected intrinsic call!"); 3806 Type *RetTy = MaybeVectorizeType(CI->getType(), VF); 3807 FastMathFlags FMF; 3808 if (auto *FPMO = dyn_cast<FPMathOperator>(CI)) 3809 FMF = FPMO->getFastMathFlags(); 3810 3811 SmallVector<const Value *> Arguments(CI->args()); 3812 FunctionType *FTy = CI->getCalledFunction()->getFunctionType(); 3813 SmallVector<Type *> ParamTys; 3814 std::transform(FTy->param_begin(), FTy->param_end(), 3815 std::back_inserter(ParamTys), 3816 [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); }); 3817 3818 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF, 3819 dyn_cast<IntrinsicInst>(CI)); 3820 return TTI.getIntrinsicInstrCost(CostAttrs, 3821 TargetTransformInfo::TCK_RecipThroughput); 3822 } 3823 3824 static Type *smallestIntegerVectorType(Type *T1, Type *T2) { 3825 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3826 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3827 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; 3828 } 3829 3830 static Type *largestIntegerVectorType(Type *T1, Type *T2) { 3831 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3832 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3833 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; 3834 } 3835 3836 void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) { 3837 // For every instruction `I` in MinBWs, truncate the operands, create a 3838 // truncated version of `I` and reextend its result. InstCombine runs 3839 // later and will remove any ext/trunc pairs. 3840 SmallPtrSet<Value *, 4> Erased; 3841 for (const auto &KV : Cost->getMinimalBitwidths()) { 3842 // If the value wasn't vectorized, we must maintain the original scalar 3843 // type. The absence of the value from State indicates that it 3844 // wasn't vectorized. 3845 // FIXME: Should not rely on getVPValue at this point. 3846 VPValue *Def = State.Plan->getVPValue(KV.first, true); 3847 if (!State.hasAnyVectorValue(Def)) 3848 continue; 3849 for (unsigned Part = 0; Part < UF; ++Part) { 3850 Value *I = State.get(Def, Part); 3851 if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I)) 3852 continue; 3853 Type *OriginalTy = I->getType(); 3854 Type *ScalarTruncatedTy = 3855 IntegerType::get(OriginalTy->getContext(), KV.second); 3856 auto *TruncatedTy = VectorType::get( 3857 ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getElementCount()); 3858 if (TruncatedTy == OriginalTy) 3859 continue; 3860 3861 IRBuilder<> B(cast<Instruction>(I)); 3862 auto ShrinkOperand = [&](Value *V) -> Value * { 3863 if (auto *ZI = dyn_cast<ZExtInst>(V)) 3864 if (ZI->getSrcTy() == TruncatedTy) 3865 return ZI->getOperand(0); 3866 return B.CreateZExtOrTrunc(V, TruncatedTy); 3867 }; 3868 3869 // The actual instruction modification depends on the instruction type, 3870 // unfortunately. 
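// Illustrative sketch (assumed minimal bitwidth of 8 for an i32 add; not from
// the original):
//   %a8 = trunc <4 x i32> %a to <4 x i8>
//   %b8 = trunc <4 x i32> %b to <4 x i8>
//   %add8 = add <4 x i8> %a8, %b8
//   %res = zext <4 x i8> %add8 to <4 x i32>
// InstCombine is expected to clean up redundant trunc/zext pairs later.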
3871 Value *NewI = nullptr; 3872 if (auto *BO = dyn_cast<BinaryOperator>(I)) { 3873 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)), 3874 ShrinkOperand(BO->getOperand(1))); 3875 3876 // Any wrapping introduced by shrinking this operation shouldn't be 3877 // considered undefined behavior. So, we can't unconditionally copy 3878 // arithmetic wrapping flags to NewI. 3879 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false); 3880 } else if (auto *CI = dyn_cast<ICmpInst>(I)) { 3881 NewI = 3882 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)), 3883 ShrinkOperand(CI->getOperand(1))); 3884 } else if (auto *SI = dyn_cast<SelectInst>(I)) { 3885 NewI = B.CreateSelect(SI->getCondition(), 3886 ShrinkOperand(SI->getTrueValue()), 3887 ShrinkOperand(SI->getFalseValue())); 3888 } else if (auto *CI = dyn_cast<CastInst>(I)) { 3889 switch (CI->getOpcode()) { 3890 default: 3891 llvm_unreachable("Unhandled cast!"); 3892 case Instruction::Trunc: 3893 NewI = ShrinkOperand(CI->getOperand(0)); 3894 break; 3895 case Instruction::SExt: 3896 NewI = B.CreateSExtOrTrunc( 3897 CI->getOperand(0), 3898 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3899 break; 3900 case Instruction::ZExt: 3901 NewI = B.CreateZExtOrTrunc( 3902 CI->getOperand(0), 3903 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3904 break; 3905 } 3906 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { 3907 auto Elements0 = 3908 cast<VectorType>(SI->getOperand(0)->getType())->getElementCount(); 3909 auto *O0 = B.CreateZExtOrTrunc( 3910 SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0)); 3911 auto Elements1 = 3912 cast<VectorType>(SI->getOperand(1)->getType())->getElementCount(); 3913 auto *O1 = B.CreateZExtOrTrunc( 3914 SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1)); 3915 3916 NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask()); 3917 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) { 3918 // Don't do anything with the operands, just extend the result. 3919 continue; 3920 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { 3921 auto Elements = 3922 cast<VectorType>(IE->getOperand(0)->getType())->getElementCount(); 3923 auto *O0 = B.CreateZExtOrTrunc( 3924 IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 3925 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); 3926 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); 3927 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { 3928 auto Elements = 3929 cast<VectorType>(EE->getOperand(0)->getType())->getElementCount(); 3930 auto *O0 = B.CreateZExtOrTrunc( 3931 EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 3932 NewI = B.CreateExtractElement(O0, EE->getOperand(2)); 3933 } else { 3934 // If we don't know what to do, be conservative and don't do anything. 3935 continue; 3936 } 3937 3938 // Lastly, extend the result. 3939 NewI->takeName(cast<Instruction>(I)); 3940 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); 3941 I->replaceAllUsesWith(Res); 3942 cast<Instruction>(I)->eraseFromParent(); 3943 Erased.insert(I); 3944 State.reset(Def, Res, Part); 3945 } 3946 } 3947 3948 // We'll have created a bunch of ZExts that are now parentless. Clean up. 3949 for (const auto &KV : Cost->getMinimalBitwidths()) { 3950 // If the value wasn't vectorized, we must maintain the original scalar 3951 // type. The absence of the value from State indicates that it 3952 // wasn't vectorized. 3953 // FIXME: Should not rely on getVPValue at this point. 
3954 VPValue *Def = State.Plan->getVPValue(KV.first, true);
3955 if (!State.hasAnyVectorValue(Def))
3956 continue;
3957 for (unsigned Part = 0; Part < UF; ++Part) {
3958 Value *I = State.get(Def, Part);
3959 ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3960 if (Inst && Inst->use_empty()) {
3961 Value *NewI = Inst->getOperand(0);
3962 Inst->eraseFromParent();
3963 State.reset(Def, NewI, Part);
3964 }
3965 }
3966 }
3967 }
3968
3969 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
3970 // Insert truncates and extends for any truncated instructions as hints to
3971 // InstCombine.
3972 if (VF.isVector())
3973 truncateToMinimalBitwidths(State);
3974
3975 // Fix widened non-induction PHIs by setting up the PHI operands.
3976 if (OrigPHIsToFix.size()) {
3977 assert(EnableVPlanNativePath &&
3978 "Unexpected non-induction PHIs for fixup in non VPlan-native path");
3979 fixNonInductionPHIs(State);
3980 }
3981
3982 // At this point every instruction in the original loop is widened to a
3983 // vector form. Now we need to fix the recurrences in the loop. These PHI
3984 // nodes are currently empty because we did not want to introduce cycles.
3985 // This is the second stage of vectorizing recurrences.
3986 fixCrossIterationPHIs(State);
3987
3988 // Make SCEV forget the original loop.
3989 PSE.getSE()->forgetLoop(OrigLoop);
3990
3991 // If we inserted an edge from the middle block to the unique exit block,
3992 // update uses outside the loop (phis) to account for the newly inserted
3993 // edge.
3994 if (!Cost->requiresScalarEpilogue(VF)) {
3995 // Fix-up external users of the induction variables.
3996 for (auto &Entry : Legal->getInductionVars())
3997 fixupIVUsers(Entry.first, Entry.second,
3998 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
3999 IVEndValues[Entry.first], LoopMiddleBlock);
4000
4001 fixLCSSAPHIs(State);
4002 }
4003
4004 for (Instruction *PI : PredicatedInstructions)
4005 sinkScalarOperands(&*PI);
4006
4007 // Remove redundant induction instructions.
4008 cse(LoopVectorBody);
4009
4010 // Set/update profile weights for the vector and remainder loops as the original
4011 // loop iterations are now distributed among them. Note that the original loop,
4012 // represented by LoopScalarBody, becomes the remainder loop after vectorization.
4013 //
4014 // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
4015 // end up with a slightly roughened result, but that should be OK since the
4016 // profile is not inherently precise anyway. Note also that a possible bypass of
4017 // the vector code caused by legality checks is ignored, optimistically assigning
4018 // all the weight to the vector loop.
4019 //
4020 // For scalable vectorization we can't know at compile time how many iterations
4021 // of the loop are handled in one vector iteration, so instead assume a pessimistic
4022 // vscale of '1'.
4023 setProfileInfoAfterUnrolling(
4024 LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody),
4025 LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF);
4026 }
4027
4028 void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) {
4029 // In order to support recurrences we need to be able to vectorize Phi nodes.
4030 // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4031 // stage #2: We now need to fix the recurrences by adding incoming edges to
4032 // the currently empty PHI nodes.
At this point every instruction in the 4033 // original loop is widened to a vector form so we can use them to construct 4034 // the incoming edges. 4035 VPBasicBlock *Header = State.Plan->getEntry()->getEntryBasicBlock(); 4036 for (VPRecipeBase &R : Header->phis()) { 4037 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) 4038 fixReduction(ReductionPhi, State); 4039 else if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R)) 4040 fixFirstOrderRecurrence(FOR, State); 4041 } 4042 } 4043 4044 void InnerLoopVectorizer::fixFirstOrderRecurrence( 4045 VPFirstOrderRecurrencePHIRecipe *PhiR, VPTransformState &State) { 4046 // This is the second phase of vectorizing first-order recurrences. An 4047 // overview of the transformation is described below. Suppose we have the 4048 // following loop. 4049 // 4050 // for (int i = 0; i < n; ++i) 4051 // b[i] = a[i] - a[i - 1]; 4052 // 4053 // There is a first-order recurrence on "a". For this loop, the shorthand 4054 // scalar IR looks like: 4055 // 4056 // scalar.ph: 4057 // s_init = a[-1] 4058 // br scalar.body 4059 // 4060 // scalar.body: 4061 // i = phi [0, scalar.ph], [i+1, scalar.body] 4062 // s1 = phi [s_init, scalar.ph], [s2, scalar.body] 4063 // s2 = a[i] 4064 // b[i] = s2 - s1 4065 // br cond, scalar.body, ... 4066 // 4067 // In this example, s1 is a recurrence because it's value depends on the 4068 // previous iteration. In the first phase of vectorization, we created a 4069 // vector phi v1 for s1. We now complete the vectorization and produce the 4070 // shorthand vector IR shown below (for VF = 4, UF = 1). 4071 // 4072 // vector.ph: 4073 // v_init = vector(..., ..., ..., a[-1]) 4074 // br vector.body 4075 // 4076 // vector.body 4077 // i = phi [0, vector.ph], [i+4, vector.body] 4078 // v1 = phi [v_init, vector.ph], [v2, vector.body] 4079 // v2 = a[i, i+1, i+2, i+3]; 4080 // v3 = vector(v1(3), v2(0, 1, 2)) 4081 // b[i, i+1, i+2, i+3] = v2 - v3 4082 // br cond, vector.body, middle.block 4083 // 4084 // middle.block: 4085 // x = v2(3) 4086 // br scalar.ph 4087 // 4088 // scalar.ph: 4089 // s_init = phi [x, middle.block], [a[-1], otherwise] 4090 // br scalar.body 4091 // 4092 // After execution completes the vector loop, we extract the next value of 4093 // the recurrence (x) to use as the initial value in the scalar loop. 4094 4095 // Extract the last vector element in the middle block. This will be the 4096 // initial value for the recurrence when jumping to the scalar loop. 4097 VPValue *PreviousDef = PhiR->getBackedgeValue(); 4098 Value *Incoming = State.get(PreviousDef, UF - 1); 4099 auto *ExtractForScalar = Incoming; 4100 auto *IdxTy = Builder.getInt32Ty(); 4101 if (VF.isVector()) { 4102 auto *One = ConstantInt::get(IdxTy, 1); 4103 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4104 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF); 4105 auto *LastIdx = Builder.CreateSub(RuntimeVF, One); 4106 ExtractForScalar = Builder.CreateExtractElement(ExtractForScalar, LastIdx, 4107 "vector.recur.extract"); 4108 } 4109 // Extract the second last element in the middle block if the 4110 // Phi is used outside the loop. We need to extract the phi itself 4111 // and not the last element (the phi update in the current iteration). This 4112 // will be the value when jumping to the exit block from the LoopMiddleBlock, 4113 // when the scalar loop is not run at all. 
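// In the shorthand of the example above (VF = 4, UF = 1), the middle block
// therefore ends up with two extracts (an illustrative sketch):
//
//   vector.recur.extract         = v2(3)  ; feeds s_init in scalar.ph
//   vector.recur.extract.for.phi = v2(2)  ; feeds LCSSA phis in the exit block
//
// The last lane is the value the next scalar iteration would read, while the
// second-to-last lane is the value the phi itself held in the final iteration
// handled by the vector loop, which is what uses outside the loop observe.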
4114 Value *ExtractForPhiUsedOutsideLoop = nullptr; 4115 if (VF.isVector()) { 4116 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF); 4117 auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2)); 4118 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement( 4119 Incoming, Idx, "vector.recur.extract.for.phi"); 4120 } else if (UF > 1) 4121 // When the loop is unrolled without vectorizing, initialize 4122 // ExtractForPhiUsedOutsideLoop with the second-to-last unrolled value of 4123 // `Incoming`. This is analogous to the vectorized case above: extracting 4124 // the second-to-last element when VF > 1. 4125 ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2); 4126 4127 // Fix the initial value of the original recurrence in the scalar loop. 4128 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin()); 4129 PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue()); 4130 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init"); 4131 auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue(); 4132 for (auto *BB : predecessors(LoopScalarPreHeader)) { 4133 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit; 4134 Start->addIncoming(Incoming, BB); 4135 } 4136 4137 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start); 4138 Phi->setName("scalar.recur"); 4139 4140 // Finally, fix users of the recurrence outside the loop. The users will need 4141 // either the last value of the scalar recurrence or the last value of the 4142 // vector recurrence we extracted in the middle block. Since the loop is in 4143 // LCSSA form, we just need to find all the phi nodes for the original scalar 4144 // recurrence in the exit block, and then add an edge for the middle block. 4145 // Note that LCSSA does not imply single entry when the original scalar loop 4146 // had multiple exiting edges (as we always run the last iteration in the 4147 // scalar epilogue); in that case, there is no edge from middle to exit, 4148 // and thus no phis need to be updated. 4149 if (!Cost->requiresScalarEpilogue(VF)) 4150 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) 4151 if (llvm::is_contained(LCSSAPhi.incoming_values(), Phi)) 4152 LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock); 4153 } 4154 4155 void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR, 4156 VPTransformState &State) { 4157 PHINode *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue()); 4158 // Get its reduction variable descriptor. 4159 assert(Legal->isReductionVariable(OrigPhi) && 4160 "Unable to find the reduction variable"); 4161 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); 4162 4163 RecurKind RK = RdxDesc.getRecurrenceKind(); 4164 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue(); 4165 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr(); 4166 setDebugLocFromInst(ReductionStartValue); 4167 4168 VPValue *LoopExitInstDef = PhiR->getBackedgeValue(); 4169 // This is the vector-clone of the value that leaves the loop. 4170 Type *VecTy = State.get(LoopExitInstDef, 0)->getType(); 4171 4172 // Wrap flags are in general invalid after vectorization, clear them. 4173 clearReductionWrapFlags(RdxDesc, State); 4174 4175 // Before each round, move the insertion point right between 4176 // the PHIs and the values we are going to write. 4177 // This allows us to write both PHINodes and the extractelement 4178 // instructions.
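// As an illustrative sketch (assuming an integer add reduction, VF = 4 and
// UF = 2), the code emitted from here on leaves the middle block with roughly:
//
//   %bin.rdx = add <4 x i32> %rdx.part1, %rdx.part0
//   %rdx     = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %bin.rdx)
//
// %rdx then feeds the bc.merge.rdx phi in the scalar preheader and any LCSSA
// phi of the reduction in the exit block.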
4179 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 4180 4181 setDebugLocFromInst(LoopExitInst); 4182 4183 Type *PhiTy = OrigPhi->getType(); 4184 // If tail is folded by masking, the vector value to leave the loop should be 4185 // a Select choosing between the vectorized LoopExitInst and vectorized Phi, 4186 // instead of the former. For an inloop reduction the reduction will already 4187 // be predicated, and does not need to be handled here. 4188 if (Cost->foldTailByMasking() && !PhiR->isInLoop()) { 4189 for (unsigned Part = 0; Part < UF; ++Part) { 4190 Value *VecLoopExitInst = State.get(LoopExitInstDef, Part); 4191 Value *Sel = nullptr; 4192 for (User *U : VecLoopExitInst->users()) { 4193 if (isa<SelectInst>(U)) { 4194 assert(!Sel && "Reduction exit feeding two selects"); 4195 Sel = U; 4196 } else 4197 assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select"); 4198 } 4199 assert(Sel && "Reduction exit feeds no select"); 4200 State.reset(LoopExitInstDef, Sel, Part); 4201 4202 // If the target can create a predicated operator for the reduction at no 4203 // extra cost in the loop (for example a predicated vadd), it can be 4204 // cheaper for the select to remain in the loop than be sunk out of it, 4205 // and so use the select value for the phi instead of the old 4206 // LoopExitValue. 4207 if (PreferPredicatedReductionSelect || 4208 TTI->preferPredicatedReductionSelect( 4209 RdxDesc.getOpcode(), PhiTy, 4210 TargetTransformInfo::ReductionFlags())) { 4211 auto *VecRdxPhi = 4212 cast<PHINode>(State.get(PhiR, Part)); 4213 VecRdxPhi->setIncomingValueForBlock( 4214 LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel); 4215 } 4216 } 4217 } 4218 4219 // If the vector reduction can be performed in a smaller type, we truncate 4220 // then extend the loop exit value to enable InstCombine to evaluate the 4221 // entire expression in the smaller type. 4222 if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) { 4223 assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!"); 4224 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); 4225 Builder.SetInsertPoint( 4226 LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator()); 4227 VectorParts RdxParts(UF); 4228 for (unsigned Part = 0; Part < UF; ++Part) { 4229 RdxParts[Part] = State.get(LoopExitInstDef, Part); 4230 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4231 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy) 4232 : Builder.CreateZExt(Trunc, VecTy); 4233 for (User *U : llvm::make_early_inc_range(RdxParts[Part]->users())) 4234 if (U != Trunc) { 4235 U->replaceUsesOfWith(RdxParts[Part], Extnd); 4236 RdxParts[Part] = Extnd; 4237 } 4238 } 4239 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 4240 for (unsigned Part = 0; Part < UF; ++Part) { 4241 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4242 State.reset(LoopExitInstDef, RdxParts[Part], Part); 4243 } 4244 } 4245 4246 // Reduce all of the unrolled parts into a single vector. 4247 Value *ReducedPartRdx = State.get(LoopExitInstDef, 0); 4248 unsigned Op = RecurrenceDescriptor::getOpcode(RK); 4249 4250 // The middle block terminator has already been assigned a DebugLoc here (the 4251 // OrigLoop's single latch terminator). 
We want the whole middle block to 4252 // appear to execute on this line because: (a) it is all compiler generated, 4253 // (b) these instructions are always executed after evaluating the latch 4254 // conditional branch, and (c) other passes may add new predecessors which 4255 // terminate on this line. This is the easiest way to ensure we don't 4256 // accidentally cause an extra step back into the loop while debugging. 4257 setDebugLocFromInst(LoopMiddleBlock->getTerminator()); 4258 if (PhiR->isOrdered()) 4259 ReducedPartRdx = State.get(LoopExitInstDef, UF - 1); 4260 else { 4261 // Floating-point operations should have some FMF to enable the reduction. 4262 IRBuilderBase::FastMathFlagGuard FMFG(Builder); 4263 Builder.setFastMathFlags(RdxDesc.getFastMathFlags()); 4264 for (unsigned Part = 1; Part < UF; ++Part) { 4265 Value *RdxPart = State.get(LoopExitInstDef, Part); 4266 if (Op != Instruction::ICmp && Op != Instruction::FCmp) { 4267 ReducedPartRdx = Builder.CreateBinOp( 4268 (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx"); 4269 } else if (RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK)) 4270 ReducedPartRdx = createSelectCmpOp(Builder, ReductionStartValue, RK, 4271 ReducedPartRdx, RdxPart); 4272 else 4273 ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart); 4274 } 4275 } 4276 4277 // Create the reduction after the loop. Note that inloop reductions create the 4278 // target reduction in the loop using a Reduction recipe. 4279 if (VF.isVector() && !PhiR->isInLoop()) { 4280 ReducedPartRdx = 4281 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, OrigPhi); 4282 // If the reduction can be performed in a smaller type, we need to extend 4283 // the reduction to the wider type before we branch to the original loop. 4284 if (PhiTy != RdxDesc.getRecurrenceType()) 4285 ReducedPartRdx = RdxDesc.isSigned() 4286 ? Builder.CreateSExt(ReducedPartRdx, PhiTy) 4287 : Builder.CreateZExt(ReducedPartRdx, PhiTy); 4288 } 4289 4290 // Create a phi node that merges control-flow from the backedge-taken check 4291 // block and the middle block. 4292 PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx", 4293 LoopScalarPreHeader->getTerminator()); 4294 for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) 4295 BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]); 4296 BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); 4297 4298 // Now, we need to fix the users of the reduction variable 4299 // inside and outside of the scalar remainder loop. 4300 4301 // We know that the loop is in LCSSA form. We need to update the PHI nodes 4302 // in the exit blocks. See comment on analogous loop in 4303 // fixFirstOrderRecurrence for a more complete explaination of the logic. 4304 if (!Cost->requiresScalarEpilogue(VF)) 4305 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) 4306 if (llvm::is_contained(LCSSAPhi.incoming_values(), LoopExitInst)) 4307 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock); 4308 4309 // Fix the scalar loop reduction variable with the incoming reduction sum 4310 // from the vector body and from the backedge value. 4311 int IncomingEdgeBlockIdx = 4312 OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch()); 4313 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); 4314 // Pick the other block. 4315 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 
0 : 1); 4316 OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); 4317 OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); 4318 } 4319 4320 void InnerLoopVectorizer::clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc, 4321 VPTransformState &State) { 4322 RecurKind RK = RdxDesc.getRecurrenceKind(); 4323 if (RK != RecurKind::Add && RK != RecurKind::Mul) 4324 return; 4325 4326 Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr(); 4327 assert(LoopExitInstr && "null loop exit instruction"); 4328 SmallVector<Instruction *, 8> Worklist; 4329 SmallPtrSet<Instruction *, 8> Visited; 4330 Worklist.push_back(LoopExitInstr); 4331 Visited.insert(LoopExitInstr); 4332 4333 while (!Worklist.empty()) { 4334 Instruction *Cur = Worklist.pop_back_val(); 4335 if (isa<OverflowingBinaryOperator>(Cur)) 4336 for (unsigned Part = 0; Part < UF; ++Part) { 4337 // FIXME: Should not rely on getVPValue at this point. 4338 Value *V = State.get(State.Plan->getVPValue(Cur, true), Part); 4339 cast<Instruction>(V)->dropPoisonGeneratingFlags(); 4340 } 4341 4342 for (User *U : Cur->users()) { 4343 Instruction *UI = cast<Instruction>(U); 4344 if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) && 4345 Visited.insert(UI).second) 4346 Worklist.push_back(UI); 4347 } 4348 } 4349 } 4350 4351 void InnerLoopVectorizer::fixLCSSAPHIs(VPTransformState &State) { 4352 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 4353 if (LCSSAPhi.getBasicBlockIndex(LoopMiddleBlock) != -1) 4354 // Some phis were already hand updated by the reduction and recurrence 4355 // code above, leave them alone. 4356 continue; 4357 4358 auto *IncomingValue = LCSSAPhi.getIncomingValue(0); 4359 // Non-instruction incoming values will have only one value. 4360 4361 VPLane Lane = VPLane::getFirstLane(); 4362 if (isa<Instruction>(IncomingValue) && 4363 !Cost->isUniformAfterVectorization(cast<Instruction>(IncomingValue), 4364 VF)) 4365 Lane = VPLane::getLastLaneForVF(VF); 4366 4367 // Can be a loop invariant incoming value or the last scalar value to be 4368 // extracted from the vectorized loop. 4369 // FIXME: Should not rely on getVPValue at this point. 4370 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4371 Value *lastIncomingValue = 4372 OrigLoop->isLoopInvariant(IncomingValue) 4373 ? IncomingValue 4374 : State.get(State.Plan->getVPValue(IncomingValue, true), 4375 VPIteration(UF - 1, Lane)); 4376 LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock); 4377 } 4378 } 4379 4380 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { 4381 // The basic block and loop containing the predicated instruction. 4382 auto *PredBB = PredInst->getParent(); 4383 auto *VectorLoop = LI->getLoopFor(PredBB); 4384 4385 // Initialize a worklist with the operands of the predicated instruction. 4386 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end()); 4387 4388 // Holds instructions that we need to analyze again. An instruction may be 4389 // reanalyzed if we don't yet know if we can sink it or not. 4390 SmallVector<Instruction *, 8> InstsToReanalyze; 4391 4392 // Returns true if a given use occurs in the predicated block. Phi nodes use 4393 // their operands in their corresponding predecessor blocks. 
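// For instance (illustrative): given "%p = phi [ %x, %pred.block ], [ %y, ... ]",
// the use of %x is attributed to %pred.block rather than to the block holding
// the phi, so %x can still be considered for sinking into the predicated block.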
4394 auto isBlockOfUsePredicated = [&](Use &U) -> bool { 4395 auto *I = cast<Instruction>(U.getUser()); 4396 BasicBlock *BB = I->getParent(); 4397 if (auto *Phi = dyn_cast<PHINode>(I)) 4398 BB = Phi->getIncomingBlock( 4399 PHINode::getIncomingValueNumForOperand(U.getOperandNo())); 4400 return BB == PredBB; 4401 }; 4402 4403 // Iteratively sink the scalarized operands of the predicated instruction 4404 // into the block we created for it. When an instruction is sunk, it's 4405 // operands are then added to the worklist. The algorithm ends after one pass 4406 // through the worklist doesn't sink a single instruction. 4407 bool Changed; 4408 do { 4409 // Add the instructions that need to be reanalyzed to the worklist, and 4410 // reset the changed indicator. 4411 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end()); 4412 InstsToReanalyze.clear(); 4413 Changed = false; 4414 4415 while (!Worklist.empty()) { 4416 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val()); 4417 4418 // We can't sink an instruction if it is a phi node, is not in the loop, 4419 // or may have side effects. 4420 if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) || 4421 I->mayHaveSideEffects()) 4422 continue; 4423 4424 // If the instruction is already in PredBB, check if we can sink its 4425 // operands. In that case, VPlan's sinkScalarOperands() succeeded in 4426 // sinking the scalar instruction I, hence it appears in PredBB; but it 4427 // may have failed to sink I's operands (recursively), which we try 4428 // (again) here. 4429 if (I->getParent() == PredBB) { 4430 Worklist.insert(I->op_begin(), I->op_end()); 4431 continue; 4432 } 4433 4434 // It's legal to sink the instruction if all its uses occur in the 4435 // predicated block. Otherwise, there's nothing to do yet, and we may 4436 // need to reanalyze the instruction. 4437 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) { 4438 InstsToReanalyze.push_back(I); 4439 continue; 4440 } 4441 4442 // Move the instruction to the beginning of the predicated block, and add 4443 // it's operands to the worklist. 4444 I->moveBefore(&*PredBB->getFirstInsertionPt()); 4445 Worklist.insert(I->op_begin(), I->op_end()); 4446 4447 // The sinking may have enabled other instructions to be sunk, so we will 4448 // need to iterate. 4449 Changed = true; 4450 } 4451 } while (Changed); 4452 } 4453 4454 void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) { 4455 for (PHINode *OrigPhi : OrigPHIsToFix) { 4456 VPWidenPHIRecipe *VPPhi = 4457 cast<VPWidenPHIRecipe>(State.Plan->getVPValue(OrigPhi)); 4458 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0)); 4459 // Make sure the builder has a valid insert point. 4460 Builder.SetInsertPoint(NewPhi); 4461 for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) { 4462 VPValue *Inc = VPPhi->getIncomingValue(i); 4463 VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i); 4464 NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]); 4465 } 4466 } 4467 } 4468 4469 bool InnerLoopVectorizer::useOrderedReductions( 4470 const RecurrenceDescriptor &RdxDesc) { 4471 return Cost->useOrderedReductions(RdxDesc); 4472 } 4473 4474 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, 4475 VPWidenPHIRecipe *PhiR, 4476 VPTransformState &State) { 4477 PHINode *P = cast<PHINode>(PN); 4478 if (EnableVPlanNativePath) { 4479 // Currently we enter here in the VPlan-native path for non-induction 4480 // PHIs where all control flow is uniform. We simply widen these PHIs. 
4481 // Create a vector phi with no operands - the vector phi operands will be 4482 // set at the end of vector code generation. 4483 Type *VecTy = (State.VF.isScalar()) 4484 ? PN->getType() 4485 : VectorType::get(PN->getType(), State.VF); 4486 Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi"); 4487 State.set(PhiR, VecPhi, 0); 4488 OrigPHIsToFix.push_back(P); 4489 4490 return; 4491 } 4492 4493 assert(PN->getParent() == OrigLoop->getHeader() && 4494 "Non-header phis should have been handled elsewhere"); 4495 4496 // In order to support recurrences we need to be able to vectorize Phi nodes. 4497 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 4498 // stage #1: We create a new vector PHI node with no incoming edges. We'll use 4499 // this value when we vectorize all of the instructions that use the PHI. 4500 4501 assert(!Legal->isReductionVariable(P) && 4502 "reductions should be handled elsewhere"); 4503 4504 setDebugLocFromInst(P); 4505 4506 // This PHINode must be an induction variable. 4507 // Make sure that we know about it. 4508 assert(Legal->getInductionVars().count(P) && "Not an induction variable"); 4509 4510 InductionDescriptor II = Legal->getInductionVars().lookup(P); 4511 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 4512 4513 auto *IVR = PhiR->getParent()->getPlan()->getCanonicalIV(); 4514 PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0)); 4515 4516 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 4517 // which can be found from the original scalar operations. 4518 switch (II.getKind()) { 4519 case InductionDescriptor::IK_NoInduction: 4520 llvm_unreachable("Unknown induction"); 4521 case InductionDescriptor::IK_IntInduction: 4522 case InductionDescriptor::IK_FpInduction: 4523 llvm_unreachable("Integer/fp induction is handled elsewhere."); 4524 case InductionDescriptor::IK_PtrInduction: { 4525 // Handle the pointer induction variable case. 4526 assert(P->getType()->isPointerTy() && "Unexpected type."); 4527 4528 if (Cost->isScalarAfterVectorization(P, State.VF)) { 4529 // This is the normalized GEP that starts counting at zero. 4530 Value *PtrInd = 4531 Builder.CreateSExtOrTrunc(CanonicalIV, II.getStep()->getType()); 4532 // Determine the number of scalars we need to generate for each unroll 4533 // iteration. If the instruction is uniform, we only need to generate the 4534 // first lane. Otherwise, we generate all VF values. 4535 bool IsUniform = Cost->isUniformAfterVectorization(P, State.VF); 4536 assert((IsUniform || !State.VF.isScalable()) && 4537 "Cannot scalarize a scalable VF"); 4538 unsigned Lanes = IsUniform ? 
1 : State.VF.getFixedValue(); 4539 4540 for (unsigned Part = 0; Part < UF; ++Part) { 4541 Value *PartStart = 4542 createStepForVF(Builder, PtrInd->getType(), VF, Part); 4543 4544 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 4545 Value *Idx = Builder.CreateAdd( 4546 PartStart, ConstantInt::get(PtrInd->getType(), Lane)); 4547 Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); 4548 Value *SclrGep = emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), 4549 DL, II, State.CFG.PrevBB); 4550 SclrGep->setName("next.gep"); 4551 State.set(PhiR, SclrGep, VPIteration(Part, Lane)); 4552 } 4553 } 4554 return; 4555 } 4556 assert(isa<SCEVConstant>(II.getStep()) && 4557 "Induction step not a SCEV constant!"); 4558 Type *PhiType = II.getStep()->getType(); 4559 4560 // Build a pointer phi 4561 Value *ScalarStartValue = PhiR->getStartValue()->getLiveInIRValue(); 4562 Type *ScStValueType = ScalarStartValue->getType(); 4563 PHINode *NewPointerPhi = 4564 PHINode::Create(ScStValueType, 2, "pointer.phi", CanonicalIV); 4565 NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader); 4566 4567 // A pointer induction, performed by using a gep 4568 BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 4569 Instruction *InductionLoc = LoopLatch->getTerminator(); 4570 const SCEV *ScalarStep = II.getStep(); 4571 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 4572 Value *ScalarStepValue = 4573 Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc); 4574 Value *RuntimeVF = getRuntimeVF(Builder, PhiType, VF); 4575 Value *NumUnrolledElems = 4576 Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF)); 4577 Value *InductionGEP = GetElementPtrInst::Create( 4578 II.getElementType(), NewPointerPhi, 4579 Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind", 4580 InductionLoc); 4581 NewPointerPhi->addIncoming(InductionGEP, LoopLatch); 4582 4583 // Create UF many actual address geps that use the pointer 4584 // phi as base and a vectorized version of the step value 4585 // (<step*0, ..., step*N>) as offset. 4586 for (unsigned Part = 0; Part < State.UF; ++Part) { 4587 Type *VecPhiType = VectorType::get(PhiType, State.VF); 4588 Value *StartOffsetScalar = 4589 Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part)); 4590 Value *StartOffset = 4591 Builder.CreateVectorSplat(State.VF, StartOffsetScalar); 4592 // Create a vector of consecutive numbers from zero to VF. 4593 StartOffset = 4594 Builder.CreateAdd(StartOffset, Builder.CreateStepVector(VecPhiType)); 4595 4596 Value *GEP = Builder.CreateGEP( 4597 II.getElementType(), NewPointerPhi, 4598 Builder.CreateMul( 4599 StartOffset, Builder.CreateVectorSplat(State.VF, ScalarStepValue), 4600 "vector.gep")); 4601 State.set(PhiR, GEP, Part); 4602 } 4603 } 4604 } 4605 } 4606 4607 /// A helper function for checking whether an integer division-related 4608 /// instruction may divide by zero (in which case it must be predicated if 4609 /// executed conditionally in the scalar code). 4610 /// TODO: It may be worthwhile to generalize and check isKnownNonZero(). 4611 /// Non-zero divisors that are non compile-time constants will not be 4612 /// converted into multiplication, so we will still end up scalarizing 4613 /// the division, but can do so w/o predication. 
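/// For example (illustrative):
///   %q = udiv i32 %a, 7    ; non-zero constant divisor -> returns false
///   %r = udiv i32 %a, %b   ; divisor not a known constant -> returns true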
4614 static bool mayDivideByZero(Instruction &I) { 4615 assert((I.getOpcode() == Instruction::UDiv || 4616 I.getOpcode() == Instruction::SDiv || 4617 I.getOpcode() == Instruction::URem || 4618 I.getOpcode() == Instruction::SRem) && 4619 "Unexpected instruction"); 4620 Value *Divisor = I.getOperand(1); 4621 auto *CInt = dyn_cast<ConstantInt>(Divisor); 4622 return !CInt || CInt->isZero(); 4623 } 4624 4625 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, 4626 VPUser &ArgOperands, 4627 VPTransformState &State) { 4628 assert(!isa<DbgInfoIntrinsic>(I) && 4629 "DbgInfoIntrinsic should have been dropped during VPlan construction"); 4630 setDebugLocFromInst(&I); 4631 4632 Module *M = I.getParent()->getParent()->getParent(); 4633 auto *CI = cast<CallInst>(&I); 4634 4635 SmallVector<Type *, 4> Tys; 4636 for (Value *ArgOperand : CI->args()) 4637 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue())); 4638 4639 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 4640 4641 // The flag shows whether we use Intrinsic or a usual Call for vectorized 4642 // version of the instruction. 4643 // Is it beneficial to perform intrinsic call compared to lib call? 4644 bool NeedToScalarize = false; 4645 InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); 4646 InstructionCost IntrinsicCost = ID ? Cost->getVectorIntrinsicCost(CI, VF) : 0; 4647 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 4648 assert((UseVectorIntrinsic || !NeedToScalarize) && 4649 "Instruction should be scalarized elsewhere."); 4650 assert((IntrinsicCost.isValid() || CallCost.isValid()) && 4651 "Either the intrinsic cost or vector call cost must be valid"); 4652 4653 for (unsigned Part = 0; Part < UF; ++Part) { 4654 SmallVector<Type *, 2> TysForDecl = {CI->getType()}; 4655 SmallVector<Value *, 4> Args; 4656 for (auto &I : enumerate(ArgOperands.operands())) { 4657 // Some intrinsics have a scalar argument - don't replace it with a 4658 // vector. 4659 Value *Arg; 4660 if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index())) 4661 Arg = State.get(I.value(), Part); 4662 else { 4663 Arg = State.get(I.value(), VPIteration(0, 0)); 4664 if (hasVectorInstrinsicOverloadedScalarOpd(ID, I.index())) 4665 TysForDecl.push_back(Arg->getType()); 4666 } 4667 Args.push_back(Arg); 4668 } 4669 4670 Function *VectorF; 4671 if (UseVectorIntrinsic) { 4672 // Use vector version of the intrinsic. 4673 if (VF.isVector()) 4674 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); 4675 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); 4676 assert(VectorF && "Can't retrieve vector intrinsic."); 4677 } else { 4678 // Use vector version of the function call. 4679 const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 4680 #ifndef NDEBUG 4681 assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr && 4682 "Can't create vector function."); 4683 #endif 4684 VectorF = VFDatabase(*CI).getVectorizedFunction(Shape); 4685 } 4686 SmallVector<OperandBundleDef, 1> OpBundles; 4687 CI->getOperandBundlesAsDefs(OpBundles); 4688 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles); 4689 4690 if (isa<FPMathOperator>(V)) 4691 V->copyFastMathFlags(CI); 4692 4693 State.set(Def, V, Part); 4694 addMetadata(V, &I); 4695 } 4696 } 4697 4698 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { 4699 // We should not collect Scalars more than once per VF. 
Right now, this 4700 // function is called from collectUniformsAndScalars(), which already does 4701 // this check. Collecting Scalars for VF=1 does not make any sense. 4702 assert(VF.isVector() && Scalars.find(VF) == Scalars.end() && 4703 "This function should not be visited twice for the same VF"); 4704 4705 SmallSetVector<Instruction *, 8> Worklist; 4706 4707 // These sets are used to seed the analysis with pointers used by memory 4708 // accesses that will remain scalar. 4709 SmallSetVector<Instruction *, 8> ScalarPtrs; 4710 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; 4711 auto *Latch = TheLoop->getLoopLatch(); 4712 4713 // A helper that returns true if the use of Ptr by MemAccess will be scalar. 4714 // The pointer operands of loads and stores will be scalar as long as the 4715 // memory access is not a gather or scatter operation. The value operand of a 4716 // store will remain scalar if the store is scalarized. 4717 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) { 4718 InstWidening WideningDecision = getWideningDecision(MemAccess, VF); 4719 assert(WideningDecision != CM_Unknown && 4720 "Widening decision should be ready at this moment"); 4721 if (auto *Store = dyn_cast<StoreInst>(MemAccess)) 4722 if (Ptr == Store->getValueOperand()) 4723 return WideningDecision == CM_Scalarize; 4724 assert(Ptr == getLoadStorePointerOperand(MemAccess) && 4725 "Ptr is neither a value nor a pointer operand"); 4726 return WideningDecision != CM_GatherScatter; 4727 }; 4728 4729 // A helper that returns true if the given value is a bitcast or 4730 // getelementptr instruction contained in the loop. 4731 auto isLoopVaryingBitCastOrGEP = [&](Value *V) { 4732 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) || 4733 isa<GetElementPtrInst>(V)) && 4734 !TheLoop->isLoopInvariant(V); 4735 }; 4736 4737 // A helper that evaluates a memory access's use of a pointer. If the use will 4738 // be a scalar use and the pointer is only used by memory accesses, we place 4739 // the pointer in ScalarPtrs. Otherwise, the pointer is placed in 4740 // PossibleNonScalarPtrs. 4741 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) { 4742 // We only care about bitcast and getelementptr instructions contained in 4743 // the loop. 4744 if (!isLoopVaryingBitCastOrGEP(Ptr)) 4745 return; 4746 4747 // If the pointer has already been identified as scalar (e.g., if it was 4748 // also identified as uniform), there's nothing to do. 4749 auto *I = cast<Instruction>(Ptr); 4750 if (Worklist.count(I)) 4751 return; 4752 4753 // If the use of the pointer will be a scalar use, and all users of the 4754 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise, 4755 // place the pointer in PossibleNonScalarPtrs. 4756 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) { 4757 return isa<LoadInst>(U) || isa<StoreInst>(U); 4758 })) 4759 ScalarPtrs.insert(I); 4760 else 4761 PossibleNonScalarPtrs.insert(I); 4762 }; 4763 4764 // We seed the scalars analysis with two classes of instructions: (1) 4765 // instructions marked uniform-after-vectorization and (2) bitcast, 4766 // getelementptr and (pointer) phi instructions used by memory accesses 4767 // requiring a scalar use. 4768 // 4769 // (1) Add to the worklist all instructions that have been identified as 4770 // uniform-after-vectorization.
4771 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end()); 4772 4773 // (2) Add to the worklist all bitcast and getelementptr instructions used by 4774 // memory accesses requiring a scalar use. The pointer operands of loads and 4775 // stores will be scalar as long as the memory accesses is not a gather or 4776 // scatter operation. The value operand of a store will remain scalar if the 4777 // store is scalarized. 4778 for (auto *BB : TheLoop->blocks()) 4779 for (auto &I : *BB) { 4780 if (auto *Load = dyn_cast<LoadInst>(&I)) { 4781 evaluatePtrUse(Load, Load->getPointerOperand()); 4782 } else if (auto *Store = dyn_cast<StoreInst>(&I)) { 4783 evaluatePtrUse(Store, Store->getPointerOperand()); 4784 evaluatePtrUse(Store, Store->getValueOperand()); 4785 } 4786 } 4787 for (auto *I : ScalarPtrs) 4788 if (!PossibleNonScalarPtrs.count(I)) { 4789 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n"); 4790 Worklist.insert(I); 4791 } 4792 4793 // Insert the forced scalars. 4794 // FIXME: Currently widenPHIInstruction() often creates a dead vector 4795 // induction variable when the PHI user is scalarized. 4796 auto ForcedScalar = ForcedScalars.find(VF); 4797 if (ForcedScalar != ForcedScalars.end()) 4798 for (auto *I : ForcedScalar->second) 4799 Worklist.insert(I); 4800 4801 // Expand the worklist by looking through any bitcasts and getelementptr 4802 // instructions we've already identified as scalar. This is similar to the 4803 // expansion step in collectLoopUniforms(); however, here we're only 4804 // expanding to include additional bitcasts and getelementptr instructions. 4805 unsigned Idx = 0; 4806 while (Idx != Worklist.size()) { 4807 Instruction *Dst = Worklist[Idx++]; 4808 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0))) 4809 continue; 4810 auto *Src = cast<Instruction>(Dst->getOperand(0)); 4811 if (llvm::all_of(Src->users(), [&](User *U) -> bool { 4812 auto *J = cast<Instruction>(U); 4813 return !TheLoop->contains(J) || Worklist.count(J) || 4814 ((isa<LoadInst>(J) || isa<StoreInst>(J)) && 4815 isScalarUse(J, Src)); 4816 })) { 4817 Worklist.insert(Src); 4818 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); 4819 } 4820 } 4821 4822 // An induction variable will remain scalar if all users of the induction 4823 // variable and induction variable update remain scalar. 4824 for (auto &Induction : Legal->getInductionVars()) { 4825 auto *Ind = Induction.first; 4826 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4827 4828 // If tail-folding is applied, the primary induction variable will be used 4829 // to feed a vector compare. 4830 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking()) 4831 continue; 4832 4833 // Returns true if \p Indvar is a pointer induction that is used directly by 4834 // load/store instruction \p I. 4835 auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar, 4836 Instruction *I) { 4837 return Induction.second.getKind() == 4838 InductionDescriptor::IK_PtrInduction && 4839 (isa<LoadInst>(I) || isa<StoreInst>(I)) && 4840 Indvar == getLoadStorePointerOperand(I) && isScalarUse(I, Indvar); 4841 }; 4842 4843 // Determine if all users of the induction variable are scalar after 4844 // vectorization. 
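// For example (an illustrative sketch of the usual pattern):
//   %i      = phi i64 [ 0, %preheader ], [ %i.next, %latch ]  ; Ind
//   %i.next = add nuw nsw i64 %i, 1                           ; IndUpdate
// Both stay scalar only if each in-loop user of %i and %i.next is the other
// member of the pair, is already in the worklist, or is a direct load/store
// use of a pointer induction.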
4845 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 4846 auto *I = cast<Instruction>(U); 4847 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 4848 IsDirectLoadStoreFromPtrIndvar(Ind, I); 4849 }); 4850 if (!ScalarInd) 4851 continue; 4852 4853 // Determine if all users of the induction variable update instruction are 4854 // scalar after vectorization. 4855 auto ScalarIndUpdate = 4856 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 4857 auto *I = cast<Instruction>(U); 4858 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 4859 IsDirectLoadStoreFromPtrIndvar(IndUpdate, I); 4860 }); 4861 if (!ScalarIndUpdate) 4862 continue; 4863 4864 // The induction variable and its update instruction will remain scalar. 4865 Worklist.insert(Ind); 4866 Worklist.insert(IndUpdate); 4867 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 4868 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 4869 << "\n"); 4870 } 4871 4872 Scalars[VF].insert(Worklist.begin(), Worklist.end()); 4873 } 4874 4875 bool LoopVectorizationCostModel::isScalarWithPredication( 4876 Instruction *I, ElementCount VF) const { 4877 if (!blockNeedsPredicationForAnyReason(I->getParent())) 4878 return false; 4879 switch(I->getOpcode()) { 4880 default: 4881 break; 4882 case Instruction::Load: 4883 case Instruction::Store: { 4884 if (!Legal->isMaskRequired(I)) 4885 return false; 4886 auto *Ptr = getLoadStorePointerOperand(I); 4887 auto *Ty = getLoadStoreType(I); 4888 Type *VTy = Ty; 4889 if (VF.isVector()) 4890 VTy = VectorType::get(Ty, VF); 4891 const Align Alignment = getLoadStoreAlignment(I); 4892 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) || 4893 TTI.isLegalMaskedGather(VTy, Alignment)) 4894 : !(isLegalMaskedStore(Ty, Ptr, Alignment) || 4895 TTI.isLegalMaskedScatter(VTy, Alignment)); 4896 } 4897 case Instruction::UDiv: 4898 case Instruction::SDiv: 4899 case Instruction::SRem: 4900 case Instruction::URem: 4901 return mayDivideByZero(*I); 4902 } 4903 return false; 4904 } 4905 4906 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( 4907 Instruction *I, ElementCount VF) { 4908 assert(isAccessInterleaved(I) && "Expecting interleaved access."); 4909 assert(getWideningDecision(I, VF) == CM_Unknown && 4910 "Decision should not be set yet."); 4911 auto *Group = getInterleavedAccessGroup(I); 4912 assert(Group && "Must have a group."); 4913 4914 // If the instruction's allocated size doesn't equal it's type size, it 4915 // requires padding and will be scalarized. 4916 auto &DL = I->getModule()->getDataLayout(); 4917 auto *ScalarTy = getLoadStoreType(I); 4918 if (hasIrregularType(ScalarTy, DL)) 4919 return false; 4920 4921 // Check if masking is required. 4922 // A Group may need masking for one of two reasons: it resides in a block that 4923 // needs predication, or it was decided to use masking to deal with gaps 4924 // (either a gap at the end of a load-access that may result in a speculative 4925 // load, or any gaps in a store-access). 
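// For example (illustrative), an interleaved store group built from
//   a[2*i] = x;   // member 0 of a factor-2 group, with a[2*i+1] never written
// has fewer members than its factor, so StoreAccessWithGapsRequiresMasking
// below is true and the group can only be kept together using masked stores.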
4926 bool PredicatedAccessRequiresMasking = 4927 blockNeedsPredicationForAnyReason(I->getParent()) && 4928 Legal->isMaskRequired(I); 4929 bool LoadAccessWithGapsRequiresEpilogMasking = 4930 isa<LoadInst>(I) && Group->requiresScalarEpilogue() && 4931 !isScalarEpilogueAllowed(); 4932 bool StoreAccessWithGapsRequiresMasking = 4933 isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()); 4934 if (!PredicatedAccessRequiresMasking && 4935 !LoadAccessWithGapsRequiresEpilogMasking && 4936 !StoreAccessWithGapsRequiresMasking) 4937 return true; 4938 4939 // If masked interleaving is required, we expect that the user/target had 4940 // enabled it, because otherwise it either wouldn't have been created or 4941 // it should have been invalidated by the CostModel. 4942 assert(useMaskedInterleavedAccesses(TTI) && 4943 "Masked interleave-groups for predicated accesses are not enabled."); 4944 4945 if (Group->isReverse()) 4946 return false; 4947 4948 auto *Ty = getLoadStoreType(I); 4949 const Align Alignment = getLoadStoreAlignment(I); 4950 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment) 4951 : TTI.isLegalMaskedStore(Ty, Alignment); 4952 } 4953 4954 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( 4955 Instruction *I, ElementCount VF) { 4956 // Get and ensure we have a valid memory instruction. 4957 assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction"); 4958 4959 auto *Ptr = getLoadStorePointerOperand(I); 4960 auto *ScalarTy = getLoadStoreType(I); 4961 4962 // In order to be widened, the pointer should be consecutive, first of all. 4963 if (!Legal->isConsecutivePtr(ScalarTy, Ptr)) 4964 return false; 4965 4966 // If the instruction is a store located in a predicated block, it will be 4967 // scalarized. 4968 if (isScalarWithPredication(I, VF)) 4969 return false; 4970 4971 // If the instruction's allocated size doesn't equal it's type size, it 4972 // requires padding and will be scalarized. 4973 auto &DL = I->getModule()->getDataLayout(); 4974 if (hasIrregularType(ScalarTy, DL)) 4975 return false; 4976 4977 return true; 4978 } 4979 4980 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { 4981 // We should not collect Uniforms more than once per VF. Right now, 4982 // this function is called from collectUniformsAndScalars(), which 4983 // already does this check. Collecting Uniforms for VF=1 does not make any 4984 // sense. 4985 4986 assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() && 4987 "This function should not be visited twice for the same VF"); 4988 4989 // Visit the list of Uniforms. If we'll not find any uniform value, we'll 4990 // not analyze again. Uniforms.count(VF) will return 1. 4991 Uniforms[VF].clear(); 4992 4993 // We now know that the loop is vectorizable! 4994 // Collect instructions inside the loop that will remain uniform after 4995 // vectorization. 4996 4997 // Global values, params and instructions outside of current loop are out of 4998 // scope. 4999 auto isOutOfScope = [&](Value *V) -> bool { 5000 Instruction *I = dyn_cast<Instruction>(V); 5001 return (!I || !TheLoop->contains(I)); 5002 }; 5003 5004 // Worklist containing uniform instructions demanding lane 0. 5005 SetVector<Instruction *> Worklist; 5006 BasicBlock *Latch = TheLoop->getLoopLatch(); 5007 5008 // Add uniform instructions demanding lane 0 to the worklist. 
Instructions 5009 // that are scalar with predication must not be considered uniform after 5010 // vectorization, because that would create an erroneous replicating region 5011 // where only a single instance out of VF should be formed. 5012 // TODO: optimize such seldom cases if found important, see PR40816. 5013 auto addToWorklistIfAllowed = [&](Instruction *I) -> void { 5014 if (isOutOfScope(I)) { 5015 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: " 5016 << *I << "\n"); 5017 return; 5018 } 5019 if (isScalarWithPredication(I, VF)) { 5020 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " 5021 << *I << "\n"); 5022 return; 5023 } 5024 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n"); 5025 Worklist.insert(I); 5026 }; 5027 5028 // Start with the conditional branch. If the branch condition is an 5029 // instruction contained in the loop that is only used by the branch, it is 5030 // uniform. 5031 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 5032 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) 5033 addToWorklistIfAllowed(Cmp); 5034 5035 auto isUniformDecision = [&](Instruction *I, ElementCount VF) { 5036 InstWidening WideningDecision = getWideningDecision(I, VF); 5037 assert(WideningDecision != CM_Unknown && 5038 "Widening decision should be ready at this moment"); 5039 5040 // A uniform memory op is itself uniform. We exclude uniform stores 5041 // here as they demand the last lane, not the first one. 5042 if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) { 5043 assert(WideningDecision == CM_Scalarize); 5044 return true; 5045 } 5046 5047 return (WideningDecision == CM_Widen || 5048 WideningDecision == CM_Widen_Reverse || 5049 WideningDecision == CM_Interleave); 5050 }; 5051 5052 5053 // Returns true if Ptr is the pointer operand of a memory access instruction 5054 // I, and I is known to not require scalarization. 5055 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { 5056 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); 5057 }; 5058 5059 // Holds a list of values which are known to have at least one uniform use. 5060 // Note that there may be other uses which aren't uniform. A "uniform use" 5061 // here is something which only demands lane 0 of the unrolled iterations; 5062 // it does not imply that all lanes produce the same value (e.g. this is not 5063 // the usual meaning of uniform) 5064 SetVector<Value *> HasUniformUse; 5065 5066 // Scan the loop for instructions which are either a) known to have only 5067 // lane 0 demanded or b) are uses which demand only lane 0 of their operand. 5068 for (auto *BB : TheLoop->blocks()) 5069 for (auto &I : *BB) { 5070 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) { 5071 switch (II->getIntrinsicID()) { 5072 case Intrinsic::sideeffect: 5073 case Intrinsic::experimental_noalias_scope_decl: 5074 case Intrinsic::assume: 5075 case Intrinsic::lifetime_start: 5076 case Intrinsic::lifetime_end: 5077 if (TheLoop->hasLoopInvariantOperands(&I)) 5078 addToWorklistIfAllowed(&I); 5079 break; 5080 default: 5081 break; 5082 } 5083 } 5084 5085 // ExtractValue instructions must be uniform, because the operands are 5086 // known to be loop-invariant. 
5087 if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) { 5088 assert(isOutOfScope(EVI->getAggregateOperand()) && 5089 "Expected aggregate value to be loop invariant"); 5090 addToWorklistIfAllowed(EVI); 5091 continue; 5092 } 5093 5094 // If there's no pointer operand, there's nothing to do. 5095 auto *Ptr = getLoadStorePointerOperand(&I); 5096 if (!Ptr) 5097 continue; 5098 5099 // A uniform memory op is itself uniform. We exclude uniform stores 5100 // here as they demand the last lane, not the first one. 5101 if (isa<LoadInst>(I) && Legal->isUniformMemOp(I)) 5102 addToWorklistIfAllowed(&I); 5103 5104 if (isUniformDecision(&I, VF)) { 5105 assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check"); 5106 HasUniformUse.insert(Ptr); 5107 } 5108 } 5109 5110 // Add to the worklist any operands which have *only* uniform (e.g. lane 0 5111 // demanding) users. Since loops are assumed to be in LCSSA form, this 5112 // disallows uses outside the loop as well. 5113 for (auto *V : HasUniformUse) { 5114 if (isOutOfScope(V)) 5115 continue; 5116 auto *I = cast<Instruction>(V); 5117 auto UsersAreMemAccesses = 5118 llvm::all_of(I->users(), [&](User *U) -> bool { 5119 return isVectorizedMemAccessUse(cast<Instruction>(U), V); 5120 }); 5121 if (UsersAreMemAccesses) 5122 addToWorklistIfAllowed(I); 5123 } 5124 5125 // Expand Worklist in topological order: whenever a new instruction 5126 // is added , its users should be already inside Worklist. It ensures 5127 // a uniform instruction will only be used by uniform instructions. 5128 unsigned idx = 0; 5129 while (idx != Worklist.size()) { 5130 Instruction *I = Worklist[idx++]; 5131 5132 for (auto OV : I->operand_values()) { 5133 // isOutOfScope operands cannot be uniform instructions. 5134 if (isOutOfScope(OV)) 5135 continue; 5136 // First order recurrence Phi's should typically be considered 5137 // non-uniform. 5138 auto *OP = dyn_cast<PHINode>(OV); 5139 if (OP && Legal->isFirstOrderRecurrence(OP)) 5140 continue; 5141 // If all the users of the operand are uniform, then add the 5142 // operand into the uniform worklist. 5143 auto *OI = cast<Instruction>(OV); 5144 if (llvm::all_of(OI->users(), [&](User *U) -> bool { 5145 auto *J = cast<Instruction>(U); 5146 return Worklist.count(J) || isVectorizedMemAccessUse(J, OI); 5147 })) 5148 addToWorklistIfAllowed(OI); 5149 } 5150 } 5151 5152 // For an instruction to be added into Worklist above, all its users inside 5153 // the loop should also be in Worklist. However, this condition cannot be 5154 // true for phi nodes that form a cyclic dependence. We must process phi 5155 // nodes separately. An induction variable will remain uniform if all users 5156 // of the induction variable and induction variable update remain uniform. 5157 // The code below handles both pointer and non-pointer induction variables. 5158 for (auto &Induction : Legal->getInductionVars()) { 5159 auto *Ind = Induction.first; 5160 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 5161 5162 // Determine if all users of the induction variable are uniform after 5163 // vectorization. 5164 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 5165 auto *I = cast<Instruction>(U); 5166 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 5167 isVectorizedMemAccessUse(I, Ind); 5168 }); 5169 if (!UniformInd) 5170 continue; 5171 5172 // Determine if all users of the induction variable update instruction are 5173 // uniform after vectorization. 
5174 auto UniformIndUpdate = 5175 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 5176 auto *I = cast<Instruction>(U); 5177 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 5178 isVectorizedMemAccessUse(I, IndUpdate); 5179 }); 5180 if (!UniformIndUpdate) 5181 continue; 5182 5183 // The induction variable and its update instruction will remain uniform. 5184 addToWorklistIfAllowed(Ind); 5185 addToWorklistIfAllowed(IndUpdate); 5186 } 5187 5188 Uniforms[VF].insert(Worklist.begin(), Worklist.end()); 5189 } 5190 5191 bool LoopVectorizationCostModel::runtimeChecksRequired() { 5192 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); 5193 5194 if (Legal->getRuntimePointerChecking()->Need) { 5195 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz", 5196 "runtime pointer checks needed. Enable vectorization of this " 5197 "loop with '#pragma clang loop vectorize(enable)' when " 5198 "compiling with -Os/-Oz", 5199 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5200 return true; 5201 } 5202 5203 if (!PSE.getUnionPredicate().getPredicates().empty()) { 5204 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz", 5205 "runtime SCEV checks needed. Enable vectorization of this " 5206 "loop with '#pragma clang loop vectorize(enable)' when " 5207 "compiling with -Os/-Oz", 5208 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5209 return true; 5210 } 5211 5212 // FIXME: Avoid specializing for stride==1 instead of bailing out. 5213 if (!Legal->getLAI()->getSymbolicStrides().empty()) { 5214 reportVectorizationFailure("Runtime stride check for small trip count", 5215 "runtime stride == 1 checks needed. Enable vectorization of " 5216 "this loop without such check by compiling with -Os/-Oz", 5217 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5218 return true; 5219 } 5220 5221 return false; 5222 } 5223 5224 ElementCount 5225 LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) { 5226 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) 5227 return ElementCount::getScalable(0); 5228 5229 if (Hints->isScalableVectorizationDisabled()) { 5230 reportVectorizationInfo("Scalable vectorization is explicitly disabled", 5231 "ScalableVectorizationDisabled", ORE, TheLoop); 5232 return ElementCount::getScalable(0); 5233 } 5234 5235 LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n"); 5236 5237 auto MaxScalableVF = ElementCount::getScalable( 5238 std::numeric_limits<ElementCount::ScalarTy>::max()); 5239 5240 // Test that the loop-vectorizer can legalize all operations for this MaxVF. 5241 // FIXME: While for scalable vectors this is currently sufficient, this should 5242 // be replaced by a more detailed mechanism that filters out specific VFs, 5243 // instead of invalidating vectorization for a whole set of VFs based on the 5244 // MaxVF. 5245 5246 // Disable scalable vectorization if the loop contains unsupported reductions. 5247 if (!canVectorizeReductions(MaxScalableVF)) { 5248 reportVectorizationInfo( 5249 "Scalable vectorization not supported for the reduction " 5250 "operations found in this loop.", 5251 "ScalableVFUnfeasible", ORE, TheLoop); 5252 return ElementCount::getScalable(0); 5253 } 5254 5255 // Disable scalable vectorization if the loop contains any instructions 5256 // with element types not supported for scalable vectors. 
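// For example (illustrative): on a target whose scalable vectors can only hold
// 8/16/32/64-bit integer and floating-point elements, a loop computing on
// i128 or x86_fp80 values is rejected here, leaving only fixed-width VFs as
// candidates.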
5257 if (any_of(ElementTypesInLoop, [&](Type *Ty) { 5258 return !Ty->isVoidTy() && 5259 !this->TTI.isElementTypeLegalForScalableVector(Ty); 5260 })) { 5261 reportVectorizationInfo("Scalable vectorization is not supported " 5262 "for all element types found in this loop.", 5263 "ScalableVFUnfeasible", ORE, TheLoop); 5264 return ElementCount::getScalable(0); 5265 } 5266 5267 if (Legal->isSafeForAnyVectorWidth()) 5268 return MaxScalableVF; 5269 5270 // Limit MaxScalableVF by the maximum safe dependence distance. 5271 Optional<unsigned> MaxVScale = TTI.getMaxVScale(); 5272 if (!MaxVScale && TheFunction->hasFnAttribute(Attribute::VScaleRange)) 5273 MaxVScale = 5274 TheFunction->getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax(); 5275 MaxScalableVF = ElementCount::getScalable( 5276 MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0); 5277 if (!MaxScalableVF) 5278 reportVectorizationInfo( 5279 "Max legal vector width too small, scalable vectorization " 5280 "unfeasible.", 5281 "ScalableVFUnfeasible", ORE, TheLoop); 5282 5283 return MaxScalableVF; 5284 } 5285 5286 FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF( 5287 unsigned ConstTripCount, ElementCount UserVF, bool FoldTailByMasking) { 5288 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); 5289 unsigned SmallestType, WidestType; 5290 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); 5291 5292 // Get the maximum safe dependence distance in bits computed by LAA. 5293 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from 5294 // the memory accesses that is most restrictive (involved in the smallest 5295 // dependence distance). 5296 unsigned MaxSafeElements = 5297 PowerOf2Floor(Legal->getMaxSafeVectorWidthInBits() / WidestType); 5298 5299 auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements); 5300 auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements); 5301 5302 LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF 5303 << ".\n"); 5304 LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF 5305 << ".\n"); 5306 5307 // First analyze the UserVF, fall back if the UserVF should be ignored. 5308 if (UserVF) { 5309 auto MaxSafeUserVF = 5310 UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF; 5311 5312 if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) { 5313 // If `VF=vscale x N` is safe, then so is `VF=N` 5314 if (UserVF.isScalable()) 5315 return FixedScalableVFPair( 5316 ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF); 5317 else 5318 return UserVF; 5319 } 5320 5321 assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF)); 5322 5323 // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it 5324 // is better to ignore the hint and let the compiler choose a suitable VF. 
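// For example (illustrative): with MaxSafeFixedVF = 8, a fixed-width request of
// VF = 16 (e.g. from "#pragma clang loop vectorize_width(16)") is clamped to 8
// below and a remark is emitted, whereas an unsafe scalable request such as
// VF = vscale x 16 is ignored entirely so the compiler can pick its own VF.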
5325 if (!UserVF.isScalable()) { 5326 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 5327 << " is unsafe, clamping to max safe VF=" 5328 << MaxSafeFixedVF << ".\n"); 5329 ORE->emit([&]() { 5330 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 5331 TheLoop->getStartLoc(), 5332 TheLoop->getHeader()) 5333 << "User-specified vectorization factor " 5334 << ore::NV("UserVectorizationFactor", UserVF) 5335 << " is unsafe, clamping to maximum safe vectorization factor " 5336 << ore::NV("VectorizationFactor", MaxSafeFixedVF); 5337 }); 5338 return MaxSafeFixedVF; 5339 } 5340 5341 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) { 5342 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 5343 << " is ignored because scalable vectors are not " 5344 "available.\n"); 5345 ORE->emit([&]() { 5346 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 5347 TheLoop->getStartLoc(), 5348 TheLoop->getHeader()) 5349 << "User-specified vectorization factor " 5350 << ore::NV("UserVectorizationFactor", UserVF) 5351 << " is ignored because the target does not support scalable " 5352 "vectors. The compiler will pick a more suitable value."; 5353 }); 5354 } else { 5355 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 5356 << " is unsafe. Ignoring scalable UserVF.\n"); 5357 ORE->emit([&]() { 5358 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 5359 TheLoop->getStartLoc(), 5360 TheLoop->getHeader()) 5361 << "User-specified vectorization factor " 5362 << ore::NV("UserVectorizationFactor", UserVF) 5363 << " is unsafe. Ignoring the hint to let the compiler pick a " 5364 "more suitable value."; 5365 }); 5366 } 5367 } 5368 5369 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType 5370 << " / " << WidestType << " bits.\n"); 5371 5372 FixedScalableVFPair Result(ElementCount::getFixed(1), 5373 ElementCount::getScalable(0)); 5374 if (auto MaxVF = 5375 getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType, 5376 MaxSafeFixedVF, FoldTailByMasking)) 5377 Result.FixedVF = MaxVF; 5378 5379 if (auto MaxVF = 5380 getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType, 5381 MaxSafeScalableVF, FoldTailByMasking)) 5382 if (MaxVF.isScalable()) { 5383 Result.ScalableVF = MaxVF; 5384 LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF 5385 << "\n"); 5386 } 5387 5388 return Result; 5389 } 5390 5391 FixedScalableVFPair 5392 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { 5393 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { 5394 // TODO: It may by useful to do since it's still likely to be dynamically 5395 // uniform if the target can skip. 5396 reportVectorizationFailure( 5397 "Not inserting runtime ptr check for divergent target", 5398 "runtime pointer checks needed. 
                             Not enabled for divergent target",
        "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
    return FixedScalableVFPair::getNone();
  }

  unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
  LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
  if (TC == 1) {
    reportVectorizationFailure(
        "Single iteration (non) loop",
        "loop trip count is one, irrelevant for vectorization",
        "SingleIterationLoop", ORE, TheLoop);
    return FixedScalableVFPair::getNone();
  }

  switch (ScalarEpilogueStatus) {
  case CM_ScalarEpilogueAllowed:
    return computeFeasibleMaxVF(TC, UserVF, false);
  case CM_ScalarEpilogueNotAllowedUsePredicate:
    LLVM_FALLTHROUGH;
  case CM_ScalarEpilogueNotNeededUsePredicate:
    LLVM_DEBUG(
        dbgs() << "LV: vector predicate hint/switch found.\n"
               << "LV: Not allowing scalar epilogue, creating predicated "
               << "vector loop.\n");
    break;
  case CM_ScalarEpilogueNotAllowedLowTripLoop:
    // fallthrough as a special case of OptForSize
  case CM_ScalarEpilogueNotAllowedOptSize:
    if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
      LLVM_DEBUG(
          dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
    else
      LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
                        << "count.\n");

    // Bail if runtime checks are required, which are not good when optimising
    // for size.
    if (runtimeChecksRequired())
      return FixedScalableVFPair::getNone();

    break;
  }

  // The only loops we can vectorize without a scalar epilogue are loops with
  // a bottom-test and a single exiting block. We'd have to handle the fact
  // that not every instruction executes on the last iteration. This will
  // require a lane mask which varies through the vector loop body. (TODO)
  if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
    // If there was a tail-folding hint/switch, but we can't fold the tail by
    // masking, fall back to a vectorization with a scalar epilogue.
    if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
      LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
                           "scalar epilogue instead.\n");
      ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
      return computeFeasibleMaxVF(TC, UserVF, false);
    }
    return FixedScalableVFPair::getNone();
  }

  // Now try tail folding.

  // Invalidate interleave groups that require an epilogue if we can't mask
  // the interleave-group.
  if (!useMaskedInterleavedAccesses(TTI)) {
    assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
           "No decisions should have been taken at this point");
    // Note: There is no need to invalidate any cost modeling decisions here,
    // as none were taken so far.
    InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
  }

  FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF, true);
  // Avoid tail folding if the trip count is known to be a multiple of any VF
  // we chose.
  // FIXME: The condition below pessimises the case for fixed-width vectors,
  // when scalable VFs are also candidates for vectorization.
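  // For example, if the exit count is known (or guarded) to be a multiple of
  // 32, MaxFixedVF is 8, no interleave count was forced and there is no
  // scalable candidate, the remainder computed below is zero, so the tail can
  // be skipped entirely without folding.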
5474 if (MaxFactors.FixedVF.isVector() && !MaxFactors.ScalableVF) { 5475 ElementCount MaxFixedVF = MaxFactors.FixedVF; 5476 assert((UserVF.isNonZero() || isPowerOf2_32(MaxFixedVF.getFixedValue())) && 5477 "MaxFixedVF must be a power of 2"); 5478 unsigned MaxVFtimesIC = UserIC ? MaxFixedVF.getFixedValue() * UserIC 5479 : MaxFixedVF.getFixedValue(); 5480 ScalarEvolution *SE = PSE.getSE(); 5481 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 5482 const SCEV *ExitCount = SE->getAddExpr( 5483 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 5484 const SCEV *Rem = SE->getURemExpr( 5485 SE->applyLoopGuards(ExitCount, TheLoop), 5486 SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC)); 5487 if (Rem->isZero()) { 5488 // Accept MaxFixedVF if we do not have a tail. 5489 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); 5490 return MaxFactors; 5491 } 5492 } 5493 5494 // For scalable vectors don't use tail folding for low trip counts or 5495 // optimizing for code size. We only permit this if the user has explicitly 5496 // requested it. 5497 if (ScalarEpilogueStatus != CM_ScalarEpilogueNotNeededUsePredicate && 5498 ScalarEpilogueStatus != CM_ScalarEpilogueNotAllowedUsePredicate && 5499 MaxFactors.ScalableVF.isVector()) 5500 MaxFactors.ScalableVF = ElementCount::getScalable(0); 5501 5502 // If we don't know the precise trip count, or if the trip count that we 5503 // found modulo the vectorization factor is not zero, try to fold the tail 5504 // by masking. 5505 // FIXME: look for a smaller MaxVF that does divide TC rather than masking. 5506 if (Legal->prepareToFoldTailByMasking()) { 5507 FoldTailByMasking = true; 5508 return MaxFactors; 5509 } 5510 5511 // If there was a tail-folding hint/switch, but we can't fold the tail by 5512 // masking, fallback to a vectorization with a scalar epilogue. 5513 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { 5514 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " 5515 "scalar epilogue instead.\n"); 5516 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 5517 return MaxFactors; 5518 } 5519 5520 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) { 5521 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n"); 5522 return FixedScalableVFPair::getNone(); 5523 } 5524 5525 if (TC == 0) { 5526 reportVectorizationFailure( 5527 "Unable to calculate the loop count due to complex control flow", 5528 "unable to calculate the loop count due to complex control flow", 5529 "UnknownLoopCountComplexCFG", ORE, TheLoop); 5530 return FixedScalableVFPair::getNone(); 5531 } 5532 5533 reportVectorizationFailure( 5534 "Cannot optimize for size and vectorize at the same time.", 5535 "cannot optimize for size and vectorize at the same time. " 5536 "Enable vectorization of this loop with '#pragma clang loop " 5537 "vectorize(enable)' when compiling with -Os/-Oz", 5538 "NoTailLoopWithOptForSize", ORE, TheLoop); 5539 return FixedScalableVFPair::getNone(); 5540 } 5541 5542 ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( 5543 unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType, 5544 const ElementCount &MaxSafeVF, bool FoldTailByMasking) { 5545 bool ComputeScalableMaxVF = MaxSafeVF.isScalable(); 5546 TypeSize WidestRegister = TTI.getRegisterBitWidth( 5547 ComputeScalableMaxVF ? 
TargetTransformInfo::RGK_ScalableVector 5548 : TargetTransformInfo::RGK_FixedWidthVector); 5549 5550 // Convenience function to return the minimum of two ElementCounts. 5551 auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) { 5552 assert((LHS.isScalable() == RHS.isScalable()) && 5553 "Scalable flags must match"); 5554 return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS; 5555 }; 5556 5557 // Ensure MaxVF is a power of 2; the dependence distance bound may not be. 5558 // Note that both WidestRegister and WidestType may not be a powers of 2. 5559 auto MaxVectorElementCount = ElementCount::get( 5560 PowerOf2Floor(WidestRegister.getKnownMinSize() / WidestType), 5561 ComputeScalableMaxVF); 5562 MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF); 5563 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " 5564 << (MaxVectorElementCount * WidestType) << " bits.\n"); 5565 5566 if (!MaxVectorElementCount) { 5567 LLVM_DEBUG(dbgs() << "LV: The target has no " 5568 << (ComputeScalableMaxVF ? "scalable" : "fixed") 5569 << " vector registers.\n"); 5570 return ElementCount::getFixed(1); 5571 } 5572 5573 const auto TripCountEC = ElementCount::getFixed(ConstTripCount); 5574 if (ConstTripCount && 5575 ElementCount::isKnownLE(TripCountEC, MaxVectorElementCount) && 5576 (!FoldTailByMasking || isPowerOf2_32(ConstTripCount))) { 5577 // If loop trip count (TC) is known at compile time there is no point in 5578 // choosing VF greater than TC (as done in the loop below). Select maximum 5579 // power of two which doesn't exceed TC. 5580 // If MaxVectorElementCount is scalable, we only fall back on a fixed VF 5581 // when the TC is less than or equal to the known number of lanes. 5582 auto ClampedConstTripCount = PowerOf2Floor(ConstTripCount); 5583 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not " 5584 "exceeding the constant trip count: " 5585 << ClampedConstTripCount << "\n"); 5586 return ElementCount::getFixed(ClampedConstTripCount); 5587 } 5588 5589 ElementCount MaxVF = MaxVectorElementCount; 5590 if (TTI.shouldMaximizeVectorBandwidth() || 5591 (MaximizeBandwidth && isScalarEpilogueAllowed())) { 5592 auto MaxVectorElementCountMaxBW = ElementCount::get( 5593 PowerOf2Floor(WidestRegister.getKnownMinSize() / SmallestType), 5594 ComputeScalableMaxVF); 5595 MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF); 5596 5597 // Collect all viable vectorization factors larger than the default MaxVF 5598 // (i.e. MaxVectorElementCount). 5599 SmallVector<ElementCount, 8> VFs; 5600 for (ElementCount VS = MaxVectorElementCount * 2; 5601 ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2) 5602 VFs.push_back(VS); 5603 5604 // For each VF calculate its register usage. 5605 auto RUs = calculateRegisterUsage(VFs); 5606 5607 // Select the largest VF which doesn't require more registers than existing 5608 // ones. 
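    // For instance, if the candidate VFs beyond the default are {8, 16} and
    // VF=16 needs more registers of some register class than the target
    // provides while VF=8 fits, VF=8 is chosen here.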
5609 for (int i = RUs.size() - 1; i >= 0; --i) { 5610 bool Selected = true; 5611 for (auto &pair : RUs[i].MaxLocalUsers) { 5612 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5613 if (pair.second > TargetNumRegisters) 5614 Selected = false; 5615 } 5616 if (Selected) { 5617 MaxVF = VFs[i]; 5618 break; 5619 } 5620 } 5621 if (ElementCount MinVF = 5622 TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) { 5623 if (ElementCount::isKnownLT(MaxVF, MinVF)) { 5624 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF 5625 << ") with target's minimum: " << MinVF << '\n'); 5626 MaxVF = MinVF; 5627 } 5628 } 5629 } 5630 return MaxVF; 5631 } 5632 5633 bool LoopVectorizationCostModel::isMoreProfitable( 5634 const VectorizationFactor &A, const VectorizationFactor &B) const { 5635 InstructionCost CostA = A.Cost; 5636 InstructionCost CostB = B.Cost; 5637 5638 unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop); 5639 5640 if (!A.Width.isScalable() && !B.Width.isScalable() && FoldTailByMasking && 5641 MaxTripCount) { 5642 // If we are folding the tail and the trip count is a known (possibly small) 5643 // constant, the trip count will be rounded up to an integer number of 5644 // iterations. The total cost will be PerIterationCost*ceil(TripCount/VF), 5645 // which we compare directly. When not folding the tail, the total cost will 5646 // be PerIterationCost*floor(TC/VF) + Scalar remainder cost, and so is 5647 // approximated with the per-lane cost below instead of using the tripcount 5648 // as here. 5649 auto RTCostA = CostA * divideCeil(MaxTripCount, A.Width.getFixedValue()); 5650 auto RTCostB = CostB * divideCeil(MaxTripCount, B.Width.getFixedValue()); 5651 return RTCostA < RTCostB; 5652 } 5653 5654 // Improve estimate for the vector width if it is scalable. 5655 unsigned EstimatedWidthA = A.Width.getKnownMinValue(); 5656 unsigned EstimatedWidthB = B.Width.getKnownMinValue(); 5657 if (Optional<unsigned> VScale = TTI.getVScaleForTuning()) { 5658 if (A.Width.isScalable()) 5659 EstimatedWidthA *= VScale.getValue(); 5660 if (B.Width.isScalable()) 5661 EstimatedWidthB *= VScale.getValue(); 5662 } 5663 5664 // Assume vscale may be larger than 1 (or the value being tuned for), 5665 // so that scalable vectorization is slightly favorable over fixed-width 5666 // vectorization. 5667 if (A.Width.isScalable() && !B.Width.isScalable()) 5668 return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA); 5669 5670 // To avoid the need for FP division: 5671 // (CostA / A.Width) < (CostB / B.Width) 5672 // <=> (CostA * B.Width) < (CostB * A.Width) 5673 return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA); 5674 } 5675 5676 VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor( 5677 const ElementCountSet &VFCandidates) { 5678 InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first; 5679 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n"); 5680 assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop"); 5681 assert(VFCandidates.count(ElementCount::getFixed(1)) && 5682 "Expected Scalar VF to be a candidate"); 5683 5684 const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost); 5685 VectorizationFactor ChosenFactor = ScalarCost; 5686 5687 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; 5688 if (ForceVectorization && VFCandidates.size() > 1) { 5689 // Ignore scalar width, because the user explicitly wants vectorization. 
5690 // Initialize cost to max so that VF = 2 is, at least, chosen during cost 5691 // evaluation. 5692 ChosenFactor.Cost = InstructionCost::getMax(); 5693 } 5694 5695 SmallVector<InstructionVFPair> InvalidCosts; 5696 for (const auto &i : VFCandidates) { 5697 // The cost for scalar VF=1 is already calculated, so ignore it. 5698 if (i.isScalar()) 5699 continue; 5700 5701 VectorizationCostTy C = expectedCost(i, &InvalidCosts); 5702 VectorizationFactor Candidate(i, C.first); 5703 5704 #ifndef NDEBUG 5705 unsigned AssumedMinimumVscale = 1; 5706 if (Optional<unsigned> VScale = TTI.getVScaleForTuning()) 5707 AssumedMinimumVscale = VScale.getValue(); 5708 unsigned Width = 5709 Candidate.Width.isScalable() 5710 ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale 5711 : Candidate.Width.getFixedValue(); 5712 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i 5713 << " costs: " << (Candidate.Cost / Width)); 5714 if (i.isScalable()) 5715 LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of " 5716 << AssumedMinimumVscale << ")"); 5717 LLVM_DEBUG(dbgs() << ".\n"); 5718 #endif 5719 5720 if (!C.second && !ForceVectorization) { 5721 LLVM_DEBUG( 5722 dbgs() << "LV: Not considering vector loop of width " << i 5723 << " because it will not generate any vector instructions.\n"); 5724 continue; 5725 } 5726 5727 // If profitable add it to ProfitableVF list. 5728 if (isMoreProfitable(Candidate, ScalarCost)) 5729 ProfitableVFs.push_back(Candidate); 5730 5731 if (isMoreProfitable(Candidate, ChosenFactor)) 5732 ChosenFactor = Candidate; 5733 } 5734 5735 // Emit a report of VFs with invalid costs in the loop. 5736 if (!InvalidCosts.empty()) { 5737 // Group the remarks per instruction, keeping the instruction order from 5738 // InvalidCosts. 5739 std::map<Instruction *, unsigned> Numbering; 5740 unsigned I = 0; 5741 for (auto &Pair : InvalidCosts) 5742 if (!Numbering.count(Pair.first)) 5743 Numbering[Pair.first] = I++; 5744 5745 // Sort the list, first on instruction(number) then on VF. 5746 llvm::sort(InvalidCosts, 5747 [&Numbering](InstructionVFPair &A, InstructionVFPair &B) { 5748 if (Numbering[A.first] != Numbering[B.first]) 5749 return Numbering[A.first] < Numbering[B.first]; 5750 ElementCountComparator ECC; 5751 return ECC(A.second, B.second); 5752 }); 5753 5754 // For a list of ordered instruction-vf pairs: 5755 // [(load, vf1), (load, vf2), (store, vf1)] 5756 // Group the instructions together to emit separate remarks for: 5757 // load (vf1, vf2) 5758 // store (vf1) 5759 auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts); 5760 auto Subset = ArrayRef<InstructionVFPair>(); 5761 do { 5762 if (Subset.empty()) 5763 Subset = Tail.take_front(1); 5764 5765 Instruction *I = Subset.front().first; 5766 5767 // If the next instruction is different, or if there are no other pairs, 5768 // emit a remark for the collated subset. e.g. 5769 // [(load, vf1), (load, vf2))] 5770 // to emit: 5771 // remark: invalid costs for 'load' at VF=(vf, vf2) 5772 if (Subset == Tail || Tail[Subset.size()].first != I) { 5773 std::string OutString; 5774 raw_string_ostream OS(OutString); 5775 assert(!Subset.empty() && "Unexpected empty range"); 5776 OS << "Instruction with invalid costs prevented vectorization at VF=("; 5777 for (auto &Pair : Subset) 5778 OS << (Pair.second == Subset.front().second ? 
"" : ", ") 5779 << Pair.second; 5780 OS << "):"; 5781 if (auto *CI = dyn_cast<CallInst>(I)) 5782 OS << " call to " << CI->getCalledFunction()->getName(); 5783 else 5784 OS << " " << I->getOpcodeName(); 5785 OS.flush(); 5786 reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I); 5787 Tail = Tail.drop_front(Subset.size()); 5788 Subset = {}; 5789 } else 5790 // Grow the subset by one element 5791 Subset = Tail.take_front(Subset.size() + 1); 5792 } while (!Tail.empty()); 5793 } 5794 5795 if (!EnableCondStoresVectorization && NumPredStores) { 5796 reportVectorizationFailure("There are conditional stores.", 5797 "store that is conditionally executed prevents vectorization", 5798 "ConditionalStore", ORE, TheLoop); 5799 ChosenFactor = ScalarCost; 5800 } 5801 5802 LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() && 5803 ChosenFactor.Cost >= ScalarCost.Cost) dbgs() 5804 << "LV: Vectorization seems to be not beneficial, " 5805 << "but was forced by a user.\n"); 5806 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n"); 5807 return ChosenFactor; 5808 } 5809 5810 bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization( 5811 const Loop &L, ElementCount VF) const { 5812 // Cross iteration phis such as reductions need special handling and are 5813 // currently unsupported. 5814 if (any_of(L.getHeader()->phis(), [&](PHINode &Phi) { 5815 return Legal->isFirstOrderRecurrence(&Phi) || 5816 Legal->isReductionVariable(&Phi); 5817 })) 5818 return false; 5819 5820 // Phis with uses outside of the loop require special handling and are 5821 // currently unsupported. 5822 for (auto &Entry : Legal->getInductionVars()) { 5823 // Look for uses of the value of the induction at the last iteration. 5824 Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch()); 5825 for (User *U : PostInc->users()) 5826 if (!L.contains(cast<Instruction>(U))) 5827 return false; 5828 // Look for uses of penultimate value of the induction. 5829 for (User *U : Entry.first->users()) 5830 if (!L.contains(cast<Instruction>(U))) 5831 return false; 5832 } 5833 5834 // Induction variables that are widened require special handling that is 5835 // currently not supported. 5836 if (any_of(Legal->getInductionVars(), [&](auto &Entry) { 5837 return !(this->isScalarAfterVectorization(Entry.first, VF) || 5838 this->isProfitableToScalarize(Entry.first, VF)); 5839 })) 5840 return false; 5841 5842 // Epilogue vectorization code has not been auditted to ensure it handles 5843 // non-latch exits properly. It may be fine, but it needs auditted and 5844 // tested. 5845 if (L.getExitingBlock() != L.getLoopLatch()) 5846 return false; 5847 5848 return true; 5849 } 5850 5851 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable( 5852 const ElementCount VF) const { 5853 // FIXME: We need a much better cost-model to take different parameters such 5854 // as register pressure, code size increase and cost of extra branches into 5855 // account. For now we apply a very crude heuristic and only consider loops 5856 // with vectorization factors larger than a certain value. 5857 // We also consider epilogue vectorization unprofitable for targets that don't 5858 // consider interleaving beneficial (eg. MVE). 
5859 if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1) 5860 return false; 5861 if (VF.getFixedValue() >= EpilogueVectorizationMinVF) 5862 return true; 5863 return false; 5864 } 5865 5866 VectorizationFactor 5867 LoopVectorizationCostModel::selectEpilogueVectorizationFactor( 5868 const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) { 5869 VectorizationFactor Result = VectorizationFactor::Disabled(); 5870 if (!EnableEpilogueVectorization) { 5871 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";); 5872 return Result; 5873 } 5874 5875 if (!isScalarEpilogueAllowed()) { 5876 LLVM_DEBUG( 5877 dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is " 5878 "allowed.\n";); 5879 return Result; 5880 } 5881 5882 // Not really a cost consideration, but check for unsupported cases here to 5883 // simplify the logic. 5884 if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) { 5885 LLVM_DEBUG( 5886 dbgs() << "LEV: Unable to vectorize epilogue because the loop is " 5887 "not a supported candidate.\n";); 5888 return Result; 5889 } 5890 5891 if (EpilogueVectorizationForceVF > 1) { 5892 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";); 5893 ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF); 5894 if (LVP.hasPlanWithVF(ForcedEC)) 5895 return {ForcedEC, 0}; 5896 else { 5897 LLVM_DEBUG( 5898 dbgs() 5899 << "LEV: Epilogue vectorization forced factor is not viable.\n";); 5900 return Result; 5901 } 5902 } 5903 5904 if (TheLoop->getHeader()->getParent()->hasOptSize() || 5905 TheLoop->getHeader()->getParent()->hasMinSize()) { 5906 LLVM_DEBUG( 5907 dbgs() 5908 << "LEV: Epilogue vectorization skipped due to opt for size.\n";); 5909 return Result; 5910 } 5911 5912 auto FixedMainLoopVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue()); 5913 if (MainLoopVF.isScalable()) 5914 LLVM_DEBUG( 5915 dbgs() << "LEV: Epilogue vectorization using scalable vectors not " 5916 "yet supported. Converting to fixed-width (VF=" 5917 << FixedMainLoopVF << ") instead\n"); 5918 5919 if (!isEpilogueVectorizationProfitable(FixedMainLoopVF)) { 5920 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for " 5921 "this loop\n"); 5922 return Result; 5923 } 5924 5925 for (auto &NextVF : ProfitableVFs) 5926 if (ElementCount::isKnownLT(NextVF.Width, FixedMainLoopVF) && 5927 (Result.Width.getFixedValue() == 1 || 5928 isMoreProfitable(NextVF, Result)) && 5929 LVP.hasPlanWithVF(NextVF.Width)) 5930 Result = NextVF; 5931 5932 if (Result != VectorizationFactor::Disabled()) 5933 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = " 5934 << Result.Width.getFixedValue() << "\n";); 5935 return Result; 5936 } 5937 5938 std::pair<unsigned, unsigned> 5939 LoopVectorizationCostModel::getSmallestAndWidestTypes() { 5940 unsigned MinWidth = -1U; 5941 unsigned MaxWidth = 8; 5942 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 5943 // For in-loop reductions, no element types are added to ElementTypesInLoop 5944 // if there are no loads/stores in the loop. In this case, check through the 5945 // reduction variables to determine the maximum width. 5946 if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) { 5947 // Reset MaxWidth so that we can find the smallest type used by recurrences 5948 // in the loop. 
5949 MaxWidth = -1U; 5950 for (auto &PhiDescriptorPair : Legal->getReductionVars()) { 5951 const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second; 5952 // When finding the min width used by the recurrence we need to account 5953 // for casts on the input operands of the recurrence. 5954 MaxWidth = std::min<unsigned>( 5955 MaxWidth, std::min<unsigned>( 5956 RdxDesc.getMinWidthCastToRecurrenceTypeInBits(), 5957 RdxDesc.getRecurrenceType()->getScalarSizeInBits())); 5958 } 5959 } else { 5960 for (Type *T : ElementTypesInLoop) { 5961 MinWidth = std::min<unsigned>( 5962 MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize()); 5963 MaxWidth = std::max<unsigned>( 5964 MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize()); 5965 } 5966 } 5967 return {MinWidth, MaxWidth}; 5968 } 5969 5970 void LoopVectorizationCostModel::collectElementTypesForWidening() { 5971 ElementTypesInLoop.clear(); 5972 // For each block. 5973 for (BasicBlock *BB : TheLoop->blocks()) { 5974 // For each instruction in the loop. 5975 for (Instruction &I : BB->instructionsWithoutDebug()) { 5976 Type *T = I.getType(); 5977 5978 // Skip ignored values. 5979 if (ValuesToIgnore.count(&I)) 5980 continue; 5981 5982 // Only examine Loads, Stores and PHINodes. 5983 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I)) 5984 continue; 5985 5986 // Examine PHI nodes that are reduction variables. Update the type to 5987 // account for the recurrence type. 5988 if (auto *PN = dyn_cast<PHINode>(&I)) { 5989 if (!Legal->isReductionVariable(PN)) 5990 continue; 5991 const RecurrenceDescriptor &RdxDesc = 5992 Legal->getReductionVars().find(PN)->second; 5993 if (PreferInLoopReductions || useOrderedReductions(RdxDesc) || 5994 TTI.preferInLoopReduction(RdxDesc.getOpcode(), 5995 RdxDesc.getRecurrenceType(), 5996 TargetTransformInfo::ReductionFlags())) 5997 continue; 5998 T = RdxDesc.getRecurrenceType(); 5999 } 6000 6001 // Examine the stored values. 6002 if (auto *ST = dyn_cast<StoreInst>(&I)) 6003 T = ST->getValueOperand()->getType(); 6004 6005 assert(T->isSized() && 6006 "Expected the load/store/recurrence type to be sized"); 6007 6008 ElementTypesInLoop.insert(T); 6009 } 6010 } 6011 } 6012 6013 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, 6014 unsigned LoopCost) { 6015 // -- The interleave heuristics -- 6016 // We interleave the loop in order to expose ILP and reduce the loop overhead. 6017 // There are many micro-architectural considerations that we can't predict 6018 // at this level. For example, frontend pressure (on decode or fetch) due to 6019 // code size, or the number and capabilities of the execution ports. 6020 // 6021 // We use the following heuristics to select the interleave count: 6022 // 1. If the code has reductions, then we interleave to break the cross 6023 // iteration dependency. 6024 // 2. If the loop is really small, then we interleave to reduce the loop 6025 // overhead. 6026 // 3. We don't interleave if we think that we will spill registers to memory 6027 // due to the increased register pressure. 6028 6029 if (!isScalarEpilogueAllowed()) 6030 return 1; 6031 6032 // We used the distance for the interleave count. 6033 if (Legal->getMaxSafeDepDistBytes() != -1U) 6034 return 1; 6035 6036 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); 6037 const bool HasReductions = !Legal->getReductionVars().empty(); 6038 // Do not interleave loops with a relatively small known or estimated trip 6039 // count. 
But we will interleave when InterleaveSmallLoopScalarReduction is 6040 // enabled, and the code has scalar reductions(HasReductions && VF = 1), 6041 // because with the above conditions interleaving can expose ILP and break 6042 // cross iteration dependences for reductions. 6043 if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) && 6044 !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar())) 6045 return 1; 6046 6047 RegisterUsage R = calculateRegisterUsage({VF})[0]; 6048 // We divide by these constants so assume that we have at least one 6049 // instruction that uses at least one register. 6050 for (auto& pair : R.MaxLocalUsers) { 6051 pair.second = std::max(pair.second, 1U); 6052 } 6053 6054 // We calculate the interleave count using the following formula. 6055 // Subtract the number of loop invariants from the number of available 6056 // registers. These registers are used by all of the interleaved instances. 6057 // Next, divide the remaining registers by the number of registers that is 6058 // required by the loop, in order to estimate how many parallel instances 6059 // fit without causing spills. All of this is rounded down if necessary to be 6060 // a power of two. We want power of two interleave count to simplify any 6061 // addressing operations or alignment considerations. 6062 // We also want power of two interleave counts to ensure that the induction 6063 // variable of the vector loop wraps to zero, when tail is folded by masking; 6064 // this currently happens when OptForSize, in which case IC is set to 1 above. 6065 unsigned IC = UINT_MAX; 6066 6067 for (auto& pair : R.MaxLocalUsers) { 6068 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 6069 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters 6070 << " registers of " 6071 << TTI.getRegisterClassName(pair.first) << " register class\n"); 6072 if (VF.isScalar()) { 6073 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) 6074 TargetNumRegisters = ForceTargetNumScalarRegs; 6075 } else { 6076 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) 6077 TargetNumRegisters = ForceTargetNumVectorRegs; 6078 } 6079 unsigned MaxLocalUsers = pair.second; 6080 unsigned LoopInvariantRegs = 0; 6081 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) 6082 LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; 6083 6084 unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers); 6085 // Don't count the induction variable as interleaved. 6086 if (EnableIndVarRegisterHeur) { 6087 TmpIC = 6088 PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) / 6089 std::max(1U, (MaxLocalUsers - 1))); 6090 } 6091 6092 IC = std::min(IC, TmpIC); 6093 } 6094 6095 // Clamp the interleave ranges to reasonable counts. 6096 unsigned MaxInterleaveCount = 6097 TTI.getMaxInterleaveFactor(VF.getKnownMinValue()); 6098 6099 // Check if the user has overridden the max. 6100 if (VF.isScalar()) { 6101 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) 6102 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; 6103 } else { 6104 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) 6105 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; 6106 } 6107 6108 // If trip count is known or estimated compile time constant, limit the 6109 // interleave count to be less than the trip count divided by VF, provided it 6110 // is at least 1. 6111 // 6112 // For scalable vectors we can't know if interleaving is beneficial. 
It may 6113 // not be beneficial for small loops if none of the lanes in the second vector 6114 // iterations is enabled. However, for larger loops, there is likely to be a 6115 // similar benefit as for fixed-width vectors. For now, we choose to leave 6116 // the InterleaveCount as if vscale is '1', although if some information about 6117 // the vector is known (e.g. min vector size), we can make a better decision. 6118 if (BestKnownTC) { 6119 MaxInterleaveCount = 6120 std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount); 6121 // Make sure MaxInterleaveCount is greater than 0. 6122 MaxInterleaveCount = std::max(1u, MaxInterleaveCount); 6123 } 6124 6125 assert(MaxInterleaveCount > 0 && 6126 "Maximum interleave count must be greater than 0"); 6127 6128 // Clamp the calculated IC to be between the 1 and the max interleave count 6129 // that the target and trip count allows. 6130 if (IC > MaxInterleaveCount) 6131 IC = MaxInterleaveCount; 6132 else 6133 // Make sure IC is greater than 0. 6134 IC = std::max(1u, IC); 6135 6136 assert(IC > 0 && "Interleave count must be greater than 0."); 6137 6138 // If we did not calculate the cost for VF (because the user selected the VF) 6139 // then we calculate the cost of VF here. 6140 if (LoopCost == 0) { 6141 InstructionCost C = expectedCost(VF).first; 6142 assert(C.isValid() && "Expected to have chosen a VF with valid cost"); 6143 LoopCost = *C.getValue(); 6144 } 6145 6146 assert(LoopCost && "Non-zero loop cost expected"); 6147 6148 // Interleave if we vectorized this loop and there is a reduction that could 6149 // benefit from interleaving. 6150 if (VF.isVector() && HasReductions) { 6151 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); 6152 return IC; 6153 } 6154 6155 // Note that if we've already vectorized the loop we will have done the 6156 // runtime check and so interleaving won't require further checks. 6157 bool InterleavingRequiresRuntimePointerCheck = 6158 (VF.isScalar() && Legal->getRuntimePointerChecking()->Need); 6159 6160 // We want to interleave small loops in order to reduce the loop overhead and 6161 // potentially expose ILP opportunities. 6162 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n' 6163 << "LV: IC is " << IC << '\n' 6164 << "LV: VF is " << VF << '\n'); 6165 const bool AggressivelyInterleaveReductions = 6166 TTI.enableAggressiveInterleaving(HasReductions); 6167 if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) { 6168 // We assume that the cost overhead is 1 and we use the cost model 6169 // to estimate the cost of the loop and interleave until the cost of the 6170 // loop overhead is about 5% of the cost of the loop. 6171 unsigned SmallIC = 6172 std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost)); 6173 6174 // Interleave until store/load ports (estimated by max interleave count) are 6175 // saturated. 6176 unsigned NumStores = Legal->getNumStores(); 6177 unsigned NumLoads = Legal->getNumLoads(); 6178 unsigned StoresIC = IC / (NumStores ? NumStores : 1); 6179 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1); 6180 6181 // There is little point in interleaving for reductions containing selects 6182 // and compares when VF=1 since it may just create more overhead than it's 6183 // worth for loops with small trip counts. This is because we still have to 6184 // do the final reduction after the loop. 
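    // For example, a select-cmp reduction such as
    //   r = (a[i] > threshold) ? 42 : r
    // gains little from extra scalar copies: each interleaved chain still has
    // to be merged with additional selects once the loop is done, which can
    // outweigh the benefit for short trip counts.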
6185 bool HasSelectCmpReductions = 6186 HasReductions && 6187 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 6188 const RecurrenceDescriptor &RdxDesc = Reduction.second; 6189 return RecurrenceDescriptor::isSelectCmpRecurrenceKind( 6190 RdxDesc.getRecurrenceKind()); 6191 }); 6192 if (HasSelectCmpReductions) { 6193 LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n"); 6194 return 1; 6195 } 6196 6197 // If we have a scalar reduction (vector reductions are already dealt with 6198 // by this point), we can increase the critical path length if the loop 6199 // we're interleaving is inside another loop. For tree-wise reductions 6200 // set the limit to 2, and for ordered reductions it's best to disable 6201 // interleaving entirely. 6202 if (HasReductions && TheLoop->getLoopDepth() > 1) { 6203 bool HasOrderedReductions = 6204 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 6205 const RecurrenceDescriptor &RdxDesc = Reduction.second; 6206 return RdxDesc.isOrdered(); 6207 }); 6208 if (HasOrderedReductions) { 6209 LLVM_DEBUG( 6210 dbgs() << "LV: Not interleaving scalar ordered reductions.\n"); 6211 return 1; 6212 } 6213 6214 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC); 6215 SmallIC = std::min(SmallIC, F); 6216 StoresIC = std::min(StoresIC, F); 6217 LoadsIC = std::min(LoadsIC, F); 6218 } 6219 6220 if (EnableLoadStoreRuntimeInterleave && 6221 std::max(StoresIC, LoadsIC) > SmallIC) { 6222 LLVM_DEBUG( 6223 dbgs() << "LV: Interleaving to saturate store or load ports.\n"); 6224 return std::max(StoresIC, LoadsIC); 6225 } 6226 6227 // If there are scalar reductions and TTI has enabled aggressive 6228 // interleaving for reductions, we will interleave to expose ILP. 6229 if (InterleaveSmallLoopScalarReduction && VF.isScalar() && 6230 AggressivelyInterleaveReductions) { 6231 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 6232 // Interleave no less than SmallIC but not as aggressive as the normal IC 6233 // to satisfy the rare situation when resources are too limited. 6234 return std::max(IC / 2, SmallIC); 6235 } else { 6236 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n"); 6237 return SmallIC; 6238 } 6239 } 6240 6241 // Interleave if this is a large loop (small loops are already dealt with by 6242 // this point) that could benefit from interleaving. 6243 if (AggressivelyInterleaveReductions) { 6244 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 6245 return IC; 6246 } 6247 6248 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n"); 6249 return 1; 6250 } 6251 6252 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8> 6253 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) { 6254 // This function calculates the register usage by measuring the highest number 6255 // of values that are alive at a single location. Obviously, this is a very 6256 // rough estimation. We scan the loop in a topological order in order and 6257 // assign a number to each instruction. We use RPO to ensure that defs are 6258 // met before their users. We assume that each instruction that has in-loop 6259 // users starts an interval. We record every time that an in-loop value is 6260 // used, so we have a list of the first and last occurrences of each 6261 // instruction. Next, we transpose this data structure into a multi map that 6262 // holds the list of intervals that *end* at a specific location. This multi 6263 // map allows us to perform a linear search. 
We scan the instructions linearly 6264 // and record each time that a new interval starts, by placing it in a set. 6265 // If we find this value in the multi-map then we remove it from the set. 6266 // The max register usage is the maximum size of the set. 6267 // We also search for instructions that are defined outside the loop, but are 6268 // used inside the loop. We need this number separately from the max-interval 6269 // usage number because when we unroll, loop-invariant values do not take 6270 // more register. 6271 LoopBlocksDFS DFS(TheLoop); 6272 DFS.perform(LI); 6273 6274 RegisterUsage RU; 6275 6276 // Each 'key' in the map opens a new interval. The values 6277 // of the map are the index of the 'last seen' usage of the 6278 // instruction that is the key. 6279 using IntervalMap = DenseMap<Instruction *, unsigned>; 6280 6281 // Maps instruction to its index. 6282 SmallVector<Instruction *, 64> IdxToInstr; 6283 // Marks the end of each interval. 6284 IntervalMap EndPoint; 6285 // Saves the list of instruction indices that are used in the loop. 6286 SmallPtrSet<Instruction *, 8> Ends; 6287 // Saves the list of values that are used in the loop but are 6288 // defined outside the loop, such as arguments and constants. 6289 SmallPtrSet<Value *, 8> LoopInvariants; 6290 6291 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 6292 for (Instruction &I : BB->instructionsWithoutDebug()) { 6293 IdxToInstr.push_back(&I); 6294 6295 // Save the end location of each USE. 6296 for (Value *U : I.operands()) { 6297 auto *Instr = dyn_cast<Instruction>(U); 6298 6299 // Ignore non-instruction values such as arguments, constants, etc. 6300 if (!Instr) 6301 continue; 6302 6303 // If this instruction is outside the loop then record it and continue. 6304 if (!TheLoop->contains(Instr)) { 6305 LoopInvariants.insert(Instr); 6306 continue; 6307 } 6308 6309 // Overwrite previous end points. 6310 EndPoint[Instr] = IdxToInstr.size(); 6311 Ends.insert(Instr); 6312 } 6313 } 6314 } 6315 6316 // Saves the list of intervals that end with the index in 'key'. 6317 using InstrList = SmallVector<Instruction *, 2>; 6318 DenseMap<unsigned, InstrList> TransposeEnds; 6319 6320 // Transpose the EndPoints to a list of values that end at each index. 6321 for (auto &Interval : EndPoint) 6322 TransposeEnds[Interval.second].push_back(Interval.first); 6323 6324 SmallPtrSet<Instruction *, 8> OpenIntervals; 6325 SmallVector<RegisterUsage, 8> RUs(VFs.size()); 6326 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size()); 6327 6328 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); 6329 6330 // A lambda that gets the register usage for the given type and VF. 6331 const auto &TTICapture = TTI; 6332 auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned { 6333 if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty)) 6334 return 0; 6335 InstructionCost::CostType RegUsage = 6336 *TTICapture.getRegUsageForType(VectorType::get(Ty, VF)).getValue(); 6337 assert(RegUsage >= 0 && RegUsage <= std::numeric_limits<unsigned>::max() && 6338 "Nonsensical values for register usage."); 6339 return RegUsage; 6340 }; 6341 6342 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { 6343 Instruction *I = IdxToInstr[i]; 6344 6345 // Remove all of the instructions that end at this location. 6346 InstrList &List = TransposeEnds[i]; 6347 for (Instruction *ToRemove : List) 6348 OpenIntervals.erase(ToRemove); 6349 6350 // Ignore instructions that are never used within the loop. 
6351 if (!Ends.count(I)) 6352 continue; 6353 6354 // Skip ignored values. 6355 if (ValuesToIgnore.count(I)) 6356 continue; 6357 6358 // For each VF find the maximum usage of registers. 6359 for (unsigned j = 0, e = VFs.size(); j < e; ++j) { 6360 // Count the number of live intervals. 6361 SmallMapVector<unsigned, unsigned, 4> RegUsage; 6362 6363 if (VFs[j].isScalar()) { 6364 for (auto Inst : OpenIntervals) { 6365 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6366 if (RegUsage.find(ClassID) == RegUsage.end()) 6367 RegUsage[ClassID] = 1; 6368 else 6369 RegUsage[ClassID] += 1; 6370 } 6371 } else { 6372 collectUniformsAndScalars(VFs[j]); 6373 for (auto Inst : OpenIntervals) { 6374 // Skip ignored values for VF > 1. 6375 if (VecValuesToIgnore.count(Inst)) 6376 continue; 6377 if (isScalarAfterVectorization(Inst, VFs[j])) { 6378 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6379 if (RegUsage.find(ClassID) == RegUsage.end()) 6380 RegUsage[ClassID] = 1; 6381 else 6382 RegUsage[ClassID] += 1; 6383 } else { 6384 unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType()); 6385 if (RegUsage.find(ClassID) == RegUsage.end()) 6386 RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]); 6387 else 6388 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); 6389 } 6390 } 6391 } 6392 6393 for (auto& pair : RegUsage) { 6394 if (MaxUsages[j].find(pair.first) != MaxUsages[j].end()) 6395 MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second); 6396 else 6397 MaxUsages[j][pair.first] = pair.second; 6398 } 6399 } 6400 6401 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " 6402 << OpenIntervals.size() << '\n'); 6403 6404 // Add the current instruction to the list of open intervals. 6405 OpenIntervals.insert(I); 6406 } 6407 6408 for (unsigned i = 0, e = VFs.size(); i < e; ++i) { 6409 SmallMapVector<unsigned, unsigned, 4> Invariant; 6410 6411 for (auto Inst : LoopInvariants) { 6412 unsigned Usage = 6413 VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]); 6414 unsigned ClassID = 6415 TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType()); 6416 if (Invariant.find(ClassID) == Invariant.end()) 6417 Invariant[ClassID] = Usage; 6418 else 6419 Invariant[ClassID] += Usage; 6420 } 6421 6422 LLVM_DEBUG({ 6423 dbgs() << "LV(REG): VF = " << VFs[i] << '\n'; 6424 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size() 6425 << " item\n"; 6426 for (const auto &pair : MaxUsages[i]) { 6427 dbgs() << "LV(REG): RegisterClass: " 6428 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6429 << " registers\n"; 6430 } 6431 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size() 6432 << " item\n"; 6433 for (const auto &pair : Invariant) { 6434 dbgs() << "LV(REG): RegisterClass: " 6435 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6436 << " registers\n"; 6437 } 6438 }); 6439 6440 RU.LoopInvariantRegs = Invariant; 6441 RU.MaxLocalUsers = MaxUsages[i]; 6442 RUs[i] = RU; 6443 } 6444 6445 return RUs; 6446 } 6447 6448 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I, 6449 ElementCount VF) { 6450 // TODO: Cost model for emulated masked load/store is completely 6451 // broken. This hack guides the cost model to use an artificially 6452 // high enough value to practically disable vectorization with such 6453 // operations, except where previously deployed legality hack allowed 6454 // using very low cost values. 
This is to avoid regressions coming simply 6455 // from moving "masked load/store" check from legality to cost model. 6456 // Masked Load/Gather emulation was previously never allowed. 6457 // Limited number of Masked Store/Scatter emulation was allowed. 6458 assert(isPredicatedInst(I, VF) && "Expecting a scalar emulated instruction"); 6459 return isa<LoadInst>(I) || 6460 (isa<StoreInst>(I) && 6461 NumPredStores > NumberOfStoresToPredicate); 6462 } 6463 6464 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) { 6465 // If we aren't vectorizing the loop, or if we've already collected the 6466 // instructions to scalarize, there's nothing to do. Collection may already 6467 // have occurred if we have a user-selected VF and are now computing the 6468 // expected cost for interleaving. 6469 if (VF.isScalar() || VF.isZero() || 6470 InstsToScalarize.find(VF) != InstsToScalarize.end()) 6471 return; 6472 6473 // Initialize a mapping for VF in InstsToScalalarize. If we find that it's 6474 // not profitable to scalarize any instructions, the presence of VF in the 6475 // map will indicate that we've analyzed it already. 6476 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF]; 6477 6478 // Find all the instructions that are scalar with predication in the loop and 6479 // determine if it would be better to not if-convert the blocks they are in. 6480 // If so, we also record the instructions to scalarize. 6481 for (BasicBlock *BB : TheLoop->blocks()) { 6482 if (!blockNeedsPredicationForAnyReason(BB)) 6483 continue; 6484 for (Instruction &I : *BB) 6485 if (isScalarWithPredication(&I, VF)) { 6486 ScalarCostsTy ScalarCosts; 6487 // Do not apply discount if scalable, because that would lead to 6488 // invalid scalarization costs. 6489 // Do not apply discount logic if hacked cost is needed 6490 // for emulated masked memrefs. 6491 if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I, VF) && 6492 computePredInstDiscount(&I, ScalarCosts, VF) >= 0) 6493 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end()); 6494 // Remember that BB will remain after vectorization. 6495 PredicatedBBsAfterVectorization.insert(BB); 6496 } 6497 } 6498 } 6499 6500 int LoopVectorizationCostModel::computePredInstDiscount( 6501 Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) { 6502 assert(!isUniformAfterVectorization(PredInst, VF) && 6503 "Instruction marked uniform-after-vectorization will be predicated"); 6504 6505 // Initialize the discount to zero, meaning that the scalar version and the 6506 // vector version cost the same. 6507 InstructionCost Discount = 0; 6508 6509 // Holds instructions to analyze. The instructions we visit are mapped in 6510 // ScalarCosts. Those instructions are the ones that would be scalarized if 6511 // we find that the scalar version costs less. 6512 SmallVector<Instruction *, 8> Worklist; 6513 6514 // Returns true if the given instruction can be scalarized. 6515 auto canBeScalarized = [&](Instruction *I) -> bool { 6516 // We only attempt to scalarize instructions forming a single-use chain 6517 // from the original predicated block that would otherwise be vectorized. 6518 // Although not strictly necessary, we give up on instructions we know will 6519 // already be scalar to avoid traversing chains that are unlikely to be 6520 // beneficial. 
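    // For example, in a predicated block containing
    //   %t = mul i32 %x, 3
    //   %r = sdiv i32 %t, %d   ; scalar with predication
    // the mul has a single use inside the same block and can be folded into
    // the scalar chain feeding the sdiv, whereas a value with other users, or
    // one defined in a different block, stays vectorized.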
6521 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() || 6522 isScalarAfterVectorization(I, VF)) 6523 return false; 6524 6525 // If the instruction is scalar with predication, it will be analyzed 6526 // separately. We ignore it within the context of PredInst. 6527 if (isScalarWithPredication(I, VF)) 6528 return false; 6529 6530 // If any of the instruction's operands are uniform after vectorization, 6531 // the instruction cannot be scalarized. This prevents, for example, a 6532 // masked load from being scalarized. 6533 // 6534 // We assume we will only emit a value for lane zero of an instruction 6535 // marked uniform after vectorization, rather than VF identical values. 6536 // Thus, if we scalarize an instruction that uses a uniform, we would 6537 // create uses of values corresponding to the lanes we aren't emitting code 6538 // for. This behavior can be changed by allowing getScalarValue to clone 6539 // the lane zero values for uniforms rather than asserting. 6540 for (Use &U : I->operands()) 6541 if (auto *J = dyn_cast<Instruction>(U.get())) 6542 if (isUniformAfterVectorization(J, VF)) 6543 return false; 6544 6545 // Otherwise, we can scalarize the instruction. 6546 return true; 6547 }; 6548 6549 // Compute the expected cost discount from scalarizing the entire expression 6550 // feeding the predicated instruction. We currently only consider expressions 6551 // that are single-use instruction chains. 6552 Worklist.push_back(PredInst); 6553 while (!Worklist.empty()) { 6554 Instruction *I = Worklist.pop_back_val(); 6555 6556 // If we've already analyzed the instruction, there's nothing to do. 6557 if (ScalarCosts.find(I) != ScalarCosts.end()) 6558 continue; 6559 6560 // Compute the cost of the vector instruction. Note that this cost already 6561 // includes the scalarization overhead of the predicated instruction. 6562 InstructionCost VectorCost = getInstructionCost(I, VF).first; 6563 6564 // Compute the cost of the scalarized instruction. This cost is the cost of 6565 // the instruction as if it wasn't if-converted and instead remained in the 6566 // predicated block. We will scale this cost by block probability after 6567 // computing the scalarization overhead. 6568 InstructionCost ScalarCost = 6569 VF.getFixedValue() * 6570 getInstructionCost(I, ElementCount::getFixed(1)).first; 6571 6572 // Compute the scalarization overhead of needed insertelement instructions 6573 // and phi nodes. 6574 if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) { 6575 ScalarCost += TTI.getScalarizationOverhead( 6576 cast<VectorType>(ToVectorTy(I->getType(), VF)), 6577 APInt::getAllOnes(VF.getFixedValue()), true, false); 6578 ScalarCost += 6579 VF.getFixedValue() * 6580 TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput); 6581 } 6582 6583 // Compute the scalarization overhead of needed extractelement 6584 // instructions. For each of the instruction's operands, if the operand can 6585 // be scalarized, add it to the worklist; otherwise, account for the 6586 // overhead. 
6587 for (Use &U : I->operands()) 6588 if (auto *J = dyn_cast<Instruction>(U.get())) { 6589 assert(VectorType::isValidElementType(J->getType()) && 6590 "Instruction has non-scalar type"); 6591 if (canBeScalarized(J)) 6592 Worklist.push_back(J); 6593 else if (needsExtract(J, VF)) { 6594 ScalarCost += TTI.getScalarizationOverhead( 6595 cast<VectorType>(ToVectorTy(J->getType(), VF)), 6596 APInt::getAllOnes(VF.getFixedValue()), false, true); 6597 } 6598 } 6599 6600 // Scale the total scalar cost by block probability. 6601 ScalarCost /= getReciprocalPredBlockProb(); 6602 6603 // Compute the discount. A non-negative discount means the vector version 6604 // of the instruction costs more, and scalarizing would be beneficial. 6605 Discount += VectorCost - ScalarCost; 6606 ScalarCosts[I] = ScalarCost; 6607 } 6608 6609 return *Discount.getValue(); 6610 } 6611 6612 LoopVectorizationCostModel::VectorizationCostTy 6613 LoopVectorizationCostModel::expectedCost( 6614 ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) { 6615 VectorizationCostTy Cost; 6616 6617 // For each block. 6618 for (BasicBlock *BB : TheLoop->blocks()) { 6619 VectorizationCostTy BlockCost; 6620 6621 // For each instruction in the old loop. 6622 for (Instruction &I : BB->instructionsWithoutDebug()) { 6623 // Skip ignored values. 6624 if (ValuesToIgnore.count(&I) || 6625 (VF.isVector() && VecValuesToIgnore.count(&I))) 6626 continue; 6627 6628 VectorizationCostTy C = getInstructionCost(&I, VF); 6629 6630 // Check if we should override the cost. 6631 if (C.first.isValid() && 6632 ForceTargetInstructionCost.getNumOccurrences() > 0) 6633 C.first = InstructionCost(ForceTargetInstructionCost); 6634 6635 // Keep a list of instructions with invalid costs. 6636 if (Invalid && !C.first.isValid()) 6637 Invalid->emplace_back(&I, VF); 6638 6639 BlockCost.first += C.first; 6640 BlockCost.second |= C.second; 6641 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first 6642 << " for VF " << VF << " For instruction: " << I 6643 << '\n'); 6644 } 6645 6646 // If we are vectorizing a predicated block, it will have been 6647 // if-converted. This means that the block's instructions (aside from 6648 // stores and instructions that may divide by zero) will now be 6649 // unconditionally executed. For the scalar case, we may not always execute 6650 // the predicated block, if it is an if-else block. Thus, scale the block's 6651 // cost by the probability of executing it. blockNeedsPredication from 6652 // Legal is used so as to not include all blocks in tail folded loops. 6653 if (VF.isScalar() && Legal->blockNeedsPredication(BB)) 6654 BlockCost.first /= getReciprocalPredBlockProb(); 6655 6656 Cost.first += BlockCost.first; 6657 Cost.second |= BlockCost.second; 6658 } 6659 6660 return Cost; 6661 } 6662 6663 /// Gets Address Access SCEV after verifying that the access pattern 6664 /// is loop invariant except the induction variable dependence. 6665 /// 6666 /// This SCEV can be sent to the Target in order to estimate the address 6667 /// calculation cost. 6668 static const SCEV *getAddressAccessSCEV( 6669 Value *Ptr, 6670 LoopVectorizationLegality *Legal, 6671 PredicatedScalarEvolution &PSE, 6672 const Loop *TheLoop) { 6673 6674 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); 6675 if (!Gep) 6676 return nullptr; 6677 6678 // We are looking for a gep with all loop invariant indices except for one 6679 // which should be an induction variable. 
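  // For example, 'getelementptr [256 x i32], ptr %base, i64 %inv, i64 %iv'
  // qualifies (one induction index, the rest loop-invariant), whereas a GEP
  // whose index is itself computed from a loop-varying load does not.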
6680 auto SE = PSE.getSE(); 6681 unsigned NumOperands = Gep->getNumOperands(); 6682 for (unsigned i = 1; i < NumOperands; ++i) { 6683 Value *Opd = Gep->getOperand(i); 6684 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 6685 !Legal->isInductionVariable(Opd)) 6686 return nullptr; 6687 } 6688 6689 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. 6690 return PSE.getSCEV(Ptr); 6691 } 6692 6693 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { 6694 return Legal->hasStride(I->getOperand(0)) || 6695 Legal->hasStride(I->getOperand(1)); 6696 } 6697 6698 InstructionCost 6699 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, 6700 ElementCount VF) { 6701 assert(VF.isVector() && 6702 "Scalarization cost of instruction implies vectorization."); 6703 if (VF.isScalable()) 6704 return InstructionCost::getInvalid(); 6705 6706 Type *ValTy = getLoadStoreType(I); 6707 auto SE = PSE.getSE(); 6708 6709 unsigned AS = getLoadStoreAddressSpace(I); 6710 Value *Ptr = getLoadStorePointerOperand(I); 6711 Type *PtrTy = ToVectorTy(Ptr->getType(), VF); 6712 // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost` 6713 // that it is being called from this specific place. 6714 6715 // Figure out whether the access is strided and get the stride value 6716 // if it's known in compile time 6717 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); 6718 6719 // Get the cost of the scalar memory instruction and address computation. 6720 InstructionCost Cost = 6721 VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); 6722 6723 // Don't pass *I here, since it is scalar but will actually be part of a 6724 // vectorized loop where the user of it is a vectorized instruction. 6725 const Align Alignment = getLoadStoreAlignment(I); 6726 Cost += VF.getKnownMinValue() * 6727 TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, 6728 AS, TTI::TCK_RecipThroughput); 6729 6730 // Get the overhead of the extractelement and insertelement instructions 6731 // we might create due to scalarization. 6732 Cost += getScalarizationOverhead(I, VF); 6733 6734 // If we have a predicated load/store, it will need extra i1 extracts and 6735 // conditional branches, but may not be executed for each vector lane. Scale 6736 // the cost by the probability of executing the predicated block. 6737 if (isPredicatedInst(I, VF)) { 6738 Cost /= getReciprocalPredBlockProb(); 6739 6740 // Add the cost of an i1 extract and a branch 6741 auto *Vec_i1Ty = 6742 VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF); 6743 Cost += TTI.getScalarizationOverhead( 6744 Vec_i1Ty, APInt::getAllOnes(VF.getKnownMinValue()), 6745 /*Insert=*/false, /*Extract=*/true); 6746 Cost += TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput); 6747 6748 if (useEmulatedMaskMemRefHack(I, VF)) 6749 // Artificially setting to a high enough value to practically disable 6750 // vectorization with such operations. 
6751 Cost = 3000000; 6752 } 6753 6754 return Cost; 6755 } 6756 6757 InstructionCost 6758 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, 6759 ElementCount VF) { 6760 Type *ValTy = getLoadStoreType(I); 6761 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6762 Value *Ptr = getLoadStorePointerOperand(I); 6763 unsigned AS = getLoadStoreAddressSpace(I); 6764 int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr); 6765 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6766 6767 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6768 "Stride should be 1 or -1 for consecutive memory access"); 6769 const Align Alignment = getLoadStoreAlignment(I); 6770 InstructionCost Cost = 0; 6771 if (Legal->isMaskRequired(I)) 6772 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6773 CostKind); 6774 else 6775 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6776 CostKind, I); 6777 6778 bool Reverse = ConsecutiveStride < 0; 6779 if (Reverse) 6780 Cost += 6781 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0); 6782 return Cost; 6783 } 6784 6785 InstructionCost 6786 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, 6787 ElementCount VF) { 6788 assert(Legal->isUniformMemOp(*I)); 6789 6790 Type *ValTy = getLoadStoreType(I); 6791 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6792 const Align Alignment = getLoadStoreAlignment(I); 6793 unsigned AS = getLoadStoreAddressSpace(I); 6794 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6795 if (isa<LoadInst>(I)) { 6796 return TTI.getAddressComputationCost(ValTy) + 6797 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS, 6798 CostKind) + 6799 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); 6800 } 6801 StoreInst *SI = cast<StoreInst>(I); 6802 6803 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); 6804 return TTI.getAddressComputationCost(ValTy) + 6805 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, 6806 CostKind) + 6807 (isLoopInvariantStoreValue 6808 ? 0 6809 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, 6810 VF.getKnownMinValue() - 1)); 6811 } 6812 6813 InstructionCost 6814 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, 6815 ElementCount VF) { 6816 Type *ValTy = getLoadStoreType(I); 6817 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6818 const Align Alignment = getLoadStoreAlignment(I); 6819 const Value *Ptr = getLoadStorePointerOperand(I); 6820 6821 return TTI.getAddressComputationCost(VectorTy) + 6822 TTI.getGatherScatterOpCost( 6823 I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment, 6824 TargetTransformInfo::TCK_RecipThroughput, I); 6825 } 6826 6827 InstructionCost 6828 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, 6829 ElementCount VF) { 6830 // TODO: Once we have support for interleaving with scalable vectors 6831 // we can calculate the cost properly here. 
6832 if (VF.isScalable())
6833 return InstructionCost::getInvalid();
6834
6835 Type *ValTy = getLoadStoreType(I);
6836 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6837 unsigned AS = getLoadStoreAddressSpace(I);
6838
6839 auto Group = getInterleavedAccessGroup(I);
6840 assert(Group && "Fail to get an interleaved access group.");
6841
6842 unsigned InterleaveFactor = Group->getFactor();
6843 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
6844
6845 // Holds the indices of existing members in the interleaved group.
6846 SmallVector<unsigned, 4> Indices;
6847 for (unsigned IF = 0; IF < InterleaveFactor; IF++)
6848 if (Group->getMember(IF))
6849 Indices.push_back(IF);
6850
6851 // Calculate the cost of the whole interleaved group.
6852 bool UseMaskForGaps =
6853 (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
6854 (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()));
6855 InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
6856 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
6857 AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps);
6858
6859 if (Group->isReverse()) {
6860 // TODO: Add support for reversed masked interleaved access.
6861 assert(!Legal->isMaskRequired(I) &&
6862 "Reverse masked interleaved access not supported.");
6863 Cost +=
6864 Group->getNumMembers() *
6865 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0);
6866 }
6867 return Cost;
6868 }
6869
6870 Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost(
6871 Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) {
6872 using namespace llvm::PatternMatch;
6873 // Early exit for no inloop reductions
6874 if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty))
6875 return None;
6876 auto *VectorTy = cast<VectorType>(Ty);
6877
6878 // We are looking for a pattern of, and finding the minimal acceptable cost for:
6879 // reduce(mul(ext(A), ext(B))) or
6880 // reduce(mul(A, B)) or
6881 // reduce(ext(A)) or
6882 // reduce(A).
6883 // The basic idea is that we walk down the tree to do that, finding the root
6884 // reduction instruction in InLoopReductionImmediateChains. From there we find
6885 // the pattern of mul/ext and test the cost of the entire pattern vs the cost
6886 // of the components. If the reduction cost is lower, then we return it for the
6887 // reduction instruction and 0 for the other instructions in the pattern. If
6888 // it is not, we return an invalid cost specifying that the original cost method
6889 // should be used.
6890 Instruction *RetI = I;
6891 if (match(RetI, m_ZExtOrSExt(m_Value()))) {
6892 if (!RetI->hasOneUser())
6893 return None;
6894 RetI = RetI->user_back();
6895 }
6896 if (match(RetI, m_Mul(m_Value(), m_Value())) &&
6897 RetI->user_back()->getOpcode() == Instruction::Add) {
6898 if (!RetI->hasOneUser())
6899 return None;
6900 RetI = RetI->user_back();
6901 }
6902
6903 // Test if the found instruction is a reduction, and if not return an invalid
6904 // cost specifying the parent to use the original cost modelling.
6905 if (!InLoopReductionImmediateChains.count(RetI))
6906 return None;
6907
6908 // Find the reduction this chain is a part of and calculate the basic cost of
6909 // the reduction on its own.
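// As a concrete illustration of the shapes matched here, a dot-product style
// source loop such as (assuming it was recognized as an in-loop reduction)
//   for (i = 0; i < n; ++i) Sum += (int)A[i] * (int)B[i];   // A, B narrower than Sum
// reaches this point as reduce(mul(ext(A), ext(B))), which some targets can
// lower to a single extending multiply-accumulate reduction; its cost is
// queried below via TTI.getExtendedAddReductionCost.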
6910 Instruction *LastChain = InLoopReductionImmediateChains[RetI]; 6911 Instruction *ReductionPhi = LastChain; 6912 while (!isa<PHINode>(ReductionPhi)) 6913 ReductionPhi = InLoopReductionImmediateChains[ReductionPhi]; 6914 6915 const RecurrenceDescriptor &RdxDesc = 6916 Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second; 6917 6918 InstructionCost BaseCost = TTI.getArithmeticReductionCost( 6919 RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind); 6920 6921 // For a call to the llvm.fmuladd intrinsic we need to add the cost of a 6922 // normal fmul instruction to the cost of the fadd reduction. 6923 if (RdxDesc.getRecurrenceKind() == RecurKind::FMulAdd) 6924 BaseCost += 6925 TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind); 6926 6927 // If we're using ordered reductions then we can just return the base cost 6928 // here, since getArithmeticReductionCost calculates the full ordered 6929 // reduction cost when FP reassociation is not allowed. 6930 if (useOrderedReductions(RdxDesc)) 6931 return BaseCost; 6932 6933 // Get the operand that was not the reduction chain and match it to one of the 6934 // patterns, returning the better cost if it is found. 6935 Instruction *RedOp = RetI->getOperand(1) == LastChain 6936 ? dyn_cast<Instruction>(RetI->getOperand(0)) 6937 : dyn_cast<Instruction>(RetI->getOperand(1)); 6938 6939 VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy); 6940 6941 Instruction *Op0, *Op1; 6942 if (RedOp && 6943 match(RedOp, 6944 m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) && 6945 match(Op0, m_ZExtOrSExt(m_Value())) && 6946 Op0->getOpcode() == Op1->getOpcode() && 6947 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() && 6948 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) && 6949 (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) { 6950 6951 // Matched reduce(ext(mul(ext(A), ext(B))) 6952 // Note that the extend opcodes need to all match, or if A==B they will have 6953 // been converted to zext(mul(sext(A), sext(A))) as it is known positive, 6954 // which is equally fine. 6955 bool IsUnsigned = isa<ZExtInst>(Op0); 6956 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy); 6957 auto *MulType = VectorType::get(Op0->getType(), VectorTy); 6958 6959 InstructionCost ExtCost = 6960 TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType, 6961 TTI::CastContextHint::None, CostKind, Op0); 6962 InstructionCost MulCost = 6963 TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind); 6964 InstructionCost Ext2Cost = 6965 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType, 6966 TTI::CastContextHint::None, CostKind, RedOp); 6967 6968 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 6969 /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 6970 CostKind); 6971 6972 if (RedCost.isValid() && 6973 RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost) 6974 return I == RetI ? 
RedCost : 0; 6975 } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) && 6976 !TheLoop->isLoopInvariant(RedOp)) { 6977 // Matched reduce(ext(A)) 6978 bool IsUnsigned = isa<ZExtInst>(RedOp); 6979 auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy); 6980 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 6981 /*IsMLA=*/false, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 6982 CostKind); 6983 6984 InstructionCost ExtCost = 6985 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType, 6986 TTI::CastContextHint::None, CostKind, RedOp); 6987 if (RedCost.isValid() && RedCost < BaseCost + ExtCost) 6988 return I == RetI ? RedCost : 0; 6989 } else if (RedOp && 6990 match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) { 6991 if (match(Op0, m_ZExtOrSExt(m_Value())) && 6992 Op0->getOpcode() == Op1->getOpcode() && 6993 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) { 6994 bool IsUnsigned = isa<ZExtInst>(Op0); 6995 Type *Op0Ty = Op0->getOperand(0)->getType(); 6996 Type *Op1Ty = Op1->getOperand(0)->getType(); 6997 Type *LargestOpTy = 6998 Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty 6999 : Op0Ty; 7000 auto *ExtType = VectorType::get(LargestOpTy, VectorTy); 7001 7002 // Matched reduce(mul(ext(A), ext(B))), where the two ext may be of 7003 // different sizes. We take the largest type as the ext to reduce, and add 7004 // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))). 7005 InstructionCost ExtCost0 = TTI.getCastInstrCost( 7006 Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy), 7007 TTI::CastContextHint::None, CostKind, Op0); 7008 InstructionCost ExtCost1 = TTI.getCastInstrCost( 7009 Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy), 7010 TTI::CastContextHint::None, CostKind, Op1); 7011 InstructionCost MulCost = 7012 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 7013 7014 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 7015 /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 7016 CostKind); 7017 InstructionCost ExtraExtCost = 0; 7018 if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) { 7019 Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1; 7020 ExtraExtCost = TTI.getCastInstrCost( 7021 ExtraExtOp->getOpcode(), ExtType, 7022 VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy), 7023 TTI::CastContextHint::None, CostKind, ExtraExtOp); 7024 } 7025 7026 if (RedCost.isValid() && 7027 (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost)) 7028 return I == RetI ? RedCost : 0; 7029 } else if (!match(I, m_ZExtOrSExt(m_Value()))) { 7030 // Matched reduce(mul()) 7031 InstructionCost MulCost = 7032 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 7033 7034 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 7035 /*IsMLA=*/true, true, RdxDesc.getRecurrenceType(), VectorTy, 7036 CostKind); 7037 7038 if (RedCost.isValid() && RedCost < MulCost + BaseCost) 7039 return I == RetI ? RedCost : 0; 7040 } 7041 } 7042 7043 return I == RetI ? Optional<InstructionCost>(BaseCost) : None; 7044 } 7045 7046 InstructionCost 7047 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, 7048 ElementCount VF) { 7049 // Calculate scalar cost only. Vectorization cost should be ready at this 7050 // moment. 
7051 if (VF.isScalar()) { 7052 Type *ValTy = getLoadStoreType(I); 7053 const Align Alignment = getLoadStoreAlignment(I); 7054 unsigned AS = getLoadStoreAddressSpace(I); 7055 7056 return TTI.getAddressComputationCost(ValTy) + 7057 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, 7058 TTI::TCK_RecipThroughput, I); 7059 } 7060 return getWideningCost(I, VF); 7061 } 7062 7063 LoopVectorizationCostModel::VectorizationCostTy 7064 LoopVectorizationCostModel::getInstructionCost(Instruction *I, 7065 ElementCount VF) { 7066 // If we know that this instruction will remain uniform, check the cost of 7067 // the scalar version. 7068 if (isUniformAfterVectorization(I, VF)) 7069 VF = ElementCount::getFixed(1); 7070 7071 if (VF.isVector() && isProfitableToScalarize(I, VF)) 7072 return VectorizationCostTy(InstsToScalarize[VF][I], false); 7073 7074 // Forced scalars do not have any scalarization overhead. 7075 auto ForcedScalar = ForcedScalars.find(VF); 7076 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) { 7077 auto InstSet = ForcedScalar->second; 7078 if (InstSet.count(I)) 7079 return VectorizationCostTy( 7080 (getInstructionCost(I, ElementCount::getFixed(1)).first * 7081 VF.getKnownMinValue()), 7082 false); 7083 } 7084 7085 Type *VectorTy; 7086 InstructionCost C = getInstructionCost(I, VF, VectorTy); 7087 7088 bool TypeNotScalarized = false; 7089 if (VF.isVector() && VectorTy->isVectorTy()) { 7090 unsigned NumParts = TTI.getNumberOfParts(VectorTy); 7091 if (NumParts) 7092 TypeNotScalarized = NumParts < VF.getKnownMinValue(); 7093 else 7094 C = InstructionCost::getInvalid(); 7095 } 7096 return VectorizationCostTy(C, TypeNotScalarized); 7097 } 7098 7099 InstructionCost 7100 LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, 7101 ElementCount VF) const { 7102 7103 // There is no mechanism yet to create a scalable scalarization loop, 7104 // so this is currently Invalid. 7105 if (VF.isScalable()) 7106 return InstructionCost::getInvalid(); 7107 7108 if (VF.isScalar()) 7109 return 0; 7110 7111 InstructionCost Cost = 0; 7112 Type *RetTy = ToVectorTy(I->getType(), VF); 7113 if (!RetTy->isVoidTy() && 7114 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) 7115 Cost += TTI.getScalarizationOverhead( 7116 cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()), true, 7117 false); 7118 7119 // Some targets keep addresses scalar. 7120 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) 7121 return Cost; 7122 7123 // Some targets support efficient element stores. 7124 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore()) 7125 return Cost; 7126 7127 // Collect operands to consider. 7128 CallInst *CI = dyn_cast<CallInst>(I); 7129 Instruction::op_range Ops = CI ? CI->args() : I->operands(); 7130 7131 // Skip operands that do not require extraction/scalarization and do not incur 7132 // any overhead. 7133 SmallVector<Type *> Tys; 7134 for (auto *V : filterExtractingOperands(Ops, VF)) 7135 Tys.push_back(MaybeVectorizeType(V->getType(), VF)); 7136 return Cost + TTI.getOperandsScalarizationOverhead( 7137 filterExtractingOperands(Ops, VF), Tys); 7138 } 7139 7140 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { 7141 if (VF.isScalar()) 7142 return; 7143 NumPredStores = 0; 7144 for (BasicBlock *BB : TheLoop->blocks()) { 7145 // For each instruction in the old loop. 
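// Each memory access below is assigned one of the widening strategies
// (CM_Widen or CM_Widen_Reverse for consecutive accesses, CM_Interleave for
// interleave groups, CM_GatherScatter, or CM_Scalarize), and the decision is
// recorded together with its estimated cost.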
7146 for (Instruction &I : *BB) {
7147 Value *Ptr = getLoadStorePointerOperand(&I);
7148 if (!Ptr)
7149 continue;
7150
7151 // TODO: We should generate better code and update the cost model for
7152 // predicated uniform stores. Today they are treated as any other
7153 // predicated store (see added test cases in
7154 // invariant-store-vectorization.ll).
7155 if (isa<StoreInst>(&I) && isScalarWithPredication(&I, VF))
7156 NumPredStores++;
7157
7158 if (Legal->isUniformMemOp(I)) {
7159 // TODO: Avoid replicating loads and stores instead of
7160 // relying on instcombine to remove them.
7161 // Load: Scalar load + broadcast
7162 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
7163 InstructionCost Cost;
7164 if (isa<StoreInst>(&I) && VF.isScalable() &&
7165 isLegalGatherOrScatter(&I, VF)) {
7166 Cost = getGatherScatterCost(&I, VF);
7167 setWideningDecision(&I, VF, CM_GatherScatter, Cost);
7168 } else {
7169 assert((isa<LoadInst>(&I) || !VF.isScalable()) &&
7170 "Cannot yet scalarize uniform stores");
7171 Cost = getUniformMemOpCost(&I, VF);
7172 setWideningDecision(&I, VF, CM_Scalarize, Cost);
7173 }
7174 continue;
7175 }
7176
7177 // We assume that widening is the best solution when possible.
7178 if (memoryInstructionCanBeWidened(&I, VF)) {
7179 InstructionCost Cost = getConsecutiveMemOpCost(&I, VF);
7180 int ConsecutiveStride = Legal->isConsecutivePtr(
7181 getLoadStoreType(&I), getLoadStorePointerOperand(&I));
7182 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
7183 "Expected consecutive stride.");
7184 InstWidening Decision =
7185 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
7186 setWideningDecision(&I, VF, Decision, Cost);
7187 continue;
7188 }
7189
7190 // Choose between Interleaving, Gather/Scatter or Scalarization.
7191 InstructionCost InterleaveCost = InstructionCost::getInvalid();
7192 unsigned NumAccesses = 1;
7193 if (isAccessInterleaved(&I)) {
7194 auto Group = getInterleavedAccessGroup(&I);
7195 assert(Group && "Fail to get an interleaved access group.");
7196
7197 // Make one decision for the whole group.
7198 if (getWideningDecision(&I, VF) != CM_Unknown)
7199 continue;
7200
7201 NumAccesses = Group->getNumMembers();
7202 if (interleavedAccessCanBeWidened(&I, VF))
7203 InterleaveCost = getInterleaveGroupCost(&I, VF);
7204 }
7205
7206 InstructionCost GatherScatterCost =
7207 isLegalGatherOrScatter(&I, VF)
7208 ? getGatherScatterCost(&I, VF) * NumAccesses
7209 : InstructionCost::getInvalid();
7210
7211 InstructionCost ScalarizationCost =
7212 getMemInstScalarizationCost(&I, VF) * NumAccesses;
7213
7214 // Choose the better solution for the current VF,
7215 // write down this decision and use it during vectorization.
7216 InstructionCost Cost;
7217 InstWidening Decision;
7218 if (InterleaveCost <= GatherScatterCost &&
7219 InterleaveCost < ScalarizationCost) {
7220 Decision = CM_Interleave;
7221 Cost = InterleaveCost;
7222 } else if (GatherScatterCost < ScalarizationCost) {
7223 Decision = CM_GatherScatter;
7224 Cost = GatherScatterCost;
7225 } else {
7226 Decision = CM_Scalarize;
7227 Cost = ScalarizationCost;
7228 }
7229 // If the instruction belongs to an interleave group, the whole group
7230 // receives the same decision. The whole group receives the cost, but
7231 // the cost will actually be assigned to one instruction.
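// For instance (illustrative), for a factor-2 group accessing A[2*i] and
// A[2*i+1], both members are marked with the same decision, while the
// group-wide cost computed above is attached to a single representative
// member of the group.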
7232 if (auto Group = getInterleavedAccessGroup(&I))
7233 setWideningDecision(Group, VF, Decision, Cost);
7234 else
7235 setWideningDecision(&I, VF, Decision, Cost);
7236 }
7237 }
7238
7239 // Make sure that any load of an address and any other address computation
7240 // remains scalar unless there is gather/scatter support. This avoids
7241 // inevitable extracts into address registers, and also has the benefit of
7242 // activating LSR more, since that pass can't optimize vectorized
7243 // addresses.
7244 if (TTI.prefersVectorizedAddressing())
7245 return;
7246
7247 // Start with all scalar pointer uses.
7248 SmallPtrSet<Instruction *, 8> AddrDefs;
7249 for (BasicBlock *BB : TheLoop->blocks())
7250 for (Instruction &I : *BB) {
7251 Instruction *PtrDef =
7252 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
7253 if (PtrDef && TheLoop->contains(PtrDef) &&
7254 getWideningDecision(&I, VF) != CM_GatherScatter)
7255 AddrDefs.insert(PtrDef);
7256 }
7257
7258 // Add all instructions used to generate the addresses.
7259 SmallVector<Instruction *, 4> Worklist;
7260 append_range(Worklist, AddrDefs);
7261 while (!Worklist.empty()) {
7262 Instruction *I = Worklist.pop_back_val();
7263 for (auto &Op : I->operands())
7264 if (auto *InstOp = dyn_cast<Instruction>(Op))
7265 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
7266 AddrDefs.insert(InstOp).second)
7267 Worklist.push_back(InstOp);
7268 }
7269
7270 for (auto *I : AddrDefs) {
7271 if (isa<LoadInst>(I)) {
7272 // Setting the desired widening decision should ideally be handled
7273 // by cost functions, but since this involves the task of finding out
7274 // if the loaded register is involved in an address computation, it is
7275 // instead changed here when we know this is the case.
7276 InstWidening Decision = getWideningDecision(I, VF);
7277 if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
7278 // Scalarize a widened load of address.
7279 setWideningDecision(
7280 I, VF, CM_Scalarize,
7281 (VF.getKnownMinValue() *
7282 getMemoryInstructionCost(I, ElementCount::getFixed(1))));
7283 else if (auto Group = getInterleavedAccessGroup(I)) {
7284 // Scalarize an interleave group of address loads.
7285 for (unsigned I = 0; I < Group->getFactor(); ++I) {
7286 if (Instruction *Member = Group->getMember(I))
7287 setWideningDecision(
7288 Member, VF, CM_Scalarize,
7289 (VF.getKnownMinValue() *
7290 getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
7291 }
7292 }
7293 } else
7294 // Make sure I gets scalarized and a cost estimate without
7295 // scalarization overhead.
7296 ForcedScalars[VF].insert(I);
7297 }
7298 }
7299
7300 InstructionCost
7301 LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
7302 Type *&VectorTy) {
7303 Type *RetTy = I->getType();
7304 if (canTruncateToMinimalBitwidth(I, VF))
7305 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
7306 auto SE = PSE.getSE();
7307 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
7308
7309 auto hasSingleCopyAfterVectorization = [this](Instruction *I,
7310 ElementCount VF) -> bool {
7311 if (VF.isScalar())
7312 return true;
7313
7314 auto Scalarized = InstsToScalarize.find(VF);
7315 assert(Scalarized != InstsToScalarize.end() &&
7316 "VF not yet analyzed for scalarization profitability");
7317 return !Scalarized->second.count(I) &&
7318 llvm::all_of(I->users(), [&](User *U) {
7319 auto *UI = cast<Instruction>(U);
7320 return !Scalarized->second.count(UI);
7321 });
7322 };
7323 (void) hasSingleCopyAfterVectorization;
7324
7325 if (isScalarAfterVectorization(I, VF)) {
7326 // With the exception of GEPs and PHIs, after scalarization there should
7327 // only be one copy of the instruction generated in the loop. This is
7328 // because the VF is either 1, or any instructions that need scalarizing
7329 // have already been dealt with by the time we get here. As a result,
7330 // we don't have to multiply the instruction cost by VF.
7331 assert(I->getOpcode() == Instruction::GetElementPtr ||
7332 I->getOpcode() == Instruction::PHI ||
7333 (I->getOpcode() == Instruction::BitCast &&
7334 I->getType()->isPointerTy()) ||
7335 hasSingleCopyAfterVectorization(I, VF));
7336 VectorTy = RetTy;
7337 } else
7338 VectorTy = ToVectorTy(RetTy, VF);
7339
7340 // TODO: We need to estimate the cost of intrinsic calls.
7341 switch (I->getOpcode()) {
7342 case Instruction::GetElementPtr:
7343 // We mark this instruction as zero-cost because the cost of GEPs in
7344 // vectorized code depends on whether the corresponding memory instruction
7345 // is scalarized or not. Therefore, we handle GEPs with the memory
7346 // instruction cost.
7347 return 0;
7348 case Instruction::Br: {
7349 // In cases of scalarized and predicated instructions, there will be VF
7350 // predicated blocks in the vectorized loop. Each branch around these
7351 // blocks also requires an extract of its vector compare i1 element.
7352 bool ScalarPredicatedBB = false;
7353 BranchInst *BI = cast<BranchInst>(I);
7354 if (VF.isVector() && BI->isConditional() &&
7355 (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) ||
7356 PredicatedBBsAfterVectorization.count(BI->getSuccessor(1))))
7357 ScalarPredicatedBB = true;
7358
7359 if (ScalarPredicatedBB) {
7360 // Not possible to scalarize a scalable vector with predicated instructions.
7361 if (VF.isScalable())
7362 return InstructionCost::getInvalid();
7363 // Return cost for branches around scalarized and predicated blocks.
7364 auto *Vec_i1Ty =
7365 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
7366 return (
7367 TTI.getScalarizationOverhead(
7368 Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()), false, true) +
7369 (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue()));
7370 } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
7371 // The back-edge branch will remain, as will all scalar branches.
7372 return TTI.getCFInstrCost(Instruction::Br, CostKind);
7373 else
7374 // This branch will be eliminated by if-conversion.
7375 return 0; 7376 // Note: We currently assume zero cost for an unconditional branch inside 7377 // a predicated block since it will become a fall-through, although we 7378 // may decide in the future to call TTI for all branches. 7379 } 7380 case Instruction::PHI: { 7381 auto *Phi = cast<PHINode>(I); 7382 7383 // First-order recurrences are replaced by vector shuffles inside the loop. 7384 // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. 7385 if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi)) 7386 return TTI.getShuffleCost( 7387 TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy), 7388 None, VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1)); 7389 7390 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 7391 // converted into select instructions. We require N - 1 selects per phi 7392 // node, where N is the number of incoming values. 7393 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) 7394 return (Phi->getNumIncomingValues() - 1) * 7395 TTI.getCmpSelInstrCost( 7396 Instruction::Select, ToVectorTy(Phi->getType(), VF), 7397 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF), 7398 CmpInst::BAD_ICMP_PREDICATE, CostKind); 7399 7400 return TTI.getCFInstrCost(Instruction::PHI, CostKind); 7401 } 7402 case Instruction::UDiv: 7403 case Instruction::SDiv: 7404 case Instruction::URem: 7405 case Instruction::SRem: 7406 // If we have a predicated instruction, it may not be executed for each 7407 // vector lane. Get the scalarization cost and scale this amount by the 7408 // probability of executing the predicated block. If the instruction is not 7409 // predicated, we fall through to the next case. 7410 if (VF.isVector() && isScalarWithPredication(I, VF)) { 7411 InstructionCost Cost = 0; 7412 7413 // These instructions have a non-void type, so account for the phi nodes 7414 // that we will create. This cost is likely to be zero. The phi node 7415 // cost, if any, should be scaled by the block probability because it 7416 // models a copy at the end of each predicated block. 7417 Cost += VF.getKnownMinValue() * 7418 TTI.getCFInstrCost(Instruction::PHI, CostKind); 7419 7420 // The cost of the non-predicated instruction. 7421 Cost += VF.getKnownMinValue() * 7422 TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind); 7423 7424 // The cost of insertelement and extractelement instructions needed for 7425 // scalarization. 7426 Cost += getScalarizationOverhead(I, VF); 7427 7428 // Scale the cost by the probability of executing the predicated blocks. 7429 // This assumes the predicated block for each vector lane is equally 7430 // likely. 7431 return Cost / getReciprocalPredBlockProb(); 7432 } 7433 LLVM_FALLTHROUGH; 7434 case Instruction::Add: 7435 case Instruction::FAdd: 7436 case Instruction::Sub: 7437 case Instruction::FSub: 7438 case Instruction::Mul: 7439 case Instruction::FMul: 7440 case Instruction::FDiv: 7441 case Instruction::FRem: 7442 case Instruction::Shl: 7443 case Instruction::LShr: 7444 case Instruction::AShr: 7445 case Instruction::And: 7446 case Instruction::Or: 7447 case Instruction::Xor: { 7448 // Since we will replace the stride by 1 the multiplication should go away. 
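// For example, for an access pattern like A[i * Stride] where a runtime check
// lets us assume Stride == 1 in the vectorized loop (a symbolic stride
// recorded by Legal), the multiply by Stride folds away, so it is costed at
// zero here.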
7449 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) 7450 return 0; 7451 7452 // Detect reduction patterns 7453 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7454 return *RedCost; 7455 7456 // Certain instructions can be cheaper to vectorize if they have a constant 7457 // second vector operand. One example of this are shifts on x86. 7458 Value *Op2 = I->getOperand(1); 7459 TargetTransformInfo::OperandValueProperties Op2VP; 7460 TargetTransformInfo::OperandValueKind Op2VK = 7461 TTI.getOperandInfo(Op2, Op2VP); 7462 if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) 7463 Op2VK = TargetTransformInfo::OK_UniformValue; 7464 7465 SmallVector<const Value *, 4> Operands(I->operand_values()); 7466 return TTI.getArithmeticInstrCost( 7467 I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, 7468 Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); 7469 } 7470 case Instruction::FNeg: { 7471 return TTI.getArithmeticInstrCost( 7472 I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, 7473 TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None, 7474 TargetTransformInfo::OP_None, I->getOperand(0), I); 7475 } 7476 case Instruction::Select: { 7477 SelectInst *SI = cast<SelectInst>(I); 7478 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 7479 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 7480 7481 const Value *Op0, *Op1; 7482 using namespace llvm::PatternMatch; 7483 if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) || 7484 match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) { 7485 // select x, y, false --> x & y 7486 // select x, true, y --> x | y 7487 TTI::OperandValueProperties Op1VP = TTI::OP_None; 7488 TTI::OperandValueProperties Op2VP = TTI::OP_None; 7489 TTI::OperandValueKind Op1VK = TTI::getOperandInfo(Op0, Op1VP); 7490 TTI::OperandValueKind Op2VK = TTI::getOperandInfo(Op1, Op2VP); 7491 assert(Op0->getType()->getScalarSizeInBits() == 1 && 7492 Op1->getType()->getScalarSizeInBits() == 1); 7493 7494 SmallVector<const Value *, 2> Operands{Op0, Op1}; 7495 return TTI.getArithmeticInstrCost( 7496 match(I, m_LogicalOr()) ? 
Instruction::Or : Instruction::And, VectorTy, 7497 CostKind, Op1VK, Op2VK, Op1VP, Op2VP, Operands, I); 7498 } 7499 7500 Type *CondTy = SI->getCondition()->getType(); 7501 if (!ScalarCond) 7502 CondTy = VectorType::get(CondTy, VF); 7503 7504 CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE; 7505 if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition())) 7506 Pred = Cmp->getPredicate(); 7507 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred, 7508 CostKind, I); 7509 } 7510 case Instruction::ICmp: 7511 case Instruction::FCmp: { 7512 Type *ValTy = I->getOperand(0)->getType(); 7513 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 7514 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 7515 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); 7516 VectorTy = ToVectorTy(ValTy, VF); 7517 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, 7518 cast<CmpInst>(I)->getPredicate(), CostKind, 7519 I); 7520 } 7521 case Instruction::Store: 7522 case Instruction::Load: { 7523 ElementCount Width = VF; 7524 if (Width.isVector()) { 7525 InstWidening Decision = getWideningDecision(I, Width); 7526 assert(Decision != CM_Unknown && 7527 "CM decision should be taken at this point"); 7528 if (Decision == CM_Scalarize) 7529 Width = ElementCount::getFixed(1); 7530 } 7531 VectorTy = ToVectorTy(getLoadStoreType(I), Width); 7532 return getMemoryInstructionCost(I, VF); 7533 } 7534 case Instruction::BitCast: 7535 if (I->getType()->isPointerTy()) 7536 return 0; 7537 LLVM_FALLTHROUGH; 7538 case Instruction::ZExt: 7539 case Instruction::SExt: 7540 case Instruction::FPToUI: 7541 case Instruction::FPToSI: 7542 case Instruction::FPExt: 7543 case Instruction::PtrToInt: 7544 case Instruction::IntToPtr: 7545 case Instruction::SIToFP: 7546 case Instruction::UIToFP: 7547 case Instruction::Trunc: 7548 case Instruction::FPTrunc: { 7549 // Computes the CastContextHint from a Load/Store instruction. 7550 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint { 7551 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 7552 "Expected a load or a store!"); 7553 7554 if (VF.isScalar() || !TheLoop->contains(I)) 7555 return TTI::CastContextHint::Normal; 7556 7557 switch (getWideningDecision(I, VF)) { 7558 case LoopVectorizationCostModel::CM_GatherScatter: 7559 return TTI::CastContextHint::GatherScatter; 7560 case LoopVectorizationCostModel::CM_Interleave: 7561 return TTI::CastContextHint::Interleave; 7562 case LoopVectorizationCostModel::CM_Scalarize: 7563 case LoopVectorizationCostModel::CM_Widen: 7564 return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked 7565 : TTI::CastContextHint::Normal; 7566 case LoopVectorizationCostModel::CM_Widen_Reverse: 7567 return TTI::CastContextHint::Reversed; 7568 case LoopVectorizationCostModel::CM_Unknown: 7569 llvm_unreachable("Instr did not go through cost modelling?"); 7570 } 7571 7572 llvm_unreachable("Unhandled case!"); 7573 }; 7574 7575 unsigned Opcode = I->getOpcode(); 7576 TTI::CastContextHint CCH = TTI::CastContextHint::None; 7577 // For Trunc, the context is the only user, which must be a StoreInst. 7578 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) { 7579 if (I->hasOneUse()) 7580 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin())) 7581 CCH = ComputeCCH(Store); 7582 } 7583 // For Z/Sext, the context is the operand, which must be a LoadInst. 
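// For example (illustrative), a zext whose operand is a load that received a
// CM_Widen_Reverse decision is costed with CastContextHint::Reversed, since
// the extend will consume the reversed vector produced for that load.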
7584 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || 7585 Opcode == Instruction::FPExt) { 7586 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0))) 7587 CCH = ComputeCCH(Load); 7588 } 7589 7590 // We optimize the truncation of induction variables having constant 7591 // integer steps. The cost of these truncations is the same as the scalar 7592 // operation. 7593 if (isOptimizableIVTruncate(I, VF)) { 7594 auto *Trunc = cast<TruncInst>(I); 7595 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 7596 Trunc->getSrcTy(), CCH, CostKind, Trunc); 7597 } 7598 7599 // Detect reduction patterns 7600 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7601 return *RedCost; 7602 7603 Type *SrcScalarTy = I->getOperand(0)->getType(); 7604 Type *SrcVecTy = 7605 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; 7606 if (canTruncateToMinimalBitwidth(I, VF)) { 7607 // This cast is going to be shrunk. This may remove the cast or it might 7608 // turn it into slightly different cast. For example, if MinBW == 16, 7609 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". 7610 // 7611 // Calculate the modified src and dest types. 7612 Type *MinVecTy = VectorTy; 7613 if (Opcode == Instruction::Trunc) { 7614 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); 7615 VectorTy = 7616 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7617 } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) { 7618 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); 7619 VectorTy = 7620 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7621 } 7622 } 7623 7624 return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); 7625 } 7626 case Instruction::Call: { 7627 if (RecurrenceDescriptor::isFMulAddIntrinsic(I)) 7628 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7629 return *RedCost; 7630 bool NeedToScalarize; 7631 CallInst *CI = cast<CallInst>(I); 7632 InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize); 7633 if (getVectorIntrinsicIDForCall(CI, TLI)) { 7634 InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF); 7635 return std::min(CallCost, IntrinsicCost); 7636 } 7637 return CallCost; 7638 } 7639 case Instruction::ExtractValue: 7640 return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput); 7641 case Instruction::Alloca: 7642 // We cannot easily widen alloca to a scalable alloca, as 7643 // the result would need to be a vector of pointers. 7644 if (VF.isScalable()) 7645 return InstructionCost::getInvalid(); 7646 LLVM_FALLTHROUGH; 7647 default: 7648 // This opcode is unknown. Assume that it is the same as 'mul'. 7649 return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 7650 } // end of switch. 
7651 } 7652 7653 char LoopVectorize::ID = 0; 7654 7655 static const char lv_name[] = "Loop Vectorization"; 7656 7657 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 7658 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 7659 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) 7660 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 7661 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) 7662 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 7663 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 7664 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 7665 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 7666 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 7667 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) 7668 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) 7669 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 7670 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 7671 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy) 7672 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 7673 7674 namespace llvm { 7675 7676 Pass *createLoopVectorizePass() { return new LoopVectorize(); } 7677 7678 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced, 7679 bool VectorizeOnlyWhenForced) { 7680 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced); 7681 } 7682 7683 } // end namespace llvm 7684 7685 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { 7686 // Check if the pointer operand of a load or store instruction is 7687 // consecutive. 7688 if (auto *Ptr = getLoadStorePointerOperand(Inst)) 7689 return Legal->isConsecutivePtr(getLoadStoreType(Inst), Ptr); 7690 return false; 7691 } 7692 7693 void LoopVectorizationCostModel::collectValuesToIgnore() { 7694 // Ignore ephemeral values. 7695 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); 7696 7697 // Ignore type-promoting instructions we identified during reduction 7698 // detection. 7699 for (auto &Reduction : Legal->getReductionVars()) { 7700 const RecurrenceDescriptor &RedDes = Reduction.second; 7701 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); 7702 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7703 } 7704 // Ignore type-casting instructions we identified during induction 7705 // detection. 7706 for (auto &Induction : Legal->getInductionVars()) { 7707 const InductionDescriptor &IndDes = Induction.second; 7708 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 7709 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7710 } 7711 } 7712 7713 void LoopVectorizationCostModel::collectInLoopReductions() { 7714 for (auto &Reduction : Legal->getReductionVars()) { 7715 PHINode *Phi = Reduction.first; 7716 const RecurrenceDescriptor &RdxDesc = Reduction.second; 7717 7718 // We don't collect reductions that are type promoted (yet). 7719 if (RdxDesc.getRecurrenceType() != Phi->getType()) 7720 continue; 7721 7722 // If the target would prefer this reduction to happen "in-loop", then we 7723 // want to record it as such. 7724 unsigned Opcode = RdxDesc.getOpcode(); 7725 if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) && 7726 !TTI.preferInLoopReduction(Opcode, Phi->getType(), 7727 TargetTransformInfo::ReductionFlags())) 7728 continue; 7729 7730 // Check that we can correctly put the reductions into the loop, by 7731 // finding the chain of operations that leads from the phi to the loop 7732 // exit value. 
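// As an illustrative example, for a simple integer add reduction
//   %sum = phi i32 [ 0, %preheader ], [ %sum.next, %latch ]
//   %sum.next = add i32 %sum, %a
// the chain is just { %sum.next }; if no such chain can be found, the
// reduction is left to be computed out of the loop.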
7733 SmallVector<Instruction *, 4> ReductionOperations =
7734 RdxDesc.getReductionOpChain(Phi, TheLoop);
7735 bool InLoop = !ReductionOperations.empty();
7736 if (InLoop) {
7737 InLoopReductionChains[Phi] = ReductionOperations;
7738 // Add the elements to InLoopReductionImmediateChains for cost modelling.
7739 Instruction *LastChain = Phi;
7740 for (auto *I : ReductionOperations) {
7741 InLoopReductionImmediateChains[I] = LastChain;
7742 LastChain = I;
7743 }
7744 }
7745 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
7746 << " reduction for phi: " << *Phi << "\n");
7747 }
7748 }
7749
7750 // TODO: we could return a pair of values that specify the max VF and
7751 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
7752 // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment
7753 // doesn't have a cost model that can choose which plan to execute if
7754 // more than one is generated.
7755 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
7756 LoopVectorizationCostModel &CM) {
7757 unsigned WidestType;
7758 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
7759 return WidestVectorRegBits / WidestType;
7760 }
7761
7762 VectorizationFactor
7763 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
7764 assert(!UserVF.isScalable() && "scalable vectors not yet supported");
7765 ElementCount VF = UserVF;
7766 // Outer loop handling: outer loops may require CFG and instruction level
7767 // transformations before even evaluating whether vectorization is profitable.
7768 // Since we cannot modify the incoming IR, we need to build VPlan upfront in
7769 // the vectorization pipeline.
7770 if (!OrigLoop->isInnermost()) {
7771 // If the user doesn't provide a vectorization factor, determine a
7772 // reasonable one.
7773 if (UserVF.isZero()) {
7774 VF = ElementCount::getFixed(determineVPlanVF(
7775 TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
7776 .getFixedSize(),
7777 CM));
7778 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
7779
7780 // Make sure we have a VF > 1 for stress testing.
7781 if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
7782 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
7783 << "overriding computed VF.\n");
7784 VF = ElementCount::getFixed(4);
7785 }
7786 }
7787 assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7788 assert(isPowerOf2_32(VF.getKnownMinValue()) &&
7789 "VF needs to be a power of two");
7790 LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
7791 << "VF " << VF << " to build VPlans.\n");
7792 buildVPlans(VF, VF);
7793
7794 // For VPlan build stress testing, we bail out after VPlan construction.
7795 if (VPlanBuildStressTest)
7796 return VectorizationFactor::Disabled();
7797
7798 return {VF, 0 /*Cost*/};
7799 }
7800
7801 LLVM_DEBUG(
7802 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
7803 "VPlan-native path.\n");
7804 return VectorizationFactor::Disabled();
7805 }
7806
7807 Optional<VectorizationFactor>
7808 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
7809 assert(OrigLoop->isInnermost() && "Inner loop expected.");
7810 FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
7811 if (!MaxFactors) // Cases that should not be vectorized nor interleaved.
7812 return None;
7813
7814 // Invalidate interleave groups if all blocks of the loop will be predicated.
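// When even the header needs predication (e.g. because the tail is folded by
// masking), every member of an interleave group would execute under a mask,
// which is only handled if the target supports masked interleaved accesses;
// otherwise the groups must be dropped.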
7815 if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) && 7816 !useMaskedInterleavedAccesses(*TTI)) { 7817 LLVM_DEBUG( 7818 dbgs() 7819 << "LV: Invalidate all interleaved groups due to fold-tail by masking " 7820 "which requires masked-interleaved support.\n"); 7821 if (CM.InterleaveInfo.invalidateGroups()) 7822 // Invalidating interleave groups also requires invalidating all decisions 7823 // based on them, which includes widening decisions and uniform and scalar 7824 // values. 7825 CM.invalidateCostModelingDecisions(); 7826 } 7827 7828 ElementCount MaxUserVF = 7829 UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF; 7830 bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF); 7831 if (!UserVF.isZero() && UserVFIsLegal) { 7832 assert(isPowerOf2_32(UserVF.getKnownMinValue()) && 7833 "VF needs to be a power of two"); 7834 // Collect the instructions (and their associated costs) that will be more 7835 // profitable to scalarize. 7836 if (CM.selectUserVectorizationFactor(UserVF)) { 7837 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); 7838 CM.collectInLoopReductions(); 7839 buildVPlansWithVPRecipes(UserVF, UserVF); 7840 LLVM_DEBUG(printPlans(dbgs())); 7841 return {{UserVF, 0}}; 7842 } else 7843 reportVectorizationInfo("UserVF ignored because of invalid costs.", 7844 "InvalidCost", ORE, OrigLoop); 7845 } 7846 7847 // Populate the set of Vectorization Factor Candidates. 7848 ElementCountSet VFCandidates; 7849 for (auto VF = ElementCount::getFixed(1); 7850 ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2) 7851 VFCandidates.insert(VF); 7852 for (auto VF = ElementCount::getScalable(1); 7853 ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2) 7854 VFCandidates.insert(VF); 7855 7856 for (const auto &VF : VFCandidates) { 7857 // Collect Uniform and Scalar instructions after vectorization with VF. 7858 CM.collectUniformsAndScalars(VF); 7859 7860 // Collect the instructions (and their associated costs) that will be more 7861 // profitable to scalarize. 7862 if (VF.isVector()) 7863 CM.collectInstsToScalarize(VF); 7864 } 7865 7866 CM.collectInLoopReductions(); 7867 buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF); 7868 buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF); 7869 7870 LLVM_DEBUG(printPlans(dbgs())); 7871 if (!MaxFactors.hasVector()) 7872 return VectorizationFactor::Disabled(); 7873 7874 // Select the optimal vectorization factor. 7875 auto SelectedVF = CM.selectVectorizationFactor(VFCandidates); 7876 7877 // Check if it is profitable to vectorize with runtime checks. 
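// If the number of runtime pointer checks required for memory safety exceeds
// the configured thresholds (and reordering was not explicitly allowed via a
// pragma), the expected speedup is unlikely to pay for the checks, so
// vectorization is abandoned below.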
7878 unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks(); 7879 if (SelectedVF.Width.getKnownMinValue() > 1 && NumRuntimePointerChecks) { 7880 bool PragmaThresholdReached = 7881 NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold; 7882 bool ThresholdReached = 7883 NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold; 7884 if ((ThresholdReached && !Hints.allowReordering()) || 7885 PragmaThresholdReached) { 7886 ORE->emit([&]() { 7887 return OptimizationRemarkAnalysisAliasing( 7888 DEBUG_TYPE, "CantReorderMemOps", OrigLoop->getStartLoc(), 7889 OrigLoop->getHeader()) 7890 << "loop not vectorized: cannot prove it is safe to reorder " 7891 "memory operations"; 7892 }); 7893 LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n"); 7894 Hints.emitRemarkWithHints(); 7895 return VectorizationFactor::Disabled(); 7896 } 7897 } 7898 return SelectedVF; 7899 } 7900 7901 VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const { 7902 assert(count_if(VPlans, 7903 [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) == 7904 1 && 7905 "Best VF has not a single VPlan."); 7906 7907 for (const VPlanPtr &Plan : VPlans) { 7908 if (Plan->hasVF(VF)) 7909 return *Plan.get(); 7910 } 7911 llvm_unreachable("No plan found!"); 7912 } 7913 7914 static void AddRuntimeUnrollDisableMetaData(Loop *L) { 7915 SmallVector<Metadata *, 4> MDs; 7916 // Reserve first location for self reference to the LoopID metadata node. 7917 MDs.push_back(nullptr); 7918 bool IsUnrollMetadata = false; 7919 MDNode *LoopID = L->getLoopID(); 7920 if (LoopID) { 7921 // First find existing loop unrolling disable metadata. 7922 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { 7923 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); 7924 if (MD) { 7925 const auto *S = dyn_cast<MDString>(MD->getOperand(0)); 7926 IsUnrollMetadata = 7927 S && S->getString().startswith("llvm.loop.unroll.disable"); 7928 } 7929 MDs.push_back(LoopID->getOperand(i)); 7930 } 7931 } 7932 7933 if (!IsUnrollMetadata) { 7934 // Add runtime unroll disable metadata. 7935 LLVMContext &Context = L->getHeader()->getContext(); 7936 SmallVector<Metadata *, 1> DisableOperands; 7937 DisableOperands.push_back( 7938 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 7939 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 7940 MDs.push_back(DisableNode); 7941 MDNode *NewLoopID = MDNode::get(Context, MDs); 7942 // Set operand 0 to refer to the loop id itself. 7943 NewLoopID->replaceOperandWith(0, NewLoopID); 7944 L->setLoopID(NewLoopID); 7945 } 7946 } 7947 7948 void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF, 7949 VPlan &BestVPlan, 7950 InnerLoopVectorizer &ILV, 7951 DominatorTree *DT) { 7952 LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF << ", UF=" << BestUF 7953 << '\n'); 7954 7955 // Perform the actual loop transformation. 7956 7957 // 1. Create a new empty loop. Unlink the old loop and connect the new one. 7958 VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan}; 7959 Value *CanonicalIVStartValue; 7960 std::tie(State.CFG.PrevBB, CanonicalIVStartValue) = 7961 ILV.createVectorizedLoopSkeleton(); 7962 ILV.collectPoisonGeneratingRecipes(State); 7963 7964 ILV.printDebugTracesAtStart(); 7965 7966 //===------------------------------------------------===// 7967 // 7968 // Notice: any optimization or new instruction that go 7969 // into the code below should also be implemented in 7970 // the cost-model. 
7971 //
7972 //===------------------------------------------------===//
7973
7974 // 2. Copy and widen instructions from the old loop into the new loop.
7975 BestVPlan.prepareToExecute(ILV.getOrCreateTripCount(nullptr),
7976 ILV.getOrCreateVectorTripCount(nullptr),
7977 CanonicalIVStartValue, State);
7978 BestVPlan.execute(&State);
7979
7980 // Keep all loop hints from the original loop on the vector loop (we'll
7981 // replace the vectorizer-specific hints below).
7982 MDNode *OrigLoopID = OrigLoop->getLoopID();
7983
7984 Optional<MDNode *> VectorizedLoopID =
7985 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
7986 LLVMLoopVectorizeFollowupVectorized});
7987
7988 Loop *L = LI->getLoopFor(State.CFG.PrevBB);
7989 if (VectorizedLoopID.hasValue())
7990 L->setLoopID(VectorizedLoopID.getValue());
7991 else {
7992 // Keep all loop hints from the original loop on the vector loop (we'll
7993 // replace the vectorizer-specific hints below).
7994 if (MDNode *LID = OrigLoop->getLoopID())
7995 L->setLoopID(LID);
7996
7997 LoopVectorizeHints Hints(L, true, *ORE);
7998 Hints.setAlreadyVectorized();
7999 }
8000 // Disable runtime unrolling when vectorizing the epilogue loop.
8001 if (CanonicalIVStartValue)
8002 AddRuntimeUnrollDisableMetaData(L);
8003
8004 // 3. Fix the vectorized code: take care of header phi's, live-outs,
8005 // predication, updating analyses.
8006 ILV.fixVectorizedLoop(State);
8007
8008 ILV.printDebugTracesAtEnd();
8009 }
8010
8011 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
8012 void LoopVectorizationPlanner::printPlans(raw_ostream &O) {
8013 for (const auto &Plan : VPlans)
8014 if (PrintVPlansInDotFormat)
8015 Plan->printDOT(O);
8016 else
8017 Plan->print(O);
8018 }
8019 #endif
8020
8021 void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
8022 SmallPtrSetImpl<Instruction *> &DeadInstructions) {
8023
8024 // We create new control-flow for the vectorized loop, so the original exit
8025 // conditions will be dead after vectorization if they are only used by the
8026 // terminator.
8027 SmallVector<BasicBlock*> ExitingBlocks;
8028 OrigLoop->getExitingBlocks(ExitingBlocks);
8029 for (auto *BB : ExitingBlocks) {
8030 auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0));
8031 if (!Cmp || !Cmp->hasOneUse())
8032 continue;
8033
8034 // TODO: we should introduce a getUniqueExitingBlocks on Loop
8035 if (!DeadInstructions.insert(Cmp).second)
8036 continue;
8037
8038 // An operand of the icmp is often a dead trunc, used by IndUpdate.
8039 // TODO: can recurse through operands in general
8040 for (Value *Op : Cmp->operands()) {
8041 if (isa<TruncInst>(Op) && Op->hasOneUse())
8042 DeadInstructions.insert(cast<Instruction>(Op));
8043 }
8044 }
8045
8046 // We create new "steps" for induction variable updates to which the original
8047 // induction variables map. An original update instruction will be dead if
8048 // all its users except the induction variable are dead.
8049 auto *Latch = OrigLoop->getLoopLatch();
8050 for (auto &Induction : Legal->getInductionVars()) {
8051 PHINode *Ind = Induction.first;
8052 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
8053
8054 // If the tail is to be folded by masking, the primary induction variable,
8055 // if it exists, isn't dead: it will be used for masking. Don't kill it.
8056 if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction()) 8057 continue; 8058 8059 if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 8060 return U == Ind || DeadInstructions.count(cast<Instruction>(U)); 8061 })) 8062 DeadInstructions.insert(IndUpdate); 8063 } 8064 } 8065 8066 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } 8067 8068 //===--------------------------------------------------------------------===// 8069 // EpilogueVectorizerMainLoop 8070 //===--------------------------------------------------------------------===// 8071 8072 /// This function is partially responsible for generating the control flow 8073 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 8074 std::pair<BasicBlock *, Value *> 8075 EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { 8076 MDNode *OrigLoopID = OrigLoop->getLoopID(); 8077 Loop *Lp = createVectorLoopSkeleton(""); 8078 8079 // Generate the code to check the minimum iteration count of the vector 8080 // epilogue (see below). 8081 EPI.EpilogueIterationCountCheck = 8082 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, true); 8083 EPI.EpilogueIterationCountCheck->setName("iter.check"); 8084 8085 // Generate the code to check any assumptions that we've made for SCEV 8086 // expressions. 8087 EPI.SCEVSafetyCheck = emitSCEVChecks(Lp, LoopScalarPreHeader); 8088 8089 // Generate the code that checks at runtime if arrays overlap. We put the 8090 // checks into a separate block to make the more common case of few elements 8091 // faster. 8092 EPI.MemSafetyCheck = emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 8093 8094 // Generate the iteration count check for the main loop, *after* the check 8095 // for the epilogue loop, so that the path-length is shorter for the case 8096 // that goes directly through the vector epilogue. The longer-path length for 8097 // the main loop is compensated for, by the gain from vectorizing the larger 8098 // trip count. Note: the branch will get updated later on when we vectorize 8099 // the epilogue. 8100 EPI.MainLoopIterationCountCheck = 8101 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false); 8102 8103 // Generate the induction variable. 8104 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 8105 EPI.VectorTripCount = CountRoundDown; 8106 createHeaderBranch(Lp); 8107 8108 // Skip induction resume value creation here because they will be created in 8109 // the second pass. If we created them here, they wouldn't be used anyway, 8110 // because the vplan in the second pass still contains the inductions from the 8111 // original loop. 
8112 8113 return {completeLoopSkeleton(Lp, OrigLoopID), nullptr}; 8114 } 8115 8116 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { 8117 LLVM_DEBUG({ 8118 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n" 8119 << "Main Loop VF:" << EPI.MainLoopVF 8120 << ", Main Loop UF:" << EPI.MainLoopUF 8121 << ", Epilogue Loop VF:" << EPI.EpilogueVF 8122 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 8123 }); 8124 } 8125 8126 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() { 8127 DEBUG_WITH_TYPE(VerboseDebug, { 8128 dbgs() << "intermediate fn:\n" 8129 << *OrigLoop->getHeader()->getParent() << "\n"; 8130 }); 8131 } 8132 8133 BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck( 8134 Loop *L, BasicBlock *Bypass, bool ForEpilogue) { 8135 assert(L && "Expected valid Loop."); 8136 assert(Bypass && "Expected valid bypass basic block."); 8137 ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF; 8138 unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF; 8139 Value *Count = getOrCreateTripCount(L); 8140 // Reuse existing vector loop preheader for TC checks. 8141 // Note that new preheader block is generated for vector loop. 8142 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 8143 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 8144 8145 // Generate code to check if the loop's trip count is less than VF * UF of the 8146 // main vector loop. 8147 auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF : VF) ? 8148 ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 8149 8150 Value *CheckMinIters = Builder.CreateICmp( 8151 P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor), 8152 "min.iters.check"); 8153 8154 if (!ForEpilogue) 8155 TCCheckBlock->setName("vector.main.loop.iter.check"); 8156 8157 // Create new preheader for vector loop. 8158 LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), 8159 DT, LI, nullptr, "vector.ph"); 8160 8161 if (ForEpilogue) { 8162 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 8163 DT->getNode(Bypass)->getIDom()) && 8164 "TC check is expected to dominate Bypass"); 8165 8166 // Update dominator for Bypass & LoopExit. 8167 DT->changeImmediateDominator(Bypass, TCCheckBlock); 8168 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF)) 8169 // For loops with multiple exits, there's no edge from the middle block 8170 // to exit blocks (as the epilogue must run) and thus no need to update 8171 // the immediate dominator of the exit blocks. 8172 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 8173 8174 LoopBypassBlocks.push_back(TCCheckBlock); 8175 8176 // Save the trip count so we don't have to regenerate it in the 8177 // vec.epilog.iter.check. This is safe to do because the trip count 8178 // generated here dominates the vector epilog iter check. 8179 EPI.TripCount = Count; 8180 } 8181 8182 ReplaceInstWithInst( 8183 TCCheckBlock->getTerminator(), 8184 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 8185 8186 return TCCheckBlock; 8187 } 8188 8189 //===--------------------------------------------------------------------===// 8190 // EpilogueVectorizerEpilogueLoop 8191 //===--------------------------------------------------------------------===// 8192 8193 /// This function is partially responsible for generating the control flow 8194 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 
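/// In rough outline (the link above has the full picture), this second pass
/// splits off a "vec.epilog.iter.check" block that compares the remaining
/// trip count against the step of the epilogue loop (EpilogueVF * EpilogueUF)
/// and branches to the scalar preheader when too few iterations remain, or
/// into the vector epilogue loop otherwise.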
8195 std::pair<BasicBlock *, Value *> 8196 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { 8197 MDNode *OrigLoopID = OrigLoop->getLoopID(); 8198 Loop *Lp = createVectorLoopSkeleton("vec.epilog."); 8199 8200 // Now, compare the remaining count and if there aren't enough iterations to 8201 // execute the vectorized epilogue skip to the scalar part. 8202 BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader; 8203 VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check"); 8204 LoopVectorPreHeader = 8205 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 8206 LI, nullptr, "vec.epilog.ph"); 8207 emitMinimumVectorEpilogueIterCountCheck(Lp, LoopScalarPreHeader, 8208 VecEpilogueIterationCountCheck); 8209 8210 // Adjust the control flow taking the state info from the main loop 8211 // vectorization into account. 8212 assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck && 8213 "expected this to be saved from the previous pass."); 8214 EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith( 8215 VecEpilogueIterationCountCheck, LoopVectorPreHeader); 8216 8217 DT->changeImmediateDominator(LoopVectorPreHeader, 8218 EPI.MainLoopIterationCountCheck); 8219 8220 EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith( 8221 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8222 8223 if (EPI.SCEVSafetyCheck) 8224 EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith( 8225 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8226 if (EPI.MemSafetyCheck) 8227 EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith( 8228 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8229 8230 DT->changeImmediateDominator( 8231 VecEpilogueIterationCountCheck, 8232 VecEpilogueIterationCountCheck->getSinglePredecessor()); 8233 8234 DT->changeImmediateDominator(LoopScalarPreHeader, 8235 EPI.EpilogueIterationCountCheck); 8236 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF)) 8237 // If there is an epilogue which must run, there's no edge from the 8238 // middle block to exit blocks and thus no need to update the immediate 8239 // dominator of the exit blocks. 8240 DT->changeImmediateDominator(LoopExitBlock, 8241 EPI.EpilogueIterationCountCheck); 8242 8243 // Keep track of bypass blocks, as they feed start values to the induction 8244 // phis in the scalar loop preheader. 8245 if (EPI.SCEVSafetyCheck) 8246 LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck); 8247 if (EPI.MemSafetyCheck) 8248 LoopBypassBlocks.push_back(EPI.MemSafetyCheck); 8249 LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck); 8250 8251 // Generate a resume induction for the vector epilogue and put it in the 8252 // vector epilogue preheader 8253 Type *IdxTy = Legal->getWidestInductionType(); 8254 PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val", 8255 LoopVectorPreHeader->getFirstNonPHI()); 8256 EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck); 8257 EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0), 8258 EPI.MainLoopIterationCountCheck); 8259 8260 // Generate the induction variable. 8261 createHeaderBranch(Lp); 8262 8263 // Generate induction resume values. These variables save the new starting 8264 // indexes for the scalar loop. They are used to test if there are any tail 8265 // iterations left once the vector loop has completed. 
8266   // Note that when the vectorized epilogue is skipped due to iteration count
8267   // check, then the resume value for the induction variable comes from
8268   // the trip count of the main vector loop, hence passing the AdditionalBypass
8269   // argument.
8270   createInductionResumeValues(Lp, {VecEpilogueIterationCountCheck,
8271                                    EPI.VectorTripCount} /* AdditionalBypass */);
8272 
8273   return {completeLoopSkeleton(Lp, OrigLoopID), EPResumeVal};
8274 }
8275 
8276 BasicBlock *
8277 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
8278     Loop *L, BasicBlock *Bypass, BasicBlock *Insert) {
8279 
8280   assert(EPI.TripCount &&
8281          "Expected trip count to have been saved in the first pass.");
8282   assert(
8283       (!isa<Instruction>(EPI.TripCount) ||
8284        DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
8285       "saved trip count does not dominate insertion point.");
8286   Value *TC = EPI.TripCount;
8287   IRBuilder<> Builder(Insert->getTerminator());
8288   Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");
8289 
8290   // Generate code to check if the loop's trip count is less than VF * UF of the
8291   // vector epilogue loop.
8292   auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF) ?
8293       ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
8294 
8295   Value *CheckMinIters =
8296       Builder.CreateICmp(P, Count,
8297                          createStepForVF(Builder, Count->getType(),
8298                                          EPI.EpilogueVF, EPI.EpilogueUF),
8299                          "min.epilog.iters.check");
8300 
8301   ReplaceInstWithInst(
8302       Insert->getTerminator(),
8303       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
8304 
8305   LoopBypassBlocks.push_back(Insert);
8306   return Insert;
8307 }
8308 
8309 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() {
8310   LLVM_DEBUG({
8311     dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
8312            << "Epilogue Loop VF:" << EPI.EpilogueVF
8313            << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
8314   });
8315 }
8316 
8317 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
8318   DEBUG_WITH_TYPE(VerboseDebug, {
8319     dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n";
8320   });
8321 }
8322 
8323 bool LoopVectorizationPlanner::getDecisionAndClampRange(
8324     const std::function<bool(ElementCount)> &Predicate, VFRange &Range) {
8325   assert(!Range.isEmpty() && "Trying to test an empty VF range.");
8326   bool PredicateAtRangeStart = Predicate(Range.Start);
8327 
8328   for (ElementCount TmpVF = Range.Start * 2;
8329        ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2)
8330     if (Predicate(TmpVF) != PredicateAtRangeStart) {
8331       Range.End = TmpVF;
8332       break;
8333     }
8334 
8335   return PredicateAtRangeStart;
8336 }
8337 
8338 /// Build VPlans for the full range of feasible VFs = {\p MinVF, 2 * \p MinVF,
8339 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
8340 /// of VFs starting at a given VF and extending it as much as possible. Each
8341 /// vectorization decision can potentially shorten this sub-range during
8342 /// buildVPlan().
8343 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF, 8344 ElementCount MaxVF) { 8345 auto MaxVFPlusOne = MaxVF.getWithIncrement(1); 8346 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { 8347 VFRange SubRange = {VF, MaxVFPlusOne}; 8348 VPlans.push_back(buildVPlan(SubRange)); 8349 VF = SubRange.End; 8350 } 8351 } 8352 8353 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, 8354 VPlanPtr &Plan) { 8355 assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); 8356 8357 // Look for cached value. 8358 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 8359 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); 8360 if (ECEntryIt != EdgeMaskCache.end()) 8361 return ECEntryIt->second; 8362 8363 VPValue *SrcMask = createBlockInMask(Src, Plan); 8364 8365 // The terminator has to be a branch inst! 8366 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 8367 assert(BI && "Unexpected terminator found"); 8368 8369 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) 8370 return EdgeMaskCache[Edge] = SrcMask; 8371 8372 // If source is an exiting block, we know the exit edge is dynamically dead 8373 // in the vector loop, and thus we don't need to restrict the mask. Avoid 8374 // adding uses of an otherwise potentially dead instruction. 8375 if (OrigLoop->isLoopExiting(Src)) 8376 return EdgeMaskCache[Edge] = SrcMask; 8377 8378 VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition()); 8379 assert(EdgeMask && "No Edge Mask found for condition"); 8380 8381 if (BI->getSuccessor(0) != Dst) 8382 EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc()); 8383 8384 if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND. 8385 // The condition is 'SrcMask && EdgeMask', which is equivalent to 8386 // 'select i1 SrcMask, i1 EdgeMask, i1 false'. 8387 // The select version does not introduce new UB if SrcMask is false and 8388 // EdgeMask is poison. Using 'and' here introduces undefined behavior. 8389 VPValue *False = Plan->getOrAddVPValue( 8390 ConstantInt::getFalse(BI->getCondition()->getType())); 8391 EdgeMask = 8392 Builder.createSelect(SrcMask, EdgeMask, False, BI->getDebugLoc()); 8393 } 8394 8395 return EdgeMaskCache[Edge] = EdgeMask; 8396 } 8397 8398 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { 8399 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 8400 8401 // Look for cached value. 8402 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); 8403 if (BCEntryIt != BlockMaskCache.end()) 8404 return BCEntryIt->second; 8405 8406 // All-one mask is modelled as no-mask following the convention for masked 8407 // load/store/gather/scatter. Initialize BlockMask to no-mask. 8408 VPValue *BlockMask = nullptr; 8409 8410 if (OrigLoop->getHeader() == BB) { 8411 if (!CM.blockNeedsPredicationForAnyReason(BB)) 8412 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. 8413 8414 // Introduce the early-exit compare IV <= BTC to form header block mask. 8415 // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by 8416 // constructing the desired canonical IV in the header block as its first 8417 // non-phi instructions. 
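    // As an illustration only (the actual recipes are built just below), for a
    // trip count TC and a widened IV %iv the header mask ends up roughly as
    // either
    //   %mask = icmp ule %iv, (splat (TC - 1))   ; compare against BTC
    // or, when the target prefers the intrinsic form,
    //   %mask = llvm.get.active.lane.mask(%iv.first.lane, TC)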
8418 assert(CM.foldTailByMasking() && "must fold the tail"); 8419 VPBasicBlock *HeaderVPBB = Plan->getEntry()->getEntryBasicBlock(); 8420 auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi(); 8421 8422 VPValue *IV = nullptr; 8423 if (Legal->getPrimaryInduction()) 8424 IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction()); 8425 else { 8426 auto *IVRecipe = new VPWidenCanonicalIVRecipe(Plan->getCanonicalIV()); 8427 HeaderVPBB->insert(IVRecipe, NewInsertionPoint); 8428 IV = IVRecipe; 8429 } 8430 8431 VPBuilder::InsertPointGuard Guard(Builder); 8432 Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint); 8433 if (CM.TTI.emitGetActiveLaneMask()) { 8434 VPValue *TC = Plan->getOrCreateTripCount(); 8435 BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC}); 8436 } else { 8437 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); 8438 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); 8439 } 8440 return BlockMaskCache[BB] = BlockMask; 8441 } 8442 8443 // This is the block mask. We OR all incoming edges. 8444 for (auto *Predecessor : predecessors(BB)) { 8445 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); 8446 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too. 8447 return BlockMaskCache[BB] = EdgeMask; 8448 8449 if (!BlockMask) { // BlockMask has its initialized nullptr value. 8450 BlockMask = EdgeMask; 8451 continue; 8452 } 8453 8454 BlockMask = Builder.createOr(BlockMask, EdgeMask, {}); 8455 } 8456 8457 return BlockMaskCache[BB] = BlockMask; 8458 } 8459 8460 VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, 8461 ArrayRef<VPValue *> Operands, 8462 VFRange &Range, 8463 VPlanPtr &Plan) { 8464 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 8465 "Must be called with either a load or store"); 8466 8467 auto willWiden = [&](ElementCount VF) -> bool { 8468 if (VF.isScalar()) 8469 return false; 8470 LoopVectorizationCostModel::InstWidening Decision = 8471 CM.getWideningDecision(I, VF); 8472 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 8473 "CM decision should be taken at this point."); 8474 if (Decision == LoopVectorizationCostModel::CM_Interleave) 8475 return true; 8476 if (CM.isScalarAfterVectorization(I, VF) || 8477 CM.isProfitableToScalarize(I, VF)) 8478 return false; 8479 return Decision != LoopVectorizationCostModel::CM_Scalarize; 8480 }; 8481 8482 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8483 return nullptr; 8484 8485 VPValue *Mask = nullptr; 8486 if (Legal->isMaskRequired(I)) 8487 Mask = createBlockInMask(I->getParent(), Plan); 8488 8489 // Determine if the pointer operand of the access is either consecutive or 8490 // reverse consecutive. 8491 LoopVectorizationCostModel::InstWidening Decision = 8492 CM.getWideningDecision(I, Range.Start); 8493 bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse; 8494 bool Consecutive = 8495 Reverse || Decision == LoopVectorizationCostModel::CM_Widen; 8496 8497 if (LoadInst *Load = dyn_cast<LoadInst>(I)) 8498 return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask, 8499 Consecutive, Reverse); 8500 8501 StoreInst *Store = cast<StoreInst>(I); 8502 return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0], 8503 Mask, Consecutive, Reverse); 8504 } 8505 8506 VPWidenIntOrFpInductionRecipe * 8507 VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi, 8508 ArrayRef<VPValue *> Operands) const { 8509 // Check if this is an integer or fp induction. 
If so, build the recipe that 8510 // produces its scalar and vector values. 8511 if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi)) { 8512 assert(II->getStartValue() == 8513 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); 8514 return new VPWidenIntOrFpInductionRecipe(Phi, Operands[0], *II); 8515 } 8516 8517 return nullptr; 8518 } 8519 8520 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate( 8521 TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range, 8522 VPlan &Plan) const { 8523 // Optimize the special case where the source is a constant integer 8524 // induction variable. Notice that we can only optimize the 'trunc' case 8525 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and 8526 // (c) other casts depend on pointer size. 8527 8528 // Determine whether \p K is a truncation based on an induction variable that 8529 // can be optimized. 8530 auto isOptimizableIVTruncate = 8531 [&](Instruction *K) -> std::function<bool(ElementCount)> { 8532 return [=](ElementCount VF) -> bool { 8533 return CM.isOptimizableIVTruncate(K, VF); 8534 }; 8535 }; 8536 8537 if (LoopVectorizationPlanner::getDecisionAndClampRange( 8538 isOptimizableIVTruncate(I), Range)) { 8539 8540 auto *Phi = cast<PHINode>(I->getOperand(0)); 8541 const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi); 8542 VPValue *Start = Plan.getOrAddVPValue(II.getStartValue()); 8543 return new VPWidenIntOrFpInductionRecipe(Phi, Start, II, I); 8544 } 8545 return nullptr; 8546 } 8547 8548 VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi, 8549 ArrayRef<VPValue *> Operands, 8550 VPlanPtr &Plan) { 8551 // If all incoming values are equal, the incoming VPValue can be used directly 8552 // instead of creating a new VPBlendRecipe. 8553 VPValue *FirstIncoming = Operands[0]; 8554 if (all_of(Operands, [FirstIncoming](const VPValue *Inc) { 8555 return FirstIncoming == Inc; 8556 })) { 8557 return Operands[0]; 8558 } 8559 8560 // We know that all PHIs in non-header blocks are converted into selects, so 8561 // we don't have to worry about the insertion order and we can just use the 8562 // builder. At this point we generate the predication tree. There may be 8563 // duplications since this is a simple recursive scan, but future 8564 // optimizations will clean it up. 
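  // Schematically (not the verbatim VPlan printout), a two-way phi becomes a
  // blend whose operands interleave incoming values with their edge masks:
  //   BLEND %phi = (%in0, %mask0, %in1, %mask1)
  // which VPBlendRecipe::execute() later lowers to a chain of selects.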
8565 SmallVector<VPValue *, 2> OperandsWithMask; 8566 unsigned NumIncoming = Phi->getNumIncomingValues(); 8567 8568 for (unsigned In = 0; In < NumIncoming; In++) { 8569 VPValue *EdgeMask = 8570 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan); 8571 assert((EdgeMask || NumIncoming == 1) && 8572 "Multiple predecessors with one having a full mask"); 8573 OperandsWithMask.push_back(Operands[In]); 8574 if (EdgeMask) 8575 OperandsWithMask.push_back(EdgeMask); 8576 } 8577 return toVPRecipeResult(new VPBlendRecipe(Phi, OperandsWithMask)); 8578 } 8579 8580 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, 8581 ArrayRef<VPValue *> Operands, 8582 VFRange &Range) const { 8583 8584 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8585 [this, CI](ElementCount VF) { 8586 return CM.isScalarWithPredication(CI, VF); 8587 }, 8588 Range); 8589 8590 if (IsPredicated) 8591 return nullptr; 8592 8593 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8594 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || 8595 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect || 8596 ID == Intrinsic::pseudoprobe || 8597 ID == Intrinsic::experimental_noalias_scope_decl)) 8598 return nullptr; 8599 8600 auto willWiden = [&](ElementCount VF) -> bool { 8601 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8602 // The following case may be scalarized depending on the VF. 8603 // The flag shows whether we use Intrinsic or a usual Call for vectorized 8604 // version of the instruction. 8605 // Is it beneficial to perform intrinsic call compared to lib call? 8606 bool NeedToScalarize = false; 8607 InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); 8608 InstructionCost IntrinsicCost = ID ? CM.getVectorIntrinsicCost(CI, VF) : 0; 8609 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 8610 return UseVectorIntrinsic || !NeedToScalarize; 8611 }; 8612 8613 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8614 return nullptr; 8615 8616 ArrayRef<VPValue *> Ops = Operands.take_front(CI->arg_size()); 8617 return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end())); 8618 } 8619 8620 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { 8621 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) && 8622 !isa<StoreInst>(I) && "Instruction should have been handled earlier"); 8623 // Instruction should be widened, unless it is scalar after vectorization, 8624 // scalarization is profitable or it is predicated. 
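  // For example (illustrative numbers only): with Range = {4, 32}, if the
  // WillScalarize predicate below is false for VF=4 and VF=8 but true for
  // VF=16, getDecisionAndClampRange() clamps Range.End to 16 and returns
  // false, so this decision covers the sub-range {4, 16} and VF=16 onwards is
  // left to the next VPlan.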
8625 auto WillScalarize = [this, I](ElementCount VF) -> bool { 8626 return CM.isScalarAfterVectorization(I, VF) || 8627 CM.isProfitableToScalarize(I, VF) || 8628 CM.isScalarWithPredication(I, VF); 8629 }; 8630 return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize, 8631 Range); 8632 } 8633 8634 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, 8635 ArrayRef<VPValue *> Operands) const { 8636 auto IsVectorizableOpcode = [](unsigned Opcode) { 8637 switch (Opcode) { 8638 case Instruction::Add: 8639 case Instruction::And: 8640 case Instruction::AShr: 8641 case Instruction::BitCast: 8642 case Instruction::FAdd: 8643 case Instruction::FCmp: 8644 case Instruction::FDiv: 8645 case Instruction::FMul: 8646 case Instruction::FNeg: 8647 case Instruction::FPExt: 8648 case Instruction::FPToSI: 8649 case Instruction::FPToUI: 8650 case Instruction::FPTrunc: 8651 case Instruction::FRem: 8652 case Instruction::FSub: 8653 case Instruction::ICmp: 8654 case Instruction::IntToPtr: 8655 case Instruction::LShr: 8656 case Instruction::Mul: 8657 case Instruction::Or: 8658 case Instruction::PtrToInt: 8659 case Instruction::SDiv: 8660 case Instruction::Select: 8661 case Instruction::SExt: 8662 case Instruction::Shl: 8663 case Instruction::SIToFP: 8664 case Instruction::SRem: 8665 case Instruction::Sub: 8666 case Instruction::Trunc: 8667 case Instruction::UDiv: 8668 case Instruction::UIToFP: 8669 case Instruction::URem: 8670 case Instruction::Xor: 8671 case Instruction::ZExt: 8672 return true; 8673 } 8674 return false; 8675 }; 8676 8677 if (!IsVectorizableOpcode(I->getOpcode())) 8678 return nullptr; 8679 8680 // Success: widen this instruction. 8681 return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end())); 8682 } 8683 8684 void VPRecipeBuilder::fixHeaderPhis() { 8685 BasicBlock *OrigLatch = OrigLoop->getLoopLatch(); 8686 for (VPHeaderPHIRecipe *R : PhisToFix) { 8687 auto *PN = cast<PHINode>(R->getUnderlyingValue()); 8688 VPRecipeBase *IncR = 8689 getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch))); 8690 R->addOperand(IncR->getVPSingleValue()); 8691 } 8692 } 8693 8694 VPBasicBlock *VPRecipeBuilder::handleReplication( 8695 Instruction *I, VFRange &Range, VPBasicBlock *VPBB, 8696 VPlanPtr &Plan) { 8697 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( 8698 [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); }, 8699 Range); 8700 8701 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8702 [&](ElementCount VF) { return CM.isPredicatedInst(I, VF, IsUniform); }, 8703 Range); 8704 8705 // Even if the instruction is not marked as uniform, there are certain 8706 // intrinsic calls that can be effectively treated as such, so we check for 8707 // them here. Conservatively, we only do this for scalable vectors, since 8708 // for fixed-width VFs we can always fall back on full scalarization. 8709 if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) { 8710 switch (cast<IntrinsicInst>(I)->getIntrinsicID()) { 8711 case Intrinsic::assume: 8712 case Intrinsic::lifetime_start: 8713 case Intrinsic::lifetime_end: 8714 // For scalable vectors if one of the operands is variant then we still 8715 // want to mark as uniform, which will generate one instruction for just 8716 // the first lane of the vector. We can't scalarize the call in the same 8717 // way as for fixed-width vectors because we don't know how many lanes 8718 // there are. 
8719 // 8720 // The reasons for doing it this way for scalable vectors are: 8721 // 1. For the assume intrinsic generating the instruction for the first 8722 // lane is still be better than not generating any at all. For 8723 // example, the input may be a splat across all lanes. 8724 // 2. For the lifetime start/end intrinsics the pointer operand only 8725 // does anything useful when the input comes from a stack object, 8726 // which suggests it should always be uniform. For non-stack objects 8727 // the effect is to poison the object, which still allows us to 8728 // remove the call. 8729 IsUniform = true; 8730 break; 8731 default: 8732 break; 8733 } 8734 } 8735 8736 auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()), 8737 IsUniform, IsPredicated); 8738 setRecipe(I, Recipe); 8739 Plan->addVPValue(I, Recipe); 8740 8741 // Find if I uses a predicated instruction. If so, it will use its scalar 8742 // value. Avoid hoisting the insert-element which packs the scalar value into 8743 // a vector value, as that happens iff all users use the vector value. 8744 for (VPValue *Op : Recipe->operands()) { 8745 auto *PredR = dyn_cast_or_null<VPPredInstPHIRecipe>(Op->getDef()); 8746 if (!PredR) 8747 continue; 8748 auto *RepR = 8749 cast_or_null<VPReplicateRecipe>(PredR->getOperand(0)->getDef()); 8750 assert(RepR->isPredicated() && 8751 "expected Replicate recipe to be predicated"); 8752 RepR->setAlsoPack(false); 8753 } 8754 8755 // Finalize the recipe for Instr, first if it is not predicated. 8756 if (!IsPredicated) { 8757 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n"); 8758 VPBB->appendRecipe(Recipe); 8759 return VPBB; 8760 } 8761 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); 8762 8763 VPBlockBase *SingleSucc = VPBB->getSingleSuccessor(); 8764 assert(SingleSucc && "VPBB must have a single successor when handling " 8765 "predicated replication."); 8766 VPBlockUtils::disconnectBlocks(VPBB, SingleSucc); 8767 // Record predicated instructions for above packing optimizations. 8768 VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan); 8769 VPBlockUtils::insertBlockAfter(Region, VPBB); 8770 auto *RegSucc = new VPBasicBlock(); 8771 VPBlockUtils::insertBlockAfter(RegSucc, Region); 8772 VPBlockUtils::connectBlocks(RegSucc, SingleSucc); 8773 return RegSucc; 8774 } 8775 8776 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr, 8777 VPRecipeBase *PredRecipe, 8778 VPlanPtr &Plan) { 8779 // Instructions marked for predication are replicated and placed under an 8780 // if-then construct to prevent side-effects. 8781 8782 // Generate recipes to compute the block mask for this region. 8783 VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan); 8784 8785 // Build the triangular if-then region. 8786 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str(); 8787 assert(Instr->getParent() && "Predicated instruction not in any basic block"); 8788 auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask); 8789 auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); 8790 auto *PHIRecipe = Instr->getType()->isVoidTy() 8791 ? 
nullptr 8792 : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr)); 8793 if (PHIRecipe) { 8794 Plan->removeVPValueFor(Instr); 8795 Plan->addVPValue(Instr, PHIRecipe); 8796 } 8797 auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); 8798 auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); 8799 VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true); 8800 8801 // Note: first set Entry as region entry and then connect successors starting 8802 // from it in order, to propagate the "parent" of each VPBasicBlock. 8803 VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry); 8804 VPBlockUtils::connectBlocks(Pred, Exit); 8805 8806 return Region; 8807 } 8808 8809 VPRecipeOrVPValueTy 8810 VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, 8811 ArrayRef<VPValue *> Operands, 8812 VFRange &Range, VPlanPtr &Plan) { 8813 // First, check for specific widening recipes that deal with calls, memory 8814 // operations, inductions and Phi nodes. 8815 if (auto *CI = dyn_cast<CallInst>(Instr)) 8816 return toVPRecipeResult(tryToWidenCall(CI, Operands, Range)); 8817 8818 if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr)) 8819 return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan)); 8820 8821 VPRecipeBase *Recipe; 8822 if (auto Phi = dyn_cast<PHINode>(Instr)) { 8823 if (Phi->getParent() != OrigLoop->getHeader()) 8824 return tryToBlend(Phi, Operands, Plan); 8825 if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands))) 8826 return toVPRecipeResult(Recipe); 8827 8828 VPHeaderPHIRecipe *PhiRecipe = nullptr; 8829 if (Legal->isReductionVariable(Phi) || Legal->isFirstOrderRecurrence(Phi)) { 8830 VPValue *StartV = Operands[0]; 8831 if (Legal->isReductionVariable(Phi)) { 8832 const RecurrenceDescriptor &RdxDesc = 8833 Legal->getReductionVars().find(Phi)->second; 8834 assert(RdxDesc.getRecurrenceStartValue() == 8835 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); 8836 PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV, 8837 CM.isInLoopReduction(Phi), 8838 CM.useOrderedReductions(RdxDesc)); 8839 } else { 8840 PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV); 8841 } 8842 8843 // Record the incoming value from the backedge, so we can add the incoming 8844 // value from the backedge after all recipes have been created. 8845 recordRecipeOf(cast<Instruction>( 8846 Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch()))); 8847 PhisToFix.push_back(PhiRecipe); 8848 } else { 8849 // TODO: record backedge value for remaining pointer induction phis. 
8850       assert(Phi->getType()->isPointerTy() &&
8851              "only pointer phis should be handled here");
8852       assert(Legal->getInductionVars().count(Phi) &&
8853              "Not an induction variable");
8854       InductionDescriptor II = Legal->getInductionVars().lookup(Phi);
8855       VPValue *Start = Plan->getOrAddVPValue(II.getStartValue());
8856       PhiRecipe = new VPWidenPHIRecipe(Phi, Start);
8857     }
8858 
8859     return toVPRecipeResult(PhiRecipe);
8860   }
8861 
8862   if (isa<TruncInst>(Instr) &&
8863       (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Operands,
8864                                                Range, *Plan)))
8865     return toVPRecipeResult(Recipe);
8866 
8867   if (!shouldWiden(Instr, Range))
8868     return nullptr;
8869 
8870   if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
8871     return toVPRecipeResult(new VPWidenGEPRecipe(
8872         GEP, make_range(Operands.begin(), Operands.end()), OrigLoop));
8873 
8874   if (auto *SI = dyn_cast<SelectInst>(Instr)) {
8875     bool InvariantCond =
8876         PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
8877     return toVPRecipeResult(new VPWidenSelectRecipe(
8878         *SI, make_range(Operands.begin(), Operands.end()), InvariantCond));
8879   }
8880 
8881   return toVPRecipeResult(tryToWiden(Instr, Operands));
8882 }
8883 
8884 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
8885                                                         ElementCount MaxVF) {
8886   assert(OrigLoop->isInnermost() && "Inner loop expected.");
8887 
8888   // Collect instructions from the original loop that will become trivially dead
8889   // in the vectorized loop. We don't need to vectorize these instructions. For
8890   // example, original induction update instructions can become dead because we
8891   // separately emit induction "steps" when generating code for the new loop.
8892   // Similarly, we create a new latch condition when setting up the structure
8893   // of the new loop, so the old one can become dead.
8894   SmallPtrSet<Instruction *, 4> DeadInstructions;
8895   collectTriviallyDeadInstructions(DeadInstructions);
8896 
8897   // Add assume instructions we need to drop to DeadInstructions, to prevent
8898   // them from being added to the VPlan.
8899   // TODO: We only need to drop assumes in blocks that get flattened. If the
8900   // control flow is preserved, we should keep them.
8901   auto &ConditionalAssumes = Legal->getConditionalAssumes();
8902   DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
8903 
8904   MapVector<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
8905   // Dead instructions do not need sinking. Remove them from SinkAfter.
8906   for (Instruction *I : DeadInstructions)
8907     SinkAfter.erase(I);
8908 
8909   // Cannot sink instructions after dead instructions (there won't be any
8910   // recipes for them). Instead, find the first non-dead previous instruction.
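  // E.g. (hypothetical names), if SinkAfter maps %use -> %iv.next and %iv.next
  // was just added to DeadInstructions, the loop below walks backwards from
  // %iv.next to the closest preceding live instruction and records that as the
  // new sink target instead.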
8911 for (auto &P : Legal->getSinkAfter()) { 8912 Instruction *SinkTarget = P.second; 8913 Instruction *FirstInst = &*SinkTarget->getParent()->begin(); 8914 (void)FirstInst; 8915 while (DeadInstructions.contains(SinkTarget)) { 8916 assert( 8917 SinkTarget != FirstInst && 8918 "Must find a live instruction (at least the one feeding the " 8919 "first-order recurrence PHI) before reaching beginning of the block"); 8920 SinkTarget = SinkTarget->getPrevNode(); 8921 assert(SinkTarget != P.first && 8922 "sink source equals target, no sinking required"); 8923 } 8924 P.second = SinkTarget; 8925 } 8926 8927 auto MaxVFPlusOne = MaxVF.getWithIncrement(1); 8928 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { 8929 VFRange SubRange = {VF, MaxVFPlusOne}; 8930 VPlans.push_back( 8931 buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter)); 8932 VF = SubRange.End; 8933 } 8934 } 8935 8936 // Add a VPCanonicalIVPHIRecipe starting at 0 to the header, a 8937 // CanonicalIVIncrement{NUW} VPInstruction to increment it by VF * UF and a 8938 // BranchOnCount VPInstruction to the latch. 8939 static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL, 8940 bool HasNUW, bool IsVPlanNative) { 8941 Value *StartIdx = ConstantInt::get(IdxTy, 0); 8942 auto *StartV = Plan.getOrAddVPValue(StartIdx); 8943 8944 auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL); 8945 VPRegionBlock *TopRegion = Plan.getVectorLoopRegion(); 8946 VPBasicBlock *Header = TopRegion->getEntryBasicBlock(); 8947 if (IsVPlanNative) 8948 Header = cast<VPBasicBlock>(Header->getSingleSuccessor()); 8949 Header->insert(CanonicalIVPHI, Header->begin()); 8950 8951 auto *CanonicalIVIncrement = 8952 new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementNUW 8953 : VPInstruction::CanonicalIVIncrement, 8954 {CanonicalIVPHI}, DL); 8955 CanonicalIVPHI->addOperand(CanonicalIVIncrement); 8956 8957 VPBasicBlock *EB = TopRegion->getExitBasicBlock(); 8958 if (IsVPlanNative) { 8959 EB = cast<VPBasicBlock>(EB->getSinglePredecessor()); 8960 EB->setCondBit(nullptr); 8961 } 8962 EB->appendRecipe(CanonicalIVIncrement); 8963 8964 auto *BranchOnCount = 8965 new VPInstruction(VPInstruction::BranchOnCount, 8966 {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL); 8967 EB->appendRecipe(BranchOnCount); 8968 } 8969 8970 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( 8971 VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions, 8972 const MapVector<Instruction *, Instruction *> &SinkAfter) { 8973 8974 SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups; 8975 8976 VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder); 8977 8978 // --------------------------------------------------------------------------- 8979 // Pre-construction: record ingredients whose recipes we'll need to further 8980 // process after constructing the initial VPlan. 8981 // --------------------------------------------------------------------------- 8982 8983 // Mark instructions we'll need to sink later and their targets as 8984 // ingredients whose recipe we'll need to record. 
8985   for (auto &Entry : SinkAfter) {
8986     RecipeBuilder.recordRecipeOf(Entry.first);
8987     RecipeBuilder.recordRecipeOf(Entry.second);
8988   }
8989   for (auto &Reduction : CM.getInLoopReductionChains()) {
8990     PHINode *Phi = Reduction.first;
8991     RecurKind Kind =
8992         Legal->getReductionVars().find(Phi)->second.getRecurrenceKind();
8993     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
8994 
8995     RecipeBuilder.recordRecipeOf(Phi);
8996     for (auto &R : ReductionOperations) {
8997       RecipeBuilder.recordRecipeOf(R);
8998       // For min/max reductions, where we have a pair of icmp/select, we also
8999       // need to record the ICmp recipe, so it can be removed later.
9000       assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
9001              "Only min/max recurrences allowed for inloop reductions");
9002       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
9003         RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0)));
9004     }
9005   }
9006 
9007   // For each interleave group which is relevant for this (possibly trimmed)
9008   // Range, add it to the set of groups to be later applied to the VPlan and add
9009   // placeholders for its members' Recipes which we'll be replacing with a
9010   // single VPInterleaveRecipe.
9011   for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
9012     auto applyIG = [IG, this](ElementCount VF) -> bool {
9013       return (VF.isVector() && // Query is illegal for VF == 1
9014               CM.getWideningDecision(IG->getInsertPos(), VF) ==
9015                   LoopVectorizationCostModel::CM_Interleave);
9016     };
9017     if (!getDecisionAndClampRange(applyIG, Range))
9018       continue;
9019     InterleaveGroups.insert(IG);
9020     for (unsigned i = 0; i < IG->getFactor(); i++)
9021       if (Instruction *Member = IG->getMember(i))
9022         RecipeBuilder.recordRecipeOf(Member);
9023   }
9024 
9025   // ---------------------------------------------------------------------------
9026   // Build initial VPlan: Scan the body of the loop in a topological order to
9027   // visit each basic block after having visited its predecessor basic blocks.
9028   // ---------------------------------------------------------------------------
9029 
9030   // Create initial VPlan skeleton, with separate header and latch blocks.
9031   VPBasicBlock *HeaderVPBB = new VPBasicBlock();
9032   VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch");
9033   VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB);
9034   auto *TopRegion = new VPRegionBlock(HeaderVPBB, LatchVPBB, "vector loop");
9035   auto Plan = std::make_unique<VPlan>(TopRegion);
9036 
9037   Instruction *DLInst =
9038       getDebugLocFromInstOrOperands(Legal->getPrimaryInduction());
9039   addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(),
9040                         DLInst ? DLInst->getDebugLoc() : DebugLoc(),
9041                         !CM.foldTailByMasking(), false);
9042 
9043   // Scan the body of the loop in a topological order to visit each basic block
9044   // after having visited its predecessor basic blocks.
9045   LoopBlocksDFS DFS(OrigLoop);
9046   DFS.perform(LI);
9047 
9048   VPBasicBlock *VPBB = HeaderVPBB;
9049   SmallVector<VPWidenIntOrFpInductionRecipe *> InductionsToMove;
9050   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
9051     // Relevant instructions from basic block BB will be grouped into VPRecipe
9052     // ingredients and fill a new VPBasicBlock.
9053     unsigned VPBBsForBB = 0;
9054     VPBB->setName(BB->getName());
9055     Builder.setInsertPoint(VPBB);
9056 
9057     // Introduce each ingredient into VPlan.
9058     // TODO: Model and preserve debug intrinsics in VPlan.
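    // Schematically, each remaining ingredient is handled below by first
    // trying tryToCreateWidenRecipe() (which may also fold it into an existing
    // VPValue) and, failing that, by handleReplication(), which emits a
    // uniform or (possibly predicated) VPReplicateRecipe. For instance, a
    // conditional scalarized store under tail folding may end up as a
    // predicated replicate recipe inside its own replicate region
    // (illustrative only).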
9059 for (Instruction &I : BB->instructionsWithoutDebug()) { 9060 Instruction *Instr = &I; 9061 9062 // First filter out irrelevant instructions, to ensure no recipes are 9063 // built for them. 9064 if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr)) 9065 continue; 9066 9067 SmallVector<VPValue *, 4> Operands; 9068 auto *Phi = dyn_cast<PHINode>(Instr); 9069 if (Phi && Phi->getParent() == OrigLoop->getHeader()) { 9070 Operands.push_back(Plan->getOrAddVPValue( 9071 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()))); 9072 } else { 9073 auto OpRange = Plan->mapToVPValues(Instr->operands()); 9074 Operands = {OpRange.begin(), OpRange.end()}; 9075 } 9076 if (auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe( 9077 Instr, Operands, Range, Plan)) { 9078 // If Instr can be simplified to an existing VPValue, use it. 9079 if (RecipeOrValue.is<VPValue *>()) { 9080 auto *VPV = RecipeOrValue.get<VPValue *>(); 9081 Plan->addVPValue(Instr, VPV); 9082 // If the re-used value is a recipe, register the recipe for the 9083 // instruction, in case the recipe for Instr needs to be recorded. 9084 if (auto *R = dyn_cast_or_null<VPRecipeBase>(VPV->getDef())) 9085 RecipeBuilder.setRecipe(Instr, R); 9086 continue; 9087 } 9088 // Otherwise, add the new recipe. 9089 VPRecipeBase *Recipe = RecipeOrValue.get<VPRecipeBase *>(); 9090 for (auto *Def : Recipe->definedValues()) { 9091 auto *UV = Def->getUnderlyingValue(); 9092 Plan->addVPValue(UV, Def); 9093 } 9094 9095 if (isa<VPWidenIntOrFpInductionRecipe>(Recipe) && 9096 HeaderVPBB->getFirstNonPhi() != VPBB->end()) { 9097 // Keep track of VPWidenIntOrFpInductionRecipes not in the phi section 9098 // of the header block. That can happen for truncates of induction 9099 // variables. Those recipes are moved to the phi section of the header 9100 // block after applying SinkAfter, which relies on the original 9101 // position of the trunc. 9102 assert(isa<TruncInst>(Instr)); 9103 InductionsToMove.push_back( 9104 cast<VPWidenIntOrFpInductionRecipe>(Recipe)); 9105 } 9106 RecipeBuilder.setRecipe(Instr, Recipe); 9107 VPBB->appendRecipe(Recipe); 9108 continue; 9109 } 9110 9111 // Otherwise, if all widening options failed, Instruction is to be 9112 // replicated. This may create a successor for VPBB. 9113 VPBasicBlock *NextVPBB = 9114 RecipeBuilder.handleReplication(Instr, Range, VPBB, Plan); 9115 if (NextVPBB != VPBB) { 9116 VPBB = NextVPBB; 9117 VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++) 9118 : ""); 9119 } 9120 } 9121 9122 VPBlockUtils::insertBlockAfter(new VPBasicBlock(), VPBB); 9123 VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor()); 9124 } 9125 9126 // Fold the last, empty block into its predecessor. 9127 VPBB = VPBlockUtils::tryToMergeBlockIntoPredecessor(VPBB); 9128 assert(VPBB && "expected to fold last (empty) block"); 9129 // After here, VPBB should not be used. 9130 VPBB = nullptr; 9131 9132 assert(isa<VPRegionBlock>(Plan->getEntry()) && 9133 !Plan->getEntry()->getEntryBasicBlock()->empty() && 9134 "entry block must be set to a VPRegionBlock having a non-empty entry " 9135 "VPBasicBlock"); 9136 RecipeBuilder.fixHeaderPhis(); 9137 9138 // --------------------------------------------------------------------------- 9139 // Transform initial VPlan: Apply previously taken decisions, in order, to 9140 // bring the VPlan to its final state. 9141 // --------------------------------------------------------------------------- 9142 9143 // Apply Sink-After legal constraints. 
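  // A typical source of sink-after constraints is a first-order recurrence
  // such as (an illustrative C loop, not taken from a test):
  //   for (i = 1; i < n; i++) { b[i] = x + a[i]; x = a[i]; }
  // Here the add uses the recurrence phi 'x' but appears before the update
  // 'x = a[i]'; its recipe must be moved after the update's recipe so that the
  // splice introduced further below can feed it. The code below performs these
  // moves on the VPlan, taking replicate regions into account.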
9144 auto GetReplicateRegion = [](VPRecipeBase *R) -> VPRegionBlock * { 9145 auto *Region = dyn_cast_or_null<VPRegionBlock>(R->getParent()->getParent()); 9146 if (Region && Region->isReplicator()) { 9147 assert(Region->getNumSuccessors() == 1 && 9148 Region->getNumPredecessors() == 1 && "Expected SESE region!"); 9149 assert(R->getParent()->size() == 1 && 9150 "A recipe in an original replicator region must be the only " 9151 "recipe in its block"); 9152 return Region; 9153 } 9154 return nullptr; 9155 }; 9156 for (auto &Entry : SinkAfter) { 9157 VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first); 9158 VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second); 9159 9160 auto *TargetRegion = GetReplicateRegion(Target); 9161 auto *SinkRegion = GetReplicateRegion(Sink); 9162 if (!SinkRegion) { 9163 // If the sink source is not a replicate region, sink the recipe directly. 9164 if (TargetRegion) { 9165 // The target is in a replication region, make sure to move Sink to 9166 // the block after it, not into the replication region itself. 9167 VPBasicBlock *NextBlock = 9168 cast<VPBasicBlock>(TargetRegion->getSuccessors().front()); 9169 Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi()); 9170 } else 9171 Sink->moveAfter(Target); 9172 continue; 9173 } 9174 9175 // The sink source is in a replicate region. Unhook the region from the CFG. 9176 auto *SinkPred = SinkRegion->getSinglePredecessor(); 9177 auto *SinkSucc = SinkRegion->getSingleSuccessor(); 9178 VPBlockUtils::disconnectBlocks(SinkPred, SinkRegion); 9179 VPBlockUtils::disconnectBlocks(SinkRegion, SinkSucc); 9180 VPBlockUtils::connectBlocks(SinkPred, SinkSucc); 9181 9182 if (TargetRegion) { 9183 // The target recipe is also in a replicate region, move the sink region 9184 // after the target region. 9185 auto *TargetSucc = TargetRegion->getSingleSuccessor(); 9186 VPBlockUtils::disconnectBlocks(TargetRegion, TargetSucc); 9187 VPBlockUtils::connectBlocks(TargetRegion, SinkRegion); 9188 VPBlockUtils::connectBlocks(SinkRegion, TargetSucc); 9189 } else { 9190 // The sink source is in a replicate region, we need to move the whole 9191 // replicate region, which should only contain a single recipe in the 9192 // main block. 9193 auto *SplitBlock = 9194 Target->getParent()->splitAt(std::next(Target->getIterator())); 9195 9196 auto *SplitPred = SplitBlock->getSinglePredecessor(); 9197 9198 VPBlockUtils::disconnectBlocks(SplitPred, SplitBlock); 9199 VPBlockUtils::connectBlocks(SplitPred, SinkRegion); 9200 VPBlockUtils::connectBlocks(SinkRegion, SplitBlock); 9201 } 9202 } 9203 9204 VPlanTransforms::removeRedundantInductionCasts(*Plan); 9205 9206 // Now that sink-after is done, move induction recipes for optimized truncates 9207 // to the phi section of the header block. 9208 for (VPWidenIntOrFpInductionRecipe *Ind : InductionsToMove) 9209 Ind->moveBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi()); 9210 9211 // Adjust the recipes for any inloop reductions. 9212 adjustRecipesForReductions(cast<VPBasicBlock>(TopRegion->getExit()), Plan, 9213 RecipeBuilder, Range.Start); 9214 9215 // Introduce a recipe to combine the incoming and previous values of a 9216 // first-order recurrence. 
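  // Schematically (a sketch, not the verbatim VPlan printout), for a
  // recurrence phi %p with backedge value %b the loop below creates
  //   %splice = first-order-splice(%p, %b)
  // right after %b's recipe (or after the phis of the enclosing block),
  // replaces all other users of %p with %splice, and keeps %p as the splice's
  // first operand. When executed this becomes a vector splice of the previous
  // iteration's values with the current ones.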
9217 for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) { 9218 auto *RecurPhi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R); 9219 if (!RecurPhi) 9220 continue; 9221 9222 VPRecipeBase *PrevRecipe = RecurPhi->getBackedgeRecipe(); 9223 VPBasicBlock *InsertBlock = PrevRecipe->getParent(); 9224 auto *Region = GetReplicateRegion(PrevRecipe); 9225 if (Region) 9226 InsertBlock = cast<VPBasicBlock>(Region->getSingleSuccessor()); 9227 if (Region || PrevRecipe->isPhi()) 9228 Builder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi()); 9229 else 9230 Builder.setInsertPoint(InsertBlock, std::next(PrevRecipe->getIterator())); 9231 9232 auto *RecurSplice = cast<VPInstruction>( 9233 Builder.createNaryOp(VPInstruction::FirstOrderRecurrenceSplice, 9234 {RecurPhi, RecurPhi->getBackedgeValue()})); 9235 9236 RecurPhi->replaceAllUsesWith(RecurSplice); 9237 // Set the first operand of RecurSplice to RecurPhi again, after replacing 9238 // all users. 9239 RecurSplice->setOperand(0, RecurPhi); 9240 } 9241 9242 // Interleave memory: for each Interleave Group we marked earlier as relevant 9243 // for this VPlan, replace the Recipes widening its memory instructions with a 9244 // single VPInterleaveRecipe at its insertion point. 9245 for (auto IG : InterleaveGroups) { 9246 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( 9247 RecipeBuilder.getRecipe(IG->getInsertPos())); 9248 SmallVector<VPValue *, 4> StoredValues; 9249 for (unsigned i = 0; i < IG->getFactor(); ++i) 9250 if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) { 9251 auto *StoreR = 9252 cast<VPWidenMemoryInstructionRecipe>(RecipeBuilder.getRecipe(SI)); 9253 StoredValues.push_back(StoreR->getStoredValue()); 9254 } 9255 9256 auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues, 9257 Recipe->getMask()); 9258 VPIG->insertBefore(Recipe); 9259 unsigned J = 0; 9260 for (unsigned i = 0; i < IG->getFactor(); ++i) 9261 if (Instruction *Member = IG->getMember(i)) { 9262 if (!Member->getType()->isVoidTy()) { 9263 VPValue *OriginalV = Plan->getVPValue(Member); 9264 Plan->removeVPValueFor(Member); 9265 Plan->addVPValue(Member, VPIG->getVPValue(J)); 9266 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J)); 9267 J++; 9268 } 9269 RecipeBuilder.getRecipe(Member)->eraseFromParent(); 9270 } 9271 } 9272 9273 // From this point onwards, VPlan-to-VPlan transformations may change the plan 9274 // in ways that accessing values using original IR values is incorrect. 9275 Plan->disableValue2VPValue(); 9276 9277 VPlanTransforms::sinkScalarOperands(*Plan); 9278 VPlanTransforms::mergeReplicateRegions(*Plan); 9279 9280 std::string PlanName; 9281 raw_string_ostream RSO(PlanName); 9282 ElementCount VF = Range.Start; 9283 Plan->addVF(VF); 9284 RSO << "Initial VPlan for VF={" << VF; 9285 for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) { 9286 Plan->addVF(VF); 9287 RSO << "," << VF; 9288 } 9289 RSO << "},UF>=1"; 9290 RSO.flush(); 9291 Plan->setName(PlanName); 9292 9293 // Fold Exit block into its predecessor if possible. 9294 // TODO: Fold block earlier once all VPlan transforms properly maintain a 9295 // VPBasicBlock as exit. 
9296   VPBlockUtils::tryToMergeBlockIntoPredecessor(TopRegion->getExit());
9297 
9298   assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid");
9299   return Plan;
9300 }
9301 
9302 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
9303   // Outer loop handling: outer loops may require CFG and instruction level
9304   // transformations before even evaluating whether vectorization is profitable.
9305   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
9306   // the vectorization pipeline.
9307   assert(!OrigLoop->isInnermost());
9308   assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
9309 
9310   // Create new empty VPlan
9311   auto Plan = std::make_unique<VPlan>();
9312 
9313   // Build hierarchical CFG
9314   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
9315   HCFGBuilder.buildHierarchicalCFG();
9316 
9317   for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End);
9318        VF *= 2)
9319     Plan->addVF(VF);
9320 
9321   if (EnableVPlanPredication) {
9322     VPlanPredicator VPP(*Plan);
9323     VPP.predicate();
9324 
9325     // Avoid running transformation to recipes until masked code generation in
9326     // VPlan-native path is in place.
9327     return Plan;
9328   }
9329 
9330   SmallPtrSet<Instruction *, 1> DeadInstructions;
9331   VPlanTransforms::VPInstructionsToVPRecipes(
9332       OrigLoop, Plan,
9333       [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); },
9334       DeadInstructions, *PSE.getSE());
9335 
9336   addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DebugLoc(),
9337                         true, true);
9338   return Plan;
9339 }
9340 
9341 // Adjust the recipes for reductions. For in-loop reductions the chain of
9342 // instructions leading from the loop exit instr to the phi needs to be converted
9343 // to reductions, with one operand being vector and the other being the scalar
9344 // reduction chain. For other reductions, a select is introduced between the phi
9345 // and live-out recipes when folding the tail.
9346 void LoopVectorizationPlanner::adjustRecipesForReductions(
9347     VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder,
9348     ElementCount MinVF) {
9349   for (auto &Reduction : CM.getInLoopReductionChains()) {
9350     PHINode *Phi = Reduction.first;
9351     const RecurrenceDescriptor &RdxDesc =
9352         Legal->getReductionVars().find(Phi)->second;
9353     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
9354 
9355     if (MinVF.isScalar() && !CM.useOrderedReductions(RdxDesc))
9356       continue;
9357 
9358     // ReductionOperations are ordered top-down from the phi's use to the
9359     // LoopExitValue. We keep track of the previous item (the Chain) to tell
9360     // which of the two operands will remain scalar and which will be reduced.
9361     // For minmax the chain will be the select instructions.
9362     Instruction *Chain = Phi;
9363     for (Instruction *R : ReductionOperations) {
9364       VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
9365       RecurKind Kind = RdxDesc.getRecurrenceKind();
9366 
9367       VPValue *ChainOp = Plan->getVPValue(Chain);
9368       unsigned FirstOpId;
9369       assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
9370              "Only min/max recurrences allowed for inloop reductions");
9371       // Recognize a call to the llvm.fmuladd intrinsic.
9372 bool IsFMulAdd = (Kind == RecurKind::FMulAdd); 9373 assert((!IsFMulAdd || RecurrenceDescriptor::isFMulAddIntrinsic(R)) && 9374 "Expected instruction to be a call to the llvm.fmuladd intrinsic"); 9375 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9376 assert(isa<VPWidenSelectRecipe>(WidenRecipe) && 9377 "Expected to replace a VPWidenSelectSC"); 9378 FirstOpId = 1; 9379 } else { 9380 assert((MinVF.isScalar() || isa<VPWidenRecipe>(WidenRecipe) || 9381 (IsFMulAdd && isa<VPWidenCallRecipe>(WidenRecipe))) && 9382 "Expected to replace a VPWidenSC"); 9383 FirstOpId = 0; 9384 } 9385 unsigned VecOpId = 9386 R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId; 9387 VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId)); 9388 9389 auto *CondOp = CM.foldTailByMasking() 9390 ? RecipeBuilder.createBlockInMask(R->getParent(), Plan) 9391 : nullptr; 9392 9393 if (IsFMulAdd) { 9394 // If the instruction is a call to the llvm.fmuladd intrinsic then we 9395 // need to create an fmul recipe to use as the vector operand for the 9396 // fadd reduction. 9397 VPInstruction *FMulRecipe = new VPInstruction( 9398 Instruction::FMul, {VecOp, Plan->getVPValue(R->getOperand(1))}); 9399 FMulRecipe->setFastMathFlags(R->getFastMathFlags()); 9400 WidenRecipe->getParent()->insert(FMulRecipe, 9401 WidenRecipe->getIterator()); 9402 VecOp = FMulRecipe; 9403 } 9404 VPReductionRecipe *RedRecipe = 9405 new VPReductionRecipe(&RdxDesc, R, ChainOp, VecOp, CondOp, TTI); 9406 WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe); 9407 Plan->removeVPValueFor(R); 9408 Plan->addVPValue(R, RedRecipe); 9409 WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator()); 9410 WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe); 9411 WidenRecipe->eraseFromParent(); 9412 9413 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9414 VPRecipeBase *CompareRecipe = 9415 RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0))); 9416 assert(isa<VPWidenRecipe>(CompareRecipe) && 9417 "Expected to replace a VPWidenSC"); 9418 assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 && 9419 "Expected no remaining users"); 9420 CompareRecipe->eraseFromParent(); 9421 } 9422 Chain = R; 9423 } 9424 } 9425 9426 // If tail is folded by masking, introduce selects between the phi 9427 // and the live-out instruction of each reduction, at the beginning of the 9428 // dedicated latch block. 
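  // Schematically (a sketch), for a reduction phi %rdx.phi with loop value
  // %rdx.next and header mask %mask this adds, at the start of the latch
  // block:
  //   %rdx.sel = select %mask, %rdx.next, %rdx.phi
  // so that lanes masked off by tail folding carry the previous partial
  // result into the final reduction.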
9429 if (CM.foldTailByMasking()) { 9430 Builder.setInsertPoint(LatchVPBB, LatchVPBB->begin()); 9431 for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) { 9432 VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R); 9433 if (!PhiR || PhiR->isInLoop()) 9434 continue; 9435 VPValue *Cond = 9436 RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan); 9437 VPValue *Red = PhiR->getBackedgeValue(); 9438 assert(cast<VPRecipeBase>(Red->getDef())->getParent() != LatchVPBB && 9439 "reduction recipe must be defined before latch"); 9440 Builder.createNaryOp(Instruction::Select, {Cond, Red, PhiR}); 9441 } 9442 } 9443 } 9444 9445 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 9446 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, 9447 VPSlotTracker &SlotTracker) const { 9448 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; 9449 IG->getInsertPos()->printAsOperand(O, false); 9450 O << ", "; 9451 getAddr()->printAsOperand(O, SlotTracker); 9452 VPValue *Mask = getMask(); 9453 if (Mask) { 9454 O << ", "; 9455 Mask->printAsOperand(O, SlotTracker); 9456 } 9457 9458 unsigned OpIdx = 0; 9459 for (unsigned i = 0; i < IG->getFactor(); ++i) { 9460 if (!IG->getMember(i)) 9461 continue; 9462 if (getNumStoreOperands() > 0) { 9463 O << "\n" << Indent << " store "; 9464 getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker); 9465 O << " to index " << i; 9466 } else { 9467 O << "\n" << Indent << " "; 9468 getVPValue(OpIdx)->printAsOperand(O, SlotTracker); 9469 O << " = load from index " << i; 9470 } 9471 ++OpIdx; 9472 } 9473 } 9474 #endif 9475 9476 void VPWidenCallRecipe::execute(VPTransformState &State) { 9477 State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this, 9478 *this, State); 9479 } 9480 9481 void VPWidenSelectRecipe::execute(VPTransformState &State) { 9482 auto &I = *cast<SelectInst>(getUnderlyingInstr()); 9483 State.ILV->setDebugLocFromInst(&I); 9484 9485 // The condition can be loop invariant but still defined inside the 9486 // loop. This means that we can't just use the original 'cond' value. 9487 // We have to take the 'vectorized' value and pick the first lane. 9488 // Instcombine will make this a no-op. 9489 auto *InvarCond = 9490 InvariantCond ? State.get(getOperand(0), VPIteration(0, 0)) : nullptr; 9491 9492 for (unsigned Part = 0; Part < State.UF; ++Part) { 9493 Value *Cond = InvarCond ? 
InvarCond : State.get(getOperand(0), Part); 9494 Value *Op0 = State.get(getOperand(1), Part); 9495 Value *Op1 = State.get(getOperand(2), Part); 9496 Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1); 9497 State.set(this, Sel, Part); 9498 State.ILV->addMetadata(Sel, &I); 9499 } 9500 } 9501 9502 void VPWidenRecipe::execute(VPTransformState &State) { 9503 auto &I = *cast<Instruction>(getUnderlyingValue()); 9504 auto &Builder = State.Builder; 9505 switch (I.getOpcode()) { 9506 case Instruction::Call: 9507 case Instruction::Br: 9508 case Instruction::PHI: 9509 case Instruction::GetElementPtr: 9510 case Instruction::Select: 9511 llvm_unreachable("This instruction is handled by a different recipe."); 9512 case Instruction::UDiv: 9513 case Instruction::SDiv: 9514 case Instruction::SRem: 9515 case Instruction::URem: 9516 case Instruction::Add: 9517 case Instruction::FAdd: 9518 case Instruction::Sub: 9519 case Instruction::FSub: 9520 case Instruction::FNeg: 9521 case Instruction::Mul: 9522 case Instruction::FMul: 9523 case Instruction::FDiv: 9524 case Instruction::FRem: 9525 case Instruction::Shl: 9526 case Instruction::LShr: 9527 case Instruction::AShr: 9528 case Instruction::And: 9529 case Instruction::Or: 9530 case Instruction::Xor: { 9531 // Just widen unops and binops. 9532 State.ILV->setDebugLocFromInst(&I); 9533 9534 for (unsigned Part = 0; Part < State.UF; ++Part) { 9535 SmallVector<Value *, 2> Ops; 9536 for (VPValue *VPOp : operands()) 9537 Ops.push_back(State.get(VPOp, Part)); 9538 9539 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); 9540 9541 if (auto *VecOp = dyn_cast<Instruction>(V)) { 9542 VecOp->copyIRFlags(&I); 9543 9544 // If the instruction is vectorized and was in a basic block that needed 9545 // predication, we can't propagate poison-generating flags (nuw/nsw, 9546 // exact, etc.). The control flow has been linearized and the 9547 // instruction is no longer guarded by the predicate, which could make 9548 // the flag properties to no longer hold. 9549 if (State.MayGeneratePoisonRecipes.contains(this)) 9550 VecOp->dropPoisonGeneratingFlags(); 9551 } 9552 9553 // Use this vector value for all users of the original instruction. 9554 State.set(this, V, Part); 9555 State.ILV->addMetadata(V, &I); 9556 } 9557 9558 break; 9559 } 9560 case Instruction::ICmp: 9561 case Instruction::FCmp: { 9562 // Widen compares. Generate vector compares. 9563 bool FCmp = (I.getOpcode() == Instruction::FCmp); 9564 auto *Cmp = cast<CmpInst>(&I); 9565 State.ILV->setDebugLocFromInst(Cmp); 9566 for (unsigned Part = 0; Part < State.UF; ++Part) { 9567 Value *A = State.get(getOperand(0), Part); 9568 Value *B = State.get(getOperand(1), Part); 9569 Value *C = nullptr; 9570 if (FCmp) { 9571 // Propagate fast math flags. 
9572 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 9573 Builder.setFastMathFlags(Cmp->getFastMathFlags()); 9574 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); 9575 } else { 9576 C = Builder.CreateICmp(Cmp->getPredicate(), A, B); 9577 } 9578 State.set(this, C, Part); 9579 State.ILV->addMetadata(C, &I); 9580 } 9581 9582 break; 9583 } 9584 9585 case Instruction::ZExt: 9586 case Instruction::SExt: 9587 case Instruction::FPToUI: 9588 case Instruction::FPToSI: 9589 case Instruction::FPExt: 9590 case Instruction::PtrToInt: 9591 case Instruction::IntToPtr: 9592 case Instruction::SIToFP: 9593 case Instruction::UIToFP: 9594 case Instruction::Trunc: 9595 case Instruction::FPTrunc: 9596 case Instruction::BitCast: { 9597 auto *CI = cast<CastInst>(&I); 9598 State.ILV->setDebugLocFromInst(CI); 9599 9600 /// Vectorize casts. 9601 Type *DestTy = (State.VF.isScalar()) 9602 ? CI->getType() 9603 : VectorType::get(CI->getType(), State.VF); 9604 9605 for (unsigned Part = 0; Part < State.UF; ++Part) { 9606 Value *A = State.get(getOperand(0), Part); 9607 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); 9608 State.set(this, Cast, Part); 9609 State.ILV->addMetadata(Cast, &I); 9610 } 9611 break; 9612 } 9613 default: 9614 // This instruction is not vectorized by simple widening. 9615 LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); 9616 llvm_unreachable("Unhandled instruction!"); 9617 } // end of switch. 9618 } 9619 9620 void VPWidenGEPRecipe::execute(VPTransformState &State) { 9621 auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr()); 9622 // Construct a vector GEP by widening the operands of the scalar GEP as 9623 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP 9624 // results in a vector of pointers when at least one operand of the GEP 9625 // is vector-typed. Thus, to keep the representation compact, we only use 9626 // vector-typed operands for loop-varying values. 9627 9628 if (State.VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { 9629 // If we are vectorizing, but the GEP has only loop-invariant operands, 9630 // the GEP we build (by only using vector-typed operands for 9631 // loop-varying values) would be a scalar pointer. Thus, to ensure we 9632 // produce a vector of pointers, we need to either arbitrarily pick an 9633 // operand to broadcast, or broadcast a clone of the original GEP. 9634 // Here, we broadcast a clone of the original. 9635 // 9636 // TODO: If at some point we decide to scalarize instructions having 9637 // loop-invariant operands, this special case will no longer be 9638 // required. We would add the scalarization decision to 9639 // collectLoopScalars() and teach getVectorValue() to broadcast 9640 // the lane-zero scalar value. 9641 auto *Clone = State.Builder.Insert(GEP->clone()); 9642 for (unsigned Part = 0; Part < State.UF; ++Part) { 9643 Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, Clone); 9644 State.set(this, EntryPart, Part); 9645 State.ILV->addMetadata(EntryPart, GEP); 9646 } 9647 } else { 9648 // If the GEP has at least one loop-varying operand, we are sure to 9649 // produce a vector of pointers. But if we are only unrolling, we want 9650 // to produce a scalar GEP for each unroll part. Thus, the GEP we 9651 // produce with the code below will be scalar (if VF == 1) or vector 9652 // (otherwise). Note that for the unroll-only case, we still maintain 9653 // values in the vector mapping with initVector, as we do for other 9654 // instructions. 
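// For example (an illustrative case, not taken from the source): with VF = 4,
// a loop-varying index operand is a 4-element vector, so the GEP created below
// yields a vector of 4 pointers; with VF = 1 and UF = 2, the loop simply emits
// two scalar GEPs, one per unroll part.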
9655 for (unsigned Part = 0; Part < State.UF; ++Part) { 9656 // The pointer operand of the new GEP. If it's loop-invariant, we 9657 // won't broadcast it. 9658 auto *Ptr = IsPtrLoopInvariant 9659 ? State.get(getOperand(0), VPIteration(0, 0)) 9660 : State.get(getOperand(0), Part); 9661 9662 // Collect all the indices for the new GEP. If any index is 9663 // loop-invariant, we won't broadcast it. 9664 SmallVector<Value *, 4> Indices; 9665 for (unsigned I = 1, E = getNumOperands(); I < E; I++) { 9666 VPValue *Operand = getOperand(I); 9667 if (IsIndexLoopInvariant[I - 1]) 9668 Indices.push_back(State.get(Operand, VPIteration(0, 0))); 9669 else 9670 Indices.push_back(State.get(Operand, Part)); 9671 } 9672 9673 // If the GEP instruction is vectorized and was in a basic block that 9674 // needed predication, we can't propagate the poison-generating 'inbounds' 9675 // flag. The control flow has been linearized and the GEP is no longer 9676 // guarded by the predicate, which could make the 'inbounds' properties to 9677 // no longer hold. 9678 bool IsInBounds = 9679 GEP->isInBounds() && State.MayGeneratePoisonRecipes.count(this) == 0; 9680 9681 // Create the new GEP. Note that this GEP may be a scalar if VF == 1, 9682 // but it should be a vector, otherwise. 9683 auto *NewGEP = IsInBounds 9684 ? State.Builder.CreateInBoundsGEP( 9685 GEP->getSourceElementType(), Ptr, Indices) 9686 : State.Builder.CreateGEP(GEP->getSourceElementType(), 9687 Ptr, Indices); 9688 assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) && 9689 "NewGEP is not a pointer vector"); 9690 State.set(this, NewGEP, Part); 9691 State.ILV->addMetadata(NewGEP, GEP); 9692 } 9693 } 9694 } 9695 9696 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { 9697 assert(!State.Instance && "Int or FP induction being replicated."); 9698 auto *CanonicalIV = State.get(getParent()->getPlan()->getCanonicalIV(), 0); 9699 State.ILV->widenIntOrFpInduction(IV, this, State, CanonicalIV); 9700 } 9701 9702 void VPWidenPHIRecipe::execute(VPTransformState &State) { 9703 State.ILV->widenPHIInstruction(cast<PHINode>(getUnderlyingValue()), this, 9704 State); 9705 } 9706 9707 void VPBlendRecipe::execute(VPTransformState &State) { 9708 State.ILV->setDebugLocFromInst(Phi, &State.Builder); 9709 // We know that all PHIs in non-header blocks are converted into 9710 // selects, so we don't have to worry about the insertion order and we 9711 // can just use the builder. 9712 // At this point we generate the predication tree. There may be 9713 // duplications since this is a simple recursive scan, but future 9714 // optimizations will clean it up. 9715 9716 unsigned NumIncoming = getNumIncomingValues(); 9717 9718 // Generate a sequence of selects of the form: 9719 // SELECT(Mask3, In3, 9720 // SELECT(Mask2, In2, 9721 // SELECT(Mask1, In1, 9722 // In0))) 9723 // Note that Mask0 is never used: lanes for which no path reaches this phi and 9724 // are essentially undef are taken from In0. 9725 InnerLoopVectorizer::VectorParts Entry(State.UF); 9726 for (unsigned In = 0; In < NumIncoming; ++In) { 9727 for (unsigned Part = 0; Part < State.UF; ++Part) { 9728 // We might have single edge PHIs (blocks) - use an identity 9729 // 'select' for the first PHI operand. 9730 Value *In0 = State.get(getIncomingValue(In), Part); 9731 if (In == 0) 9732 Entry[Part] = In0; // Initialize with the first incoming value. 9733 else { 9734 // Select between the current value and the previous incoming edge 9735 // based on the incoming mask. 
9736 Value *Cond = State.get(getMask(In), Part); 9737 Entry[Part] = 9738 State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi"); 9739 } 9740 } 9741 } 9742 for (unsigned Part = 0; Part < State.UF; ++Part) 9743 State.set(this, Entry[Part], Part); 9744 } 9745 9746 void VPInterleaveRecipe::execute(VPTransformState &State) { 9747 assert(!State.Instance && "Interleave group being replicated."); 9748 State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(), 9749 getStoredValues(), getMask()); 9750 } 9751 9752 void VPReductionRecipe::execute(VPTransformState &State) { 9753 assert(!State.Instance && "Reduction being replicated."); 9754 Value *PrevInChain = State.get(getChainOp(), 0); 9755 RecurKind Kind = RdxDesc->getRecurrenceKind(); 9756 bool IsOrdered = State.ILV->useOrderedReductions(*RdxDesc); 9757 // Propagate the fast-math flags carried by the underlying instruction. 9758 IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder); 9759 State.Builder.setFastMathFlags(RdxDesc->getFastMathFlags()); 9760 for (unsigned Part = 0; Part < State.UF; ++Part) { 9761 Value *NewVecOp = State.get(getVecOp(), Part); 9762 if (VPValue *Cond = getCondOp()) { 9763 Value *NewCond = State.get(Cond, Part); 9764 VectorType *VecTy = cast<VectorType>(NewVecOp->getType()); 9765 Value *Iden = RdxDesc->getRecurrenceIdentity( 9766 Kind, VecTy->getElementType(), RdxDesc->getFastMathFlags()); 9767 Value *IdenVec = 9768 State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden); 9769 Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec); 9770 NewVecOp = Select; 9771 } 9772 Value *NewRed; 9773 Value *NextInChain; 9774 if (IsOrdered) { 9775 if (State.VF.isVector()) 9776 NewRed = createOrderedReduction(State.Builder, *RdxDesc, NewVecOp, 9777 PrevInChain); 9778 else 9779 NewRed = State.Builder.CreateBinOp( 9780 (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), PrevInChain, 9781 NewVecOp); 9782 PrevInChain = NewRed; 9783 } else { 9784 PrevInChain = State.get(getChainOp(), Part); 9785 NewRed = createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp); 9786 } 9787 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9788 NextInChain = 9789 createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(), 9790 NewRed, PrevInChain); 9791 } else if (IsOrdered) 9792 NextInChain = NewRed; 9793 else 9794 NextInChain = State.Builder.CreateBinOp( 9795 (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), NewRed, 9796 PrevInChain); 9797 State.set(this, NextInChain, Part); 9798 } 9799 } 9800 9801 void VPReplicateRecipe::execute(VPTransformState &State) { 9802 if (State.Instance) { // Generate a single instance. 9803 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector"); 9804 State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *State.Instance, 9805 IsPredicated, State); 9806 // Insert scalar instance packing it into a vector. 9807 if (AlsoPack && State.VF.isVector()) { 9808 // If we're constructing lane 0, initialize to start from poison. 
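// (Lanes other than the first reuse the vector value created here, inserting
// their scalar result into it via packScalarIntoVectorValue below.)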
9809 if (State.Instance->Lane.isFirstLane()) {
9810 assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
9811 Value *Poison = PoisonValue::get(
9812 VectorType::get(getUnderlyingValue()->getType(), State.VF));
9813 State.set(this, Poison, State.Instance->Part);
9814 }
9815 State.ILV->packScalarIntoVectorValue(this, *State.Instance, State);
9816 }
9817 return;
9818 }
9819
9820 // Generate scalar instances for all VF lanes of all UF parts, unless the
9821 // instruction is uniform, in which case generate only the first lane for each
9822 // of the UF parts.
9823 unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();
9824 assert((!State.VF.isScalable() || IsUniform) &&
9825 "Can't scalarize a scalable vector");
9826 for (unsigned Part = 0; Part < State.UF; ++Part)
9827 for (unsigned Lane = 0; Lane < EndLane; ++Lane)
9828 State.ILV->scalarizeInstruction(getUnderlyingInstr(), this,
9829 VPIteration(Part, Lane), IsPredicated,
9830 State);
9831 }
9832
9833 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
9834 assert(State.Instance && "Branch on Mask works only on single instance.");
9835
9836 unsigned Part = State.Instance->Part;
9837 unsigned Lane = State.Instance->Lane.getKnownLane();
9838
9839 Value *ConditionBit = nullptr;
9840 VPValue *BlockInMask = getMask();
9841 if (BlockInMask) {
9842 ConditionBit = State.get(BlockInMask, Part);
9843 if (ConditionBit->getType()->isVectorTy())
9844 ConditionBit = State.Builder.CreateExtractElement(
9845 ConditionBit, State.Builder.getInt32(Lane));
9846 } else // Block in mask is all-one.
9847 ConditionBit = State.Builder.getTrue();
9848
9849 // Replace the temporary unreachable terminator with a new conditional branch,
9850 // whose two destinations will be set later when they are created.
9851 auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
9852 assert(isa<UnreachableInst>(CurrentTerminator) &&
9853 "Expected to replace unreachable terminator with conditional branch.");
9854 auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
9855 CondBr->setSuccessor(0, nullptr);
9856 ReplaceInstWithInst(CurrentTerminator, CondBr);
9857 }
9858
9859 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
9860 assert(State.Instance && "Predicated instruction PHI works per instance.");
9861 Instruction *ScalarPredInst =
9862 cast<Instruction>(State.get(getOperand(0), *State.Instance));
9863 BasicBlock *PredicatedBB = ScalarPredInst->getParent();
9864 BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
9865 assert(PredicatingBB && "Predicated block has no single predecessor.");
9866 assert(isa<VPReplicateRecipe>(getOperand(0)) &&
9867 "operand must be VPReplicateRecipe");
9868
9869 // By current pack/unpack logic we need to generate only a single phi node: if
9870 // a vector value for the predicated instruction exists at this point it means
9871 // the instruction has vector users only, and a phi for the vector value is
9872 // needed. In this case the recipe of the predicated instruction is marked to
9873 // also do that packing, thereby "hoisting" the insert-element sequence.
9874 // Otherwise, a phi node for the scalar value is needed.
9875 unsigned Part = State.Instance->Part; 9876 if (State.hasVectorValue(getOperand(0), Part)) { 9877 Value *VectorValue = State.get(getOperand(0), Part); 9878 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); 9879 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); 9880 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. 9881 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. 9882 if (State.hasVectorValue(this, Part)) 9883 State.reset(this, VPhi, Part); 9884 else 9885 State.set(this, VPhi, Part); 9886 // NOTE: Currently we need to update the value of the operand, so the next 9887 // predicated iteration inserts its generated value in the correct vector. 9888 State.reset(getOperand(0), VPhi, Part); 9889 } else { 9890 Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType(); 9891 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); 9892 Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()), 9893 PredicatingBB); 9894 Phi->addIncoming(ScalarPredInst, PredicatedBB); 9895 if (State.hasScalarValue(this, *State.Instance)) 9896 State.reset(this, Phi, *State.Instance); 9897 else 9898 State.set(this, Phi, *State.Instance); 9899 // NOTE: Currently we need to update the value of the operand, so the next 9900 // predicated iteration inserts its generated value in the correct vector. 9901 State.reset(getOperand(0), Phi, *State.Instance); 9902 } 9903 } 9904 9905 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { 9906 VPValue *StoredValue = isStore() ? getStoredValue() : nullptr; 9907 9908 // Attempt to issue a wide load. 9909 LoadInst *LI = dyn_cast<LoadInst>(&Ingredient); 9910 StoreInst *SI = dyn_cast<StoreInst>(&Ingredient); 9911 9912 assert((LI || SI) && "Invalid Load/Store instruction"); 9913 assert((!SI || StoredValue) && "No stored value provided for widened store"); 9914 assert((!LI || !StoredValue) && "Stored value provided for widened load"); 9915 9916 Type *ScalarDataTy = getLoadStoreType(&Ingredient); 9917 9918 auto *DataTy = VectorType::get(ScalarDataTy, State.VF); 9919 const Align Alignment = getLoadStoreAlignment(&Ingredient); 9920 bool CreateGatherScatter = !Consecutive; 9921 9922 auto &Builder = State.Builder; 9923 InnerLoopVectorizer::VectorParts BlockInMaskParts(State.UF); 9924 bool isMaskRequired = getMask(); 9925 if (isMaskRequired) 9926 for (unsigned Part = 0; Part < State.UF; ++Part) 9927 BlockInMaskParts[Part] = State.get(getMask(), Part); 9928 9929 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { 9930 // Calculate the pointer for the specific unroll-part. 9931 GetElementPtrInst *PartPtr = nullptr; 9932 9933 bool InBounds = false; 9934 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) 9935 InBounds = gep->isInBounds(); 9936 if (Reverse) { 9937 // If the address is consecutive but reversed, then the 9938 // wide store needs to start at the last vector element. 
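// As a fixed-width illustration (not from the source): with VF = 4 and
// Part = 1, the offsets computed below are RunTimeVF = 4, NumElt = -4 and
// LastLane = -3, so the part pointer ends up at Ptr[-7] and the wide access
// covers Ptr[-7]..Ptr[-4], matching the reversed scalar accesses.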
9939 // RunTimeVF = VScale * VF.getKnownMinValue() 9940 // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue() 9941 Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), State.VF); 9942 // NumElt = -Part * RunTimeVF 9943 Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF); 9944 // LastLane = 1 - RunTimeVF 9945 Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF); 9946 PartPtr = 9947 cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt)); 9948 PartPtr->setIsInBounds(InBounds); 9949 PartPtr = cast<GetElementPtrInst>( 9950 Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane)); 9951 PartPtr->setIsInBounds(InBounds); 9952 if (isMaskRequired) // Reverse of a null all-one mask is a null mask. 9953 BlockInMaskParts[Part] = 9954 Builder.CreateVectorReverse(BlockInMaskParts[Part], "reverse"); 9955 } else { 9956 Value *Increment = 9957 createStepForVF(Builder, Builder.getInt32Ty(), State.VF, Part); 9958 PartPtr = cast<GetElementPtrInst>( 9959 Builder.CreateGEP(ScalarDataTy, Ptr, Increment)); 9960 PartPtr->setIsInBounds(InBounds); 9961 } 9962 9963 unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); 9964 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); 9965 }; 9966 9967 // Handle Stores: 9968 if (SI) { 9969 State.ILV->setDebugLocFromInst(SI); 9970 9971 for (unsigned Part = 0; Part < State.UF; ++Part) { 9972 Instruction *NewSI = nullptr; 9973 Value *StoredVal = State.get(StoredValue, Part); 9974 if (CreateGatherScatter) { 9975 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 9976 Value *VectorGep = State.get(getAddr(), Part); 9977 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, 9978 MaskPart); 9979 } else { 9980 if (Reverse) { 9981 // If we store to reverse consecutive memory locations, then we need 9982 // to reverse the order of elements in the stored value. 9983 StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse"); 9984 // We don't want to update the value in the map as it might be used in 9985 // another expression. So don't call resetVectorValue(StoredVal). 9986 } 9987 auto *VecPtr = 9988 CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0))); 9989 if (isMaskRequired) 9990 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, 9991 BlockInMaskParts[Part]); 9992 else 9993 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); 9994 } 9995 State.ILV->addMetadata(NewSI, SI); 9996 } 9997 return; 9998 } 9999 10000 // Handle loads. 10001 assert(LI && "Must have a load instruction"); 10002 State.ILV->setDebugLocFromInst(LI); 10003 for (unsigned Part = 0; Part < State.UF; ++Part) { 10004 Value *NewLI; 10005 if (CreateGatherScatter) { 10006 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 10007 Value *VectorGep = State.get(getAddr(), Part); 10008 NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart, 10009 nullptr, "wide.masked.gather"); 10010 State.ILV->addMetadata(NewLI, LI); 10011 } else { 10012 auto *VecPtr = 10013 CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0))); 10014 if (isMaskRequired) 10015 NewLI = Builder.CreateMaskedLoad( 10016 DataTy, VecPtr, Alignment, BlockInMaskParts[Part], 10017 PoisonValue::get(DataTy), "wide.masked.load"); 10018 else 10019 NewLI = 10020 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); 10021 10022 // Add metadata to the load, but setVectorValue to the reverse shuffle. 
10023 State.ILV->addMetadata(NewLI, LI); 10024 if (Reverse) 10025 NewLI = Builder.CreateVectorReverse(NewLI, "reverse"); 10026 } 10027 10028 State.set(this, NewLI, Part); 10029 } 10030 } 10031 10032 // Determine how to lower the scalar epilogue, which depends on 1) optimising 10033 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing 10034 // predication, and 4) a TTI hook that analyses whether the loop is suitable 10035 // for predication. 10036 static ScalarEpilogueLowering getScalarEpilogueLowering( 10037 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, 10038 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, 10039 AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, 10040 LoopVectorizationLegality &LVL) { 10041 // 1) OptSize takes precedence over all other options, i.e. if this is set, 10042 // don't look at hints or options, and don't request a scalar epilogue. 10043 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from 10044 // LoopAccessInfo (due to code dependency and not being able to reliably get 10045 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection 10046 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without 10047 // versioning when the vectorization is forced, unlike hasOptSize. So revert 10048 // back to the old way and vectorize with versioning when forced. See D81345.) 10049 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, 10050 PGSOQueryType::IRPass) && 10051 Hints.getForce() != LoopVectorizeHints::FK_Enabled)) 10052 return CM_ScalarEpilogueNotAllowedOptSize; 10053 10054 // 2) If set, obey the directives 10055 if (PreferPredicateOverEpilogue.getNumOccurrences()) { 10056 switch (PreferPredicateOverEpilogue) { 10057 case PreferPredicateTy::ScalarEpilogue: 10058 return CM_ScalarEpilogueAllowed; 10059 case PreferPredicateTy::PredicateElseScalarEpilogue: 10060 return CM_ScalarEpilogueNotNeededUsePredicate; 10061 case PreferPredicateTy::PredicateOrDontVectorize: 10062 return CM_ScalarEpilogueNotAllowedUsePredicate; 10063 }; 10064 } 10065 10066 // 3) If set, obey the hints 10067 switch (Hints.getPredicate()) { 10068 case LoopVectorizeHints::FK_Enabled: 10069 return CM_ScalarEpilogueNotNeededUsePredicate; 10070 case LoopVectorizeHints::FK_Disabled: 10071 return CM_ScalarEpilogueAllowed; 10072 }; 10073 10074 // 4) if the TTI hook indicates this is profitable, request predication. 10075 if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, 10076 LVL.getLAI())) 10077 return CM_ScalarEpilogueNotNeededUsePredicate; 10078 10079 return CM_ScalarEpilogueAllowed; 10080 } 10081 10082 Value *VPTransformState::get(VPValue *Def, unsigned Part) { 10083 // If Values have been set for this Def return the one relevant for \p Part. 10084 if (hasVectorValue(Def, Part)) 10085 return Data.PerPartOutput[Def][Part]; 10086 10087 if (!hasScalarValue(Def, {Part, 0})) { 10088 Value *IRV = Def->getLiveInIRValue(); 10089 Value *B = ILV->getBroadcastInstrs(IRV); 10090 set(Def, B, Part); 10091 return B; 10092 } 10093 10094 Value *ScalarValue = get(Def, {Part, 0}); 10095 // If we aren't vectorizing, we can just copy the scalar map values over 10096 // to the vector map. 10097 if (VF.isScalar()) { 10098 set(Def, ScalarValue, Part); 10099 return ScalarValue; 10100 } 10101 10102 auto *RepR = dyn_cast<VPReplicateRecipe>(Def); 10103 bool IsUniform = RepR && RepR->isUniform(); 10104 10105 unsigned LastLane = IsUniform ? 
0 : VF.getKnownMinValue() - 1; 10106 // Check if there is a scalar value for the selected lane. 10107 if (!hasScalarValue(Def, {Part, LastLane})) { 10108 // At the moment, VPWidenIntOrFpInductionRecipes can also be uniform. 10109 assert(isa<VPWidenIntOrFpInductionRecipe>(Def->getDef()) && 10110 "unexpected recipe found to be invariant"); 10111 IsUniform = true; 10112 LastLane = 0; 10113 } 10114 10115 auto *LastInst = cast<Instruction>(get(Def, {Part, LastLane})); 10116 // Set the insert point after the last scalarized instruction or after the 10117 // last PHI, if LastInst is a PHI. This ensures the insertelement sequence 10118 // will directly follow the scalar definitions. 10119 auto OldIP = Builder.saveIP(); 10120 auto NewIP = 10121 isa<PHINode>(LastInst) 10122 ? BasicBlock::iterator(LastInst->getParent()->getFirstNonPHI()) 10123 : std::next(BasicBlock::iterator(LastInst)); 10124 Builder.SetInsertPoint(&*NewIP); 10125 10126 // However, if we are vectorizing, we need to construct the vector values. 10127 // If the value is known to be uniform after vectorization, we can just 10128 // broadcast the scalar value corresponding to lane zero for each unroll 10129 // iteration. Otherwise, we construct the vector values using 10130 // insertelement instructions. Since the resulting vectors are stored in 10131 // State, we will only generate the insertelements once. 10132 Value *VectorValue = nullptr; 10133 if (IsUniform) { 10134 VectorValue = ILV->getBroadcastInstrs(ScalarValue); 10135 set(Def, VectorValue, Part); 10136 } else { 10137 // Initialize packing with insertelements to start from undef. 10138 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 10139 Value *Undef = PoisonValue::get(VectorType::get(LastInst->getType(), VF)); 10140 set(Def, Undef, Part); 10141 for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane) 10142 ILV->packScalarIntoVectorValue(Def, {Part, Lane}, *this); 10143 VectorValue = get(Def, Part); 10144 } 10145 Builder.restoreIP(OldIP); 10146 return VectorValue; 10147 } 10148 10149 // Process the loop in the VPlan-native vectorization path. This path builds 10150 // VPlan upfront in the vectorization pipeline, which allows to apply 10151 // VPlan-to-VPlan transformations from the very beginning without modifying the 10152 // input LLVM IR. 10153 static bool processLoopInVPlanNativePath( 10154 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, 10155 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, 10156 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, 10157 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, 10158 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints, 10159 LoopVectorizationRequirements &Requirements) { 10160 10161 if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) { 10162 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n"); 10163 return false; 10164 } 10165 assert(EnableVPlanNativePath && "VPlan-native path is disabled."); 10166 Function *F = L->getHeader()->getParent(); 10167 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); 10168 10169 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 10170 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL); 10171 10172 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, 10173 &Hints, IAI); 10174 // Use the planner for outer loop vectorization. 10175 // TODO: CM is not used at this point inside the planner. 
Turn CM into an
10176 // optional argument if we don't need it in the future.
10177 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints,
10178 Requirements, ORE);
10179
10180 // Get user vectorization factor.
10181 ElementCount UserVF = Hints.getWidth();
10182
10183 CM.collectElementTypesForWidening();
10184
10185 // Plan how to best vectorize, return the best VF and its cost.
10186 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
10187
10188 // If we are stress testing VPlan builds, do not attempt to generate vector
10189 // code. Masked vector code generation support will follow soon.
10190 // Also, do not attempt to vectorize if no vector code will be produced.
10191 if (VPlanBuildStressTest || EnableVPlanPredication ||
10192 VectorizationFactor::Disabled() == VF)
10193 return false;
10194
10195 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
10196
10197 {
10198 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
10199 F->getParent()->getDataLayout());
10200 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
10201 &CM, BFI, PSI, Checks);
10202 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
10203 << L->getHeader()->getParent()->getName() << "\"\n");
10204 LVP.executePlan(VF.Width, 1, BestPlan, LB, DT);
10205 }
10206
10207 // Mark the loop as already vectorized to avoid vectorizing again.
10208 Hints.setAlreadyVectorized();
10209 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
10210 return true;
10211 }
10212
10213 // Emit a remark if there are stores to floats that required a floating point
10214 // extension. If the vectorized loop was generated with floating point there
10215 // will be a performance penalty from the conversion overhead and the change in
10216 // the vector width.
10217 static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
10218 SmallVector<Instruction *, 4> Worklist;
10219 for (BasicBlock *BB : L->getBlocks()) {
10220 for (Instruction &Inst : *BB) {
10221 if (auto *S = dyn_cast<StoreInst>(&Inst)) {
10222 if (S->getValueOperand()->getType()->isFloatTy())
10223 Worklist.push_back(S);
10224 }
10225 }
10226 }
10227
10228 // Traverse the floating point stores upwards, searching for floating point
10229 // conversions.
10230 SmallPtrSet<const Instruction *, 4> Visited;
10231 SmallPtrSet<const Instruction *, 4> EmittedRemark;
10232 while (!Worklist.empty()) {
10233 auto *I = Worklist.pop_back_val();
10234 if (!L->contains(I))
10235 continue;
10236 if (!Visited.insert(I).second)
10237 continue;
10238
10239 // Emit a remark if the floating point store required a floating
10240 // point conversion.
10241 // TODO: More work could be done to identify the root cause such as a
10242 // constant or a function return type and point the user to it.
10243 if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second)
10244 ORE->emit([&]() {
10245 return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
10246 I->getDebugLoc(), L->getHeader())
10247 << "floating point conversion changes vector width. 
" 10248 << "Mixed floating point precision requires an up/down " 10249 << "cast that will negatively impact performance."; 10250 }); 10251 10252 for (Use &Op : I->operands()) 10253 if (auto *OpI = dyn_cast<Instruction>(Op)) 10254 Worklist.push_back(OpI); 10255 } 10256 } 10257 10258 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts) 10259 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced || 10260 !EnableLoopInterleaving), 10261 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced || 10262 !EnableLoopVectorization) {} 10263 10264 bool LoopVectorizePass::processLoop(Loop *L) { 10265 assert((EnableVPlanNativePath || L->isInnermost()) && 10266 "VPlan-native path is not enabled. Only process inner loops."); 10267 10268 #ifndef NDEBUG 10269 const std::string DebugLocStr = getDebugLocString(L); 10270 #endif /* NDEBUG */ 10271 10272 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \"" 10273 << L->getHeader()->getParent()->getName() << "\" from " 10274 << DebugLocStr << "\n"); 10275 10276 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI); 10277 10278 LLVM_DEBUG( 10279 dbgs() << "LV: Loop hints:" 10280 << " force=" 10281 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled 10282 ? "disabled" 10283 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled 10284 ? "enabled" 10285 : "?")) 10286 << " width=" << Hints.getWidth() 10287 << " interleave=" << Hints.getInterleave() << "\n"); 10288 10289 // Function containing loop 10290 Function *F = L->getHeader()->getParent(); 10291 10292 // Looking at the diagnostic output is the only way to determine if a loop 10293 // was vectorized (other than looking at the IR or machine code), so it 10294 // is important to generate an optimization remark for each loop. Most of 10295 // these messages are generated as OptimizationRemarkAnalysis. Remarks 10296 // generated as OptimizationRemark and OptimizationRemarkMissed are 10297 // less verbose reporting vectorized loops and unvectorized loops that may 10298 // benefit from vectorization, respectively. 10299 10300 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) { 10301 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); 10302 return false; 10303 } 10304 10305 PredicatedScalarEvolution PSE(*SE, *L); 10306 10307 // Check if it is legal to vectorize the loop. 10308 LoopVectorizationRequirements Requirements; 10309 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE, 10310 &Requirements, &Hints, DB, AC, BFI, PSI); 10311 if (!LVL.canVectorize(EnableVPlanNativePath)) { 10312 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); 10313 Hints.emitRemarkWithHints(); 10314 return false; 10315 } 10316 10317 // Check the function attributes and profiles to find out if this function 10318 // should be optimized for size. 10319 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 10320 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL); 10321 10322 // Entrance to the VPlan-native vectorization path. Outer loops are processed 10323 // here. They may require CFG and instruction level transformations before 10324 // even evaluating whether vectorization is profitable. Since we cannot modify 10325 // the incoming IR, we need to build VPlan upfront in the vectorization 10326 // pipeline. 
10327 if (!L->isInnermost()) 10328 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, 10329 ORE, BFI, PSI, Hints, Requirements); 10330 10331 assert(L->isInnermost() && "Inner loop expected."); 10332 10333 // Check the loop for a trip count threshold: vectorize loops with a tiny trip 10334 // count by optimizing for size, to minimize overheads. 10335 auto ExpectedTC = getSmallBestKnownTC(*SE, L); 10336 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) { 10337 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " 10338 << "This loop is worth vectorizing only if no scalar " 10339 << "iteration overheads are incurred."); 10340 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) 10341 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); 10342 else { 10343 LLVM_DEBUG(dbgs() << "\n"); 10344 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; 10345 } 10346 } 10347 10348 // Check the function attributes to see if implicit floats are allowed. 10349 // FIXME: This check doesn't seem possibly correct -- what if the loop is 10350 // an integer loop and the vector instructions selected are purely integer 10351 // vector instructions? 10352 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { 10353 reportVectorizationFailure( 10354 "Can't vectorize when the NoImplicitFloat attribute is used", 10355 "loop not vectorized due to NoImplicitFloat attribute", 10356 "NoImplicitFloat", ORE, L); 10357 Hints.emitRemarkWithHints(); 10358 return false; 10359 } 10360 10361 // Check if the target supports potentially unsafe FP vectorization. 10362 // FIXME: Add a check for the type of safety issue (denormal, signaling) 10363 // for the target we're vectorizing for, to make sure none of the 10364 // additional fp-math flags can help. 10365 if (Hints.isPotentiallyUnsafe() && 10366 TTI->isFPVectorizationPotentiallyUnsafe()) { 10367 reportVectorizationFailure( 10368 "Potentially unsafe FP op prevents vectorization", 10369 "loop not vectorized due to unsafe FP support.", 10370 "UnsafeFP", ORE, L); 10371 Hints.emitRemarkWithHints(); 10372 return false; 10373 } 10374 10375 bool AllowOrderedReductions; 10376 // If the flag is set, use that instead and override the TTI behaviour. 10377 if (ForceOrderedReductions.getNumOccurrences() > 0) 10378 AllowOrderedReductions = ForceOrderedReductions; 10379 else 10380 AllowOrderedReductions = TTI->enableOrderedReductions(); 10381 if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) { 10382 ORE->emit([&]() { 10383 auto *ExactFPMathInst = Requirements.getExactFPInst(); 10384 return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps", 10385 ExactFPMathInst->getDebugLoc(), 10386 ExactFPMathInst->getParent()) 10387 << "loop not vectorized: cannot prove it is safe to reorder " 10388 "floating-point operations"; 10389 }); 10390 LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to " 10391 "reorder floating-point operations\n"); 10392 Hints.emitRemarkWithHints(); 10393 return false; 10394 } 10395 10396 bool UseInterleaved = TTI->enableInterleavedAccessVectorization(); 10397 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI()); 10398 10399 // If an override option has been passed in for interleaved accesses, use it. 10400 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0) 10401 UseInterleaved = EnableInterleavedMemAccesses; 10402 10403 // Analyze interleaved memory accesses. 
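// For example (illustration only): loads of a[2*i] and a[2*i+1] in the same
// iteration may be combined into a single interleave group with factor 2.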
10404 if (UseInterleaved) {
10405 IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
10406 }
10407
10408 // Use the cost model.
10409 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
10410 F, &Hints, IAI);
10411 CM.collectValuesToIgnore();
10412 CM.collectElementTypesForWidening();
10413
10414 // Use the planner for vectorization.
10415 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints,
10416 Requirements, ORE);
10417
10418 // Get user vectorization factor and interleave count.
10419 ElementCount UserVF = Hints.getWidth();
10420 unsigned UserIC = Hints.getInterleave();
10421
10422 // Plan how to best vectorize, return the best VF and its cost.
10423 Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);
10424
10425 VectorizationFactor VF = VectorizationFactor::Disabled();
10426 unsigned IC = 1;
10427
10428 if (MaybeVF) {
10429 VF = *MaybeVF;
10430 // Select the interleave count.
10431 IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue());
10432 }
10433
10434 // Identify the diagnostic messages that should be produced.
10435 std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
10436 bool VectorizeLoop = true, InterleaveLoop = true;
10437 if (VF.Width.isScalar()) {
10438 LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
10439 VecDiagMsg = std::make_pair(
10440 "VectorizationNotBeneficial",
10441 "the cost-model indicates that vectorization is not beneficial");
10442 VectorizeLoop = false;
10443 }
10444
10445 if (!MaybeVF && UserIC > 1) {
10446 // Tell the user interleaving was avoided up-front, despite being explicitly
10447 // requested.
10448 LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
10449 "interleaving should be avoided up front\n");
10450 IntDiagMsg = std::make_pair(
10451 "InterleavingAvoided",
10452 "Ignoring UserIC, because interleaving was avoided up front");
10453 InterleaveLoop = false;
10454 } else if (IC == 1 && UserIC <= 1) {
10455 // Tell the user interleaving is not beneficial.
10456 LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
10457 IntDiagMsg = std::make_pair(
10458 "InterleavingNotBeneficial",
10459 "the cost-model indicates that interleaving is not beneficial");
10460 InterleaveLoop = false;
10461 if (UserIC == 1) {
10462 IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
10463 IntDiagMsg.second +=
10464 " and is explicitly disabled or interleave count is set to 1";
10465 }
10466 } else if (IC > 1 && UserIC == 1) {
10467 // Tell the user interleaving is beneficial, but it is explicitly disabled.
10468 LLVM_DEBUG(
10469 dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
10470 IntDiagMsg = std::make_pair(
10471 "InterleavingBeneficialButDisabled",
10472 "the cost-model indicates that interleaving is beneficial "
10473 "but is explicitly disabled or interleave count is set to 1");
10474 InterleaveLoop = false;
10475 }
10476
10477 // Override IC if user provided an interleave count.
10478 IC = UserIC > 0 ? UserIC : IC;
10479
10480 // Emit diagnostic messages, if any.
10481 const char *VAPassName = Hints.vectorizeAnalysisPassName();
10482 if (!VectorizeLoop && !InterleaveLoop) {
10483 // Do not vectorize or interleave the loop.
10484 ORE->emit([&]() { 10485 return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first, 10486 L->getStartLoc(), L->getHeader()) 10487 << VecDiagMsg.second; 10488 }); 10489 ORE->emit([&]() { 10490 return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first, 10491 L->getStartLoc(), L->getHeader()) 10492 << IntDiagMsg.second; 10493 }); 10494 return false; 10495 } else if (!VectorizeLoop && InterleaveLoop) { 10496 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 10497 ORE->emit([&]() { 10498 return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first, 10499 L->getStartLoc(), L->getHeader()) 10500 << VecDiagMsg.second; 10501 }); 10502 } else if (VectorizeLoop && !InterleaveLoop) { 10503 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width 10504 << ") in " << DebugLocStr << '\n'); 10505 ORE->emit([&]() { 10506 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first, 10507 L->getStartLoc(), L->getHeader()) 10508 << IntDiagMsg.second; 10509 }); 10510 } else if (VectorizeLoop && InterleaveLoop) { 10511 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width 10512 << ") in " << DebugLocStr << '\n'); 10513 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 10514 } 10515 10516 bool DisableRuntimeUnroll = false; 10517 MDNode *OrigLoopID = L->getLoopID(); 10518 { 10519 // Optimistically generate runtime checks. Drop them if they turn out to not 10520 // be profitable. Limit the scope of Checks, so the cleanup happens 10521 // immediately after vector codegeneration is done. 10522 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, 10523 F->getParent()->getDataLayout()); 10524 if (!VF.Width.isScalar() || IC > 1) 10525 Checks.Create(L, *LVL.getLAI(), PSE.getUnionPredicate()); 10526 10527 using namespace ore; 10528 if (!VectorizeLoop) { 10529 assert(IC > 1 && "interleave count should not be 1 or 0"); 10530 // If we decided that it is not legal to vectorize the loop, then 10531 // interleave it. 10532 InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, 10533 &CM, BFI, PSI, Checks); 10534 10535 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); 10536 LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT); 10537 10538 ORE->emit([&]() { 10539 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(), 10540 L->getHeader()) 10541 << "interleaved loop (interleaved count: " 10542 << NV("InterleaveCount", IC) << ")"; 10543 }); 10544 } else { 10545 // If we decided that it is *legal* to vectorize the loop, then do it. 10546 10547 // Consider vectorizing the epilogue too if it's profitable. 10548 VectorizationFactor EpilogueVF = 10549 CM.selectEpilogueVectorizationFactor(VF.Width, LVP); 10550 if (EpilogueVF.Width.isVector()) { 10551 10552 // The first pass vectorizes the main loop and creates a scalar epilogue 10553 // to be vectorized by executing the plan (potentially with a different 10554 // factor) again shortly afterwards. 10555 EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1); 10556 EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, 10557 EPI, &LVL, &CM, BFI, PSI, Checks); 10558 10559 VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF); 10560 LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV, 10561 DT); 10562 ++LoopsVectorized; 10563 10564 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */); 10565 formLCSSARecursively(*L, *DT, LI, SE); 10566 10567 // Second pass vectorizes the epilogue and adjusts the control flow 10568 // edges from the first pass. 
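// Repointing MainLoopVF/MainLoopUF at the epilogue values below makes the
// second executePlan invocation vectorize the remaining iterations at the
// (typically smaller) epilogue factor, e.g. a main loop at VF = 8 followed by
// an epilogue at VF = 4 (illustrative numbers).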
10569 EPI.MainLoopVF = EPI.EpilogueVF;
10570 EPI.MainLoopUF = EPI.EpilogueUF;
10571 EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
10572 ORE, EPI, &LVL, &CM, BFI, PSI,
10573 Checks);
10574
10575 VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF);
10576 LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
10577 DT);
10578 ++LoopsEpilogueVectorized;
10579
10580 if (!MainILV.areSafetyChecksAdded())
10581 DisableRuntimeUnroll = true;
10582 } else {
10583 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
10584 &LVL, &CM, BFI, PSI, Checks);
10585
10586 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
10587 LVP.executePlan(VF.Width, IC, BestPlan, LB, DT);
10588 ++LoopsVectorized;
10589
10590 // Add metadata to disable runtime unrolling a scalar loop when there
10591 // are no runtime checks about strides and memory. A scalar loop that is
10592 // rarely used is not worth unrolling.
10593 if (!LB.areSafetyChecksAdded())
10594 DisableRuntimeUnroll = true;
10595 }
10596 // Report the vectorization decision.
10597 ORE->emit([&]() {
10598 return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
10599 L->getHeader())
10600 << "vectorized loop (vectorization width: "
10601 << NV("VectorizationFactor", VF.Width)
10602 << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
10603 });
10604 }
10605
10606 if (ORE->allowExtraAnalysis(LV_NAME))
10607 checkMixedPrecision(L, ORE);
10608 }
10609
10610 Optional<MDNode *> RemainderLoopID =
10611 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
10612 LLVMLoopVectorizeFollowupEpilogue});
10613 if (RemainderLoopID.hasValue()) {
10614 L->setLoopID(RemainderLoopID.getValue());
10615 } else {
10616 if (DisableRuntimeUnroll)
10617 AddRuntimeUnrollDisableMetaData(L);
10618
10619 // Mark the loop as already vectorized to avoid vectorizing again.
10620 Hints.setAlreadyVectorized();
10621 }
10622
10623 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
10624 return true;
10625 }
10626
10627 LoopVectorizeResult LoopVectorizePass::runImpl(
10628 Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
10629 DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
10630 DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_,
10631 std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
10632 OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
10633 SE = &SE_;
10634 LI = &LI_;
10635 TTI = &TTI_;
10636 DT = &DT_;
10637 BFI = &BFI_;
10638 TLI = TLI_;
10639 AA = &AA_;
10640 AC = &AC_;
10641 GetLAA = &GetLAA_;
10642 DB = &DB_;
10643 ORE = &ORE_;
10644 PSI = PSI_;
10645
10646 // Don't attempt if
10647 // 1. the target claims to have no vector registers, and
10648 // 2. interleaving won't help ILP.
10649 //
10650 // The second condition is necessary because, even if the target has no
10651 // vector registers, loop vectorization may still enable scalar
10652 // interleaving.
10653 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
10654 TTI->getMaxInterleaveFactor(1) < 2)
10655 return LoopVectorizeResult(false, false);
10656
10657 bool Changed = false, CFGChanged = false;
10658
10659 // The vectorizer requires loops to be in simplified form.
10660 // Since simplification may add new inner loops, it has to run before the
10661 // legality and profitability checks. This means running the loop vectorizer
10662 // will simplify all loops, regardless of whether anything ends up being
10663 // vectorized.
10664 for (auto &L : *LI)
10665 Changed |= CFGChanged |=
10666 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
10667
10668 // Build up a worklist of inner-loops to vectorize. This is necessary as
10669 // the act of vectorizing or partially unrolling a loop creates new loops
10670 // and can invalidate iterators across the loops.
10671 SmallVector<Loop *, 8> Worklist;
10672
10673 for (Loop *L : *LI)
10674 collectSupportedLoops(*L, LI, ORE, Worklist);
10675
10676 LoopsAnalyzed += Worklist.size();
10677
10678 // Now walk the identified inner loops.
10679 while (!Worklist.empty()) {
10680 Loop *L = Worklist.pop_back_val();
10681
10682 // For the inner loops we actually process, form LCSSA to simplify the
10683 // transform.
10684 Changed |= formLCSSARecursively(*L, *DT, LI, SE);
10685
10686 Changed |= CFGChanged |= processLoop(L);
10687 }
10688
10689 // Process each loop nest in the function.
10690 return LoopVectorizeResult(Changed, CFGChanged);
10691 }
10692
10693 PreservedAnalyses LoopVectorizePass::run(Function &F,
10694 FunctionAnalysisManager &AM) {
10695 auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
10696 auto &LI = AM.getResult<LoopAnalysis>(F);
10697 auto &TTI = AM.getResult<TargetIRAnalysis>(F);
10698 auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
10699 auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
10700 auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
10701 auto &AA = AM.getResult<AAManager>(F);
10702 auto &AC = AM.getResult<AssumptionAnalysis>(F);
10703 auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
10704 auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
10705
10706 auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
10707 std::function<const LoopAccessInfo &(Loop &)> GetLAA =
10708 [&](Loop &L) -> const LoopAccessInfo & {
10709 LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE,
10710 TLI, TTI, nullptr, nullptr, nullptr};
10711 return LAM.getResult<LoopAccessAnalysis>(L, AR);
10712 };
10713 auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
10714 ProfileSummaryInfo *PSI =
10715 MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
10716 LoopVectorizeResult Result =
10717 runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
10718 if (!Result.MadeAnyChange)
10719 return PreservedAnalyses::all();
10720 PreservedAnalyses PA;
10721
10722 // We currently do not preserve loopinfo/dominator analyses with outer loop
10723 // vectorization. Until this is addressed, mark these analyses as preserved
10724 // only for non-VPlan-native path.
10725 // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
10726 if (!EnableVPlanNativePath) {
10727 PA.preserve<LoopAnalysis>();
10728 PA.preserve<DominatorTreeAnalysis>();
10729 }
10730
10731 if (Result.MadeCFGChange) {
10732 // Making CFG changes likely means a loop got vectorized. Indicate that
10733 // extra simplification passes should be run.
10734 // TODO: MadeCFGChanges is not a perfect proxy. Extra passes should only
10735 // be run if runtime checks have been added.
10736 AM.getResult<ShouldRunExtraVectorPasses>(F); 10737 PA.preserve<ShouldRunExtraVectorPasses>(); 10738 } else { 10739 PA.preserveSet<CFGAnalyses>(); 10740 } 10741 return PA; 10742 } 10743 10744 void LoopVectorizePass::printPipeline( 10745 raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) { 10746 static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline( 10747 OS, MapClassName2PassName); 10748 10749 OS << "<"; 10750 OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;"; 10751 OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;"; 10752 OS << ">"; 10753 } 10754
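// With the default options this is expected to print a pipeline entry along
// the lines of "loop-vectorize<no-interleave-forced-only;no-vectorize-forced-only>"
// (illustrative output; the pass-name prefix is produced by the PassInfoMixin
// base via MapClassName2PassName).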