//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
//
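// For example (schematic only, not taken from any particular input), with a
// fixed vectorization factor VF=4 a scalar loop such as
//   for (int i = 0; i < n; ++i)
//     A[i] = B[i] + 42;
// is conceptually rewritten so that each vector iteration computes A[i..i+3]
// with vector instructions and increments i by 4; left-over iterations are
// handled by a scalar epilogue loop or folded into the vector body via
// predication.
//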
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//   D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//   Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//   Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
//   Data for SIMD
//
// Other ideas/concepts are from:
//   A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//   S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//   Vectorizing Compilers.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "LoopVectorizationPlanner.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanHCFGBuilder.h"
#include "VPlanPredicator.h"
#include "VPlanTransforms.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/InstructionCost.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <functional>
#include <iterator>
#include <limits>
#include <memory>
#include <string>
#include <tuple>
#include <utility>

using namespace llvm;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

#ifndef NDEBUG
const char VerboseDebug[] = DEBUG_TYPE "-verbose";
#endif

/// @{
/// Metadata attribute names
const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
const char LLVMLoopVectorizeFollowupVectorized[] =
    "llvm.loop.vectorize.followup_vectorized";
const char LLVMLoopVectorizeFollowupEpilogue[] =
    "llvm.loop.vectorize.followup_epilogue";
/// @}

STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");

static cl::opt<bool> EnableEpilogueVectorization(
    "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
    cl::desc("Enable vectorization of epilogue loops."));

static cl::opt<unsigned> EpilogueVectorizationForceVF(
    "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
    cl::desc("When epilogue vectorization is enabled, and a value greater than "
             "1 is specified, forces the given VF for all applicable epilogue "
             "loops."));

static cl::opt<unsigned> EpilogueVectorizationMinVF(
    "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
    cl::desc("Only loops with vectorization factor equal to or larger than "
             "the specified value are considered for epilogue vectorization."));

/// Loops with a known constant trip count below this number are vectorized
/// only if no scalar iteration overheads are incurred.
static cl::opt<unsigned> TinyTripCountVectorThreshold(
    "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
    cl::desc("Loops with a constant trip count that is smaller than this "
             "value are vectorized only if no scalar iteration overheads "
             "are incurred."));

static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold(
    "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
    cl::desc("The maximum allowed number of runtime memory checks with a "
             "vectorize(enable) pragma."));

// Option prefer-predicate-over-epilogue indicates that an epilogue is
// undesired, that predication is preferred, and this lists all options. I.e.,
// the vectorizer will try to fold the tail-loop (epilogue) into the vector body
// and predicate the instructions accordingly. If tail-folding fails, there are
// different fallback strategies depending on these values:
namespace PreferPredicateTy {
enum Option {
  ScalarEpilogue = 0,
  PredicateElseScalarEpilogue,
  PredicateOrDontVectorize
};
} // namespace PreferPredicateTy

static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
    "prefer-predicate-over-epilogue",
    cl::init(PreferPredicateTy::ScalarEpilogue),
    cl::Hidden,
    cl::desc("Tail-folding and predication preferences over creating a scalar "
             "epilogue loop."),
    cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
                          "scalar-epilogue",
                          "Don't tail-predicate loops, create scalar epilogue"),
               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
                          "predicate-else-scalar-epilogue",
                          "prefer tail-folding, create scalar epilogue if tail "
                          "folding fails."),
               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
                          "predicate-dont-vectorize",
                          "prefer tail-folding, don't attempt vectorization if "
                          "tail-folding fails.")));

static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting the vectorization factor, "
             "which will be determined by the smallest type in the loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in "
             "a loop"));

static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
             "below this number"));

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

static cl::opt<bool> ForceTargetSupportsScalableVectors(
    "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
    cl::desc(
        "Pretend that scalable vectors are supported, even if the target does "
        "not support them. This flag should only be used for testing."));

static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// Interleave small loops with scalar reductions.
static cl::opt<bool> InterleaveSmallLoopScalarReduction(
    "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
    cl::desc("Enable interleaving for loops with small iteration counts that "
             "contain scalar reductions to expose ILP."));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

static cl::opt<bool>
    PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
                           cl::Hidden,
                           cl::desc("Prefer in-loop vector reductions, "
                                    "overriding the target's preference."));

static cl::opt<bool> ForceOrderedReductions(
    "force-ordered-reductions", cl::init(false), cl::Hidden,
    cl::desc("Enable the vectorisation of loops with in-order (strict) "
             "FP reductions"));

static cl::opt<bool> PreferPredicatedReductionSelect(
    "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
    cl::desc(
        "Prefer predicating a reduction operation over an after loop select."));

cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));

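// Note (illustrative only, not part of the original comments): the
// VPlan-native path is aimed at explicit outer-loop vectorization, e.g. a
// nest such as
//   #pragma clang loop vectorize(enable)   // placed on the *outer* loop
//   for (int i = 0; i < n; ++i)
//     for (int j = 0; j < m; ++j)
//       A[i][j] += B[i][j];
// and can be exercised with something like
//   opt -passes=loop-vectorize -enable-vplan-native-path ...
// whereas the default path only vectorizes innermost loops.
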
// FIXME: Remove this switch once we have divergence analysis. Currently we
// assume divergent non-backedge branches when this switch is true.
cl::opt<bool> EnableVPlanPredication(
    "enable-vplan-predication", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path predicator with "
             "support for outer loop vectorization."));

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

cl::opt<bool> PrintVPlansInDotFormat(
    "vplan-print-in-dot-format", cl::init(false), cl::Hidden,
    cl::desc("Use dot format instead of plain text when dumping VPlans"));

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type.
static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
  // Determine if an array of N elements of type Ty is "bitcast compatible"
  // with a <N x Ty> vector.
  // This is only true if there is no padding between the array elements.
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}

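// Illustrative example (not from the original source): with a typical x86_64
// data layout, x86_fp80 has a type size of 80 bits but an alloc size of 128
// bits, so hasIrregularType() returns true for it and an array of x86_fp80 is
// not bitcast-compatible with <N x x86_fp80>; types such as i32 or double
// have no padding and are regular.
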
/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

/// Returns the "best known" trip count for the specified loop \p L as defined
/// by the following procedure:
///   1) Returns the exact trip count if it is known.
///   2) Returns the expected trip count according to profile data, if any.
///   3) Returns an upper bound estimate if it is known.
///   4) Returns None if all of the above failed.
static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
  // Check if exact trip count is known.
  if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
    return ExpectedTC;

  // Check if there is an expected trip count available from profile data.
  if (LoopVectorizeWithBlockFrequency)
    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
      return EstimatedTC;

  // Check if upper bound estimate is known.
  if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
    return ExpectedTC;

  return None;
}

// Forward declare GeneratedRTChecks.
class GeneratedRTChecks;

namespace llvm {

AnalysisKey ShouldRunExtraVectorPasses::Key;

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found to a given vectorization factor.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                      ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
        PSI(PSI), RTChecks(RTChecks) {
    // Query this against the original loop and save it here because the
    // profile of the original loop header may change as the transformation
    // happens.
    OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
        OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
  }

  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop that will contain vectorized instructions later
  /// on, while the old loop will be used as the scalar remainder. Control flow
  /// is generated around the vectorized (and scalar epilogue) loops consisting
  /// of various checks and bypasses. Return the pre-header block of the new
  /// loop and the start value for the canonical induction, if it is != 0. The
  /// latter is the case when vectorizing the epilogue loop. In the case of
  /// epilogue vectorization, this function is overridden to handle the more
  /// complex control flow around the loops.
  virtual std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton();

  /// Widen a single call instruction within the innermost loop.
  void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
                            VPTransformState &State);

  /// Fix the vectorized code, taking care of header phi's, live-outs, and
  /// more.
  void fixVectorizedLoop(VPTransformState &State);

  // Return true if any runtime check is added.
  bool areSafetyChecksAdded() { return AddedSafetyChecks; }

  /// A type for vectorized values in the new loop. Each value from the
  /// original loop, when vectorized, is represented by UF vector values in the
  /// new unrolled loop, where UF is the unroll factor.
  using VectorParts = SmallVector<Value *, 2>;

  /// Vectorize a single first-order recurrence or pointer induction PHINode in
  /// a block. This method handles the induction variable canonicalization. It
  /// supports both VF = 1 for unrolled loops and arbitrary length vectors.
  void widenPHIInstruction(Instruction *PN, VPWidenPHIRecipe *PhiR,
                           VPTransformState &State);

  /// A helper function to scalarize a single Instruction in the innermost
  /// loop. Generates a sequence of scalar instances for each lane between \p
  /// MinLane and \p MaxLane, times each part between \p MinPart and \p
  /// MaxPart, inclusive. Uses the VPValue operands from \p RepRecipe instead
  /// of \p Instr's operands.
  void scalarizeInstruction(Instruction *Instr, VPReplicateRecipe *RepRecipe,
                            const VPIteration &Instance, bool IfPredicateInstr,
                            VPTransformState &State);

  /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
  /// is provided, the integer induction variable will first be truncated to
  /// the corresponding type. \p CanonicalIV is the scalar value generated for
  /// the canonical induction variable.
  void widenIntOrFpInduction(PHINode *IV, VPWidenIntOrFpInductionRecipe *Def,
                             VPTransformState &State, Value *CanonicalIV);

  /// Construct the vector value of a scalarized value \p V one lane at a time.
  void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance,
                                 VPTransformState &State);

  /// Try to vectorize interleaved access group \p Group with the base address
  /// given in \p Addr, optionally masking the vector operations if \p
  /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
  /// values in the vectorized loop.
  void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
                                ArrayRef<VPValue *> VPDefs,
                                VPTransformState &State, VPValue *Addr,
                                ArrayRef<VPValue *> StoredValues,
                                VPValue *BlockInMask = nullptr);

  /// Set the debug location in the builder \p Ptr using the debug location in
  /// \p V. If \p Ptr is None then it uses the class member's Builder.
  void setDebugLocFromInst(const Value *V,
                           Optional<IRBuilder<> *> CustomBuilder = None);

  /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
  void fixNonInductionPHIs(VPTransformState &State);

  /// Returns true if the reordering of FP operations is not allowed, but we
  /// are able to vectorize with strict in-order reductions for the given
  /// RdxDesc.
  bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc);

  /// Create a broadcast instruction. This method generates a broadcast
  /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, ...
  /// this is needed because each iteration in the loop corresponds to a SIMD
  /// element.
  virtual Value *getBroadcastInstrs(Value *V);

  /// Add metadata from one instruction to another.
  ///
  /// This includes both the original MDs from \p From and additional ones
  /// (\see addNewMetadata). Use this for *newly created* instructions in the
  /// vector loop.
  void addMetadata(Instruction *To, Instruction *From);

  /// Similar to the previous function but it adds the metadata to a
  /// vector of instructions.
  void addMetadata(ArrayRef<Value *> To, Instruction *From);

protected:
  friend class LoopVectorizationPlanner;

  /// A small list of PHINodes.
  using PhiVector = SmallVector<PHINode *, 4>;

  /// A type for scalarized values in the new loop. Each value from the
  /// original loop, when scalarized, is represented by UF x VF scalar values
  /// in the new unrolled loop, where UF is the unroll factor and VF is the
  /// vectorization factor.
  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;

  /// Set up the values of the IVs correctly when exiting the vector loop.
  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                    Value *CountRoundDown, Value *EndValue,
                    BasicBlock *MiddleBlock);

  /// Introduce a conditional branch (on true, condition to be set later) at
  /// the end of the header (which is also the latch), connecting it to itself
  /// (across the backedge) and to the exit block of \p L.
  void createHeaderBranch(Loop *L);

  /// Handle all cross-iteration phis in the header.
  void fixCrossIterationPHIs(VPTransformState &State);

  /// Create the exit value of first order recurrences in the middle block and
  /// update their users.
  void fixFirstOrderRecurrence(VPFirstOrderRecurrencePHIRecipe *PhiR,
                               VPTransformState &State);

  /// Create code for the loop exit value of the reduction.
  void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State);

  /// Clear NSW/NUW flags from reduction instructions if necessary.
  void clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc,
                               VPTransformState &State);

  /// Fixup the LCSSA phi nodes in the unique exit block. This simply
  /// means we need to add the appropriate incoming value from the middle
  /// block as exiting edges from the scalar epilogue loop (if present) are
  /// already in place, and we exit the vector loop exclusively to the middle
  /// block.
  void fixLCSSAPHIs(VPTransformState &State);

  /// Iteratively sink the scalarized operands of a predicated instruction into
  /// the block that was created for it.
  void sinkScalarOperands(Instruction *PredInst);

  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
  /// represented as.
  void truncateToMinimalBitwidths(VPTransformState &State);

  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
  /// variable on which to base the steps, \p Step is the size of the step, and
  /// \p EntryVal is the value from the original loop that maps to the steps.
  /// Note that \p EntryVal doesn't have to be an induction variable - it
  /// can also be a truncate instruction.
  void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
                        const InductionDescriptor &ID, VPValue *Def,
                        VPTransformState &State);

  /// Create a vector induction phi node based on an existing scalar one. \p
  /// EntryVal is the value from the original loop that maps to the vector phi
  /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
  /// truncate instruction, instead of widening the original IV, we widen a
  /// version of the IV truncated to \p EntryVal's type.
  void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
                                       Value *Step, Value *Start,
                                       Instruction *EntryVal, VPValue *Def,
                                       VPTransformState &State);

  /// Returns true if an instruction \p I should be scalarized instead of
  /// vectorized for the chosen vectorization factor.
  bool shouldScalarizeInstruction(Instruction *I) const;

  /// Returns true if we should generate a scalar version of \p IV.
  bool needsScalarInduction(Instruction *IV) const;

  /// Returns (and creates if needed) the original loop trip count.
  Value *getOrCreateTripCount(Loop *NewLoop);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(Loop *NewLoop);

  /// Returns a bitcasted value to the requested vector type.
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                const DataLayout &DL);

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct. Returns the block containing the checks or
  /// nullptr if no checks have been added.
  BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  /// Returns the block containing the checks or nullptr if no checks have been
  /// added.
  BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);

  /// Compute the transformed value of Index at offset StartValue using step
  /// StepValue.
  /// For integer induction, returns StartValue + Index * StepValue.
  /// For pointer induction, returns StartValue[Index * StepValue].
  /// FIXME: The newly created binary instructions should contain nsw/nuw
  /// flags, which can be found from the original scalar operations.
  Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
                              const DataLayout &DL,
                              const InductionDescriptor &ID,
                              BasicBlock *VectorHeader) const;

  /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
  /// vector loop preheader, middle block and scalar preheader. Also
  /// allocate a loop object for the new vector loop and return it.
  Loop *createVectorLoopSkeleton(StringRef Prefix);

  /// Create new phi nodes for the induction variables to resume iteration
  /// count in the scalar epilogue, from where the vectorized loop left off.
  /// In cases where the loop skeleton is more complicated (e.g. epilogue
  /// vectorization) and the resume values can come from an additional bypass
  /// block, the \p AdditionalBypass pair provides information about the bypass
  /// block and the end value on the edge from bypass to this loop.
  void createInductionResumeValues(
      Loop *L,
      std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});

  /// Complete the loop skeleton by adding debug MDs, creating appropriate
  /// conditional branches in the middle block, preparing the builder and
  /// running the verifier. Take in the vector loop \p L as argument, and
  /// return the preheader of the completed vector loop.
  BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);

  /// Add additional metadata to \p To that was not present on \p Orig.
  ///
  /// Currently this is used to add the noalias annotations based on the
  /// inserted memchecks. Use this for instructions that are *cloned* into the
  /// vector loop.
  void addNewMetadata(Instruction *To, const Instruction *Orig);

  /// Collect poison-generating recipes that may generate a poison value that
  /// is used after vectorization, even when their operands are not poison.
  /// Those recipes meet the following conditions:
  /// * Contribute to the address computation of a recipe generating a widen
  ///   memory load/store (VPWidenMemoryInstructionRecipe or
  ///   VPInterleaveRecipe).
  /// * Such a widen memory load/store has at least one underlying Instruction
  ///   that is in a basic block that needs predication and after vectorization
  ///   the generated instruction won't be predicated.
  void collectPoisonGeneratingRecipes(VPTransformState &State);

  /// Allow subclasses to override and print debug traces before/after vplan
  /// execution, when trace information is requested.
  virtual void printDebugTracesAtStart(){};
  virtual void printDebugTracesAtEnd(){};

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Alias Analysis.
  AAResults *AA;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// LoopVersioning. It's only set up (non-null) if memchecks were
  /// used.
  ///
  /// This is currently only used to add no-alias metadata based on the
  /// memchecks. The actual versioning is performed manually.
  std::unique_ptr<LoopVersioning> LVer;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  ElementCount VF;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use.
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;

  /// The unique ExitBlock of the scalar loop if one exists. Note that
  /// there can be multiple exiting edges reaching this block.
  BasicBlock *LoopExitBlock;

  /// The vector loop body.
  BasicBlock *LoopVectorBody;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF)).
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
  // fixed up at the end of vector code generation.
  SmallVector<PHINode *, 8> OrigPHIsToFix;

  /// BFI and PSI are used to check for profile guided size optimizations.
  BlockFrequencyInfo *BFI;
  ProfileSummaryInfo *PSI;

  // Whether this loop should be optimized for size based on profile guided
  // size optimizations.
  bool OptForSizeBasedOnProfile;

  /// Structure to hold information about generated runtime checks, responsible
  /// for cleaning the checks, if vectorization turns out unprofitable.
  GeneratedRTChecks &RTChecks;
};

class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                    ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            ElementCount::getFixed(1), UnrollFactor, LVL, CM,
                            BFI, PSI, Check) {}

private:
  Value *getBroadcastInstrs(Value *V) override;
};

/// Encapsulate information regarding vectorization of a loop and its epilogue.
/// This information is meant to be updated and used across two stages of
/// epilogue vectorization.
struct EpilogueLoopVectorizationInfo {
  ElementCount MainLoopVF = ElementCount::getFixed(0);
  unsigned MainLoopUF = 0;
  ElementCount EpilogueVF = ElementCount::getFixed(0);
  unsigned EpilogueUF = 0;
  BasicBlock *MainLoopIterationCountCheck = nullptr;
  BasicBlock *EpilogueIterationCountCheck = nullptr;
  BasicBlock *SCEVSafetyCheck = nullptr;
  BasicBlock *MemSafetyCheck = nullptr;
  Value *TripCount = nullptr;
  Value *VectorTripCount = nullptr;

  EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF,
                                ElementCount EVF, unsigned EUF)
      : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) {
    assert(EUF == 1 &&
           "A high UF for the epilogue loop is likely not beneficial.");
  }
};

/// An extension of the inner loop vectorizer that creates a skeleton for a
/// vectorized loop that has its epilogue (residual) also vectorized.
/// The idea is to run the vplan on a given loop twice, firstly to setup the
/// skeleton and vectorize the main loop, and secondly to complete the skeleton
/// from the first step and vectorize the epilogue. This is achieved by
/// deriving two concrete strategy classes from this base class and invoking
/// them in succession from the loop vectorizer planner.
class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
public:
  InnerLoopAndEpilogueVectorizer(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI,
                            Checks),
        EPI(EPI) {}

  // Override this function to handle the more complex control flow around the
  // three loops.
  std::pair<BasicBlock *, Value *>
  createVectorizedLoopSkeleton() final override {
    return createEpilogueVectorizedLoopSkeleton();
  }

  /// The interface for creating a vectorized skeleton using one of two
  /// different strategies, each corresponding to one execution of the vplan
  /// as described above.
  virtual std::pair<BasicBlock *, Value *>
  createEpilogueVectorizedLoopSkeleton() = 0;

  /// Holds and updates state information required to vectorize the main loop
  /// and its epilogue in two separate passes. This setup helps us avoid
  /// regenerating and recomputing runtime safety checks. It also helps us to
  /// shorten the iteration-count-check path length for the cases where the
  /// iteration count of the loop is so small that the main vector loop is
  /// completely skipped.
  EpilogueLoopVectorizationInfo &EPI;
};

/// A specialized derived class of inner loop vectorizer that performs
/// vectorization of *main* loops in the process of vectorizing loops and their
/// epilogues.
class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerMainLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Check)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Check) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *main loop* strategy (i.e. the first pass of vplan execution).
  std::pair<BasicBlock *, Value *>
  createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check once for the main loop (when \p
  /// ForEpilogue is false) and once for the epilogue loop (when \p
  /// ForEpilogue is true).
  BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass,
                                             bool ForEpilogue);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};

// A specialized derived class of inner loop vectorizer that performs
// vectorization of *epilogue* loops in the process of vectorizing loops and
// their epilogues.
class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerEpilogueLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Checks) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *epilogue loop* strategy (i.e. the second pass of vplan execution).
  std::pair<BasicBlock *, Value *>
  createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check after the main vector loop has
  /// finished to see if there are any iterations left to execute by either
  /// the vector epilogue or the scalar epilogue.
  BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L,
                                                      BasicBlock *Bypass,
                                                      BasicBlock *Insert);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};
} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its
/// operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return I;

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I;

  for (Use &Op : I->operands()) {
    if (Instruction *OpInst = dyn_cast<Instruction>(Op))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }

  return I;
}

void InnerLoopVectorizer::setDebugLocFromInst(
    const Value *V, Optional<IRBuilder<> *> CustomBuilder) {
  IRBuilder<> *B = (CustomBuilder == None) ? &Builder : *CustomBuilder;
  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(V)) {
    const DILocation *DIL = Inst->getDebugLoc();

    // When a FSDiscriminator is enabled, we don't need to add the multiply
    // factors to the discriminators.
    if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
        !isa<DbgInfoIntrinsic>(Inst) && !EnableFSDiscriminator) {
      // FIXME: For scalable vectors, assume vscale=1.
      auto NewDIL =
          DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
      if (NewDIL)
        B->SetCurrentDebugLocation(NewDIL.getValue());
      else
        LLVM_DEBUG(dbgs()
                   << "Failed to create new discriminator: "
                   << DIL->getFilename() << " Line: " << DIL->getLine());
    } else
      B->SetCurrentDebugLocation(DIL);
  } else
    B->SetCurrentDebugLocation(DebugLoc());
}

/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
/// is passed, the message relates to that particular instruction.
#ifndef NDEBUG
static void debugVectorizationMessage(const StringRef Prefix,
                                      const StringRef DebugMsg,
                                      Instruction *I) {
  dbgs() << "LV: " << Prefix << DebugMsg;
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}
#endif

/// Create an analysis remark that explains why vectorization failed.
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
/// RemarkName is the identifier for the remark. If \p I is passed it is an
/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
/// the location of the remark. \return the remark object that can be
/// streamed to.
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
                                                   StringRef RemarkName,
                                                   Loop *TheLoop,
                                                   Instruction *I) {
  Value *CodeRegion = TheLoop->getHeader();
  DebugLoc DL = TheLoop->getStartLoc();

  if (I) {
    CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, revert back
    // to using the loop's.
    if (I->getDebugLoc())
      DL = I->getDebugLoc();
  }

  return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
}

namespace llvm {

/// Return a value for Step multiplied by VF.
Value *createStepForVF(IRBuilder<> &B, Type *Ty, ElementCount VF,
                       int64_t Step) {
  assert(Ty->isIntegerTy() && "Expected an integer step");
  Constant *StepVal = ConstantInt::get(Ty, Step * VF.getKnownMinValue());
  return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
}

/// Return the runtime value for VF.
Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF) {
  Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue());
  return VF.isScalable() ? B.CreateVScale(EC) : EC;
}

static Value *getRuntimeVFAsFloat(IRBuilder<> &B, Type *FTy, ElementCount VF) {
  assert(FTy->isFloatingPointTy() && "Expected floating point type!");
  Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits());
  Value *RuntimeVF = getRuntimeVF(B, IntTy, VF);
  return B.CreateUIToFP(RuntimeVF, FTy);
}

void reportVectorizationFailure(const StringRef DebugMsg,
                                const StringRef OREMsg, const StringRef ORETag,
                                OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                                Instruction *I) {
  LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(
      createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
      << "loop not vectorized: " << OREMsg);
}

void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
                             OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                             Instruction *I) {
  LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(
      createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
      << Msg);
}

} // end namespace llvm

#ifndef NDEBUG
/// \return string containing a file name and a line # for the given loop.
static std::string getDebugLocString(const Loop *L) {
  std::string Result;
  if (L) {
    raw_string_ostream OS(Result);
    if (const DebugLoc LoopDbgLoc = L->getStartLoc())
      LoopDbgLoc.print(OS);
    else
      // Just print the module name.
      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
    OS.flush();
  }
  return Result;
}
#endif

void InnerLoopVectorizer::addNewMetadata(Instruction *To,
                                         const Instruction *Orig) {
  // If the loop was versioned with memchecks, add the corresponding no-alias
  // metadata.
  if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
    LVer->annotateInstWithNoAlias(To, Orig);
}

void InnerLoopVectorizer::collectPoisonGeneratingRecipes(
    VPTransformState &State) {

  // Collect recipes in the backward slice of `Root` that may generate a poison
  // value that is used after vectorization.
  SmallPtrSet<VPRecipeBase *, 16> Visited;
  auto collectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
    SmallVector<VPRecipeBase *, 16> Worklist;
    Worklist.push_back(Root);

    // Traverse the backward slice of Root through its use-def chain.
    while (!Worklist.empty()) {
      VPRecipeBase *CurRec = Worklist.back();
      Worklist.pop_back();

      if (!Visited.insert(CurRec).second)
        continue;

      // Prune search if we find another recipe generating a widen memory
      // instruction. Widen memory instructions involved in address computation
      // will lead to gather/scatter instructions, which don't need to be
      // handled.
      if (isa<VPWidenMemoryInstructionRecipe>(CurRec) ||
          isa<VPInterleaveRecipe>(CurRec) ||
          isa<VPCanonicalIVPHIRecipe>(CurRec))
        continue;

      // This recipe contributes to the address computation of a widen
      // load/store. Collect recipe if its underlying instruction has
      // poison-generating flags.
      Instruction *Instr = CurRec->getUnderlyingInstr();
      if (Instr && Instr->hasPoisonGeneratingFlags())
        State.MayGeneratePoisonRecipes.insert(CurRec);

      // Add new definitions to the worklist.
      for (VPValue *operand : CurRec->operands())
        if (VPDef *OpDef = operand->getDef())
          Worklist.push_back(cast<VPRecipeBase>(OpDef));
    }
  });

  // Traverse all the recipes in the VPlan and collect the poison-generating
  // recipes in the backward slice starting at the address of a VPWidenRecipe
  // or VPInterleaveRecipe.
  auto Iter = depth_first(
      VPBlockRecursiveTraversalWrapper<VPBlockBase *>(State.Plan->getEntry()));
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
    for (VPRecipeBase &Recipe : *VPBB) {
      if (auto *WidenRec = dyn_cast<VPWidenMemoryInstructionRecipe>(&Recipe)) {
        Instruction *UnderlyingInstr = WidenRec->getUnderlyingInstr();
        VPDef *AddrDef = WidenRec->getAddr()->getDef();
        if (AddrDef && WidenRec->isConsecutive() && UnderlyingInstr &&
            Legal->blockNeedsPredication(UnderlyingInstr->getParent()))
          collectPoisonGeneratingInstrsInBackwardSlice(
              cast<VPRecipeBase>(AddrDef));
      } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
        VPDef *AddrDef = InterleaveRec->getAddr()->getDef();
        if (AddrDef) {
          // Check if any member of the interleave group needs predication.
          const InterleaveGroup<Instruction> *InterGroup =
              InterleaveRec->getInterleaveGroup();
          bool NeedPredication = false;
          for (int I = 0, NumMembers = InterGroup->getNumMembers();
               I < NumMembers; ++I) {
            Instruction *Member = InterGroup->getMember(I);
            if (Member)
              NeedPredication |=
                  Legal->blockNeedsPredication(Member->getParent());
          }

          if (NeedPredication)
            collectPoisonGeneratingInstrsInBackwardSlice(
                cast<VPRecipeBase>(AddrDef));
        }
      }
    }
  }
}

void InnerLoopVectorizer::addMetadata(Instruction *To,
                                      Instruction *From) {
  propagateMetadata(To, From);
  addNewMetadata(To, From);
}

void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
                                      Instruction *From) {
  for (Value *V : To) {
    if (Instruction *I = dyn_cast<Instruction>(V))
      addMetadata(I, From);
  }
}

namespace llvm {

// Loop vectorization cost-model hints how the scalar epilogue loop should be
// lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorisation with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate,

  // Directive indicating we must either tail fold or not vectorize.
  CM_ScalarEpilogueNotAllowedUsePredicate
};

/// ElementCountComparator creates a total ordering for ElementCount
/// for the purposes of using it in a set structure.
struct ElementCountComparator {
  bool operator()(const ElementCount &LHS, const ElementCount &RHS) const {
    return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) <
           std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue());
  }
};
using ElementCountSet = SmallSet<ElementCount, 16, ElementCountComparator>;

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because of
/// a number of reasons. In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
class LoopVectorizationCostModel {
public:
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {}

  /// \return An upper bound for the vectorization factors (both fixed and
  /// scalable). If the factors are 0, vectorization and interleaving should be
  /// avoided up front.
  FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
  bool runtimeChecksRequired();

  /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every VF in \p CandidateVFs. If UserVF is not ZERO
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
  VectorizationFactor
  selectVectorizationFactor(const ElementCountSet &CandidateVFs);

  VectorizationFactor
  selectEpilogueVectorizationFactor(const ElementCount MaxVF,
                                    const LoopVectorizationPlanner &LVP);

  /// Setup cost-based decisions for user vectorization factor.
  /// \return true if the UserVF is a feasible VF to be chosen.
  bool selectUserVectorizationFactor(ElementCount UserVF) {
    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);
    return expectedCost(UserVF).first.isValid();
  }

  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// \return The desired interleave count.
  /// If interleave count has been specified by metadata it will be returned.
  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
  /// are the selected vectorization factor and the cost of the selected VF.
  unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);

  /// Memory access instructions may be vectorized in more than one way.
  /// The form of an instruction after vectorization depends on cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decisions map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with the widening decision in order to
  /// avoid redundant calculations.
  void setCostBasedWideningDecision(ElementCount VF);

  /// A struct that represents some properties of the register usage
  /// of a loop.
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
    /// Holds the maximum number of concurrent live intervals in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
  };

  /// \return Returns information about the register usages of the loop for the
  /// given vectorization factors.
  SmallVector<RegisterUsage, 8>
  calculateRegisterUsage(ArrayRef<ElementCount> VFs);

  /// Collect values we want to ignore in the cost model.
  void collectValuesToIgnore();

  /// Collect all element types in the loop for which widening is needed.
  void collectElementTypesForWidening();

  /// Split reductions into those that happen in the loop, and those that
  /// happen outside. In-loop reductions are collected into
  /// InLoopReductionChains.
1350 void collectInLoopReductions(); 1351 1352 /// Returns true if we should use strict in-order reductions for the given 1353 /// RdxDesc. This is true if the -enable-strict-reductions flag is passed, 1354 /// the IsOrdered flag of RdxDesc is set and we do not allow reordering 1355 /// of FP operations. 1356 bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) { 1357 return !Hints->allowReordering() && RdxDesc.isOrdered(); 1358 } 1359 1360 /// \returns The smallest bitwidth each instruction can be represented with. 1361 /// The vector equivalents of these instructions should be truncated to this 1362 /// type. 1363 const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const { 1364 return MinBWs; 1365 } 1366 1367 /// \returns True if it is more profitable to scalarize instruction \p I for 1368 /// vectorization factor \p VF. 1369 bool isProfitableToScalarize(Instruction *I, ElementCount VF) const { 1370 assert(VF.isVector() && 1371 "Profitable to scalarize relevant only for VF > 1."); 1372 1373 // Cost model is not run in the VPlan-native path - return conservative 1374 // result until this changes. 1375 if (EnableVPlanNativePath) 1376 return false; 1377 1378 auto Scalars = InstsToScalarize.find(VF); 1379 assert(Scalars != InstsToScalarize.end() && 1380 "VF not yet analyzed for scalarization profitability"); 1381 return Scalars->second.find(I) != Scalars->second.end(); 1382 } 1383 1384 /// Returns true if \p I is known to be uniform after vectorization. 1385 bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const { 1386 if (VF.isScalar()) 1387 return true; 1388 1389 // Cost model is not run in the VPlan-native path - return conservative 1390 // result until this changes. 1391 if (EnableVPlanNativePath) 1392 return false; 1393 1394 auto UniformsPerVF = Uniforms.find(VF); 1395 assert(UniformsPerVF != Uniforms.end() && 1396 "VF not yet analyzed for uniformity"); 1397 return UniformsPerVF->second.count(I); 1398 } 1399 1400 /// Returns true if \p I is known to be scalar after vectorization. 1401 bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const { 1402 if (VF.isScalar()) 1403 return true; 1404 1405 // Cost model is not run in the VPlan-native path - return conservative 1406 // result until this changes. 1407 if (EnableVPlanNativePath) 1408 return false; 1409 1410 auto ScalarsPerVF = Scalars.find(VF); 1411 assert(ScalarsPerVF != Scalars.end() && 1412 "Scalar values are not calculated for VF"); 1413 return ScalarsPerVF->second.count(I); 1414 } 1415 1416 /// \returns True if instruction \p I can be truncated to a smaller bitwidth 1417 /// for vectorization factor \p VF. 1418 bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const { 1419 return VF.isVector() && MinBWs.find(I) != MinBWs.end() && 1420 !isProfitableToScalarize(I, VF) && 1421 !isScalarAfterVectorization(I, VF); 1422 } 1423 1424 /// Decision that was taken during cost calculation for memory instruction. 1425 enum InstWidening { 1426 CM_Unknown, 1427 CM_Widen, // For consecutive accesses with stride +1. 1428 CM_Widen_Reverse, // For consecutive accesses with stride -1. 1429 CM_Interleave, 1430 CM_GatherScatter, 1431 CM_Scalarize 1432 }; 1433 1434 /// Save vectorization decision \p W and \p Cost taken by the cost model for 1435 /// instruction \p I and vector width \p VF. 
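/// A typical (purely illustrative) use while costing a consecutive load for a
/// fixed width of 4 might be:
///   setWideningDecision(Load, ElementCount::getFixed(4), CM_Widen, Cost);
/// after which getWideningDecision/getWideningCost return the recorded pair.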
1436 void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
1437 InstructionCost Cost) {
1438 assert(VF.isVector() && "Expected VF >=2");
1439 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1440 }
1441
1442 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1443 /// interleaving group \p Grp and vector width \p VF.
1444 void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1445 ElementCount VF, InstWidening W,
1446 InstructionCost Cost) {
1447 assert(VF.isVector() && "Expected VF >=2");
1448 /// Broadcast this decision to all instructions inside the group.
1449 /// But the cost will be assigned to one instruction only.
1450 for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1451 if (auto *I = Grp->getMember(i)) {
1452 if (Grp->getInsertPos() == I)
1453 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1454 else
1455 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1456 }
1457 }
1458 }
1459
1460 /// Return the cost model decision for the given instruction \p I and vector
1461 /// width \p VF. Return CM_Unknown if this instruction did not pass
1462 /// through the cost modeling.
1463 InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
1464 assert(VF.isVector() && "Expected VF to be a vector VF");
1465 // Cost model is not run in the VPlan-native path - return conservative
1466 // result until this changes.
1467 if (EnableVPlanNativePath)
1468 return CM_GatherScatter;
1469
1470 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1471 auto Itr = WideningDecisions.find(InstOnVF);
1472 if (Itr == WideningDecisions.end())
1473 return CM_Unknown;
1474 return Itr->second.first;
1475 }
1476
1477 /// Return the vectorization cost for the given instruction \p I and vector
1478 /// width \p VF.
1479 InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
1480 assert(VF.isVector() && "Expected VF >=2");
1481 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1482 assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1483 "The cost is not calculated");
1484 return WideningDecisions[InstOnVF].second;
1485 }
1486
1487 /// Return True if instruction \p I is an optimizable truncate whose operand
1488 /// is an induction variable. Such a truncate will be removed by adding a new
1489 /// induction variable with the destination type.
1490 bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
1491 // If the instruction is not a truncate, return false.
1492 auto *Trunc = dyn_cast<TruncInst>(I);
1493 if (!Trunc)
1494 return false;
1495
1496 // Get the source and destination types of the truncate.
1497 Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1498 Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1499
1500 // If the truncate is free for the given types, return false. Replacing a
1501 // free truncate with an induction variable would add an induction variable
1502 // update instruction to each iteration of the loop. We exclude from this
1503 // check the primary induction variable since it will need an update
1504 // instruction regardless.
1505 Value *Op = Trunc->getOperand(0);
1506 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1507 return false;
1508
1509 // If the truncated value is not an induction variable, return false.
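// For example (illustrative only): with a 64-bit primary induction %i and a
// user such as
//   %t = trunc i64 %i to i32
// the truncate can be removed by introducing a new i32 induction that starts
// at trunc(start) and steps by trunc(step), provided the truncated value is a
// known induction phi, which is what the check below verifies.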
1510 return Legal->isInductionPhi(Op); 1511 } 1512 1513 /// Collects the instructions to scalarize for each predicated instruction in 1514 /// the loop. 1515 void collectInstsToScalarize(ElementCount VF); 1516 1517 /// Collect Uniform and Scalar values for the given \p VF. 1518 /// The sets depend on CM decision for Load/Store instructions 1519 /// that may be vectorized as interleave, gather-scatter or scalarized. 1520 void collectUniformsAndScalars(ElementCount VF) { 1521 // Do the analysis once. 1522 if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end()) 1523 return; 1524 setCostBasedWideningDecision(VF); 1525 collectLoopUniforms(VF); 1526 collectLoopScalars(VF); 1527 } 1528 1529 /// Returns true if the target machine supports masked store operation 1530 /// for the given \p DataType and kind of access to \p Ptr. 1531 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const { 1532 return Legal->isConsecutivePtr(DataType, Ptr) && 1533 TTI.isLegalMaskedStore(DataType, Alignment); 1534 } 1535 1536 /// Returns true if the target machine supports masked load operation 1537 /// for the given \p DataType and kind of access to \p Ptr. 1538 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const { 1539 return Legal->isConsecutivePtr(DataType, Ptr) && 1540 TTI.isLegalMaskedLoad(DataType, Alignment); 1541 } 1542 1543 /// Returns true if the target machine can represent \p V as a masked gather 1544 /// or scatter operation. 1545 bool isLegalGatherOrScatter(Value *V, 1546 ElementCount VF = ElementCount::getFixed(1)) { 1547 bool LI = isa<LoadInst>(V); 1548 bool SI = isa<StoreInst>(V); 1549 if (!LI && !SI) 1550 return false; 1551 auto *Ty = getLoadStoreType(V); 1552 Align Align = getLoadStoreAlignment(V); 1553 if (VF.isVector()) 1554 Ty = VectorType::get(Ty, VF); 1555 return (LI && TTI.isLegalMaskedGather(Ty, Align)) || 1556 (SI && TTI.isLegalMaskedScatter(Ty, Align)); 1557 } 1558 1559 /// Returns true if the target machine supports all of the reduction 1560 /// variables found for the given VF. 1561 bool canVectorizeReductions(ElementCount VF) const { 1562 return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 1563 const RecurrenceDescriptor &RdxDesc = Reduction.second; 1564 return TTI.isLegalToVectorizeReduction(RdxDesc, VF); 1565 })); 1566 } 1567 1568 /// Returns true if \p I is an instruction that will be scalarized with 1569 /// predication when vectorizing \p I with vectorization factor \p VF. Such 1570 /// instructions include conditional stores and instructions that may divide 1571 /// by zero. 1572 bool isScalarWithPredication(Instruction *I, ElementCount VF) const; 1573 1574 // Returns true if \p I is an instruction that will be predicated either 1575 // through scalar predication or masked load/store or masked gather/scatter. 1576 // \p VF is the vectorization factor that will be used to vectorize \p I. 1577 // Superset of instructions that return true for isScalarWithPredication. 1578 bool isPredicatedInst(Instruction *I, ElementCount VF, 1579 bool IsKnownUniform = false) { 1580 // When we know the load is uniform and the original scalar loop was not 1581 // predicated we don't need to mark it as a predicated instruction. Any 1582 // vectorised blocks created when tail-folding are something artificial we 1583 // have introduced and we know there is always at least one active lane. 1584 // That's why we call Legal->blockNeedsPredication here because it doesn't 1585 // query tail-folding. 
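// For example (illustrative): a load of a loop-invariant address,
//   for (i = 0; i < n; ++i)
//     sum += *p;
// is uniform across lanes; if its block was unpredicated in the original
// loop, the mask introduced purely by tail folding does not force the load to
// be treated as predicated, because at least one lane is always active.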
1586 if (IsKnownUniform && isa<LoadInst>(I) && 1587 !Legal->blockNeedsPredication(I->getParent())) 1588 return false; 1589 if (!blockNeedsPredicationForAnyReason(I->getParent())) 1590 return false; 1591 // Loads and stores that need some form of masked operation are predicated 1592 // instructions. 1593 if (isa<LoadInst>(I) || isa<StoreInst>(I)) 1594 return Legal->isMaskRequired(I); 1595 return isScalarWithPredication(I, VF); 1596 } 1597 1598 /// Returns true if \p I is a memory instruction with consecutive memory 1599 /// access that can be widened. 1600 bool 1601 memoryInstructionCanBeWidened(Instruction *I, 1602 ElementCount VF = ElementCount::getFixed(1)); 1603 1604 /// Returns true if \p I is a memory instruction in an interleaved-group 1605 /// of memory accesses that can be vectorized with wide vector loads/stores 1606 /// and shuffles. 1607 bool 1608 interleavedAccessCanBeWidened(Instruction *I, 1609 ElementCount VF = ElementCount::getFixed(1)); 1610 1611 /// Check if \p Instr belongs to any interleaved access group. 1612 bool isAccessInterleaved(Instruction *Instr) { 1613 return InterleaveInfo.isInterleaved(Instr); 1614 } 1615 1616 /// Get the interleaved access group that \p Instr belongs to. 1617 const InterleaveGroup<Instruction> * 1618 getInterleavedAccessGroup(Instruction *Instr) { 1619 return InterleaveInfo.getInterleaveGroup(Instr); 1620 } 1621 1622 /// Returns true if we're required to use a scalar epilogue for at least 1623 /// the final iteration of the original loop. 1624 bool requiresScalarEpilogue(ElementCount VF) const { 1625 if (!isScalarEpilogueAllowed()) 1626 return false; 1627 // If we might exit from anywhere but the latch, must run the exiting 1628 // iteration in scalar form. 1629 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) 1630 return true; 1631 return VF.isVector() && InterleaveInfo.requiresScalarEpilogue(); 1632 } 1633 1634 /// Returns true if a scalar epilogue is not allowed due to optsize or a 1635 /// loop hint annotation. 1636 bool isScalarEpilogueAllowed() const { 1637 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed; 1638 } 1639 1640 /// Returns true if all loop blocks should be masked to fold tail loop. 1641 bool foldTailByMasking() const { return FoldTailByMasking; } 1642 1643 /// Returns true if the instructions in this block requires predication 1644 /// for any reason, e.g. because tail folding now requires a predicate 1645 /// or because the block in the original loop was predicated. 1646 bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const { 1647 return foldTailByMasking() || Legal->blockNeedsPredication(BB); 1648 } 1649 1650 /// A SmallMapVector to store the InLoop reduction op chains, mapping phi 1651 /// nodes to the chain of instructions representing the reductions. Uses a 1652 /// MapVector to ensure deterministic iteration order. 1653 using ReductionChainMap = 1654 SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>; 1655 1656 /// Return the chain of instructions representing an inloop reduction. 1657 const ReductionChainMap &getInLoopReductionChains() const { 1658 return InLoopReductionChains; 1659 } 1660 1661 /// Returns true if the Phi is part of an inloop reduction. 1662 bool isInLoopReduction(PHINode *Phi) const { 1663 return InLoopReductionChains.count(Phi); 1664 } 1665 1666 /// Estimate cost of an intrinsic call instruction CI if it were vectorized 1667 /// with factor VF. Return the cost of the instruction, including 1668 /// scalarization overhead if it's needed. 
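/// For instance, costing a call to llvm.fabs.f32 at VF = 4 would query the
/// target for the cost of the corresponding llvm.fabs.v4f32 intrinsic (a
/// sketch; the exact mapping and its cost are target dependent).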
1669 InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;
1670
1671 /// Estimate cost of a call instruction CI if it were vectorized with factor
1672 /// VF. Return the cost of the instruction, including scalarization overhead
1673 /// if it's needed. The flag NeedToScalarize is set if the call needs to be
1674 /// scalarized,
1675 /// i.e. either a vector version isn't available or it is too expensive.
1676 InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF,
1677 bool &NeedToScalarize) const;
1678
1679 /// Returns true if the per-lane cost of VectorizationFactor A is lower than
1680 /// that of B.
1681 bool isMoreProfitable(const VectorizationFactor &A,
1682 const VectorizationFactor &B) const;
1683
1684 /// Invalidates decisions already taken by the cost model.
1685 void invalidateCostModelingDecisions() {
1686 WideningDecisions.clear();
1687 Uniforms.clear();
1688 Scalars.clear();
1689 }
1690
1691 private:
1692 unsigned NumPredStores = 0;
1693
1694 /// \return An upper bound for the vectorization factors for both
1695 /// fixed and scalable vectorization, where the minimum-known number of
1696 /// elements is a power-of-2 larger than zero. If scalable vectorization is
1697 /// disabled or unsupported, then the scalable part will be equal to
1698 /// ElementCount::getScalable(0).
1699 FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount,
1700 ElementCount UserVF,
1701 bool FoldTailByMasking);
1702
1703 /// \return the maximized element count based on the target's vector
1704 /// registers and the loop trip-count, but limited to a maximum safe VF.
1705 /// This is a helper function of computeFeasibleMaxVF.
1706 /// FIXME: MaxSafeVF is currently passed by reference to avoid some obscure
1707 /// issue that occurred on one of the buildbots which cannot be reproduced
1708 /// without having access to the proprietary compiler (see comments on
1709 /// D98509). The issue is currently under investigation and this workaround
1710 /// will be removed as soon as possible.
1711 ElementCount getMaximizedVFForTarget(unsigned ConstTripCount,
1712 unsigned SmallestType,
1713 unsigned WidestType,
1714 const ElementCount &MaxSafeVF,
1715 bool FoldTailByMasking);
1716
1717 /// \return the maximum legal scalable VF, based on the safe max number
1718 /// of elements.
1719 ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
1720
1721 /// The vectorization cost is a combination of the cost itself and a boolean
1722 /// indicating whether any of the contributing operations will actually
1723 /// operate on vector values after type legalization in the backend. If this
1724 /// latter value is false, then all operations will be scalarized (i.e. no
1725 /// vectorization has actually taken place).
1726 using VectorizationCostTy = std::pair<InstructionCost, bool>;
1727
1728 /// Returns the expected execution cost. The unit of the cost does
1729 /// not matter because we use the 'cost' units to compare different
1730 /// vector widths. The cost that is returned is *not* normalized by
1731 /// the factor width. If \p Invalid is not nullptr, this function
1732 /// will add a pair(Instruction*, ElementCount) to \p Invalid for
1733 /// each instruction that has an Invalid cost for the given VF.
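/// A caller comparing candidate widths might do, schematically (illustrative
/// only):
///   auto Cost4 = expectedCost(ElementCount::getFixed(4)).first;
///   auto Cost8 = expectedCost(ElementCount::getFixed(8)).first;
/// and then compare them per lane (see isMoreProfitable).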
1734 using InstructionVFPair = std::pair<Instruction *, ElementCount>;
1735 VectorizationCostTy
1736 expectedCost(ElementCount VF,
1737 SmallVectorImpl<InstructionVFPair> *Invalid = nullptr);
1738
1739 /// Returns the execution time cost of an instruction for a given vector
1740 /// width. Vector width of one means scalar.
1741 VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
1742
1743 /// The cost-computation logic from getInstructionCost which provides
1744 /// the vector type as an output parameter.
1745 InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
1746 Type *&VectorTy);
1747
1748 /// Return the cost of instructions in an inloop reduction pattern, if I is
1749 /// part of that pattern.
1750 Optional<InstructionCost>
1751 getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
1752 TTI::TargetCostKind CostKind);
1753
1754 /// Calculate vectorization cost of memory instruction \p I.
1755 InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1756
1757 /// The cost computation for scalarized memory instruction.
1758 InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1759
1760 /// The cost computation for interleaving group of memory instructions.
1761 InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1762
1763 /// The cost computation for Gather/Scatter instruction.
1764 InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1765
1766 /// The cost computation for widening instruction \p I with consecutive
1767 /// memory access.
1768 InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1769
1770 /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1771 /// Load: scalar load + broadcast.
1772 /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1773 /// element)
1774 InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
1775
1776 /// Estimate the overhead of scalarizing an instruction. This is a
1777 /// convenience wrapper for the type-based getScalarizationOverhead API.
1778 InstructionCost getScalarizationOverhead(Instruction *I,
1779 ElementCount VF) const;
1780
1781 /// Returns whether the instruction is a load or store and will be emitted
1782 /// as a vector operation.
1783 bool isConsecutiveLoadOrStore(Instruction *I);
1784
1785 /// Returns true if an artificially high cost for emulated masked memrefs
1786 /// should be used.
1787 bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);
1788
1789 /// Map of scalar integer values to the smallest bitwidth they can be legally
1790 /// represented as. The vector equivalents of these values should be truncated
1791 /// to this type.
1792 MapVector<Instruction *, uint64_t> MinBWs;
1793
1794 /// A type representing the costs for instructions if they were to be
1795 /// scalarized rather than vectorized. The entries are Instruction-Cost
1796 /// pairs.
1797 using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
1798
1799 /// A set containing all BasicBlocks that are known to be present after
1800 /// vectorization as predicated blocks.
1801 SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1802
1803 /// Records whether it is allowed to have the original scalar loop execute at
1804 /// least once.
This may be needed as a fallback loop in case runtime 1805 /// aliasing/dependence checks fail, or to handle the tail/remainder 1806 /// iterations when the trip count is unknown or doesn't divide by the VF, 1807 /// or as a peel-loop to handle gaps in interleave-groups. 1808 /// Under optsize and when the trip count is very small we don't allow any 1809 /// iterations to execute in the scalar loop. 1810 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 1811 1812 /// All blocks of loop are to be masked to fold tail of scalar iterations. 1813 bool FoldTailByMasking = false; 1814 1815 /// A map holding scalar costs for different vectorization factors. The 1816 /// presence of a cost for an instruction in the mapping indicates that the 1817 /// instruction will be scalarized when vectorizing with the associated 1818 /// vectorization factor. The entries are VF-ScalarCostTy pairs. 1819 DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize; 1820 1821 /// Holds the instructions known to be uniform after vectorization. 1822 /// The data is collected per VF. 1823 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms; 1824 1825 /// Holds the instructions known to be scalar after vectorization. 1826 /// The data is collected per VF. 1827 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars; 1828 1829 /// Holds the instructions (address computations) that are forced to be 1830 /// scalarized. 1831 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars; 1832 1833 /// PHINodes of the reductions that should be expanded in-loop along with 1834 /// their associated chains of reduction operations, in program order from top 1835 /// (PHI) to bottom 1836 ReductionChainMap InLoopReductionChains; 1837 1838 /// A Map of inloop reduction operations and their immediate chain operand. 1839 /// FIXME: This can be removed once reductions can be costed correctly in 1840 /// vplan. This was added to allow quick lookup to the inloop operations, 1841 /// without having to loop through InLoopReductionChains. 1842 DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains; 1843 1844 /// Returns the expected difference in cost from scalarizing the expression 1845 /// feeding a predicated instruction \p PredInst. The instructions to 1846 /// scalarize and their scalar costs are collected in \p ScalarCosts. A 1847 /// non-negative return value implies the expression will be scalarized. 1848 /// Currently, only single-use chains are considered for scalarization. 1849 int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts, 1850 ElementCount VF); 1851 1852 /// Collect the instructions that are uniform after vectorization. An 1853 /// instruction is uniform if we represent it with a single scalar value in 1854 /// the vectorized loop corresponding to each vector iteration. Examples of 1855 /// uniform instructions include pointer operands of consecutive or 1856 /// interleaved memory accesses. Note that although uniformity implies an 1857 /// instruction will be scalar, the reverse is not true. In general, a 1858 /// scalarized instruction will be represented by VF scalar values in the 1859 /// vectorized loop, each corresponding to an iteration of the original 1860 /// scalar loop. 1861 void collectLoopUniforms(ElementCount VF); 1862 1863 /// Collect the instructions that are scalar after vectorization. An 1864 /// instruction is scalar if it is known to be uniform or will be scalarized 1865 /// during vectorization. 
collectLoopScalars should only add non-uniform nodes 1866 /// to the list if they are used by a load/store instruction that is marked as 1867 /// CM_Scalarize. Non-uniform scalarized instructions will be represented by 1868 /// VF values in the vectorized loop, each corresponding to an iteration of 1869 /// the original scalar loop. 1870 void collectLoopScalars(ElementCount VF); 1871 1872 /// Keeps cost model vectorization decision and cost for instructions. 1873 /// Right now it is used for memory instructions only. 1874 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>, 1875 std::pair<InstWidening, InstructionCost>>; 1876 1877 DecisionList WideningDecisions; 1878 1879 /// Returns true if \p V is expected to be vectorized and it needs to be 1880 /// extracted. 1881 bool needsExtract(Value *V, ElementCount VF) const { 1882 Instruction *I = dyn_cast<Instruction>(V); 1883 if (VF.isScalar() || !I || !TheLoop->contains(I) || 1884 TheLoop->isLoopInvariant(I)) 1885 return false; 1886 1887 // Assume we can vectorize V (and hence we need extraction) if the 1888 // scalars are not computed yet. This can happen, because it is called 1889 // via getScalarizationOverhead from setCostBasedWideningDecision, before 1890 // the scalars are collected. That should be a safe assumption in most 1891 // cases, because we check if the operands have vectorizable types 1892 // beforehand in LoopVectorizationLegality. 1893 return Scalars.find(VF) == Scalars.end() || 1894 !isScalarAfterVectorization(I, VF); 1895 }; 1896 1897 /// Returns a range containing only operands needing to be extracted. 1898 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, 1899 ElementCount VF) const { 1900 return SmallVector<Value *, 4>(make_filter_range( 1901 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); 1902 } 1903 1904 /// Determines if we have the infrastructure to vectorize loop \p L and its 1905 /// epilogue, assuming the main loop is vectorized by \p VF. 1906 bool isCandidateForEpilogueVectorization(const Loop &L, 1907 const ElementCount VF) const; 1908 1909 /// Returns true if epilogue vectorization is considered profitable, and 1910 /// false otherwise. 1911 /// \p VF is the vectorization factor chosen for the original loop. 1912 bool isEpilogueVectorizationProfitable(const ElementCount VF) const; 1913 1914 public: 1915 /// The loop that we evaluate. 1916 Loop *TheLoop; 1917 1918 /// Predicated scalar evolution analysis. 1919 PredicatedScalarEvolution &PSE; 1920 1921 /// Loop Info analysis. 1922 LoopInfo *LI; 1923 1924 /// Vectorization legality. 1925 LoopVectorizationLegality *Legal; 1926 1927 /// Vector target information. 1928 const TargetTransformInfo &TTI; 1929 1930 /// Target Library Info. 1931 const TargetLibraryInfo *TLI; 1932 1933 /// Demanded bits analysis. 1934 DemandedBits *DB; 1935 1936 /// Assumption cache. 1937 AssumptionCache *AC; 1938 1939 /// Interface to emit optimization remarks. 1940 OptimizationRemarkEmitter *ORE; 1941 1942 const Function *TheFunction; 1943 1944 /// Loop Vectorize Hint. 1945 const LoopVectorizeHints *Hints; 1946 1947 /// The interleave access information contains groups of interleaved accesses 1948 /// with the same stride and close to each other. 1949 InterleavedAccessInfo &InterleaveInfo; 1950 1951 /// Values to ignore in the cost model. 1952 SmallPtrSet<const Value *, 16> ValuesToIgnore; 1953 1954 /// Values to ignore in the cost model when VF > 1. 
1955 SmallPtrSet<const Value *, 16> VecValuesToIgnore; 1956 1957 /// All element types found in the loop. 1958 SmallPtrSet<Type *, 16> ElementTypesInLoop; 1959 1960 /// Profitable vector factors. 1961 SmallVector<VectorizationFactor, 8> ProfitableVFs; 1962 }; 1963 } // end namespace llvm 1964 1965 /// Helper struct to manage generating runtime checks for vectorization. 1966 /// 1967 /// The runtime checks are created up-front in temporary blocks to allow better 1968 /// estimating the cost and un-linked from the existing IR. After deciding to 1969 /// vectorize, the checks are moved back. If deciding not to vectorize, the 1970 /// temporary blocks are completely removed. 1971 class GeneratedRTChecks { 1972 /// Basic block which contains the generated SCEV checks, if any. 1973 BasicBlock *SCEVCheckBlock = nullptr; 1974 1975 /// The value representing the result of the generated SCEV checks. If it is 1976 /// nullptr, either no SCEV checks have been generated or they have been used. 1977 Value *SCEVCheckCond = nullptr; 1978 1979 /// Basic block which contains the generated memory runtime checks, if any. 1980 BasicBlock *MemCheckBlock = nullptr; 1981 1982 /// The value representing the result of the generated memory runtime checks. 1983 /// If it is nullptr, either no memory runtime checks have been generated or 1984 /// they have been used. 1985 Value *MemRuntimeCheckCond = nullptr; 1986 1987 DominatorTree *DT; 1988 LoopInfo *LI; 1989 1990 SCEVExpander SCEVExp; 1991 SCEVExpander MemCheckExp; 1992 1993 public: 1994 GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI, 1995 const DataLayout &DL) 1996 : DT(DT), LI(LI), SCEVExp(SE, DL, "scev.check"), 1997 MemCheckExp(SE, DL, "scev.check") {} 1998 1999 /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can 2000 /// accurately estimate the cost of the runtime checks. The blocks are 2001 /// un-linked from the IR and is added back during vector code generation. If 2002 /// there is no vector code generation, the check blocks are removed 2003 /// completely. 2004 void Create(Loop *L, const LoopAccessInfo &LAI, 2005 const SCEVUnionPredicate &UnionPred) { 2006 2007 BasicBlock *LoopHeader = L->getHeader(); 2008 BasicBlock *Preheader = L->getLoopPreheader(); 2009 2010 // Use SplitBlock to create blocks for SCEV & memory runtime checks to 2011 // ensure the blocks are properly added to LoopInfo & DominatorTree. Those 2012 // may be used by SCEVExpander. The blocks will be un-linked from their 2013 // predecessors and removed from LI & DT at the end of the function. 2014 if (!UnionPred.isAlwaysTrue()) { 2015 SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI, 2016 nullptr, "vector.scevcheck"); 2017 2018 SCEVCheckCond = SCEVExp.expandCodeForPredicate( 2019 &UnionPred, SCEVCheckBlock->getTerminator()); 2020 } 2021 2022 const auto &RtPtrChecking = *LAI.getRuntimePointerChecking(); 2023 if (RtPtrChecking.Need) { 2024 auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader; 2025 MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr, 2026 "vector.memcheck"); 2027 2028 MemRuntimeCheckCond = 2029 addRuntimeChecks(MemCheckBlock->getTerminator(), L, 2030 RtPtrChecking.getChecks(), MemCheckExp); 2031 assert(MemRuntimeCheckCond && 2032 "no RT checks generated although RtPtrChecking " 2033 "claimed checks are required"); 2034 } 2035 2036 if (!MemCheckBlock && !SCEVCheckBlock) 2037 return; 2038 2039 // Unhook the temporary block with the checks, update various places 2040 // accordingly. 
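// Schematically (an illustrative sketch of the intended result), after the
// unhooking below the preheader branches straight to the loop header again,
// while the check blocks survive only as predecessor-less blocks terminated
// by 'unreachable':
//   preheader:        br %loop.header
//   vector.scevcheck: <SCEV checks>   ... unreachable
//   vector.memcheck:  <memory checks> ... unreachable
// They are also removed from LoopInfo and the DominatorTree, and are either
// re-linked later by emitSCEVChecks/emitMemRuntimeChecks or deleted in the
// destructor.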
2041 if (SCEVCheckBlock) 2042 SCEVCheckBlock->replaceAllUsesWith(Preheader); 2043 if (MemCheckBlock) 2044 MemCheckBlock->replaceAllUsesWith(Preheader); 2045 2046 if (SCEVCheckBlock) { 2047 SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 2048 new UnreachableInst(Preheader->getContext(), SCEVCheckBlock); 2049 Preheader->getTerminator()->eraseFromParent(); 2050 } 2051 if (MemCheckBlock) { 2052 MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 2053 new UnreachableInst(Preheader->getContext(), MemCheckBlock); 2054 Preheader->getTerminator()->eraseFromParent(); 2055 } 2056 2057 DT->changeImmediateDominator(LoopHeader, Preheader); 2058 if (MemCheckBlock) { 2059 DT->eraseNode(MemCheckBlock); 2060 LI->removeBlock(MemCheckBlock); 2061 } 2062 if (SCEVCheckBlock) { 2063 DT->eraseNode(SCEVCheckBlock); 2064 LI->removeBlock(SCEVCheckBlock); 2065 } 2066 } 2067 2068 /// Remove the created SCEV & memory runtime check blocks & instructions, if 2069 /// unused. 2070 ~GeneratedRTChecks() { 2071 SCEVExpanderCleaner SCEVCleaner(SCEVExp); 2072 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp); 2073 if (!SCEVCheckCond) 2074 SCEVCleaner.markResultUsed(); 2075 2076 if (!MemRuntimeCheckCond) 2077 MemCheckCleaner.markResultUsed(); 2078 2079 if (MemRuntimeCheckCond) { 2080 auto &SE = *MemCheckExp.getSE(); 2081 // Memory runtime check generation creates compares that use expanded 2082 // values. Remove them before running the SCEVExpanderCleaners. 2083 for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) { 2084 if (MemCheckExp.isInsertedInstruction(&I)) 2085 continue; 2086 SE.forgetValue(&I); 2087 I.eraseFromParent(); 2088 } 2089 } 2090 MemCheckCleaner.cleanup(); 2091 SCEVCleaner.cleanup(); 2092 2093 if (SCEVCheckCond) 2094 SCEVCheckBlock->eraseFromParent(); 2095 if (MemRuntimeCheckCond) 2096 MemCheckBlock->eraseFromParent(); 2097 } 2098 2099 /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and 2100 /// adjusts the branches to branch to the vector preheader or \p Bypass, 2101 /// depending on the generated condition. 2102 BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass, 2103 BasicBlock *LoopVectorPreHeader, 2104 BasicBlock *LoopExitBlock) { 2105 if (!SCEVCheckCond) 2106 return nullptr; 2107 if (auto *C = dyn_cast<ConstantInt>(SCEVCheckCond)) 2108 if (C->isZero()) 2109 return nullptr; 2110 2111 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2112 2113 BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock); 2114 // Create new preheader for vector loop. 2115 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 2116 PL->addBasicBlockToLoop(SCEVCheckBlock, *LI); 2117 2118 SCEVCheckBlock->getTerminator()->eraseFromParent(); 2119 SCEVCheckBlock->moveBefore(LoopVectorPreHeader); 2120 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2121 SCEVCheckBlock); 2122 2123 DT->addNewBlock(SCEVCheckBlock, Pred); 2124 DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock); 2125 2126 ReplaceInstWithInst( 2127 SCEVCheckBlock->getTerminator(), 2128 BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheckCond)); 2129 // Mark the check as used, to prevent it from being removed during cleanup. 2130 SCEVCheckCond = nullptr; 2131 return SCEVCheckBlock; 2132 } 2133 2134 /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts 2135 /// the branches to branch to the vector preheader or \p Bypass, depending on 2136 /// the generated condition. 
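/// The resulting control flow is, schematically (an illustrative sketch):
///   Pred -> MemCheckBlock -+-> Bypass               (checks fail)
///                          +-> LoopVectorPreHeader  (checks pass)
/// where a true MemRuntimeCheckCond means a memory conflict was detected.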
2137 BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass, 2138 BasicBlock *LoopVectorPreHeader) { 2139 // Check if we generated code that checks in runtime if arrays overlap. 2140 if (!MemRuntimeCheckCond) 2141 return nullptr; 2142 2143 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2144 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2145 MemCheckBlock); 2146 2147 DT->addNewBlock(MemCheckBlock, Pred); 2148 DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock); 2149 MemCheckBlock->moveBefore(LoopVectorPreHeader); 2150 2151 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 2152 PL->addBasicBlockToLoop(MemCheckBlock, *LI); 2153 2154 ReplaceInstWithInst( 2155 MemCheckBlock->getTerminator(), 2156 BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond)); 2157 MemCheckBlock->getTerminator()->setDebugLoc( 2158 Pred->getTerminator()->getDebugLoc()); 2159 2160 // Mark the check as used, to prevent it from being removed during cleanup. 2161 MemRuntimeCheckCond = nullptr; 2162 return MemCheckBlock; 2163 } 2164 }; 2165 2166 // Return true if \p OuterLp is an outer loop annotated with hints for explicit 2167 // vectorization. The loop needs to be annotated with #pragma omp simd 2168 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the 2169 // vector length information is not provided, vectorization is not considered 2170 // explicit. Interleave hints are not allowed either. These limitations will be 2171 // relaxed in the future. 2172 // Please, note that we are currently forced to abuse the pragma 'clang 2173 // vectorize' semantics. This pragma provides *auto-vectorization hints* 2174 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' 2175 // provides *explicit vectorization hints* (LV can bypass legal checks and 2176 // assume that vectorization is legal). However, both hints are implemented 2177 // using the same metadata (llvm.loop.vectorize, processed by 2178 // LoopVectorizeHints). This will be fixed in the future when the native IR 2179 // representation for pragma 'omp simd' is introduced. 2180 static bool isExplicitVecOuterLoop(Loop *OuterLp, 2181 OptimizationRemarkEmitter *ORE) { 2182 assert(!OuterLp->isInnermost() && "This is not an outer loop"); 2183 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); 2184 2185 // Only outer loops with an explicit vectorization hint are supported. 2186 // Unannotated outer loops are ignored. 2187 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) 2188 return false; 2189 2190 Function *Fn = OuterLp->getHeader()->getParent(); 2191 if (!Hints.allowVectorization(Fn, OuterLp, 2192 true /*VectorizeOnlyWhenForced*/)) { 2193 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); 2194 return false; 2195 } 2196 2197 if (Hints.getInterleave() > 1) { 2198 // TODO: Interleave support is future work. 2199 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " 2200 "outer loops.\n"); 2201 Hints.emitRemarkWithHints(); 2202 return false; 2203 } 2204 2205 return true; 2206 } 2207 2208 static void collectSupportedLoops(Loop &L, LoopInfo *LI, 2209 OptimizationRemarkEmitter *ORE, 2210 SmallVectorImpl<Loop *> &V) { 2211 // Collect inner loops and outer loops without irreducible control flow. For 2212 // now, only collect outer loops that have explicit vectorization hints. If we 2213 // are stress testing the VPlan H-CFG construction, we collect the outermost 2214 // loop of every loop nest. 
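// For example (illustrative): given the nest
//   #pragma clang loop vectorize(enable) vectorize_width(4)
//   for (i = ...)      // annotated outer loop
//     for (j = ...)    // inner loop
// the VPlan-native path collects the annotated outer loop here, while the
// default path keeps recursing and only collects innermost loops.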
2215 if (L.isInnermost() || VPlanBuildStressTest || 2216 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { 2217 LoopBlocksRPO RPOT(&L); 2218 RPOT.perform(LI); 2219 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { 2220 V.push_back(&L); 2221 // TODO: Collect inner loops inside marked outer loops in case 2222 // vectorization fails for the outer loop. Do not invoke 2223 // 'containsIrreducibleCFG' again for inner loops when the outer loop is 2224 // already known to be reducible. We can use an inherited attribute for 2225 // that. 2226 return; 2227 } 2228 } 2229 for (Loop *InnerL : L) 2230 collectSupportedLoops(*InnerL, LI, ORE, V); 2231 } 2232 2233 namespace { 2234 2235 /// The LoopVectorize Pass. 2236 struct LoopVectorize : public FunctionPass { 2237 /// Pass identification, replacement for typeid 2238 static char ID; 2239 2240 LoopVectorizePass Impl; 2241 2242 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false, 2243 bool VectorizeOnlyWhenForced = false) 2244 : FunctionPass(ID), 2245 Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) { 2246 initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); 2247 } 2248 2249 bool runOnFunction(Function &F) override { 2250 if (skipFunction(F)) 2251 return false; 2252 2253 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 2254 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 2255 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 2256 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 2257 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); 2258 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); 2259 auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr; 2260 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); 2261 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 2262 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); 2263 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); 2264 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 2265 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 2266 2267 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 2268 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; 2269 2270 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC, 2271 GetLAA, *ORE, PSI).MadeAnyChange; 2272 } 2273 2274 void getAnalysisUsage(AnalysisUsage &AU) const override { 2275 AU.addRequired<AssumptionCacheTracker>(); 2276 AU.addRequired<BlockFrequencyInfoWrapperPass>(); 2277 AU.addRequired<DominatorTreeWrapperPass>(); 2278 AU.addRequired<LoopInfoWrapperPass>(); 2279 AU.addRequired<ScalarEvolutionWrapperPass>(); 2280 AU.addRequired<TargetTransformInfoWrapperPass>(); 2281 AU.addRequired<AAResultsWrapperPass>(); 2282 AU.addRequired<LoopAccessLegacyAnalysis>(); 2283 AU.addRequired<DemandedBitsWrapperPass>(); 2284 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 2285 AU.addRequired<InjectTLIMappingsLegacy>(); 2286 2287 // We currently do not preserve loopinfo/dominator analyses with outer loop 2288 // vectorization. Until this is addressed, mark these analyses as preserved 2289 // only for non-VPlan-native path. 2290 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 
2291 if (!EnableVPlanNativePath) { 2292 AU.addPreserved<LoopInfoWrapperPass>(); 2293 AU.addPreserved<DominatorTreeWrapperPass>(); 2294 } 2295 2296 AU.addPreserved<BasicAAWrapperPass>(); 2297 AU.addPreserved<GlobalsAAWrapperPass>(); 2298 AU.addRequired<ProfileSummaryInfoWrapperPass>(); 2299 } 2300 }; 2301 2302 } // end anonymous namespace 2303 2304 //===----------------------------------------------------------------------===// 2305 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 2306 // LoopVectorizationCostModel and LoopVectorizationPlanner. 2307 //===----------------------------------------------------------------------===// 2308 2309 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { 2310 // We need to place the broadcast of invariant variables outside the loop, 2311 // but only if it's proven safe to do so. Else, broadcast will be inside 2312 // vector loop body. 2313 Instruction *Instr = dyn_cast<Instruction>(V); 2314 bool SafeToHoist = OrigLoop->isLoopInvariant(V) && 2315 (!Instr || 2316 DT->dominates(Instr->getParent(), LoopVectorPreHeader)); 2317 // Place the code for broadcasting invariant variables in the new preheader. 2318 IRBuilder<>::InsertPointGuard Guard(Builder); 2319 if (SafeToHoist) 2320 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2321 2322 // Broadcast the scalar into all locations in the vector. 2323 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); 2324 2325 return Shuf; 2326 } 2327 2328 /// This function adds 2329 /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...) 2330 /// to each vector element of Val. The sequence starts at StartIndex. 2331 /// \p Opcode is relevant for FP induction variable. 2332 static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step, 2333 Instruction::BinaryOps BinOp, ElementCount VF, 2334 IRBuilder<> &Builder) { 2335 assert(VF.isVector() && "only vector VFs are supported"); 2336 2337 // Create and check the types. 2338 auto *ValVTy = cast<VectorType>(Val->getType()); 2339 ElementCount VLen = ValVTy->getElementCount(); 2340 2341 Type *STy = Val->getType()->getScalarType(); 2342 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && 2343 "Induction Step must be an integer or FP"); 2344 assert(Step->getType() == STy && "Step has wrong type"); 2345 2346 SmallVector<Constant *, 8> Indices; 2347 2348 // Create a vector of consecutive numbers from zero to VF. 2349 VectorType *InitVecValVTy = ValVTy; 2350 Type *InitVecValSTy = STy; 2351 if (STy->isFloatingPointTy()) { 2352 InitVecValSTy = 2353 IntegerType::get(STy->getContext(), STy->getScalarSizeInBits()); 2354 InitVecValVTy = VectorType::get(InitVecValSTy, VLen); 2355 } 2356 Value *InitVec = Builder.CreateStepVector(InitVecValVTy); 2357 2358 // Splat the StartIdx 2359 Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx); 2360 2361 if (STy->isIntegerTy()) { 2362 InitVec = Builder.CreateAdd(InitVec, StartIdxSplat); 2363 Step = Builder.CreateVectorSplat(VLen, Step); 2364 assert(Step->getType() == Val->getType() && "Invalid step vec"); 2365 // FIXME: The newly created binary instructions should contain nsw/nuw 2366 // flags, which can be found from the original scalar operations. 2367 Step = Builder.CreateMul(InitVec, Step); 2368 return Builder.CreateAdd(Val, Step, "induction"); 2369 } 2370 2371 // Floating point induction. 
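// For example (illustrative, VF = 4, StartIdx = 0, Step = 2.0, BinOp = fadd):
// InitVec becomes <0.0, 1.0, 2.0, 3.0>, MulOp becomes <0.0, 2.0, 4.0, 6.0>,
// and the result is Val fadd'ed lane-wise with <0.0, 2.0, 4.0, 6.0>.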
2372 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && 2373 "Binary Opcode should be specified for FP induction"); 2374 InitVec = Builder.CreateUIToFP(InitVec, ValVTy); 2375 InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat); 2376 2377 Step = Builder.CreateVectorSplat(VLen, Step); 2378 Value *MulOp = Builder.CreateFMul(InitVec, Step); 2379 return Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); 2380 } 2381 2382 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( 2383 const InductionDescriptor &II, Value *Step, Value *Start, 2384 Instruction *EntryVal, VPValue *Def, VPTransformState &State) { 2385 IRBuilder<> &Builder = State.Builder; 2386 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 2387 "Expected either an induction phi-node or a truncate of it!"); 2388 2389 // Construct the initial value of the vector IV in the vector loop preheader 2390 auto CurrIP = Builder.saveIP(); 2391 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2392 if (isa<TruncInst>(EntryVal)) { 2393 assert(Start->getType()->isIntegerTy() && 2394 "Truncation requires an integer type"); 2395 auto *TruncType = cast<IntegerType>(EntryVal->getType()); 2396 Step = Builder.CreateTrunc(Step, TruncType); 2397 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); 2398 } 2399 2400 Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0); 2401 Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start); 2402 Value *SteppedStart = getStepVector( 2403 SplatStart, Zero, Step, II.getInductionOpcode(), State.VF, State.Builder); 2404 2405 // We create vector phi nodes for both integer and floating-point induction 2406 // variables. Here, we determine the kind of arithmetic we will perform. 2407 Instruction::BinaryOps AddOp; 2408 Instruction::BinaryOps MulOp; 2409 if (Step->getType()->isIntegerTy()) { 2410 AddOp = Instruction::Add; 2411 MulOp = Instruction::Mul; 2412 } else { 2413 AddOp = II.getInductionOpcode(); 2414 MulOp = Instruction::FMul; 2415 } 2416 2417 // Multiply the vectorization factor by the step using integer or 2418 // floating-point arithmetic as appropriate. 2419 Type *StepType = Step->getType(); 2420 Value *RuntimeVF; 2421 if (Step->getType()->isFloatingPointTy()) 2422 RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF); 2423 else 2424 RuntimeVF = getRuntimeVF(Builder, StepType, State.VF); 2425 Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF); 2426 2427 // Create a vector splat to use in the induction update. 2428 // 2429 // FIXME: If the step is non-constant, we create the vector splat with 2430 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't 2431 // handle a constant vector splat. 2432 Value *SplatVF = isa<Constant>(Mul) 2433 ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul)) 2434 : Builder.CreateVectorSplat(State.VF, Mul); 2435 Builder.restoreIP(CurrIP); 2436 2437 // We may need to add the step a number of times, depending on the unroll 2438 // factor. The last of those goes into the PHI. 
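// For example (illustrative, UF = 2): part 0 uses the phi value directly,
// part 1 uses phi + VF*Step ("step.add"), and the value fed back into the phi
// on the backedge is phi + 2*VF*Step ("vec.ind.next").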
2439 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", 2440 &*LoopVectorBody->getFirstInsertionPt()); 2441 VecInd->setDebugLoc(EntryVal->getDebugLoc()); 2442 Instruction *LastInduction = VecInd; 2443 for (unsigned Part = 0; Part < UF; ++Part) { 2444 State.set(Def, LastInduction, Part); 2445 2446 if (isa<TruncInst>(EntryVal)) 2447 addMetadata(LastInduction, EntryVal); 2448 2449 LastInduction = cast<Instruction>( 2450 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")); 2451 LastInduction->setDebugLoc(EntryVal->getDebugLoc()); 2452 } 2453 2454 // Move the last step to the end of the latch block. This ensures consistent 2455 // placement of all induction updates. 2456 auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 2457 auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator()); 2458 LastInduction->moveBefore(Br); 2459 LastInduction->setName("vec.ind.next"); 2460 2461 VecInd->addIncoming(SteppedStart, LoopVectorPreHeader); 2462 VecInd->addIncoming(LastInduction, LoopVectorLatch); 2463 } 2464 2465 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const { 2466 return Cost->isScalarAfterVectorization(I, VF) || 2467 Cost->isProfitableToScalarize(I, VF); 2468 } 2469 2470 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const { 2471 if (shouldScalarizeInstruction(IV)) 2472 return true; 2473 auto isScalarInst = [&](User *U) -> bool { 2474 auto *I = cast<Instruction>(U); 2475 return (OrigLoop->contains(I) && shouldScalarizeInstruction(I)); 2476 }; 2477 return llvm::any_of(IV->users(), isScalarInst); 2478 } 2479 2480 void InnerLoopVectorizer::widenIntOrFpInduction( 2481 PHINode *IV, VPWidenIntOrFpInductionRecipe *Def, VPTransformState &State, 2482 Value *CanonicalIV) { 2483 Value *Start = Def->getStartValue()->getLiveInIRValue(); 2484 const InductionDescriptor &ID = Def->getInductionDescriptor(); 2485 TruncInst *Trunc = Def->getTruncInst(); 2486 IRBuilder<> &Builder = State.Builder; 2487 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); 2488 assert(!State.VF.isZero() && "VF must be non-zero"); 2489 2490 // The value from the original loop to which we are mapping the new induction 2491 // variable. 2492 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; 2493 2494 auto &DL = EntryVal->getModule()->getDataLayout(); 2495 2496 // Generate code for the induction step. Note that induction steps are 2497 // required to be loop-invariant 2498 auto CreateStepValue = [&](const SCEV *Step) -> Value * { 2499 assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) && 2500 "Induction step should be loop invariant"); 2501 if (PSE.getSE()->isSCEVable(IV->getType())) { 2502 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 2503 return Exp.expandCodeFor(Step, Step->getType(), 2504 State.CFG.VectorPreHeader->getTerminator()); 2505 } 2506 return cast<SCEVUnknown>(Step)->getValue(); 2507 }; 2508 2509 // The scalar value to broadcast. This is derived from the canonical 2510 // induction variable. If a truncation type is given, truncate the canonical 2511 // induction variable and step. Otherwise, derive these values from the 2512 // induction descriptor. 2513 auto CreateScalarIV = [&](Value *&Step) -> Value * { 2514 Value *ScalarIV = CanonicalIV; 2515 Type *NeededType = IV->getType(); 2516 if (!Def->isCanonical() || ScalarIV->getType() != NeededType) { 2517 ScalarIV = 2518 NeededType->isIntegerTy() 2519 ? 
Builder.CreateSExtOrTrunc(ScalarIV, NeededType) 2520 : Builder.CreateCast(Instruction::SIToFP, ScalarIV, NeededType); 2521 ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID, 2522 State.CFG.PrevBB); 2523 ScalarIV->setName("offset.idx"); 2524 } 2525 if (Trunc) { 2526 auto *TruncType = cast<IntegerType>(Trunc->getType()); 2527 assert(Step->getType()->isIntegerTy() && 2528 "Truncation requires an integer step"); 2529 ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType); 2530 Step = Builder.CreateTrunc(Step, TruncType); 2531 } 2532 return ScalarIV; 2533 }; 2534 2535 // Create the vector values from the scalar IV, in the absence of creating a 2536 // vector IV. 2537 auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) { 2538 Value *Broadcasted = getBroadcastInstrs(ScalarIV); 2539 for (unsigned Part = 0; Part < UF; ++Part) { 2540 Value *StartIdx; 2541 if (Step->getType()->isFloatingPointTy()) 2542 StartIdx = 2543 getRuntimeVFAsFloat(Builder, Step->getType(), State.VF * Part); 2544 else 2545 StartIdx = getRuntimeVF(Builder, Step->getType(), State.VF * Part); 2546 2547 Value *EntryPart = 2548 getStepVector(Broadcasted, StartIdx, Step, ID.getInductionOpcode(), 2549 State.VF, State.Builder); 2550 State.set(Def, EntryPart, Part); 2551 if (Trunc) 2552 addMetadata(EntryPart, Trunc); 2553 } 2554 }; 2555 2556 // Fast-math-flags propagate from the original induction instruction. 2557 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 2558 if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp())) 2559 Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags()); 2560 2561 // Now do the actual transformations, and start with creating the step value. 2562 Value *Step = CreateStepValue(ID.getStep()); 2563 if (State.VF.isScalar()) { 2564 Value *ScalarIV = CreateScalarIV(Step); 2565 Type *ScalarTy = IntegerType::get(ScalarIV->getContext(), 2566 Step->getType()->getScalarSizeInBits()); 2567 2568 Instruction::BinaryOps IncOp = ID.getInductionOpcode(); 2569 if (IncOp == Instruction::BinaryOpsEnd) 2570 IncOp = Instruction::Add; 2571 for (unsigned Part = 0; Part < UF; ++Part) { 2572 Value *StartIdx = ConstantInt::get(ScalarTy, Part); 2573 Instruction::BinaryOps MulOp = Instruction::Mul; 2574 if (Step->getType()->isFloatingPointTy()) { 2575 StartIdx = Builder.CreateUIToFP(StartIdx, Step->getType()); 2576 MulOp = Instruction::FMul; 2577 } 2578 2579 Value *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step); 2580 Value *EntryPart = Builder.CreateBinOp(IncOp, ScalarIV, Mul, "induction"); 2581 State.set(Def, EntryPart, Part); 2582 if (Trunc) { 2583 assert(!Step->getType()->isFloatingPointTy() && 2584 "fp inductions shouldn't be truncated"); 2585 addMetadata(EntryPart, Trunc); 2586 } 2587 } 2588 return; 2589 } 2590 2591 // Determine if we want a scalar version of the induction variable. This is 2592 // true if the induction variable itself is not widened, or if it has at 2593 // least one user in the loop that is not widened. 2594 auto NeedsScalarIV = needsScalarInduction(EntryVal); 2595 if (!NeedsScalarIV) { 2596 createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, State); 2597 return; 2598 } 2599 2600 // Try to create a new independent vector induction variable. If we can't 2601 // create the phi node, we will splat the scalar induction variable in each 2602 // loop iteration. 
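// In other words (an illustrative summary of the two remaining cases): either
// a vector IV phi is still created and scalar steps are added for the scalar
// users, or, when the IV itself must be scalarized, only the scalar steps are
// emitted, plus a splat of the IV when tail folding needs it for the
// predicate of masked loads/stores.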
2603 if (!shouldScalarizeInstruction(EntryVal)) { 2604 createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, State); 2605 Value *ScalarIV = CreateScalarIV(Step); 2606 // Create scalar steps that can be used by instructions we will later 2607 // scalarize. Note that the addition of the scalar steps will not increase 2608 // the number of instructions in the loop in the common case prior to 2609 // InstCombine. We will be trading one vector extract for each scalar step. 2610 buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, State); 2611 return; 2612 } 2613 2614 // All IV users are scalar instructions, so only emit a scalar IV, not a 2615 // vectorised IV. Except when we tail-fold, then the splat IV feeds the 2616 // predicate used by the masked loads/stores. 2617 Value *ScalarIV = CreateScalarIV(Step); 2618 if (!Cost->isScalarEpilogueAllowed()) 2619 CreateSplatIV(ScalarIV, Step); 2620 buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, State); 2621 } 2622 2623 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, 2624 Instruction *EntryVal, 2625 const InductionDescriptor &ID, 2626 VPValue *Def, 2627 VPTransformState &State) { 2628 IRBuilder<> &Builder = State.Builder; 2629 // We shouldn't have to build scalar steps if we aren't vectorizing. 2630 assert(State.VF.isVector() && "VF should be greater than one"); 2631 // Get the value type and ensure it and the step have the same integer type. 2632 Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); 2633 assert(ScalarIVTy == Step->getType() && 2634 "Val and Step should have the same type"); 2635 2636 // We build scalar steps for both integer and floating-point induction 2637 // variables. Here, we determine the kind of arithmetic we will perform. 2638 Instruction::BinaryOps AddOp; 2639 Instruction::BinaryOps MulOp; 2640 if (ScalarIVTy->isIntegerTy()) { 2641 AddOp = Instruction::Add; 2642 MulOp = Instruction::Mul; 2643 } else { 2644 AddOp = ID.getInductionOpcode(); 2645 MulOp = Instruction::FMul; 2646 } 2647 2648 // Determine the number of scalars we need to generate for each unroll 2649 // iteration. If EntryVal is uniform, we only need to generate the first 2650 // lane. Otherwise, we generate all VF values. 2651 bool IsUniform = 2652 Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), State.VF); 2653 unsigned Lanes = IsUniform ? 1 : State.VF.getKnownMinValue(); 2654 // Compute the scalar steps and save the results in State. 
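// For example (illustrative, fixed VF = 4, UF = 1, integer IV with step S):
// the non-uniform case materializes the per-lane values
//   ScalarIV + 0*S, ScalarIV + 1*S, ScalarIV + 2*S, ScalarIV + 3*S
// whereas a uniform EntryVal only needs the lane-0 value.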
2655 Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(), 2656 ScalarIVTy->getScalarSizeInBits()); 2657 Type *VecIVTy = nullptr; 2658 Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr; 2659 if (!IsUniform && State.VF.isScalable()) { 2660 VecIVTy = VectorType::get(ScalarIVTy, State.VF); 2661 UnitStepVec = 2662 Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF)); 2663 SplatStep = Builder.CreateVectorSplat(State.VF, Step); 2664 SplatIV = Builder.CreateVectorSplat(State.VF, ScalarIV); 2665 } 2666 2667 for (unsigned Part = 0; Part < State.UF; ++Part) { 2668 Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part); 2669 2670 if (!IsUniform && State.VF.isScalable()) { 2671 auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0); 2672 auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec); 2673 if (ScalarIVTy->isFloatingPointTy()) 2674 InitVec = Builder.CreateSIToFP(InitVec, VecIVTy); 2675 auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep); 2676 auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul); 2677 State.set(Def, Add, Part); 2678 // It's useful to record the lane values too for the known minimum number 2679 // of elements so we do those below. This improves the code quality when 2680 // trying to extract the first element, for example. 2681 } 2682 2683 if (ScalarIVTy->isFloatingPointTy()) 2684 StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy); 2685 2686 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 2687 Value *StartIdx = Builder.CreateBinOp( 2688 AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane)); 2689 // The step returned by `createStepForVF` is a runtime-evaluated value 2690 // when VF is scalable. Otherwise, it should be folded into a Constant. 2691 assert((State.VF.isScalable() || isa<Constant>(StartIdx)) && 2692 "Expected StartIdx to be folded to a constant when VF is not " 2693 "scalable"); 2694 auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step); 2695 auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul); 2696 State.set(Def, Add, VPIteration(Part, Lane)); 2697 } 2698 } 2699 } 2700 2701 void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def, 2702 const VPIteration &Instance, 2703 VPTransformState &State) { 2704 Value *ScalarInst = State.get(Def, Instance); 2705 Value *VectorValue = State.get(Def, Instance.Part); 2706 VectorValue = Builder.CreateInsertElement( 2707 VectorValue, ScalarInst, 2708 Instance.Lane.getAsRuntimeExpr(State.Builder, VF)); 2709 State.set(Def, VectorValue, Instance.Part); 2710 } 2711 2712 // Return whether we allow using masked interleave-groups (for dealing with 2713 // strided loads/stores that reside in predicated blocks, or for dealing 2714 // with gaps). 2715 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { 2716 // If an override option has been passed in for interleaved accesses, use it. 2717 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) 2718 return EnableMaskedInterleavedMemAccesses; 2719 2720 return TTI.enableMaskedInterleavedAccessVectorization(); 2721 } 2722 2723 // Try to vectorize the interleave group that \p Instr belongs to. 2724 // 2725 // E.g. Translate following interleaved load group (factor = 3): 2726 // for (i = 0; i < N; i+=3) { 2727 // R = Pic[i]; // Member of index 0 2728 // G = Pic[i+1]; // Member of index 1 2729 // B = Pic[i+2]; // Member of index 2 2730 // ... 
// do something to R, G, B 2731 // } 2732 // To: 2733 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2734 // %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements 2735 // %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements 2736 // %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements 2737 // 2738 // Or translate following interleaved store group (factor = 3): 2739 // for (i = 0; i < N; i+=3) { 2740 // ... do something to R, G, B 2741 // Pic[i] = R; // Member of index 0 2742 // Pic[i+1] = G; // Member of index 1 2743 // Pic[i+2] = B; // Member of index 2 2744 // } 2745 // To: 2746 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> 2747 // %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u> 2748 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, 2749 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements 2750 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B 2751 void InnerLoopVectorizer::vectorizeInterleaveGroup( 2752 const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs, 2753 VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues, 2754 VPValue *BlockInMask) { 2755 Instruction *Instr = Group->getInsertPos(); 2756 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2757 2758 // Prepare for the vector type of the interleaved load/store. 2759 Type *ScalarTy = getLoadStoreType(Instr); 2760 unsigned InterleaveFactor = Group->getFactor(); 2761 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2762 auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor); 2763 2764 // Prepare for the new pointers. 2765 SmallVector<Value *, 2> AddrParts; 2766 unsigned Index = Group->getIndex(Instr); 2767 2768 // TODO: extend the masked interleaved-group support to reversed access. 2769 assert((!BlockInMask || !Group->isReverse()) && 2770 "Reversed masked interleave-group not supported."); 2771 2772 // If the group is reverse, adjust the index to refer to the last vector lane 2773 // instead of the first. We adjust the index from the first vector lane, 2774 // rather than directly getting the pointer for lane VF - 1, because the 2775 // pointer operand of the interleaved access is supposed to be uniform. For 2776 // uniform instructions, we're only required to generate a value for the 2777 // first vector lane in each unroll iteration. 2778 if (Group->isReverse()) 2779 Index += (VF.getKnownMinValue() - 1) * Group->getFactor(); 2780 2781 for (unsigned Part = 0; Part < UF; Part++) { 2782 Value *AddrPart = State.get(Addr, VPIteration(Part, 0)); 2783 setDebugLocFromInst(AddrPart); 2784 2785 // Notice current instruction could be any index. Need to adjust the address 2786 // to the member of index 0. 2787 // 2788 // E.g. a = A[i+1]; // Member of index 1 (Current instruction) 2789 // b = A[i]; // Member of index 0 2790 // Current pointer is pointed to A[i+1], adjust it to A[i]. 2791 // 2792 // E.g. A[i+1] = a; // Member of index 1 2793 // A[i] = b; // Member of index 0 2794 // A[i+2] = c; // Member of index 2 (Current instruction) 2795 // Current pointer is pointed to A[i+2], adjust it to A[i]. 2796 2797 bool InBounds = false; 2798 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts())) 2799 InBounds = gep->isInBounds(); 2800 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index)); 2801 cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds); 2802 2803 // Cast to the vector pointer type. 
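    // For example (illustrative only), a factor-3 group of i32 accesses with
    // a fixed VF of 4 uses the wide type <12 x i32>, so the member-0 address
    // computed above is bitcast to a <12 x i32> pointer in the original
    // address space before the wide load or store is emitted.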
2804 unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace(); 2805 Type *PtrTy = VecTy->getPointerTo(AddressSpace); 2806 AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy)); 2807 } 2808 2809 setDebugLocFromInst(Instr); 2810 Value *PoisonVec = PoisonValue::get(VecTy); 2811 2812 Value *MaskForGaps = nullptr; 2813 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) { 2814 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); 2815 assert(MaskForGaps && "Mask for Gaps is required but it is null"); 2816 } 2817 2818 // Vectorize the interleaved load group. 2819 if (isa<LoadInst>(Instr)) { 2820 // For each unroll part, create a wide load for the group. 2821 SmallVector<Value *, 2> NewLoads; 2822 for (unsigned Part = 0; Part < UF; Part++) { 2823 Instruction *NewLoad; 2824 if (BlockInMask || MaskForGaps) { 2825 assert(useMaskedInterleavedAccesses(*TTI) && 2826 "masked interleaved groups are not allowed."); 2827 Value *GroupMask = MaskForGaps; 2828 if (BlockInMask) { 2829 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2830 Value *ShuffledMask = Builder.CreateShuffleVector( 2831 BlockInMaskPart, 2832 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2833 "interleaved.mask"); 2834 GroupMask = MaskForGaps 2835 ? Builder.CreateBinOp(Instruction::And, ShuffledMask, 2836 MaskForGaps) 2837 : ShuffledMask; 2838 } 2839 NewLoad = 2840 Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(), 2841 GroupMask, PoisonVec, "wide.masked.vec"); 2842 } 2843 else 2844 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], 2845 Group->getAlign(), "wide.vec"); 2846 Group->addMetadata(NewLoad); 2847 NewLoads.push_back(NewLoad); 2848 } 2849 2850 // For each member in the group, shuffle out the appropriate data from the 2851 // wide loads. 2852 unsigned J = 0; 2853 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2854 Instruction *Member = Group->getMember(I); 2855 2856 // Skip the gaps in the group. 2857 if (!Member) 2858 continue; 2859 2860 auto StrideMask = 2861 createStrideMask(I, InterleaveFactor, VF.getKnownMinValue()); 2862 for (unsigned Part = 0; Part < UF; Part++) { 2863 Value *StridedVec = Builder.CreateShuffleVector( 2864 NewLoads[Part], StrideMask, "strided.vec"); 2865 2866 // If this member has different type, cast the result type. 2867 if (Member->getType() != ScalarTy) { 2868 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2869 VectorType *OtherVTy = VectorType::get(Member->getType(), VF); 2870 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); 2871 } 2872 2873 if (Group->isReverse()) 2874 StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse"); 2875 2876 State.set(VPDefs[J], StridedVec, Part); 2877 } 2878 ++J; 2879 } 2880 return; 2881 } 2882 2883 // The sub vector type for current instruction. 2884 auto *SubVT = VectorType::get(ScalarTy, VF); 2885 2886 // Vectorize the interleaved store group. 2887 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); 2888 assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) && 2889 "masked interleaved groups are not allowed."); 2890 assert((!MaskForGaps || !VF.isScalable()) && 2891 "masking gaps for scalable vectors is not yet supported."); 2892 for (unsigned Part = 0; Part < UF; Part++) { 2893 // Collect the stored vector from each member. 
2894 SmallVector<Value *, 4> StoredVecs; 2895 for (unsigned i = 0; i < InterleaveFactor; i++) { 2896 assert((Group->getMember(i) || MaskForGaps) && 2897 "Fail to get a member from an interleaved store group"); 2898 Instruction *Member = Group->getMember(i); 2899 2900 // Skip the gaps in the group. 2901 if (!Member) { 2902 Value *Undef = PoisonValue::get(SubVT); 2903 StoredVecs.push_back(Undef); 2904 continue; 2905 } 2906 2907 Value *StoredVec = State.get(StoredValues[i], Part); 2908 2909 if (Group->isReverse()) 2910 StoredVec = Builder.CreateVectorReverse(StoredVec, "reverse"); 2911 2912 // If this member has different type, cast it to a unified type. 2913 2914 if (StoredVec->getType() != SubVT) 2915 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL); 2916 2917 StoredVecs.push_back(StoredVec); 2918 } 2919 2920 // Concatenate all vectors into a wide vector. 2921 Value *WideVec = concatenateVectors(Builder, StoredVecs); 2922 2923 // Interleave the elements in the wide vector. 2924 Value *IVec = Builder.CreateShuffleVector( 2925 WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor), 2926 "interleaved.vec"); 2927 2928 Instruction *NewStoreInstr; 2929 if (BlockInMask || MaskForGaps) { 2930 Value *GroupMask = MaskForGaps; 2931 if (BlockInMask) { 2932 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2933 Value *ShuffledMask = Builder.CreateShuffleVector( 2934 BlockInMaskPart, 2935 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2936 "interleaved.mask"); 2937 GroupMask = MaskForGaps ? Builder.CreateBinOp(Instruction::And, 2938 ShuffledMask, MaskForGaps) 2939 : ShuffledMask; 2940 } 2941 NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part], 2942 Group->getAlign(), GroupMask); 2943 } else 2944 NewStoreInstr = 2945 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign()); 2946 2947 Group->addMetadata(NewStoreInstr); 2948 } 2949 } 2950 2951 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, 2952 VPReplicateRecipe *RepRecipe, 2953 const VPIteration &Instance, 2954 bool IfPredicateInstr, 2955 VPTransformState &State) { 2956 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 2957 2958 // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for 2959 // the first lane and part. 2960 if (isa<NoAliasScopeDeclInst>(Instr)) 2961 if (!Instance.isFirstIteration()) 2962 return; 2963 2964 setDebugLocFromInst(Instr); 2965 2966 // Does this instruction return a value ? 2967 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 2968 2969 Instruction *Cloned = Instr->clone(); 2970 if (!IsVoidRetTy) 2971 Cloned->setName(Instr->getName() + ".cloned"); 2972 2973 // If the scalarized instruction contributes to the address computation of a 2974 // widen masked load/store which was in a basic block that needed predication 2975 // and is not predicated after vectorization, we can't propagate 2976 // poison-generating flags (nuw/nsw, exact, inbounds, etc.). The scalarized 2977 // instruction could feed a poison value to the base address of the widen 2978 // load/store. 2979 if (State.MayGeneratePoisonRecipes.contains(RepRecipe)) 2980 Cloned->dropPoisonGeneratingFlags(); 2981 2982 State.Builder.SetInsertPoint(Builder.GetInsertBlock(), 2983 Builder.GetInsertPoint()); 2984 // Replace the operands of the cloned instructions with their scalar 2985 // equivalents in the new loop. 
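  // Illustrative sketch (names hypothetical): when replicating
  //   %gep = getelementptr i32, i32* %base, i64 %iv
  // for lane 2 of part 1, the %iv operand is rewritten to the scalar value
  // recorded for (part 1, lane 2), while an operand that is uniform after
  // vectorization always reads lane 0 of its part.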
2986 for (auto &I : enumerate(RepRecipe->operands())) { 2987 auto InputInstance = Instance; 2988 VPValue *Operand = I.value(); 2989 if (State.Plan->isUniformAfterVectorization(Operand)) 2990 InputInstance.Lane = VPLane::getFirstLane(); 2991 Cloned->setOperand(I.index(), State.get(Operand, InputInstance)); 2992 } 2993 addNewMetadata(Cloned, Instr); 2994 2995 // Place the cloned scalar in the new loop. 2996 Builder.Insert(Cloned); 2997 2998 State.set(RepRecipe, Cloned, Instance); 2999 3000 // If we just cloned a new assumption, add it the assumption cache. 3001 if (auto *II = dyn_cast<AssumeInst>(Cloned)) 3002 AC->registerAssumption(II); 3003 3004 // End if-block. 3005 if (IfPredicateInstr) 3006 PredicatedInstructions.push_back(Cloned); 3007 } 3008 3009 void InnerLoopVectorizer::createHeaderBranch(Loop *L) { 3010 BasicBlock *Header = L->getHeader(); 3011 assert(!L->getLoopLatch() && "loop should not have a latch at this point"); 3012 3013 IRBuilder<> B(Header->getTerminator()); 3014 Instruction *OldInst = 3015 getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()); 3016 setDebugLocFromInst(OldInst, &B); 3017 3018 // Connect the header to the exit and header blocks and replace the old 3019 // terminator. 3020 B.CreateCondBr(B.getTrue(), L->getUniqueExitBlock(), Header); 3021 3022 // Now we have two terminators. Remove the old one from the block. 3023 Header->getTerminator()->eraseFromParent(); 3024 } 3025 3026 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { 3027 if (TripCount) 3028 return TripCount; 3029 3030 assert(L && "Create Trip Count for null loop."); 3031 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 3032 // Find the loop boundaries. 3033 ScalarEvolution *SE = PSE.getSE(); 3034 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 3035 assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && 3036 "Invalid loop count"); 3037 3038 Type *IdxTy = Legal->getWidestInductionType(); 3039 assert(IdxTy && "No type for induction"); 3040 3041 // The exit count might have the type of i64 while the phi is i32. This can 3042 // happen if we have an induction variable that is sign extended before the 3043 // compare. The only way that we get a backedge taken count is that the 3044 // induction variable was signed and as such will not overflow. In such a case 3045 // truncation is legal. 3046 if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) > 3047 IdxTy->getPrimitiveSizeInBits()) 3048 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy); 3049 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); 3050 3051 // Get the total trip count from the count by adding 1. 3052 const SCEV *ExitCount = SE->getAddExpr( 3053 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 3054 3055 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); 3056 3057 // Expand the trip count and place the new instructions in the preheader. 3058 // Notice that the pre-header does not change, only the loop body. 3059 SCEVExpander Exp(*SE, DL, "induction"); 3060 3061 // Count holds the overall loop count (N). 
3062 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), 3063 L->getLoopPreheader()->getTerminator()); 3064 3065 if (TripCount->getType()->isPointerTy()) 3066 TripCount = 3067 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", 3068 L->getLoopPreheader()->getTerminator()); 3069 3070 return TripCount; 3071 } 3072 3073 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { 3074 if (VectorTripCount) 3075 return VectorTripCount; 3076 3077 Value *TC = getOrCreateTripCount(L); 3078 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 3079 3080 Type *Ty = TC->getType(); 3081 // This is where we can make the step a runtime constant. 3082 Value *Step = createStepForVF(Builder, Ty, VF, UF); 3083 3084 // If the tail is to be folded by masking, round the number of iterations N 3085 // up to a multiple of Step instead of rounding down. This is done by first 3086 // adding Step-1 and then rounding down. Note that it's ok if this addition 3087 // overflows: the vector induction variable will eventually wrap to zero given 3088 // that it starts at zero and its Step is a power of two; the loop will then 3089 // exit, with the last early-exit vector comparison also producing all-true. 3090 if (Cost->foldTailByMasking()) { 3091 assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && 3092 "VF*UF must be a power of 2 when folding tail by masking"); 3093 Value *NumLanes = getRuntimeVF(Builder, Ty, VF * UF); 3094 TC = Builder.CreateAdd( 3095 TC, Builder.CreateSub(NumLanes, ConstantInt::get(Ty, 1)), "n.rnd.up"); 3096 } 3097 3098 // Now we need to generate the expression for the part of the loop that the 3099 // vectorized body will execute. This is equal to N - (N % Step) if scalar 3100 // iterations are not required for correctness, or N - Step, otherwise. Step 3101 // is equal to the vectorization factor (number of SIMD elements) times the 3102 // unroll factor (number of SIMD instructions). 3103 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); 3104 3105 // There are cases where we *must* run at least one iteration in the remainder 3106 // loop. See the cost model for when this can happen. If the step evenly 3107 // divides the trip count, we set the remainder to be equal to the step. If 3108 // the step does not evenly divide the trip count, no adjustment is necessary 3109 // since there will already be scalar iterations. Note that the minimum 3110 // iterations check ensures that N >= Step. 3111 if (Cost->requiresScalarEpilogue(VF)) { 3112 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); 3113 R = Builder.CreateSelect(IsZero, Step, R); 3114 } 3115 3116 VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); 3117 3118 return VectorTripCount; 3119 } 3120 3121 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, 3122 const DataLayout &DL) { 3123 // Verify that V is a vector type with same number of elements as DstVTy. 3124 auto *DstFVTy = cast<FixedVectorType>(DstVTy); 3125 unsigned VF = DstFVTy->getNumElements(); 3126 auto *SrcVecTy = cast<FixedVectorType>(V->getType()); 3127 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); 3128 Type *SrcElemTy = SrcVecTy->getElementType(); 3129 Type *DstElemTy = DstFVTy->getElementType(); 3130 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 3131 "Vector elements must have same size"); 3132 3133 // Do a direct cast if element types are castable. 
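  // For example (illustrative only), <4 x float> -> <4 x i32> is handled by a
  // single bitcast here, while a pointer vs. floating-point element mismatch
  // falls through to the two-step cast below, which goes through an integer
  // vector of the same element width.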
3134 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 3135 return Builder.CreateBitOrPointerCast(V, DstFVTy); 3136 } 3137 // V cannot be directly casted to desired vector type. 3138 // May happen when V is a floating point vector but DstVTy is a vector of 3139 // pointers or vice-versa. Handle this using a two-step bitcast using an 3140 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 3141 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 3142 "Only one type should be a pointer type"); 3143 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 3144 "Only one type should be a floating point type"); 3145 Type *IntTy = 3146 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 3147 auto *VecIntTy = FixedVectorType::get(IntTy, VF); 3148 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 3149 return Builder.CreateBitOrPointerCast(CastVal, DstFVTy); 3150 } 3151 3152 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, 3153 BasicBlock *Bypass) { 3154 Value *Count = getOrCreateTripCount(L); 3155 // Reuse existing vector loop preheader for TC checks. 3156 // Note that new preheader block is generated for vector loop. 3157 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 3158 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 3159 3160 // Generate code to check if the loop's trip count is less than VF * UF, or 3161 // equal to it in case a scalar epilogue is required; this implies that the 3162 // vector trip count is zero. This check also covers the case where adding one 3163 // to the backedge-taken count overflowed leading to an incorrect trip count 3164 // of zero. In this case we will also jump to the scalar loop. 3165 auto P = Cost->requiresScalarEpilogue(VF) ? ICmpInst::ICMP_ULE 3166 : ICmpInst::ICMP_ULT; 3167 3168 // If tail is to be folded, vector loop takes care of all iterations. 3169 Value *CheckMinIters = Builder.getFalse(); 3170 if (!Cost->foldTailByMasking()) { 3171 Value *Step = createStepForVF(Builder, Count->getType(), VF, UF); 3172 CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check"); 3173 } 3174 // Create new preheader for vector loop. 3175 LoopVectorPreHeader = 3176 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, 3177 "vector.ph"); 3178 3179 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 3180 DT->getNode(Bypass)->getIDom()) && 3181 "TC check is expected to dominate Bypass"); 3182 3183 // Update dominator for Bypass & LoopExit (if needed). 3184 DT->changeImmediateDominator(Bypass, TCCheckBlock); 3185 if (!Cost->requiresScalarEpilogue(VF)) 3186 // If there is an epilogue which must run, there's no edge from the 3187 // middle block to exit blocks and thus no need to update the immediate 3188 // dominator of the exit blocks. 
3189 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 3190 3191 ReplaceInstWithInst( 3192 TCCheckBlock->getTerminator(), 3193 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 3194 LoopBypassBlocks.push_back(TCCheckBlock); 3195 } 3196 3197 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { 3198 3199 BasicBlock *const SCEVCheckBlock = 3200 RTChecks.emitSCEVChecks(L, Bypass, LoopVectorPreHeader, LoopExitBlock); 3201 if (!SCEVCheckBlock) 3202 return nullptr; 3203 3204 assert(!(SCEVCheckBlock->getParent()->hasOptSize() || 3205 (OptForSizeBasedOnProfile && 3206 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) && 3207 "Cannot SCEV check stride or overflow when optimizing for size"); 3208 3209 3210 // Update dominator only if this is first RT check. 3211 if (LoopBypassBlocks.empty()) { 3212 DT->changeImmediateDominator(Bypass, SCEVCheckBlock); 3213 if (!Cost->requiresScalarEpilogue(VF)) 3214 // If there is an epilogue which must run, there's no edge from the 3215 // middle block to exit blocks and thus no need to update the immediate 3216 // dominator of the exit blocks. 3217 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock); 3218 } 3219 3220 LoopBypassBlocks.push_back(SCEVCheckBlock); 3221 AddedSafetyChecks = true; 3222 return SCEVCheckBlock; 3223 } 3224 3225 BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, 3226 BasicBlock *Bypass) { 3227 // VPlan-native path does not do any analysis for runtime checks currently. 3228 if (EnableVPlanNativePath) 3229 return nullptr; 3230 3231 BasicBlock *const MemCheckBlock = 3232 RTChecks.emitMemRuntimeChecks(L, Bypass, LoopVectorPreHeader); 3233 3234 // Check if we generated code that checks in runtime if arrays overlap. We put 3235 // the checks into a separate block to make the more common case of few 3236 // elements faster. 3237 if (!MemCheckBlock) 3238 return nullptr; 3239 3240 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) { 3241 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && 3242 "Cannot emit memory checks when optimizing for size, unless forced " 3243 "to vectorize."); 3244 ORE->emit([&]() { 3245 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize", 3246 L->getStartLoc(), L->getHeader()) 3247 << "Code-size may be reduced by not forcing " 3248 "vectorization, or by source-code modifications " 3249 "eliminating the need for runtime checks " 3250 "(e.g., adding 'restrict')."; 3251 }); 3252 } 3253 3254 LoopBypassBlocks.push_back(MemCheckBlock); 3255 3256 AddedSafetyChecks = true; 3257 3258 // We currently don't use LoopVersioning for the actual loop cloning but we 3259 // still use it to add the noalias metadata. 3260 LVer = std::make_unique<LoopVersioning>( 3261 *Legal->getLAI(), 3262 Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, 3263 DT, PSE.getSE()); 3264 LVer->prepareNoAliasMetadata(); 3265 return MemCheckBlock; 3266 } 3267 3268 Value *InnerLoopVectorizer::emitTransformedIndex( 3269 IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL, 3270 const InductionDescriptor &ID, BasicBlock *VectorHeader) const { 3271 3272 SCEVExpander Exp(*SE, DL, "induction"); 3273 auto Step = ID.getStep(); 3274 auto StartValue = ID.getStartValue(); 3275 assert(Index->getType()->getScalarType() == Step->getType() && 3276 "Index scalar type does not match StepValue type"); 3277 3278 // Note: the IR at this point is broken. 
We cannot use SE to create any new
3279 // SCEV and then expand it, hoping that SCEV's simplification will give us
3280 // more optimal code. Unfortunately, attempting to do so on invalid IR may
3281 // lead to various SCEV crashes. So all we can do is use the builder and rely
3282 // on InstCombine for future simplifications. Here we handle some trivial
3283 // cases only.
3284   auto CreateAdd = [&B](Value *X, Value *Y) {
3285     assert(X->getType() == Y->getType() && "Types don't match!");
3286     if (auto *CX = dyn_cast<ConstantInt>(X))
3287       if (CX->isZero())
3288         return Y;
3289     if (auto *CY = dyn_cast<ConstantInt>(Y))
3290       if (CY->isZero())
3291         return X;
3292     return B.CreateAdd(X, Y);
3293   };
3294
3295   // We allow X to be a vector type, in which case Y will potentially be
3296   // splatted into a vector with the same element count.
3297   auto CreateMul = [&B](Value *X, Value *Y) {
3298     assert(X->getType()->getScalarType() == Y->getType() &&
3299            "Types don't match!");
3300     if (auto *CX = dyn_cast<ConstantInt>(X))
3301       if (CX->isOne())
3302         return Y;
3303     if (auto *CY = dyn_cast<ConstantInt>(Y))
3304       if (CY->isOne())
3305         return X;
3306     VectorType *XVTy = dyn_cast<VectorType>(X->getType());
3307     if (XVTy && !isa<VectorType>(Y->getType()))
3308       Y = B.CreateVectorSplat(XVTy->getElementCount(), Y);
3309     return B.CreateMul(X, Y);
3310   };
3311
3312   // Get a suitable insert point for SCEV expansion. For blocks in the vector
3313   // loop, choose the end of the vector loop header (=VectorHeader), because
3314   // the DomTree is not kept up-to-date for additional blocks generated in the
3315   // vector loop. By using the header as insertion point, we guarantee that the
3316   // expanded instructions dominate all their uses.
3317   auto GetInsertPoint = [this, &B, VectorHeader]() {
3318     BasicBlock *InsertBB = B.GetInsertPoint()->getParent();
3319     if (InsertBB != LoopVectorBody &&
3320         LI->getLoopFor(VectorHeader) == LI->getLoopFor(InsertBB))
3321       return VectorHeader->getTerminator();
3322     return &*B.GetInsertPoint();
3323   };
3324
3325   switch (ID.getKind()) {
3326   case InductionDescriptor::IK_IntInduction: {
3327     assert(!isa<VectorType>(Index->getType()) &&
3328            "Vector indices not supported for integer inductions yet");
3329     assert(Index->getType() == StartValue->getType() &&
3330            "Index type does not match StartValue type");
3331     if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
3332       return B.CreateSub(StartValue, Index);
3333     auto *Offset = CreateMul(
3334         Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()));
3335     return CreateAdd(StartValue, Offset);
3336   }
3337   case InductionDescriptor::IK_PtrInduction: {
3338     assert(isa<SCEVConstant>(Step) &&
3339            "Expected constant step for pointer induction");
3340     return B.CreateGEP(
3341         ID.getElementType(), StartValue,
3342         CreateMul(Index,
3343                   Exp.expandCodeFor(Step, Index->getType()->getScalarType(),
3344                                     GetInsertPoint())));
3345   }
3346   case InductionDescriptor::IK_FpInduction: {
3347     assert(!isa<VectorType>(Index->getType()) &&
3348            "Vector indices not supported for FP inductions yet");
3349     assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
3350     auto InductionBinOp = ID.getInductionBinOp();
3351     assert(InductionBinOp &&
3352            (InductionBinOp->getOpcode() == Instruction::FAdd ||
3353             InductionBinOp->getOpcode() == Instruction::FSub) &&
3354            "Original bin op should be defined for FP induction");
3355
3356     Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
3357     Value *MulExp = B.CreateFMul(StepValue, Index);
3358     return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
3359                          "induction");
3360   }
3361   case InductionDescriptor::IK_NoInduction:
3362     return nullptr;
3363   }
3364   llvm_unreachable("invalid enum");
3365 }
3366
3367 Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
3368   LoopScalarBody = OrigLoop->getHeader();
3369   LoopVectorPreHeader = OrigLoop->getLoopPreheader();
3370   assert(LoopVectorPreHeader && "Invalid loop structure");
3371   LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
3372   assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF)) &&
3373          "multiple exit loop without required epilogue?");
3374
3375   LoopMiddleBlock =
3376       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3377                  LI, nullptr, Twine(Prefix) + "middle.block");
3378   LoopScalarPreHeader =
3379       SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
3380                  nullptr, Twine(Prefix) + "scalar.ph");
3381
3382   auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3383
3384   // Set up the middle block terminator. Two cases:
3385   // 1) If we know that we must execute the scalar epilogue, emit an
3386   //    unconditional branch.
3387   // 2) Otherwise, we must have a single unique exit block (due to how we
3388   //    implement the multiple exit case). In this case, set up a conditional
3389   //    branch from the middle block to the loop scalar preheader, and the
3390   //    exit block. completeLoopSkeleton will update the condition to use an
3391   //    iteration check, if required to decide whether to execute the remainder.
3392   BranchInst *BrInst = Cost->requiresScalarEpilogue(VF) ?
3393     BranchInst::Create(LoopScalarPreHeader) :
3394     BranchInst::Create(LoopExitBlock, LoopScalarPreHeader,
3395                        Builder.getTrue());
3396   BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3397   ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
3398
3399   // We intentionally don't let SplitBlock update LoopInfo since
3400   // LoopVectorBody should belong to a different loop than LoopVectorPreHeader.
3401   // LoopVectorBody is explicitly added to the correct place a few lines later.
3402   LoopVectorBody =
3403       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3404                  nullptr, nullptr, Twine(Prefix) + "vector.body");
3405
3406   // Update dominator for loop exit.
3407   if (!Cost->requiresScalarEpilogue(VF))
3408     // If there is an epilogue which must run, there's no edge from the
3409     // middle block to exit blocks and thus no need to update the immediate
3410     // dominator of the exit blocks.
3411     DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
3412
3413   // Create and register the new vector loop.
3414   Loop *Lp = LI->AllocateLoop();
3415   Loop *ParentLoop = OrigLoop->getParentLoop();
3416
3417   // Insert the new loop into the loop nest and register the new basic blocks
3418   // before calling any utilities such as SCEV that require valid LoopInfo.
3419 if (ParentLoop) { 3420 ParentLoop->addChildLoop(Lp); 3421 } else { 3422 LI->addTopLevelLoop(Lp); 3423 } 3424 Lp->addBasicBlockToLoop(LoopVectorBody, *LI); 3425 return Lp; 3426 } 3427 3428 void InnerLoopVectorizer::createInductionResumeValues( 3429 Loop *L, std::pair<BasicBlock *, Value *> AdditionalBypass) { 3430 assert(((AdditionalBypass.first && AdditionalBypass.second) || 3431 (!AdditionalBypass.first && !AdditionalBypass.second)) && 3432 "Inconsistent information about additional bypass."); 3433 3434 Value *VectorTripCount = getOrCreateVectorTripCount(L); 3435 assert(VectorTripCount && L && "Expected valid arguments"); 3436 // We are going to resume the execution of the scalar loop. 3437 // Go over all of the induction variables that we found and fix the 3438 // PHIs that are left in the scalar version of the loop. 3439 // The starting values of PHI nodes depend on the counter of the last 3440 // iteration in the vectorized loop. 3441 // If we come from a bypass edge then we need to start from the original 3442 // start value. 3443 Instruction *OldInduction = Legal->getPrimaryInduction(); 3444 for (auto &InductionEntry : Legal->getInductionVars()) { 3445 PHINode *OrigPhi = InductionEntry.first; 3446 InductionDescriptor II = InductionEntry.second; 3447 3448 // Create phi nodes to merge from the backedge-taken check block. 3449 PHINode *BCResumeVal = 3450 PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", 3451 LoopScalarPreHeader->getTerminator()); 3452 // Copy original phi DL over to the new one. 3453 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); 3454 Value *&EndValue = IVEndValues[OrigPhi]; 3455 Value *EndValueFromAdditionalBypass = AdditionalBypass.second; 3456 if (OrigPhi == OldInduction) { 3457 // We know what the end value is. 3458 EndValue = VectorTripCount; 3459 } else { 3460 IRBuilder<> B(L->getLoopPreheader()->getTerminator()); 3461 3462 // Fast-math-flags propagate from the original induction instruction. 3463 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) 3464 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); 3465 3466 Type *StepType = II.getStep()->getType(); 3467 Instruction::CastOps CastOp = 3468 CastInst::getCastOpcode(VectorTripCount, true, StepType, true); 3469 Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd"); 3470 const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout(); 3471 EndValue = 3472 emitTransformedIndex(B, CRD, PSE.getSE(), DL, II, LoopVectorBody); 3473 EndValue->setName("ind.end"); 3474 3475 // Compute the end value for the additional bypass (if applicable). 3476 if (AdditionalBypass.first) { 3477 B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt())); 3478 CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true, 3479 StepType, true); 3480 CRD = 3481 B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd"); 3482 EndValueFromAdditionalBypass = 3483 emitTransformedIndex(B, CRD, PSE.getSE(), DL, II, LoopVectorBody); 3484 EndValueFromAdditionalBypass->setName("ind.end"); 3485 } 3486 } 3487 // The new PHI merges the original incoming value, in case of a bypass, 3488 // or the value at the end of the vectorized loop. 3489 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); 3490 3491 // Fix the scalar body counter (PHI node). 3492 // The old induction's phi node in the scalar body needs the truncated 3493 // value. 
3494 for (BasicBlock *BB : LoopBypassBlocks) 3495 BCResumeVal->addIncoming(II.getStartValue(), BB); 3496 3497 if (AdditionalBypass.first) 3498 BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first, 3499 EndValueFromAdditionalBypass); 3500 3501 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); 3502 } 3503 } 3504 3505 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L, 3506 MDNode *OrigLoopID) { 3507 assert(L && "Expected valid loop."); 3508 3509 // The trip counts should be cached by now. 3510 Value *Count = getOrCreateTripCount(L); 3511 Value *VectorTripCount = getOrCreateVectorTripCount(L); 3512 3513 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3514 3515 // Add a check in the middle block to see if we have completed 3516 // all of the iterations in the first vector loop. Three cases: 3517 // 1) If we require a scalar epilogue, there is no conditional branch as 3518 // we unconditionally branch to the scalar preheader. Do nothing. 3519 // 2) If (N - N%VF) == N, then we *don't* need to run the remainder. 3520 // Thus if tail is to be folded, we know we don't need to run the 3521 // remainder and we can use the previous value for the condition (true). 3522 // 3) Otherwise, construct a runtime check. 3523 if (!Cost->requiresScalarEpilogue(VF) && !Cost->foldTailByMasking()) { 3524 Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, 3525 Count, VectorTripCount, "cmp.n", 3526 LoopMiddleBlock->getTerminator()); 3527 3528 // Here we use the same DebugLoc as the scalar loop latch terminator instead 3529 // of the corresponding compare because they may have ended up with 3530 // different line numbers and we want to avoid awkward line stepping while 3531 // debugging. Eg. if the compare has got a line number inside the loop. 3532 CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3533 cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN); 3534 } 3535 3536 // Get ready to start creating new instructions into the vectorized body. 3537 assert(LoopVectorPreHeader == L->getLoopPreheader() && 3538 "Inconsistent vector loop preheader"); 3539 Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); 3540 3541 #ifdef EXPENSIVE_CHECKS 3542 assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 3543 LI->verify(*DT); 3544 #endif 3545 3546 return LoopVectorPreHeader; 3547 } 3548 3549 std::pair<BasicBlock *, Value *> 3550 InnerLoopVectorizer::createVectorizedLoopSkeleton() { 3551 /* 3552 In this function we generate a new loop. The new loop will contain 3553 the vectorized instructions while the old loop will continue to run the 3554 scalar remainder. 3555 3556 [ ] <-- loop iteration number check. 3557 / | 3558 / v 3559 | [ ] <-- vector loop bypass (may consist of multiple blocks). 3560 | / | 3561 | / v 3562 || [ ] <-- vector pre header. 3563 |/ | 3564 | v 3565 | [ ] \ 3566 | [ ]_| <-- vector loop. 3567 | | 3568 | v 3569 \ -[ ] <--- middle-block. 3570 \/ | 3571 /\ v 3572 | ->[ ] <--- new preheader. 3573 | | 3574 (opt) v <-- edge from middle to exit iff epilogue is not required. 3575 | [ ] \ 3576 | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue). 3577 \ | 3578 \ v 3579 >[ ] <-- exit block(s). 3580 ... 3581 */ 3582 3583 // Get the metadata of the original loop before it gets modified. 3584 MDNode *OrigLoopID = OrigLoop->getLoopID(); 3585 3586 // Workaround! Compute the trip count of the original loop and cache it 3587 // before we start modifying the CFG. 
This code has a systemic problem 3588 // wherein it tries to run analysis over partially constructed IR; this is 3589 // wrong, and not simply for SCEV. The trip count of the original loop 3590 // simply happens to be prone to hitting this in practice. In theory, we 3591 // can hit the same issue for any SCEV, or ValueTracking query done during 3592 // mutation. See PR49900. 3593 getOrCreateTripCount(OrigLoop); 3594 3595 // Create an empty vector loop, and prepare basic blocks for the runtime 3596 // checks. 3597 Loop *Lp = createVectorLoopSkeleton(""); 3598 3599 // Now, compare the new count to zero. If it is zero skip the vector loop and 3600 // jump to the scalar loop. This check also covers the case where the 3601 // backedge-taken count is uint##_max: adding one to it will overflow leading 3602 // to an incorrect trip count of zero. In this (rare) case we will also jump 3603 // to the scalar loop. 3604 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader); 3605 3606 // Generate the code to check any assumptions that we've made for SCEV 3607 // expressions. 3608 emitSCEVChecks(Lp, LoopScalarPreHeader); 3609 3610 // Generate the code that checks in runtime if arrays overlap. We put the 3611 // checks into a separate block to make the more common case of few elements 3612 // faster. 3613 emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 3614 3615 createHeaderBranch(Lp); 3616 3617 // Emit phis for the new starting index of the scalar loop. 3618 createInductionResumeValues(Lp); 3619 3620 return {completeLoopSkeleton(Lp, OrigLoopID), nullptr}; 3621 } 3622 3623 // Fix up external users of the induction variable. At this point, we are 3624 // in LCSSA form, with all external PHIs that use the IV having one input value, 3625 // coming from the remainder loop. We need those PHIs to also have a correct 3626 // value for the IV when arriving directly from the middle block. 3627 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, 3628 const InductionDescriptor &II, 3629 Value *CountRoundDown, Value *EndValue, 3630 BasicBlock *MiddleBlock) { 3631 // There are two kinds of external IV usages - those that use the value 3632 // computed in the last iteration (the PHI) and those that use the penultimate 3633 // value (the value that feeds into the phi from the loop latch). 3634 // We allow both, but they, obviously, have different values. 3635 3636 assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block"); 3637 3638 DenseMap<Value *, Value *> MissingVals; 3639 3640 // An external user of the last iteration's value should see the value that 3641 // the remainder loop uses to initialize its own IV. 3642 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch()); 3643 for (User *U : PostInc->users()) { 3644 Instruction *UI = cast<Instruction>(U); 3645 if (!OrigLoop->contains(UI)) { 3646 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3647 MissingVals[UI] = EndValue; 3648 } 3649 } 3650 3651 // An external user of the penultimate value need to see EndValue - Step. 3652 // The simplest way to get this is to recompute it from the constituent SCEVs, 3653 // that is Start + (Step * (CRD - 1)). 
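  // Worked example (illustrative): for an IV with Start = 0 and Step = 2 that
  // is rounded down to CRD = 8 iterations handled by the vector loop, the
  // escaping value is 0 + 2 * (8 - 1) = 14, i.e. the value the phi itself
  // held in the last iteration executed by the vector loop.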
3654 for (User *U : OrigPhi->users()) { 3655 auto *UI = cast<Instruction>(U); 3656 if (!OrigLoop->contains(UI)) { 3657 const DataLayout &DL = 3658 OrigLoop->getHeader()->getModule()->getDataLayout(); 3659 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3660 3661 IRBuilder<> B(MiddleBlock->getTerminator()); 3662 3663 // Fast-math-flags propagate from the original induction instruction. 3664 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) 3665 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); 3666 3667 Value *CountMinusOne = B.CreateSub( 3668 CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1)); 3669 Value *CMO = 3670 !II.getStep()->getType()->isIntegerTy() 3671 ? B.CreateCast(Instruction::SIToFP, CountMinusOne, 3672 II.getStep()->getType()) 3673 : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType()); 3674 CMO->setName("cast.cmo"); 3675 Value *Escape = 3676 emitTransformedIndex(B, CMO, PSE.getSE(), DL, II, LoopVectorBody); 3677 Escape->setName("ind.escape"); 3678 MissingVals[UI] = Escape; 3679 } 3680 } 3681 3682 for (auto &I : MissingVals) { 3683 PHINode *PHI = cast<PHINode>(I.first); 3684 // One corner case we have to handle is two IVs "chasing" each-other, 3685 // that is %IV2 = phi [...], [ %IV1, %latch ] 3686 // In this case, if IV1 has an external use, we need to avoid adding both 3687 // "last value of IV1" and "penultimate value of IV2". So, verify that we 3688 // don't already have an incoming value for the middle block. 3689 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) 3690 PHI->addIncoming(I.second, MiddleBlock); 3691 } 3692 } 3693 3694 namespace { 3695 3696 struct CSEDenseMapInfo { 3697 static bool canHandle(const Instruction *I) { 3698 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || 3699 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); 3700 } 3701 3702 static inline Instruction *getEmptyKey() { 3703 return DenseMapInfo<Instruction *>::getEmptyKey(); 3704 } 3705 3706 static inline Instruction *getTombstoneKey() { 3707 return DenseMapInfo<Instruction *>::getTombstoneKey(); 3708 } 3709 3710 static unsigned getHashValue(const Instruction *I) { 3711 assert(canHandle(I) && "Unknown instruction!"); 3712 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), 3713 I->value_op_end())); 3714 } 3715 3716 static bool isEqual(const Instruction *LHS, const Instruction *RHS) { 3717 if (LHS == getEmptyKey() || RHS == getEmptyKey() || 3718 LHS == getTombstoneKey() || RHS == getTombstoneKey()) 3719 return LHS == RHS; 3720 return LHS->isIdenticalTo(RHS); 3721 } 3722 }; 3723 3724 } // end anonymous namespace 3725 3726 ///Perform cse of induction variable instructions. 3727 static void cse(BasicBlock *BB) { 3728 // Perform simple cse. 3729 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 3730 for (Instruction &In : llvm::make_early_inc_range(*BB)) { 3731 if (!CSEDenseMapInfo::canHandle(&In)) 3732 continue; 3733 3734 // Check if we can replace this instruction with any of the 3735 // visited instructions. 
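    // E.g. (illustrative): if code generation happened to produce two
    // identical "extractelement <4 x i32> %v, i32 0" instructions in this
    // block, the second one hashes to the same key as the first and is
    // replaced and erased below.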
3736 if (Instruction *V = CSEMap.lookup(&In)) { 3737 In.replaceAllUsesWith(V); 3738 In.eraseFromParent(); 3739 continue; 3740 } 3741 3742 CSEMap[&In] = &In; 3743 } 3744 } 3745 3746 InstructionCost 3747 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF, 3748 bool &NeedToScalarize) const { 3749 Function *F = CI->getCalledFunction(); 3750 Type *ScalarRetTy = CI->getType(); 3751 SmallVector<Type *, 4> Tys, ScalarTys; 3752 for (auto &ArgOp : CI->args()) 3753 ScalarTys.push_back(ArgOp->getType()); 3754 3755 // Estimate cost of scalarized vector call. The source operands are assumed 3756 // to be vectors, so we need to extract individual elements from there, 3757 // execute VF scalar calls, and then gather the result into the vector return 3758 // value. 3759 InstructionCost ScalarCallCost = 3760 TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput); 3761 if (VF.isScalar()) 3762 return ScalarCallCost; 3763 3764 // Compute corresponding vector type for return value and arguments. 3765 Type *RetTy = ToVectorTy(ScalarRetTy, VF); 3766 for (Type *ScalarTy : ScalarTys) 3767 Tys.push_back(ToVectorTy(ScalarTy, VF)); 3768 3769 // Compute costs of unpacking argument values for the scalar calls and 3770 // packing the return values to a vector. 3771 InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF); 3772 3773 InstructionCost Cost = 3774 ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost; 3775 3776 // If we can't emit a vector call for this function, then the currently found 3777 // cost is the cost we need to return. 3778 NeedToScalarize = true; 3779 VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 3780 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); 3781 3782 if (!TLI || CI->isNoBuiltin() || !VecFunc) 3783 return Cost; 3784 3785 // If the corresponding vector cost is cheaper, return its cost. 
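  // Illustrative numbers only: with VF = 4, a scalar call cost of 10 and a
  // scalarization overhead of 6, the scalarized estimate is 4 * 10 + 6 = 46;
  // if the target reports a vector variant costing 20, the vector call wins
  // and NeedToScalarize is cleared below.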
3786 InstructionCost VectorCallCost = 3787 TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput); 3788 if (VectorCallCost < Cost) { 3789 NeedToScalarize = false; 3790 Cost = VectorCallCost; 3791 } 3792 return Cost; 3793 } 3794 3795 static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) { 3796 if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy())) 3797 return Elt; 3798 return VectorType::get(Elt, VF); 3799 } 3800 3801 InstructionCost 3802 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, 3803 ElementCount VF) const { 3804 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 3805 assert(ID && "Expected intrinsic call!"); 3806 Type *RetTy = MaybeVectorizeType(CI->getType(), VF); 3807 FastMathFlags FMF; 3808 if (auto *FPMO = dyn_cast<FPMathOperator>(CI)) 3809 FMF = FPMO->getFastMathFlags(); 3810 3811 SmallVector<const Value *> Arguments(CI->args()); 3812 FunctionType *FTy = CI->getCalledFunction()->getFunctionType(); 3813 SmallVector<Type *> ParamTys; 3814 std::transform(FTy->param_begin(), FTy->param_end(), 3815 std::back_inserter(ParamTys), 3816 [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); }); 3817 3818 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF, 3819 dyn_cast<IntrinsicInst>(CI)); 3820 return TTI.getIntrinsicInstrCost(CostAttrs, 3821 TargetTransformInfo::TCK_RecipThroughput); 3822 } 3823 3824 static Type *smallestIntegerVectorType(Type *T1, Type *T2) { 3825 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3826 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3827 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; 3828 } 3829 3830 static Type *largestIntegerVectorType(Type *T1, Type *T2) { 3831 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3832 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3833 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; 3834 } 3835 3836 void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) { 3837 // For every instruction `I` in MinBWs, truncate the operands, create a 3838 // truncated version of `I` and reextend its result. InstCombine runs 3839 // later and will remove any ext/trunc pairs. 3840 SmallPtrSet<Value *, 4> Erased; 3841 for (const auto &KV : Cost->getMinimalBitwidths()) { 3842 // If the value wasn't vectorized, we must maintain the original scalar 3843 // type. The absence of the value from State indicates that it 3844 // wasn't vectorized. 3845 // FIXME: Should not rely on getVPValue at this point. 3846 VPValue *Def = State.Plan->getVPValue(KV.first, true); 3847 if (!State.hasAnyVectorValue(Def)) 3848 continue; 3849 for (unsigned Part = 0; Part < UF; ++Part) { 3850 Value *I = State.get(Def, Part); 3851 if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I)) 3852 continue; 3853 Type *OriginalTy = I->getType(); 3854 Type *ScalarTruncatedTy = 3855 IntegerType::get(OriginalTy->getContext(), KV.second); 3856 auto *TruncatedTy = VectorType::get( 3857 ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getElementCount()); 3858 if (TruncatedTy == OriginalTy) 3859 continue; 3860 3861 IRBuilder<> B(cast<Instruction>(I)); 3862 auto ShrinkOperand = [&](Value *V) -> Value * { 3863 if (auto *ZI = dyn_cast<ZExtInst>(V)) 3864 if (ZI->getSrcTy() == TruncatedTy) 3865 return ZI->getOperand(0); 3866 return B.CreateZExtOrTrunc(V, TruncatedTy); 3867 }; 3868 3869 // The actual instruction modification depends on the instruction type, 3870 // unfortunately. 
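      // Illustrative example: if MinBWs says an i32 add only needs 8 bits,
      // the operands are shrunk (reusing an existing zext source where
      // possible) and the add is re-created on <VF x i8>; the result is then
      // zero-extended back to the original <VF x i32> type further below.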
3871 Value *NewI = nullptr; 3872 if (auto *BO = dyn_cast<BinaryOperator>(I)) { 3873 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)), 3874 ShrinkOperand(BO->getOperand(1))); 3875 3876 // Any wrapping introduced by shrinking this operation shouldn't be 3877 // considered undefined behavior. So, we can't unconditionally copy 3878 // arithmetic wrapping flags to NewI. 3879 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false); 3880 } else if (auto *CI = dyn_cast<ICmpInst>(I)) { 3881 NewI = 3882 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)), 3883 ShrinkOperand(CI->getOperand(1))); 3884 } else if (auto *SI = dyn_cast<SelectInst>(I)) { 3885 NewI = B.CreateSelect(SI->getCondition(), 3886 ShrinkOperand(SI->getTrueValue()), 3887 ShrinkOperand(SI->getFalseValue())); 3888 } else if (auto *CI = dyn_cast<CastInst>(I)) { 3889 switch (CI->getOpcode()) { 3890 default: 3891 llvm_unreachable("Unhandled cast!"); 3892 case Instruction::Trunc: 3893 NewI = ShrinkOperand(CI->getOperand(0)); 3894 break; 3895 case Instruction::SExt: 3896 NewI = B.CreateSExtOrTrunc( 3897 CI->getOperand(0), 3898 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3899 break; 3900 case Instruction::ZExt: 3901 NewI = B.CreateZExtOrTrunc( 3902 CI->getOperand(0), 3903 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3904 break; 3905 } 3906 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { 3907 auto Elements0 = 3908 cast<VectorType>(SI->getOperand(0)->getType())->getElementCount(); 3909 auto *O0 = B.CreateZExtOrTrunc( 3910 SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0)); 3911 auto Elements1 = 3912 cast<VectorType>(SI->getOperand(1)->getType())->getElementCount(); 3913 auto *O1 = B.CreateZExtOrTrunc( 3914 SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1)); 3915 3916 NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask()); 3917 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) { 3918 // Don't do anything with the operands, just extend the result. 3919 continue; 3920 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { 3921 auto Elements = 3922 cast<VectorType>(IE->getOperand(0)->getType())->getElementCount(); 3923 auto *O0 = B.CreateZExtOrTrunc( 3924 IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 3925 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); 3926 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); 3927 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { 3928 auto Elements = 3929 cast<VectorType>(EE->getOperand(0)->getType())->getElementCount(); 3930 auto *O0 = B.CreateZExtOrTrunc( 3931 EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 3932 NewI = B.CreateExtractElement(O0, EE->getOperand(2)); 3933 } else { 3934 // If we don't know what to do, be conservative and don't do anything. 3935 continue; 3936 } 3937 3938 // Lastly, extend the result. 3939 NewI->takeName(cast<Instruction>(I)); 3940 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); 3941 I->replaceAllUsesWith(Res); 3942 cast<Instruction>(I)->eraseFromParent(); 3943 Erased.insert(I); 3944 State.reset(Def, Res, Part); 3945 } 3946 } 3947 3948 // We'll have created a bunch of ZExts that are now parentless. Clean up. 3949 for (const auto &KV : Cost->getMinimalBitwidths()) { 3950 // If the value wasn't vectorized, we must maintain the original scalar 3951 // type. The absence of the value from State indicates that it 3952 // wasn't vectorized. 3953 // FIXME: Should not rely on getVPValue at this point. 
3954 VPValue *Def = State.Plan->getVPValue(KV.first, true); 3955 if (!State.hasAnyVectorValue(Def)) 3956 continue; 3957 for (unsigned Part = 0; Part < UF; ++Part) { 3958 Value *I = State.get(Def, Part); 3959 ZExtInst *Inst = dyn_cast<ZExtInst>(I); 3960 if (Inst && Inst->use_empty()) { 3961 Value *NewI = Inst->getOperand(0); 3962 Inst->eraseFromParent(); 3963 State.reset(Def, NewI, Part); 3964 } 3965 } 3966 } 3967 } 3968 3969 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) { 3970 // Insert truncates and extends for any truncated instructions as hints to 3971 // InstCombine. 3972 if (VF.isVector()) 3973 truncateToMinimalBitwidths(State); 3974 3975 // Fix widened non-induction PHIs by setting up the PHI operands. 3976 if (OrigPHIsToFix.size()) { 3977 assert(EnableVPlanNativePath && 3978 "Unexpected non-induction PHIs for fixup in non VPlan-native path"); 3979 fixNonInductionPHIs(State); 3980 } 3981 3982 // At this point every instruction in the original loop is widened to a 3983 // vector form. Now we need to fix the recurrences in the loop. These PHI 3984 // nodes are currently empty because we did not want to introduce cycles. 3985 // This is the second stage of vectorizing recurrences. 3986 fixCrossIterationPHIs(State); 3987 3988 // Forget the original basic block. 3989 PSE.getSE()->forgetLoop(OrigLoop); 3990 3991 // If we inserted an edge from the middle block to the unique exit block, 3992 // update uses outside the loop (phis) to account for the newly inserted 3993 // edge. 3994 if (!Cost->requiresScalarEpilogue(VF)) { 3995 // Fix-up external users of the induction variables. 3996 for (auto &Entry : Legal->getInductionVars()) 3997 fixupIVUsers(Entry.first, Entry.second, 3998 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)), 3999 IVEndValues[Entry.first], LoopMiddleBlock); 4000 4001 fixLCSSAPHIs(State); 4002 } 4003 4004 for (Instruction *PI : PredicatedInstructions) 4005 sinkScalarOperands(&*PI); 4006 4007 // Remove redundant induction instructions. 4008 cse(LoopVectorBody); 4009 4010 // Set/update profile weights for the vector and remainder loops as original 4011 // loop iterations are now distributed among them. Note that original loop 4012 // represented by LoopScalarBody becomes remainder loop after vectorization. 4013 // 4014 // For cases like foldTailByMasking() and requiresScalarEpiloque() we may 4015 // end up getting slightly roughened result but that should be OK since 4016 // profile is not inherently precise anyway. Note also possible bypass of 4017 // vector code caused by legality checks is ignored, assigning all the weight 4018 // to the vector loop, optimistically. 4019 // 4020 // For scalable vectorization we can't know at compile time how many iterations 4021 // of the loop are handled in one vector iteration, so instead assume a pessimistic 4022 // vscale of '1'. 4023 setProfileInfoAfterUnrolling( 4024 LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody), 4025 LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF); 4026 } 4027 4028 void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) { 4029 // In order to support recurrences we need to be able to vectorize Phi nodes. 4030 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 4031 // stage #2: We now need to fix the recurrences by adding incoming edges to 4032 // the currently empty PHI nodes. 
At this point every instruction in the 4033 // original loop is widened to a vector form so we can use them to construct 4034 // the incoming edges. 4035 VPBasicBlock *Header = State.Plan->getEntry()->getEntryBasicBlock(); 4036 for (VPRecipeBase &R : Header->phis()) { 4037 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) 4038 fixReduction(ReductionPhi, State); 4039 else if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R)) 4040 fixFirstOrderRecurrence(FOR, State); 4041 } 4042 } 4043 4044 void InnerLoopVectorizer::fixFirstOrderRecurrence( 4045 VPFirstOrderRecurrencePHIRecipe *PhiR, VPTransformState &State) { 4046 // This is the second phase of vectorizing first-order recurrences. An 4047 // overview of the transformation is described below. Suppose we have the 4048 // following loop. 4049 // 4050 // for (int i = 0; i < n; ++i) 4051 // b[i] = a[i] - a[i - 1]; 4052 // 4053 // There is a first-order recurrence on "a". For this loop, the shorthand 4054 // scalar IR looks like: 4055 // 4056 // scalar.ph: 4057 // s_init = a[-1] 4058 // br scalar.body 4059 // 4060 // scalar.body: 4061 // i = phi [0, scalar.ph], [i+1, scalar.body] 4062 // s1 = phi [s_init, scalar.ph], [s2, scalar.body] 4063 // s2 = a[i] 4064 // b[i] = s2 - s1 4065 // br cond, scalar.body, ... 4066 // 4067 // In this example, s1 is a recurrence because it's value depends on the 4068 // previous iteration. In the first phase of vectorization, we created a 4069 // vector phi v1 for s1. We now complete the vectorization and produce the 4070 // shorthand vector IR shown below (for VF = 4, UF = 1). 4071 // 4072 // vector.ph: 4073 // v_init = vector(..., ..., ..., a[-1]) 4074 // br vector.body 4075 // 4076 // vector.body 4077 // i = phi [0, vector.ph], [i+4, vector.body] 4078 // v1 = phi [v_init, vector.ph], [v2, vector.body] 4079 // v2 = a[i, i+1, i+2, i+3]; 4080 // v3 = vector(v1(3), v2(0, 1, 2)) 4081 // b[i, i+1, i+2, i+3] = v2 - v3 4082 // br cond, vector.body, middle.block 4083 // 4084 // middle.block: 4085 // x = v2(3) 4086 // br scalar.ph 4087 // 4088 // scalar.ph: 4089 // s_init = phi [x, middle.block], [a[-1], otherwise] 4090 // br scalar.body 4091 // 4092 // After execution completes the vector loop, we extract the next value of 4093 // the recurrence (x) to use as the initial value in the scalar loop. 4094 4095 // Extract the last vector element in the middle block. This will be the 4096 // initial value for the recurrence when jumping to the scalar loop. 4097 VPValue *PreviousDef = PhiR->getBackedgeValue(); 4098 Value *Incoming = State.get(PreviousDef, UF - 1); 4099 auto *ExtractForScalar = Incoming; 4100 auto *IdxTy = Builder.getInt32Ty(); 4101 if (VF.isVector()) { 4102 auto *One = ConstantInt::get(IdxTy, 1); 4103 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4104 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF); 4105 auto *LastIdx = Builder.CreateSub(RuntimeVF, One); 4106 ExtractForScalar = Builder.CreateExtractElement(ExtractForScalar, LastIdx, 4107 "vector.recur.extract"); 4108 } 4109 // Extract the second last element in the middle block if the 4110 // Phi is used outside the loop. We need to extract the phi itself 4111 // and not the last element (the phi update in the current iteration). This 4112 // will be the value when jumping to the exit block from the LoopMiddleBlock, 4113 // when the scalar loop is not run at all. 
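// To make the two extracts concrete, here is an illustrative continuation of
// the shorthand example above (VF = 4, UF = 1; shorthand only, the real code
// below computes the indices from the runtime VF so it also works for
// scalable vectors):
//
//   middle.block:
//     x  = extractelement v2, 3  ; "vector.recur.extract": seeds the scalar
//                                ;  loop's recurrence phi
//     x2 = extractelement v2, 2  ; "vector.recur.extract.for.phi": the value
//                                ;  the phi held in the final vector
//                                ;  iteration, fed to LCSSA phis of users of
//                                ;  the recurrence outside the loop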
4114 Value *ExtractForPhiUsedOutsideLoop = nullptr;
4115 if (VF.isVector()) {
4116 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
4117 auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2));
4118 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
4119 Incoming, Idx, "vector.recur.extract.for.phi");
4120 } else if (UF > 1)
4121 // When the loop is unrolled without vectorizing, initialize
4122 // ExtractForPhiUsedOutsideLoop with the value just prior to the unrolled
4123 // value of `Incoming`. This is analogous to the vectorized case above:
4124 // extracting the second-to-last element when VF > 1.
4125 ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2);
4126
4127 // Fix the initial value of the original recurrence in the scalar loop.
4128 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
4129 PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
4130 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
4131 auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue();
4132 for (auto *BB : predecessors(LoopScalarPreHeader)) {
4133 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
4134 Start->addIncoming(Incoming, BB);
4135 }
4136
4137 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
4138 Phi->setName("scalar.recur");
4139
4140 // Finally, fix users of the recurrence outside the loop. The users will need
4141 // either the last value of the scalar recurrence or the last value of the
4142 // vector recurrence we extracted in the middle block. Since the loop is in
4143 // LCSSA form, we just need to find all the phi nodes for the original scalar
4144 // recurrence in the exit block, and then add an edge for the middle block.
4145 // Note that LCSSA does not imply single entry when the original scalar loop
4146 // had multiple exiting edges (as we always run the last iteration in the
4147 // scalar epilogue); in that case, there is no edge from the middle block to
4148 // the exit block, and thus no phis that need to be updated.
4149 if (!Cost->requiresScalarEpilogue(VF))
4150 for (PHINode &LCSSAPhi : LoopExitBlock->phis())
4151 if (llvm::is_contained(LCSSAPhi.incoming_values(), Phi))
4152 LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
4153 }
4154
4155 void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
4156 VPTransformState &State) {
4157 PHINode *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
4158 // Get its reduction variable descriptor.
4159 assert(Legal->isReductionVariable(OrigPhi) &&
4160 "Unable to find the reduction variable");
4161 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
4162
4163 RecurKind RK = RdxDesc.getRecurrenceKind();
4164 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
4165 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
4166 setDebugLocFromInst(ReductionStartValue);
4167
4168 VPValue *LoopExitInstDef = PhiR->getBackedgeValue();
4169 // This is the vector-clone of the value that leaves the loop.
4170 Type *VecTy = State.get(LoopExitInstDef, 0)->getType();
4171
4172 // Wrap flags are in general invalid after vectorization, clear them.
4173 clearReductionWrapFlags(RdxDesc, State);
4174
4175 // Before each round, move the insertion point right between
4176 // the PHIs and the values we are going to write.
4177 // This allows us to write both PHINodes and the extractelement
4178 // instructions.
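// As an illustrative sketch of where this is heading (an integer add
// reduction, VF = 4, UF = 2; shorthand only, the exact sequence depends on
// the recurrence kind, the target, and whether the tail is folded):
//
//   middle.block:
//     bin.rdx = add <4 x i32> rdx.part1, rdx.part0   ; combine the UF parts
//     rdx     = call i32 @llvm.vector.reduce.add(<4 x i32> bin.rdx)
//
//   scalar.ph:
//     bc.merge.rdx = phi i32 [ rdx, middle.block ], [ start, bypass blocks ]
//
// The code below follows this shape: it first combines the unrolled parts,
// then creates the target reduction, and finally merges the result with the
// start value for the scalar remainder loop.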
4179 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 4180 4181 setDebugLocFromInst(LoopExitInst); 4182 4183 Type *PhiTy = OrigPhi->getType(); 4184 // If tail is folded by masking, the vector value to leave the loop should be 4185 // a Select choosing between the vectorized LoopExitInst and vectorized Phi, 4186 // instead of the former. For an inloop reduction the reduction will already 4187 // be predicated, and does not need to be handled here. 4188 if (Cost->foldTailByMasking() && !PhiR->isInLoop()) { 4189 for (unsigned Part = 0; Part < UF; ++Part) { 4190 Value *VecLoopExitInst = State.get(LoopExitInstDef, Part); 4191 Value *Sel = nullptr; 4192 for (User *U : VecLoopExitInst->users()) { 4193 if (isa<SelectInst>(U)) { 4194 assert(!Sel && "Reduction exit feeding two selects"); 4195 Sel = U; 4196 } else 4197 assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select"); 4198 } 4199 assert(Sel && "Reduction exit feeds no select"); 4200 State.reset(LoopExitInstDef, Sel, Part); 4201 4202 // If the target can create a predicated operator for the reduction at no 4203 // extra cost in the loop (for example a predicated vadd), it can be 4204 // cheaper for the select to remain in the loop than be sunk out of it, 4205 // and so use the select value for the phi instead of the old 4206 // LoopExitValue. 4207 if (PreferPredicatedReductionSelect || 4208 TTI->preferPredicatedReductionSelect( 4209 RdxDesc.getOpcode(), PhiTy, 4210 TargetTransformInfo::ReductionFlags())) { 4211 auto *VecRdxPhi = 4212 cast<PHINode>(State.get(PhiR, Part)); 4213 VecRdxPhi->setIncomingValueForBlock( 4214 LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel); 4215 } 4216 } 4217 } 4218 4219 // If the vector reduction can be performed in a smaller type, we truncate 4220 // then extend the loop exit value to enable InstCombine to evaluate the 4221 // entire expression in the smaller type. 4222 if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) { 4223 assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!"); 4224 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); 4225 Builder.SetInsertPoint( 4226 LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator()); 4227 VectorParts RdxParts(UF); 4228 for (unsigned Part = 0; Part < UF; ++Part) { 4229 RdxParts[Part] = State.get(LoopExitInstDef, Part); 4230 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4231 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy) 4232 : Builder.CreateZExt(Trunc, VecTy); 4233 for (User *U : llvm::make_early_inc_range(RdxParts[Part]->users())) 4234 if (U != Trunc) { 4235 U->replaceUsesOfWith(RdxParts[Part], Extnd); 4236 RdxParts[Part] = Extnd; 4237 } 4238 } 4239 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 4240 for (unsigned Part = 0; Part < UF; ++Part) { 4241 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4242 State.reset(LoopExitInstDef, RdxParts[Part], Part); 4243 } 4244 } 4245 4246 // Reduce all of the unrolled parts into a single vector. 4247 Value *ReducedPartRdx = State.get(LoopExitInstDef, 0); 4248 unsigned Op = RecurrenceDescriptor::getOpcode(RK); 4249 4250 // The middle block terminator has already been assigned a DebugLoc here (the 4251 // OrigLoop's single latch terminator). 
We want the whole middle block to 4252 // appear to execute on this line because: (a) it is all compiler generated, 4253 // (b) these instructions are always executed after evaluating the latch 4254 // conditional branch, and (c) other passes may add new predecessors which 4255 // terminate on this line. This is the easiest way to ensure we don't 4256 // accidentally cause an extra step back into the loop while debugging. 4257 setDebugLocFromInst(LoopMiddleBlock->getTerminator()); 4258 if (PhiR->isOrdered()) 4259 ReducedPartRdx = State.get(LoopExitInstDef, UF - 1); 4260 else { 4261 // Floating-point operations should have some FMF to enable the reduction. 4262 IRBuilderBase::FastMathFlagGuard FMFG(Builder); 4263 Builder.setFastMathFlags(RdxDesc.getFastMathFlags()); 4264 for (unsigned Part = 1; Part < UF; ++Part) { 4265 Value *RdxPart = State.get(LoopExitInstDef, Part); 4266 if (Op != Instruction::ICmp && Op != Instruction::FCmp) { 4267 ReducedPartRdx = Builder.CreateBinOp( 4268 (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx"); 4269 } else if (RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK)) 4270 ReducedPartRdx = createSelectCmpOp(Builder, ReductionStartValue, RK, 4271 ReducedPartRdx, RdxPart); 4272 else 4273 ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart); 4274 } 4275 } 4276 4277 // Create the reduction after the loop. Note that inloop reductions create the 4278 // target reduction in the loop using a Reduction recipe. 4279 if (VF.isVector() && !PhiR->isInLoop()) { 4280 ReducedPartRdx = 4281 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, OrigPhi); 4282 // If the reduction can be performed in a smaller type, we need to extend 4283 // the reduction to the wider type before we branch to the original loop. 4284 if (PhiTy != RdxDesc.getRecurrenceType()) 4285 ReducedPartRdx = RdxDesc.isSigned() 4286 ? Builder.CreateSExt(ReducedPartRdx, PhiTy) 4287 : Builder.CreateZExt(ReducedPartRdx, PhiTy); 4288 } 4289 4290 // Create a phi node that merges control-flow from the backedge-taken check 4291 // block and the middle block. 4292 PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx", 4293 LoopScalarPreHeader->getTerminator()); 4294 for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) 4295 BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]); 4296 BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); 4297 4298 // Now, we need to fix the users of the reduction variable 4299 // inside and outside of the scalar remainder loop. 4300 4301 // We know that the loop is in LCSSA form. We need to update the PHI nodes 4302 // in the exit blocks. See comment on analogous loop in 4303 // fixFirstOrderRecurrence for a more complete explaination of the logic. 4304 if (!Cost->requiresScalarEpilogue(VF)) 4305 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) 4306 if (llvm::is_contained(LCSSAPhi.incoming_values(), LoopExitInst)) 4307 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock); 4308 4309 // Fix the scalar loop reduction variable with the incoming reduction sum 4310 // from the vector body and from the backedge value. 4311 int IncomingEdgeBlockIdx = 4312 OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch()); 4313 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); 4314 // Pick the other block. 4315 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 
0 : 1); 4316 OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); 4317 OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); 4318 } 4319 4320 void InnerLoopVectorizer::clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc, 4321 VPTransformState &State) { 4322 RecurKind RK = RdxDesc.getRecurrenceKind(); 4323 if (RK != RecurKind::Add && RK != RecurKind::Mul) 4324 return; 4325 4326 Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr(); 4327 assert(LoopExitInstr && "null loop exit instruction"); 4328 SmallVector<Instruction *, 8> Worklist; 4329 SmallPtrSet<Instruction *, 8> Visited; 4330 Worklist.push_back(LoopExitInstr); 4331 Visited.insert(LoopExitInstr); 4332 4333 while (!Worklist.empty()) { 4334 Instruction *Cur = Worklist.pop_back_val(); 4335 if (isa<OverflowingBinaryOperator>(Cur)) 4336 for (unsigned Part = 0; Part < UF; ++Part) { 4337 // FIXME: Should not rely on getVPValue at this point. 4338 Value *V = State.get(State.Plan->getVPValue(Cur, true), Part); 4339 cast<Instruction>(V)->dropPoisonGeneratingFlags(); 4340 } 4341 4342 for (User *U : Cur->users()) { 4343 Instruction *UI = cast<Instruction>(U); 4344 if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) && 4345 Visited.insert(UI).second) 4346 Worklist.push_back(UI); 4347 } 4348 } 4349 } 4350 4351 void InnerLoopVectorizer::fixLCSSAPHIs(VPTransformState &State) { 4352 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 4353 if (LCSSAPhi.getBasicBlockIndex(LoopMiddleBlock) != -1) 4354 // Some phis were already hand updated by the reduction and recurrence 4355 // code above, leave them alone. 4356 continue; 4357 4358 auto *IncomingValue = LCSSAPhi.getIncomingValue(0); 4359 // Non-instruction incoming values will have only one value. 4360 4361 VPLane Lane = VPLane::getFirstLane(); 4362 if (isa<Instruction>(IncomingValue) && 4363 !Cost->isUniformAfterVectorization(cast<Instruction>(IncomingValue), 4364 VF)) 4365 Lane = VPLane::getLastLaneForVF(VF); 4366 4367 // Can be a loop invariant incoming value or the last scalar value to be 4368 // extracted from the vectorized loop. 4369 // FIXME: Should not rely on getVPValue at this point. 4370 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4371 Value *lastIncomingValue = 4372 OrigLoop->isLoopInvariant(IncomingValue) 4373 ? IncomingValue 4374 : State.get(State.Plan->getVPValue(IncomingValue, true), 4375 VPIteration(UF - 1, Lane)); 4376 LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock); 4377 } 4378 } 4379 4380 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { 4381 // The basic block and loop containing the predicated instruction. 4382 auto *PredBB = PredInst->getParent(); 4383 auto *VectorLoop = LI->getLoopFor(PredBB); 4384 4385 // Initialize a worklist with the operands of the predicated instruction. 4386 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end()); 4387 4388 // Holds instructions that we need to analyze again. An instruction may be 4389 // reanalyzed if we don't yet know if we can sink it or not. 4390 SmallVector<Instruction *, 8> InstsToReanalyze; 4391 4392 // Returns true if a given use occurs in the predicated block. Phi nodes use 4393 // their operands in their corresponding predecessor blocks. 
4394 auto isBlockOfUsePredicated = [&](Use &U) -> bool { 4395 auto *I = cast<Instruction>(U.getUser()); 4396 BasicBlock *BB = I->getParent(); 4397 if (auto *Phi = dyn_cast<PHINode>(I)) 4398 BB = Phi->getIncomingBlock( 4399 PHINode::getIncomingValueNumForOperand(U.getOperandNo())); 4400 return BB == PredBB; 4401 }; 4402 4403 // Iteratively sink the scalarized operands of the predicated instruction 4404 // into the block we created for it. When an instruction is sunk, it's 4405 // operands are then added to the worklist. The algorithm ends after one pass 4406 // through the worklist doesn't sink a single instruction. 4407 bool Changed; 4408 do { 4409 // Add the instructions that need to be reanalyzed to the worklist, and 4410 // reset the changed indicator. 4411 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end()); 4412 InstsToReanalyze.clear(); 4413 Changed = false; 4414 4415 while (!Worklist.empty()) { 4416 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val()); 4417 4418 // We can't sink an instruction if it is a phi node, is not in the loop, 4419 // or may have side effects. 4420 if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) || 4421 I->mayHaveSideEffects()) 4422 continue; 4423 4424 // If the instruction is already in PredBB, check if we can sink its 4425 // operands. In that case, VPlan's sinkScalarOperands() succeeded in 4426 // sinking the scalar instruction I, hence it appears in PredBB; but it 4427 // may have failed to sink I's operands (recursively), which we try 4428 // (again) here. 4429 if (I->getParent() == PredBB) { 4430 Worklist.insert(I->op_begin(), I->op_end()); 4431 continue; 4432 } 4433 4434 // It's legal to sink the instruction if all its uses occur in the 4435 // predicated block. Otherwise, there's nothing to do yet, and we may 4436 // need to reanalyze the instruction. 4437 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) { 4438 InstsToReanalyze.push_back(I); 4439 continue; 4440 } 4441 4442 // Move the instruction to the beginning of the predicated block, and add 4443 // it's operands to the worklist. 4444 I->moveBefore(&*PredBB->getFirstInsertionPt()); 4445 Worklist.insert(I->op_begin(), I->op_end()); 4446 4447 // The sinking may have enabled other instructions to be sunk, so we will 4448 // need to iterate. 4449 Changed = true; 4450 } 4451 } while (Changed); 4452 } 4453 4454 void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) { 4455 for (PHINode *OrigPhi : OrigPHIsToFix) { 4456 VPWidenPHIRecipe *VPPhi = 4457 cast<VPWidenPHIRecipe>(State.Plan->getVPValue(OrigPhi)); 4458 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0)); 4459 // Make sure the builder has a valid insert point. 4460 Builder.SetInsertPoint(NewPhi); 4461 for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) { 4462 VPValue *Inc = VPPhi->getIncomingValue(i); 4463 VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i); 4464 NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]); 4465 } 4466 } 4467 } 4468 4469 bool InnerLoopVectorizer::useOrderedReductions( 4470 const RecurrenceDescriptor &RdxDesc) { 4471 return Cost->useOrderedReductions(RdxDesc); 4472 } 4473 4474 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, 4475 VPWidenPHIRecipe *PhiR, 4476 VPTransformState &State) { 4477 PHINode *P = cast<PHINode>(PN); 4478 if (EnableVPlanNativePath) { 4479 // Currently we enter here in the VPlan-native path for non-induction 4480 // PHIs where all control flow is uniform. We simply widen these PHIs. 
4481 // Create a vector phi with no operands - the vector phi operands will be 4482 // set at the end of vector code generation. 4483 Type *VecTy = (State.VF.isScalar()) 4484 ? PN->getType() 4485 : VectorType::get(PN->getType(), State.VF); 4486 Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi"); 4487 State.set(PhiR, VecPhi, 0); 4488 OrigPHIsToFix.push_back(P); 4489 4490 return; 4491 } 4492 4493 assert(PN->getParent() == OrigLoop->getHeader() && 4494 "Non-header phis should have been handled elsewhere"); 4495 4496 // In order to support recurrences we need to be able to vectorize Phi nodes. 4497 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 4498 // stage #1: We create a new vector PHI node with no incoming edges. We'll use 4499 // this value when we vectorize all of the instructions that use the PHI. 4500 4501 assert(!Legal->isReductionVariable(P) && 4502 "reductions should be handled elsewhere"); 4503 4504 setDebugLocFromInst(P); 4505 4506 // This PHINode must be an induction variable. 4507 // Make sure that we know about it. 4508 assert(Legal->getInductionVars().count(P) && "Not an induction variable"); 4509 4510 InductionDescriptor II = Legal->getInductionVars().lookup(P); 4511 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 4512 4513 auto *IVR = PhiR->getParent()->getPlan()->getCanonicalIV(); 4514 PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0)); 4515 4516 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 4517 // which can be found from the original scalar operations. 4518 switch (II.getKind()) { 4519 case InductionDescriptor::IK_NoInduction: 4520 llvm_unreachable("Unknown induction"); 4521 case InductionDescriptor::IK_IntInduction: 4522 case InductionDescriptor::IK_FpInduction: 4523 llvm_unreachable("Integer/fp induction is handled elsewhere."); 4524 case InductionDescriptor::IK_PtrInduction: { 4525 // Handle the pointer induction variable case. 4526 assert(P->getType()->isPointerTy() && "Unexpected type."); 4527 4528 if (Cost->isScalarAfterVectorization(P, State.VF)) { 4529 // This is the normalized GEP that starts counting at zero. 4530 Value *PtrInd = 4531 Builder.CreateSExtOrTrunc(CanonicalIV, II.getStep()->getType()); 4532 // Determine the number of scalars we need to generate for each unroll 4533 // iteration. If the instruction is uniform, we only need to generate the 4534 // first lane. Otherwise, we generate all VF values. 4535 bool IsUniform = Cost->isUniformAfterVectorization(P, State.VF); 4536 assert((IsUniform || !State.VF.isScalable()) && 4537 "Cannot scalarize a scalable VF"); 4538 unsigned Lanes = IsUniform ? 
1 : State.VF.getFixedValue(); 4539 4540 for (unsigned Part = 0; Part < UF; ++Part) { 4541 Value *PartStart = 4542 createStepForVF(Builder, PtrInd->getType(), VF, Part); 4543 4544 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 4545 Value *Idx = Builder.CreateAdd( 4546 PartStart, ConstantInt::get(PtrInd->getType(), Lane)); 4547 Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); 4548 Value *SclrGep = emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), 4549 DL, II, State.CFG.PrevBB); 4550 SclrGep->setName("next.gep"); 4551 State.set(PhiR, SclrGep, VPIteration(Part, Lane)); 4552 } 4553 } 4554 return; 4555 } 4556 assert(isa<SCEVConstant>(II.getStep()) && 4557 "Induction step not a SCEV constant!"); 4558 Type *PhiType = II.getStep()->getType(); 4559 4560 // Build a pointer phi 4561 Value *ScalarStartValue = PhiR->getStartValue()->getLiveInIRValue(); 4562 Type *ScStValueType = ScalarStartValue->getType(); 4563 PHINode *NewPointerPhi = 4564 PHINode::Create(ScStValueType, 2, "pointer.phi", CanonicalIV); 4565 NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader); 4566 4567 // A pointer induction, performed by using a gep 4568 BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 4569 Instruction *InductionLoc = LoopLatch->getTerminator(); 4570 const SCEV *ScalarStep = II.getStep(); 4571 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 4572 Value *ScalarStepValue = 4573 Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc); 4574 Value *RuntimeVF = getRuntimeVF(Builder, PhiType, VF); 4575 Value *NumUnrolledElems = 4576 Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF)); 4577 Value *InductionGEP = GetElementPtrInst::Create( 4578 II.getElementType(), NewPointerPhi, 4579 Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind", 4580 InductionLoc); 4581 NewPointerPhi->addIncoming(InductionGEP, LoopLatch); 4582 4583 // Create UF many actual address geps that use the pointer 4584 // phi as base and a vectorized version of the step value 4585 // (<step*0, ..., step*N>) as offset. 4586 for (unsigned Part = 0; Part < State.UF; ++Part) { 4587 Type *VecPhiType = VectorType::get(PhiType, State.VF); 4588 Value *StartOffsetScalar = 4589 Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part)); 4590 Value *StartOffset = 4591 Builder.CreateVectorSplat(State.VF, StartOffsetScalar); 4592 // Create a vector of consecutive numbers from zero to VF. 4593 StartOffset = 4594 Builder.CreateAdd(StartOffset, Builder.CreateStepVector(VecPhiType)); 4595 4596 Value *GEP = Builder.CreateGEP( 4597 II.getElementType(), NewPointerPhi, 4598 Builder.CreateMul( 4599 StartOffset, Builder.CreateVectorSplat(State.VF, ScalarStepValue), 4600 "vector.gep")); 4601 State.set(PhiR, GEP, Part); 4602 } 4603 } 4604 } 4605 } 4606 4607 /// A helper function for checking whether an integer division-related 4608 /// instruction may divide by zero (in which case it must be predicated if 4609 /// executed conditionally in the scalar code). 4610 /// TODO: It may be worthwhile to generalize and check isKnownNonZero(). 4611 /// Non-zero divisors that are non compile-time constants will not be 4612 /// converted into multiplication, so we will still end up scalarizing 4613 /// the division, but can do so w/o predication. 
4614 static bool mayDivideByZero(Instruction &I) { 4615 assert((I.getOpcode() == Instruction::UDiv || 4616 I.getOpcode() == Instruction::SDiv || 4617 I.getOpcode() == Instruction::URem || 4618 I.getOpcode() == Instruction::SRem) && 4619 "Unexpected instruction"); 4620 Value *Divisor = I.getOperand(1); 4621 auto *CInt = dyn_cast<ConstantInt>(Divisor); 4622 return !CInt || CInt->isZero(); 4623 } 4624 4625 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, 4626 VPUser &ArgOperands, 4627 VPTransformState &State) { 4628 assert(!isa<DbgInfoIntrinsic>(I) && 4629 "DbgInfoIntrinsic should have been dropped during VPlan construction"); 4630 setDebugLocFromInst(&I); 4631 4632 Module *M = I.getParent()->getParent()->getParent(); 4633 auto *CI = cast<CallInst>(&I); 4634 4635 SmallVector<Type *, 4> Tys; 4636 for (Value *ArgOperand : CI->args()) 4637 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue())); 4638 4639 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 4640 4641 // The flag shows whether we use Intrinsic or a usual Call for vectorized 4642 // version of the instruction. 4643 // Is it beneficial to perform intrinsic call compared to lib call? 4644 bool NeedToScalarize = false; 4645 InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); 4646 InstructionCost IntrinsicCost = ID ? Cost->getVectorIntrinsicCost(CI, VF) : 0; 4647 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 4648 assert((UseVectorIntrinsic || !NeedToScalarize) && 4649 "Instruction should be scalarized elsewhere."); 4650 assert((IntrinsicCost.isValid() || CallCost.isValid()) && 4651 "Either the intrinsic cost or vector call cost must be valid"); 4652 4653 for (unsigned Part = 0; Part < UF; ++Part) { 4654 SmallVector<Type *, 2> TysForDecl = {CI->getType()}; 4655 SmallVector<Value *, 4> Args; 4656 for (auto &I : enumerate(ArgOperands.operands())) { 4657 // Some intrinsics have a scalar argument - don't replace it with a 4658 // vector. 4659 Value *Arg; 4660 if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index())) 4661 Arg = State.get(I.value(), Part); 4662 else { 4663 Arg = State.get(I.value(), VPIteration(0, 0)); 4664 if (hasVectorInstrinsicOverloadedScalarOpd(ID, I.index())) 4665 TysForDecl.push_back(Arg->getType()); 4666 } 4667 Args.push_back(Arg); 4668 } 4669 4670 Function *VectorF; 4671 if (UseVectorIntrinsic) { 4672 // Use vector version of the intrinsic. 4673 if (VF.isVector()) 4674 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); 4675 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); 4676 assert(VectorF && "Can't retrieve vector intrinsic."); 4677 } else { 4678 // Use vector version of the function call. 4679 const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 4680 #ifndef NDEBUG 4681 assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr && 4682 "Can't create vector function."); 4683 #endif 4684 VectorF = VFDatabase(*CI).getVectorizedFunction(Shape); 4685 } 4686 SmallVector<OperandBundleDef, 1> OpBundles; 4687 CI->getOperandBundlesAsDefs(OpBundles); 4688 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles); 4689 4690 if (isa<FPMathOperator>(V)) 4691 V->copyFastMathFlags(CI); 4692 4693 State.set(Def, V, Part); 4694 addMetadata(V, &I); 4695 } 4696 } 4697 4698 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { 4699 // We should not collect Scalars more than once per VF. 
Right now, this
4700 // function is called from collectUniformsAndScalars(), which already does
4701 // this check. Collecting Scalars for VF=1 does not make any sense.
4702 assert(VF.isVector() && Scalars.find(VF) == Scalars.end() &&
4703 "This function should not be visited twice for the same VF");
4704
4705 SmallSetVector<Instruction *, 8> Worklist;
4706
4707 // These sets are used to seed the analysis with pointers used by memory
4708 // accesses that will remain scalar.
4709 SmallSetVector<Instruction *, 8> ScalarPtrs;
4710 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
4711 auto *Latch = TheLoop->getLoopLatch();
4712
4713 // A helper that returns true if the use of Ptr by MemAccess will be scalar.
4714 // The pointer operands of loads and stores will be scalar as long as the
4715 // memory access is not a gather or scatter operation. The value operand of a
4716 // store will remain scalar if the store is scalarized.
4717 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
4718 InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
4719 assert(WideningDecision != CM_Unknown &&
4720 "Widening decision should be ready at this moment");
4721 if (auto *Store = dyn_cast<StoreInst>(MemAccess))
4722 if (Ptr == Store->getValueOperand())
4723 return WideningDecision == CM_Scalarize;
4724 assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
4725 "Ptr is neither a value nor a pointer operand");
4726 return WideningDecision != CM_GatherScatter;
4727 };
4728
4729 // A helper that returns true if the given value is a bitcast or
4730 // getelementptr instruction contained in the loop.
4731 auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
4732 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
4733 isa<GetElementPtrInst>(V)) &&
4734 !TheLoop->isLoopInvariant(V);
4735 };
4736
4737 // A helper that evaluates a memory access's use of a pointer. If the use will
4738 // be a scalar use and the pointer is only used by memory accesses, we place
4739 // the pointer in ScalarPtrs. Otherwise, the pointer is placed in
4740 // PossibleNonScalarPtrs.
4741 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
4742 // We only care about bitcast and getelementptr instructions contained in
4743 // the loop.
4744 if (!isLoopVaryingBitCastOrGEP(Ptr))
4745 return;
4746
4747 // If the pointer has already been identified as scalar (e.g., if it was
4748 // also identified as uniform), there's nothing to do.
4749 auto *I = cast<Instruction>(Ptr);
4750 if (Worklist.count(I))
4751 return;
4752
4753 // If the use of the pointer will be a scalar use, and all users of the
4754 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
4755 // place the pointer in PossibleNonScalarPtrs.
4756 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
4757 return isa<LoadInst>(U) || isa<StoreInst>(U);
4758 }))
4759 ScalarPtrs.insert(I);
4760 else
4761 PossibleNonScalarPtrs.insert(I);
4762 };
4763
4764 // We seed the scalars analysis with three classes of instructions: (1)
4765 // instructions marked uniform-after-vectorization, (2) bitcast,
4766 // getelementptr and (pointer) phi instructions used by memory accesses
4767 // requiring a scalar use, and (3) the instructions recorded as forced scalars for this VF (inserted further below).
4768 //
4769 // (1) Add to the worklist all instructions that have been identified as
4770 // uniform-after-vectorization.
4771 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end()); 4772 4773 // (2) Add to the worklist all bitcast and getelementptr instructions used by 4774 // memory accesses requiring a scalar use. The pointer operands of loads and 4775 // stores will be scalar as long as the memory accesses is not a gather or 4776 // scatter operation. The value operand of a store will remain scalar if the 4777 // store is scalarized. 4778 for (auto *BB : TheLoop->blocks()) 4779 for (auto &I : *BB) { 4780 if (auto *Load = dyn_cast<LoadInst>(&I)) { 4781 evaluatePtrUse(Load, Load->getPointerOperand()); 4782 } else if (auto *Store = dyn_cast<StoreInst>(&I)) { 4783 evaluatePtrUse(Store, Store->getPointerOperand()); 4784 evaluatePtrUse(Store, Store->getValueOperand()); 4785 } 4786 } 4787 for (auto *I : ScalarPtrs) 4788 if (!PossibleNonScalarPtrs.count(I)) { 4789 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n"); 4790 Worklist.insert(I); 4791 } 4792 4793 // Insert the forced scalars. 4794 // FIXME: Currently widenPHIInstruction() often creates a dead vector 4795 // induction variable when the PHI user is scalarized. 4796 auto ForcedScalar = ForcedScalars.find(VF); 4797 if (ForcedScalar != ForcedScalars.end()) 4798 for (auto *I : ForcedScalar->second) 4799 Worklist.insert(I); 4800 4801 // Expand the worklist by looking through any bitcasts and getelementptr 4802 // instructions we've already identified as scalar. This is similar to the 4803 // expansion step in collectLoopUniforms(); however, here we're only 4804 // expanding to include additional bitcasts and getelementptr instructions. 4805 unsigned Idx = 0; 4806 while (Idx != Worklist.size()) { 4807 Instruction *Dst = Worklist[Idx++]; 4808 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0))) 4809 continue; 4810 auto *Src = cast<Instruction>(Dst->getOperand(0)); 4811 if (llvm::all_of(Src->users(), [&](User *U) -> bool { 4812 auto *J = cast<Instruction>(U); 4813 return !TheLoop->contains(J) || Worklist.count(J) || 4814 ((isa<LoadInst>(J) || isa<StoreInst>(J)) && 4815 isScalarUse(J, Src)); 4816 })) { 4817 Worklist.insert(Src); 4818 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); 4819 } 4820 } 4821 4822 // An induction variable will remain scalar if all users of the induction 4823 // variable and induction variable update remain scalar. 4824 for (auto &Induction : Legal->getInductionVars()) { 4825 auto *Ind = Induction.first; 4826 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4827 4828 // If tail-folding is applied, the primary induction variable will be used 4829 // to feed a vector compare. 4830 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking()) 4831 continue; 4832 4833 // Returns true if \p Indvar is a pointer induction that is used directly by 4834 // load/store instruction \p I. 4835 auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar, 4836 Instruction *I) { 4837 return Induction.second.getKind() == 4838 InductionDescriptor::IK_PtrInduction && 4839 (isa<LoadInst>(I) || isa<StoreInst>(I)) && 4840 Indvar == getLoadStorePointerOperand(I) && isScalarUse(I, Indvar); 4841 }; 4842 4843 // Determine if all users of the induction variable are scalar after 4844 // vectorization. 
4845 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 4846 auto *I = cast<Instruction>(U); 4847 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 4848 IsDirectLoadStoreFromPtrIndvar(Ind, I); 4849 }); 4850 if (!ScalarInd) 4851 continue; 4852 4853 // Determine if all users of the induction variable update instruction are 4854 // scalar after vectorization. 4855 auto ScalarIndUpdate = 4856 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 4857 auto *I = cast<Instruction>(U); 4858 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 4859 IsDirectLoadStoreFromPtrIndvar(IndUpdate, I); 4860 }); 4861 if (!ScalarIndUpdate) 4862 continue; 4863 4864 // The induction variable and its update instruction will remain scalar. 4865 Worklist.insert(Ind); 4866 Worklist.insert(IndUpdate); 4867 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 4868 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 4869 << "\n"); 4870 } 4871 4872 Scalars[VF].insert(Worklist.begin(), Worklist.end()); 4873 } 4874 4875 bool LoopVectorizationCostModel::isScalarWithPredication( 4876 Instruction *I, ElementCount VF) const { 4877 if (!blockNeedsPredicationForAnyReason(I->getParent())) 4878 return false; 4879 switch(I->getOpcode()) { 4880 default: 4881 break; 4882 case Instruction::Load: 4883 case Instruction::Store: { 4884 if (!Legal->isMaskRequired(I)) 4885 return false; 4886 auto *Ptr = getLoadStorePointerOperand(I); 4887 auto *Ty = getLoadStoreType(I); 4888 Type *VTy = Ty; 4889 if (VF.isVector()) 4890 VTy = VectorType::get(Ty, VF); 4891 const Align Alignment = getLoadStoreAlignment(I); 4892 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) || 4893 TTI.isLegalMaskedGather(VTy, Alignment)) 4894 : !(isLegalMaskedStore(Ty, Ptr, Alignment) || 4895 TTI.isLegalMaskedScatter(VTy, Alignment)); 4896 } 4897 case Instruction::UDiv: 4898 case Instruction::SDiv: 4899 case Instruction::SRem: 4900 case Instruction::URem: 4901 return mayDivideByZero(*I); 4902 } 4903 return false; 4904 } 4905 4906 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( 4907 Instruction *I, ElementCount VF) { 4908 assert(isAccessInterleaved(I) && "Expecting interleaved access."); 4909 assert(getWideningDecision(I, VF) == CM_Unknown && 4910 "Decision should not be set yet."); 4911 auto *Group = getInterleavedAccessGroup(I); 4912 assert(Group && "Must have a group."); 4913 4914 // If the instruction's allocated size doesn't equal it's type size, it 4915 // requires padding and will be scalarized. 4916 auto &DL = I->getModule()->getDataLayout(); 4917 auto *ScalarTy = getLoadStoreType(I); 4918 if (hasIrregularType(ScalarTy, DL)) 4919 return false; 4920 4921 // Check if masking is required. 4922 // A Group may need masking for one of two reasons: it resides in a block that 4923 // needs predication, or it was decided to use masking to deal with gaps 4924 // (either a gap at the end of a load-access that may result in a speculative 4925 // load, or any gaps in a store-access). 
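// As an illustrative example of a group with a gap, consider:
//
//   for (i = 0; i < n; i++)
//     sum += A[2*i];   // interleave factor 2, member A[2*i+1] is missing
//
// A wide load covering both members of the last group may read past the last
// element the scalar loop would access, so such a load group either keeps a
// scalar epilogue iteration or is masked; a store group with a gap must be
// masked so the missing members are not written.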
4926 bool PredicatedAccessRequiresMasking = 4927 blockNeedsPredicationForAnyReason(I->getParent()) && 4928 Legal->isMaskRequired(I); 4929 bool LoadAccessWithGapsRequiresEpilogMasking = 4930 isa<LoadInst>(I) && Group->requiresScalarEpilogue() && 4931 !isScalarEpilogueAllowed(); 4932 bool StoreAccessWithGapsRequiresMasking = 4933 isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()); 4934 if (!PredicatedAccessRequiresMasking && 4935 !LoadAccessWithGapsRequiresEpilogMasking && 4936 !StoreAccessWithGapsRequiresMasking) 4937 return true; 4938 4939 // If masked interleaving is required, we expect that the user/target had 4940 // enabled it, because otherwise it either wouldn't have been created or 4941 // it should have been invalidated by the CostModel. 4942 assert(useMaskedInterleavedAccesses(TTI) && 4943 "Masked interleave-groups for predicated accesses are not enabled."); 4944 4945 if (Group->isReverse()) 4946 return false; 4947 4948 auto *Ty = getLoadStoreType(I); 4949 const Align Alignment = getLoadStoreAlignment(I); 4950 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment) 4951 : TTI.isLegalMaskedStore(Ty, Alignment); 4952 } 4953 4954 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( 4955 Instruction *I, ElementCount VF) { 4956 // Get and ensure we have a valid memory instruction. 4957 assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction"); 4958 4959 auto *Ptr = getLoadStorePointerOperand(I); 4960 auto *ScalarTy = getLoadStoreType(I); 4961 4962 // In order to be widened, the pointer should be consecutive, first of all. 4963 if (!Legal->isConsecutivePtr(ScalarTy, Ptr)) 4964 return false; 4965 4966 // If the instruction is a store located in a predicated block, it will be 4967 // scalarized. 4968 if (isScalarWithPredication(I, VF)) 4969 return false; 4970 4971 // If the instruction's allocated size doesn't equal it's type size, it 4972 // requires padding and will be scalarized. 4973 auto &DL = I->getModule()->getDataLayout(); 4974 if (hasIrregularType(ScalarTy, DL)) 4975 return false; 4976 4977 return true; 4978 } 4979 4980 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { 4981 // We should not collect Uniforms more than once per VF. Right now, 4982 // this function is called from collectUniformsAndScalars(), which 4983 // already does this check. Collecting Uniforms for VF=1 does not make any 4984 // sense. 4985 4986 assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() && 4987 "This function should not be visited twice for the same VF"); 4988 4989 // Visit the list of Uniforms. If we'll not find any uniform value, we'll 4990 // not analyze again. Uniforms.count(VF) will return 1. 4991 Uniforms[VF].clear(); 4992 4993 // We now know that the loop is vectorizable! 4994 // Collect instructions inside the loop that will remain uniform after 4995 // vectorization. 4996 4997 // Global values, params and instructions outside of current loop are out of 4998 // scope. 4999 auto isOutOfScope = [&](Value *V) -> bool { 5000 Instruction *I = dyn_cast<Instruction>(V); 5001 return (!I || !TheLoop->contains(I)); 5002 }; 5003 5004 // Worklist containing uniform instructions demanding lane 0. 5005 SetVector<Instruction *> Worklist; 5006 BasicBlock *Latch = TheLoop->getLoopLatch(); 5007 5008 // Add uniform instructions demanding lane 0 to the worklist. 
Instructions 5009 // that are scalar with predication must not be considered uniform after 5010 // vectorization, because that would create an erroneous replicating region 5011 // where only a single instance out of VF should be formed. 5012 // TODO: optimize such seldom cases if found important, see PR40816. 5013 auto addToWorklistIfAllowed = [&](Instruction *I) -> void { 5014 if (isOutOfScope(I)) { 5015 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: " 5016 << *I << "\n"); 5017 return; 5018 } 5019 if (isScalarWithPredication(I, VF)) { 5020 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " 5021 << *I << "\n"); 5022 return; 5023 } 5024 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n"); 5025 Worklist.insert(I); 5026 }; 5027 5028 // Start with the conditional branch. If the branch condition is an 5029 // instruction contained in the loop that is only used by the branch, it is 5030 // uniform. 5031 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 5032 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) 5033 addToWorklistIfAllowed(Cmp); 5034 5035 auto isUniformDecision = [&](Instruction *I, ElementCount VF) { 5036 InstWidening WideningDecision = getWideningDecision(I, VF); 5037 assert(WideningDecision != CM_Unknown && 5038 "Widening decision should be ready at this moment"); 5039 5040 // A uniform memory op is itself uniform. We exclude uniform stores 5041 // here as they demand the last lane, not the first one. 5042 if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) { 5043 assert(WideningDecision == CM_Scalarize); 5044 return true; 5045 } 5046 5047 return (WideningDecision == CM_Widen || 5048 WideningDecision == CM_Widen_Reverse || 5049 WideningDecision == CM_Interleave); 5050 }; 5051 5052 5053 // Returns true if Ptr is the pointer operand of a memory access instruction 5054 // I, and I is known to not require scalarization. 5055 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { 5056 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); 5057 }; 5058 5059 // Holds a list of values which are known to have at least one uniform use. 5060 // Note that there may be other uses which aren't uniform. A "uniform use" 5061 // here is something which only demands lane 0 of the unrolled iterations; 5062 // it does not imply that all lanes produce the same value (e.g. this is not 5063 // the usual meaning of uniform) 5064 SetVector<Value *> HasUniformUse; 5065 5066 // Scan the loop for instructions which are either a) known to have only 5067 // lane 0 demanded or b) are uses which demand only lane 0 of their operand. 5068 for (auto *BB : TheLoop->blocks()) 5069 for (auto &I : *BB) { 5070 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) { 5071 switch (II->getIntrinsicID()) { 5072 case Intrinsic::sideeffect: 5073 case Intrinsic::experimental_noalias_scope_decl: 5074 case Intrinsic::assume: 5075 case Intrinsic::lifetime_start: 5076 case Intrinsic::lifetime_end: 5077 if (TheLoop->hasLoopInvariantOperands(&I)) 5078 addToWorklistIfAllowed(&I); 5079 break; 5080 default: 5081 break; 5082 } 5083 } 5084 5085 // ExtractValue instructions must be uniform, because the operands are 5086 // known to be loop-invariant. 
5087 if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) { 5088 assert(isOutOfScope(EVI->getAggregateOperand()) && 5089 "Expected aggregate value to be loop invariant"); 5090 addToWorklistIfAllowed(EVI); 5091 continue; 5092 } 5093 5094 // If there's no pointer operand, there's nothing to do. 5095 auto *Ptr = getLoadStorePointerOperand(&I); 5096 if (!Ptr) 5097 continue; 5098 5099 // A uniform memory op is itself uniform. We exclude uniform stores 5100 // here as they demand the last lane, not the first one. 5101 if (isa<LoadInst>(I) && Legal->isUniformMemOp(I)) 5102 addToWorklistIfAllowed(&I); 5103 5104 if (isUniformDecision(&I, VF)) { 5105 assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check"); 5106 HasUniformUse.insert(Ptr); 5107 } 5108 } 5109 5110 // Add to the worklist any operands which have *only* uniform (e.g. lane 0 5111 // demanding) users. Since loops are assumed to be in LCSSA form, this 5112 // disallows uses outside the loop as well. 5113 for (auto *V : HasUniformUse) { 5114 if (isOutOfScope(V)) 5115 continue; 5116 auto *I = cast<Instruction>(V); 5117 auto UsersAreMemAccesses = 5118 llvm::all_of(I->users(), [&](User *U) -> bool { 5119 return isVectorizedMemAccessUse(cast<Instruction>(U), V); 5120 }); 5121 if (UsersAreMemAccesses) 5122 addToWorklistIfAllowed(I); 5123 } 5124 5125 // Expand Worklist in topological order: whenever a new instruction 5126 // is added , its users should be already inside Worklist. It ensures 5127 // a uniform instruction will only be used by uniform instructions. 5128 unsigned idx = 0; 5129 while (idx != Worklist.size()) { 5130 Instruction *I = Worklist[idx++]; 5131 5132 for (auto OV : I->operand_values()) { 5133 // isOutOfScope operands cannot be uniform instructions. 5134 if (isOutOfScope(OV)) 5135 continue; 5136 // First order recurrence Phi's should typically be considered 5137 // non-uniform. 5138 auto *OP = dyn_cast<PHINode>(OV); 5139 if (OP && Legal->isFirstOrderRecurrence(OP)) 5140 continue; 5141 // If all the users of the operand are uniform, then add the 5142 // operand into the uniform worklist. 5143 auto *OI = cast<Instruction>(OV); 5144 if (llvm::all_of(OI->users(), [&](User *U) -> bool { 5145 auto *J = cast<Instruction>(U); 5146 return Worklist.count(J) || isVectorizedMemAccessUse(J, OI); 5147 })) 5148 addToWorklistIfAllowed(OI); 5149 } 5150 } 5151 5152 // For an instruction to be added into Worklist above, all its users inside 5153 // the loop should also be in Worklist. However, this condition cannot be 5154 // true for phi nodes that form a cyclic dependence. We must process phi 5155 // nodes separately. An induction variable will remain uniform if all users 5156 // of the induction variable and induction variable update remain uniform. 5157 // The code below handles both pointer and non-pointer induction variables. 5158 for (auto &Induction : Legal->getInductionVars()) { 5159 auto *Ind = Induction.first; 5160 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 5161 5162 // Determine if all users of the induction variable are uniform after 5163 // vectorization. 5164 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 5165 auto *I = cast<Instruction>(U); 5166 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 5167 isVectorizedMemAccessUse(I, Ind); 5168 }); 5169 if (!UniformInd) 5170 continue; 5171 5172 // Determine if all users of the induction variable update instruction are 5173 // uniform after vectorization. 
5174 auto UniformIndUpdate = 5175 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 5176 auto *I = cast<Instruction>(U); 5177 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 5178 isVectorizedMemAccessUse(I, IndUpdate); 5179 }); 5180 if (!UniformIndUpdate) 5181 continue; 5182 5183 // The induction variable and its update instruction will remain uniform. 5184 addToWorklistIfAllowed(Ind); 5185 addToWorklistIfAllowed(IndUpdate); 5186 } 5187 5188 Uniforms[VF].insert(Worklist.begin(), Worklist.end()); 5189 } 5190 5191 bool LoopVectorizationCostModel::runtimeChecksRequired() { 5192 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); 5193 5194 if (Legal->getRuntimePointerChecking()->Need) { 5195 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz", 5196 "runtime pointer checks needed. Enable vectorization of this " 5197 "loop with '#pragma clang loop vectorize(enable)' when " 5198 "compiling with -Os/-Oz", 5199 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5200 return true; 5201 } 5202 5203 if (!PSE.getUnionPredicate().getPredicates().empty()) { 5204 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz", 5205 "runtime SCEV checks needed. Enable vectorization of this " 5206 "loop with '#pragma clang loop vectorize(enable)' when " 5207 "compiling with -Os/-Oz", 5208 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5209 return true; 5210 } 5211 5212 // FIXME: Avoid specializing for stride==1 instead of bailing out. 5213 if (!Legal->getLAI()->getSymbolicStrides().empty()) { 5214 reportVectorizationFailure("Runtime stride check for small trip count", 5215 "runtime stride == 1 checks needed. Enable vectorization of " 5216 "this loop without such check by compiling with -Os/-Oz", 5217 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5218 return true; 5219 } 5220 5221 return false; 5222 } 5223 5224 ElementCount 5225 LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) { 5226 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) 5227 return ElementCount::getScalable(0); 5228 5229 if (Hints->isScalableVectorizationDisabled()) { 5230 reportVectorizationInfo("Scalable vectorization is explicitly disabled", 5231 "ScalableVectorizationDisabled", ORE, TheLoop); 5232 return ElementCount::getScalable(0); 5233 } 5234 5235 LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n"); 5236 5237 auto MaxScalableVF = ElementCount::getScalable( 5238 std::numeric_limits<ElementCount::ScalarTy>::max()); 5239 5240 // Test that the loop-vectorizer can legalize all operations for this MaxVF. 5241 // FIXME: While for scalable vectors this is currently sufficient, this should 5242 // be replaced by a more detailed mechanism that filters out specific VFs, 5243 // instead of invalidating vectorization for a whole set of VFs based on the 5244 // MaxVF. 5245 5246 // Disable scalable vectorization if the loop contains unsupported reductions. 5247 if (!canVectorizeReductions(MaxScalableVF)) { 5248 reportVectorizationInfo( 5249 "Scalable vectorization not supported for the reduction " 5250 "operations found in this loop.", 5251 "ScalableVFUnfeasible", ORE, TheLoop); 5252 return ElementCount::getScalable(0); 5253 } 5254 5255 // Disable scalable vectorization if the loop contains any instructions 5256 // with element types not supported for scalable vectors. 
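// For example (illustrative only; the legal set is entirely determined by
// the target via isElementTypeLegalForScalableVector), a target whose
// scalable registers hold only 8/16/32/64-bit integers and 16/32/64-bit
// floating-point elements would reject a loop computing on fp128 values
// here.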
5257 if (any_of(ElementTypesInLoop, [&](Type *Ty) { 5258 return !Ty->isVoidTy() && 5259 !this->TTI.isElementTypeLegalForScalableVector(Ty); 5260 })) { 5261 reportVectorizationInfo("Scalable vectorization is not supported " 5262 "for all element types found in this loop.", 5263 "ScalableVFUnfeasible", ORE, TheLoop); 5264 return ElementCount::getScalable(0); 5265 } 5266 5267 if (Legal->isSafeForAnyVectorWidth()) 5268 return MaxScalableVF; 5269 5270 // Limit MaxScalableVF by the maximum safe dependence distance. 5271 Optional<unsigned> MaxVScale = TTI.getMaxVScale(); 5272 if (!MaxVScale && TheFunction->hasFnAttribute(Attribute::VScaleRange)) 5273 MaxVScale = 5274 TheFunction->getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax(); 5275 MaxScalableVF = ElementCount::getScalable( 5276 MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0); 5277 if (!MaxScalableVF) 5278 reportVectorizationInfo( 5279 "Max legal vector width too small, scalable vectorization " 5280 "unfeasible.", 5281 "ScalableVFUnfeasible", ORE, TheLoop); 5282 5283 return MaxScalableVF; 5284 } 5285 5286 FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF( 5287 unsigned ConstTripCount, ElementCount UserVF, bool FoldTailByMasking) { 5288 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); 5289 unsigned SmallestType, WidestType; 5290 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); 5291 5292 // Get the maximum safe dependence distance in bits computed by LAA. 5293 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from 5294 // the memory accesses that is most restrictive (involved in the smallest 5295 // dependence distance). 5296 unsigned MaxSafeElements = 5297 PowerOf2Floor(Legal->getMaxSafeVectorWidthInBits() / WidestType); 5298 5299 auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements); 5300 auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements); 5301 5302 LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF 5303 << ".\n"); 5304 LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF 5305 << ".\n"); 5306 5307 // First analyze the UserVF, fall back if the UserVF should be ignored. 5308 if (UserVF) { 5309 auto MaxSafeUserVF = 5310 UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF; 5311 5312 if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) { 5313 // If `VF=vscale x N` is safe, then so is `VF=N` 5314 if (UserVF.isScalable()) 5315 return FixedScalableVFPair( 5316 ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF); 5317 else 5318 return UserVF; 5319 } 5320 5321 assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF)); 5322 5323 // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it 5324 // is better to ignore the hint and let the compiler choose a suitable VF. 
5325 if (!UserVF.isScalable()) { 5326 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 5327 << " is unsafe, clamping to max safe VF=" 5328 << MaxSafeFixedVF << ".\n"); 5329 ORE->emit([&]() { 5330 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 5331 TheLoop->getStartLoc(), 5332 TheLoop->getHeader()) 5333 << "User-specified vectorization factor " 5334 << ore::NV("UserVectorizationFactor", UserVF) 5335 << " is unsafe, clamping to maximum safe vectorization factor " 5336 << ore::NV("VectorizationFactor", MaxSafeFixedVF); 5337 }); 5338 return MaxSafeFixedVF; 5339 } 5340 5341 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) { 5342 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 5343 << " is ignored because scalable vectors are not " 5344 "available.\n"); 5345 ORE->emit([&]() { 5346 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 5347 TheLoop->getStartLoc(), 5348 TheLoop->getHeader()) 5349 << "User-specified vectorization factor " 5350 << ore::NV("UserVectorizationFactor", UserVF) 5351 << " is ignored because the target does not support scalable " 5352 "vectors. The compiler will pick a more suitable value."; 5353 }); 5354 } else { 5355 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 5356 << " is unsafe. Ignoring scalable UserVF.\n"); 5357 ORE->emit([&]() { 5358 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 5359 TheLoop->getStartLoc(), 5360 TheLoop->getHeader()) 5361 << "User-specified vectorization factor " 5362 << ore::NV("UserVectorizationFactor", UserVF) 5363 << " is unsafe. Ignoring the hint to let the compiler pick a " 5364 "more suitable value."; 5365 }); 5366 } 5367 } 5368 5369 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType 5370 << " / " << WidestType << " bits.\n"); 5371 5372 FixedScalableVFPair Result(ElementCount::getFixed(1), 5373 ElementCount::getScalable(0)); 5374 if (auto MaxVF = 5375 getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType, 5376 MaxSafeFixedVF, FoldTailByMasking)) 5377 Result.FixedVF = MaxVF; 5378 5379 if (auto MaxVF = 5380 getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType, 5381 MaxSafeScalableVF, FoldTailByMasking)) 5382 if (MaxVF.isScalable()) { 5383 Result.ScalableVF = MaxVF; 5384 LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF 5385 << "\n"); 5386 } 5387 5388 return Result; 5389 } 5390 5391 FixedScalableVFPair 5392 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { 5393 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { 5394 // TODO: It may by useful to do since it's still likely to be dynamically 5395 // uniform if the target can skip. 5396 reportVectorizationFailure( 5397 "Not inserting runtime ptr check for divergent target", 5398 "runtime pointer checks needed. 
Not enabled for divergent target", 5399 "CantVersionLoopWithDivergentTarget", ORE, TheLoop); 5400 return FixedScalableVFPair::getNone(); 5401 } 5402 5403 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); 5404 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); 5405 if (TC == 1) { 5406 reportVectorizationFailure("Single iteration (non) loop", 5407 "loop trip count is one, irrelevant for vectorization", 5408 "SingleIterationLoop", ORE, TheLoop); 5409 return FixedScalableVFPair::getNone(); 5410 } 5411 5412 switch (ScalarEpilogueStatus) { 5413 case CM_ScalarEpilogueAllowed: 5414 return computeFeasibleMaxVF(TC, UserVF, false); 5415 case CM_ScalarEpilogueNotAllowedUsePredicate: 5416 LLVM_FALLTHROUGH; 5417 case CM_ScalarEpilogueNotNeededUsePredicate: 5418 LLVM_DEBUG( 5419 dbgs() << "LV: vector predicate hint/switch found.\n" 5420 << "LV: Not allowing scalar epilogue, creating predicated " 5421 << "vector loop.\n"); 5422 break; 5423 case CM_ScalarEpilogueNotAllowedLowTripLoop: 5424 // fallthrough as a special case of OptForSize 5425 case CM_ScalarEpilogueNotAllowedOptSize: 5426 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize) 5427 LLVM_DEBUG( 5428 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n"); 5429 else 5430 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip " 5431 << "count.\n"); 5432 5433 // Bail if runtime checks are required, which are not good when optimising 5434 // for size. 5435 if (runtimeChecksRequired()) 5436 return FixedScalableVFPair::getNone(); 5437 5438 break; 5439 } 5440 5441 // The only loops we can vectorize without a scalar epilogue are loops with 5442 // a bottom-test and a single exiting block. We'd have to handle the fact 5443 // that not every instruction executes on the last iteration. This will 5444 // require a lane mask which varies through the vector loop body. (TODO) 5445 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) { 5446 // If there was a tail-folding hint/switch, but we can't fold the tail by 5447 // masking, fallback to a vectorization with a scalar epilogue. 5448 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { 5449 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " 5450 "scalar epilogue instead.\n"); 5451 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 5452 return computeFeasibleMaxVF(TC, UserVF, false); 5453 } 5454 return FixedScalableVFPair::getNone(); 5455 } 5456 5457 // Now try the tail folding 5458 5459 // Invalidate interleave groups that require an epilogue if we can't mask 5460 // the interleave-group. 5461 if (!useMaskedInterleavedAccesses(TTI)) { 5462 assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() && 5463 "No decisions should have been taken at this point"); 5464 // Note: There is no need to invalidate any cost modeling decisions here, as 5465 // none were taken so far. 5466 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue(); 5467 } 5468 5469 FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF, true); 5470 // Avoid tail folding if the trip count is known to be a multiple of any VF 5471 // we chose. 5472 // FIXME: The condition below pessimises the case for fixed-width vectors, 5473 // when scalable VFs are also candidates for vectorization.
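  // Illustrative example (not part of the original source; the numbers are
  // hypothetical): when only fixed VFs are candidates, with a known trip count
  // of 128, MaxFixedVF = 8 and a user interleave count of 2, MaxVFtimesIC is
  // 16 and 128 urem 16 == 0, so no scalar tail remains and MaxFactors is
  // returned below without folding the tail by masking.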
5474 if (MaxFactors.FixedVF.isVector() && !MaxFactors.ScalableVF) { 5475 ElementCount MaxFixedVF = MaxFactors.FixedVF; 5476 assert((UserVF.isNonZero() || isPowerOf2_32(MaxFixedVF.getFixedValue())) && 5477 "MaxFixedVF must be a power of 2"); 5478 unsigned MaxVFtimesIC = UserIC ? MaxFixedVF.getFixedValue() * UserIC 5479 : MaxFixedVF.getFixedValue(); 5480 ScalarEvolution *SE = PSE.getSE(); 5481 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 5482 const SCEV *ExitCount = SE->getAddExpr( 5483 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 5484 const SCEV *Rem = SE->getURemExpr( 5485 SE->applyLoopGuards(ExitCount, TheLoop), 5486 SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC)); 5487 if (Rem->isZero()) { 5488 // Accept MaxFixedVF if we do not have a tail. 5489 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); 5490 return MaxFactors; 5491 } 5492 } 5493 5494 // For scalable vectors don't use tail folding for low trip counts or 5495 // optimizing for code size. We only permit this if the user has explicitly 5496 // requested it. 5497 if (ScalarEpilogueStatus != CM_ScalarEpilogueNotNeededUsePredicate && 5498 ScalarEpilogueStatus != CM_ScalarEpilogueNotAllowedUsePredicate && 5499 MaxFactors.ScalableVF.isVector()) 5500 MaxFactors.ScalableVF = ElementCount::getScalable(0); 5501 5502 // If we don't know the precise trip count, or if the trip count that we 5503 // found modulo the vectorization factor is not zero, try to fold the tail 5504 // by masking. 5505 // FIXME: look for a smaller MaxVF that does divide TC rather than masking. 5506 if (Legal->prepareToFoldTailByMasking()) { 5507 FoldTailByMasking = true; 5508 return MaxFactors; 5509 } 5510 5511 // If there was a tail-folding hint/switch, but we can't fold the tail by 5512 // masking, fallback to a vectorization with a scalar epilogue. 5513 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { 5514 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " 5515 "scalar epilogue instead.\n"); 5516 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 5517 return MaxFactors; 5518 } 5519 5520 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) { 5521 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n"); 5522 return FixedScalableVFPair::getNone(); 5523 } 5524 5525 if (TC == 0) { 5526 reportVectorizationFailure( 5527 "Unable to calculate the loop count due to complex control flow", 5528 "unable to calculate the loop count due to complex control flow", 5529 "UnknownLoopCountComplexCFG", ORE, TheLoop); 5530 return FixedScalableVFPair::getNone(); 5531 } 5532 5533 reportVectorizationFailure( 5534 "Cannot optimize for size and vectorize at the same time.", 5535 "cannot optimize for size and vectorize at the same time. " 5536 "Enable vectorization of this loop with '#pragma clang loop " 5537 "vectorize(enable)' when compiling with -Os/-Oz", 5538 "NoTailLoopWithOptForSize", ORE, TheLoop); 5539 return FixedScalableVFPair::getNone(); 5540 } 5541 5542 ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( 5543 unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType, 5544 const ElementCount &MaxSafeVF, bool FoldTailByMasking) { 5545 bool ComputeScalableMaxVF = MaxSafeVF.isScalable(); 5546 TypeSize WidestRegister = TTI.getRegisterBitWidth( 5547 ComputeScalableMaxVF ? 
TargetTransformInfo::RGK_ScalableVector 5548 : TargetTransformInfo::RGK_FixedWidthVector); 5549 5550 // Convenience function to return the minimum of two ElementCounts. 5551 auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) { 5552 assert((LHS.isScalable() == RHS.isScalable()) && 5553 "Scalable flags must match"); 5554 return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS; 5555 }; 5556 5557 // Ensure MaxVF is a power of 2; the dependence distance bound may not be. 5558 // Note that both WidestRegister and WidestType may not be powers of 2. 5559 auto MaxVectorElementCount = ElementCount::get( 5560 PowerOf2Floor(WidestRegister.getKnownMinSize() / WidestType), 5561 ComputeScalableMaxVF); 5562 MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF); 5563 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " 5564 << (MaxVectorElementCount * WidestType) << " bits.\n"); 5565 5566 if (!MaxVectorElementCount) { 5567 LLVM_DEBUG(dbgs() << "LV: The target has no " 5568 << (ComputeScalableMaxVF ? "scalable" : "fixed") 5569 << " vector registers.\n"); 5570 return ElementCount::getFixed(1); 5571 } 5572 5573 const auto TripCountEC = ElementCount::getFixed(ConstTripCount); 5574 if (ConstTripCount && 5575 ElementCount::isKnownLE(TripCountEC, MaxVectorElementCount) && 5576 (!FoldTailByMasking || isPowerOf2_32(ConstTripCount))) { 5577 // If the loop trip count (TC) is known at compile time, there is no point in 5578 // choosing a VF greater than TC (as done in the loop below). Select the maximum 5579 // power of two which doesn't exceed TC. 5580 // If MaxVectorElementCount is scalable, we only fall back on a fixed VF 5581 // when the TC is less than or equal to the known number of lanes. 5582 auto ClampedConstTripCount = PowerOf2Floor(ConstTripCount); 5583 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not " 5584 "exceeding the constant trip count: " 5585 << ClampedConstTripCount << "\n"); 5586 return ElementCount::getFixed(ClampedConstTripCount); 5587 } 5588 5589 ElementCount MaxVF = MaxVectorElementCount; 5590 if (TTI.shouldMaximizeVectorBandwidth() || 5591 (MaximizeBandwidth && isScalarEpilogueAllowed())) { 5592 auto MaxVectorElementCountMaxBW = ElementCount::get( 5593 PowerOf2Floor(WidestRegister.getKnownMinSize() / SmallestType), 5594 ComputeScalableMaxVF); 5595 MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF); 5596 5597 // Collect all viable vectorization factors larger than the default MaxVF 5598 // (i.e. MaxVectorElementCount). 5599 SmallVector<ElementCount, 8> VFs; 5600 for (ElementCount VS = MaxVectorElementCount * 2; 5601 ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2) 5602 VFs.push_back(VS); 5603 5604 // For each VF calculate its register usage. 5605 auto RUs = calculateRegisterUsage(VFs); 5606 5607 // Select the largest VF which doesn't require more registers than existing 5608 // ones.
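    // Illustrative example (not part of the original source; the numbers are
    // hypothetical): if the bandwidth-maximizing candidates are {8, 16} and the
    // target has 32 vector registers, a VF of 16 whose max local usage is 40
    // registers is rejected by the loop below, while a VF of 8 needing 20
    // registers is selected as MaxVF. The real values come from
    // calculateRegisterUsage above.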
5609 for (int i = RUs.size() - 1; i >= 0; --i) { 5610 bool Selected = true; 5611 for (auto &pair : RUs[i].MaxLocalUsers) { 5612 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5613 if (pair.second > TargetNumRegisters) 5614 Selected = false; 5615 } 5616 if (Selected) { 5617 MaxVF = VFs[i]; 5618 break; 5619 } 5620 } 5621 if (ElementCount MinVF = 5622 TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) { 5623 if (ElementCount::isKnownLT(MaxVF, MinVF)) { 5624 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF 5625 << ") with target's minimum: " << MinVF << '\n'); 5626 MaxVF = MinVF; 5627 } 5628 } 5629 } 5630 return MaxVF; 5631 } 5632 5633 bool LoopVectorizationCostModel::isMoreProfitable( 5634 const VectorizationFactor &A, const VectorizationFactor &B) const { 5635 InstructionCost CostA = A.Cost; 5636 InstructionCost CostB = B.Cost; 5637 5638 unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop); 5639 5640 if (!A.Width.isScalable() && !B.Width.isScalable() && FoldTailByMasking && 5641 MaxTripCount) { 5642 // If we are folding the tail and the trip count is a known (possibly small) 5643 // constant, the trip count will be rounded up to an integer number of 5644 // iterations. The total cost will be PerIterationCost*ceil(TripCount/VF), 5645 // which we compare directly. When not folding the tail, the total cost will 5646 // be PerIterationCost*floor(TC/VF) + Scalar remainder cost, and so is 5647 // approximated with the per-lane cost below instead of using the tripcount 5648 // as here. 5649 auto RTCostA = CostA * divideCeil(MaxTripCount, A.Width.getFixedValue()); 5650 auto RTCostB = CostB * divideCeil(MaxTripCount, B.Width.getFixedValue()); 5651 return RTCostA < RTCostB; 5652 } 5653 5654 // Improve estimate for the vector width if it is scalable. 5655 unsigned EstimatedWidthA = A.Width.getKnownMinValue(); 5656 unsigned EstimatedWidthB = B.Width.getKnownMinValue(); 5657 if (Optional<unsigned> VScale = TTI.getVScaleForTuning()) { 5658 if (A.Width.isScalable()) 5659 EstimatedWidthA *= VScale.getValue(); 5660 if (B.Width.isScalable()) 5661 EstimatedWidthB *= VScale.getValue(); 5662 } 5663 5664 // Assume vscale may be larger than 1 (or the value being tuned for), 5665 // so that scalable vectorization is slightly favorable over fixed-width 5666 // vectorization. 5667 if (A.Width.isScalable() && !B.Width.isScalable()) 5668 return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA); 5669 5670 // To avoid the need for FP division: 5671 // (CostA / A.Width) < (CostB / B.Width) 5672 // <=> (CostA * B.Width) < (CostB * A.Width) 5673 return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA); 5674 } 5675 5676 VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor( 5677 const ElementCountSet &VFCandidates) { 5678 InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first; 5679 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n"); 5680 assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop"); 5681 assert(VFCandidates.count(ElementCount::getFixed(1)) && 5682 "Expected Scalar VF to be a candidate"); 5683 5684 const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost); 5685 VectorizationFactor ChosenFactor = ScalarCost; 5686 5687 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; 5688 if (ForceVectorization && VFCandidates.size() > 1) { 5689 // Ignore scalar width, because the user explicitly wants vectorization. 
5690 // Initialize cost to max so that VF = 2 is, at least, chosen during cost 5691 // evaluation. 5692 ChosenFactor.Cost = InstructionCost::getMax(); 5693 } 5694 5695 SmallVector<InstructionVFPair> InvalidCosts; 5696 for (const auto &i : VFCandidates) { 5697 // The cost for scalar VF=1 is already calculated, so ignore it. 5698 if (i.isScalar()) 5699 continue; 5700 5701 VectorizationCostTy C = expectedCost(i, &InvalidCosts); 5702 VectorizationFactor Candidate(i, C.first); 5703 5704 #ifndef NDEBUG 5705 unsigned AssumedMinimumVscale = 1; 5706 if (Optional<unsigned> VScale = TTI.getVScaleForTuning()) 5707 AssumedMinimumVscale = VScale.getValue(); 5708 unsigned Width = 5709 Candidate.Width.isScalable() 5710 ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale 5711 : Candidate.Width.getFixedValue(); 5712 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i 5713 << " costs: " << (Candidate.Cost / Width)); 5714 if (i.isScalable()) 5715 LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of " 5716 << AssumedMinimumVscale << ")"); 5717 LLVM_DEBUG(dbgs() << ".\n"); 5718 #endif 5719 5720 if (!C.second && !ForceVectorization) { 5721 LLVM_DEBUG( 5722 dbgs() << "LV: Not considering vector loop of width " << i 5723 << " because it will not generate any vector instructions.\n"); 5724 continue; 5725 } 5726 5727 // If profitable add it to ProfitableVF list. 5728 if (isMoreProfitable(Candidate, ScalarCost)) 5729 ProfitableVFs.push_back(Candidate); 5730 5731 if (isMoreProfitable(Candidate, ChosenFactor)) 5732 ChosenFactor = Candidate; 5733 } 5734 5735 // Emit a report of VFs with invalid costs in the loop. 5736 if (!InvalidCosts.empty()) { 5737 // Group the remarks per instruction, keeping the instruction order from 5738 // InvalidCosts. 5739 std::map<Instruction *, unsigned> Numbering; 5740 unsigned I = 0; 5741 for (auto &Pair : InvalidCosts) 5742 if (!Numbering.count(Pair.first)) 5743 Numbering[Pair.first] = I++; 5744 5745 // Sort the list, first on instruction(number) then on VF. 5746 llvm::sort(InvalidCosts, 5747 [&Numbering](InstructionVFPair &A, InstructionVFPair &B) { 5748 if (Numbering[A.first] != Numbering[B.first]) 5749 return Numbering[A.first] < Numbering[B.first]; 5750 ElementCountComparator ECC; 5751 return ECC(A.second, B.second); 5752 }); 5753 5754 // For a list of ordered instruction-vf pairs: 5755 // [(load, vf1), (load, vf2), (store, vf1)] 5756 // Group the instructions together to emit separate remarks for: 5757 // load (vf1, vf2) 5758 // store (vf1) 5759 auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts); 5760 auto Subset = ArrayRef<InstructionVFPair>(); 5761 do { 5762 if (Subset.empty()) 5763 Subset = Tail.take_front(1); 5764 5765 Instruction *I = Subset.front().first; 5766 5767 // If the next instruction is different, or if there are no other pairs, 5768 // emit a remark for the collated subset. e.g. 5769 // [(load, vf1), (load, vf2))] 5770 // to emit: 5771 // remark: invalid costs for 'load' at VF=(vf, vf2) 5772 if (Subset == Tail || Tail[Subset.size()].first != I) { 5773 std::string OutString; 5774 raw_string_ostream OS(OutString); 5775 assert(!Subset.empty() && "Unexpected empty range"); 5776 OS << "Instruction with invalid costs prevented vectorization at VF=("; 5777 for (auto &Pair : Subset) 5778 OS << (Pair.second == Subset.front().second ? 
"" : ", ") 5779 << Pair.second; 5780 OS << "):"; 5781 if (auto *CI = dyn_cast<CallInst>(I)) 5782 OS << " call to " << CI->getCalledFunction()->getName(); 5783 else 5784 OS << " " << I->getOpcodeName(); 5785 OS.flush(); 5786 reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I); 5787 Tail = Tail.drop_front(Subset.size()); 5788 Subset = {}; 5789 } else 5790 // Grow the subset by one element 5791 Subset = Tail.take_front(Subset.size() + 1); 5792 } while (!Tail.empty()); 5793 } 5794 5795 if (!EnableCondStoresVectorization && NumPredStores) { 5796 reportVectorizationFailure("There are conditional stores.", 5797 "store that is conditionally executed prevents vectorization", 5798 "ConditionalStore", ORE, TheLoop); 5799 ChosenFactor = ScalarCost; 5800 } 5801 5802 LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() && 5803 ChosenFactor.Cost >= ScalarCost.Cost) dbgs() 5804 << "LV: Vectorization seems to be not beneficial, " 5805 << "but was forced by a user.\n"); 5806 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n"); 5807 return ChosenFactor; 5808 } 5809 5810 bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization( 5811 const Loop &L, ElementCount VF) const { 5812 // Cross iteration phis such as reductions need special handling and are 5813 // currently unsupported. 5814 if (any_of(L.getHeader()->phis(), [&](PHINode &Phi) { 5815 return Legal->isFirstOrderRecurrence(&Phi) || 5816 Legal->isReductionVariable(&Phi); 5817 })) 5818 return false; 5819 5820 // Phis with uses outside of the loop require special handling and are 5821 // currently unsupported. 5822 for (auto &Entry : Legal->getInductionVars()) { 5823 // Look for uses of the value of the induction at the last iteration. 5824 Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch()); 5825 for (User *U : PostInc->users()) 5826 if (!L.contains(cast<Instruction>(U))) 5827 return false; 5828 // Look for uses of penultimate value of the induction. 5829 for (User *U : Entry.first->users()) 5830 if (!L.contains(cast<Instruction>(U))) 5831 return false; 5832 } 5833 5834 // Induction variables that are widened require special handling that is 5835 // currently not supported. 5836 if (any_of(Legal->getInductionVars(), [&](auto &Entry) { 5837 return !(this->isScalarAfterVectorization(Entry.first, VF) || 5838 this->isProfitableToScalarize(Entry.first, VF)); 5839 })) 5840 return false; 5841 5842 // Epilogue vectorization code has not been auditted to ensure it handles 5843 // non-latch exits properly. It may be fine, but it needs auditted and 5844 // tested. 5845 if (L.getExitingBlock() != L.getLoopLatch()) 5846 return false; 5847 5848 return true; 5849 } 5850 5851 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable( 5852 const ElementCount VF) const { 5853 // FIXME: We need a much better cost-model to take different parameters such 5854 // as register pressure, code size increase and cost of extra branches into 5855 // account. For now we apply a very crude heuristic and only consider loops 5856 // with vectorization factors larger than a certain value. 5857 // We also consider epilogue vectorization unprofitable for targets that don't 5858 // consider interleaving beneficial (eg. MVE). 
5859 if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1) 5860 return false; 5861 if (VF.getFixedValue() >= EpilogueVectorizationMinVF) 5862 return true; 5863 return false; 5864 } 5865 5866 VectorizationFactor 5867 LoopVectorizationCostModel::selectEpilogueVectorizationFactor( 5868 const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) { 5869 VectorizationFactor Result = VectorizationFactor::Disabled(); 5870 if (!EnableEpilogueVectorization) { 5871 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";); 5872 return Result; 5873 } 5874 5875 if (!isScalarEpilogueAllowed()) { 5876 LLVM_DEBUG( 5877 dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is " 5878 "allowed.\n";); 5879 return Result; 5880 } 5881 5882 // Not really a cost consideration, but check for unsupported cases here to 5883 // simplify the logic. 5884 if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) { 5885 LLVM_DEBUG( 5886 dbgs() << "LEV: Unable to vectorize epilogue because the loop is " 5887 "not a supported candidate.\n";); 5888 return Result; 5889 } 5890 5891 if (EpilogueVectorizationForceVF > 1) { 5892 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";); 5893 ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF); 5894 if (LVP.hasPlanWithVF(ForcedEC)) 5895 return {ForcedEC, 0}; 5896 else { 5897 LLVM_DEBUG( 5898 dbgs() 5899 << "LEV: Epilogue vectorization forced factor is not viable.\n";); 5900 return Result; 5901 } 5902 } 5903 5904 if (TheLoop->getHeader()->getParent()->hasOptSize() || 5905 TheLoop->getHeader()->getParent()->hasMinSize()) { 5906 LLVM_DEBUG( 5907 dbgs() 5908 << "LEV: Epilogue vectorization skipped due to opt for size.\n";); 5909 return Result; 5910 } 5911 5912 auto FixedMainLoopVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue()); 5913 if (MainLoopVF.isScalable()) 5914 LLVM_DEBUG( 5915 dbgs() << "LEV: Epilogue vectorization using scalable vectors not " 5916 "yet supported. Converting to fixed-width (VF=" 5917 << FixedMainLoopVF << ") instead\n"); 5918 5919 if (!isEpilogueVectorizationProfitable(FixedMainLoopVF)) { 5920 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for " 5921 "this loop\n"); 5922 return Result; 5923 } 5924 5925 for (auto &NextVF : ProfitableVFs) 5926 if (ElementCount::isKnownLT(NextVF.Width, FixedMainLoopVF) && 5927 (Result.Width.getFixedValue() == 1 || 5928 isMoreProfitable(NextVF, Result)) && 5929 LVP.hasPlanWithVF(NextVF.Width)) 5930 Result = NextVF; 5931 5932 if (Result != VectorizationFactor::Disabled()) 5933 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = " 5934 << Result.Width.getFixedValue() << "\n";); 5935 return Result; 5936 } 5937 5938 std::pair<unsigned, unsigned> 5939 LoopVectorizationCostModel::getSmallestAndWidestTypes() { 5940 unsigned MinWidth = -1U; 5941 unsigned MaxWidth = 8; 5942 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 5943 // For in-loop reductions, no element types are added to ElementTypesInLoop 5944 // if there are no loads/stores in the loop. In this case, check through the 5945 // reduction variables to determine the maximum width. 5946 if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) { 5947 // Reset MaxWidth so that we can find the smallest type used by recurrences 5948 // in the loop. 
5949 MaxWidth = -1U; 5950 for (auto &PhiDescriptorPair : Legal->getReductionVars()) { 5951 const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second; 5952 // When finding the min width used by the recurrence we need to account 5953 // for casts on the input operands of the recurrence. 5954 MaxWidth = std::min<unsigned>( 5955 MaxWidth, std::min<unsigned>( 5956 RdxDesc.getMinWidthCastToRecurrenceTypeInBits(), 5957 RdxDesc.getRecurrenceType()->getScalarSizeInBits())); 5958 } 5959 } else { 5960 for (Type *T : ElementTypesInLoop) { 5961 MinWidth = std::min<unsigned>( 5962 MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize()); 5963 MaxWidth = std::max<unsigned>( 5964 MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize()); 5965 } 5966 } 5967 return {MinWidth, MaxWidth}; 5968 } 5969 5970 void LoopVectorizationCostModel::collectElementTypesForWidening() { 5971 ElementTypesInLoop.clear(); 5972 // For each block. 5973 for (BasicBlock *BB : TheLoop->blocks()) { 5974 // For each instruction in the loop. 5975 for (Instruction &I : BB->instructionsWithoutDebug()) { 5976 Type *T = I.getType(); 5977 5978 // Skip ignored values. 5979 if (ValuesToIgnore.count(&I)) 5980 continue; 5981 5982 // Only examine Loads, Stores and PHINodes. 5983 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I)) 5984 continue; 5985 5986 // Examine PHI nodes that are reduction variables. Update the type to 5987 // account for the recurrence type. 5988 if (auto *PN = dyn_cast<PHINode>(&I)) { 5989 if (!Legal->isReductionVariable(PN)) 5990 continue; 5991 const RecurrenceDescriptor &RdxDesc = 5992 Legal->getReductionVars().find(PN)->second; 5993 if (PreferInLoopReductions || useOrderedReductions(RdxDesc) || 5994 TTI.preferInLoopReduction(RdxDesc.getOpcode(), 5995 RdxDesc.getRecurrenceType(), 5996 TargetTransformInfo::ReductionFlags())) 5997 continue; 5998 T = RdxDesc.getRecurrenceType(); 5999 } 6000 6001 // Examine the stored values. 6002 if (auto *ST = dyn_cast<StoreInst>(&I)) 6003 T = ST->getValueOperand()->getType(); 6004 6005 assert(T->isSized() && 6006 "Expected the load/store/recurrence type to be sized"); 6007 6008 ElementTypesInLoop.insert(T); 6009 } 6010 } 6011 } 6012 6013 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, 6014 unsigned LoopCost) { 6015 // -- The interleave heuristics -- 6016 // We interleave the loop in order to expose ILP and reduce the loop overhead. 6017 // There are many micro-architectural considerations that we can't predict 6018 // at this level. For example, frontend pressure (on decode or fetch) due to 6019 // code size, or the number and capabilities of the execution ports. 6020 // 6021 // We use the following heuristics to select the interleave count: 6022 // 1. If the code has reductions, then we interleave to break the cross 6023 // iteration dependency. 6024 // 2. If the loop is really small, then we interleave to reduce the loop 6025 // overhead. 6026 // 3. We don't interleave if we think that we will spill registers to memory 6027 // due to the increased register pressure. 6028 6029 if (!isScalarEpilogueAllowed()) 6030 return 1; 6031 6032 // We used the distance for the interleave count. 6033 if (Legal->getMaxSafeDepDistBytes() != -1U) 6034 return 1; 6035 6036 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); 6037 const bool HasReductions = !Legal->getReductionVars().empty(); 6038 // Do not interleave loops with a relatively small known or estimated trip 6039 // count. 
But we will interleave when InterleaveSmallLoopScalarReduction is 6040 // enabled, and the code has scalar reductions(HasReductions && VF = 1), 6041 // because with the above conditions interleaving can expose ILP and break 6042 // cross iteration dependences for reductions. 6043 if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) && 6044 !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar())) 6045 return 1; 6046 6047 RegisterUsage R = calculateRegisterUsage({VF})[0]; 6048 // We divide by these constants so assume that we have at least one 6049 // instruction that uses at least one register. 6050 for (auto& pair : R.MaxLocalUsers) { 6051 pair.second = std::max(pair.second, 1U); 6052 } 6053 6054 // We calculate the interleave count using the following formula. 6055 // Subtract the number of loop invariants from the number of available 6056 // registers. These registers are used by all of the interleaved instances. 6057 // Next, divide the remaining registers by the number of registers that is 6058 // required by the loop, in order to estimate how many parallel instances 6059 // fit without causing spills. All of this is rounded down if necessary to be 6060 // a power of two. We want power of two interleave count to simplify any 6061 // addressing operations or alignment considerations. 6062 // We also want power of two interleave counts to ensure that the induction 6063 // variable of the vector loop wraps to zero, when tail is folded by masking; 6064 // this currently happens when OptForSize, in which case IC is set to 1 above. 6065 unsigned IC = UINT_MAX; 6066 6067 for (auto& pair : R.MaxLocalUsers) { 6068 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 6069 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters 6070 << " registers of " 6071 << TTI.getRegisterClassName(pair.first) << " register class\n"); 6072 if (VF.isScalar()) { 6073 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) 6074 TargetNumRegisters = ForceTargetNumScalarRegs; 6075 } else { 6076 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) 6077 TargetNumRegisters = ForceTargetNumVectorRegs; 6078 } 6079 unsigned MaxLocalUsers = pair.second; 6080 unsigned LoopInvariantRegs = 0; 6081 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) 6082 LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; 6083 6084 unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers); 6085 // Don't count the induction variable as interleaved. 6086 if (EnableIndVarRegisterHeur) { 6087 TmpIC = 6088 PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) / 6089 std::max(1U, (MaxLocalUsers - 1))); 6090 } 6091 6092 IC = std::min(IC, TmpIC); 6093 } 6094 6095 // Clamp the interleave ranges to reasonable counts. 6096 unsigned MaxInterleaveCount = 6097 TTI.getMaxInterleaveFactor(VF.getKnownMinValue()); 6098 6099 // Check if the user has overridden the max. 6100 if (VF.isScalar()) { 6101 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) 6102 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; 6103 } else { 6104 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) 6105 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; 6106 } 6107 6108 // If trip count is known or estimated compile time constant, limit the 6109 // interleave count to be less than the trip count divided by VF, provided it 6110 // is at least 1. 6111 // 6112 // For scalable vectors we can't know if interleaving is beneficial. 
It may 6113 // not be beneficial for small loops if none of the lanes in the second vector 6114 // iterations is enabled. However, for larger loops, there is likely to be a 6115 // similar benefit as for fixed-width vectors. For now, we choose to leave 6116 // the InterleaveCount as if vscale is '1', although if some information about 6117 // the vector is known (e.g. min vector size), we can make a better decision. 6118 if (BestKnownTC) { 6119 MaxInterleaveCount = 6120 std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount); 6121 // Make sure MaxInterleaveCount is greater than 0. 6122 MaxInterleaveCount = std::max(1u, MaxInterleaveCount); 6123 } 6124 6125 assert(MaxInterleaveCount > 0 && 6126 "Maximum interleave count must be greater than 0"); 6127 6128 // Clamp the calculated IC to be between the 1 and the max interleave count 6129 // that the target and trip count allows. 6130 if (IC > MaxInterleaveCount) 6131 IC = MaxInterleaveCount; 6132 else 6133 // Make sure IC is greater than 0. 6134 IC = std::max(1u, IC); 6135 6136 assert(IC > 0 && "Interleave count must be greater than 0."); 6137 6138 // If we did not calculate the cost for VF (because the user selected the VF) 6139 // then we calculate the cost of VF here. 6140 if (LoopCost == 0) { 6141 InstructionCost C = expectedCost(VF).first; 6142 assert(C.isValid() && "Expected to have chosen a VF with valid cost"); 6143 LoopCost = *C.getValue(); 6144 } 6145 6146 assert(LoopCost && "Non-zero loop cost expected"); 6147 6148 // Interleave if we vectorized this loop and there is a reduction that could 6149 // benefit from interleaving. 6150 if (VF.isVector() && HasReductions) { 6151 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); 6152 return IC; 6153 } 6154 6155 // Note that if we've already vectorized the loop we will have done the 6156 // runtime check and so interleaving won't require further checks. 6157 bool InterleavingRequiresRuntimePointerCheck = 6158 (VF.isScalar() && Legal->getRuntimePointerChecking()->Need); 6159 6160 // We want to interleave small loops in order to reduce the loop overhead and 6161 // potentially expose ILP opportunities. 6162 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n' 6163 << "LV: IC is " << IC << '\n' 6164 << "LV: VF is " << VF << '\n'); 6165 const bool AggressivelyInterleaveReductions = 6166 TTI.enableAggressiveInterleaving(HasReductions); 6167 if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) { 6168 // We assume that the cost overhead is 1 and we use the cost model 6169 // to estimate the cost of the loop and interleave until the cost of the 6170 // loop overhead is about 5% of the cost of the loop. 6171 unsigned SmallIC = 6172 std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost)); 6173 6174 // Interleave until store/load ports (estimated by max interleave count) are 6175 // saturated. 6176 unsigned NumStores = Legal->getNumStores(); 6177 unsigned NumLoads = Legal->getNumLoads(); 6178 unsigned StoresIC = IC / (NumStores ? NumStores : 1); 6179 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1); 6180 6181 // There is little point in interleaving for reductions containing selects 6182 // and compares when VF=1 since it may just create more overhead than it's 6183 // worth for loops with small trip counts. This is because we still have to 6184 // do the final reduction after the loop. 
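      // Illustrative note (not part of the original source): e.g. a scalar
      // min/max reduction lowered as a compare plus select still has to merge
      // every interleaved partial result after the loop, so for a short trip
      // count that post-loop merge can cost more than the interleaving saves.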
6185 bool HasSelectCmpReductions = 6186 HasReductions && 6187 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 6188 const RecurrenceDescriptor &RdxDesc = Reduction.second; 6189 return RecurrenceDescriptor::isSelectCmpRecurrenceKind( 6190 RdxDesc.getRecurrenceKind()); 6191 }); 6192 if (HasSelectCmpReductions) { 6193 LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n"); 6194 return 1; 6195 } 6196 6197 // If we have a scalar reduction (vector reductions are already dealt with 6198 // by this point), we can increase the critical path length if the loop 6199 // we're interleaving is inside another loop. For tree-wise reductions 6200 // set the limit to 2, and for ordered reductions it's best to disable 6201 // interleaving entirely. 6202 if (HasReductions && TheLoop->getLoopDepth() > 1) { 6203 bool HasOrderedReductions = 6204 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 6205 const RecurrenceDescriptor &RdxDesc = Reduction.second; 6206 return RdxDesc.isOrdered(); 6207 }); 6208 if (HasOrderedReductions) { 6209 LLVM_DEBUG( 6210 dbgs() << "LV: Not interleaving scalar ordered reductions.\n"); 6211 return 1; 6212 } 6213 6214 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC); 6215 SmallIC = std::min(SmallIC, F); 6216 StoresIC = std::min(StoresIC, F); 6217 LoadsIC = std::min(LoadsIC, F); 6218 } 6219 6220 if (EnableLoadStoreRuntimeInterleave && 6221 std::max(StoresIC, LoadsIC) > SmallIC) { 6222 LLVM_DEBUG( 6223 dbgs() << "LV: Interleaving to saturate store or load ports.\n"); 6224 return std::max(StoresIC, LoadsIC); 6225 } 6226 6227 // If there are scalar reductions and TTI has enabled aggressive 6228 // interleaving for reductions, we will interleave to expose ILP. 6229 if (InterleaveSmallLoopScalarReduction && VF.isScalar() && 6230 AggressivelyInterleaveReductions) { 6231 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 6232 // Interleave no less than SmallIC but not as aggressive as the normal IC 6233 // to satisfy the rare situation when resources are too limited. 6234 return std::max(IC / 2, SmallIC); 6235 } else { 6236 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n"); 6237 return SmallIC; 6238 } 6239 } 6240 6241 // Interleave if this is a large loop (small loops are already dealt with by 6242 // this point) that could benefit from interleaving. 6243 if (AggressivelyInterleaveReductions) { 6244 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 6245 return IC; 6246 } 6247 6248 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n"); 6249 return 1; 6250 } 6251 6252 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8> 6253 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) { 6254 // This function calculates the register usage by measuring the highest number 6255 // of values that are alive at a single location. Obviously, this is a very 6256 // rough estimation. We scan the loop in a topological order in order and 6257 // assign a number to each instruction. We use RPO to ensure that defs are 6258 // met before their users. We assume that each instruction that has in-loop 6259 // users starts an interval. We record every time that an in-loop value is 6260 // used, so we have a list of the first and last occurrences of each 6261 // instruction. Next, we transpose this data structure into a multi map that 6262 // holds the list of intervals that *end* at a specific location. This multi 6263 // map allows us to perform a linear search. 
We scan the instructions linearly 6264 // and record each time that a new interval starts, by placing it in a set. 6265 // If we find this value in the multi-map then we remove it from the set. 6266 // The max register usage is the maximum size of the set. 6267 // We also search for instructions that are defined outside the loop, but are 6268 // used inside the loop. We need this number separately from the max-interval 6269 // usage number because when we unroll, loop-invariant values do not take 6270 // more register. 6271 LoopBlocksDFS DFS(TheLoop); 6272 DFS.perform(LI); 6273 6274 RegisterUsage RU; 6275 6276 // Each 'key' in the map opens a new interval. The values 6277 // of the map are the index of the 'last seen' usage of the 6278 // instruction that is the key. 6279 using IntervalMap = DenseMap<Instruction *, unsigned>; 6280 6281 // Maps instruction to its index. 6282 SmallVector<Instruction *, 64> IdxToInstr; 6283 // Marks the end of each interval. 6284 IntervalMap EndPoint; 6285 // Saves the list of instruction indices that are used in the loop. 6286 SmallPtrSet<Instruction *, 8> Ends; 6287 // Saves the list of values that are used in the loop but are 6288 // defined outside the loop, such as arguments and constants. 6289 SmallPtrSet<Value *, 8> LoopInvariants; 6290 6291 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 6292 for (Instruction &I : BB->instructionsWithoutDebug()) { 6293 IdxToInstr.push_back(&I); 6294 6295 // Save the end location of each USE. 6296 for (Value *U : I.operands()) { 6297 auto *Instr = dyn_cast<Instruction>(U); 6298 6299 // Ignore non-instruction values such as arguments, constants, etc. 6300 if (!Instr) 6301 continue; 6302 6303 // If this instruction is outside the loop then record it and continue. 6304 if (!TheLoop->contains(Instr)) { 6305 LoopInvariants.insert(Instr); 6306 continue; 6307 } 6308 6309 // Overwrite previous end points. 6310 EndPoint[Instr] = IdxToInstr.size(); 6311 Ends.insert(Instr); 6312 } 6313 } 6314 } 6315 6316 // Saves the list of intervals that end with the index in 'key'. 6317 using InstrList = SmallVector<Instruction *, 2>; 6318 DenseMap<unsigned, InstrList> TransposeEnds; 6319 6320 // Transpose the EndPoints to a list of values that end at each index. 6321 for (auto &Interval : EndPoint) 6322 TransposeEnds[Interval.second].push_back(Interval.first); 6323 6324 SmallPtrSet<Instruction *, 8> OpenIntervals; 6325 SmallVector<RegisterUsage, 8> RUs(VFs.size()); 6326 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size()); 6327 6328 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); 6329 6330 // A lambda that gets the register usage for the given type and VF. 6331 const auto &TTICapture = TTI; 6332 auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned { 6333 if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty)) 6334 return 0; 6335 InstructionCost::CostType RegUsage = 6336 *TTICapture.getRegUsageForType(VectorType::get(Ty, VF)).getValue(); 6337 assert(RegUsage >= 0 && RegUsage <= std::numeric_limits<unsigned>::max() && 6338 "Nonsensical values for register usage."); 6339 return RegUsage; 6340 }; 6341 6342 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { 6343 Instruction *I = IdxToInstr[i]; 6344 6345 // Remove all of the instructions that end at this location. 6346 InstrList &List = TransposeEnds[i]; 6347 for (Instruction *ToRemove : List) 6348 OpenIntervals.erase(ToRemove); 6349 6350 // Ignore instructions that are never used within the loop. 
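    // Illustrative example (not part of the original source; the names are
    // hypothetical): for the chain %a = load ...; %b = add %a, 1; store %b,
    // the last use of %a is the add, so %a's interval is closed by the erase
    // loop above once the scan reaches the store, and only %b is still counted
    // as live at that point.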
6351 if (!Ends.count(I)) 6352 continue; 6353 6354 // Skip ignored values. 6355 if (ValuesToIgnore.count(I)) 6356 continue; 6357 6358 // For each VF find the maximum usage of registers. 6359 for (unsigned j = 0, e = VFs.size(); j < e; ++j) { 6360 // Count the number of live intervals. 6361 SmallMapVector<unsigned, unsigned, 4> RegUsage; 6362 6363 if (VFs[j].isScalar()) { 6364 for (auto Inst : OpenIntervals) { 6365 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6366 if (RegUsage.find(ClassID) == RegUsage.end()) 6367 RegUsage[ClassID] = 1; 6368 else 6369 RegUsage[ClassID] += 1; 6370 } 6371 } else { 6372 collectUniformsAndScalars(VFs[j]); 6373 for (auto Inst : OpenIntervals) { 6374 // Skip ignored values for VF > 1. 6375 if (VecValuesToIgnore.count(Inst)) 6376 continue; 6377 if (isScalarAfterVectorization(Inst, VFs[j])) { 6378 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6379 if (RegUsage.find(ClassID) == RegUsage.end()) 6380 RegUsage[ClassID] = 1; 6381 else 6382 RegUsage[ClassID] += 1; 6383 } else { 6384 unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType()); 6385 if (RegUsage.find(ClassID) == RegUsage.end()) 6386 RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]); 6387 else 6388 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); 6389 } 6390 } 6391 } 6392 6393 for (auto& pair : RegUsage) { 6394 if (MaxUsages[j].find(pair.first) != MaxUsages[j].end()) 6395 MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second); 6396 else 6397 MaxUsages[j][pair.first] = pair.second; 6398 } 6399 } 6400 6401 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " 6402 << OpenIntervals.size() << '\n'); 6403 6404 // Add the current instruction to the list of open intervals. 6405 OpenIntervals.insert(I); 6406 } 6407 6408 for (unsigned i = 0, e = VFs.size(); i < e; ++i) { 6409 SmallMapVector<unsigned, unsigned, 4> Invariant; 6410 6411 for (auto Inst : LoopInvariants) { 6412 unsigned Usage = 6413 VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]); 6414 unsigned ClassID = 6415 TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType()); 6416 if (Invariant.find(ClassID) == Invariant.end()) 6417 Invariant[ClassID] = Usage; 6418 else 6419 Invariant[ClassID] += Usage; 6420 } 6421 6422 LLVM_DEBUG({ 6423 dbgs() << "LV(REG): VF = " << VFs[i] << '\n'; 6424 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size() 6425 << " item\n"; 6426 for (const auto &pair : MaxUsages[i]) { 6427 dbgs() << "LV(REG): RegisterClass: " 6428 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6429 << " registers\n"; 6430 } 6431 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size() 6432 << " item\n"; 6433 for (const auto &pair : Invariant) { 6434 dbgs() << "LV(REG): RegisterClass: " 6435 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6436 << " registers\n"; 6437 } 6438 }); 6439 6440 RU.LoopInvariantRegs = Invariant; 6441 RU.MaxLocalUsers = MaxUsages[i]; 6442 RUs[i] = RU; 6443 } 6444 6445 return RUs; 6446 } 6447 6448 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I, 6449 ElementCount VF) { 6450 // TODO: Cost model for emulated masked load/store is completely 6451 // broken. This hack guides the cost model to use an artificially 6452 // high enough value to practically disable vectorization with such 6453 // operations, except where previously deployed legality hack allowed 6454 // using very low cost values. 
This is to avoid regressions coming simply 6455 // from moving the "masked load/store" check from legality to the cost model. 6456 // Masked Load/Gather emulation was previously never allowed. 6457 // A limited amount of Masked Store/Scatter emulation was allowed. 6458 assert(isPredicatedInst(I, VF) && "Expecting a scalar emulated instruction"); 6459 return isa<LoadInst>(I) || 6460 (isa<StoreInst>(I) && 6461 NumPredStores > NumberOfStoresToPredicate); 6462 } 6463 6464 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) { 6465 // If we aren't vectorizing the loop, or if we've already collected the 6466 // instructions to scalarize, there's nothing to do. Collection may already 6467 // have occurred if we have a user-selected VF and are now computing the 6468 // expected cost for interleaving. 6469 if (VF.isScalar() || VF.isZero() || 6470 InstsToScalarize.find(VF) != InstsToScalarize.end()) 6471 return; 6472 6473 // Initialize a mapping for VF in InstsToScalarize. If we find that it's 6474 // not profitable to scalarize any instructions, the presence of VF in the 6475 // map will indicate that we've analyzed it already. 6476 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF]; 6477 6478 // Find all the instructions that are scalar with predication in the loop and 6479 // determine if it would be better to not if-convert the blocks they are in. 6480 // If so, we also record the instructions to scalarize. 6481 for (BasicBlock *BB : TheLoop->blocks()) { 6482 if (!blockNeedsPredicationForAnyReason(BB)) 6483 continue; 6484 for (Instruction &I : *BB) 6485 if (isScalarWithPredication(&I, VF)) { 6486 ScalarCostsTy ScalarCosts; 6487 // Do not apply discount if scalable, because that would lead to 6488 // invalid scalarization costs. 6489 // Do not apply discount logic if hacked cost is needed 6490 // for emulated masked memrefs. 6491 if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I, VF) && 6492 computePredInstDiscount(&I, ScalarCosts, VF) >= 0) 6493 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end()); 6494 // Remember that BB will remain after vectorization. 6495 PredicatedBBsAfterVectorization.insert(BB); 6496 } 6497 } 6498 } 6499 6500 int LoopVectorizationCostModel::computePredInstDiscount( 6501 Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) { 6502 assert(!isUniformAfterVectorization(PredInst, VF) && 6503 "Instruction marked uniform-after-vectorization will be predicated"); 6504 6505 // Initialize the discount to zero, meaning that the scalar version and the 6506 // vector version cost the same. 6507 InstructionCost Discount = 0; 6508 6509 // Holds instructions to analyze. The instructions we visit are mapped in 6510 // ScalarCosts. Those instructions are the ones that would be scalarized if 6511 // we find that the scalar version costs less. 6512 SmallVector<Instruction *, 8> Worklist; 6513 6514 // Returns true if the given instruction can be scalarized. 6515 auto canBeScalarized = [&](Instruction *I) -> bool { 6516 // We only attempt to scalarize instructions forming a single-use chain 6517 // from the original predicated block that would otherwise be vectorized. 6518 // Although not strictly necessary, we give up on instructions we know will 6519 // already be scalar to avoid traversing chains that are unlikely to be 6520 // beneficial.
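    // Illustrative example (not part of the original source; the names are
    // hypothetical): a temporary such as %t = add %x, 1 whose only user is the
    // predicated instruction being analyzed, and which lives in the same block,
    // can join the scalarized chain; a value with a second user outside that
    // chain fails the hasOneUse() check below.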
6521 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() || 6522 isScalarAfterVectorization(I, VF)) 6523 return false; 6524 6525 // If the instruction is scalar with predication, it will be analyzed 6526 // separately. We ignore it within the context of PredInst. 6527 if (isScalarWithPredication(I, VF)) 6528 return false; 6529 6530 // If any of the instruction's operands are uniform after vectorization, 6531 // the instruction cannot be scalarized. This prevents, for example, a 6532 // masked load from being scalarized. 6533 // 6534 // We assume we will only emit a value for lane zero of an instruction 6535 // marked uniform after vectorization, rather than VF identical values. 6536 // Thus, if we scalarize an instruction that uses a uniform, we would 6537 // create uses of values corresponding to the lanes we aren't emitting code 6538 // for. This behavior can be changed by allowing getScalarValue to clone 6539 // the lane zero values for uniforms rather than asserting. 6540 for (Use &U : I->operands()) 6541 if (auto *J = dyn_cast<Instruction>(U.get())) 6542 if (isUniformAfterVectorization(J, VF)) 6543 return false; 6544 6545 // Otherwise, we can scalarize the instruction. 6546 return true; 6547 }; 6548 6549 // Compute the expected cost discount from scalarizing the entire expression 6550 // feeding the predicated instruction. We currently only consider expressions 6551 // that are single-use instruction chains. 6552 Worklist.push_back(PredInst); 6553 while (!Worklist.empty()) { 6554 Instruction *I = Worklist.pop_back_val(); 6555 6556 // If we've already analyzed the instruction, there's nothing to do. 6557 if (ScalarCosts.find(I) != ScalarCosts.end()) 6558 continue; 6559 6560 // Compute the cost of the vector instruction. Note that this cost already 6561 // includes the scalarization overhead of the predicated instruction. 6562 InstructionCost VectorCost = getInstructionCost(I, VF).first; 6563 6564 // Compute the cost of the scalarized instruction. This cost is the cost of 6565 // the instruction as if it wasn't if-converted and instead remained in the 6566 // predicated block. We will scale this cost by block probability after 6567 // computing the scalarization overhead. 6568 InstructionCost ScalarCost = 6569 VF.getFixedValue() * 6570 getInstructionCost(I, ElementCount::getFixed(1)).first; 6571 6572 // Compute the scalarization overhead of needed insertelement instructions 6573 // and phi nodes. 6574 if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) { 6575 ScalarCost += TTI.getScalarizationOverhead( 6576 cast<VectorType>(ToVectorTy(I->getType(), VF)), 6577 APInt::getAllOnes(VF.getFixedValue()), true, false); 6578 ScalarCost += 6579 VF.getFixedValue() * 6580 TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput); 6581 } 6582 6583 // Compute the scalarization overhead of needed extractelement 6584 // instructions. For each of the instruction's operands, if the operand can 6585 // be scalarized, add it to the worklist; otherwise, account for the 6586 // overhead. 
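      // Illustrative note (not part of the original source): if an operand
      // stays vectorized, each scalarized copy of I has to read its lane of
      // that operand through an extractelement; the getScalarizationOverhead
      // call below, guarded by needsExtract, is what accounts for that cost.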
6587 for (Use &U : I->operands()) 6588 if (auto *J = dyn_cast<Instruction>(U.get())) { 6589 assert(VectorType::isValidElementType(J->getType()) && 6590 "Instruction has non-scalar type"); 6591 if (canBeScalarized(J)) 6592 Worklist.push_back(J); 6593 else if (needsExtract(J, VF)) { 6594 ScalarCost += TTI.getScalarizationOverhead( 6595 cast<VectorType>(ToVectorTy(J->getType(), VF)), 6596 APInt::getAllOnes(VF.getFixedValue()), false, true); 6597 } 6598 } 6599 6600 // Scale the total scalar cost by block probability. 6601 ScalarCost /= getReciprocalPredBlockProb(); 6602 6603 // Compute the discount. A non-negative discount means the vector version 6604 // of the instruction costs more, and scalarizing would be beneficial. 6605 Discount += VectorCost - ScalarCost; 6606 ScalarCosts[I] = ScalarCost; 6607 } 6608 6609 return *Discount.getValue(); 6610 } 6611 6612 LoopVectorizationCostModel::VectorizationCostTy 6613 LoopVectorizationCostModel::expectedCost( 6614 ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) { 6615 VectorizationCostTy Cost; 6616 6617 // For each block. 6618 for (BasicBlock *BB : TheLoop->blocks()) { 6619 VectorizationCostTy BlockCost; 6620 6621 // For each instruction in the old loop. 6622 for (Instruction &I : BB->instructionsWithoutDebug()) { 6623 // Skip ignored values. 6624 if (ValuesToIgnore.count(&I) || 6625 (VF.isVector() && VecValuesToIgnore.count(&I))) 6626 continue; 6627 6628 VectorizationCostTy C = getInstructionCost(&I, VF); 6629 6630 // Check if we should override the cost. 6631 if (C.first.isValid() && 6632 ForceTargetInstructionCost.getNumOccurrences() > 0) 6633 C.first = InstructionCost(ForceTargetInstructionCost); 6634 6635 // Keep a list of instructions with invalid costs. 6636 if (Invalid && !C.first.isValid()) 6637 Invalid->emplace_back(&I, VF); 6638 6639 BlockCost.first += C.first; 6640 BlockCost.second |= C.second; 6641 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first 6642 << " for VF " << VF << " For instruction: " << I 6643 << '\n'); 6644 } 6645 6646 // If we are vectorizing a predicated block, it will have been 6647 // if-converted. This means that the block's instructions (aside from 6648 // stores and instructions that may divide by zero) will now be 6649 // unconditionally executed. For the scalar case, we may not always execute 6650 // the predicated block, if it is an if-else block. Thus, scale the block's 6651 // cost by the probability of executing it. blockNeedsPredication from 6652 // Legal is used so as to not include all blocks in tail folded loops. 6653 if (VF.isScalar() && Legal->blockNeedsPredication(BB)) 6654 BlockCost.first /= getReciprocalPredBlockProb(); 6655 6656 Cost.first += BlockCost.first; 6657 Cost.second |= BlockCost.second; 6658 } 6659 6660 return Cost; 6661 } 6662 6663 /// Gets Address Access SCEV after verifying that the access pattern 6664 /// is loop invariant except the induction variable dependence. 6665 /// 6666 /// This SCEV can be sent to the Target in order to estimate the address 6667 /// calculation cost. 6668 static const SCEV *getAddressAccessSCEV( 6669 Value *Ptr, 6670 LoopVectorizationLegality *Legal, 6671 PredicatedScalarEvolution &PSE, 6672 const Loop *TheLoop) { 6673 6674 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); 6675 if (!Gep) 6676 return nullptr; 6677 6678 // We are looking for a gep with all loop invariant indices except for one 6679 // which should be an induction variable. 
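  // Illustrative example (not part of the original source; the names are
  // hypothetical): a pointer such as
  //   %p = getelementptr %Base, %inv0, %iv, %inv1
  // qualifies, because every index except the induction variable %iv is loop
  // invariant, so its SCEV can be handed to the target for address-cost
  // estimation; a GEP with a loop-varying, non-induction index does not.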
6680 auto SE = PSE.getSE(); 6681 unsigned NumOperands = Gep->getNumOperands(); 6682 for (unsigned i = 1; i < NumOperands; ++i) { 6683 Value *Opd = Gep->getOperand(i); 6684 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 6685 !Legal->isInductionVariable(Opd)) 6686 return nullptr; 6687 } 6688 6689 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. 6690 return PSE.getSCEV(Ptr); 6691 } 6692 6693 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { 6694 return Legal->hasStride(I->getOperand(0)) || 6695 Legal->hasStride(I->getOperand(1)); 6696 } 6697 6698 InstructionCost 6699 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, 6700 ElementCount VF) { 6701 assert(VF.isVector() && 6702 "Scalarization cost of instruction implies vectorization."); 6703 if (VF.isScalable()) 6704 return InstructionCost::getInvalid(); 6705 6706 Type *ValTy = getLoadStoreType(I); 6707 auto SE = PSE.getSE(); 6708 6709 unsigned AS = getLoadStoreAddressSpace(I); 6710 Value *Ptr = getLoadStorePointerOperand(I); 6711 Type *PtrTy = ToVectorTy(Ptr->getType(), VF); 6712 // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost` 6713 // that it is being called from this specific place. 6714 6715 // Figure out whether the access is strided and get the stride value 6716 // if it's known in compile time 6717 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); 6718 6719 // Get the cost of the scalar memory instruction and address computation. 6720 InstructionCost Cost = 6721 VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); 6722 6723 // Don't pass *I here, since it is scalar but will actually be part of a 6724 // vectorized loop where the user of it is a vectorized instruction. 6725 const Align Alignment = getLoadStoreAlignment(I); 6726 Cost += VF.getKnownMinValue() * 6727 TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, 6728 AS, TTI::TCK_RecipThroughput); 6729 6730 // Get the overhead of the extractelement and insertelement instructions 6731 // we might create due to scalarization. 6732 Cost += getScalarizationOverhead(I, VF); 6733 6734 // If we have a predicated load/store, it will need extra i1 extracts and 6735 // conditional branches, but may not be executed for each vector lane. Scale 6736 // the cost by the probability of executing the predicated block. 6737 if (isPredicatedInst(I, VF)) { 6738 Cost /= getReciprocalPredBlockProb(); 6739 6740 // Add the cost of an i1 extract and a branch 6741 auto *Vec_i1Ty = 6742 VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF); 6743 Cost += TTI.getScalarizationOverhead( 6744 Vec_i1Ty, APInt::getAllOnes(VF.getKnownMinValue()), 6745 /*Insert=*/false, /*Extract=*/true); 6746 Cost += TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput); 6747 6748 if (useEmulatedMaskMemRefHack(I, VF)) 6749 // Artificially setting to a high enough value to practically disable 6750 // vectorization with such operations. 
6751 Cost = 3000000; 6752 } 6753 6754 return Cost; 6755 } 6756 6757 InstructionCost 6758 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, 6759 ElementCount VF) { 6760 Type *ValTy = getLoadStoreType(I); 6761 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6762 Value *Ptr = getLoadStorePointerOperand(I); 6763 unsigned AS = getLoadStoreAddressSpace(I); 6764 int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr); 6765 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6766 6767 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6768 "Stride should be 1 or -1 for consecutive memory access"); 6769 const Align Alignment = getLoadStoreAlignment(I); 6770 InstructionCost Cost = 0; 6771 if (Legal->isMaskRequired(I)) 6772 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6773 CostKind); 6774 else 6775 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6776 CostKind, I); 6777 6778 bool Reverse = ConsecutiveStride < 0; 6779 if (Reverse) 6780 Cost += 6781 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0); 6782 return Cost; 6783 } 6784 6785 InstructionCost 6786 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, 6787 ElementCount VF) { 6788 assert(Legal->isUniformMemOp(*I)); 6789 6790 Type *ValTy = getLoadStoreType(I); 6791 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6792 const Align Alignment = getLoadStoreAlignment(I); 6793 unsigned AS = getLoadStoreAddressSpace(I); 6794 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6795 if (isa<LoadInst>(I)) { 6796 return TTI.getAddressComputationCost(ValTy) + 6797 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS, 6798 CostKind) + 6799 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); 6800 } 6801 StoreInst *SI = cast<StoreInst>(I); 6802 6803 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); 6804 return TTI.getAddressComputationCost(ValTy) + 6805 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, 6806 CostKind) + 6807 (isLoopInvariantStoreValue 6808 ? 0 6809 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, 6810 VF.getKnownMinValue() - 1)); 6811 } 6812 6813 InstructionCost 6814 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, 6815 ElementCount VF) { 6816 Type *ValTy = getLoadStoreType(I); 6817 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6818 const Align Alignment = getLoadStoreAlignment(I); 6819 const Value *Ptr = getLoadStorePointerOperand(I); 6820 6821 return TTI.getAddressComputationCost(VectorTy) + 6822 TTI.getGatherScatterOpCost( 6823 I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment, 6824 TargetTransformInfo::TCK_RecipThroughput, I); 6825 } 6826 6827 InstructionCost 6828 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, 6829 ElementCount VF) { 6830 // TODO: Once we have support for interleaving with scalable vectors 6831 // we can calculate the cost properly here. 
6832 if (VF.isScalable()) 6833 return InstructionCost::getInvalid(); 6834 6835 Type *ValTy = getLoadStoreType(I); 6836 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6837 unsigned AS = getLoadStoreAddressSpace(I); 6838 6839 auto Group = getInterleavedAccessGroup(I); 6840 assert(Group && "Fail to get an interleaved access group."); 6841 6842 unsigned InterleaveFactor = Group->getFactor(); 6843 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); 6844 6845 // Holds the indices of existing members in the interleaved group. 6846 SmallVector<unsigned, 4> Indices; 6847 for (unsigned IF = 0; IF < InterleaveFactor; IF++) 6848 if (Group->getMember(IF)) 6849 Indices.push_back(IF); 6850 6851 // Calculate the cost of the whole interleaved group. 6852 bool UseMaskForGaps = 6853 (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) || 6854 (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor())); 6855 InstructionCost Cost = TTI.getInterleavedMemoryOpCost( 6856 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(), 6857 AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps); 6858 6859 if (Group->isReverse()) { 6860 // TODO: Add support for reversed masked interleaved access. 6861 assert(!Legal->isMaskRequired(I) && 6862 "Reverse masked interleaved access not supported."); 6863 Cost += 6864 Group->getNumMembers() * 6865 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0); 6866 } 6867 return Cost; 6868 } 6869 6870 Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost( 6871 Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) { 6872 using namespace llvm::PatternMatch; 6873 // Early exit for no inloop reductions. 6874 if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty)) 6875 return None; 6876 auto *VectorTy = cast<VectorType>(Ty); 6877 6878 // We are looking for one of the following patterns, finding the minimal 6879 // acceptable cost: 6880 // reduce(mul(ext(A), ext(B))) or 6881 // reduce(mul(A, B)) or 6882 // reduce(ext(A)) or 6883 // reduce(A). 6884 // The basic idea is that we walk down the tree, finding the root 6885 // reduction instruction in InLoopReductionImmediateChains. From there we find 6886 // the pattern of mul/ext and test the cost of the entire pattern vs the cost 6887 // of the components. If the reduction cost is lower, we return it for the 6888 // reduction instruction and 0 for the other instructions in the pattern. If 6889 // it is not, we return an invalid cost specifying that the original cost 6890 // method should be used. 6890 Instruction *RetI = I; 6891 if (match(RetI, m_ZExtOrSExt(m_Value()))) { 6892 if (!RetI->hasOneUser()) 6893 return None; 6894 RetI = RetI->user_back(); 6895 } 6896 if (match(RetI, m_Mul(m_Value(), m_Value())) && 6897 RetI->user_back()->getOpcode() == Instruction::Add) { 6898 if (!RetI->hasOneUser()) 6899 return None; 6900 RetI = RetI->user_back(); 6901 } 6902 6903 // Test if the found instruction is a reduction, and if not return an invalid 6904 // cost specifying the parent to use the original cost modelling. 6905 if (!InLoopReductionImmediateChains.count(RetI)) 6906 return None; 6907 6908 // Find the reduction this chain is a part of and calculate the basic cost of 6909 // the reduction on its own.
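// InLoopReductionImmediateChains maps each in-loop reduction operation to the
// previous link in its chain, with the first operation mapping to the
// reduction phi (see collectInLoopReductions), so walking the map from RetI
// eventually reaches the phi.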
6910 Instruction *LastChain = InLoopReductionImmediateChains[RetI]; 6911 Instruction *ReductionPhi = LastChain; 6912 while (!isa<PHINode>(ReductionPhi)) 6913 ReductionPhi = InLoopReductionImmediateChains[ReductionPhi]; 6914 6915 const RecurrenceDescriptor &RdxDesc = 6916 Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second; 6917 6918 InstructionCost BaseCost = TTI.getArithmeticReductionCost( 6919 RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind); 6920 6921 // For a call to the llvm.fmuladd intrinsic we need to add the cost of a 6922 // normal fmul instruction to the cost of the fadd reduction. 6923 if (RdxDesc.getRecurrenceKind() == RecurKind::FMulAdd) 6924 BaseCost += 6925 TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind); 6926 6927 // If we're using ordered reductions then we can just return the base cost 6928 // here, since getArithmeticReductionCost calculates the full ordered 6929 // reduction cost when FP reassociation is not allowed. 6930 if (useOrderedReductions(RdxDesc)) 6931 return BaseCost; 6932 6933 // Get the operand that was not the reduction chain and match it to one of the 6934 // patterns, returning the better cost if it is found. 6935 Instruction *RedOp = RetI->getOperand(1) == LastChain 6936 ? dyn_cast<Instruction>(RetI->getOperand(0)) 6937 : dyn_cast<Instruction>(RetI->getOperand(1)); 6938 6939 VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy); 6940 6941 Instruction *Op0, *Op1; 6942 if (RedOp && 6943 match(RedOp, 6944 m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) && 6945 match(Op0, m_ZExtOrSExt(m_Value())) && 6946 Op0->getOpcode() == Op1->getOpcode() && 6947 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() && 6948 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) && 6949 (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) { 6950 6951 // Matched reduce(ext(mul(ext(A), ext(B))) 6952 // Note that the extend opcodes need to all match, or if A==B they will have 6953 // been converted to zext(mul(sext(A), sext(A))) as it is known positive, 6954 // which is equally fine. 6955 bool IsUnsigned = isa<ZExtInst>(Op0); 6956 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy); 6957 auto *MulType = VectorType::get(Op0->getType(), VectorTy); 6958 6959 InstructionCost ExtCost = 6960 TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType, 6961 TTI::CastContextHint::None, CostKind, Op0); 6962 InstructionCost MulCost = 6963 TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind); 6964 InstructionCost Ext2Cost = 6965 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType, 6966 TTI::CastContextHint::None, CostKind, RedOp); 6967 6968 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 6969 /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 6970 CostKind); 6971 6972 if (RedCost.isValid() && 6973 RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost) 6974 return I == RetI ? 
RedCost : 0; 6975 } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) && 6976 !TheLoop->isLoopInvariant(RedOp)) { 6977 // Matched reduce(ext(A)) 6978 bool IsUnsigned = isa<ZExtInst>(RedOp); 6979 auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy); 6980 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 6981 /*IsMLA=*/false, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 6982 CostKind); 6983 6984 InstructionCost ExtCost = 6985 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType, 6986 TTI::CastContextHint::None, CostKind, RedOp); 6987 if (RedCost.isValid() && RedCost < BaseCost + ExtCost) 6988 return I == RetI ? RedCost : 0; 6989 } else if (RedOp && 6990 match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) { 6991 if (match(Op0, m_ZExtOrSExt(m_Value())) && 6992 Op0->getOpcode() == Op1->getOpcode() && 6993 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) { 6994 bool IsUnsigned = isa<ZExtInst>(Op0); 6995 Type *Op0Ty = Op0->getOperand(0)->getType(); 6996 Type *Op1Ty = Op1->getOperand(0)->getType(); 6997 Type *LargestOpTy = 6998 Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty 6999 : Op0Ty; 7000 auto *ExtType = VectorType::get(LargestOpTy, VectorTy); 7001 7002 // Matched reduce(mul(ext(A), ext(B))), where the two ext may be of 7003 // different sizes. We take the largest type as the ext to reduce, and add 7004 // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))). 7005 InstructionCost ExtCost0 = TTI.getCastInstrCost( 7006 Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy), 7007 TTI::CastContextHint::None, CostKind, Op0); 7008 InstructionCost ExtCost1 = TTI.getCastInstrCost( 7009 Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy), 7010 TTI::CastContextHint::None, CostKind, Op1); 7011 InstructionCost MulCost = 7012 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 7013 7014 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 7015 /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 7016 CostKind); 7017 InstructionCost ExtraExtCost = 0; 7018 if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) { 7019 Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1; 7020 ExtraExtCost = TTI.getCastInstrCost( 7021 ExtraExtOp->getOpcode(), ExtType, 7022 VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy), 7023 TTI::CastContextHint::None, CostKind, ExtraExtOp); 7024 } 7025 7026 if (RedCost.isValid() && 7027 (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost)) 7028 return I == RetI ? RedCost : 0; 7029 } else if (!match(I, m_ZExtOrSExt(m_Value()))) { 7030 // Matched reduce(mul()) 7031 InstructionCost MulCost = 7032 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 7033 7034 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 7035 /*IsMLA=*/true, true, RdxDesc.getRecurrenceType(), VectorTy, 7036 CostKind); 7037 7038 if (RedCost.isValid() && RedCost < MulCost + BaseCost) 7039 return I == RetI ? RedCost : 0; 7040 } 7041 } 7042 7043 return I == RetI ? Optional<InstructionCost>(BaseCost) : None; 7044 } 7045 7046 InstructionCost 7047 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, 7048 ElementCount VF) { 7049 // Calculate scalar cost only. Vectorization cost should be ready at this 7050 // moment. 
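// For vector VFs the widening decision and its cost have already been
// recorded by setCostBasedWideningDecision, so we simply look the cost up
// via getWideningCost below.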
7051 if (VF.isScalar()) { 7052 Type *ValTy = getLoadStoreType(I); 7053 const Align Alignment = getLoadStoreAlignment(I); 7054 unsigned AS = getLoadStoreAddressSpace(I); 7055 7056 return TTI.getAddressComputationCost(ValTy) + 7057 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, 7058 TTI::TCK_RecipThroughput, I); 7059 } 7060 return getWideningCost(I, VF); 7061 } 7062 7063 LoopVectorizationCostModel::VectorizationCostTy 7064 LoopVectorizationCostModel::getInstructionCost(Instruction *I, 7065 ElementCount VF) { 7066 // If we know that this instruction will remain uniform, check the cost of 7067 // the scalar version. 7068 if (isUniformAfterVectorization(I, VF)) 7069 VF = ElementCount::getFixed(1); 7070 7071 if (VF.isVector() && isProfitableToScalarize(I, VF)) 7072 return VectorizationCostTy(InstsToScalarize[VF][I], false); 7073 7074 // Forced scalars do not have any scalarization overhead. 7075 auto ForcedScalar = ForcedScalars.find(VF); 7076 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) { 7077 auto InstSet = ForcedScalar->second; 7078 if (InstSet.count(I)) 7079 return VectorizationCostTy( 7080 (getInstructionCost(I, ElementCount::getFixed(1)).first * 7081 VF.getKnownMinValue()), 7082 false); 7083 } 7084 7085 Type *VectorTy; 7086 InstructionCost C = getInstructionCost(I, VF, VectorTy); 7087 7088 bool TypeNotScalarized = false; 7089 if (VF.isVector() && VectorTy->isVectorTy()) { 7090 unsigned NumParts = TTI.getNumberOfParts(VectorTy); 7091 if (NumParts) 7092 TypeNotScalarized = NumParts < VF.getKnownMinValue(); 7093 else 7094 C = InstructionCost::getInvalid(); 7095 } 7096 return VectorizationCostTy(C, TypeNotScalarized); 7097 } 7098 7099 InstructionCost 7100 LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, 7101 ElementCount VF) const { 7102 7103 // There is no mechanism yet to create a scalable scalarization loop, 7104 // so this is currently Invalid. 7105 if (VF.isScalable()) 7106 return InstructionCost::getInvalid(); 7107 7108 if (VF.isScalar()) 7109 return 0; 7110 7111 InstructionCost Cost = 0; 7112 Type *RetTy = ToVectorTy(I->getType(), VF); 7113 if (!RetTy->isVoidTy() && 7114 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) 7115 Cost += TTI.getScalarizationOverhead( 7116 cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()), true, 7117 false); 7118 7119 // Some targets keep addresses scalar. 7120 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) 7121 return Cost; 7122 7123 // Some targets support efficient element stores. 7124 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore()) 7125 return Cost; 7126 7127 // Collect operands to consider. 7128 CallInst *CI = dyn_cast<CallInst>(I); 7129 Instruction::op_range Ops = CI ? CI->args() : I->operands(); 7130 7131 // Skip operands that do not require extraction/scalarization and do not incur 7132 // any overhead. 7133 SmallVector<Type *> Tys; 7134 for (auto *V : filterExtractingOperands(Ops, VF)) 7135 Tys.push_back(MaybeVectorizeType(V->getType(), VF)); 7136 return Cost + TTI.getOperandsScalarizationOverhead( 7137 filterExtractingOperands(Ops, VF), Tys); 7138 } 7139 7140 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { 7141 if (VF.isScalar()) 7142 return; 7143 NumPredStores = 0; 7144 for (BasicBlock *BB : TheLoop->blocks()) { 7145 // For each instruction in the old loop. 
7146 for (Instruction &I : *BB) { 7147 Value *Ptr = getLoadStorePointerOperand(&I); 7148 if (!Ptr) 7149 continue; 7150 7151 // TODO: We should generate better code and update the cost model for 7152 // predicated uniform stores. Today they are treated as any other 7153 // predicated store (see added test cases in 7154 // invariant-store-vectorization.ll). 7155 if (isa<StoreInst>(&I) && isScalarWithPredication(&I, VF)) 7156 NumPredStores++; 7157 7158 if (Legal->isUniformMemOp(I)) { 7159 // TODO: Avoid replicating loads and stores instead of 7160 // relying on instcombine to remove them. 7161 // Load: Scalar load + broadcast 7162 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract 7163 InstructionCost Cost; 7164 if (isa<StoreInst>(&I) && VF.isScalable() && 7165 isLegalGatherOrScatter(&I, VF)) { 7166 Cost = getGatherScatterCost(&I, VF); 7167 setWideningDecision(&I, VF, CM_GatherScatter, Cost); 7168 } else { 7169 assert((isa<LoadInst>(&I) || !VF.isScalable()) && 7170 "Cannot yet scalarize uniform stores"); 7171 Cost = getUniformMemOpCost(&I, VF); 7172 setWideningDecision(&I, VF, CM_Scalarize, Cost); 7173 } 7174 continue; 7175 } 7176 7177 // We assume that widening is the best solution when possible. 7178 if (memoryInstructionCanBeWidened(&I, VF)) { 7179 InstructionCost Cost = getConsecutiveMemOpCost(&I, VF); 7180 int ConsecutiveStride = Legal->isConsecutivePtr( 7181 getLoadStoreType(&I), getLoadStorePointerOperand(&I)); 7182 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 7183 "Expected consecutive stride."); 7184 InstWidening Decision = 7185 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; 7186 setWideningDecision(&I, VF, Decision, Cost); 7187 continue; 7188 } 7189 7190 // Choose between Interleaving, Gather/Scatter or Scalarization. 7191 InstructionCost InterleaveCost = InstructionCost::getInvalid(); 7192 unsigned NumAccesses = 1; 7193 if (isAccessInterleaved(&I)) { 7194 auto Group = getInterleavedAccessGroup(&I); 7195 assert(Group && "Fail to get an interleaved access group."); 7196 7197 // Make one decision for the whole group. 7198 if (getWideningDecision(&I, VF) != CM_Unknown) 7199 continue; 7200 7201 NumAccesses = Group->getNumMembers(); 7202 if (interleavedAccessCanBeWidened(&I, VF)) 7203 InterleaveCost = getInterleaveGroupCost(&I, VF); 7204 } 7205 7206 InstructionCost GatherScatterCost = 7207 isLegalGatherOrScatter(&I, VF) 7208 ? getGatherScatterCost(&I, VF) * NumAccesses 7209 : InstructionCost::getInvalid(); 7210 7211 InstructionCost ScalarizationCost = 7212 getMemInstScalarizationCost(&I, VF) * NumAccesses; 7213 7214 // Choose better solution for the current VF, 7215 // write down this decision and use it during vectorization. 7216 InstructionCost Cost; 7217 InstWidening Decision; 7218 if (InterleaveCost <= GatherScatterCost && 7219 InterleaveCost < ScalarizationCost) { 7220 Decision = CM_Interleave; 7221 Cost = InterleaveCost; 7222 } else if (GatherScatterCost < ScalarizationCost) { 7223 Decision = CM_GatherScatter; 7224 Cost = GatherScatterCost; 7225 } else { 7226 Decision = CM_Scalarize; 7227 Cost = ScalarizationCost; 7228 } 7229 // If the instructions belongs to an interleave group, the whole group 7230 // receives the same decision. The whole group receives the cost, but 7231 // the cost will actually be assigned to one instruction. 
7232 if (auto Group = getInterleavedAccessGroup(&I)) 7233 setWideningDecision(Group, VF, Decision, Cost); 7234 else 7235 setWideningDecision(&I, VF, Decision, Cost); 7236 } 7237 } 7238 7239 // Make sure that any load of address and any other address computation 7240 // remains scalar unless there is gather/scatter support. This avoids 7241 // inevitable extracts into address registers, and also has the benefit of 7242 // activating LSR more, since that pass can't optimize vectorized 7243 // addresses. 7244 if (TTI.prefersVectorizedAddressing()) 7245 return; 7246 7247 // Start with all scalar pointer uses. 7248 SmallPtrSet<Instruction *, 8> AddrDefs; 7249 for (BasicBlock *BB : TheLoop->blocks()) 7250 for (Instruction &I : *BB) { 7251 Instruction *PtrDef = 7252 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 7253 if (PtrDef && TheLoop->contains(PtrDef) && 7254 getWideningDecision(&I, VF) != CM_GatherScatter) 7255 AddrDefs.insert(PtrDef); 7256 } 7257 7258 // Add all instructions used to generate the addresses. 7259 SmallVector<Instruction *, 4> Worklist; 7260 append_range(Worklist, AddrDefs); 7261 while (!Worklist.empty()) { 7262 Instruction *I = Worklist.pop_back_val(); 7263 for (auto &Op : I->operands()) 7264 if (auto *InstOp = dyn_cast<Instruction>(Op)) 7265 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) && 7266 AddrDefs.insert(InstOp).second) 7267 Worklist.push_back(InstOp); 7268 } 7269 7270 for (auto *I : AddrDefs) { 7271 if (isa<LoadInst>(I)) { 7272 // Setting the desired widening decision should ideally be handled in 7273 // by cost functions, but since this involves the task of finding out 7274 // if the loaded register is involved in an address computation, it is 7275 // instead changed here when we know this is the case. 7276 InstWidening Decision = getWideningDecision(I, VF); 7277 if (Decision == CM_Widen || Decision == CM_Widen_Reverse) 7278 // Scalarize a widened load of address. 7279 setWideningDecision( 7280 I, VF, CM_Scalarize, 7281 (VF.getKnownMinValue() * 7282 getMemoryInstructionCost(I, ElementCount::getFixed(1)))); 7283 else if (auto Group = getInterleavedAccessGroup(I)) { 7284 // Scalarize an interleave group of address loads. 7285 for (unsigned I = 0; I < Group->getFactor(); ++I) { 7286 if (Instruction *Member = Group->getMember(I)) 7287 setWideningDecision( 7288 Member, VF, CM_Scalarize, 7289 (VF.getKnownMinValue() * 7290 getMemoryInstructionCost(Member, ElementCount::getFixed(1)))); 7291 } 7292 } 7293 } else 7294 // Make sure I gets scalarized and a cost estimate without 7295 // scalarization overhead. 
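// Instructions in ForcedScalars are later costed in getInstructionCost as
// VF copies of the scalar instruction, without any insert/extract overhead.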
7296 ForcedScalars[VF].insert(I); 7297 } 7298 } 7299 7300 InstructionCost 7301 LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, 7302 Type *&VectorTy) { 7303 Type *RetTy = I->getType(); 7304 if (canTruncateToMinimalBitwidth(I, VF)) 7305 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); 7306 auto SE = PSE.getSE(); 7307 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 7308 7309 auto hasSingleCopyAfterVectorization = [this](Instruction *I, 7310 ElementCount VF) -> bool { 7311 if (VF.isScalar()) 7312 return true; 7313 7314 auto Scalarized = InstsToScalarize.find(VF); 7315 assert(Scalarized != InstsToScalarize.end() && 7316 "VF not yet analyzed for scalarization profitability"); 7317 return !Scalarized->second.count(I) && 7318 llvm::all_of(I->users(), [&](User *U) { 7319 auto *UI = cast<Instruction>(U); 7320 return !Scalarized->second.count(UI); 7321 }); 7322 }; 7323 (void) hasSingleCopyAfterVectorization; 7324 7325 if (isScalarAfterVectorization(I, VF)) { 7326 // With the exception of GEPs and PHIs, after scalarization there should 7327 // only be one copy of the instruction generated in the loop. This is 7328 // because the VF is either 1, or any instructions that need scalarizing 7329 // have already been dealt with by the the time we get here. As a result, 7330 // it means we don't have to multiply the instruction cost by VF. 7331 assert(I->getOpcode() == Instruction::GetElementPtr || 7332 I->getOpcode() == Instruction::PHI || 7333 (I->getOpcode() == Instruction::BitCast && 7334 I->getType()->isPointerTy()) || 7335 hasSingleCopyAfterVectorization(I, VF)); 7336 VectorTy = RetTy; 7337 } else 7338 VectorTy = ToVectorTy(RetTy, VF); 7339 7340 // TODO: We need to estimate the cost of intrinsic calls. 7341 switch (I->getOpcode()) { 7342 case Instruction::GetElementPtr: 7343 // We mark this instruction as zero-cost because the cost of GEPs in 7344 // vectorized code depends on whether the corresponding memory instruction 7345 // is scalarized or not. Therefore, we handle GEPs with the memory 7346 // instruction cost. 7347 return 0; 7348 case Instruction::Br: { 7349 // In cases of scalarized and predicated instructions, there will be VF 7350 // predicated blocks in the vectorized loop. Each branch around these 7351 // blocks requires also an extract of its vector compare i1 element. 7352 bool ScalarPredicatedBB = false; 7353 BranchInst *BI = cast<BranchInst>(I); 7354 if (VF.isVector() && BI->isConditional() && 7355 (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) || 7356 PredicatedBBsAfterVectorization.count(BI->getSuccessor(1)))) 7357 ScalarPredicatedBB = true; 7358 7359 if (ScalarPredicatedBB) { 7360 // Not possible to scalarize scalable vector with predicated instructions. 7361 if (VF.isScalable()) 7362 return InstructionCost::getInvalid(); 7363 // Return cost for branches around scalarized and predicated blocks. 7364 auto *Vec_i1Ty = 7365 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); 7366 return ( 7367 TTI.getScalarizationOverhead( 7368 Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()), false, true) + 7369 (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue())); 7370 } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar()) 7371 // The back-edge branch will remain, as will all scalar branches. 7372 return TTI.getCFInstrCost(Instruction::Br, CostKind); 7373 else 7374 // This branch will be eliminated by if-conversion. 
7375 return 0; 7376 // Note: We currently assume zero cost for an unconditional branch inside 7377 // a predicated block since it will become a fall-through, although we 7378 // may decide in the future to call TTI for all branches. 7379 } 7380 case Instruction::PHI: { 7381 auto *Phi = cast<PHINode>(I); 7382 7383 // First-order recurrences are replaced by vector shuffles inside the loop. 7384 // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. 7385 if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi)) 7386 return TTI.getShuffleCost( 7387 TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy), 7388 None, VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1)); 7389 7390 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 7391 // converted into select instructions. We require N - 1 selects per phi 7392 // node, where N is the number of incoming values. 7393 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) 7394 return (Phi->getNumIncomingValues() - 1) * 7395 TTI.getCmpSelInstrCost( 7396 Instruction::Select, ToVectorTy(Phi->getType(), VF), 7397 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF), 7398 CmpInst::BAD_ICMP_PREDICATE, CostKind); 7399 7400 return TTI.getCFInstrCost(Instruction::PHI, CostKind); 7401 } 7402 case Instruction::UDiv: 7403 case Instruction::SDiv: 7404 case Instruction::URem: 7405 case Instruction::SRem: 7406 // If we have a predicated instruction, it may not be executed for each 7407 // vector lane. Get the scalarization cost and scale this amount by the 7408 // probability of executing the predicated block. If the instruction is not 7409 // predicated, we fall through to the next case. 7410 if (VF.isVector() && isScalarWithPredication(I, VF)) { 7411 InstructionCost Cost = 0; 7412 7413 // These instructions have a non-void type, so account for the phi nodes 7414 // that we will create. This cost is likely to be zero. The phi node 7415 // cost, if any, should be scaled by the block probability because it 7416 // models a copy at the end of each predicated block. 7417 Cost += VF.getKnownMinValue() * 7418 TTI.getCFInstrCost(Instruction::PHI, CostKind); 7419 7420 // The cost of the non-predicated instruction. 7421 Cost += VF.getKnownMinValue() * 7422 TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind); 7423 7424 // The cost of insertelement and extractelement instructions needed for 7425 // scalarization. 7426 Cost += getScalarizationOverhead(I, VF); 7427 7428 // Scale the cost by the probability of executing the predicated blocks. 7429 // This assumes the predicated block for each vector lane is equally 7430 // likely. 7431 return Cost / getReciprocalPredBlockProb(); 7432 } 7433 LLVM_FALLTHROUGH; 7434 case Instruction::Add: 7435 case Instruction::FAdd: 7436 case Instruction::Sub: 7437 case Instruction::FSub: 7438 case Instruction::Mul: 7439 case Instruction::FMul: 7440 case Instruction::FDiv: 7441 case Instruction::FRem: 7442 case Instruction::Shl: 7443 case Instruction::LShr: 7444 case Instruction::AShr: 7445 case Instruction::And: 7446 case Instruction::Or: 7447 case Instruction::Xor: { 7448 // Since we will replace the stride by 1 the multiplication should go away. 
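// For example, an address computation 'i * Stride' where the symbolic
// stride is known to be one becomes just 'i', so the multiply is given
// zero cost here.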
7449 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) 7450 return 0; 7451 7452 // Detect reduction patterns 7453 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7454 return *RedCost; 7455 7456 // Certain instructions can be cheaper to vectorize if they have a constant 7457 // second vector operand. One example of this are shifts on x86. 7458 Value *Op2 = I->getOperand(1); 7459 TargetTransformInfo::OperandValueProperties Op2VP; 7460 TargetTransformInfo::OperandValueKind Op2VK = 7461 TTI.getOperandInfo(Op2, Op2VP); 7462 if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) 7463 Op2VK = TargetTransformInfo::OK_UniformValue; 7464 7465 SmallVector<const Value *, 4> Operands(I->operand_values()); 7466 return TTI.getArithmeticInstrCost( 7467 I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, 7468 Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); 7469 } 7470 case Instruction::FNeg: { 7471 return TTI.getArithmeticInstrCost( 7472 I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, 7473 TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None, 7474 TargetTransformInfo::OP_None, I->getOperand(0), I); 7475 } 7476 case Instruction::Select: { 7477 SelectInst *SI = cast<SelectInst>(I); 7478 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 7479 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 7480 7481 const Value *Op0, *Op1; 7482 using namespace llvm::PatternMatch; 7483 if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) || 7484 match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) { 7485 // select x, y, false --> x & y 7486 // select x, true, y --> x | y 7487 TTI::OperandValueProperties Op1VP = TTI::OP_None; 7488 TTI::OperandValueProperties Op2VP = TTI::OP_None; 7489 TTI::OperandValueKind Op1VK = TTI::getOperandInfo(Op0, Op1VP); 7490 TTI::OperandValueKind Op2VK = TTI::getOperandInfo(Op1, Op2VP); 7491 assert(Op0->getType()->getScalarSizeInBits() == 1 && 7492 Op1->getType()->getScalarSizeInBits() == 1); 7493 7494 SmallVector<const Value *, 2> Operands{Op0, Op1}; 7495 return TTI.getArithmeticInstrCost( 7496 match(I, m_LogicalOr()) ? 
Instruction::Or : Instruction::And, VectorTy, 7497 CostKind, Op1VK, Op2VK, Op1VP, Op2VP, Operands, I); 7498 } 7499 7500 Type *CondTy = SI->getCondition()->getType(); 7501 if (!ScalarCond) 7502 CondTy = VectorType::get(CondTy, VF); 7503 7504 CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE; 7505 if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition())) 7506 Pred = Cmp->getPredicate(); 7507 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred, 7508 CostKind, I); 7509 } 7510 case Instruction::ICmp: 7511 case Instruction::FCmp: { 7512 Type *ValTy = I->getOperand(0)->getType(); 7513 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 7514 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 7515 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); 7516 VectorTy = ToVectorTy(ValTy, VF); 7517 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, 7518 cast<CmpInst>(I)->getPredicate(), CostKind, 7519 I); 7520 } 7521 case Instruction::Store: 7522 case Instruction::Load: { 7523 ElementCount Width = VF; 7524 if (Width.isVector()) { 7525 InstWidening Decision = getWideningDecision(I, Width); 7526 assert(Decision != CM_Unknown && 7527 "CM decision should be taken at this point"); 7528 if (Decision == CM_Scalarize) 7529 Width = ElementCount::getFixed(1); 7530 } 7531 VectorTy = ToVectorTy(getLoadStoreType(I), Width); 7532 return getMemoryInstructionCost(I, VF); 7533 } 7534 case Instruction::BitCast: 7535 if (I->getType()->isPointerTy()) 7536 return 0; 7537 LLVM_FALLTHROUGH; 7538 case Instruction::ZExt: 7539 case Instruction::SExt: 7540 case Instruction::FPToUI: 7541 case Instruction::FPToSI: 7542 case Instruction::FPExt: 7543 case Instruction::PtrToInt: 7544 case Instruction::IntToPtr: 7545 case Instruction::SIToFP: 7546 case Instruction::UIToFP: 7547 case Instruction::Trunc: 7548 case Instruction::FPTrunc: { 7549 // Computes the CastContextHint from a Load/Store instruction. 7550 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint { 7551 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 7552 "Expected a load or a store!"); 7553 7554 if (VF.isScalar() || !TheLoop->contains(I)) 7555 return TTI::CastContextHint::Normal; 7556 7557 switch (getWideningDecision(I, VF)) { 7558 case LoopVectorizationCostModel::CM_GatherScatter: 7559 return TTI::CastContextHint::GatherScatter; 7560 case LoopVectorizationCostModel::CM_Interleave: 7561 return TTI::CastContextHint::Interleave; 7562 case LoopVectorizationCostModel::CM_Scalarize: 7563 case LoopVectorizationCostModel::CM_Widen: 7564 return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked 7565 : TTI::CastContextHint::Normal; 7566 case LoopVectorizationCostModel::CM_Widen_Reverse: 7567 return TTI::CastContextHint::Reversed; 7568 case LoopVectorizationCostModel::CM_Unknown: 7569 llvm_unreachable("Instr did not go through cost modelling?"); 7570 } 7571 7572 llvm_unreachable("Unhandled case!"); 7573 }; 7574 7575 unsigned Opcode = I->getOpcode(); 7576 TTI::CastContextHint CCH = TTI::CastContextHint::None; 7577 // For Trunc, the context is the only user, which must be a StoreInst. 7578 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) { 7579 if (I->hasOneUse()) 7580 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin())) 7581 CCH = ComputeCCH(Store); 7582 } 7583 // For Z/Sext, the context is the operand, which must be a LoadInst. 
7584 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || 7585 Opcode == Instruction::FPExt) { 7586 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0))) 7587 CCH = ComputeCCH(Load); 7588 } 7589 7590 // We optimize the truncation of induction variables having constant 7591 // integer steps. The cost of these truncations is the same as the scalar 7592 // operation. 7593 if (isOptimizableIVTruncate(I, VF)) { 7594 auto *Trunc = cast<TruncInst>(I); 7595 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 7596 Trunc->getSrcTy(), CCH, CostKind, Trunc); 7597 } 7598 7599 // Detect reduction patterns 7600 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7601 return *RedCost; 7602 7603 Type *SrcScalarTy = I->getOperand(0)->getType(); 7604 Type *SrcVecTy = 7605 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; 7606 if (canTruncateToMinimalBitwidth(I, VF)) { 7607 // This cast is going to be shrunk. This may remove the cast or it might 7608 // turn it into slightly different cast. For example, if MinBW == 16, 7609 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". 7610 // 7611 // Calculate the modified src and dest types. 7612 Type *MinVecTy = VectorTy; 7613 if (Opcode == Instruction::Trunc) { 7614 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); 7615 VectorTy = 7616 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7617 } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) { 7618 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); 7619 VectorTy = 7620 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7621 } 7622 } 7623 7624 return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); 7625 } 7626 case Instruction::Call: { 7627 if (RecurrenceDescriptor::isFMulAddIntrinsic(I)) 7628 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7629 return *RedCost; 7630 bool NeedToScalarize; 7631 CallInst *CI = cast<CallInst>(I); 7632 InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize); 7633 if (getVectorIntrinsicIDForCall(CI, TLI)) { 7634 InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF); 7635 return std::min(CallCost, IntrinsicCost); 7636 } 7637 return CallCost; 7638 } 7639 case Instruction::ExtractValue: 7640 return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput); 7641 case Instruction::Alloca: 7642 // We cannot easily widen alloca to a scalable alloca, as 7643 // the result would need to be a vector of pointers. 7644 if (VF.isScalable()) 7645 return InstructionCost::getInvalid(); 7646 LLVM_FALLTHROUGH; 7647 default: 7648 // This opcode is unknown. Assume that it is the same as 'mul'. 7649 return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 7650 } // end of switch. 
7651 } 7652 7653 char LoopVectorize::ID = 0; 7654 7655 static const char lv_name[] = "Loop Vectorization"; 7656 7657 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 7658 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 7659 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) 7660 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 7661 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) 7662 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 7663 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 7664 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 7665 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 7666 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 7667 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) 7668 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) 7669 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 7670 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 7671 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy) 7672 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 7673 7674 namespace llvm { 7675 7676 Pass *createLoopVectorizePass() { return new LoopVectorize(); } 7677 7678 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced, 7679 bool VectorizeOnlyWhenForced) { 7680 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced); 7681 } 7682 7683 } // end namespace llvm 7684 7685 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { 7686 // Check if the pointer operand of a load or store instruction is 7687 // consecutive. 7688 if (auto *Ptr = getLoadStorePointerOperand(Inst)) 7689 return Legal->isConsecutivePtr(getLoadStoreType(Inst), Ptr); 7690 return false; 7691 } 7692 7693 void LoopVectorizationCostModel::collectValuesToIgnore() { 7694 // Ignore ephemeral values. 7695 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); 7696 7697 // Ignore type-promoting instructions we identified during reduction 7698 // detection. 7699 for (auto &Reduction : Legal->getReductionVars()) { 7700 const RecurrenceDescriptor &RedDes = Reduction.second; 7701 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); 7702 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7703 } 7704 // Ignore type-casting instructions we identified during induction 7705 // detection. 7706 for (auto &Induction : Legal->getInductionVars()) { 7707 const InductionDescriptor &IndDes = Induction.second; 7708 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 7709 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7710 } 7711 } 7712 7713 void LoopVectorizationCostModel::collectInLoopReductions() { 7714 for (auto &Reduction : Legal->getReductionVars()) { 7715 PHINode *Phi = Reduction.first; 7716 const RecurrenceDescriptor &RdxDesc = Reduction.second; 7717 7718 // We don't collect reductions that are type promoted (yet). 7719 if (RdxDesc.getRecurrenceType() != Phi->getType()) 7720 continue; 7721 7722 // If the target would prefer this reduction to happen "in-loop", then we 7723 // want to record it as such. 7724 unsigned Opcode = RdxDesc.getOpcode(); 7725 if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) && 7726 !TTI.preferInLoopReduction(Opcode, Phi->getType(), 7727 TargetTransformInfo::ReductionFlags())) 7728 continue; 7729 7730 // Check that we can correctly put the reductions into the loop, by 7731 // finding the chain of operations that leads from the phi to the loop 7732 // exit value. 
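// If no such chain is found, getReductionOpChain returns an empty vector
// and the reduction is kept out of the loop.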
7733 SmallVector<Instruction *, 4> ReductionOperations = 7734 RdxDesc.getReductionOpChain(Phi, TheLoop); 7735 bool InLoop = !ReductionOperations.empty(); 7736 if (InLoop) { 7737 InLoopReductionChains[Phi] = ReductionOperations; 7738 // Add the elements to InLoopReductionImmediateChains for cost modelling. 7739 Instruction *LastChain = Phi; 7740 for (auto *I : ReductionOperations) { 7741 InLoopReductionImmediateChains[I] = LastChain; 7742 LastChain = I; 7743 } 7744 } 7745 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop") 7746 << " reduction for phi: " << *Phi << "\n"); 7747 } 7748 } 7749 7750 // TODO: we could return a pair of values that specify the max VF and 7751 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of 7752 // `buildVPlans(VF, VF)`. We cannot do it because VPlan at the moment 7753 // doesn't have a cost model that can choose which plan to execute if 7754 // more than one is generated. 7755 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits, 7756 LoopVectorizationCostModel &CM) { 7757 unsigned WidestType; 7758 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes(); 7759 return WidestVectorRegBits / WidestType; 7760 } 7761 7762 VectorizationFactor 7763 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { 7764 assert(!UserVF.isScalable() && "scalable vectors not yet supported"); 7765 ElementCount VF = UserVF; 7766 // Outer loop handling: such loops may require CFG and instruction-level 7767 // transformations before even evaluating whether vectorization is profitable. 7768 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 7769 // the vectorization pipeline. 7770 if (!OrigLoop->isInnermost()) { 7771 // If the user doesn't provide a vectorization factor, determine a 7772 // reasonable one. 7773 if (UserVF.isZero()) { 7774 VF = ElementCount::getFixed(determineVPlanVF( 7775 TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector) 7776 .getFixedSize(), 7777 CM)); 7778 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n"); 7779 7780 // Make sure we have a VF > 1 for stress testing. 7781 if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) { 7782 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: " 7783 << "overriding computed VF.\n"); 7784 VF = ElementCount::getFixed(4); 7785 } 7786 } 7787 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 7788 assert(isPowerOf2_32(VF.getKnownMinValue()) && 7789 "VF needs to be a power of two"); 7790 LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "") 7791 << "VF " << VF << " to build VPlans.\n"); 7792 buildVPlans(VF, VF); 7793 7794 // For VPlan build stress testing, we bail out after VPlan construction. 7795 if (VPlanBuildStressTest) 7796 return VectorizationFactor::Disabled(); 7797 7798 return {VF, 0 /*Cost*/}; 7799 } 7800 7801 LLVM_DEBUG( 7802 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the " 7803 "VPlan-native path.\n"); 7804 return VectorizationFactor::Disabled(); 7805 } 7806 7807 Optional<VectorizationFactor> 7808 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { 7809 assert(OrigLoop->isInnermost() && "Inner loop expected."); 7810 FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC); 7811 if (!MaxFactors) // Cases that should not be vectorized or interleaved. 7812 return None; 7813 7814 // Invalidate interleave groups if all blocks of loop will be predicated.
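// A fully predicated (tail-folded) loop needs masked interleaved accesses;
// without target support for those, the interleave groups cannot be kept and
// any decisions based on them must be invalidated.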
7815 if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) && 7816 !useMaskedInterleavedAccesses(*TTI)) { 7817 LLVM_DEBUG( 7818 dbgs() 7819 << "LV: Invalidate all interleaved groups due to fold-tail by masking " 7820 "which requires masked-interleaved support.\n"); 7821 if (CM.InterleaveInfo.invalidateGroups()) 7822 // Invalidating interleave groups also requires invalidating all decisions 7823 // based on them, which includes widening decisions and uniform and scalar 7824 // values. 7825 CM.invalidateCostModelingDecisions(); 7826 } 7827 7828 ElementCount MaxUserVF = 7829 UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF; 7830 bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF); 7831 if (!UserVF.isZero() && UserVFIsLegal) { 7832 assert(isPowerOf2_32(UserVF.getKnownMinValue()) && 7833 "VF needs to be a power of two"); 7834 // Collect the instructions (and their associated costs) that will be more 7835 // profitable to scalarize. 7836 if (CM.selectUserVectorizationFactor(UserVF)) { 7837 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); 7838 CM.collectInLoopReductions(); 7839 buildVPlansWithVPRecipes(UserVF, UserVF); 7840 LLVM_DEBUG(printPlans(dbgs())); 7841 return {{UserVF, 0}}; 7842 } else 7843 reportVectorizationInfo("UserVF ignored because of invalid costs.", 7844 "InvalidCost", ORE, OrigLoop); 7845 } 7846 7847 // Populate the set of Vectorization Factor Candidates. 7848 ElementCountSet VFCandidates; 7849 for (auto VF = ElementCount::getFixed(1); 7850 ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2) 7851 VFCandidates.insert(VF); 7852 for (auto VF = ElementCount::getScalable(1); 7853 ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2) 7854 VFCandidates.insert(VF); 7855 7856 for (const auto &VF : VFCandidates) { 7857 // Collect Uniform and Scalar instructions after vectorization with VF. 7858 CM.collectUniformsAndScalars(VF); 7859 7860 // Collect the instructions (and their associated costs) that will be more 7861 // profitable to scalarize. 7862 if (VF.isVector()) 7863 CM.collectInstsToScalarize(VF); 7864 } 7865 7866 CM.collectInLoopReductions(); 7867 buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF); 7868 buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF); 7869 7870 LLVM_DEBUG(printPlans(dbgs())); 7871 if (!MaxFactors.hasVector()) 7872 return VectorizationFactor::Disabled(); 7873 7874 // Select the optimal vectorization factor. 7875 auto SelectedVF = CM.selectVectorizationFactor(VFCandidates); 7876 7877 // Check if it is profitable to vectorize with runtime checks. 
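// A large number of runtime pointer checks can outweigh the benefit of
// vectorization; if the count exceeds the thresholds checked below, a remark
// is emitted and vectorization is abandoned.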
7878 unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks(); 7879 if (SelectedVF.Width.getKnownMinValue() > 1 && NumRuntimePointerChecks) { 7880 bool PragmaThresholdReached = 7881 NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold; 7882 bool ThresholdReached = 7883 NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold; 7884 if ((ThresholdReached && !Hints.allowReordering()) || 7885 PragmaThresholdReached) { 7886 ORE->emit([&]() { 7887 return OptimizationRemarkAnalysisAliasing( 7888 DEBUG_TYPE, "CantReorderMemOps", OrigLoop->getStartLoc(), 7889 OrigLoop->getHeader()) 7890 << "loop not vectorized: cannot prove it is safe to reorder " 7891 "memory operations"; 7892 }); 7893 LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n"); 7894 Hints.emitRemarkWithHints(); 7895 return VectorizationFactor::Disabled(); 7896 } 7897 } 7898 return SelectedVF; 7899 } 7900 7901 VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const { 7902 assert(count_if(VPlans, 7903 [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) == 7904 1 && 7905 "Best VF has not a single VPlan."); 7906 7907 for (const VPlanPtr &Plan : VPlans) { 7908 if (Plan->hasVF(VF)) 7909 return *Plan.get(); 7910 } 7911 llvm_unreachable("No plan found!"); 7912 } 7913 7914 static void AddRuntimeUnrollDisableMetaData(Loop *L) { 7915 SmallVector<Metadata *, 4> MDs; 7916 // Reserve first location for self reference to the LoopID metadata node. 7917 MDs.push_back(nullptr); 7918 bool IsUnrollMetadata = false; 7919 MDNode *LoopID = L->getLoopID(); 7920 if (LoopID) { 7921 // First find existing loop unrolling disable metadata. 7922 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { 7923 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); 7924 if (MD) { 7925 const auto *S = dyn_cast<MDString>(MD->getOperand(0)); 7926 IsUnrollMetadata = 7927 S && S->getString().startswith("llvm.loop.unroll.disable"); 7928 } 7929 MDs.push_back(LoopID->getOperand(i)); 7930 } 7931 } 7932 7933 if (!IsUnrollMetadata) { 7934 // Add runtime unroll disable metadata. 7935 LLVMContext &Context = L->getHeader()->getContext(); 7936 SmallVector<Metadata *, 1> DisableOperands; 7937 DisableOperands.push_back( 7938 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 7939 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 7940 MDs.push_back(DisableNode); 7941 MDNode *NewLoopID = MDNode::get(Context, MDs); 7942 // Set operand 0 to refer to the loop id itself. 7943 NewLoopID->replaceOperandWith(0, NewLoopID); 7944 L->setLoopID(NewLoopID); 7945 } 7946 } 7947 7948 void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF, 7949 VPlan &BestVPlan, 7950 InnerLoopVectorizer &ILV, 7951 DominatorTree *DT) { 7952 LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF << ", UF=" << BestUF 7953 << '\n'); 7954 7955 // Perform the actual loop transformation. 7956 7957 // 1. Create a new empty loop. Unlink the old loop and connect the new one. 7958 VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan}; 7959 Value *CanonicalIVStartValue; 7960 std::tie(State.CFG.PrevBB, CanonicalIVStartValue) = 7961 ILV.createVectorizedLoopSkeleton(); 7962 ILV.collectPoisonGeneratingRecipes(State); 7963 7964 ILV.printDebugTracesAtStart(); 7965 7966 //===------------------------------------------------===// 7967 // 7968 // Notice: any optimization or new instruction that go 7969 // into the code below should also be implemented in 7970 // the cost-model. 
7971 // 7972 //===------------------------------------------------===// 7973 7974 // 2. Copy and widen instructions from the old loop into the new loop. 7975 BestVPlan.prepareToExecute(ILV.getOrCreateTripCount(nullptr), 7976 ILV.getOrCreateVectorTripCount(nullptr), 7977 CanonicalIVStartValue, State); 7978 BestVPlan.execute(&State); 7979 7980 // Keep all loop hints from the original loop on the vector loop (we'll 7981 // replace the vectorizer-specific hints below). 7982 MDNode *OrigLoopID = OrigLoop->getLoopID(); 7983 7984 Optional<MDNode *> VectorizedLoopID = 7985 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 7986 LLVMLoopVectorizeFollowupVectorized}); 7987 7988 Loop *L = LI->getLoopFor(State.CFG.PrevBB); 7989 if (VectorizedLoopID.hasValue()) 7990 L->setLoopID(VectorizedLoopID.getValue()); 7991 else { 7992 // Keep all loop hints from the original loop on the vector loop (we'll 7993 // replace the vectorizer-specific hints below). 7994 if (MDNode *LID = OrigLoop->getLoopID()) 7995 L->setLoopID(LID); 7996 7997 LoopVectorizeHints Hints(L, true, *ORE); 7998 Hints.setAlreadyVectorized(); 7999 } 8000 // Disable runtime unrolling when vectorizing the epilogue loop. 8001 if (CanonicalIVStartValue) 8002 AddRuntimeUnrollDisableMetaData(L); 8003 8004 // 3. Fix the vectorized code: take care of header phi's, live-outs, 8005 // predication, updating analyses. 8006 ILV.fixVectorizedLoop(State); 8007 8008 ILV.printDebugTracesAtEnd(); 8009 } 8010 8011 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 8012 void LoopVectorizationPlanner::printPlans(raw_ostream &O) { 8013 for (const auto &Plan : VPlans) 8014 if (PrintVPlansInDotFormat) 8015 Plan->printDOT(O); 8016 else 8017 Plan->print(O); 8018 } 8019 #endif 8020 8021 void LoopVectorizationPlanner::collectTriviallyDeadInstructions( 8022 SmallPtrSetImpl<Instruction *> &DeadInstructions) { 8023 8024 // We create new control-flow for the vectorized loop, so the original exit 8025 // conditions will be dead after vectorization if it's only used by the 8026 // terminator 8027 SmallVector<BasicBlock*> ExitingBlocks; 8028 OrigLoop->getExitingBlocks(ExitingBlocks); 8029 for (auto *BB : ExitingBlocks) { 8030 auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0)); 8031 if (!Cmp || !Cmp->hasOneUse()) 8032 continue; 8033 8034 // TODO: we should introduce a getUniqueExitingBlocks on Loop 8035 if (!DeadInstructions.insert(Cmp).second) 8036 continue; 8037 8038 // The operands of the icmp is often a dead trunc, used by IndUpdate. 8039 // TODO: can recurse through operands in general 8040 for (Value *Op : Cmp->operands()) { 8041 if (isa<TruncInst>(Op) && Op->hasOneUse()) 8042 DeadInstructions.insert(cast<Instruction>(Op)); 8043 } 8044 } 8045 8046 // We create new "steps" for induction variable updates to which the original 8047 // induction variables map. An original update instruction will be dead if 8048 // all its users except the induction variable are dead. 8049 auto *Latch = OrigLoop->getLoopLatch(); 8050 for (auto &Induction : Legal->getInductionVars()) { 8051 PHINode *Ind = Induction.first; 8052 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 8053 8054 // If the tail is to be folded by masking, the primary induction variable, 8055 // if exists, isn't dead: it will be used for masking. Don't kill it. 
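// (When folding the tail by masking, the mask is computed from the primary
// induction, so its update must remain live in the vectorized loop.)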
8056 if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction()) 8057 continue; 8058 8059 if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 8060 return U == Ind || DeadInstructions.count(cast<Instruction>(U)); 8061 })) 8062 DeadInstructions.insert(IndUpdate); 8063 } 8064 } 8065 8066 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } 8067 8068 //===--------------------------------------------------------------------===// 8069 // EpilogueVectorizerMainLoop 8070 //===--------------------------------------------------------------------===// 8071 8072 /// This function is partially responsible for generating the control flow 8073 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 8074 std::pair<BasicBlock *, Value *> 8075 EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { 8076 MDNode *OrigLoopID = OrigLoop->getLoopID(); 8077 Loop *Lp = createVectorLoopSkeleton(""); 8078 8079 // Generate the code to check the minimum iteration count of the vector 8080 // epilogue (see below). 8081 EPI.EpilogueIterationCountCheck = 8082 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, true); 8083 EPI.EpilogueIterationCountCheck->setName("iter.check"); 8084 8085 // Generate the code to check any assumptions that we've made for SCEV 8086 // expressions. 8087 EPI.SCEVSafetyCheck = emitSCEVChecks(Lp, LoopScalarPreHeader); 8088 8089 // Generate the code that checks at runtime if arrays overlap. We put the 8090 // checks into a separate block to make the more common case of few elements 8091 // faster. 8092 EPI.MemSafetyCheck = emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 8093 8094 // Generate the iteration count check for the main loop, *after* the check 8095 // for the epilogue loop, so that the path-length is shorter for the case 8096 // that goes directly through the vector epilogue. The longer-path length for 8097 // the main loop is compensated for, by the gain from vectorizing the larger 8098 // trip count. Note: the branch will get updated later on when we vectorize 8099 // the epilogue. 8100 EPI.MainLoopIterationCountCheck = 8101 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false); 8102 8103 // Generate the induction variable. 8104 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 8105 EPI.VectorTripCount = CountRoundDown; 8106 createHeaderBranch(Lp); 8107 8108 // Skip induction resume value creation here because they will be created in 8109 // the second pass. If we created them here, they wouldn't be used anyway, 8110 // because the vplan in the second pass still contains the inductions from the 8111 // original loop. 
8112 8113 return {completeLoopSkeleton(Lp, OrigLoopID), nullptr}; 8114 } 8115 8116 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { 8117 LLVM_DEBUG({ 8118 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n" 8119 << "Main Loop VF:" << EPI.MainLoopVF 8120 << ", Main Loop UF:" << EPI.MainLoopUF 8121 << ", Epilogue Loop VF:" << EPI.EpilogueVF 8122 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 8123 }); 8124 } 8125 8126 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() { 8127 DEBUG_WITH_TYPE(VerboseDebug, { 8128 dbgs() << "intermediate fn:\n" 8129 << *OrigLoop->getHeader()->getParent() << "\n"; 8130 }); 8131 } 8132 8133 BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck( 8134 Loop *L, BasicBlock *Bypass, bool ForEpilogue) { 8135 assert(L && "Expected valid Loop."); 8136 assert(Bypass && "Expected valid bypass basic block."); 8137 ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF; 8138 unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF; 8139 Value *Count = getOrCreateTripCount(L); 8140 // Reuse existing vector loop preheader for TC checks. 8141 // Note that new preheader block is generated for vector loop. 8142 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 8143 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 8144 8145 // Generate code to check if the loop's trip count is less than VF * UF of the 8146 // main vector loop. 8147 auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF : VF) ? 8148 ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 8149 8150 Value *CheckMinIters = Builder.CreateICmp( 8151 P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor), 8152 "min.iters.check"); 8153 8154 if (!ForEpilogue) 8155 TCCheckBlock->setName("vector.main.loop.iter.check"); 8156 8157 // Create new preheader for vector loop. 8158 LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), 8159 DT, LI, nullptr, "vector.ph"); 8160 8161 if (ForEpilogue) { 8162 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 8163 DT->getNode(Bypass)->getIDom()) && 8164 "TC check is expected to dominate Bypass"); 8165 8166 // Update dominator for Bypass & LoopExit. 8167 DT->changeImmediateDominator(Bypass, TCCheckBlock); 8168 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF)) 8169 // For loops with multiple exits, there's no edge from the middle block 8170 // to exit blocks (as the epilogue must run) and thus no need to update 8171 // the immediate dominator of the exit blocks. 8172 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 8173 8174 LoopBypassBlocks.push_back(TCCheckBlock); 8175 8176 // Save the trip count so we don't have to regenerate it in the 8177 // vec.epilog.iter.check. This is safe to do because the trip count 8178 // generated here dominates the vector epilog iter check. 8179 EPI.TripCount = Count; 8180 } 8181 8182 ReplaceInstWithInst( 8183 TCCheckBlock->getTerminator(), 8184 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 8185 8186 return TCCheckBlock; 8187 } 8188 8189 //===--------------------------------------------------------------------===// 8190 // EpilogueVectorizerEpilogueLoop 8191 //===--------------------------------------------------------------------===// 8192 8193 /// This function is partially responsible for generating the control flow 8194 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 
8195 std::pair<BasicBlock *, Value *> 8196 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { 8197 MDNode *OrigLoopID = OrigLoop->getLoopID(); 8198 Loop *Lp = createVectorLoopSkeleton("vec.epilog."); 8199 8200 // Now, compare the remaining count and if there aren't enough iterations to 8201 // execute the vectorized epilogue skip to the scalar part. 8202 BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader; 8203 VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check"); 8204 LoopVectorPreHeader = 8205 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 8206 LI, nullptr, "vec.epilog.ph"); 8207 emitMinimumVectorEpilogueIterCountCheck(Lp, LoopScalarPreHeader, 8208 VecEpilogueIterationCountCheck); 8209 8210 // Adjust the control flow taking the state info from the main loop 8211 // vectorization into account. 8212 assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck && 8213 "expected this to be saved from the previous pass."); 8214 EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith( 8215 VecEpilogueIterationCountCheck, LoopVectorPreHeader); 8216 8217 DT->changeImmediateDominator(LoopVectorPreHeader, 8218 EPI.MainLoopIterationCountCheck); 8219 8220 EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith( 8221 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8222 8223 if (EPI.SCEVSafetyCheck) 8224 EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith( 8225 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8226 if (EPI.MemSafetyCheck) 8227 EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith( 8228 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8229 8230 DT->changeImmediateDominator( 8231 VecEpilogueIterationCountCheck, 8232 VecEpilogueIterationCountCheck->getSinglePredecessor()); 8233 8234 DT->changeImmediateDominator(LoopScalarPreHeader, 8235 EPI.EpilogueIterationCountCheck); 8236 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF)) 8237 // If there is an epilogue which must run, there's no edge from the 8238 // middle block to exit blocks and thus no need to update the immediate 8239 // dominator of the exit blocks. 8240 DT->changeImmediateDominator(LoopExitBlock, 8241 EPI.EpilogueIterationCountCheck); 8242 8243 // Keep track of bypass blocks, as they feed start values to the induction 8244 // phis in the scalar loop preheader. 8245 if (EPI.SCEVSafetyCheck) 8246 LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck); 8247 if (EPI.MemSafetyCheck) 8248 LoopBypassBlocks.push_back(EPI.MemSafetyCheck); 8249 LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck); 8250 8251 // Generate a resume induction for the vector epilogue and put it in the 8252 // vector epilogue preheader 8253 Type *IdxTy = Legal->getWidestInductionType(); 8254 PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val", 8255 LoopVectorPreHeader->getFirstNonPHI()); 8256 EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck); 8257 EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0), 8258 EPI.MainLoopIterationCountCheck); 8259 8260 // Generate the induction variable. 8261 createHeaderBranch(Lp); 8262 8263 // Generate induction resume values. These variables save the new starting 8264 // indexes for the scalar loop. They are used to test if there are any tail 8265 // iterations left once the vector loop has completed. 
8266 // Note that when the vectorized epilogue is skipped due to iteration count
8267 // check, then the resume value for the induction variable comes from
8268 // the trip count of the main vector loop, hence passing the AdditionalBypass
8269 // argument.
8270 createInductionResumeValues(Lp, {VecEpilogueIterationCountCheck,
8271 EPI.VectorTripCount} /* AdditionalBypass */);
8272
8273 return {completeLoopSkeleton(Lp, OrigLoopID), EPResumeVal};
8274 }
8275
8276 BasicBlock *
8277 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
8278 Loop *L, BasicBlock *Bypass, BasicBlock *Insert) {
8279
8280 assert(EPI.TripCount &&
8281 "Expected trip count to have been saved in the first pass.");
8282 assert(
8283 (!isa<Instruction>(EPI.TripCount) ||
8284 DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
8285 "saved trip count does not dominate insertion point.");
8286 Value *TC = EPI.TripCount;
8287 IRBuilder<> Builder(Insert->getTerminator());
8288 Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");
8289
8290 // Generate code to check if the loop's trip count is less than VF * UF of the
8291 // vector epilogue loop.
8292 auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF) ?
8293 ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
8294
8295 Value *CheckMinIters =
8296 Builder.CreateICmp(P, Count,
8297 createStepForVF(Builder, Count->getType(),
8298 EPI.EpilogueVF, EPI.EpilogueUF),
8299 "min.epilog.iters.check");
8300
8301 ReplaceInstWithInst(
8302 Insert->getTerminator(),
8303 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
8304
8305 LoopBypassBlocks.push_back(Insert);
8306 return Insert;
8307 }
8308
8309 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() {
8310 LLVM_DEBUG({
8311 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
8312 << "Epilogue Loop VF:" << EPI.EpilogueVF
8313 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
8314 });
8315 }
8316
8317 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
8318 DEBUG_WITH_TYPE(VerboseDebug, {
8319 dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n";
8320 });
8321 }
8322
8323 bool LoopVectorizationPlanner::getDecisionAndClampRange(
8324 const std::function<bool(ElementCount)> &Predicate, VFRange &Range) {
8325 assert(!Range.isEmpty() && "Trying to test an empty VF range.");
8326 bool PredicateAtRangeStart = Predicate(Range.Start);
8327
8328 for (ElementCount TmpVF = Range.Start * 2;
8329 ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2)
8330 if (Predicate(TmpVF) != PredicateAtRangeStart) {
8331 Range.End = TmpVF;
8332 break;
8333 }
8334
8335 return PredicateAtRangeStart;
8336 }
8337
8338 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
8339 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
8340 /// of VF's starting at a given VF and extending it as much as possible. Each
8341 /// vectorization decision can potentially shorten this sub-range during
8342 /// buildVPlan().
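/// For example (illustrative): with MinVF = 2 and MaxVF = 16 this might build
/// one VPlan covering VFs {2, 4} and a second covering {8, 16}, depending on
/// where the per-VF decisions change within the range.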
8343 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF, 8344 ElementCount MaxVF) { 8345 auto MaxVFPlusOne = MaxVF.getWithIncrement(1); 8346 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { 8347 VFRange SubRange = {VF, MaxVFPlusOne}; 8348 VPlans.push_back(buildVPlan(SubRange)); 8349 VF = SubRange.End; 8350 } 8351 } 8352 8353 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, 8354 VPlanPtr &Plan) { 8355 assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); 8356 8357 // Look for cached value. 8358 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 8359 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); 8360 if (ECEntryIt != EdgeMaskCache.end()) 8361 return ECEntryIt->second; 8362 8363 VPValue *SrcMask = createBlockInMask(Src, Plan); 8364 8365 // The terminator has to be a branch inst! 8366 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 8367 assert(BI && "Unexpected terminator found"); 8368 8369 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) 8370 return EdgeMaskCache[Edge] = SrcMask; 8371 8372 // If source is an exiting block, we know the exit edge is dynamically dead 8373 // in the vector loop, and thus we don't need to restrict the mask. Avoid 8374 // adding uses of an otherwise potentially dead instruction. 8375 if (OrigLoop->isLoopExiting(Src)) 8376 return EdgeMaskCache[Edge] = SrcMask; 8377 8378 VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition()); 8379 assert(EdgeMask && "No Edge Mask found for condition"); 8380 8381 if (BI->getSuccessor(0) != Dst) 8382 EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc()); 8383 8384 if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND. 8385 // The condition is 'SrcMask && EdgeMask', which is equivalent to 8386 // 'select i1 SrcMask, i1 EdgeMask, i1 false'. 8387 // The select version does not introduce new UB if SrcMask is false and 8388 // EdgeMask is poison. Using 'and' here introduces undefined behavior. 8389 VPValue *False = Plan->getOrAddVPValue( 8390 ConstantInt::getFalse(BI->getCondition()->getType())); 8391 EdgeMask = 8392 Builder.createSelect(SrcMask, EdgeMask, False, BI->getDebugLoc()); 8393 } 8394 8395 return EdgeMaskCache[Edge] = EdgeMask; 8396 } 8397 8398 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { 8399 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 8400 8401 // Look for cached value. 8402 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); 8403 if (BCEntryIt != BlockMaskCache.end()) 8404 return BCEntryIt->second; 8405 8406 // All-one mask is modelled as no-mask following the convention for masked 8407 // load/store/gather/scatter. Initialize BlockMask to no-mask. 8408 VPValue *BlockMask = nullptr; 8409 8410 if (OrigLoop->getHeader() == BB) { 8411 if (!CM.blockNeedsPredicationForAnyReason(BB)) 8412 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. 8413 8414 // Introduce the early-exit compare IV <= BTC to form header block mask. 8415 // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by 8416 // constructing the desired canonical IV in the header block as its first 8417 // non-phi instructions. 
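    // The resulting header mask is either an active.lane.mask computation or a
    // wide compare of the widened canonical IV against the broadcast
    // backedge-taken count, e.g. (illustrative, VF=4):
    //   %mask = icmp ule <4 x i64> %wide.iv, %broadcast.btc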
8418 assert(CM.foldTailByMasking() && "must fold the tail"); 8419 VPBasicBlock *HeaderVPBB = Plan->getEntry()->getEntryBasicBlock(); 8420 auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi(); 8421 auto *IV = new VPWidenCanonicalIVRecipe(Plan->getCanonicalIV()); 8422 HeaderVPBB->insert(IV, HeaderVPBB->getFirstNonPhi()); 8423 8424 VPBuilder::InsertPointGuard Guard(Builder); 8425 Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint); 8426 if (CM.TTI.emitGetActiveLaneMask()) { 8427 VPValue *TC = Plan->getOrCreateTripCount(); 8428 BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC}); 8429 } else { 8430 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); 8431 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); 8432 } 8433 return BlockMaskCache[BB] = BlockMask; 8434 } 8435 8436 // This is the block mask. We OR all incoming edges. 8437 for (auto *Predecessor : predecessors(BB)) { 8438 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); 8439 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too. 8440 return BlockMaskCache[BB] = EdgeMask; 8441 8442 if (!BlockMask) { // BlockMask has its initialized nullptr value. 8443 BlockMask = EdgeMask; 8444 continue; 8445 } 8446 8447 BlockMask = Builder.createOr(BlockMask, EdgeMask, {}); 8448 } 8449 8450 return BlockMaskCache[BB] = BlockMask; 8451 } 8452 8453 VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, 8454 ArrayRef<VPValue *> Operands, 8455 VFRange &Range, 8456 VPlanPtr &Plan) { 8457 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 8458 "Must be called with either a load or store"); 8459 8460 auto willWiden = [&](ElementCount VF) -> bool { 8461 if (VF.isScalar()) 8462 return false; 8463 LoopVectorizationCostModel::InstWidening Decision = 8464 CM.getWideningDecision(I, VF); 8465 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 8466 "CM decision should be taken at this point."); 8467 if (Decision == LoopVectorizationCostModel::CM_Interleave) 8468 return true; 8469 if (CM.isScalarAfterVectorization(I, VF) || 8470 CM.isProfitableToScalarize(I, VF)) 8471 return false; 8472 return Decision != LoopVectorizationCostModel::CM_Scalarize; 8473 }; 8474 8475 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8476 return nullptr; 8477 8478 VPValue *Mask = nullptr; 8479 if (Legal->isMaskRequired(I)) 8480 Mask = createBlockInMask(I->getParent(), Plan); 8481 8482 // Determine if the pointer operand of the access is either consecutive or 8483 // reverse consecutive. 8484 LoopVectorizationCostModel::InstWidening Decision = 8485 CM.getWideningDecision(I, Range.Start); 8486 bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse; 8487 bool Consecutive = 8488 Reverse || Decision == LoopVectorizationCostModel::CM_Widen; 8489 8490 if (LoadInst *Load = dyn_cast<LoadInst>(I)) 8491 return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask, 8492 Consecutive, Reverse); 8493 8494 StoreInst *Store = cast<StoreInst>(I); 8495 return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0], 8496 Mask, Consecutive, Reverse); 8497 } 8498 8499 VPWidenIntOrFpInductionRecipe * 8500 VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi, 8501 ArrayRef<VPValue *> Operands) const { 8502 // Check if this is an integer or fp induction. If so, build the recipe that 8503 // produces its scalar and vector values. 
8504 if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi)) { 8505 assert(II->getStartValue() == 8506 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); 8507 return new VPWidenIntOrFpInductionRecipe(Phi, Operands[0], *II); 8508 } 8509 8510 return nullptr; 8511 } 8512 8513 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate( 8514 TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range, 8515 VPlan &Plan) const { 8516 // Optimize the special case where the source is a constant integer 8517 // induction variable. Notice that we can only optimize the 'trunc' case 8518 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and 8519 // (c) other casts depend on pointer size. 8520 8521 // Determine whether \p K is a truncation based on an induction variable that 8522 // can be optimized. 8523 auto isOptimizableIVTruncate = 8524 [&](Instruction *K) -> std::function<bool(ElementCount)> { 8525 return [=](ElementCount VF) -> bool { 8526 return CM.isOptimizableIVTruncate(K, VF); 8527 }; 8528 }; 8529 8530 if (LoopVectorizationPlanner::getDecisionAndClampRange( 8531 isOptimizableIVTruncate(I), Range)) { 8532 8533 auto *Phi = cast<PHINode>(I->getOperand(0)); 8534 const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi); 8535 VPValue *Start = Plan.getOrAddVPValue(II.getStartValue()); 8536 return new VPWidenIntOrFpInductionRecipe(Phi, Start, II, I); 8537 } 8538 return nullptr; 8539 } 8540 8541 VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi, 8542 ArrayRef<VPValue *> Operands, 8543 VPlanPtr &Plan) { 8544 // If all incoming values are equal, the incoming VPValue can be used directly 8545 // instead of creating a new VPBlendRecipe. 8546 VPValue *FirstIncoming = Operands[0]; 8547 if (all_of(Operands, [FirstIncoming](const VPValue *Inc) { 8548 return FirstIncoming == Inc; 8549 })) { 8550 return Operands[0]; 8551 } 8552 8553 // We know that all PHIs in non-header blocks are converted into selects, so 8554 // we don't have to worry about the insertion order and we can just use the 8555 // builder. At this point we generate the predication tree. There may be 8556 // duplications since this is a simple recursive scan, but future 8557 // optimizations will clean it up. 
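  // For example (illustrative): a phi with incoming values (a, b) and edge
  // masks (m0, m1) becomes a VPBlendRecipe with operands (a, m0, b, m1), which
  // VPBlendRecipe::execute later lowers to a chain of selects.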
8558 SmallVector<VPValue *, 2> OperandsWithMask; 8559 unsigned NumIncoming = Phi->getNumIncomingValues(); 8560 8561 for (unsigned In = 0; In < NumIncoming; In++) { 8562 VPValue *EdgeMask = 8563 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan); 8564 assert((EdgeMask || NumIncoming == 1) && 8565 "Multiple predecessors with one having a full mask"); 8566 OperandsWithMask.push_back(Operands[In]); 8567 if (EdgeMask) 8568 OperandsWithMask.push_back(EdgeMask); 8569 } 8570 return toVPRecipeResult(new VPBlendRecipe(Phi, OperandsWithMask)); 8571 } 8572 8573 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, 8574 ArrayRef<VPValue *> Operands, 8575 VFRange &Range) const { 8576 8577 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8578 [this, CI](ElementCount VF) { 8579 return CM.isScalarWithPredication(CI, VF); 8580 }, 8581 Range); 8582 8583 if (IsPredicated) 8584 return nullptr; 8585 8586 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8587 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || 8588 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect || 8589 ID == Intrinsic::pseudoprobe || 8590 ID == Intrinsic::experimental_noalias_scope_decl)) 8591 return nullptr; 8592 8593 auto willWiden = [&](ElementCount VF) -> bool { 8594 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8595 // The following case may be scalarized depending on the VF. 8596 // The flag shows whether we use Intrinsic or a usual Call for vectorized 8597 // version of the instruction. 8598 // Is it beneficial to perform intrinsic call compared to lib call? 8599 bool NeedToScalarize = false; 8600 InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); 8601 InstructionCost IntrinsicCost = ID ? CM.getVectorIntrinsicCost(CI, VF) : 0; 8602 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 8603 return UseVectorIntrinsic || !NeedToScalarize; 8604 }; 8605 8606 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8607 return nullptr; 8608 8609 ArrayRef<VPValue *> Ops = Operands.take_front(CI->arg_size()); 8610 return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end())); 8611 } 8612 8613 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { 8614 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) && 8615 !isa<StoreInst>(I) && "Instruction should have been handled earlier"); 8616 // Instruction should be widened, unless it is scalar after vectorization, 8617 // scalarization is profitable or it is predicated. 
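  // In other words, widen I only if, over the (possibly clamped) range, the
  // cost model does not prefer to scalarize or predicate it.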
8618 auto WillScalarize = [this, I](ElementCount VF) -> bool { 8619 return CM.isScalarAfterVectorization(I, VF) || 8620 CM.isProfitableToScalarize(I, VF) || 8621 CM.isScalarWithPredication(I, VF); 8622 }; 8623 return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize, 8624 Range); 8625 } 8626 8627 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, 8628 ArrayRef<VPValue *> Operands) const { 8629 auto IsVectorizableOpcode = [](unsigned Opcode) { 8630 switch (Opcode) { 8631 case Instruction::Add: 8632 case Instruction::And: 8633 case Instruction::AShr: 8634 case Instruction::BitCast: 8635 case Instruction::FAdd: 8636 case Instruction::FCmp: 8637 case Instruction::FDiv: 8638 case Instruction::FMul: 8639 case Instruction::FNeg: 8640 case Instruction::FPExt: 8641 case Instruction::FPToSI: 8642 case Instruction::FPToUI: 8643 case Instruction::FPTrunc: 8644 case Instruction::FRem: 8645 case Instruction::FSub: 8646 case Instruction::ICmp: 8647 case Instruction::IntToPtr: 8648 case Instruction::LShr: 8649 case Instruction::Mul: 8650 case Instruction::Or: 8651 case Instruction::PtrToInt: 8652 case Instruction::SDiv: 8653 case Instruction::Select: 8654 case Instruction::SExt: 8655 case Instruction::Shl: 8656 case Instruction::SIToFP: 8657 case Instruction::SRem: 8658 case Instruction::Sub: 8659 case Instruction::Trunc: 8660 case Instruction::UDiv: 8661 case Instruction::UIToFP: 8662 case Instruction::URem: 8663 case Instruction::Xor: 8664 case Instruction::ZExt: 8665 return true; 8666 } 8667 return false; 8668 }; 8669 8670 if (!IsVectorizableOpcode(I->getOpcode())) 8671 return nullptr; 8672 8673 // Success: widen this instruction. 8674 return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end())); 8675 } 8676 8677 void VPRecipeBuilder::fixHeaderPhis() { 8678 BasicBlock *OrigLatch = OrigLoop->getLoopLatch(); 8679 for (VPHeaderPHIRecipe *R : PhisToFix) { 8680 auto *PN = cast<PHINode>(R->getUnderlyingValue()); 8681 VPRecipeBase *IncR = 8682 getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch))); 8683 R->addOperand(IncR->getVPSingleValue()); 8684 } 8685 } 8686 8687 VPBasicBlock *VPRecipeBuilder::handleReplication( 8688 Instruction *I, VFRange &Range, VPBasicBlock *VPBB, 8689 VPlanPtr &Plan) { 8690 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( 8691 [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); }, 8692 Range); 8693 8694 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8695 [&](ElementCount VF) { return CM.isPredicatedInst(I, VF, IsUniform); }, 8696 Range); 8697 8698 // Even if the instruction is not marked as uniform, there are certain 8699 // intrinsic calls that can be effectively treated as such, so we check for 8700 // them here. Conservatively, we only do this for scalable vectors, since 8701 // for fixed-width VFs we can always fall back on full scalarization. 8702 if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) { 8703 switch (cast<IntrinsicInst>(I)->getIntrinsicID()) { 8704 case Intrinsic::assume: 8705 case Intrinsic::lifetime_start: 8706 case Intrinsic::lifetime_end: 8707 // For scalable vectors if one of the operands is variant then we still 8708 // want to mark as uniform, which will generate one instruction for just 8709 // the first lane of the vector. We can't scalarize the call in the same 8710 // way as for fixed-width vectors because we don't know how many lanes 8711 // there are. 
8712 //
8713 // The reasons for doing it this way for scalable vectors are:
8714 // 1. For the assume intrinsic, generating the instruction for the first
8715 // lane is still better than not generating any at all. For
8716 // example, the input may be a splat across all lanes.
8717 // 2. For the lifetime start/end intrinsics the pointer operand only
8718 // does anything useful when the input comes from a stack object,
8719 // which suggests it should always be uniform. For non-stack objects
8720 // the effect is to poison the object, which still allows us to
8721 // remove the call.
8722 IsUniform = true;
8723 break;
8724 default:
8725 break;
8726 }
8727 }
8728
8729 auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
8730 IsUniform, IsPredicated);
8731 setRecipe(I, Recipe);
8732 Plan->addVPValue(I, Recipe);
8733
8734 // Find if I uses a predicated instruction. If so, it will use its scalar
8735 // value. Avoid hoisting the insert-element which packs the scalar value into
8736 // a vector value, as that happens iff all users use the vector value.
8737 for (VPValue *Op : Recipe->operands()) {
8738 auto *PredR = dyn_cast_or_null<VPPredInstPHIRecipe>(Op->getDef());
8739 if (!PredR)
8740 continue;
8741 auto *RepR =
8742 cast_or_null<VPReplicateRecipe>(PredR->getOperand(0)->getDef());
8743 assert(RepR->isPredicated() &&
8744 "expected Replicate recipe to be predicated");
8745 RepR->setAlsoPack(false);
8746 }
8747
8748 // Finalize the recipe for Instr, first if it is not predicated.
8749 if (!IsPredicated) {
8750 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
8751 VPBB->appendRecipe(Recipe);
8752 return VPBB;
8753 }
8754 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
8755
8756 VPBlockBase *SingleSucc = VPBB->getSingleSuccessor();
8757 assert(SingleSucc && "VPBB must have a single successor when handling "
8758 "predicated replication.");
8759 VPBlockUtils::disconnectBlocks(VPBB, SingleSucc);
8760 // Record predicated instructions for above packing optimizations.
8761 VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
8762 VPBlockUtils::insertBlockAfter(Region, VPBB);
8763 auto *RegSucc = new VPBasicBlock();
8764 VPBlockUtils::insertBlockAfter(RegSucc, Region);
8765 VPBlockUtils::connectBlocks(RegSucc, SingleSucc);
8766 return RegSucc;
8767 }
8768
8769 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
8770 VPRecipeBase *PredRecipe,
8771 VPlanPtr &Plan) {
8772 // Instructions marked for predication are replicated and placed under an
8773 // if-then construct to prevent side-effects.
8774
8775 // Generate recipes to compute the block mask for this region.
8776 VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
8777
8778 // Build the triangular if-then region.
8779 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
8780 assert(Instr->getParent() && "Predicated instruction not in any basic block");
8781 auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
8782 auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
8783 auto *PHIRecipe = Instr->getType()->isVoidTy()
8784 ?
nullptr 8785 : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr)); 8786 if (PHIRecipe) { 8787 Plan->removeVPValueFor(Instr); 8788 Plan->addVPValue(Instr, PHIRecipe); 8789 } 8790 auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); 8791 auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); 8792 VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true); 8793 8794 // Note: first set Entry as region entry and then connect successors starting 8795 // from it in order, to propagate the "parent" of each VPBasicBlock. 8796 VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry); 8797 VPBlockUtils::connectBlocks(Pred, Exit); 8798 8799 return Region; 8800 } 8801 8802 VPRecipeOrVPValueTy 8803 VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, 8804 ArrayRef<VPValue *> Operands, 8805 VFRange &Range, VPlanPtr &Plan) { 8806 // First, check for specific widening recipes that deal with calls, memory 8807 // operations, inductions and Phi nodes. 8808 if (auto *CI = dyn_cast<CallInst>(Instr)) 8809 return toVPRecipeResult(tryToWidenCall(CI, Operands, Range)); 8810 8811 if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr)) 8812 return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan)); 8813 8814 VPRecipeBase *Recipe; 8815 if (auto Phi = dyn_cast<PHINode>(Instr)) { 8816 if (Phi->getParent() != OrigLoop->getHeader()) 8817 return tryToBlend(Phi, Operands, Plan); 8818 if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands))) 8819 return toVPRecipeResult(Recipe); 8820 8821 VPHeaderPHIRecipe *PhiRecipe = nullptr; 8822 if (Legal->isReductionVariable(Phi) || Legal->isFirstOrderRecurrence(Phi)) { 8823 VPValue *StartV = Operands[0]; 8824 if (Legal->isReductionVariable(Phi)) { 8825 const RecurrenceDescriptor &RdxDesc = 8826 Legal->getReductionVars().find(Phi)->second; 8827 assert(RdxDesc.getRecurrenceStartValue() == 8828 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); 8829 PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV, 8830 CM.isInLoopReduction(Phi), 8831 CM.useOrderedReductions(RdxDesc)); 8832 } else { 8833 PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV); 8834 } 8835 8836 // Record the incoming value from the backedge, so we can add the incoming 8837 // value from the backedge after all recipes have been created. 8838 recordRecipeOf(cast<Instruction>( 8839 Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch()))); 8840 PhisToFix.push_back(PhiRecipe); 8841 } else { 8842 // TODO: record backedge value for remaining pointer induction phis. 
8843 assert(Phi->getType()->isPointerTy() &&
8844 "only pointer phis should be handled here");
8845 assert(Legal->getInductionVars().count(Phi) &&
8846 "Not an induction variable");
8847 InductionDescriptor II = Legal->getInductionVars().lookup(Phi);
8848 VPValue *Start = Plan->getOrAddVPValue(II.getStartValue());
8849 PhiRecipe = new VPWidenPHIRecipe(Phi, Start);
8850 }
8851
8852 return toVPRecipeResult(PhiRecipe);
8853 }
8854
8855 if (isa<TruncInst>(Instr) &&
8856 (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Operands,
8857 Range, *Plan)))
8858 return toVPRecipeResult(Recipe);
8859
8860 if (!shouldWiden(Instr, Range))
8861 return nullptr;
8862
8863 if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
8864 return toVPRecipeResult(new VPWidenGEPRecipe(
8865 GEP, make_range(Operands.begin(), Operands.end()), OrigLoop));
8866
8867 if (auto *SI = dyn_cast<SelectInst>(Instr)) {
8868 bool InvariantCond =
8869 PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
8870 return toVPRecipeResult(new VPWidenSelectRecipe(
8871 *SI, make_range(Operands.begin(), Operands.end()), InvariantCond));
8872 }
8873
8874 return toVPRecipeResult(tryToWiden(Instr, Operands));
8875 }
8876
8877 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
8878 ElementCount MaxVF) {
8879 assert(OrigLoop->isInnermost() && "Inner loop expected.");
8880
8881 // Collect instructions from the original loop that will become trivially dead
8882 // in the vectorized loop. We don't need to vectorize these instructions. For
8883 // example, original induction update instructions can become dead because we
8884 // separately emit induction "steps" when generating code for the new loop.
8885 // Similarly, we create a new latch condition when setting up the structure
8886 // of the new loop, so the old one can become dead.
8887 SmallPtrSet<Instruction *, 4> DeadInstructions;
8888 collectTriviallyDeadInstructions(DeadInstructions);
8889
8890 // Add assume instructions we need to drop to DeadInstructions, to prevent
8891 // them from being added to the VPlan.
8892 // TODO: We only need to drop assumes in blocks that get flattened. If the
8893 // control flow is preserved, we should keep them.
8894 auto &ConditionalAssumes = Legal->getConditionalAssumes();
8895 DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
8896
8897 MapVector<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
8898 // Dead instructions do not need sinking. Remove them from SinkAfter.
8899 for (Instruction *I : DeadInstructions)
8900 SinkAfter.erase(I);
8901
8902 // Cannot sink instructions after dead instructions (there won't be any
8903 // recipes for them). Instead, find the first non-dead previous instruction.
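  // For example (illustrative): if the recorded sink target is an induction
  // update that became trivially dead, walk backwards from it and use the
  // closest preceding live instruction as the new sink target.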
8904 for (auto &P : Legal->getSinkAfter()) { 8905 Instruction *SinkTarget = P.second; 8906 Instruction *FirstInst = &*SinkTarget->getParent()->begin(); 8907 (void)FirstInst; 8908 while (DeadInstructions.contains(SinkTarget)) { 8909 assert( 8910 SinkTarget != FirstInst && 8911 "Must find a live instruction (at least the one feeding the " 8912 "first-order recurrence PHI) before reaching beginning of the block"); 8913 SinkTarget = SinkTarget->getPrevNode(); 8914 assert(SinkTarget != P.first && 8915 "sink source equals target, no sinking required"); 8916 } 8917 P.second = SinkTarget; 8918 } 8919 8920 auto MaxVFPlusOne = MaxVF.getWithIncrement(1); 8921 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { 8922 VFRange SubRange = {VF, MaxVFPlusOne}; 8923 VPlans.push_back( 8924 buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter)); 8925 VF = SubRange.End; 8926 } 8927 } 8928 8929 // Add a VPCanonicalIVPHIRecipe starting at 0 to the header, a 8930 // CanonicalIVIncrement{NUW} VPInstruction to increment it by VF * UF and a 8931 // BranchOnCount VPInstruction to the latch. 8932 static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL, 8933 bool HasNUW, bool IsVPlanNative) { 8934 Value *StartIdx = ConstantInt::get(IdxTy, 0); 8935 auto *StartV = Plan.getOrAddVPValue(StartIdx); 8936 8937 auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL); 8938 VPRegionBlock *TopRegion = Plan.getVectorLoopRegion(); 8939 VPBasicBlock *Header = TopRegion->getEntryBasicBlock(); 8940 if (IsVPlanNative) 8941 Header = cast<VPBasicBlock>(Header->getSingleSuccessor()); 8942 Header->insert(CanonicalIVPHI, Header->begin()); 8943 8944 auto *CanonicalIVIncrement = 8945 new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementNUW 8946 : VPInstruction::CanonicalIVIncrement, 8947 {CanonicalIVPHI}, DL); 8948 CanonicalIVPHI->addOperand(CanonicalIVIncrement); 8949 8950 VPBasicBlock *EB = TopRegion->getExitBasicBlock(); 8951 if (IsVPlanNative) { 8952 EB = cast<VPBasicBlock>(EB->getSinglePredecessor()); 8953 EB->setCondBit(nullptr); 8954 } 8955 EB->appendRecipe(CanonicalIVIncrement); 8956 8957 auto *BranchOnCount = 8958 new VPInstruction(VPInstruction::BranchOnCount, 8959 {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL); 8960 EB->appendRecipe(BranchOnCount); 8961 } 8962 8963 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( 8964 VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions, 8965 const MapVector<Instruction *, Instruction *> &SinkAfter) { 8966 8967 SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups; 8968 8969 VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder); 8970 8971 // --------------------------------------------------------------------------- 8972 // Pre-construction: record ingredients whose recipes we'll need to further 8973 // process after constructing the initial VPlan. 8974 // --------------------------------------------------------------------------- 8975 8976 // Mark instructions we'll need to sink later and their targets as 8977 // ingredients whose recipe we'll need to record. 
8978 for (auto &Entry : SinkAfter) {
8979 RecipeBuilder.recordRecipeOf(Entry.first);
8980 RecipeBuilder.recordRecipeOf(Entry.second);
8981 }
8982 for (auto &Reduction : CM.getInLoopReductionChains()) {
8983 PHINode *Phi = Reduction.first;
8984 RecurKind Kind =
8985 Legal->getReductionVars().find(Phi)->second.getRecurrenceKind();
8986 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
8987
8988 RecipeBuilder.recordRecipeOf(Phi);
8989 for (auto &R : ReductionOperations) {
8990 RecipeBuilder.recordRecipeOf(R);
8991 // For min/max reductions, where we have a pair of icmp/select, we also
8992 // need to record the ICmp recipe, so it can be removed later.
8993 assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
8994 "Only min/max recurrences allowed for inloop reductions");
8995 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
8996 RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0)));
8997 }
8998 }
8999
9000 // For each interleave group which is relevant for this (possibly trimmed)
9001 // Range, add it to the set of groups to be later applied to the VPlan and add
9002 // placeholders for its members' Recipes which we'll be replacing with a
9003 // single VPInterleaveRecipe.
9004 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
9005 auto applyIG = [IG, this](ElementCount VF) -> bool {
9006 return (VF.isVector() && // Query is illegal for VF == 1
9007 CM.getWideningDecision(IG->getInsertPos(), VF) ==
9008 LoopVectorizationCostModel::CM_Interleave);
9009 };
9010 if (!getDecisionAndClampRange(applyIG, Range))
9011 continue;
9012 InterleaveGroups.insert(IG);
9013 for (unsigned i = 0; i < IG->getFactor(); i++)
9014 if (Instruction *Member = IG->getMember(i))
9015 RecipeBuilder.recordRecipeOf(Member);
9016 }
9017
9018 // ---------------------------------------------------------------------------
9019 // Build initial VPlan: Scan the body of the loop in a topological order to
9020 // visit each basic block after having visited its predecessor basic blocks.
9021 // ---------------------------------------------------------------------------
9022
9023 // Create initial VPlan skeleton, with separate header and latch blocks.
9024 VPBasicBlock *HeaderVPBB = new VPBasicBlock();
9025 VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch");
9026 VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB);
9027 auto *TopRegion = new VPRegionBlock(HeaderVPBB, LatchVPBB, "vector loop");
9028 auto Plan = std::make_unique<VPlan>(TopRegion);
9029
9030 Instruction *DLInst =
9031 getDebugLocFromInstOrOperands(Legal->getPrimaryInduction());
9032 addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(),
9033 DLInst ? DLInst->getDebugLoc() : DebugLoc(),
9034 !CM.foldTailByMasking(), false);
9035
9036 // Scan the body of the loop in a topological order to visit each basic block
9037 // after having visited its predecessor basic blocks.
9038 LoopBlocksDFS DFS(OrigLoop);
9039 DFS.perform(LI);
9040
9041 VPBasicBlock *VPBB = HeaderVPBB;
9042 SmallVector<VPWidenIntOrFpInductionRecipe *> InductionsToMove;
9043 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
9044 // Relevant instructions from basic block BB will be grouped into VPRecipe
9045 // ingredients and fill a new VPBasicBlock.
9046 unsigned VPBBsForBB = 0;
9047 VPBB->setName(BB->getName());
9048 Builder.setInsertPoint(VPBB);
9049
9050 // Introduce each ingredient into VPlan.
9051 // TODO: Model and preserve debug intrinsics in VPlan.
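    // For each ingredient, specific recipes are tried first (calls, memory
    // operations, induction phis and truncates, blends), then generic
    // widening, and finally (possibly predicated) replication as the fallback.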
9052 for (Instruction &I : BB->instructionsWithoutDebug()) { 9053 Instruction *Instr = &I; 9054 9055 // First filter out irrelevant instructions, to ensure no recipes are 9056 // built for them. 9057 if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr)) 9058 continue; 9059 9060 SmallVector<VPValue *, 4> Operands; 9061 auto *Phi = dyn_cast<PHINode>(Instr); 9062 if (Phi && Phi->getParent() == OrigLoop->getHeader()) { 9063 Operands.push_back(Plan->getOrAddVPValue( 9064 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()))); 9065 } else { 9066 auto OpRange = Plan->mapToVPValues(Instr->operands()); 9067 Operands = {OpRange.begin(), OpRange.end()}; 9068 } 9069 if (auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe( 9070 Instr, Operands, Range, Plan)) { 9071 // If Instr can be simplified to an existing VPValue, use it. 9072 if (RecipeOrValue.is<VPValue *>()) { 9073 auto *VPV = RecipeOrValue.get<VPValue *>(); 9074 Plan->addVPValue(Instr, VPV); 9075 // If the re-used value is a recipe, register the recipe for the 9076 // instruction, in case the recipe for Instr needs to be recorded. 9077 if (auto *R = dyn_cast_or_null<VPRecipeBase>(VPV->getDef())) 9078 RecipeBuilder.setRecipe(Instr, R); 9079 continue; 9080 } 9081 // Otherwise, add the new recipe. 9082 VPRecipeBase *Recipe = RecipeOrValue.get<VPRecipeBase *>(); 9083 for (auto *Def : Recipe->definedValues()) { 9084 auto *UV = Def->getUnderlyingValue(); 9085 Plan->addVPValue(UV, Def); 9086 } 9087 9088 if (isa<VPWidenIntOrFpInductionRecipe>(Recipe) && 9089 HeaderVPBB->getFirstNonPhi() != VPBB->end()) { 9090 // Keep track of VPWidenIntOrFpInductionRecipes not in the phi section 9091 // of the header block. That can happen for truncates of induction 9092 // variables. Those recipes are moved to the phi section of the header 9093 // block after applying SinkAfter, which relies on the original 9094 // position of the trunc. 9095 assert(isa<TruncInst>(Instr)); 9096 InductionsToMove.push_back( 9097 cast<VPWidenIntOrFpInductionRecipe>(Recipe)); 9098 } 9099 RecipeBuilder.setRecipe(Instr, Recipe); 9100 VPBB->appendRecipe(Recipe); 9101 continue; 9102 } 9103 9104 // Otherwise, if all widening options failed, Instruction is to be 9105 // replicated. This may create a successor for VPBB. 9106 VPBasicBlock *NextVPBB = 9107 RecipeBuilder.handleReplication(Instr, Range, VPBB, Plan); 9108 if (NextVPBB != VPBB) { 9109 VPBB = NextVPBB; 9110 VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++) 9111 : ""); 9112 } 9113 } 9114 9115 VPBlockUtils::insertBlockAfter(new VPBasicBlock(), VPBB); 9116 VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor()); 9117 } 9118 9119 // Fold the last, empty block into its predecessor. 9120 VPBB = VPBlockUtils::tryToMergeBlockIntoPredecessor(VPBB); 9121 assert(VPBB && "expected to fold last (empty) block"); 9122 // After here, VPBB should not be used. 9123 VPBB = nullptr; 9124 9125 assert(isa<VPRegionBlock>(Plan->getEntry()) && 9126 !Plan->getEntry()->getEntryBasicBlock()->empty() && 9127 "entry block must be set to a VPRegionBlock having a non-empty entry " 9128 "VPBasicBlock"); 9129 RecipeBuilder.fixHeaderPhis(); 9130 9131 // --------------------------------------------------------------------------- 9132 // Transform initial VPlan: Apply previously taken decisions, in order, to 9133 // bring the VPlan to its final state. 9134 // --------------------------------------------------------------------------- 9135 9136 // Apply Sink-After legal constraints. 
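  // These pairs were recorded by Legality for first-order recurrences: each
  // 'sink' recipe must be moved to execute after its 'target' recipe, moving
  // whole replicate regions when either side is predicated.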
9137 auto GetReplicateRegion = [](VPRecipeBase *R) -> VPRegionBlock * { 9138 auto *Region = dyn_cast_or_null<VPRegionBlock>(R->getParent()->getParent()); 9139 if (Region && Region->isReplicator()) { 9140 assert(Region->getNumSuccessors() == 1 && 9141 Region->getNumPredecessors() == 1 && "Expected SESE region!"); 9142 assert(R->getParent()->size() == 1 && 9143 "A recipe in an original replicator region must be the only " 9144 "recipe in its block"); 9145 return Region; 9146 } 9147 return nullptr; 9148 }; 9149 for (auto &Entry : SinkAfter) { 9150 VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first); 9151 VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second); 9152 9153 auto *TargetRegion = GetReplicateRegion(Target); 9154 auto *SinkRegion = GetReplicateRegion(Sink); 9155 if (!SinkRegion) { 9156 // If the sink source is not a replicate region, sink the recipe directly. 9157 if (TargetRegion) { 9158 // The target is in a replication region, make sure to move Sink to 9159 // the block after it, not into the replication region itself. 9160 VPBasicBlock *NextBlock = 9161 cast<VPBasicBlock>(TargetRegion->getSuccessors().front()); 9162 Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi()); 9163 } else 9164 Sink->moveAfter(Target); 9165 continue; 9166 } 9167 9168 // The sink source is in a replicate region. Unhook the region from the CFG. 9169 auto *SinkPred = SinkRegion->getSinglePredecessor(); 9170 auto *SinkSucc = SinkRegion->getSingleSuccessor(); 9171 VPBlockUtils::disconnectBlocks(SinkPred, SinkRegion); 9172 VPBlockUtils::disconnectBlocks(SinkRegion, SinkSucc); 9173 VPBlockUtils::connectBlocks(SinkPred, SinkSucc); 9174 9175 if (TargetRegion) { 9176 // The target recipe is also in a replicate region, move the sink region 9177 // after the target region. 9178 auto *TargetSucc = TargetRegion->getSingleSuccessor(); 9179 VPBlockUtils::disconnectBlocks(TargetRegion, TargetSucc); 9180 VPBlockUtils::connectBlocks(TargetRegion, SinkRegion); 9181 VPBlockUtils::connectBlocks(SinkRegion, TargetSucc); 9182 } else { 9183 // The sink source is in a replicate region, we need to move the whole 9184 // replicate region, which should only contain a single recipe in the 9185 // main block. 9186 auto *SplitBlock = 9187 Target->getParent()->splitAt(std::next(Target->getIterator())); 9188 9189 auto *SplitPred = SplitBlock->getSinglePredecessor(); 9190 9191 VPBlockUtils::disconnectBlocks(SplitPred, SplitBlock); 9192 VPBlockUtils::connectBlocks(SplitPred, SinkRegion); 9193 VPBlockUtils::connectBlocks(SinkRegion, SplitBlock); 9194 } 9195 } 9196 9197 VPlanTransforms::removeRedundantCanonicalIVs(*Plan); 9198 VPlanTransforms::removeRedundantInductionCasts(*Plan); 9199 9200 // Now that sink-after is done, move induction recipes for optimized truncates 9201 // to the phi section of the header block. 9202 for (VPWidenIntOrFpInductionRecipe *Ind : InductionsToMove) 9203 Ind->moveBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi()); 9204 9205 // Adjust the recipes for any inloop reductions. 9206 adjustRecipesForReductions(cast<VPBasicBlock>(TopRegion->getExit()), Plan, 9207 RecipeBuilder, Range.Start); 9208 9209 // Introduce a recipe to combine the incoming and previous values of a 9210 // first-order recurrence. 
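  // Conceptually (illustrative, VF=4) the splice produces
  //   shufflevector <4 x T> %previous, <4 x T> %current, <3, 4, 5, 6>
  // i.e. the last element of the previous vector iteration followed by the
  // first VF-1 elements of the current one.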
9211 for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) { 9212 auto *RecurPhi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R); 9213 if (!RecurPhi) 9214 continue; 9215 9216 VPRecipeBase *PrevRecipe = RecurPhi->getBackedgeRecipe(); 9217 VPBasicBlock *InsertBlock = PrevRecipe->getParent(); 9218 auto *Region = GetReplicateRegion(PrevRecipe); 9219 if (Region) 9220 InsertBlock = cast<VPBasicBlock>(Region->getSingleSuccessor()); 9221 if (Region || PrevRecipe->isPhi()) 9222 Builder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi()); 9223 else 9224 Builder.setInsertPoint(InsertBlock, std::next(PrevRecipe->getIterator())); 9225 9226 auto *RecurSplice = cast<VPInstruction>( 9227 Builder.createNaryOp(VPInstruction::FirstOrderRecurrenceSplice, 9228 {RecurPhi, RecurPhi->getBackedgeValue()})); 9229 9230 RecurPhi->replaceAllUsesWith(RecurSplice); 9231 // Set the first operand of RecurSplice to RecurPhi again, after replacing 9232 // all users. 9233 RecurSplice->setOperand(0, RecurPhi); 9234 } 9235 9236 // Interleave memory: for each Interleave Group we marked earlier as relevant 9237 // for this VPlan, replace the Recipes widening its memory instructions with a 9238 // single VPInterleaveRecipe at its insertion point. 9239 for (auto IG : InterleaveGroups) { 9240 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( 9241 RecipeBuilder.getRecipe(IG->getInsertPos())); 9242 SmallVector<VPValue *, 4> StoredValues; 9243 for (unsigned i = 0; i < IG->getFactor(); ++i) 9244 if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) { 9245 auto *StoreR = 9246 cast<VPWidenMemoryInstructionRecipe>(RecipeBuilder.getRecipe(SI)); 9247 StoredValues.push_back(StoreR->getStoredValue()); 9248 } 9249 9250 auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues, 9251 Recipe->getMask()); 9252 VPIG->insertBefore(Recipe); 9253 unsigned J = 0; 9254 for (unsigned i = 0; i < IG->getFactor(); ++i) 9255 if (Instruction *Member = IG->getMember(i)) { 9256 if (!Member->getType()->isVoidTy()) { 9257 VPValue *OriginalV = Plan->getVPValue(Member); 9258 Plan->removeVPValueFor(Member); 9259 Plan->addVPValue(Member, VPIG->getVPValue(J)); 9260 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J)); 9261 J++; 9262 } 9263 RecipeBuilder.getRecipe(Member)->eraseFromParent(); 9264 } 9265 } 9266 9267 // From this point onwards, VPlan-to-VPlan transformations may change the plan 9268 // in ways that accessing values using original IR values is incorrect. 9269 Plan->disableValue2VPValue(); 9270 9271 VPlanTransforms::sinkScalarOperands(*Plan); 9272 VPlanTransforms::mergeReplicateRegions(*Plan); 9273 9274 std::string PlanName; 9275 raw_string_ostream RSO(PlanName); 9276 ElementCount VF = Range.Start; 9277 Plan->addVF(VF); 9278 RSO << "Initial VPlan for VF={" << VF; 9279 for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) { 9280 Plan->addVF(VF); 9281 RSO << "," << VF; 9282 } 9283 RSO << "},UF>=1"; 9284 RSO.flush(); 9285 Plan->setName(PlanName); 9286 9287 // Fold Exit block into its predecessor if possible. 9288 // TODO: Fold block earlier once all VPlan transforms properly maintain a 9289 // VPBasicBlock as exit. 
9290 VPBlockUtils::tryToMergeBlockIntoPredecessor(TopRegion->getExit());
9291
9292 assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid");
9293 return Plan;
9294 }
9295
9296 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
9297 // Outer loop handling: outer loops may require CFG and instruction level
9298 // transformations before even evaluating whether vectorization is profitable.
9299 // Since we cannot modify the incoming IR, we need to build VPlan upfront in
9300 // the vectorization pipeline.
9301 assert(!OrigLoop->isInnermost());
9302 assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
9303
9304 // Create new empty VPlan
9305 auto Plan = std::make_unique<VPlan>();
9306
9307 // Build hierarchical CFG
9308 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
9309 HCFGBuilder.buildHierarchicalCFG();
9310
9311 for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End);
9312 VF *= 2)
9313 Plan->addVF(VF);
9314
9315 if (EnableVPlanPredication) {
9316 VPlanPredicator VPP(*Plan);
9317 VPP.predicate();
9318
9319 // Avoid running transformation to recipes until masked code generation in
9320 // VPlan-native path is in place.
9321 return Plan;
9322 }
9323
9324 SmallPtrSet<Instruction *, 1> DeadInstructions;
9325 VPlanTransforms::VPInstructionsToVPRecipes(
9326 OrigLoop, Plan,
9327 [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); },
9328 DeadInstructions, *PSE.getSE());
9329
9330 addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DebugLoc(),
9331 true, true);
9332 return Plan;
9333 }
9334
9335 // Adjust the recipes for reductions. For in-loop reductions the chain of
9336 // instructions leading from the loop exit instr to the phi needs to be converted
9337 // to reductions, with one operand being vector and the other being the scalar
9338 // reduction chain. For other reductions, a select is introduced between the phi
9339 // and live-out recipes when folding the tail.
9340 void LoopVectorizationPlanner::adjustRecipesForReductions(
9341 VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder,
9342 ElementCount MinVF) {
9343 for (auto &Reduction : CM.getInLoopReductionChains()) {
9344 PHINode *Phi = Reduction.first;
9345 const RecurrenceDescriptor &RdxDesc =
9346 Legal->getReductionVars().find(Phi)->second;
9347 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
9348
9349 if (MinVF.isScalar() && !CM.useOrderedReductions(RdxDesc))
9350 continue;
9351
9352 // ReductionOperations are ordered top-down from the phi's use to the
9353 // LoopExitValue. We keep track of the previous item (the Chain) to tell
9354 // which of the two operands will remain scalar and which will be reduced.
9355 // For minmax the chain will be the select instructions.
9356 Instruction *Chain = Phi;
9357 for (Instruction *R : ReductionOperations) {
9358 VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
9359 RecurKind Kind = RdxDesc.getRecurrenceKind();
9360
9361 VPValue *ChainOp = Plan->getVPValue(Chain);
9362 unsigned FirstOpId;
9363 assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
9364 "Only min/max recurrences allowed for inloop reductions");
9365 // Recognize a call to the llvm.fmuladd intrinsic.
9366 bool IsFMulAdd = (Kind == RecurKind::FMulAdd); 9367 assert((!IsFMulAdd || RecurrenceDescriptor::isFMulAddIntrinsic(R)) && 9368 "Expected instruction to be a call to the llvm.fmuladd intrinsic"); 9369 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9370 assert(isa<VPWidenSelectRecipe>(WidenRecipe) && 9371 "Expected to replace a VPWidenSelectSC"); 9372 FirstOpId = 1; 9373 } else { 9374 assert((MinVF.isScalar() || isa<VPWidenRecipe>(WidenRecipe) || 9375 (IsFMulAdd && isa<VPWidenCallRecipe>(WidenRecipe))) && 9376 "Expected to replace a VPWidenSC"); 9377 FirstOpId = 0; 9378 } 9379 unsigned VecOpId = 9380 R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId; 9381 VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId)); 9382 9383 auto *CondOp = CM.foldTailByMasking() 9384 ? RecipeBuilder.createBlockInMask(R->getParent(), Plan) 9385 : nullptr; 9386 9387 if (IsFMulAdd) { 9388 // If the instruction is a call to the llvm.fmuladd intrinsic then we 9389 // need to create an fmul recipe to use as the vector operand for the 9390 // fadd reduction. 9391 VPInstruction *FMulRecipe = new VPInstruction( 9392 Instruction::FMul, {VecOp, Plan->getVPValue(R->getOperand(1))}); 9393 FMulRecipe->setFastMathFlags(R->getFastMathFlags()); 9394 WidenRecipe->getParent()->insert(FMulRecipe, 9395 WidenRecipe->getIterator()); 9396 VecOp = FMulRecipe; 9397 } 9398 VPReductionRecipe *RedRecipe = 9399 new VPReductionRecipe(&RdxDesc, R, ChainOp, VecOp, CondOp, TTI); 9400 WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe); 9401 Plan->removeVPValueFor(R); 9402 Plan->addVPValue(R, RedRecipe); 9403 WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator()); 9404 WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe); 9405 WidenRecipe->eraseFromParent(); 9406 9407 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9408 VPRecipeBase *CompareRecipe = 9409 RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0))); 9410 assert(isa<VPWidenRecipe>(CompareRecipe) && 9411 "Expected to replace a VPWidenSC"); 9412 assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 && 9413 "Expected no remaining users"); 9414 CompareRecipe->eraseFromParent(); 9415 } 9416 Chain = R; 9417 } 9418 } 9419 9420 // If tail is folded by masking, introduce selects between the phi 9421 // and the live-out instruction of each reduction, at the beginning of the 9422 // dedicated latch block. 
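  // Conceptually (illustrative), each select has the form
  //   %rdx.sel = select <VF x i1> %header.mask, <VF x T> %rdx, <VF x T> %rdx.phi
  // so that lanes masked off by tail folding keep the value of the reduction
  // phi.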
9423 if (CM.foldTailByMasking()) { 9424 Builder.setInsertPoint(LatchVPBB, LatchVPBB->begin()); 9425 for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) { 9426 VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R); 9427 if (!PhiR || PhiR->isInLoop()) 9428 continue; 9429 VPValue *Cond = 9430 RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan); 9431 VPValue *Red = PhiR->getBackedgeValue(); 9432 assert(cast<VPRecipeBase>(Red->getDef())->getParent() != LatchVPBB && 9433 "reduction recipe must be defined before latch"); 9434 Builder.createNaryOp(Instruction::Select, {Cond, Red, PhiR}); 9435 } 9436 } 9437 } 9438 9439 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 9440 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, 9441 VPSlotTracker &SlotTracker) const { 9442 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; 9443 IG->getInsertPos()->printAsOperand(O, false); 9444 O << ", "; 9445 getAddr()->printAsOperand(O, SlotTracker); 9446 VPValue *Mask = getMask(); 9447 if (Mask) { 9448 O << ", "; 9449 Mask->printAsOperand(O, SlotTracker); 9450 } 9451 9452 unsigned OpIdx = 0; 9453 for (unsigned i = 0; i < IG->getFactor(); ++i) { 9454 if (!IG->getMember(i)) 9455 continue; 9456 if (getNumStoreOperands() > 0) { 9457 O << "\n" << Indent << " store "; 9458 getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker); 9459 O << " to index " << i; 9460 } else { 9461 O << "\n" << Indent << " "; 9462 getVPValue(OpIdx)->printAsOperand(O, SlotTracker); 9463 O << " = load from index " << i; 9464 } 9465 ++OpIdx; 9466 } 9467 } 9468 #endif 9469 9470 void VPWidenCallRecipe::execute(VPTransformState &State) { 9471 State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this, 9472 *this, State); 9473 } 9474 9475 void VPWidenSelectRecipe::execute(VPTransformState &State) { 9476 auto &I = *cast<SelectInst>(getUnderlyingInstr()); 9477 State.ILV->setDebugLocFromInst(&I); 9478 9479 // The condition can be loop invariant but still defined inside the 9480 // loop. This means that we can't just use the original 'cond' value. 9481 // We have to take the 'vectorized' value and pick the first lane. 9482 // Instcombine will make this a no-op. 9483 auto *InvarCond = 9484 InvariantCond ? State.get(getOperand(0), VPIteration(0, 0)) : nullptr; 9485 9486 for (unsigned Part = 0; Part < State.UF; ++Part) { 9487 Value *Cond = InvarCond ? 
InvarCond : State.get(getOperand(0), Part); 9488 Value *Op0 = State.get(getOperand(1), Part); 9489 Value *Op1 = State.get(getOperand(2), Part); 9490 Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1); 9491 State.set(this, Sel, Part); 9492 State.ILV->addMetadata(Sel, &I); 9493 } 9494 } 9495 9496 void VPWidenRecipe::execute(VPTransformState &State) { 9497 auto &I = *cast<Instruction>(getUnderlyingValue()); 9498 auto &Builder = State.Builder; 9499 switch (I.getOpcode()) { 9500 case Instruction::Call: 9501 case Instruction::Br: 9502 case Instruction::PHI: 9503 case Instruction::GetElementPtr: 9504 case Instruction::Select: 9505 llvm_unreachable("This instruction is handled by a different recipe."); 9506 case Instruction::UDiv: 9507 case Instruction::SDiv: 9508 case Instruction::SRem: 9509 case Instruction::URem: 9510 case Instruction::Add: 9511 case Instruction::FAdd: 9512 case Instruction::Sub: 9513 case Instruction::FSub: 9514 case Instruction::FNeg: 9515 case Instruction::Mul: 9516 case Instruction::FMul: 9517 case Instruction::FDiv: 9518 case Instruction::FRem: 9519 case Instruction::Shl: 9520 case Instruction::LShr: 9521 case Instruction::AShr: 9522 case Instruction::And: 9523 case Instruction::Or: 9524 case Instruction::Xor: { 9525 // Just widen unops and binops. 9526 State.ILV->setDebugLocFromInst(&I); 9527 9528 for (unsigned Part = 0; Part < State.UF; ++Part) { 9529 SmallVector<Value *, 2> Ops; 9530 for (VPValue *VPOp : operands()) 9531 Ops.push_back(State.get(VPOp, Part)); 9532 9533 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); 9534 9535 if (auto *VecOp = dyn_cast<Instruction>(V)) { 9536 VecOp->copyIRFlags(&I); 9537 9538 // If the instruction is vectorized and was in a basic block that needed 9539 // predication, we can't propagate poison-generating flags (nuw/nsw, 9540 // exact, etc.). The control flow has been linearized and the 9541 // instruction is no longer guarded by the predicate, which could make 9542 // the flag properties to no longer hold. 9543 if (State.MayGeneratePoisonRecipes.contains(this)) 9544 VecOp->dropPoisonGeneratingFlags(); 9545 } 9546 9547 // Use this vector value for all users of the original instruction. 9548 State.set(this, V, Part); 9549 State.ILV->addMetadata(V, &I); 9550 } 9551 9552 break; 9553 } 9554 case Instruction::ICmp: 9555 case Instruction::FCmp: { 9556 // Widen compares. Generate vector compares. 9557 bool FCmp = (I.getOpcode() == Instruction::FCmp); 9558 auto *Cmp = cast<CmpInst>(&I); 9559 State.ILV->setDebugLocFromInst(Cmp); 9560 for (unsigned Part = 0; Part < State.UF; ++Part) { 9561 Value *A = State.get(getOperand(0), Part); 9562 Value *B = State.get(getOperand(1), Part); 9563 Value *C = nullptr; 9564 if (FCmp) { 9565 // Propagate fast math flags. 
9566 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 9567 Builder.setFastMathFlags(Cmp->getFastMathFlags()); 9568 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); 9569 } else { 9570 C = Builder.CreateICmp(Cmp->getPredicate(), A, B); 9571 } 9572 State.set(this, C, Part); 9573 State.ILV->addMetadata(C, &I); 9574 } 9575 9576 break; 9577 } 9578 9579 case Instruction::ZExt: 9580 case Instruction::SExt: 9581 case Instruction::FPToUI: 9582 case Instruction::FPToSI: 9583 case Instruction::FPExt: 9584 case Instruction::PtrToInt: 9585 case Instruction::IntToPtr: 9586 case Instruction::SIToFP: 9587 case Instruction::UIToFP: 9588 case Instruction::Trunc: 9589 case Instruction::FPTrunc: 9590 case Instruction::BitCast: { 9591 auto *CI = cast<CastInst>(&I); 9592 State.ILV->setDebugLocFromInst(CI); 9593 9594 /// Vectorize casts. 9595 Type *DestTy = (State.VF.isScalar()) 9596 ? CI->getType() 9597 : VectorType::get(CI->getType(), State.VF); 9598 9599 for (unsigned Part = 0; Part < State.UF; ++Part) { 9600 Value *A = State.get(getOperand(0), Part); 9601 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); 9602 State.set(this, Cast, Part); 9603 State.ILV->addMetadata(Cast, &I); 9604 } 9605 break; 9606 } 9607 default: 9608 // This instruction is not vectorized by simple widening. 9609 LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); 9610 llvm_unreachable("Unhandled instruction!"); 9611 } // end of switch. 9612 } 9613 9614 void VPWidenGEPRecipe::execute(VPTransformState &State) { 9615 auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr()); 9616 // Construct a vector GEP by widening the operands of the scalar GEP as 9617 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP 9618 // results in a vector of pointers when at least one operand of the GEP 9619 // is vector-typed. Thus, to keep the representation compact, we only use 9620 // vector-typed operands for loop-varying values. 9621 9622 if (State.VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { 9623 // If we are vectorizing, but the GEP has only loop-invariant operands, 9624 // the GEP we build (by only using vector-typed operands for 9625 // loop-varying values) would be a scalar pointer. Thus, to ensure we 9626 // produce a vector of pointers, we need to either arbitrarily pick an 9627 // operand to broadcast, or broadcast a clone of the original GEP. 9628 // Here, we broadcast a clone of the original. 9629 // 9630 // TODO: If at some point we decide to scalarize instructions having 9631 // loop-invariant operands, this special case will no longer be 9632 // required. We would add the scalarization decision to 9633 // collectLoopScalars() and teach getVectorValue() to broadcast 9634 // the lane-zero scalar value. 9635 auto *Clone = State.Builder.Insert(GEP->clone()); 9636 for (unsigned Part = 0; Part < State.UF; ++Part) { 9637 Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, Clone); 9638 State.set(this, EntryPart, Part); 9639 State.ILV->addMetadata(EntryPart, GEP); 9640 } 9641 } else { 9642 // If the GEP has at least one loop-varying operand, we are sure to 9643 // produce a vector of pointers. But if we are only unrolling, we want 9644 // to produce a scalar GEP for each unroll part. Thus, the GEP we 9645 // produce with the code below will be scalar (if VF == 1) or vector 9646 // (otherwise). Note that for the unroll-only case, we still maintain 9647 // values in the vector mapping with initVector, as we do for other 9648 // instructions. 
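    // Illustrative example only (names invented): for VF = 4, a scalar GEP with
    // an invariant base %A and a loop-varying index %i, such as
    //   %gep = getelementptr inbounds float, float* %A, i64 %i
    // is recreated below with a widened index operand, yielding a vector of
    // pointers:
    //   %vgep = getelementptr inbounds float, float* %A, <4 x i64> %wide.i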
9649 for (unsigned Part = 0; Part < State.UF; ++Part) { 9650 // The pointer operand of the new GEP. If it's loop-invariant, we 9651 // won't broadcast it. 9652 auto *Ptr = IsPtrLoopInvariant 9653 ? State.get(getOperand(0), VPIteration(0, 0)) 9654 : State.get(getOperand(0), Part); 9655 9656 // Collect all the indices for the new GEP. If any index is 9657 // loop-invariant, we won't broadcast it. 9658 SmallVector<Value *, 4> Indices; 9659 for (unsigned I = 1, E = getNumOperands(); I < E; I++) { 9660 VPValue *Operand = getOperand(I); 9661 if (IsIndexLoopInvariant[I - 1]) 9662 Indices.push_back(State.get(Operand, VPIteration(0, 0))); 9663 else 9664 Indices.push_back(State.get(Operand, Part)); 9665 } 9666 9667 // If the GEP instruction is vectorized and was in a basic block that 9668 // needed predication, we can't propagate the poison-generating 'inbounds' 9669 // flag. The control flow has been linearized and the GEP is no longer 9670 // guarded by the predicate, which could make the 'inbounds' properties to 9671 // no longer hold. 9672 bool IsInBounds = 9673 GEP->isInBounds() && State.MayGeneratePoisonRecipes.count(this) == 0; 9674 9675 // Create the new GEP. Note that this GEP may be a scalar if VF == 1, 9676 // but it should be a vector, otherwise. 9677 auto *NewGEP = IsInBounds 9678 ? State.Builder.CreateInBoundsGEP( 9679 GEP->getSourceElementType(), Ptr, Indices) 9680 : State.Builder.CreateGEP(GEP->getSourceElementType(), 9681 Ptr, Indices); 9682 assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) && 9683 "NewGEP is not a pointer vector"); 9684 State.set(this, NewGEP, Part); 9685 State.ILV->addMetadata(NewGEP, GEP); 9686 } 9687 } 9688 } 9689 9690 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { 9691 assert(!State.Instance && "Int or FP induction being replicated."); 9692 auto *CanonicalIV = State.get(getParent()->getPlan()->getCanonicalIV(), 0); 9693 State.ILV->widenIntOrFpInduction(IV, this, State, CanonicalIV); 9694 } 9695 9696 void VPWidenPHIRecipe::execute(VPTransformState &State) { 9697 State.ILV->widenPHIInstruction(cast<PHINode>(getUnderlyingValue()), this, 9698 State); 9699 } 9700 9701 void VPBlendRecipe::execute(VPTransformState &State) { 9702 State.ILV->setDebugLocFromInst(Phi, &State.Builder); 9703 // We know that all PHIs in non-header blocks are converted into 9704 // selects, so we don't have to worry about the insertion order and we 9705 // can just use the builder. 9706 // At this point we generate the predication tree. There may be 9707 // duplications since this is a simple recursive scan, but future 9708 // optimizations will clean it up. 9709 9710 unsigned NumIncoming = getNumIncomingValues(); 9711 9712 // Generate a sequence of selects of the form: 9713 // SELECT(Mask3, In3, 9714 // SELECT(Mask2, In2, 9715 // SELECT(Mask1, In1, 9716 // In0))) 9717 // Note that Mask0 is never used: lanes for which no path reaches this phi and 9718 // are essentially undef are taken from In0. 9719 InnerLoopVectorizer::VectorParts Entry(State.UF); 9720 for (unsigned In = 0; In < NumIncoming; ++In) { 9721 for (unsigned Part = 0; Part < State.UF; ++Part) { 9722 // We might have single edge PHIs (blocks) - use an identity 9723 // 'select' for the first PHI operand. 9724 Value *In0 = State.get(getIncomingValue(In), Part); 9725 if (In == 0) 9726 Entry[Part] = In0; // Initialize with the first incoming value. 9727 else { 9728 // Select between the current value and the previous incoming edge 9729 // based on the incoming mask. 
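        // Illustrative result for three incoming values with VF = 4 (names
        // invented):
        //   %predphi  = select <4 x i1> %mask1, <4 x i32> %in1, <4 x i32> %in0
        //   %predphi1 = select <4 x i1> %mask2, <4 x i32> %in2, <4 x i32> %predphi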
9730 Value *Cond = State.get(getMask(In), Part); 9731 Entry[Part] = 9732 State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi"); 9733 } 9734 } 9735 } 9736 for (unsigned Part = 0; Part < State.UF; ++Part) 9737 State.set(this, Entry[Part], Part); 9738 } 9739 9740 void VPInterleaveRecipe::execute(VPTransformState &State) { 9741 assert(!State.Instance && "Interleave group being replicated."); 9742 State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(), 9743 getStoredValues(), getMask()); 9744 } 9745 9746 void VPReductionRecipe::execute(VPTransformState &State) { 9747 assert(!State.Instance && "Reduction being replicated."); 9748 Value *PrevInChain = State.get(getChainOp(), 0); 9749 RecurKind Kind = RdxDesc->getRecurrenceKind(); 9750 bool IsOrdered = State.ILV->useOrderedReductions(*RdxDesc); 9751 // Propagate the fast-math flags carried by the underlying instruction. 9752 IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder); 9753 State.Builder.setFastMathFlags(RdxDesc->getFastMathFlags()); 9754 for (unsigned Part = 0; Part < State.UF; ++Part) { 9755 Value *NewVecOp = State.get(getVecOp(), Part); 9756 if (VPValue *Cond = getCondOp()) { 9757 Value *NewCond = State.get(Cond, Part); 9758 VectorType *VecTy = cast<VectorType>(NewVecOp->getType()); 9759 Value *Iden = RdxDesc->getRecurrenceIdentity( 9760 Kind, VecTy->getElementType(), RdxDesc->getFastMathFlags()); 9761 Value *IdenVec = 9762 State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden); 9763 Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec); 9764 NewVecOp = Select; 9765 } 9766 Value *NewRed; 9767 Value *NextInChain; 9768 if (IsOrdered) { 9769 if (State.VF.isVector()) 9770 NewRed = createOrderedReduction(State.Builder, *RdxDesc, NewVecOp, 9771 PrevInChain); 9772 else 9773 NewRed = State.Builder.CreateBinOp( 9774 (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), PrevInChain, 9775 NewVecOp); 9776 PrevInChain = NewRed; 9777 } else { 9778 PrevInChain = State.get(getChainOp(), Part); 9779 NewRed = createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp); 9780 } 9781 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9782 NextInChain = 9783 createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(), 9784 NewRed, PrevInChain); 9785 } else if (IsOrdered) 9786 NextInChain = NewRed; 9787 else 9788 NextInChain = State.Builder.CreateBinOp( 9789 (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), NewRed, 9790 PrevInChain); 9791 State.set(this, NextInChain, Part); 9792 } 9793 } 9794 9795 void VPReplicateRecipe::execute(VPTransformState &State) { 9796 if (State.Instance) { // Generate a single instance. 9797 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector"); 9798 State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *State.Instance, 9799 IsPredicated, State); 9800 // Insert scalar instance packing it into a vector. 9801 if (AlsoPack && State.VF.isVector()) { 9802 // If we're constructing lane 0, initialize to start from poison. 
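      // Illustrative sketch of the packing this enables (values invented, phis
      // between predicated blocks omitted): lane 0 starts from poison,
      //   %pack0 = insertelement <4 x i32> poison, i32 %s.lane0, i32 0
      // and each later predicated instance inserts its own lane, e.g.
      //   %pack1 = insertelement <4 x i32> %pack0, i32 %s.lane1, i32 1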
9803 if (State.Instance->Lane.isFirstLane()) {
9804 assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
9805 Value *Poison = PoisonValue::get(
9806 VectorType::get(getUnderlyingValue()->getType(), State.VF));
9807 State.set(this, Poison, State.Instance->Part);
9808 }
9809 State.ILV->packScalarIntoVectorValue(this, *State.Instance, State);
9810 }
9811 return;
9812 }
9813
9814 // Generate scalar instances for all VF lanes of all UF parts, unless the
9815 // instruction is uniform, in which case generate only the first lane for each
9816 // of the UF parts.
9817 unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();
9818 assert((!State.VF.isScalable() || IsUniform) &&
9819 "Can't scalarize a scalable vector");
9820 for (unsigned Part = 0; Part < State.UF; ++Part)
9821 for (unsigned Lane = 0; Lane < EndLane; ++Lane)
9822 State.ILV->scalarizeInstruction(getUnderlyingInstr(), this,
9823 VPIteration(Part, Lane), IsPredicated,
9824 State);
9825 }
9826
9827 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
9828 assert(State.Instance && "Branch on Mask works only on single instance.");
9829
9830 unsigned Part = State.Instance->Part;
9831 unsigned Lane = State.Instance->Lane.getKnownLane();
9832
9833 Value *ConditionBit = nullptr;
9834 VPValue *BlockInMask = getMask();
9835 if (BlockInMask) {
9836 ConditionBit = State.get(BlockInMask, Part);
9837 if (ConditionBit->getType()->isVectorTy())
9838 ConditionBit = State.Builder.CreateExtractElement(
9839 ConditionBit, State.Builder.getInt32(Lane));
9840 } else // Block in mask is all-one.
9841 ConditionBit = State.Builder.getTrue();
9842
9843 // Replace the temporary unreachable terminator with a new conditional branch,
9844 // whose two destinations will be set later when they are created.
9845 auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
9846 assert(isa<UnreachableInst>(CurrentTerminator) &&
9847 "Expected to replace unreachable terminator with conditional branch.");
9848 auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
9849 CondBr->setSuccessor(0, nullptr);
9850 ReplaceInstWithInst(CurrentTerminator, CondBr);
9851 }
9852
9853 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
9854 assert(State.Instance && "Predicated instruction PHI works per instance.");
9855 Instruction *ScalarPredInst =
9856 cast<Instruction>(State.get(getOperand(0), *State.Instance));
9857 BasicBlock *PredicatedBB = ScalarPredInst->getParent();
9858 BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
9859 assert(PredicatingBB && "Predicated block has no single predecessor.");
9860 assert(isa<VPReplicateRecipe>(getOperand(0)) &&
9861 "operand must be VPReplicateRecipe");
9862
9863 // By current pack/unpack logic we need to generate only a single phi node: if
9864 // a vector value for the predicated instruction exists at this point it means
9865 // the instruction has vector users only, and a phi for the vector value is
9866 // needed. In this case the recipe of the predicated instruction is marked to
9867 // also do that packing, thereby "hoisting" the insert-element sequence.
9868 // Otherwise, a phi node for the scalar value is needed.
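  // Illustrative example of the vector-value case (block and value names are
  // invented): if the predicated block produced
  //   %ins = insertelement <4 x i32> %vec, i32 %scalar, i32 2
  // then the phi created below merges the unmodified and the updated vector:
  //   %vphi = phi <4 x i32> [ %vec, %predicating.bb ], [ %ins, %predicated.bb ]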
9869 unsigned Part = State.Instance->Part; 9870 if (State.hasVectorValue(getOperand(0), Part)) { 9871 Value *VectorValue = State.get(getOperand(0), Part); 9872 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); 9873 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); 9874 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. 9875 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. 9876 if (State.hasVectorValue(this, Part)) 9877 State.reset(this, VPhi, Part); 9878 else 9879 State.set(this, VPhi, Part); 9880 // NOTE: Currently we need to update the value of the operand, so the next 9881 // predicated iteration inserts its generated value in the correct vector. 9882 State.reset(getOperand(0), VPhi, Part); 9883 } else { 9884 Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType(); 9885 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); 9886 Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()), 9887 PredicatingBB); 9888 Phi->addIncoming(ScalarPredInst, PredicatedBB); 9889 if (State.hasScalarValue(this, *State.Instance)) 9890 State.reset(this, Phi, *State.Instance); 9891 else 9892 State.set(this, Phi, *State.Instance); 9893 // NOTE: Currently we need to update the value of the operand, so the next 9894 // predicated iteration inserts its generated value in the correct vector. 9895 State.reset(getOperand(0), Phi, *State.Instance); 9896 } 9897 } 9898 9899 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { 9900 VPValue *StoredValue = isStore() ? getStoredValue() : nullptr; 9901 9902 // Attempt to issue a wide load. 9903 LoadInst *LI = dyn_cast<LoadInst>(&Ingredient); 9904 StoreInst *SI = dyn_cast<StoreInst>(&Ingredient); 9905 9906 assert((LI || SI) && "Invalid Load/Store instruction"); 9907 assert((!SI || StoredValue) && "No stored value provided for widened store"); 9908 assert((!LI || !StoredValue) && "Stored value provided for widened load"); 9909 9910 Type *ScalarDataTy = getLoadStoreType(&Ingredient); 9911 9912 auto *DataTy = VectorType::get(ScalarDataTy, State.VF); 9913 const Align Alignment = getLoadStoreAlignment(&Ingredient); 9914 bool CreateGatherScatter = !Consecutive; 9915 9916 auto &Builder = State.Builder; 9917 InnerLoopVectorizer::VectorParts BlockInMaskParts(State.UF); 9918 bool isMaskRequired = getMask(); 9919 if (isMaskRequired) 9920 for (unsigned Part = 0; Part < State.UF; ++Part) 9921 BlockInMaskParts[Part] = State.get(getMask(), Part); 9922 9923 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { 9924 // Calculate the pointer for the specific unroll-part. 9925 GetElementPtrInst *PartPtr = nullptr; 9926 9927 bool InBounds = false; 9928 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) 9929 InBounds = gep->isInBounds(); 9930 if (Reverse) { 9931 // If the address is consecutive but reversed, then the 9932 // wide store needs to start at the last vector element. 
9933 // RunTimeVF = VScale * VF.getKnownMinValue() 9934 // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue() 9935 Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), State.VF); 9936 // NumElt = -Part * RunTimeVF 9937 Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF); 9938 // LastLane = 1 - RunTimeVF 9939 Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF); 9940 PartPtr = 9941 cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt)); 9942 PartPtr->setIsInBounds(InBounds); 9943 PartPtr = cast<GetElementPtrInst>( 9944 Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane)); 9945 PartPtr->setIsInBounds(InBounds); 9946 if (isMaskRequired) // Reverse of a null all-one mask is a null mask. 9947 BlockInMaskParts[Part] = 9948 Builder.CreateVectorReverse(BlockInMaskParts[Part], "reverse"); 9949 } else { 9950 Value *Increment = 9951 createStepForVF(Builder, Builder.getInt32Ty(), State.VF, Part); 9952 PartPtr = cast<GetElementPtrInst>( 9953 Builder.CreateGEP(ScalarDataTy, Ptr, Increment)); 9954 PartPtr->setIsInBounds(InBounds); 9955 } 9956 9957 unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); 9958 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); 9959 }; 9960 9961 // Handle Stores: 9962 if (SI) { 9963 State.ILV->setDebugLocFromInst(SI); 9964 9965 for (unsigned Part = 0; Part < State.UF; ++Part) { 9966 Instruction *NewSI = nullptr; 9967 Value *StoredVal = State.get(StoredValue, Part); 9968 if (CreateGatherScatter) { 9969 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 9970 Value *VectorGep = State.get(getAddr(), Part); 9971 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, 9972 MaskPart); 9973 } else { 9974 if (Reverse) { 9975 // If we store to reverse consecutive memory locations, then we need 9976 // to reverse the order of elements in the stored value. 9977 StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse"); 9978 // We don't want to update the value in the map as it might be used in 9979 // another expression. So don't call resetVectorValue(StoredVal). 9980 } 9981 auto *VecPtr = 9982 CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0))); 9983 if (isMaskRequired) 9984 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, 9985 BlockInMaskParts[Part]); 9986 else 9987 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); 9988 } 9989 State.ILV->addMetadata(NewSI, SI); 9990 } 9991 return; 9992 } 9993 9994 // Handle loads. 9995 assert(LI && "Must have a load instruction"); 9996 State.ILV->setDebugLocFromInst(LI); 9997 for (unsigned Part = 0; Part < State.UF; ++Part) { 9998 Value *NewLI; 9999 if (CreateGatherScatter) { 10000 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 10001 Value *VectorGep = State.get(getAddr(), Part); 10002 NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart, 10003 nullptr, "wide.masked.gather"); 10004 State.ILV->addMetadata(NewLI, LI); 10005 } else { 10006 auto *VecPtr = 10007 CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0))); 10008 if (isMaskRequired) 10009 NewLI = Builder.CreateMaskedLoad( 10010 DataTy, VecPtr, Alignment, BlockInMaskParts[Part], 10011 PoisonValue::get(DataTy), "wide.masked.load"); 10012 else 10013 NewLI = 10014 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); 10015 10016 // Add metadata to the load, but setVectorValue to the reverse shuffle. 
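      // Illustrative sketch of the reversed case for a fixed VF of 4 (names
      // invented): the metadata is attached to the wide load itself,
      //   %wide.load = load <4 x i32>, <4 x i32>* %vec.ptr, align 4
      // while the value recorded in State is its lane reversal,
      //   %reverse = shufflevector <4 x i32> %wide.load, <4 x i32> poison,
      //                            <4 x i32> <i32 3, i32 2, i32 1, i32 0>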
10017 State.ILV->addMetadata(NewLI, LI); 10018 if (Reverse) 10019 NewLI = Builder.CreateVectorReverse(NewLI, "reverse"); 10020 } 10021 10022 State.set(this, NewLI, Part); 10023 } 10024 } 10025 10026 // Determine how to lower the scalar epilogue, which depends on 1) optimising 10027 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing 10028 // predication, and 4) a TTI hook that analyses whether the loop is suitable 10029 // for predication. 10030 static ScalarEpilogueLowering getScalarEpilogueLowering( 10031 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, 10032 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, 10033 AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, 10034 LoopVectorizationLegality &LVL) { 10035 // 1) OptSize takes precedence over all other options, i.e. if this is set, 10036 // don't look at hints or options, and don't request a scalar epilogue. 10037 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from 10038 // LoopAccessInfo (due to code dependency and not being able to reliably get 10039 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection 10040 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without 10041 // versioning when the vectorization is forced, unlike hasOptSize. So revert 10042 // back to the old way and vectorize with versioning when forced. See D81345.) 10043 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, 10044 PGSOQueryType::IRPass) && 10045 Hints.getForce() != LoopVectorizeHints::FK_Enabled)) 10046 return CM_ScalarEpilogueNotAllowedOptSize; 10047 10048 // 2) If set, obey the directives 10049 if (PreferPredicateOverEpilogue.getNumOccurrences()) { 10050 switch (PreferPredicateOverEpilogue) { 10051 case PreferPredicateTy::ScalarEpilogue: 10052 return CM_ScalarEpilogueAllowed; 10053 case PreferPredicateTy::PredicateElseScalarEpilogue: 10054 return CM_ScalarEpilogueNotNeededUsePredicate; 10055 case PreferPredicateTy::PredicateOrDontVectorize: 10056 return CM_ScalarEpilogueNotAllowedUsePredicate; 10057 }; 10058 } 10059 10060 // 3) If set, obey the hints 10061 switch (Hints.getPredicate()) { 10062 case LoopVectorizeHints::FK_Enabled: 10063 return CM_ScalarEpilogueNotNeededUsePredicate; 10064 case LoopVectorizeHints::FK_Disabled: 10065 return CM_ScalarEpilogueAllowed; 10066 }; 10067 10068 // 4) if the TTI hook indicates this is profitable, request predication. 10069 if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, 10070 LVL.getLAI())) 10071 return CM_ScalarEpilogueNotNeededUsePredicate; 10072 10073 return CM_ScalarEpilogueAllowed; 10074 } 10075 10076 Value *VPTransformState::get(VPValue *Def, unsigned Part) { 10077 // If Values have been set for this Def return the one relevant for \p Part. 10078 if (hasVectorValue(Def, Part)) 10079 return Data.PerPartOutput[Def][Part]; 10080 10081 if (!hasScalarValue(Def, {Part, 0})) { 10082 Value *IRV = Def->getLiveInIRValue(); 10083 Value *B = ILV->getBroadcastInstrs(IRV); 10084 set(Def, B, Part); 10085 return B; 10086 } 10087 10088 Value *ScalarValue = get(Def, {Part, 0}); 10089 // If we aren't vectorizing, we can just copy the scalar map values over 10090 // to the vector map. 10091 if (VF.isScalar()) { 10092 set(Def, ScalarValue, Part); 10093 return ScalarValue; 10094 } 10095 10096 auto *RepR = dyn_cast<VPReplicateRecipe>(Def); 10097 bool IsUniform = RepR && RepR->isUniform(); 10098 10099 unsigned LastLane = IsUniform ? 
0 : VF.getKnownMinValue() - 1; 10100 // Check if there is a scalar value for the selected lane. 10101 if (!hasScalarValue(Def, {Part, LastLane})) { 10102 // At the moment, VPWidenIntOrFpInductionRecipes can also be uniform. 10103 assert(isa<VPWidenIntOrFpInductionRecipe>(Def->getDef()) && 10104 "unexpected recipe found to be invariant"); 10105 IsUniform = true; 10106 LastLane = 0; 10107 } 10108 10109 auto *LastInst = cast<Instruction>(get(Def, {Part, LastLane})); 10110 // Set the insert point after the last scalarized instruction or after the 10111 // last PHI, if LastInst is a PHI. This ensures the insertelement sequence 10112 // will directly follow the scalar definitions. 10113 auto OldIP = Builder.saveIP(); 10114 auto NewIP = 10115 isa<PHINode>(LastInst) 10116 ? BasicBlock::iterator(LastInst->getParent()->getFirstNonPHI()) 10117 : std::next(BasicBlock::iterator(LastInst)); 10118 Builder.SetInsertPoint(&*NewIP); 10119 10120 // However, if we are vectorizing, we need to construct the vector values. 10121 // If the value is known to be uniform after vectorization, we can just 10122 // broadcast the scalar value corresponding to lane zero for each unroll 10123 // iteration. Otherwise, we construct the vector values using 10124 // insertelement instructions. Since the resulting vectors are stored in 10125 // State, we will only generate the insertelements once. 10126 Value *VectorValue = nullptr; 10127 if (IsUniform) { 10128 VectorValue = ILV->getBroadcastInstrs(ScalarValue); 10129 set(Def, VectorValue, Part); 10130 } else { 10131 // Initialize packing with insertelements to start from undef. 10132 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 10133 Value *Undef = PoisonValue::get(VectorType::get(LastInst->getType(), VF)); 10134 set(Def, Undef, Part); 10135 for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane) 10136 ILV->packScalarIntoVectorValue(Def, {Part, Lane}, *this); 10137 VectorValue = get(Def, Part); 10138 } 10139 Builder.restoreIP(OldIP); 10140 return VectorValue; 10141 } 10142 10143 // Process the loop in the VPlan-native vectorization path. This path builds 10144 // VPlan upfront in the vectorization pipeline, which allows to apply 10145 // VPlan-to-VPlan transformations from the very beginning without modifying the 10146 // input LLVM IR. 10147 static bool processLoopInVPlanNativePath( 10148 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, 10149 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, 10150 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, 10151 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, 10152 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints, 10153 LoopVectorizationRequirements &Requirements) { 10154 10155 if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) { 10156 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n"); 10157 return false; 10158 } 10159 assert(EnableVPlanNativePath && "VPlan-native path is disabled."); 10160 Function *F = L->getHeader()->getParent(); 10161 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); 10162 10163 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 10164 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL); 10165 10166 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, 10167 &Hints, IAI); 10168 // Use the planner for outer loop vectorization. 10169 // TODO: CM is not used at this point inside the planner. 
Turn CM into an 10170 // optional argument if we don't need it in the future. 10171 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints, 10172 Requirements, ORE); 10173 10174 // Get user vectorization factor. 10175 ElementCount UserVF = Hints.getWidth(); 10176 10177 CM.collectElementTypesForWidening(); 10178 10179 // Plan how to best vectorize, return the best VF and its cost. 10180 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF); 10181 10182 // If we are stress testing VPlan builds, do not attempt to generate vector 10183 // code. Masked vector code generation support will follow soon. 10184 // Also, do not attempt to vectorize if no vector code will be produced. 10185 if (VPlanBuildStressTest || EnableVPlanPredication || 10186 VectorizationFactor::Disabled() == VF) 10187 return false; 10188 10189 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); 10190 10191 { 10192 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, 10193 F->getParent()->getDataLayout()); 10194 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL, 10195 &CM, BFI, PSI, Checks); 10196 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" 10197 << L->getHeader()->getParent()->getName() << "\"\n"); 10198 LVP.executePlan(VF.Width, 1, BestPlan, LB, DT); 10199 } 10200 10201 // Mark the loop as already vectorized to avoid vectorizing again. 10202 Hints.setAlreadyVectorized(); 10203 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 10204 return true; 10205 } 10206 10207 // Emit a remark if there are stores to floats that required a floating point 10208 // extension. If the vectorized loop was generated with floating point there 10209 // will be a performance penalty from the conversion overhead and the change in 10210 // the vector width. 10211 static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) { 10212 SmallVector<Instruction *, 4> Worklist; 10213 for (BasicBlock *BB : L->getBlocks()) { 10214 for (Instruction &Inst : *BB) { 10215 if (auto *S = dyn_cast<StoreInst>(&Inst)) { 10216 if (S->getValueOperand()->getType()->isFloatTy()) 10217 Worklist.push_back(S); 10218 } 10219 } 10220 } 10221 10222 // Traverse the floating point stores upwards searching, for floating point 10223 // conversions. 10224 SmallPtrSet<const Instruction *, 4> Visited; 10225 SmallPtrSet<const Instruction *, 4> EmittedRemark; 10226 while (!Worklist.empty()) { 10227 auto *I = Worklist.pop_back_val(); 10228 if (!L->contains(I)) 10229 continue; 10230 if (!Visited.insert(I).second) 10231 continue; 10232 10233 // Emit a remark if the floating point store required a floating 10234 // point conversion. 10235 // TODO: More work could be done to identify the root cause such as a 10236 // constant or a function return type and point the user to it. 10237 if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second) 10238 ORE->emit([&]() { 10239 return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision", 10240 I->getDebugLoc(), L->getHeader()) 10241 << "floating point conversion changes vector width. 
" 10242 << "Mixed floating point precision requires an up/down " 10243 << "cast that will negatively impact performance."; 10244 }); 10245 10246 for (Use &Op : I->operands()) 10247 if (auto *OpI = dyn_cast<Instruction>(Op)) 10248 Worklist.push_back(OpI); 10249 } 10250 } 10251 10252 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts) 10253 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced || 10254 !EnableLoopInterleaving), 10255 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced || 10256 !EnableLoopVectorization) {} 10257 10258 bool LoopVectorizePass::processLoop(Loop *L) { 10259 assert((EnableVPlanNativePath || L->isInnermost()) && 10260 "VPlan-native path is not enabled. Only process inner loops."); 10261 10262 #ifndef NDEBUG 10263 const std::string DebugLocStr = getDebugLocString(L); 10264 #endif /* NDEBUG */ 10265 10266 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \"" 10267 << L->getHeader()->getParent()->getName() << "\" from " 10268 << DebugLocStr << "\n"); 10269 10270 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI); 10271 10272 LLVM_DEBUG( 10273 dbgs() << "LV: Loop hints:" 10274 << " force=" 10275 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled 10276 ? "disabled" 10277 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled 10278 ? "enabled" 10279 : "?")) 10280 << " width=" << Hints.getWidth() 10281 << " interleave=" << Hints.getInterleave() << "\n"); 10282 10283 // Function containing loop 10284 Function *F = L->getHeader()->getParent(); 10285 10286 // Looking at the diagnostic output is the only way to determine if a loop 10287 // was vectorized (other than looking at the IR or machine code), so it 10288 // is important to generate an optimization remark for each loop. Most of 10289 // these messages are generated as OptimizationRemarkAnalysis. Remarks 10290 // generated as OptimizationRemark and OptimizationRemarkMissed are 10291 // less verbose reporting vectorized loops and unvectorized loops that may 10292 // benefit from vectorization, respectively. 10293 10294 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) { 10295 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); 10296 return false; 10297 } 10298 10299 PredicatedScalarEvolution PSE(*SE, *L); 10300 10301 // Check if it is legal to vectorize the loop. 10302 LoopVectorizationRequirements Requirements; 10303 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE, 10304 &Requirements, &Hints, DB, AC, BFI, PSI); 10305 if (!LVL.canVectorize(EnableVPlanNativePath)) { 10306 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); 10307 Hints.emitRemarkWithHints(); 10308 return false; 10309 } 10310 10311 // Check the function attributes and profiles to find out if this function 10312 // should be optimized for size. 10313 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 10314 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL); 10315 10316 // Entrance to the VPlan-native vectorization path. Outer loops are processed 10317 // here. They may require CFG and instruction level transformations before 10318 // even evaluating whether vectorization is profitable. Since we cannot modify 10319 // the incoming IR, we need to build VPlan upfront in the vectorization 10320 // pipeline. 
10321 if (!L->isInnermost()) 10322 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, 10323 ORE, BFI, PSI, Hints, Requirements); 10324 10325 assert(L->isInnermost() && "Inner loop expected."); 10326 10327 // Check the loop for a trip count threshold: vectorize loops with a tiny trip 10328 // count by optimizing for size, to minimize overheads. 10329 auto ExpectedTC = getSmallBestKnownTC(*SE, L); 10330 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) { 10331 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " 10332 << "This loop is worth vectorizing only if no scalar " 10333 << "iteration overheads are incurred."); 10334 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) 10335 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); 10336 else { 10337 LLVM_DEBUG(dbgs() << "\n"); 10338 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; 10339 } 10340 } 10341 10342 // Check the function attributes to see if implicit floats are allowed. 10343 // FIXME: This check doesn't seem possibly correct -- what if the loop is 10344 // an integer loop and the vector instructions selected are purely integer 10345 // vector instructions? 10346 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { 10347 reportVectorizationFailure( 10348 "Can't vectorize when the NoImplicitFloat attribute is used", 10349 "loop not vectorized due to NoImplicitFloat attribute", 10350 "NoImplicitFloat", ORE, L); 10351 Hints.emitRemarkWithHints(); 10352 return false; 10353 } 10354 10355 // Check if the target supports potentially unsafe FP vectorization. 10356 // FIXME: Add a check for the type of safety issue (denormal, signaling) 10357 // for the target we're vectorizing for, to make sure none of the 10358 // additional fp-math flags can help. 10359 if (Hints.isPotentiallyUnsafe() && 10360 TTI->isFPVectorizationPotentiallyUnsafe()) { 10361 reportVectorizationFailure( 10362 "Potentially unsafe FP op prevents vectorization", 10363 "loop not vectorized due to unsafe FP support.", 10364 "UnsafeFP", ORE, L); 10365 Hints.emitRemarkWithHints(); 10366 return false; 10367 } 10368 10369 bool AllowOrderedReductions; 10370 // If the flag is set, use that instead and override the TTI behaviour. 10371 if (ForceOrderedReductions.getNumOccurrences() > 0) 10372 AllowOrderedReductions = ForceOrderedReductions; 10373 else 10374 AllowOrderedReductions = TTI->enableOrderedReductions(); 10375 if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) { 10376 ORE->emit([&]() { 10377 auto *ExactFPMathInst = Requirements.getExactFPInst(); 10378 return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps", 10379 ExactFPMathInst->getDebugLoc(), 10380 ExactFPMathInst->getParent()) 10381 << "loop not vectorized: cannot prove it is safe to reorder " 10382 "floating-point operations"; 10383 }); 10384 LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to " 10385 "reorder floating-point operations\n"); 10386 Hints.emitRemarkWithHints(); 10387 return false; 10388 } 10389 10390 bool UseInterleaved = TTI->enableInterleavedAccessVectorization(); 10391 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI()); 10392 10393 // If an override option has been passed in for interleaved accesses, use it. 10394 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0) 10395 UseInterleaved = EnableInterleavedMemAccesses; 10396 10397 // Analyze interleaved memory accesses. 
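  // Illustrative example only (source-level, not from this function): in a
  // loop such as
  //   for (i = 0; i < n; i++) { A[2*i] = x; A[2*i+1] = y; }
  // the two strided stores form a factor-2 interleave group that can later be
  // lowered to a single wide store fed by a shufflevector, instead of two
  // separate strided accesses.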
10398 if (UseInterleaved) {
10399 IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
10400 }
10401
10402 // Use the cost model.
10403 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
10404 F, &Hints, IAI);
10405 CM.collectValuesToIgnore();
10406 CM.collectElementTypesForWidening();
10407
10408 // Use the planner for vectorization.
10409 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints,
10410 Requirements, ORE);
10411
10412 // Get user vectorization factor and interleave count.
10413 ElementCount UserVF = Hints.getWidth();
10414 unsigned UserIC = Hints.getInterleave();
10415
10416 // Plan how to best vectorize, return the best VF and its cost.
10417 Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);
10418
10419 VectorizationFactor VF = VectorizationFactor::Disabled();
10420 unsigned IC = 1;
10421
10422 if (MaybeVF) {
10423 VF = *MaybeVF;
10424 // Select the interleave count.
10425 IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue());
10426 }
10427
10428 // Identify the diagnostic messages that should be produced.
10429 std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
10430 bool VectorizeLoop = true, InterleaveLoop = true;
10431 if (VF.Width.isScalar()) {
10432 LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
10433 VecDiagMsg = std::make_pair(
10434 "VectorizationNotBeneficial",
10435 "the cost-model indicates that vectorization is not beneficial");
10436 VectorizeLoop = false;
10437 }
10438
10439 if (!MaybeVF && UserIC > 1) {
10440 // Tell the user interleaving was avoided up-front, despite being explicitly
10441 // requested.
10442 LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
10443 "interleaving should be avoided up front\n");
10444 IntDiagMsg = std::make_pair(
10445 "InterleavingAvoided",
10446 "Ignoring UserIC, because interleaving was avoided up front");
10447 InterleaveLoop = false;
10448 } else if (IC == 1 && UserIC <= 1) {
10449 // Tell the user interleaving is not beneficial.
10450 LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
10451 IntDiagMsg = std::make_pair(
10452 "InterleavingNotBeneficial",
10453 "the cost-model indicates that interleaving is not beneficial");
10454 InterleaveLoop = false;
10455 if (UserIC == 1) {
10456 IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
10457 IntDiagMsg.second +=
10458 " and is explicitly disabled or interleave count is set to 1";
10459 }
10460 } else if (IC > 1 && UserIC == 1) {
10461 // Tell the user interleaving is beneficial, but it is explicitly disabled.
10462 LLVM_DEBUG(
10463 dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
10464 IntDiagMsg = std::make_pair(
10465 "InterleavingBeneficialButDisabled",
10466 "the cost-model indicates that interleaving is beneficial "
10467 "but is explicitly disabled or interleave count is set to 1");
10468 InterleaveLoop = false;
10469 }
10470
10471 // Override IC if the user provided an interleave count.
10472 IC = UserIC > 0 ? UserIC : IC;
10473
10474 // Emit diagnostic messages, if any.
10475 const char *VAPassName = Hints.vectorizeAnalysisPassName();
10476 if (!VectorizeLoop && !InterleaveLoop) {
10477 // Do not vectorize or interleave the loop.
10478 ORE->emit([&]() { 10479 return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first, 10480 L->getStartLoc(), L->getHeader()) 10481 << VecDiagMsg.second; 10482 }); 10483 ORE->emit([&]() { 10484 return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first, 10485 L->getStartLoc(), L->getHeader()) 10486 << IntDiagMsg.second; 10487 }); 10488 return false; 10489 } else if (!VectorizeLoop && InterleaveLoop) { 10490 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 10491 ORE->emit([&]() { 10492 return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first, 10493 L->getStartLoc(), L->getHeader()) 10494 << VecDiagMsg.second; 10495 }); 10496 } else if (VectorizeLoop && !InterleaveLoop) { 10497 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width 10498 << ") in " << DebugLocStr << '\n'); 10499 ORE->emit([&]() { 10500 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first, 10501 L->getStartLoc(), L->getHeader()) 10502 << IntDiagMsg.second; 10503 }); 10504 } else if (VectorizeLoop && InterleaveLoop) { 10505 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width 10506 << ") in " << DebugLocStr << '\n'); 10507 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 10508 } 10509 10510 bool DisableRuntimeUnroll = false; 10511 MDNode *OrigLoopID = L->getLoopID(); 10512 { 10513 // Optimistically generate runtime checks. Drop them if they turn out to not 10514 // be profitable. Limit the scope of Checks, so the cleanup happens 10515 // immediately after vector codegeneration is done. 10516 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, 10517 F->getParent()->getDataLayout()); 10518 if (!VF.Width.isScalar() || IC > 1) 10519 Checks.Create(L, *LVL.getLAI(), PSE.getUnionPredicate()); 10520 10521 using namespace ore; 10522 if (!VectorizeLoop) { 10523 assert(IC > 1 && "interleave count should not be 1 or 0"); 10524 // If we decided that it is not legal to vectorize the loop, then 10525 // interleave it. 10526 InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, 10527 &CM, BFI, PSI, Checks); 10528 10529 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); 10530 LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT); 10531 10532 ORE->emit([&]() { 10533 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(), 10534 L->getHeader()) 10535 << "interleaved loop (interleaved count: " 10536 << NV("InterleaveCount", IC) << ")"; 10537 }); 10538 } else { 10539 // If we decided that it is *legal* to vectorize the loop, then do it. 10540 10541 // Consider vectorizing the epilogue too if it's profitable. 10542 VectorizationFactor EpilogueVF = 10543 CM.selectEpilogueVectorizationFactor(VF.Width, LVP); 10544 if (EpilogueVF.Width.isVector()) { 10545 10546 // The first pass vectorizes the main loop and creates a scalar epilogue 10547 // to be vectorized by executing the plan (potentially with a different 10548 // factor) again shortly afterwards. 10549 EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1); 10550 EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, 10551 EPI, &LVL, &CM, BFI, PSI, Checks); 10552 10553 VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF); 10554 LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV, 10555 DT); 10556 ++LoopsVectorized; 10557 10558 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */); 10559 formLCSSARecursively(*L, *DT, LI, SE); 10560 10561 // Second pass vectorizes the epilogue and adjusts the control flow 10562 // edges from the first pass. 
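      // Illustrative shape of the final CFG (conceptual only, names invented):
      //   main vector loop (VF = VF.Width, UF = IC)
      //     -> vectorized epilogue loop (VF = EpilogueVF.Width, UF = 1)
      //       -> scalar remainder loop
      // Trip-count remainders that do not fill a main-loop iteration fall
      // through to the vectorized epilogue, and any still-remaining iterations
      // run in the scalar remainder loop.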
10563 EPI.MainLoopVF = EPI.EpilogueVF; 10564 EPI.MainLoopUF = EPI.EpilogueUF; 10565 EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC, 10566 ORE, EPI, &LVL, &CM, BFI, PSI, 10567 Checks); 10568 10569 VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF); 10570 LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV, 10571 DT); 10572 ++LoopsEpilogueVectorized; 10573 10574 if (!MainILV.areSafetyChecksAdded()) 10575 DisableRuntimeUnroll = true; 10576 } else { 10577 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC, 10578 &LVL, &CM, BFI, PSI, Checks); 10579 10580 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); 10581 LVP.executePlan(VF.Width, IC, BestPlan, LB, DT); 10582 ++LoopsVectorized; 10583 10584 // Add metadata to disable runtime unrolling a scalar loop when there 10585 // are no runtime checks about strides and memory. A scalar loop that is 10586 // rarely used is not worth unrolling. 10587 if (!LB.areSafetyChecksAdded()) 10588 DisableRuntimeUnroll = true; 10589 } 10590 // Report the vectorization decision. 10591 ORE->emit([&]() { 10592 return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(), 10593 L->getHeader()) 10594 << "vectorized loop (vectorization width: " 10595 << NV("VectorizationFactor", VF.Width) 10596 << ", interleaved count: " << NV("InterleaveCount", IC) << ")"; 10597 }); 10598 } 10599 10600 if (ORE->allowExtraAnalysis(LV_NAME)) 10601 checkMixedPrecision(L, ORE); 10602 } 10603 10604 Optional<MDNode *> RemainderLoopID = 10605 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 10606 LLVMLoopVectorizeFollowupEpilogue}); 10607 if (RemainderLoopID.hasValue()) { 10608 L->setLoopID(RemainderLoopID.getValue()); 10609 } else { 10610 if (DisableRuntimeUnroll) 10611 AddRuntimeUnrollDisableMetaData(L); 10612 10613 // Mark the loop as already vectorized to avoid vectorizing again. 10614 Hints.setAlreadyVectorized(); 10615 } 10616 10617 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 10618 return true; 10619 } 10620 10621 LoopVectorizeResult LoopVectorizePass::runImpl( 10622 Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_, 10623 DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_, 10624 DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_, 10625 std::function<const LoopAccessInfo &(Loop &)> &GetLAA_, 10626 OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) { 10627 SE = &SE_; 10628 LI = &LI_; 10629 TTI = &TTI_; 10630 DT = &DT_; 10631 BFI = &BFI_; 10632 TLI = TLI_; 10633 AA = &AA_; 10634 AC = &AC_; 10635 GetLAA = &GetLAA_; 10636 DB = &DB_; 10637 ORE = &ORE_; 10638 PSI = PSI_; 10639 10640 // Don't attempt if 10641 // 1. the target claims to have no vector registers, and 10642 // 2. interleaving won't help ILP. 10643 // 10644 // The second condition is necessary because, even if the target has no 10645 // vector registers, loop vectorization may still enable scalar 10646 // interleaving. 10647 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) && 10648 TTI->getMaxInterleaveFactor(1) < 2) 10649 return LoopVectorizeResult(false, false); 10650 10651 bool Changed = false, CFGChanged = false; 10652 10653 // The vectorizer requires loops to be in simplified form. 10654 // Since simplification may add new inner loops, it has to run before the 10655 // legality and profitability checks. This means running the loop vectorizer 10656 // will simplify all loops, regardless of whether anything end up being 10657 // vectorized. 
10658 for (auto &L : *LI)
10659 Changed |= CFGChanged |=
10660 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
10661
10662 // Build up a worklist of inner loops to vectorize. This is necessary as
10663 // the act of vectorizing or partially unrolling a loop creates new loops
10664 // and can invalidate iterators across the loops.
10665 SmallVector<Loop *, 8> Worklist;
10666
10667 for (Loop *L : *LI)
10668 collectSupportedLoops(*L, LI, ORE, Worklist);
10669
10670 LoopsAnalyzed += Worklist.size();
10671
10672 // Now walk the identified inner loops.
10673 while (!Worklist.empty()) {
10674 Loop *L = Worklist.pop_back_val();
10675
10676 // For the inner loops we actually process, form LCSSA to simplify the
10677 // transform.
10678 Changed |= formLCSSARecursively(*L, *DT, LI, SE);
10679
10680 Changed |= CFGChanged |= processLoop(L);
10681 }
10682
10683 // Process each loop nest in the function.
10684 return LoopVectorizeResult(Changed, CFGChanged);
10685 }
10686
10687 PreservedAnalyses LoopVectorizePass::run(Function &F,
10688 FunctionAnalysisManager &AM) {
10689 auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
10690 auto &LI = AM.getResult<LoopAnalysis>(F);
10691 auto &TTI = AM.getResult<TargetIRAnalysis>(F);
10692 auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
10693 auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
10694 auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
10695 auto &AA = AM.getResult<AAManager>(F);
10696 auto &AC = AM.getResult<AssumptionAnalysis>(F);
10697 auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
10698 auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
10699
10700 auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
10701 std::function<const LoopAccessInfo &(Loop &)> GetLAA =
10702 [&](Loop &L) -> const LoopAccessInfo & {
10703 LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE,
10704 TLI, TTI, nullptr, nullptr, nullptr};
10705 return LAM.getResult<LoopAccessAnalysis>(L, AR);
10706 };
10707 auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
10708 ProfileSummaryInfo *PSI =
10709 MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
10710 LoopVectorizeResult Result =
10711 runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
10712 if (!Result.MadeAnyChange)
10713 return PreservedAnalyses::all();
10714 PreservedAnalyses PA;
10715
10716 // We currently do not preserve loopinfo/dominator analyses with outer loop
10717 // vectorization. Until this is addressed, mark these analyses as preserved
10718 // only for the non-VPlan-native path.
10719 // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
10720 if (!EnableVPlanNativePath) {
10721 PA.preserve<LoopAnalysis>();
10722 PA.preserve<DominatorTreeAnalysis>();
10723 }
10724
10725 if (Result.MadeCFGChange) {
10726 // Making CFG changes likely means a loop got vectorized. Indicate that
10727 // extra simplification passes should be run.
10728 // TODO: MadeCFGChange is not a perfect proxy. Extra passes should only
10729 // be run if runtime checks have been added.
10730 AM.getResult<ShouldRunExtraVectorPasses>(F); 10731 PA.preserve<ShouldRunExtraVectorPasses>(); 10732 } else { 10733 PA.preserveSet<CFGAnalyses>(); 10734 } 10735 return PA; 10736 } 10737 10738 void LoopVectorizePass::printPipeline( 10739 raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) { 10740 static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline( 10741 OS, MapClassName2PassName); 10742 10743 OS << "<"; 10744 OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;"; 10745 OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;"; 10746 OS << ">"; 10747 } 10748