//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
//  Data for SIMD
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "LoopVectorizationPlanner.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanHCFGBuilder.h"
#include "VPlanPredicator.h"
#include "VPlanTransforms.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/InstructionCost.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <functional>
#include <iterator>
#include <limits>
#include <memory>
#include <string>
#include <tuple>
#include <utility>

using namespace llvm;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

#ifndef NDEBUG
const char VerboseDebug[] = DEBUG_TYPE "-verbose";
#endif

/// @{
/// Metadata attribute names
const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
const char LLVMLoopVectorizeFollowupVectorized[] =
    "llvm.loop.vectorize.followup_vectorized";
const char LLVMLoopVectorizeFollowupEpilogue[] =
    "llvm.loop.vectorize.followup_epilogue";
/// @}

STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");

static cl::opt<bool> EnableEpilogueVectorization(
    "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
    cl::desc("Enable vectorization of epilogue loops."));

static cl::opt<unsigned> EpilogueVectorizationForceVF(
    "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
    cl::desc("When epilogue vectorization is enabled, and a value greater than "
             "1 is specified, forces the given VF for all applicable epilogue "
             "loops."));

static cl::opt<unsigned> EpilogueVectorizationMinVF(
    "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
    cl::desc("Only loops with vectorization factor equal to or larger than "
             "the specified value are considered for epilogue vectorization."));

/// Loops with a known constant trip count below this number are vectorized only
/// if no scalar iteration overheads are incurred.
static cl::opt<unsigned> TinyTripCountVectorThreshold(
    "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
    cl::desc("Loops with a constant trip count that is smaller than this "
             "value are vectorized only if no scalar iteration overheads "
             "are incurred."));

static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold(
    "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
    cl::desc("The maximum allowed number of runtime memory checks with a "
             "vectorize(enable) pragma."));

// Option prefer-predicate-over-epilogue indicates that an epilogue is
// undesired, that predication is preferred, and this lists all options. I.e.,
// the vectorizer will try to fold the tail-loop (epilogue) into the vector
// body and predicate the instructions accordingly. If tail-folding fails,
// there are different fallback strategies depending on these values:
namespace PreferPredicateTy {
enum Option {
  ScalarEpilogue = 0,
  PredicateElseScalarEpilogue,
  PredicateOrDontVectorize
};
} // namespace PreferPredicateTy

static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
    "prefer-predicate-over-epilogue",
    cl::init(PreferPredicateTy::ScalarEpilogue),
    cl::Hidden,
    cl::desc("Tail-folding and predication preferences over creating a scalar "
             "epilogue loop."),
    cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
                          "scalar-epilogue",
                          "Don't tail-predicate loops, create scalar epilogue"),
               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
                          "predicate-else-scalar-epilogue",
                          "prefer tail-folding, create scalar epilogue if tail "
                          "folding fails."),
               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
                          "predicate-dont-vectorize",
                          "prefers tail-folding, don't attempt vectorization if "
                          "tail-folding fails.")));

static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));

static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
             "below this number"));

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

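// Illustrative usage only (a sketch, not normative documentation): the knobs
// above and below are hidden cl::opt flags, so in practice they are exercised
// through the opt tool or clang's -mllvm forwarding, e.g.
//   opt -passes=loop-vectorize -vectorizer-min-trip-count=4 ...
//   clang -O2 -mllvm -force-target-instruction-cost=1 ...
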
static cl::opt<bool> ForceTargetSupportsScalableVectors(
    "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
    cl::desc(
        "Pretend that scalable vectors are supported, even if the target does "
        "not support them. This flag should only be used for testing."));

static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// Interleave small loops with scalar reductions.
static cl::opt<bool> InterleaveSmallLoopScalarReduction(
    "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
    cl::desc("Enable interleaving for loops with small iteration counts that "
             "contain scalar reductions to expose ILP."));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if-predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

static cl::opt<bool>
    PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
                           cl::Hidden,
                           cl::desc("Prefer in-loop vector reductions, "
                                    "overriding the target's preference."));

static cl::opt<bool> ForceOrderedReductions(
    "force-ordered-reductions", cl::init(false), cl::Hidden,
    cl::desc("Enable the vectorization of loops with in-order (strict) "
             "FP reductions"));

static cl::opt<bool> PreferPredicatedReductionSelect(
    "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
    cl::desc(
        "Prefer predicating a reduction operation over an after loop select."));

cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));

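// Illustrative only (a sketch): the VPlan-native path is typically exercised
// by combining this flag with the pass itself, e.g.
//   opt -passes=loop-vectorize -enable-vplan-native-path ...
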
// FIXME: Remove this switch once we have divergence analysis. Currently we
// assume divergent non-backedge branches when this switch is true.
cl::opt<bool> EnableVPlanPredication(
    "enable-vplan-predication", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path predicator with "
             "support for outer loop vectorization."));

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

cl::opt<bool> PrintVPlansInDotFormat(
    "vplan-print-in-dot-format", cl::init(false), cl::Hidden,
    cl::desc("Use dot format instead of plain text when dumping VPlans"));

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type.
static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
  // Determine if an array of N elements of type Ty is "bitcast compatible"
  // with a <N x Ty> vector.
  // This is only true if there is no padding between the array elements.
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}

/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }

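// For example (a sketch of how this value is used by the cost model later in
// this file): a predicated block whose instructions cost C is charged roughly
// C / getReciprocalPredBlockProb() per loop iteration, i.e. C / 2 under the
// 50% execution assumption stated above.
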
/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

/// Returns "best known" trip count for the specified loop \p L as defined by
/// the following procedure:
///   1) Returns exact trip count if it is known.
///   2) Returns expected trip count according to profile data if any.
///   3) Returns upper bound estimate if it is known.
///   4) Returns None if all of the above failed.
static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
  // Check if exact trip count is known.
  if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
    return ExpectedTC;

  // Check if there is an expected trip count available from profile data.
  if (LoopVectorizeWithBlockFrequency)
    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
      return EstimatedTC;

  // Check if upper bound estimate is known.
  if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
    return ExpectedTC;

  return None;
}

// Forward declare GeneratedRTChecks.
class GeneratedRTChecks;

namespace llvm {

AnalysisKey ShouldRunExtraVectorPasses::Key;

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found to a given vectorization factor.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                      ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
        PSI(PSI), RTChecks(RTChecks) {
    // Query this against the original loop and save it here because the
    // profile of the original loop header may change as the transformation
    // happens.
    OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
        OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
  }

  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop that will contain vectorized instructions later
  /// on, while the old loop will be used as the scalar remainder. Control flow
  /// is generated around the vectorized (and scalar epilogue) loops consisting
  /// of various checks and bypasses. Return the pre-header block of the new
  /// loop and the start value for the canonical induction, if it is != 0. The
  /// latter is the case when vectorizing the epilogue loop. In the case of
  /// epilogue vectorization, this function is overridden to handle the more
  /// complex control flow around the loops.
  virtual std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton();

  /// Widen a single call instruction within the innermost loop.
  void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
                            VPTransformState &State);

  /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
  void fixVectorizedLoop(VPTransformState &State);

  // Return true if any runtime check is added.
  bool areSafetyChecksAdded() { return AddedSafetyChecks; }

  /// A type for vectorized values in the new loop. Each value from the
  /// original loop, when vectorized, is represented by UF vector values in the
  /// new unrolled loop, where UF is the unroll factor.
  using VectorParts = SmallVector<Value *, 2>;

  /// Vectorize a single first-order recurrence or pointer induction PHINode in
  /// a block. This method handles the induction variable canonicalization. It
  /// supports both VF = 1 for unrolled loops and arbitrary length vectors.
  void widenPHIInstruction(Instruction *PN, VPWidenPHIRecipe *PhiR,
                           VPTransformState &State);

  /// A helper function to scalarize a single Instruction in the innermost
  /// loop. Generates a sequence of scalar instances for each lane between \p
  /// MinLane and \p MaxLane, times each part between \p MinPart and \p
  /// MaxPart, inclusive. Uses the VPValue operands from \p RepRecipe instead
  /// of \p Instr's operands.
  void scalarizeInstruction(Instruction *Instr, VPReplicateRecipe *RepRecipe,
                            const VPIteration &Instance, bool IfPredicateInstr,
                            VPTransformState &State);

  /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
  /// is provided, the integer induction variable will first be truncated to
  /// the corresponding type. \p CanonicalIV is the scalar value generated for
  /// the canonical induction variable.
  void widenIntOrFpInduction(PHINode *IV, VPWidenIntOrFpInductionRecipe *Def,
                             VPTransformState &State, Value *CanonicalIV);

  /// Construct the vector value of a scalarized value \p V one lane at a time.
  void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance,
                                 VPTransformState &State);

  /// Try to vectorize interleaved access group \p Group with the base address
  /// given in \p Addr, optionally masking the vector operations if \p
  /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
  /// values in the vectorized loop.
  void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
                                ArrayRef<VPValue *> VPDefs,
                                VPTransformState &State, VPValue *Addr,
                                ArrayRef<VPValue *> StoredValues,
                                VPValue *BlockInMask = nullptr);

  /// Set the debug location in the builder \p CustomBuilder using the debug
  /// location in \p V. If \p CustomBuilder is None then it uses the class
  /// member's Builder.
  void setDebugLocFromInst(const Value *V,
                           Optional<IRBuilderBase *> CustomBuilder = None);

  /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
  void fixNonInductionPHIs(VPTransformState &State);

  /// Returns true if the reordering of FP operations is not allowed, but we
  /// are able to vectorize with strict in-order reductions for the given
  /// RdxDesc.
  bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc);

  /// Create a broadcast instruction. This method generates a broadcast
  /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, ...
  /// this is needed because each iteration in the loop corresponds to a SIMD
  /// element.
  virtual Value *getBroadcastInstrs(Value *V);

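  // For example (illustrative only): broadcasting a loop-invariant value %x at
  // VF = 4 produces <%x, %x, %x, %x>, whereas for the induction variable with
  // scalar value N the lanes are extended to N, N+1, N+2, N+3.
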
  /// Add metadata from one instruction to another.
  ///
  /// This includes both the original MDs from \p From and additional ones (\see
  /// addNewMetadata). Use this for *newly created* instructions in the vector
  /// loop.
  void addMetadata(Instruction *To, Instruction *From);

  /// Similar to the previous function but it adds the metadata to a
  /// vector of instructions.
  void addMetadata(ArrayRef<Value *> To, Instruction *From);

  // Returns the resume value (bc.merge.rdx) for a reduction as
  // generated by fixReduction.
  PHINode *getReductionResumeValue(const RecurrenceDescriptor &RdxDesc);

protected:
  friend class LoopVectorizationPlanner;

  /// A small list of PHINodes.
  using PhiVector = SmallVector<PHINode *, 4>;

  /// A type for scalarized values in the new loop. Each value from the
  /// original loop, when scalarized, is represented by UF x VF scalar values
  /// in the new unrolled loop, where UF is the unroll factor and VF is the
  /// vectorization factor.
  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;

  /// Set up the values of the IVs correctly when exiting the vector loop.
  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                    Value *CountRoundDown, Value *EndValue,
                    BasicBlock *MiddleBlock);

  /// Introduce a conditional branch (on true, condition to be set later) at the
  /// end of the header=latch connecting it to itself (across the backedge) and
  /// to the exit block of \p L.
  void createHeaderBranch(Loop *L);

  /// Handle all cross-iteration phis in the header.
  void fixCrossIterationPHIs(VPTransformState &State);

  /// Create the exit value of first order recurrences in the middle block and
  /// update their users.
  void fixFirstOrderRecurrence(VPFirstOrderRecurrencePHIRecipe *PhiR,
                               VPTransformState &State);

  /// Create code for the loop exit value of the reduction.
  void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State);

  /// Clear NSW/NUW flags from reduction instructions if necessary.
  void clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc,
                               VPTransformState &State);

  /// Fixup the LCSSA phi nodes in the unique exit block. This simply
  /// means we need to add the appropriate incoming value from the middle
  /// block as exiting edges from the scalar epilogue loop (if present) are
  /// already in place, and we exit the vector loop exclusively to the middle
  /// block.
  void fixLCSSAPHIs(VPTransformState &State);

  /// Iteratively sink the scalarized operands of a predicated instruction into
  /// the block that was created for it.
  void sinkScalarOperands(Instruction *PredInst);

  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
  /// represented as.
  void truncateToMinimalBitwidths(VPTransformState &State);

  /// Create a vector induction phi node based on an existing scalar one. \p
  /// EntryVal is the value from the original loop that maps to the vector phi
  /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
  /// truncate instruction, instead of widening the original IV, we widen a
  /// version of the IV truncated to \p EntryVal's type.
  void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
                                       Value *Step, Value *Start,
                                       Instruction *EntryVal, VPValue *Def,
                                       VPTransformState &State);

  /// Returns (and creates if needed) the original loop trip count.
  Value *getOrCreateTripCount(Loop *NewLoop);

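  // Illustrative relation between the two trip counts (a sketch): with an
  // original trip count of 1003, VF = 4 and UF = 2, the widened loop covers
  // 1000 iterations (1003 - 1003 % 8) and the remaining 3 iterations run in
  // the scalar epilogue.
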
  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(Loop *NewLoop);

  /// Returns a bitcasted value to the requested vector type.
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                const DataLayout &DL);

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct. Returns the block containing the checks or
  /// nullptr if no checks have been added.
  BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  /// Returns the block containing the checks or nullptr if no checks have been
  /// added.
  BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);

  /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
  /// vector loop preheader, middle block and scalar preheader. Also
  /// allocate a loop object for the new vector loop and return it.
  Loop *createVectorLoopSkeleton(StringRef Prefix);

  /// Create new phi nodes for the induction variables to resume iteration
  /// count in the scalar epilogue, from where the vectorized loop left off.
  /// In cases where the loop skeleton is more complicated (e.g. epilogue
  /// vectorization) and the resume values can come from an additional bypass
  /// block, the \p AdditionalBypass pair provides information about the bypass
  /// block and the end value on the edge from bypass to this loop.
  void createInductionResumeValues(
      Loop *L,
      std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});

  /// Complete the loop skeleton by adding debug MDs, creating appropriate
  /// conditional branches in the middle block, preparing the builder and
  /// running the verifier. Take in the vector loop \p L as argument, and return
  /// the preheader of the completed vector loop.
  BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);

  /// Add additional metadata to \p To that was not present on \p Orig.
  ///
  /// Currently this is used to add the noalias annotations based on the
  /// inserted memchecks. Use this for instructions that are *cloned* into the
  /// vector loop.
  void addNewMetadata(Instruction *To, const Instruction *Orig);

  /// Collect poison-generating recipes that may generate a poison value that
  /// is used after vectorization, even when their operands are not poison.
  /// Those recipes meet the following conditions:
  /// * Contribute to the address computation of a recipe generating a widen
  ///   memory load/store (VPWidenMemoryInstructionRecipe or
  ///   VPInterleaveRecipe).
  /// * Such a widen memory load/store has at least one underlying Instruction
  ///   that is in a basic block that needs predication and after vectorization
  ///   the generated instruction won't be predicated.
  void collectPoisonGeneratingRecipes(VPTransformState &State);

  /// Allow subclasses to override and print debug traces before/after vplan
  /// execution, when trace information is requested.
  virtual void printDebugTracesAtStart(){};
  virtual void printDebugTracesAtEnd(){};

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Alias Analysis.
  AAResults *AA;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// LoopVersioning. It's only set up (non-null) if memchecks were
  /// used.
  ///
  /// This is currently only used to add no-alias metadata based on the
  /// memchecks. The actual versioning is performed manually.
  std::unique_ptr<LoopVersioning> LVer;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  ElementCount VF;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;

  /// The unique ExitBlock of the scalar loop if one exists. Note that
  /// there can be multiple exiting edges reaching this block.
  BasicBlock *LoopExitBlock;

  /// The vector loop body.
  BasicBlock *LoopVectorBody;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
  // fixed up at the end of vector code generation.
  SmallVector<PHINode *, 8> OrigPHIsToFix;

  /// BFI and PSI are used to check for profile guided size optimizations.
  BlockFrequencyInfo *BFI;
  ProfileSummaryInfo *PSI;

  // Whether this loop should be optimized for size based on profile guided
  // size optimizations.
  bool OptForSizeBasedOnProfile;

  /// Structure to hold information about generated runtime checks, responsible
  /// for cleaning the checks, if vectorization turns out unprofitable.
  GeneratedRTChecks &RTChecks;

  // Holds the resume values for reductions in the loops, used to set the
  // correct start value of reduction PHIs when vectorizing the epilogue.
  SmallMapVector<const RecurrenceDescriptor *, PHINode *, 4>
      ReductionResumeValues;
};

class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                    ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            ElementCount::getFixed(1), UnrollFactor, LVL, CM,
                            BFI, PSI, Check) {}

private:
  Value *getBroadcastInstrs(Value *V) override;
};

/// Encapsulate information regarding vectorization of a loop and its epilogue.
/// This information is meant to be updated and used across two stages of
/// epilogue vectorization.
struct EpilogueLoopVectorizationInfo {
  ElementCount MainLoopVF = ElementCount::getFixed(0);
  unsigned MainLoopUF = 0;
  ElementCount EpilogueVF = ElementCount::getFixed(0);
  unsigned EpilogueUF = 0;
  BasicBlock *MainLoopIterationCountCheck = nullptr;
  BasicBlock *EpilogueIterationCountCheck = nullptr;
  BasicBlock *SCEVSafetyCheck = nullptr;
  BasicBlock *MemSafetyCheck = nullptr;
  Value *TripCount = nullptr;
  Value *VectorTripCount = nullptr;

  EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF,
                                ElementCount EVF, unsigned EUF)
      : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) {
    assert(EUF == 1 &&
           "A high UF for the epilogue loop is likely not beneficial.");
  }
};

/// An extension of the inner loop vectorizer that creates a skeleton for a
/// vectorized loop that has its epilogue (residual) also vectorized.
/// The idea is to run the vplan on a given loop twice, first to set up the
/// skeleton and vectorize the main loop, and second to complete the skeleton
/// from the first step and vectorize the epilogue. This is achieved by
/// deriving two concrete strategy classes from this base class and invoking
/// them in succession from the loop vectorizer planner.
class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
public:
  InnerLoopAndEpilogueVectorizer(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI,
                            Checks),
        EPI(EPI) {}

  // Override this function to handle the more complex control flow around the
  // three loops.
  std::pair<BasicBlock *, Value *>
  createVectorizedLoopSkeleton() final override {
    return createEpilogueVectorizedLoopSkeleton();
  }

  /// The interface for creating a vectorized skeleton using one of two
  /// different strategies, each corresponding to one execution of the vplan
  /// as described above.
  virtual std::pair<BasicBlock *, Value *>
  createEpilogueVectorizedLoopSkeleton() = 0;

  /// Holds and updates state information required to vectorize the main loop
  /// and its epilogue in two separate passes. This setup helps us avoid
  /// regenerating and recomputing runtime safety checks. It also helps us to
  /// shorten the iteration-count-check path length for the cases where the
  /// iteration count of the loop is so small that the main vector loop is
  /// completely skipped.
  EpilogueLoopVectorizationInfo &EPI;
};

/// A specialized derived class of inner loop vectorizer that performs
/// vectorization of *main* loops in the process of vectorizing loops and their
/// epilogues.
class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerMainLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Check)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Check) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *main loop* strategy (i.e. the first pass of vplan execution).
  std::pair<BasicBlock *, Value *>
  createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check once for the main loop (when \p
  /// ForEpilogue is false) and once for the epilogue loop (when \p
  /// ForEpilogue is true).
  BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass,
                                             bool ForEpilogue);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};

// A specialized derived class of inner loop vectorizer that performs
// vectorization of *epilogue* loops in the process of vectorizing loops and
// their epilogues.
class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerEpilogueLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Checks) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *epilogue loop* strategy (i.e. the second pass of vplan execution).
  std::pair<BasicBlock *, Value *>
  createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check after the main vector loop has
  /// finished to see if there are any iterations left to execute by either
  /// the vector epilogue or the scalar epilogue.
  BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L,
                                                      BasicBlock *Bypass,
                                                      BasicBlock *Insert);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};
} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its
/// operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return I;

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I;

  for (Use &Op : I->operands()) {
    if (Instruction *OpInst = dyn_cast<Instruction>(Op))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }

  return I;
}

void InnerLoopVectorizer::setDebugLocFromInst(
    const Value *V, Optional<IRBuilderBase *> CustomBuilder) {
  IRBuilderBase *B = (CustomBuilder == None) ? &Builder : *CustomBuilder;
  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(V)) {
    const DILocation *DIL = Inst->getDebugLoc();

    // When a FSDiscriminator is enabled, we don't need to add the multiply
    // factors to the discriminators.
    if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
        !isa<DbgInfoIntrinsic>(Inst) && !EnableFSDiscriminator) {
      // FIXME: For scalable vectors, assume vscale=1.
      auto NewDIL =
          DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
      if (NewDIL)
        B->SetCurrentDebugLocation(NewDIL.getValue());
      else
        LLVM_DEBUG(dbgs()
                   << "Failed to create new discriminator: "
                   << DIL->getFilename() << " Line: " << DIL->getLine());
    } else
      B->SetCurrentDebugLocation(DIL);
  } else
    B->SetCurrentDebugLocation(DebugLoc());
}

/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
/// is passed, the message relates to that particular instruction.
#ifndef NDEBUG
static void debugVectorizationMessage(const StringRef Prefix,
                                      const StringRef DebugMsg,
                                      Instruction *I) {
  dbgs() << "LV: " << Prefix << DebugMsg;
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}
#endif

/// Create an analysis remark that explains why vectorization failed
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
/// RemarkName is the identifier for the remark. If \p I is passed it is an
/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
/// the location of the remark. \return the remark object that can be
/// streamed to.
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
                                                   StringRef RemarkName,
                                                   Loop *TheLoop,
                                                   Instruction *I) {
  Value *CodeRegion = TheLoop->getHeader();
  DebugLoc DL = TheLoop->getStartLoc();

  if (I) {
    CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, revert back
    // to using the loop's.
    if (I->getDebugLoc())
      DL = I->getDebugLoc();
  }

  return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
}

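// Illustrative only (a sketch, not an exact rendering): remarks built with the
// helpers above and below typically surface to users as something like
//   remark: <file>:<line>: loop not vectorized: <reason>
// when compiling with -Rpass-analysis=loop-vectorize or
// -Rpass-missed=loop-vectorize.
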
namespace llvm {

/// Return a value for Step multiplied by VF.
Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
                       int64_t Step) {
  assert(Ty->isIntegerTy() && "Expected an integer step");
  Constant *StepVal = ConstantInt::get(Ty, Step * VF.getKnownMinValue());
  return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
}

/// Return the runtime value for VF.
Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) {
  Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue());
  return VF.isScalable() ? B.CreateVScale(EC) : EC;
}

static Value *getRuntimeVFAsFloat(IRBuilderBase &B, Type *FTy,
                                  ElementCount VF) {
  assert(FTy->isFloatingPointTy() && "Expected floating point type!");
  Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits());
  Value *RuntimeVF = getRuntimeVF(B, IntTy, VF);
  return B.CreateUIToFP(RuntimeVF, FTy);
}

void reportVectorizationFailure(const StringRef DebugMsg,
                                const StringRef OREMsg, const StringRef ORETag,
                                OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                                Instruction *I) {
  LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(
      createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
      << "loop not vectorized: " << OREMsg);
}

void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
                             OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                             Instruction *I) {
  LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(
      createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
      << Msg);
}

} // end namespace llvm

#ifndef NDEBUG
/// \return string containing a file name and a line # for the given loop.
static std::string getDebugLocString(const Loop *L) {
  std::string Result;
  if (L) {
    raw_string_ostream OS(Result);
    if (const DebugLoc LoopDbgLoc = L->getStartLoc())
      LoopDbgLoc.print(OS);
    else
      // Just print the module name.
      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
    OS.flush();
  }
  return Result;
}
#endif

void InnerLoopVectorizer::addNewMetadata(Instruction *To,
                                         const Instruction *Orig) {
  // If the loop was versioned with memchecks, add the corresponding no-alias
  // metadata.
  if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
    LVer->annotateInstWithNoAlias(To, Orig);
}

void InnerLoopVectorizer::collectPoisonGeneratingRecipes(
    VPTransformState &State) {

  // Collect recipes in the backward slice of `Root` that may generate a poison
  // value that is used after vectorization.
  SmallPtrSet<VPRecipeBase *, 16> Visited;
  auto collectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
    SmallVector<VPRecipeBase *, 16> Worklist;
    Worklist.push_back(Root);

    // Traverse the backward slice of Root through its use-def chain.
    while (!Worklist.empty()) {
      VPRecipeBase *CurRec = Worklist.back();
      Worklist.pop_back();

      if (!Visited.insert(CurRec).second)
        continue;

      // Prune search if we find another recipe generating a widen memory
      // instruction. Widen memory instructions involved in address computation
      // will lead to gather/scatter instructions, which don't need to be
      // handled.
      if (isa<VPWidenMemoryInstructionRecipe>(CurRec) ||
          isa<VPInterleaveRecipe>(CurRec) ||
          isa<VPCanonicalIVPHIRecipe>(CurRec))
        continue;

      // This recipe contributes to the address computation of a widen
      // load/store. Collect recipe if its underlying instruction has
      // poison-generating flags.
      Instruction *Instr = CurRec->getUnderlyingInstr();
      if (Instr && Instr->hasPoisonGeneratingFlags())
        State.MayGeneratePoisonRecipes.insert(CurRec);

      // Add new definitions to the worklist.
      for (VPValue *operand : CurRec->operands())
        if (VPDef *OpDef = operand->getDef())
          Worklist.push_back(cast<VPRecipeBase>(OpDef));
    }
  });

  // Traverse all the recipes in the VPlan and collect the poison-generating
  // recipes in the backward slice starting at the address of a VPWidenRecipe
  // or VPInterleaveRecipe.
  auto Iter = depth_first(
      VPBlockRecursiveTraversalWrapper<VPBlockBase *>(State.Plan->getEntry()));
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
    for (VPRecipeBase &Recipe : *VPBB) {
      if (auto *WidenRec = dyn_cast<VPWidenMemoryInstructionRecipe>(&Recipe)) {
        Instruction *UnderlyingInstr = WidenRec->getUnderlyingInstr();
        VPDef *AddrDef = WidenRec->getAddr()->getDef();
        if (AddrDef && WidenRec->isConsecutive() && UnderlyingInstr &&
            Legal->blockNeedsPredication(UnderlyingInstr->getParent()))
          collectPoisonGeneratingInstrsInBackwardSlice(
              cast<VPRecipeBase>(AddrDef));
      } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
        VPDef *AddrDef = InterleaveRec->getAddr()->getDef();
        if (AddrDef) {
          // Check if any member of the interleave group needs predication.
          const InterleaveGroup<Instruction> *InterGroup =
              InterleaveRec->getInterleaveGroup();
          bool NeedPredication = false;
          for (int I = 0, NumMembers = InterGroup->getNumMembers();
               I < NumMembers; ++I) {
            Instruction *Member = InterGroup->getMember(I);
            if (Member)
              NeedPredication |=
                  Legal->blockNeedsPredication(Member->getParent());
          }

          if (NeedPredication)
            collectPoisonGeneratingInstrsInBackwardSlice(
                cast<VPRecipeBase>(AddrDef));
        }
      }
    }
  }
}

void InnerLoopVectorizer::addMetadata(Instruction *To,
                                      Instruction *From) {
  propagateMetadata(To, From);
  addNewMetadata(To, From);
}

void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
                                      Instruction *From) {
  for (Value *V : To) {
    if (Instruction *I = dyn_cast<Instruction>(V))
      addMetadata(I, From);
  }
}

PHINode *InnerLoopVectorizer::getReductionResumeValue(
    const RecurrenceDescriptor &RdxDesc) {
  auto It = ReductionResumeValues.find(&RdxDesc);
  assert(It != ReductionResumeValues.end() &&
         "Expected to find a resume value for the reduction.");
  return It->second;
}

namespace llvm {

// Loop vectorization cost-model hints how the scalar epilogue loop should be
// lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorization with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate,

  // Directive indicating we must either tail fold or not vectorize
  CM_ScalarEpilogueNotAllowedUsePredicate
};

/// ElementCountComparator creates a total ordering for ElementCount
/// for the purposes of using it in a set structure.
struct ElementCountComparator {
  bool operator()(const ElementCount &LHS, const ElementCount &RHS) const {
    return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) <
           std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue());
  }
};
using ElementCountSet = SmallSet<ElementCount, 16, ElementCountComparator>;

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because of
/// a number of reasons. In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
class LoopVectorizationCostModel {
public:
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {}

  /// \return An upper bound for the vectorization factors (both fixed and
  /// scalable). If the factors are 0, vectorization and interleaving should be
  /// avoided up front.
  FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
  bool runtimeChecksRequired();

  /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every VF in \p CandidateVFs. If UserVF is not ZERO
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
  VectorizationFactor
  selectVectorizationFactor(const ElementCountSet &CandidateVFs);

  VectorizationFactor
  selectEpilogueVectorizationFactor(const ElementCount MaxVF,
                                    const LoopVectorizationPlanner &LVP);

  /// Set up cost-based decisions for the user vectorization factor.
  /// \return true if the UserVF is a feasible VF to be chosen.
  bool selectUserVectorizationFactor(ElementCount UserVF) {
    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);
    return expectedCost(UserVF).first.isValid();
  }

  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// \return The desired interleave count.
  /// If interleave count has been specified by metadata it will be returned.
  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
  /// are the selected vectorization factor and the cost of the selected VF.
  unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);

  /// A memory access instruction may be vectorized in more than one way; the
  /// form it takes after vectorization depends on cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decision map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with the widening decision in order to
  /// avoid redundant calculations.
  void setCostBasedWideningDecision(ElementCount VF);

  /// A struct that represents some properties of the register usage
  /// of a loop.
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
    /// Holds the maximum number of concurrent live intervals in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
  };

  /// \return Returns information about the register usages of the loop for the
  /// given vectorization factors.
  SmallVector<RegisterUsage, 8>
  calculateRegisterUsage(ArrayRef<ElementCount> VFs);

  /// Collect values we want to ignore in the cost model.
  void collectValuesToIgnore();

  /// Collect all element types in the loop for which widening is needed.
  void collectElementTypesForWidening();

  /// Split reductions into those that happen in the loop, and those that
  /// happen outside. In-loop reductions are collected into
  /// InLoopReductionChains.
  void collectInLoopReductions();

  /// Returns true if we should use strict in-order reductions for the given
  /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
  /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
  /// of FP operations.
  bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) {
    return !Hints->allowReordering() && RdxDesc.isOrdered();
  }

  /// \returns The smallest bitwidth each instruction can be represented with.
  /// The vector equivalents of these instructions should be truncated to this
  /// type.
  const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
    return MinBWs;
  }

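  // For example (illustrative only): if only the low 8 bits of an i32 value
  // are ever demanded, MinBWs may record 8 for it, and the vectorized form can
  // then be emitted on <VF x i8> instead of <VF x i32>.
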
1360 bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
1361 assert(VF.isVector() &&
1362 "Profitable to scalarize relevant only for VF > 1.");
1363
1364 // Cost model is not run in the VPlan-native path - return conservative
1365 // result until this changes.
1366 if (EnableVPlanNativePath)
1367 return false;
1368
1369 auto Scalars = InstsToScalarize.find(VF);
1370 assert(Scalars != InstsToScalarize.end() &&
1371 "VF not yet analyzed for scalarization profitability");
1372 return Scalars->second.find(I) != Scalars->second.end();
1373 }
1374
1375 /// Returns true if \p I is known to be uniform after vectorization.
1376 bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
1377 if (VF.isScalar())
1378 return true;
1379
1380 // Cost model is not run in the VPlan-native path - return conservative
1381 // result until this changes.
1382 if (EnableVPlanNativePath)
1383 return false;
1384
1385 auto UniformsPerVF = Uniforms.find(VF);
1386 assert(UniformsPerVF != Uniforms.end() &&
1387 "VF not yet analyzed for uniformity");
1388 return UniformsPerVF->second.count(I);
1389 }
1390
1391 /// Returns true if \p I is known to be scalar after vectorization.
1392 bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
1393 if (VF.isScalar())
1394 return true;
1395
1396 // Cost model is not run in the VPlan-native path - return conservative
1397 // result until this changes.
1398 if (EnableVPlanNativePath)
1399 return false;
1400
1401 auto ScalarsPerVF = Scalars.find(VF);
1402 assert(ScalarsPerVF != Scalars.end() &&
1403 "Scalar values are not calculated for VF");
1404 return ScalarsPerVF->second.count(I);
1405 }
1406
1407 /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1408 /// for vectorization factor \p VF.
1409 bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
1410 return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
1411 !isProfitableToScalarize(I, VF) &&
1412 !isScalarAfterVectorization(I, VF);
1413 }
1414
1415 /// Decision that was taken during cost calculation for a memory instruction.
1416 enum InstWidening {
1417 CM_Unknown,
1418 CM_Widen, // For consecutive accesses with stride +1.
1419 CM_Widen_Reverse, // For consecutive accesses with stride -1.
1420 CM_Interleave,
1421 CM_GatherScatter,
1422 CM_Scalarize
1423 };
1424
1425 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1426 /// instruction \p I and vector width \p VF.
1427 void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
1428 InstructionCost Cost) {
1429 assert(VF.isVector() && "Expected VF >=2");
1430 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1431 }
1432
1433 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1434 /// interleaving group \p Grp and vector width \p VF.
1435 void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1436 ElementCount VF, InstWidening W,
1437 InstructionCost Cost) {
1438 assert(VF.isVector() && "Expected VF >=2");
1439 /// Broadcast this decision to all instructions inside the group.
1440 /// But the cost will be assigned to one instruction only.
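// For illustration only (hypothetical members, not taken from this code): for
// a factor-3 group whose insert position is member 1, the loop below fills the
// decisions map as
//   (member0, VF) -> (W, 0)
//   (member1, VF) -> (W, Cost)   // the insert position carries the full cost
//   (member2, VF) -> (W, 0)
// so summing the per-instruction costs of the group counts the group cost once.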
1441 for (unsigned i = 0; i < Grp->getFactor(); ++i) { 1442 if (auto *I = Grp->getMember(i)) { 1443 if (Grp->getInsertPos() == I) 1444 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost); 1445 else 1446 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0); 1447 } 1448 } 1449 } 1450 1451 /// Return the cost model decision for the given instruction \p I and vector 1452 /// width \p VF. Return CM_Unknown if this instruction did not pass 1453 /// through the cost modeling. 1454 InstWidening getWideningDecision(Instruction *I, ElementCount VF) const { 1455 assert(VF.isVector() && "Expected VF to be a vector VF"); 1456 // Cost model is not run in the VPlan-native path - return conservative 1457 // result until this changes. 1458 if (EnableVPlanNativePath) 1459 return CM_GatherScatter; 1460 1461 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF); 1462 auto Itr = WideningDecisions.find(InstOnVF); 1463 if (Itr == WideningDecisions.end()) 1464 return CM_Unknown; 1465 return Itr->second.first; 1466 } 1467 1468 /// Return the vectorization cost for the given instruction \p I and vector 1469 /// width \p VF. 1470 InstructionCost getWideningCost(Instruction *I, ElementCount VF) { 1471 assert(VF.isVector() && "Expected VF >=2"); 1472 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF); 1473 assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() && 1474 "The cost is not calculated"); 1475 return WideningDecisions[InstOnVF].second; 1476 } 1477 1478 /// Return True if instruction \p I is an optimizable truncate whose operand 1479 /// is an induction variable. Such a truncate will be removed by adding a new 1480 /// induction variable with the destination type. 1481 bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) { 1482 // If the instruction is not a truncate, return false. 1483 auto *Trunc = dyn_cast<TruncInst>(I); 1484 if (!Trunc) 1485 return false; 1486 1487 // Get the source and destination types of the truncate. 1488 Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF); 1489 Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF); 1490 1491 // If the truncate is free for the given types, return false. Replacing a 1492 // free truncate with an induction variable would add an induction variable 1493 // update instruction to each iteration of the loop. We exclude from this 1494 // check the primary induction variable since it will need an update 1495 // instruction regardless. 1496 Value *Op = Trunc->getOperand(0); 1497 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy)) 1498 return false; 1499 1500 // If the truncated value is not an induction variable, return false. 1501 return Legal->isInductionPhi(Op); 1502 } 1503 1504 /// Collects the instructions to scalarize for each predicated instruction in 1505 /// the loop. 1506 void collectInstsToScalarize(ElementCount VF); 1507 1508 /// Collect Uniform and Scalar values for the given \p VF. 1509 /// The sets depend on CM decision for Load/Store instructions 1510 /// that may be vectorized as interleave, gather-scatter or scalarized. 1511 void collectUniformsAndScalars(ElementCount VF) { 1512 // Do the analysis once. 
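// The results are cached per VF in the Uniforms/Scalars maps, so a repeated
// query for an already-analyzed VF (or for a scalar VF, which needs no
// analysis) returns immediately below.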
1513 if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end()) 1514 return; 1515 setCostBasedWideningDecision(VF); 1516 collectLoopUniforms(VF); 1517 collectLoopScalars(VF); 1518 } 1519 1520 /// Returns true if the target machine supports masked store operation 1521 /// for the given \p DataType and kind of access to \p Ptr. 1522 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const { 1523 return Legal->isConsecutivePtr(DataType, Ptr) && 1524 TTI.isLegalMaskedStore(DataType, Alignment); 1525 } 1526 1527 /// Returns true if the target machine supports masked load operation 1528 /// for the given \p DataType and kind of access to \p Ptr. 1529 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const { 1530 return Legal->isConsecutivePtr(DataType, Ptr) && 1531 TTI.isLegalMaskedLoad(DataType, Alignment); 1532 } 1533 1534 /// Returns true if the target machine can represent \p V as a masked gather 1535 /// or scatter operation. 1536 bool isLegalGatherOrScatter(Value *V, 1537 ElementCount VF = ElementCount::getFixed(1)) { 1538 bool LI = isa<LoadInst>(V); 1539 bool SI = isa<StoreInst>(V); 1540 if (!LI && !SI) 1541 return false; 1542 auto *Ty = getLoadStoreType(V); 1543 Align Align = getLoadStoreAlignment(V); 1544 if (VF.isVector()) 1545 Ty = VectorType::get(Ty, VF); 1546 return (LI && TTI.isLegalMaskedGather(Ty, Align)) || 1547 (SI && TTI.isLegalMaskedScatter(Ty, Align)); 1548 } 1549 1550 /// Returns true if the target machine supports all of the reduction 1551 /// variables found for the given VF. 1552 bool canVectorizeReductions(ElementCount VF) const { 1553 return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 1554 const RecurrenceDescriptor &RdxDesc = Reduction.second; 1555 return TTI.isLegalToVectorizeReduction(RdxDesc, VF); 1556 })); 1557 } 1558 1559 /// Returns true if \p I is an instruction that will be scalarized with 1560 /// predication when vectorizing \p I with vectorization factor \p VF. Such 1561 /// instructions include conditional stores and instructions that may divide 1562 /// by zero. 1563 bool isScalarWithPredication(Instruction *I, ElementCount VF) const; 1564 1565 // Returns true if \p I is an instruction that will be predicated either 1566 // through scalar predication or masked load/store or masked gather/scatter. 1567 // \p VF is the vectorization factor that will be used to vectorize \p I. 1568 // Superset of instructions that return true for isScalarWithPredication. 1569 bool isPredicatedInst(Instruction *I, ElementCount VF, 1570 bool IsKnownUniform = false) { 1571 // When we know the load is uniform and the original scalar loop was not 1572 // predicated we don't need to mark it as a predicated instruction. Any 1573 // vectorised blocks created when tail-folding are something artificial we 1574 // have introduced and we know there is always at least one active lane. 1575 // That's why we call Legal->blockNeedsPredication here because it doesn't 1576 // query tail-folding. 1577 if (IsKnownUniform && isa<LoadInst>(I) && 1578 !Legal->blockNeedsPredication(I->getParent())) 1579 return false; 1580 if (!blockNeedsPredicationForAnyReason(I->getParent())) 1581 return false; 1582 // Loads and stores that need some form of masked operation are predicated 1583 // instructions. 
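// Anything that is not a load or store is predicated only if it would have to
// be scalarized and guarded, e.g. an instruction that may divide by zero (see
// isScalarWithPredication above).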
1584 if (isa<LoadInst>(I) || isa<StoreInst>(I)) 1585 return Legal->isMaskRequired(I); 1586 return isScalarWithPredication(I, VF); 1587 } 1588 1589 /// Returns true if \p I is a memory instruction with consecutive memory 1590 /// access that can be widened. 1591 bool 1592 memoryInstructionCanBeWidened(Instruction *I, 1593 ElementCount VF = ElementCount::getFixed(1)); 1594 1595 /// Returns true if \p I is a memory instruction in an interleaved-group 1596 /// of memory accesses that can be vectorized with wide vector loads/stores 1597 /// and shuffles. 1598 bool 1599 interleavedAccessCanBeWidened(Instruction *I, 1600 ElementCount VF = ElementCount::getFixed(1)); 1601 1602 /// Check if \p Instr belongs to any interleaved access group. 1603 bool isAccessInterleaved(Instruction *Instr) { 1604 return InterleaveInfo.isInterleaved(Instr); 1605 } 1606 1607 /// Get the interleaved access group that \p Instr belongs to. 1608 const InterleaveGroup<Instruction> * 1609 getInterleavedAccessGroup(Instruction *Instr) { 1610 return InterleaveInfo.getInterleaveGroup(Instr); 1611 } 1612 1613 /// Returns true if we're required to use a scalar epilogue for at least 1614 /// the final iteration of the original loop. 1615 bool requiresScalarEpilogue(ElementCount VF) const { 1616 if (!isScalarEpilogueAllowed()) 1617 return false; 1618 // If we might exit from anywhere but the latch, must run the exiting 1619 // iteration in scalar form. 1620 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) 1621 return true; 1622 return VF.isVector() && InterleaveInfo.requiresScalarEpilogue(); 1623 } 1624 1625 /// Returns true if a scalar epilogue is not allowed due to optsize or a 1626 /// loop hint annotation. 1627 bool isScalarEpilogueAllowed() const { 1628 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed; 1629 } 1630 1631 /// Returns true if all loop blocks should be masked to fold tail loop. 1632 bool foldTailByMasking() const { return FoldTailByMasking; } 1633 1634 /// Returns true if the instructions in this block requires predication 1635 /// for any reason, e.g. because tail folding now requires a predicate 1636 /// or because the block in the original loop was predicated. 1637 bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const { 1638 return foldTailByMasking() || Legal->blockNeedsPredication(BB); 1639 } 1640 1641 /// A SmallMapVector to store the InLoop reduction op chains, mapping phi 1642 /// nodes to the chain of instructions representing the reductions. Uses a 1643 /// MapVector to ensure deterministic iteration order. 1644 using ReductionChainMap = 1645 SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>; 1646 1647 /// Return the chain of instructions representing an inloop reduction. 1648 const ReductionChainMap &getInLoopReductionChains() const { 1649 return InLoopReductionChains; 1650 } 1651 1652 /// Returns true if the Phi is part of an inloop reduction. 1653 bool isInLoopReduction(PHINode *Phi) const { 1654 return InLoopReductionChains.count(Phi); 1655 } 1656 1657 /// Estimate cost of an intrinsic call instruction CI if it were vectorized 1658 /// with factor VF. Return the cost of the instruction, including 1659 /// scalarization overhead if it's needed. 1660 InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const; 1661 1662 /// Estimate cost of a call instruction CI if it were vectorized with factor 1663 /// VF. Return the cost of the instruction, including scalarization overhead 1664 /// if it's needed. 
The flag NeedToScalarize shows if the call needs to be
1665 /// scalarized -
1666 /// i.e. either a vector version isn't available, or it is too expensive.
1667 InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF,
1668 bool &NeedToScalarize) const;
1669
1670 /// Returns true if the per-lane cost of VectorizationFactor A is lower than
1671 /// that of B.
1672 bool isMoreProfitable(const VectorizationFactor &A,
1673 const VectorizationFactor &B) const;
1674
1675 /// Invalidates decisions already taken by the cost model.
1676 void invalidateCostModelingDecisions() {
1677 WideningDecisions.clear();
1678 Uniforms.clear();
1679 Scalars.clear();
1680 }
1681
1682 private:
1683 unsigned NumPredStores = 0;
1684
1685 /// Convenience function that returns the value of vscale_range iff
1686 /// vscale_range.min == vscale_range.max or otherwise returns the value
1687 /// returned by the corresponding TLI method.
1688 Optional<unsigned> getVScaleForTuning() const;
1689
1690 /// \return An upper bound for the vectorization factors for both
1691 /// fixed and scalable vectorization, where the minimum-known number of
1692 /// elements is a power-of-2 larger than zero. If scalable vectorization is
1693 /// disabled or unsupported, then the scalable part will be equal to
1694 /// ElementCount::getScalable(0).
1695 FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount,
1696 ElementCount UserVF,
1697 bool FoldTailByMasking);
1698
1699 /// \return the maximized element count based on the target's vector
1700 /// registers and the loop trip-count, but limited to a maximum safe VF.
1701 /// This is a helper function of computeFeasibleMaxVF.
1702 /// FIXME: MaxSafeVF is currently passed by reference to avoid some obscure
1703 /// issue that occurred on one of the buildbots which cannot be reproduced
1704 /// without having access to the proprietary compiler (see comments on
1705 /// D98509). The issue is currently under investigation and this workaround
1706 /// will be removed as soon as possible.
1707 ElementCount getMaximizedVFForTarget(unsigned ConstTripCount,
1708 unsigned SmallestType,
1709 unsigned WidestType,
1710 const ElementCount &MaxSafeVF,
1711 bool FoldTailByMasking);
1712
1713 /// \return the maximum legal scalable VF, based on the safe max number
1714 /// of elements.
1715 ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
1716
1717 /// The vectorization cost is a combination of the cost itself and a boolean
1718 /// indicating whether any of the contributing operations will actually
1719 /// operate on vector values after type legalization in the backend. If this
1720 /// latter value is false, then all operations will be scalarized (i.e. no
1721 /// vectorization has actually taken place).
1722 using VectorizationCostTy = std::pair<InstructionCost, bool>;
1723
1724 /// Returns the expected execution cost. The unit of the cost does
1725 /// not matter because we use the 'cost' units to compare different
1726 /// vector widths. The cost that is returned is *not* normalized by
1727 /// the factor width. If \p Invalid is not nullptr, this function
1728 /// will add a pair(Instruction*, ElementCount) to \p Invalid for
1729 /// each instruction that has an Invalid cost for the given VF.
1730 using InstructionVFPair = std::pair<Instruction *, ElementCount>;
1731 VectorizationCostTy
1732 expectedCost(ElementCount VF,
1733 SmallVectorImpl<InstructionVFPair> *Invalid = nullptr);
1734
1735 /// Returns the execution time cost of an instruction for a given vector
1736 /// width.
Vector width of one means scalar.
1737 VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
1738
1739 /// The cost-computation logic from getInstructionCost which provides
1740 /// the vector type as an output parameter.
1741 InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
1742 Type *&VectorTy);
1743
1744 /// Return the cost of instructions in an inloop reduction pattern, if I is
1745 /// part of that pattern.
1746 Optional<InstructionCost>
1747 getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
1748 TTI::TargetCostKind CostKind);
1749
1750 /// Calculate vectorization cost of memory instruction \p I.
1751 InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1752
1753 /// The cost computation for a scalarized memory instruction.
1754 InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1755
1756 /// The cost computation for an interleaving group of memory instructions.
1757 InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1758
1759 /// The cost computation for a Gather/Scatter instruction.
1760 InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1761
1762 /// The cost computation for widening instruction \p I with consecutive
1763 /// memory access.
1764 InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1765
1766 /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1767 /// Load: scalar load + broadcast.
1768 /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1769 /// element)
1770 InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
1771
1772 /// Estimate the overhead of scalarizing an instruction. This is a
1773 /// convenience wrapper for the type-based getScalarizationOverhead API.
1774 InstructionCost getScalarizationOverhead(Instruction *I,
1775 ElementCount VF) const;
1776
1777 /// Returns whether the instruction is a load or store and will be emitted
1778 /// as a vector operation.
1779 bool isConsecutiveLoadOrStore(Instruction *I);
1780
1781 /// Returns true if an artificially high cost for emulated masked memrefs
1782 /// should be used.
1783 bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);
1784
1785 /// Map of scalar integer values to the smallest bitwidth they can be legally
1786 /// represented as. The vector equivalents of these values should be truncated
1787 /// to this type.
1788 MapVector<Instruction *, uint64_t> MinBWs;
1789
1790 /// A type representing the costs for instructions if they were to be
1791 /// scalarized rather than vectorized. The entries are Instruction-Cost
1792 /// pairs.
1793 using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
1794
1795 /// A set containing all BasicBlocks that are known to be present after
1796 /// vectorization as predicated blocks.
1797 SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1798
1799 /// Records whether it is allowed to have the original scalar loop execute at
1800 /// least once. This may be needed as a fallback loop in case runtime
1801 /// aliasing/dependence checks fail, or to handle the tail/remainder
1802 /// iterations when the trip count is unknown or doesn't divide by the VF,
1803 /// or as a peel-loop to handle gaps in interleave-groups.
1804 /// Under optsize and when the trip count is very small we don't allow any
1805 /// iterations to execute in the scalar loop.
1806 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 1807 1808 /// All blocks of loop are to be masked to fold tail of scalar iterations. 1809 bool FoldTailByMasking = false; 1810 1811 /// A map holding scalar costs for different vectorization factors. The 1812 /// presence of a cost for an instruction in the mapping indicates that the 1813 /// instruction will be scalarized when vectorizing with the associated 1814 /// vectorization factor. The entries are VF-ScalarCostTy pairs. 1815 DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize; 1816 1817 /// Holds the instructions known to be uniform after vectorization. 1818 /// The data is collected per VF. 1819 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms; 1820 1821 /// Holds the instructions known to be scalar after vectorization. 1822 /// The data is collected per VF. 1823 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars; 1824 1825 /// Holds the instructions (address computations) that are forced to be 1826 /// scalarized. 1827 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars; 1828 1829 /// PHINodes of the reductions that should be expanded in-loop along with 1830 /// their associated chains of reduction operations, in program order from top 1831 /// (PHI) to bottom 1832 ReductionChainMap InLoopReductionChains; 1833 1834 /// A Map of inloop reduction operations and their immediate chain operand. 1835 /// FIXME: This can be removed once reductions can be costed correctly in 1836 /// vplan. This was added to allow quick lookup to the inloop operations, 1837 /// without having to loop through InLoopReductionChains. 1838 DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains; 1839 1840 /// Returns the expected difference in cost from scalarizing the expression 1841 /// feeding a predicated instruction \p PredInst. The instructions to 1842 /// scalarize and their scalar costs are collected in \p ScalarCosts. A 1843 /// non-negative return value implies the expression will be scalarized. 1844 /// Currently, only single-use chains are considered for scalarization. 1845 int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts, 1846 ElementCount VF); 1847 1848 /// Collect the instructions that are uniform after vectorization. An 1849 /// instruction is uniform if we represent it with a single scalar value in 1850 /// the vectorized loop corresponding to each vector iteration. Examples of 1851 /// uniform instructions include pointer operands of consecutive or 1852 /// interleaved memory accesses. Note that although uniformity implies an 1853 /// instruction will be scalar, the reverse is not true. In general, a 1854 /// scalarized instruction will be represented by VF scalar values in the 1855 /// vectorized loop, each corresponding to an iteration of the original 1856 /// scalar loop. 1857 void collectLoopUniforms(ElementCount VF); 1858 1859 /// Collect the instructions that are scalar after vectorization. An 1860 /// instruction is scalar if it is known to be uniform or will be scalarized 1861 /// during vectorization. collectLoopScalars should only add non-uniform nodes 1862 /// to the list if they are used by a load/store instruction that is marked as 1863 /// CM_Scalarize. Non-uniform scalarized instructions will be represented by 1864 /// VF values in the vectorized loop, each corresponding to an iteration of 1865 /// the original scalar loop. 
1866 void collectLoopScalars(ElementCount VF); 1867 1868 /// Keeps cost model vectorization decision and cost for instructions. 1869 /// Right now it is used for memory instructions only. 1870 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>, 1871 std::pair<InstWidening, InstructionCost>>; 1872 1873 DecisionList WideningDecisions; 1874 1875 /// Returns true if \p V is expected to be vectorized and it needs to be 1876 /// extracted. 1877 bool needsExtract(Value *V, ElementCount VF) const { 1878 Instruction *I = dyn_cast<Instruction>(V); 1879 if (VF.isScalar() || !I || !TheLoop->contains(I) || 1880 TheLoop->isLoopInvariant(I)) 1881 return false; 1882 1883 // Assume we can vectorize V (and hence we need extraction) if the 1884 // scalars are not computed yet. This can happen, because it is called 1885 // via getScalarizationOverhead from setCostBasedWideningDecision, before 1886 // the scalars are collected. That should be a safe assumption in most 1887 // cases, because we check if the operands have vectorizable types 1888 // beforehand in LoopVectorizationLegality. 1889 return Scalars.find(VF) == Scalars.end() || 1890 !isScalarAfterVectorization(I, VF); 1891 }; 1892 1893 /// Returns a range containing only operands needing to be extracted. 1894 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, 1895 ElementCount VF) const { 1896 return SmallVector<Value *, 4>(make_filter_range( 1897 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); 1898 } 1899 1900 /// Determines if we have the infrastructure to vectorize loop \p L and its 1901 /// epilogue, assuming the main loop is vectorized by \p VF. 1902 bool isCandidateForEpilogueVectorization(const Loop &L, 1903 const ElementCount VF) const; 1904 1905 /// Returns true if epilogue vectorization is considered profitable, and 1906 /// false otherwise. 1907 /// \p VF is the vectorization factor chosen for the original loop. 1908 bool isEpilogueVectorizationProfitable(const ElementCount VF) const; 1909 1910 public: 1911 /// The loop that we evaluate. 1912 Loop *TheLoop; 1913 1914 /// Predicated scalar evolution analysis. 1915 PredicatedScalarEvolution &PSE; 1916 1917 /// Loop Info analysis. 1918 LoopInfo *LI; 1919 1920 /// Vectorization legality. 1921 LoopVectorizationLegality *Legal; 1922 1923 /// Vector target information. 1924 const TargetTransformInfo &TTI; 1925 1926 /// Target Library Info. 1927 const TargetLibraryInfo *TLI; 1928 1929 /// Demanded bits analysis. 1930 DemandedBits *DB; 1931 1932 /// Assumption cache. 1933 AssumptionCache *AC; 1934 1935 /// Interface to emit optimization remarks. 1936 OptimizationRemarkEmitter *ORE; 1937 1938 const Function *TheFunction; 1939 1940 /// Loop Vectorize Hint. 1941 const LoopVectorizeHints *Hints; 1942 1943 /// The interleave access information contains groups of interleaved accesses 1944 /// with the same stride and close to each other. 1945 InterleavedAccessInfo &InterleaveInfo; 1946 1947 /// Values to ignore in the cost model. 1948 SmallPtrSet<const Value *, 16> ValuesToIgnore; 1949 1950 /// Values to ignore in the cost model when VF > 1. 1951 SmallPtrSet<const Value *, 16> VecValuesToIgnore; 1952 1953 /// All element types found in the loop. 1954 SmallPtrSet<Type *, 16> ElementTypesInLoop; 1955 1956 /// Profitable vector factors. 1957 SmallVector<VectorizationFactor, 8> ProfitableVFs; 1958 }; 1959 } // end namespace llvm 1960 1961 /// Helper struct to manage generating runtime checks for vectorization. 
1962 ///
1963 /// The runtime checks are created up-front in temporary blocks to allow better
1964 /// estimation of their cost, and are un-linked from the existing IR. After
1965 /// deciding to vectorize, the checks are moved back. If deciding not to
1966 /// vectorize, the temporary blocks are completely removed.
1967 class GeneratedRTChecks {
1968 /// Basic block which contains the generated SCEV checks, if any.
1969 BasicBlock *SCEVCheckBlock = nullptr;
1970
1971 /// The value representing the result of the generated SCEV checks. If it is
1972 /// nullptr, either no SCEV checks have been generated or they have been used.
1973 Value *SCEVCheckCond = nullptr;
1974
1975 /// Basic block which contains the generated memory runtime checks, if any.
1976 BasicBlock *MemCheckBlock = nullptr;
1977
1978 /// The value representing the result of the generated memory runtime checks.
1979 /// If it is nullptr, either no memory runtime checks have been generated or
1980 /// they have been used.
1981 Value *MemRuntimeCheckCond = nullptr;
1982
1983 DominatorTree *DT;
1984 LoopInfo *LI;
1985
1986 SCEVExpander SCEVExp;
1987 SCEVExpander MemCheckExp;
1988
1989 public:
1990 GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
1991 const DataLayout &DL)
1992 : DT(DT), LI(LI), SCEVExp(SE, DL, "scev.check"),
1993 MemCheckExp(SE, DL, "scev.check") {}
1994
1995 /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
1996 /// accurately estimate the cost of the runtime checks. The blocks are
1997 /// un-linked from the IR and are added back during vector code generation. If
1998 /// there is no vector code generation, the check blocks are removed
1999 /// completely.
2000 void Create(Loop *L, const LoopAccessInfo &LAI,
2001 const SCEVPredicate &Pred) {
2002
2003 BasicBlock *LoopHeader = L->getHeader();
2004 BasicBlock *Preheader = L->getLoopPreheader();
2005
2006 // Use SplitBlock to create blocks for SCEV & memory runtime checks to
2007 // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
2008 // may be used by SCEVExpander. The blocks will be un-linked from their
2009 // predecessors and removed from LI & DT at the end of the function.
2010 if (!Pred.isAlwaysTrue()) {
2011 SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
2012 nullptr, "vector.scevcheck");
2013
2014 SCEVCheckCond = SCEVExp.expandCodeForPredicate(
2015 &Pred, SCEVCheckBlock->getTerminator());
2016 }
2017
2018 const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
2019 if (RtPtrChecking.Need) {
2020 auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
2021 MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
2022 "vector.memcheck");
2023
2024 MemRuntimeCheckCond =
2025 addRuntimeChecks(MemCheckBlock->getTerminator(), L,
2026 RtPtrChecking.getChecks(), MemCheckExp);
2027 assert(MemRuntimeCheckCond &&
2028 "no RT checks generated although RtPtrChecking "
2029 "claimed checks are required");
2030 }
2031
2032 if (!MemCheckBlock && !SCEVCheckBlock)
2033 return;
2034
2035 // Unhook the temporary blocks with the checks and update various places
2036 // accordingly.
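// Each check block's branch is moved into the preheader (replacing the
// preheader's old terminator), the check block itself is capped with an
// 'unreachable', and the block is detached from the DominatorTree and LoopInfo
// until emitSCEVChecks / emitMemRuntimeChecks hook it back in (or the
// destructor deletes it if unused).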
2037 if (SCEVCheckBlock) 2038 SCEVCheckBlock->replaceAllUsesWith(Preheader); 2039 if (MemCheckBlock) 2040 MemCheckBlock->replaceAllUsesWith(Preheader); 2041 2042 if (SCEVCheckBlock) { 2043 SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 2044 new UnreachableInst(Preheader->getContext(), SCEVCheckBlock); 2045 Preheader->getTerminator()->eraseFromParent(); 2046 } 2047 if (MemCheckBlock) { 2048 MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 2049 new UnreachableInst(Preheader->getContext(), MemCheckBlock); 2050 Preheader->getTerminator()->eraseFromParent(); 2051 } 2052 2053 DT->changeImmediateDominator(LoopHeader, Preheader); 2054 if (MemCheckBlock) { 2055 DT->eraseNode(MemCheckBlock); 2056 LI->removeBlock(MemCheckBlock); 2057 } 2058 if (SCEVCheckBlock) { 2059 DT->eraseNode(SCEVCheckBlock); 2060 LI->removeBlock(SCEVCheckBlock); 2061 } 2062 } 2063 2064 /// Remove the created SCEV & memory runtime check blocks & instructions, if 2065 /// unused. 2066 ~GeneratedRTChecks() { 2067 SCEVExpanderCleaner SCEVCleaner(SCEVExp); 2068 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp); 2069 if (!SCEVCheckCond) 2070 SCEVCleaner.markResultUsed(); 2071 2072 if (!MemRuntimeCheckCond) 2073 MemCheckCleaner.markResultUsed(); 2074 2075 if (MemRuntimeCheckCond) { 2076 auto &SE = *MemCheckExp.getSE(); 2077 // Memory runtime check generation creates compares that use expanded 2078 // values. Remove them before running the SCEVExpanderCleaners. 2079 for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) { 2080 if (MemCheckExp.isInsertedInstruction(&I)) 2081 continue; 2082 SE.forgetValue(&I); 2083 I.eraseFromParent(); 2084 } 2085 } 2086 MemCheckCleaner.cleanup(); 2087 SCEVCleaner.cleanup(); 2088 2089 if (SCEVCheckCond) 2090 SCEVCheckBlock->eraseFromParent(); 2091 if (MemRuntimeCheckCond) 2092 MemCheckBlock->eraseFromParent(); 2093 } 2094 2095 /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and 2096 /// adjusts the branches to branch to the vector preheader or \p Bypass, 2097 /// depending on the generated condition. 2098 BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass, 2099 BasicBlock *LoopVectorPreHeader, 2100 BasicBlock *LoopExitBlock) { 2101 if (!SCEVCheckCond) 2102 return nullptr; 2103 if (auto *C = dyn_cast<ConstantInt>(SCEVCheckCond)) 2104 if (C->isZero()) 2105 return nullptr; 2106 2107 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2108 2109 BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock); 2110 // Create new preheader for vector loop. 2111 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 2112 PL->addBasicBlockToLoop(SCEVCheckBlock, *LI); 2113 2114 SCEVCheckBlock->getTerminator()->eraseFromParent(); 2115 SCEVCheckBlock->moveBefore(LoopVectorPreHeader); 2116 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2117 SCEVCheckBlock); 2118 2119 DT->addNewBlock(SCEVCheckBlock, Pred); 2120 DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock); 2121 2122 ReplaceInstWithInst( 2123 SCEVCheckBlock->getTerminator(), 2124 BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheckCond)); 2125 // Mark the check as used, to prevent it from being removed during cleanup. 2126 SCEVCheckCond = nullptr; 2127 return SCEVCheckBlock; 2128 } 2129 2130 /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts 2131 /// the branches to branch to the vector preheader or \p Bypass, depending on 2132 /// the generated condition. 
2133 BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass, 2134 BasicBlock *LoopVectorPreHeader) { 2135 // Check if we generated code that checks in runtime if arrays overlap. 2136 if (!MemRuntimeCheckCond) 2137 return nullptr; 2138 2139 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2140 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2141 MemCheckBlock); 2142 2143 DT->addNewBlock(MemCheckBlock, Pred); 2144 DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock); 2145 MemCheckBlock->moveBefore(LoopVectorPreHeader); 2146 2147 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 2148 PL->addBasicBlockToLoop(MemCheckBlock, *LI); 2149 2150 ReplaceInstWithInst( 2151 MemCheckBlock->getTerminator(), 2152 BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond)); 2153 MemCheckBlock->getTerminator()->setDebugLoc( 2154 Pred->getTerminator()->getDebugLoc()); 2155 2156 // Mark the check as used, to prevent it from being removed during cleanup. 2157 MemRuntimeCheckCond = nullptr; 2158 return MemCheckBlock; 2159 } 2160 }; 2161 2162 // Return true if \p OuterLp is an outer loop annotated with hints for explicit 2163 // vectorization. The loop needs to be annotated with #pragma omp simd 2164 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the 2165 // vector length information is not provided, vectorization is not considered 2166 // explicit. Interleave hints are not allowed either. These limitations will be 2167 // relaxed in the future. 2168 // Please, note that we are currently forced to abuse the pragma 'clang 2169 // vectorize' semantics. This pragma provides *auto-vectorization hints* 2170 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' 2171 // provides *explicit vectorization hints* (LV can bypass legal checks and 2172 // assume that vectorization is legal). However, both hints are implemented 2173 // using the same metadata (llvm.loop.vectorize, processed by 2174 // LoopVectorizeHints). This will be fixed in the future when the native IR 2175 // representation for pragma 'omp simd' is introduced. 2176 static bool isExplicitVecOuterLoop(Loop *OuterLp, 2177 OptimizationRemarkEmitter *ORE) { 2178 assert(!OuterLp->isInnermost() && "This is not an outer loop"); 2179 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); 2180 2181 // Only outer loops with an explicit vectorization hint are supported. 2182 // Unannotated outer loops are ignored. 2183 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) 2184 return false; 2185 2186 Function *Fn = OuterLp->getHeader()->getParent(); 2187 if (!Hints.allowVectorization(Fn, OuterLp, 2188 true /*VectorizeOnlyWhenForced*/)) { 2189 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); 2190 return false; 2191 } 2192 2193 if (Hints.getInterleave() > 1) { 2194 // TODO: Interleave support is future work. 2195 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " 2196 "outer loops.\n"); 2197 Hints.emitRemarkWithHints(); 2198 return false; 2199 } 2200 2201 return true; 2202 } 2203 2204 static void collectSupportedLoops(Loop &L, LoopInfo *LI, 2205 OptimizationRemarkEmitter *ORE, 2206 SmallVectorImpl<Loop *> &V) { 2207 // Collect inner loops and outer loops without irreducible control flow. For 2208 // now, only collect outer loops that have explicit vectorization hints. If we 2209 // are stress testing the VPlan H-CFG construction, we collect the outermost 2210 // loop of every loop nest. 
2211 if (L.isInnermost() || VPlanBuildStressTest || 2212 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { 2213 LoopBlocksRPO RPOT(&L); 2214 RPOT.perform(LI); 2215 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { 2216 V.push_back(&L); 2217 // TODO: Collect inner loops inside marked outer loops in case 2218 // vectorization fails for the outer loop. Do not invoke 2219 // 'containsIrreducibleCFG' again for inner loops when the outer loop is 2220 // already known to be reducible. We can use an inherited attribute for 2221 // that. 2222 return; 2223 } 2224 } 2225 for (Loop *InnerL : L) 2226 collectSupportedLoops(*InnerL, LI, ORE, V); 2227 } 2228 2229 namespace { 2230 2231 /// The LoopVectorize Pass. 2232 struct LoopVectorize : public FunctionPass { 2233 /// Pass identification, replacement for typeid 2234 static char ID; 2235 2236 LoopVectorizePass Impl; 2237 2238 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false, 2239 bool VectorizeOnlyWhenForced = false) 2240 : FunctionPass(ID), 2241 Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) { 2242 initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); 2243 } 2244 2245 bool runOnFunction(Function &F) override { 2246 if (skipFunction(F)) 2247 return false; 2248 2249 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 2250 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 2251 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 2252 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 2253 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); 2254 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); 2255 auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr; 2256 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); 2257 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 2258 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); 2259 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); 2260 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 2261 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 2262 2263 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 2264 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; 2265 2266 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC, 2267 GetLAA, *ORE, PSI).MadeAnyChange; 2268 } 2269 2270 void getAnalysisUsage(AnalysisUsage &AU) const override { 2271 AU.addRequired<AssumptionCacheTracker>(); 2272 AU.addRequired<BlockFrequencyInfoWrapperPass>(); 2273 AU.addRequired<DominatorTreeWrapperPass>(); 2274 AU.addRequired<LoopInfoWrapperPass>(); 2275 AU.addRequired<ScalarEvolutionWrapperPass>(); 2276 AU.addRequired<TargetTransformInfoWrapperPass>(); 2277 AU.addRequired<AAResultsWrapperPass>(); 2278 AU.addRequired<LoopAccessLegacyAnalysis>(); 2279 AU.addRequired<DemandedBitsWrapperPass>(); 2280 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 2281 AU.addRequired<InjectTLIMappingsLegacy>(); 2282 2283 // We currently do not preserve loopinfo/dominator analyses with outer loop 2284 // vectorization. Until this is addressed, mark these analyses as preserved 2285 // only for non-VPlan-native path. 2286 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 
2287 if (!EnableVPlanNativePath) {
2288 AU.addPreserved<LoopInfoWrapperPass>();
2289 AU.addPreserved<DominatorTreeWrapperPass>();
2290 }
2291
2292 AU.addPreserved<BasicAAWrapperPass>();
2293 AU.addPreserved<GlobalsAAWrapperPass>();
2294 AU.addRequired<ProfileSummaryInfoWrapperPass>();
2295 }
2296 };
2297
2298 } // end anonymous namespace
2299
2300 //===----------------------------------------------------------------------===//
2301 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer,
2302 // LoopVectorizationCostModel and LoopVectorizationPlanner.
2303 //===----------------------------------------------------------------------===//
2304
2305 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
2306 // We need to place the broadcast of invariant variables outside the loop,
2307 // but only if it's proven safe to do so. Otherwise, the broadcast will be
2308 // inside the vector loop body.
2309 Instruction *Instr = dyn_cast<Instruction>(V);
2310 bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
2311 (!Instr ||
2312 DT->dominates(Instr->getParent(), LoopVectorPreHeader));
2313 // Place the code for broadcasting invariant variables in the new preheader.
2314 IRBuilder<>::InsertPointGuard Guard(Builder);
2315 if (SafeToHoist)
2316 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
2317
2318 // Broadcast the scalar into all locations in the vector.
2319 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
2320
2321 return Shuf;
2322 }
2323
2324 /// This function adds
2325 /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
2326 /// to each vector element of Val. The sequence starts at StartIdx.
2327 /// \p BinOp is relevant for FP induction variables.
2328 static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step,
2329 Instruction::BinaryOps BinOp, ElementCount VF,
2330 IRBuilderBase &Builder) {
2331 assert(VF.isVector() && "only vector VFs are supported");
2332
2333 // Create and check the types.
2334 auto *ValVTy = cast<VectorType>(Val->getType());
2335 ElementCount VLen = ValVTy->getElementCount();
2336
2337 Type *STy = Val->getType()->getScalarType();
2338 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
2339 "Induction Step must be an integer or FP");
2340 assert(Step->getType() == STy && "Step has wrong type");
2341
2342 SmallVector<Constant *, 8> Indices;
2343
2344 // Create a vector of consecutive numbers from zero to VF.
2345 VectorType *InitVecValVTy = ValVTy;
2346 if (STy->isFloatingPointTy()) {
2347 Type *InitVecValSTy =
2348 IntegerType::get(STy->getContext(), STy->getScalarSizeInBits());
2349 InitVecValVTy = VectorType::get(InitVecValSTy, VLen);
2350 }
2351 Value *InitVec = Builder.CreateStepVector(InitVecValVTy);
2352
2353 // Splat the StartIdx.
2354 Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx);
2355
2356 if (STy->isIntegerTy()) {
2357 InitVec = Builder.CreateAdd(InitVec, StartIdxSplat);
2358 Step = Builder.CreateVectorSplat(VLen, Step);
2359 assert(Step->getType() == Val->getType() && "Invalid step vec");
2360 // FIXME: The newly created binary instructions should contain nsw/nuw
2361 // flags, which can be found from the original scalar operations.
2362 Step = Builder.CreateMul(InitVec, Step);
2363 return Builder.CreateAdd(Val, Step, "induction");
2364 }
2365
2366 // Floating point induction.
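// For illustration only (hypothetical values): with VF = 4, StartIdx = 0.0,
// Step = 0.5 and BinOp = FAdd, the code below computes
//   <0.0, 1.0, 2.0, 3.0> fmul 0.5 = <0.0, 0.5, 1.0, 1.5>
// and returns Val fadd'ed with that step vector.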
2367 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && 2368 "Binary Opcode should be specified for FP induction"); 2369 InitVec = Builder.CreateUIToFP(InitVec, ValVTy); 2370 InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat); 2371 2372 Step = Builder.CreateVectorSplat(VLen, Step); 2373 Value *MulOp = Builder.CreateFMul(InitVec, Step); 2374 return Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); 2375 } 2376 2377 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( 2378 const InductionDescriptor &II, Value *Step, Value *Start, 2379 Instruction *EntryVal, VPValue *Def, VPTransformState &State) { 2380 IRBuilderBase &Builder = State.Builder; 2381 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 2382 "Expected either an induction phi-node or a truncate of it!"); 2383 2384 // Construct the initial value of the vector IV in the vector loop preheader 2385 auto CurrIP = Builder.saveIP(); 2386 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2387 if (isa<TruncInst>(EntryVal)) { 2388 assert(Start->getType()->isIntegerTy() && 2389 "Truncation requires an integer type"); 2390 auto *TruncType = cast<IntegerType>(EntryVal->getType()); 2391 Step = Builder.CreateTrunc(Step, TruncType); 2392 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); 2393 } 2394 2395 Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0); 2396 Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start); 2397 Value *SteppedStart = getStepVector( 2398 SplatStart, Zero, Step, II.getInductionOpcode(), State.VF, State.Builder); 2399 2400 // We create vector phi nodes for both integer and floating-point induction 2401 // variables. Here, we determine the kind of arithmetic we will perform. 2402 Instruction::BinaryOps AddOp; 2403 Instruction::BinaryOps MulOp; 2404 if (Step->getType()->isIntegerTy()) { 2405 AddOp = Instruction::Add; 2406 MulOp = Instruction::Mul; 2407 } else { 2408 AddOp = II.getInductionOpcode(); 2409 MulOp = Instruction::FMul; 2410 } 2411 2412 // Multiply the vectorization factor by the step using integer or 2413 // floating-point arithmetic as appropriate. 2414 Type *StepType = Step->getType(); 2415 Value *RuntimeVF; 2416 if (Step->getType()->isFloatingPointTy()) 2417 RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF); 2418 else 2419 RuntimeVF = getRuntimeVF(Builder, StepType, State.VF); 2420 Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF); 2421 2422 // Create a vector splat to use in the induction update. 2423 // 2424 // FIXME: If the step is non-constant, we create the vector splat with 2425 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't 2426 // handle a constant vector splat. 2427 Value *SplatVF = isa<Constant>(Mul) 2428 ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul)) 2429 : Builder.CreateVectorSplat(State.VF, Mul); 2430 Builder.restoreIP(CurrIP); 2431 2432 // We may need to add the step a number of times, depending on the unroll 2433 // factor. The last of those goes into the PHI. 
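// For illustration only: with UF = 2 the loop below hands out
//   part 0: %vec.ind
//   part 1: %vec.ind + SplatVF          ("step.add")
// and the value left in LastInduction, %vec.ind + 2 * SplatVF, later becomes
// the back-edge value of the phi.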
2434 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", 2435 &*LoopVectorBody->getFirstInsertionPt()); 2436 VecInd->setDebugLoc(EntryVal->getDebugLoc()); 2437 Instruction *LastInduction = VecInd; 2438 for (unsigned Part = 0; Part < UF; ++Part) { 2439 State.set(Def, LastInduction, Part); 2440 2441 if (isa<TruncInst>(EntryVal)) 2442 addMetadata(LastInduction, EntryVal); 2443 2444 LastInduction = cast<Instruction>( 2445 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")); 2446 LastInduction->setDebugLoc(EntryVal->getDebugLoc()); 2447 } 2448 2449 // Move the last step to the end of the latch block. This ensures consistent 2450 // placement of all induction updates. 2451 auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 2452 auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator()); 2453 LastInduction->moveBefore(Br); 2454 LastInduction->setName("vec.ind.next"); 2455 2456 VecInd->addIncoming(SteppedStart, LoopVectorPreHeader); 2457 VecInd->addIncoming(LastInduction, LoopVectorLatch); 2458 } 2459 2460 /// Compute scalar induction steps. \p ScalarIV is the scalar induction 2461 /// variable on which to base the steps, \p Step is the size of the step, and 2462 /// \p EntryVal is the value from the original loop that maps to the steps. 2463 /// Note that \p EntryVal doesn't have to be an induction variable - it 2464 /// can also be a truncate instruction. 2465 static void buildScalarSteps(Value *ScalarIV, Value *Step, 2466 Instruction *EntryVal, 2467 const InductionDescriptor &ID, VPValue *Def, 2468 VPTransformState &State) { 2469 IRBuilderBase &Builder = State.Builder; 2470 // We shouldn't have to build scalar steps if we aren't vectorizing. 2471 assert(State.VF.isVector() && "VF should be greater than one"); 2472 // Get the value type and ensure it and the step have the same integer type. 2473 Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); 2474 assert(ScalarIVTy == Step->getType() && 2475 "Val and Step should have the same type"); 2476 2477 // We build scalar steps for both integer and floating-point induction 2478 // variables. Here, we determine the kind of arithmetic we will perform. 2479 Instruction::BinaryOps AddOp; 2480 Instruction::BinaryOps MulOp; 2481 if (ScalarIVTy->isIntegerTy()) { 2482 AddOp = Instruction::Add; 2483 MulOp = Instruction::Mul; 2484 } else { 2485 AddOp = ID.getInductionOpcode(); 2486 MulOp = Instruction::FMul; 2487 } 2488 2489 // Determine the number of scalars we need to generate for each unroll 2490 // iteration. 2491 bool FirstLaneOnly = vputils::onlyFirstLaneUsed(Def); 2492 unsigned Lanes = FirstLaneOnly ? 1 : State.VF.getKnownMinValue(); 2493 // Compute the scalar steps and save the results in State. 
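// For illustration only (hypothetical values): with a fixed VF = 4, UF = 1 and
// Step = 1, the code below records the lane values
//   ScalarIV + 0, ScalarIV + 1, ScalarIV + 2, ScalarIV + 3
// in State.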
2494 Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(), 2495 ScalarIVTy->getScalarSizeInBits()); 2496 Type *VecIVTy = nullptr; 2497 Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr; 2498 if (!FirstLaneOnly && State.VF.isScalable()) { 2499 VecIVTy = VectorType::get(ScalarIVTy, State.VF); 2500 UnitStepVec = 2501 Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF)); 2502 SplatStep = Builder.CreateVectorSplat(State.VF, Step); 2503 SplatIV = Builder.CreateVectorSplat(State.VF, ScalarIV); 2504 } 2505 2506 for (unsigned Part = 0; Part < State.UF; ++Part) { 2507 Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part); 2508 2509 if (!FirstLaneOnly && State.VF.isScalable()) { 2510 auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0); 2511 auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec); 2512 if (ScalarIVTy->isFloatingPointTy()) 2513 InitVec = Builder.CreateSIToFP(InitVec, VecIVTy); 2514 auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep); 2515 auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul); 2516 State.set(Def, Add, Part); 2517 // It's useful to record the lane values too for the known minimum number 2518 // of elements so we do those below. This improves the code quality when 2519 // trying to extract the first element, for example. 2520 } 2521 2522 if (ScalarIVTy->isFloatingPointTy()) 2523 StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy); 2524 2525 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 2526 Value *StartIdx = Builder.CreateBinOp( 2527 AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane)); 2528 // The step returned by `createStepForVF` is a runtime-evaluated value 2529 // when VF is scalable. Otherwise, it should be folded into a Constant. 2530 assert((State.VF.isScalable() || isa<Constant>(StartIdx)) && 2531 "Expected StartIdx to be folded to a constant when VF is not " 2532 "scalable"); 2533 auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step); 2534 auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul); 2535 State.set(Def, Add, VPIteration(Part, Lane)); 2536 } 2537 } 2538 } 2539 2540 // Generate code for the induction step. Note that induction steps are 2541 // required to be loop-invariant 2542 static Value *CreateStepValue(const SCEV *Step, ScalarEvolution &SE, 2543 Instruction *InsertBefore, 2544 Loop *OrigLoop = nullptr) { 2545 const DataLayout &DL = SE.getDataLayout(); 2546 assert((!OrigLoop || SE.isLoopInvariant(Step, OrigLoop)) && 2547 "Induction step should be loop invariant"); 2548 if (auto *E = dyn_cast<SCEVUnknown>(Step)) 2549 return E->getValue(); 2550 2551 SCEVExpander Exp(SE, DL, "induction"); 2552 return Exp.expandCodeFor(Step, Step->getType(), InsertBefore); 2553 } 2554 2555 /// Compute the transformed value of Index at offset StartValue using step 2556 /// StepValue. 2557 /// For integer induction, returns StartValue + Index * StepValue. 2558 /// For pointer induction, returns StartValue[Index * StepValue]. 2559 /// FIXME: The newly created binary instructions should contain nsw/nuw 2560 /// flags, which can be found from the original scalar operations. 2561 static Value *emitTransformedIndex(IRBuilderBase &B, Value *Index, Value *Step, 2562 const InductionDescriptor &ID) { 2563 2564 auto StartValue = ID.getStartValue(); 2565 assert(Index->getType()->getScalarType() == Step->getType() && 2566 "Index scalar type does not match StepValue type"); 2567 2568 // Note: the IR at this point is broken. 
We cannot use SE to create any new
2569 // SCEV and then expand it, hoping that SCEV's simplification will give us
2570 // more optimal code. Unfortunately, attempting to do so on invalid IR may
2571 // lead to various SCEV crashes. So all we can do is use the builder and rely
2572 // on InstCombine for future simplifications. Here we handle some trivial
2573 // cases only.
2574 auto CreateAdd = [&B](Value *X, Value *Y) {
2575 assert(X->getType() == Y->getType() && "Types don't match!");
2576 if (auto *CX = dyn_cast<ConstantInt>(X))
2577 if (CX->isZero())
2578 return Y;
2579 if (auto *CY = dyn_cast<ConstantInt>(Y))
2580 if (CY->isZero())
2581 return X;
2582 return B.CreateAdd(X, Y);
2583 };
2584
2585 // We allow X to be a vector type, in which case Y will potentially be
2586 // splatted into a vector with the same element count.
2587 auto CreateMul = [&B](Value *X, Value *Y) {
2588 assert(X->getType()->getScalarType() == Y->getType() &&
2589 "Types don't match!");
2590 if (auto *CX = dyn_cast<ConstantInt>(X))
2591 if (CX->isOne())
2592 return Y;
2593 if (auto *CY = dyn_cast<ConstantInt>(Y))
2594 if (CY->isOne())
2595 return X;
2596 VectorType *XVTy = dyn_cast<VectorType>(X->getType());
2597 if (XVTy && !isa<VectorType>(Y->getType()))
2598 Y = B.CreateVectorSplat(XVTy->getElementCount(), Y);
2599 return B.CreateMul(X, Y);
2600 };
2601
2602 switch (ID.getKind()) {
2603 case InductionDescriptor::IK_IntInduction: {
2604 assert(!isa<VectorType>(Index->getType()) &&
2605 "Vector indices not supported for integer inductions yet");
2606 assert(Index->getType() == StartValue->getType() &&
2607 "Index type does not match StartValue type");
2608 if (isa<ConstantInt>(Step) && cast<ConstantInt>(Step)->isMinusOne())
2609 return B.CreateSub(StartValue, Index);
2610 auto *Offset = CreateMul(Index, Step);
2611 return CreateAdd(StartValue, Offset);
2612 }
2613 case InductionDescriptor::IK_PtrInduction: {
2614 assert(isa<Constant>(Step) &&
2615 "Expected constant step for pointer induction");
2616 return B.CreateGEP(ID.getElementType(), StartValue, CreateMul(Index, Step));
2617 }
2618 case InductionDescriptor::IK_FpInduction: {
2619 assert(!isa<VectorType>(Index->getType()) &&
2620 "Vector indices not supported for FP inductions yet");
2621 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2622 auto InductionBinOp = ID.getInductionBinOp();
2623 assert(InductionBinOp &&
2624 (InductionBinOp->getOpcode() == Instruction::FAdd ||
2625 InductionBinOp->getOpcode() == Instruction::FSub) &&
2626 "Original bin op should be defined for FP induction");
2627
2628 Value *MulExp = B.CreateFMul(Step, Index);
2629 return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2630 "induction");
2631 }
2632 case InductionDescriptor::IK_NoInduction:
2633 return nullptr;
2634 }
2635 llvm_unreachable("invalid enum");
2636 }
2637
2638 void InnerLoopVectorizer::widenIntOrFpInduction(
2639 PHINode *IV, VPWidenIntOrFpInductionRecipe *Def, VPTransformState &State,
2640 Value *CanonicalIV) {
2641 Value *Start = Def->getStartValue()->getLiveInIRValue();
2642 const InductionDescriptor &ID = Def->getInductionDescriptor();
2643 TruncInst *Trunc = Def->getTruncInst();
2644 IRBuilderBase &Builder = State.Builder;
2645 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
2646 assert(!State.VF.isZero() && "VF must be non-zero");
2647
2648 // The value from the original loop to which we are mapping the new induction
2649 // variable.
2650 Instruction *EntryVal = Trunc ?
cast<Instruction>(Trunc) : IV; 2651 2652 auto &DL = EntryVal->getModule()->getDataLayout(); 2653 2654 // Generate code for the induction step. Note that induction steps are 2655 // required to be loop-invariant 2656 auto CreateStepValue = [&](const SCEV *Step) -> Value * { 2657 assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) && 2658 "Induction step should be loop invariant"); 2659 if (PSE.getSE()->isSCEVable(IV->getType())) { 2660 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 2661 return Exp.expandCodeFor(Step, Step->getType(), 2662 State.CFG.VectorPreHeader->getTerminator()); 2663 } 2664 return cast<SCEVUnknown>(Step)->getValue(); 2665 }; 2666 2667 // The scalar value to broadcast. This is derived from the canonical 2668 // induction variable. If a truncation type is given, truncate the canonical 2669 // induction variable and step. Otherwise, derive these values from the 2670 // induction descriptor. 2671 auto CreateScalarIV = [&](Value *&Step) -> Value * { 2672 Value *ScalarIV = CanonicalIV; 2673 Type *NeededType = IV->getType(); 2674 if (!Def->isCanonical() || ScalarIV->getType() != NeededType) { 2675 ScalarIV = 2676 NeededType->isIntegerTy() 2677 ? Builder.CreateSExtOrTrunc(ScalarIV, NeededType) 2678 : Builder.CreateCast(Instruction::SIToFP, ScalarIV, NeededType); 2679 ScalarIV = emitTransformedIndex(Builder, ScalarIV, Step, ID); 2680 ScalarIV->setName("offset.idx"); 2681 } 2682 if (Trunc) { 2683 auto *TruncType = cast<IntegerType>(Trunc->getType()); 2684 assert(Step->getType()->isIntegerTy() && 2685 "Truncation requires an integer step"); 2686 ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType); 2687 Step = Builder.CreateTrunc(Step, TruncType); 2688 } 2689 return ScalarIV; 2690 }; 2691 2692 // Fast-math-flags propagate from the original induction instruction. 2693 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 2694 if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp())) 2695 Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags()); 2696 2697 // Now do the actual transformations, and start with creating the step value. 2698 Value *Step = CreateStepValue(ID.getStep()); 2699 if (State.VF.isScalar()) { 2700 Value *ScalarIV = CreateScalarIV(Step); 2701 Type *ScalarTy = IntegerType::get(ScalarIV->getContext(), 2702 Step->getType()->getScalarSizeInBits()); 2703 2704 Instruction::BinaryOps IncOp = ID.getInductionOpcode(); 2705 if (IncOp == Instruction::BinaryOpsEnd) 2706 IncOp = Instruction::Add; 2707 for (unsigned Part = 0; Part < UF; ++Part) { 2708 Value *StartIdx = ConstantInt::get(ScalarTy, Part); 2709 Instruction::BinaryOps MulOp = Instruction::Mul; 2710 if (Step->getType()->isFloatingPointTy()) { 2711 StartIdx = Builder.CreateUIToFP(StartIdx, Step->getType()); 2712 MulOp = Instruction::FMul; 2713 } 2714 2715 Value *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step); 2716 Value *EntryPart = Builder.CreateBinOp(IncOp, ScalarIV, Mul, "induction"); 2717 State.set(Def, EntryPart, Part); 2718 if (Trunc) { 2719 assert(!Step->getType()->isFloatingPointTy() && 2720 "fp inductions shouldn't be truncated"); 2721 addMetadata(EntryPart, Trunc); 2722 } 2723 } 2724 return; 2725 } 2726 2727 // Create a new independent vector induction variable, if one is needed. 2728 if (Def->needsVectorIV()) 2729 createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, State); 2730 2731 if (Def->needsScalarIV()) { 2732 // Create scalar steps that can be used by instructions we will later 2733 // scalarize. 
Note that the addition of the scalar steps will not increase 2734 // the number of instructions in the loop in the common case prior to 2735 // InstCombine. We will be trading one vector extract for each scalar step. 2736 Value *ScalarIV = CreateScalarIV(Step); 2737 buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, State); 2738 } 2739 } 2740 2741 void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def, 2742 const VPIteration &Instance, 2743 VPTransformState &State) { 2744 Value *ScalarInst = State.get(Def, Instance); 2745 Value *VectorValue = State.get(Def, Instance.Part); 2746 VectorValue = Builder.CreateInsertElement( 2747 VectorValue, ScalarInst, 2748 Instance.Lane.getAsRuntimeExpr(State.Builder, VF)); 2749 State.set(Def, VectorValue, Instance.Part); 2750 } 2751 2752 // Return whether we allow using masked interleave-groups (for dealing with 2753 // strided loads/stores that reside in predicated blocks, or for dealing 2754 // with gaps). 2755 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { 2756 // If an override option has been passed in for interleaved accesses, use it. 2757 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) 2758 return EnableMaskedInterleavedMemAccesses; 2759 2760 return TTI.enableMaskedInterleavedAccessVectorization(); 2761 } 2762 2763 // Try to vectorize the interleave group that \p Instr belongs to. 2764 // 2765 // E.g. Translate following interleaved load group (factor = 3): 2766 // for (i = 0; i < N; i+=3) { 2767 // R = Pic[i]; // Member of index 0 2768 // G = Pic[i+1]; // Member of index 1 2769 // B = Pic[i+2]; // Member of index 2 2770 // ... // do something to R, G, B 2771 // } 2772 // To: 2773 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2774 // %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements 2775 // %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements 2776 // %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements 2777 // 2778 // Or translate following interleaved store group (factor = 3): 2779 // for (i = 0; i < N; i+=3) { 2780 // ... do something to R, G, B 2781 // Pic[i] = R; // Member of index 0 2782 // Pic[i+1] = G; // Member of index 1 2783 // Pic[i+2] = B; // Member of index 2 2784 // } 2785 // To: 2786 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> 2787 // %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u> 2788 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, 2789 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements 2790 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B 2791 void InnerLoopVectorizer::vectorizeInterleaveGroup( 2792 const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs, 2793 VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues, 2794 VPValue *BlockInMask) { 2795 Instruction *Instr = Group->getInsertPos(); 2796 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2797 2798 // Prepare for the vector type of the interleaved load/store. 2799 Type *ScalarTy = getLoadStoreType(Instr); 2800 unsigned InterleaveFactor = Group->getFactor(); 2801 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2802 auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor); 2803 2804 // Prepare for the new pointers. 2805 SmallVector<Value *, 2> AddrParts; 2806 unsigned Index = Group->getIndex(Instr); 2807 2808 // TODO: extend the masked interleaved-group support to reversed access. 
2809 assert((!BlockInMask || !Group->isReverse()) && 2810 "Reversed masked interleave-group not supported."); 2811 2812 // If the group is reverse, adjust the index to refer to the last vector lane 2813 // instead of the first. We adjust the index from the first vector lane, 2814 // rather than directly getting the pointer for lane VF - 1, because the 2815 // pointer operand of the interleaved access is supposed to be uniform. For 2816 // uniform instructions, we're only required to generate a value for the 2817 // first vector lane in each unroll iteration. 2818 if (Group->isReverse()) 2819 Index += (VF.getKnownMinValue() - 1) * Group->getFactor(); 2820 2821 for (unsigned Part = 0; Part < UF; Part++) { 2822 Value *AddrPart = State.get(Addr, VPIteration(Part, 0)); 2823 setDebugLocFromInst(AddrPart); 2824 2825 // Notice current instruction could be any index. Need to adjust the address 2826 // to the member of index 0. 2827 // 2828 // E.g. a = A[i+1]; // Member of index 1 (Current instruction) 2829 // b = A[i]; // Member of index 0 2830 // Current pointer is pointed to A[i+1], adjust it to A[i]. 2831 // 2832 // E.g. A[i+1] = a; // Member of index 1 2833 // A[i] = b; // Member of index 0 2834 // A[i+2] = c; // Member of index 2 (Current instruction) 2835 // Current pointer is pointed to A[i+2], adjust it to A[i]. 2836 2837 bool InBounds = false; 2838 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts())) 2839 InBounds = gep->isInBounds(); 2840 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index)); 2841 cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds); 2842 2843 // Cast to the vector pointer type. 2844 unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace(); 2845 Type *PtrTy = VecTy->getPointerTo(AddressSpace); 2846 AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy)); 2847 } 2848 2849 setDebugLocFromInst(Instr); 2850 Value *PoisonVec = PoisonValue::get(VecTy); 2851 2852 Value *MaskForGaps = nullptr; 2853 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) { 2854 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); 2855 assert(MaskForGaps && "Mask for Gaps is required but it is null"); 2856 } 2857 2858 // Vectorize the interleaved load group. 2859 if (isa<LoadInst>(Instr)) { 2860 // For each unroll part, create a wide load for the group. 2861 SmallVector<Value *, 2> NewLoads; 2862 for (unsigned Part = 0; Part < UF; Part++) { 2863 Instruction *NewLoad; 2864 if (BlockInMask || MaskForGaps) { 2865 assert(useMaskedInterleavedAccesses(*TTI) && 2866 "masked interleaved groups are not allowed."); 2867 Value *GroupMask = MaskForGaps; 2868 if (BlockInMask) { 2869 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2870 Value *ShuffledMask = Builder.CreateShuffleVector( 2871 BlockInMaskPart, 2872 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2873 "interleaved.mask"); 2874 GroupMask = MaskForGaps 2875 ? Builder.CreateBinOp(Instruction::And, ShuffledMask, 2876 MaskForGaps) 2877 : ShuffledMask; 2878 } 2879 NewLoad = 2880 Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(), 2881 GroupMask, PoisonVec, "wide.masked.vec"); 2882 } 2883 else 2884 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], 2885 Group->getAlign(), "wide.vec"); 2886 Group->addMetadata(NewLoad); 2887 NewLoads.push_back(NewLoad); 2888 } 2889 2890 // For each member in the group, shuffle out the appropriate data from the 2891 // wide loads. 
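    // For example (illustrative, continuing the factor-3 load group from the
    // comment at the top of this function with VF = 4): member I is extracted
    // from the wide load using the mask produced by createStrideMask(I, 3, 4),
    //   %R.vec = shufflevector %wide.vec, poison, <0, 3, 6, 9>   ; I == 0
    //   %G.vec = shufflevector %wide.vec, poison, <1, 4, 7, 10>  ; I == 1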
2892 unsigned J = 0; 2893 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2894 Instruction *Member = Group->getMember(I); 2895 2896 // Skip the gaps in the group. 2897 if (!Member) 2898 continue; 2899 2900 auto StrideMask = 2901 createStrideMask(I, InterleaveFactor, VF.getKnownMinValue()); 2902 for (unsigned Part = 0; Part < UF; Part++) { 2903 Value *StridedVec = Builder.CreateShuffleVector( 2904 NewLoads[Part], StrideMask, "strided.vec"); 2905 2906 // If this member has different type, cast the result type. 2907 if (Member->getType() != ScalarTy) { 2908 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2909 VectorType *OtherVTy = VectorType::get(Member->getType(), VF); 2910 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); 2911 } 2912 2913 if (Group->isReverse()) 2914 StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse"); 2915 2916 State.set(VPDefs[J], StridedVec, Part); 2917 } 2918 ++J; 2919 } 2920 return; 2921 } 2922 2923 // The sub vector type for current instruction. 2924 auto *SubVT = VectorType::get(ScalarTy, VF); 2925 2926 // Vectorize the interleaved store group. 2927 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); 2928 assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) && 2929 "masked interleaved groups are not allowed."); 2930 assert((!MaskForGaps || !VF.isScalable()) && 2931 "masking gaps for scalable vectors is not yet supported."); 2932 for (unsigned Part = 0; Part < UF; Part++) { 2933 // Collect the stored vector from each member. 2934 SmallVector<Value *, 4> StoredVecs; 2935 for (unsigned i = 0; i < InterleaveFactor; i++) { 2936 assert((Group->getMember(i) || MaskForGaps) && 2937 "Fail to get a member from an interleaved store group"); 2938 Instruction *Member = Group->getMember(i); 2939 2940 // Skip the gaps in the group. 2941 if (!Member) { 2942 Value *Undef = PoisonValue::get(SubVT); 2943 StoredVecs.push_back(Undef); 2944 continue; 2945 } 2946 2947 Value *StoredVec = State.get(StoredValues[i], Part); 2948 2949 if (Group->isReverse()) 2950 StoredVec = Builder.CreateVectorReverse(StoredVec, "reverse"); 2951 2952 // If this member has different type, cast it to a unified type. 2953 2954 if (StoredVec->getType() != SubVT) 2955 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL); 2956 2957 StoredVecs.push_back(StoredVec); 2958 } 2959 2960 // Concatenate all vectors into a wide vector. 2961 Value *WideVec = concatenateVectors(Builder, StoredVecs); 2962 2963 // Interleave the elements in the wide vector. 2964 Value *IVec = Builder.CreateShuffleVector( 2965 WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor), 2966 "interleaved.vec"); 2967 2968 Instruction *NewStoreInstr; 2969 if (BlockInMask || MaskForGaps) { 2970 Value *GroupMask = MaskForGaps; 2971 if (BlockInMask) { 2972 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2973 Value *ShuffledMask = Builder.CreateShuffleVector( 2974 BlockInMaskPart, 2975 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2976 "interleaved.mask"); 2977 GroupMask = MaskForGaps ? 
Builder.CreateBinOp(Instruction::And, 2978 ShuffledMask, MaskForGaps) 2979 : ShuffledMask; 2980 } 2981 NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part], 2982 Group->getAlign(), GroupMask); 2983 } else 2984 NewStoreInstr = 2985 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign()); 2986 2987 Group->addMetadata(NewStoreInstr); 2988 } 2989 } 2990 2991 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, 2992 VPReplicateRecipe *RepRecipe, 2993 const VPIteration &Instance, 2994 bool IfPredicateInstr, 2995 VPTransformState &State) { 2996 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 2997 2998 // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for 2999 // the first lane and part. 3000 if (isa<NoAliasScopeDeclInst>(Instr)) 3001 if (!Instance.isFirstIteration()) 3002 return; 3003 3004 setDebugLocFromInst(Instr); 3005 3006 // Does this instruction return a value ? 3007 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 3008 3009 Instruction *Cloned = Instr->clone(); 3010 if (!IsVoidRetTy) 3011 Cloned->setName(Instr->getName() + ".cloned"); 3012 3013 // If the scalarized instruction contributes to the address computation of a 3014 // widen masked load/store which was in a basic block that needed predication 3015 // and is not predicated after vectorization, we can't propagate 3016 // poison-generating flags (nuw/nsw, exact, inbounds, etc.). The scalarized 3017 // instruction could feed a poison value to the base address of the widen 3018 // load/store. 3019 if (State.MayGeneratePoisonRecipes.contains(RepRecipe)) 3020 Cloned->dropPoisonGeneratingFlags(); 3021 3022 State.Builder.SetInsertPoint(Builder.GetInsertBlock(), 3023 Builder.GetInsertPoint()); 3024 // Replace the operands of the cloned instructions with their scalar 3025 // equivalents in the new loop. 3026 for (auto &I : enumerate(RepRecipe->operands())) { 3027 auto InputInstance = Instance; 3028 VPValue *Operand = I.value(); 3029 VPReplicateRecipe *OperandR = dyn_cast<VPReplicateRecipe>(Operand); 3030 if (OperandR && OperandR->isUniform()) 3031 InputInstance.Lane = VPLane::getFirstLane(); 3032 Cloned->setOperand(I.index(), State.get(Operand, InputInstance)); 3033 } 3034 addNewMetadata(Cloned, Instr); 3035 3036 // Place the cloned scalar in the new loop. 3037 Builder.Insert(Cloned); 3038 3039 State.set(RepRecipe, Cloned, Instance); 3040 3041 // If we just cloned a new assumption, add it the assumption cache. 3042 if (auto *II = dyn_cast<AssumeInst>(Cloned)) 3043 AC->registerAssumption(II); 3044 3045 // End if-block. 3046 if (IfPredicateInstr) 3047 PredicatedInstructions.push_back(Cloned); 3048 } 3049 3050 void InnerLoopVectorizer::createHeaderBranch(Loop *L) { 3051 BasicBlock *Header = L->getHeader(); 3052 assert(!L->getLoopLatch() && "loop should not have a latch at this point"); 3053 3054 IRBuilder<> B(Header->getTerminator()); 3055 Instruction *OldInst = 3056 getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()); 3057 setDebugLocFromInst(OldInst, &B); 3058 3059 // Connect the header to the exit and header blocks and replace the old 3060 // terminator. 3061 B.CreateCondBr(B.getTrue(), L->getUniqueExitBlock(), Header); 3062 3063 // Now we have two terminators. Remove the old one from the block. 
3064 Header->getTerminator()->eraseFromParent(); 3065 } 3066 3067 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { 3068 if (TripCount) 3069 return TripCount; 3070 3071 assert(L && "Create Trip Count for null loop."); 3072 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 3073 // Find the loop boundaries. 3074 ScalarEvolution *SE = PSE.getSE(); 3075 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 3076 assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && 3077 "Invalid loop count"); 3078 3079 Type *IdxTy = Legal->getWidestInductionType(); 3080 assert(IdxTy && "No type for induction"); 3081 3082 // The exit count might have the type of i64 while the phi is i32. This can 3083 // happen if we have an induction variable that is sign extended before the 3084 // compare. The only way that we get a backedge taken count is that the 3085 // induction variable was signed and as such will not overflow. In such a case 3086 // truncation is legal. 3087 if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) > 3088 IdxTy->getPrimitiveSizeInBits()) 3089 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy); 3090 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); 3091 3092 // Get the total trip count from the count by adding 1. 3093 const SCEV *ExitCount = SE->getAddExpr( 3094 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 3095 3096 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); 3097 3098 // Expand the trip count and place the new instructions in the preheader. 3099 // Notice that the pre-header does not change, only the loop body. 3100 SCEVExpander Exp(*SE, DL, "induction"); 3101 3102 // Count holds the overall loop count (N). 3103 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), 3104 L->getLoopPreheader()->getTerminator()); 3105 3106 if (TripCount->getType()->isPointerTy()) 3107 TripCount = 3108 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", 3109 L->getLoopPreheader()->getTerminator()); 3110 3111 return TripCount; 3112 } 3113 3114 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { 3115 if (VectorTripCount) 3116 return VectorTripCount; 3117 3118 Value *TC = getOrCreateTripCount(L); 3119 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 3120 3121 Type *Ty = TC->getType(); 3122 // This is where we can make the step a runtime constant. 3123 Value *Step = createStepForVF(Builder, Ty, VF, UF); 3124 3125 // If the tail is to be folded by masking, round the number of iterations N 3126 // up to a multiple of Step instead of rounding down. This is done by first 3127 // adding Step-1 and then rounding down. Note that it's ok if this addition 3128 // overflows: the vector induction variable will eventually wrap to zero given 3129 // that it starts at zero and its Step is a power of two; the loop will then 3130 // exit, with the last early-exit vector comparison also producing all-true. 3131 if (Cost->foldTailByMasking()) { 3132 assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && 3133 "VF*UF must be a power of 2 when folding tail by masking"); 3134 Value *NumLanes = getRuntimeVF(Builder, Ty, VF * UF); 3135 TC = Builder.CreateAdd( 3136 TC, Builder.CreateSub(NumLanes, ConstantInt::get(Ty, 1)), "n.rnd.up"); 3137 } 3138 3139 // Now we need to generate the expression for the part of the loop that the 3140 // vectorized body will execute. 
This is equal to N - (N % Step) if scalar 3141 // iterations are not required for correctness, or N - Step, otherwise. Step 3142 // is equal to the vectorization factor (number of SIMD elements) times the 3143 // unroll factor (number of SIMD instructions). 3144 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); 3145 3146 // There are cases where we *must* run at least one iteration in the remainder 3147 // loop. See the cost model for when this can happen. If the step evenly 3148 // divides the trip count, we set the remainder to be equal to the step. If 3149 // the step does not evenly divide the trip count, no adjustment is necessary 3150 // since there will already be scalar iterations. Note that the minimum 3151 // iterations check ensures that N >= Step. 3152 if (Cost->requiresScalarEpilogue(VF)) { 3153 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); 3154 R = Builder.CreateSelect(IsZero, Step, R); 3155 } 3156 3157 VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); 3158 3159 return VectorTripCount; 3160 } 3161 3162 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, 3163 const DataLayout &DL) { 3164 // Verify that V is a vector type with same number of elements as DstVTy. 3165 auto *DstFVTy = cast<FixedVectorType>(DstVTy); 3166 unsigned VF = DstFVTy->getNumElements(); 3167 auto *SrcVecTy = cast<FixedVectorType>(V->getType()); 3168 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); 3169 Type *SrcElemTy = SrcVecTy->getElementType(); 3170 Type *DstElemTy = DstFVTy->getElementType(); 3171 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 3172 "Vector elements must have same size"); 3173 3174 // Do a direct cast if element types are castable. 3175 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 3176 return Builder.CreateBitOrPointerCast(V, DstFVTy); 3177 } 3178 // V cannot be directly casted to desired vector type. 3179 // May happen when V is a floating point vector but DstVTy is a vector of 3180 // pointers or vice-versa. Handle this using a two-step bitcast using an 3181 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 3182 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 3183 "Only one type should be a pointer type"); 3184 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 3185 "Only one type should be a floating point type"); 3186 Type *IntTy = 3187 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 3188 auto *VecIntTy = FixedVectorType::get(IntTy, VF); 3189 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 3190 return Builder.CreateBitOrPointerCast(CastVal, DstFVTy); 3191 } 3192 3193 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, 3194 BasicBlock *Bypass) { 3195 Value *Count = getOrCreateTripCount(L); 3196 // Reuse existing vector loop preheader for TC checks. 3197 // Note that new preheader block is generated for vector loop. 3198 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 3199 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 3200 3201 // Generate code to check if the loop's trip count is less than VF * UF, or 3202 // equal to it in case a scalar epilogue is required; this implies that the 3203 // vector trip count is zero. This check also covers the case where adding one 3204 // to the backedge-taken count overflowed leading to an incorrect trip count 3205 // of zero. 
In this case we will also jump to the scalar loop. 3206 auto P = Cost->requiresScalarEpilogue(VF) ? ICmpInst::ICMP_ULE 3207 : ICmpInst::ICMP_ULT; 3208 3209 // If tail is to be folded, vector loop takes care of all iterations. 3210 Value *CheckMinIters = Builder.getFalse(); 3211 if (!Cost->foldTailByMasking()) { 3212 Value *Step = createStepForVF(Builder, Count->getType(), VF, UF); 3213 CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check"); 3214 } 3215 // Create new preheader for vector loop. 3216 LoopVectorPreHeader = 3217 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, 3218 "vector.ph"); 3219 3220 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 3221 DT->getNode(Bypass)->getIDom()) && 3222 "TC check is expected to dominate Bypass"); 3223 3224 // Update dominator for Bypass & LoopExit (if needed). 3225 DT->changeImmediateDominator(Bypass, TCCheckBlock); 3226 if (!Cost->requiresScalarEpilogue(VF)) 3227 // If there is an epilogue which must run, there's no edge from the 3228 // middle block to exit blocks and thus no need to update the immediate 3229 // dominator of the exit blocks. 3230 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 3231 3232 ReplaceInstWithInst( 3233 TCCheckBlock->getTerminator(), 3234 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 3235 LoopBypassBlocks.push_back(TCCheckBlock); 3236 } 3237 3238 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { 3239 3240 BasicBlock *const SCEVCheckBlock = 3241 RTChecks.emitSCEVChecks(L, Bypass, LoopVectorPreHeader, LoopExitBlock); 3242 if (!SCEVCheckBlock) 3243 return nullptr; 3244 3245 assert(!(SCEVCheckBlock->getParent()->hasOptSize() || 3246 (OptForSizeBasedOnProfile && 3247 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) && 3248 "Cannot SCEV check stride or overflow when optimizing for size"); 3249 3250 3251 // Update dominator only if this is first RT check. 3252 if (LoopBypassBlocks.empty()) { 3253 DT->changeImmediateDominator(Bypass, SCEVCheckBlock); 3254 if (!Cost->requiresScalarEpilogue(VF)) 3255 // If there is an epilogue which must run, there's no edge from the 3256 // middle block to exit blocks and thus no need to update the immediate 3257 // dominator of the exit blocks. 3258 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock); 3259 } 3260 3261 LoopBypassBlocks.push_back(SCEVCheckBlock); 3262 AddedSafetyChecks = true; 3263 return SCEVCheckBlock; 3264 } 3265 3266 BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, 3267 BasicBlock *Bypass) { 3268 // VPlan-native path does not do any analysis for runtime checks currently. 3269 if (EnableVPlanNativePath) 3270 return nullptr; 3271 3272 BasicBlock *const MemCheckBlock = 3273 RTChecks.emitMemRuntimeChecks(L, Bypass, LoopVectorPreHeader); 3274 3275 // Check if we generated code that checks in runtime if arrays overlap. We put 3276 // the checks into a separate block to make the more common case of few 3277 // elements faster. 
3278 if (!MemCheckBlock) 3279 return nullptr; 3280 3281 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) { 3282 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && 3283 "Cannot emit memory checks when optimizing for size, unless forced " 3284 "to vectorize."); 3285 ORE->emit([&]() { 3286 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize", 3287 L->getStartLoc(), L->getHeader()) 3288 << "Code-size may be reduced by not forcing " 3289 "vectorization, or by source-code modifications " 3290 "eliminating the need for runtime checks " 3291 "(e.g., adding 'restrict')."; 3292 }); 3293 } 3294 3295 LoopBypassBlocks.push_back(MemCheckBlock); 3296 3297 AddedSafetyChecks = true; 3298 3299 // We currently don't use LoopVersioning for the actual loop cloning but we 3300 // still use it to add the noalias metadata. 3301 LVer = std::make_unique<LoopVersioning>( 3302 *Legal->getLAI(), 3303 Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, 3304 DT, PSE.getSE()); 3305 LVer->prepareNoAliasMetadata(); 3306 return MemCheckBlock; 3307 } 3308 3309 Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { 3310 LoopScalarBody = OrigLoop->getHeader(); 3311 LoopVectorPreHeader = OrigLoop->getLoopPreheader(); 3312 assert(LoopVectorPreHeader && "Invalid loop structure"); 3313 LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr 3314 assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF)) && 3315 "multiple exit loop without required epilogue?"); 3316 3317 LoopMiddleBlock = 3318 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 3319 LI, nullptr, Twine(Prefix) + "middle.block"); 3320 LoopScalarPreHeader = 3321 SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI, 3322 nullptr, Twine(Prefix) + "scalar.ph"); 3323 3324 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3325 3326 // Set up the middle block terminator. Two cases: 3327 // 1) If we know that we must execute the scalar epilogue, emit an 3328 // unconditional branch. 3329 // 2) Otherwise, we must have a single unique exit block (due to how we 3330 // implement the multiple exit case). In this case, set up a conditonal 3331 // branch from the middle block to the loop scalar preheader, and the 3332 // exit block. completeLoopSkeleton will update the condition to use an 3333 // iteration check, if required to decide whether to execute the remainder. 3334 BranchInst *BrInst = Cost->requiresScalarEpilogue(VF) ? 3335 BranchInst::Create(LoopScalarPreHeader) : 3336 BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, 3337 Builder.getTrue()); 3338 BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3339 ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst); 3340 3341 // We intentionally don't let SplitBlock to update LoopInfo since 3342 // LoopVectorBody should belong to another loop than LoopVectorPreHeader. 3343 // LoopVectorBody is explicitly added to the correct place few lines later. 3344 LoopVectorBody = 3345 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 3346 nullptr, nullptr, Twine(Prefix) + "vector.body"); 3347 3348 // Update dominator for loop exit. 3349 if (!Cost->requiresScalarEpilogue(VF)) 3350 // If there is an epilogue which must run, there's no edge from the 3351 // middle block to exit blocks and thus no need to update the immediate 3352 // dominator of the exit blocks. 
3353 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); 3354 3355 // Create and register the new vector loop. 3356 Loop *Lp = LI->AllocateLoop(); 3357 Loop *ParentLoop = OrigLoop->getParentLoop(); 3358 3359 // Insert the new loop into the loop nest and register the new basic blocks 3360 // before calling any utilities such as SCEV that require valid LoopInfo. 3361 if (ParentLoop) { 3362 ParentLoop->addChildLoop(Lp); 3363 } else { 3364 LI->addTopLevelLoop(Lp); 3365 } 3366 Lp->addBasicBlockToLoop(LoopVectorBody, *LI); 3367 return Lp; 3368 } 3369 3370 void InnerLoopVectorizer::createInductionResumeValues( 3371 Loop *L, std::pair<BasicBlock *, Value *> AdditionalBypass) { 3372 assert(((AdditionalBypass.first && AdditionalBypass.second) || 3373 (!AdditionalBypass.first && !AdditionalBypass.second)) && 3374 "Inconsistent information about additional bypass."); 3375 3376 Value *VectorTripCount = getOrCreateVectorTripCount(L); 3377 assert(VectorTripCount && L && "Expected valid arguments"); 3378 // We are going to resume the execution of the scalar loop. 3379 // Go over all of the induction variables that we found and fix the 3380 // PHIs that are left in the scalar version of the loop. 3381 // The starting values of PHI nodes depend on the counter of the last 3382 // iteration in the vectorized loop. 3383 // If we come from a bypass edge then we need to start from the original 3384 // start value. 3385 Instruction *OldInduction = Legal->getPrimaryInduction(); 3386 for (auto &InductionEntry : Legal->getInductionVars()) { 3387 PHINode *OrigPhi = InductionEntry.first; 3388 InductionDescriptor II = InductionEntry.second; 3389 3390 // Create phi nodes to merge from the backedge-taken check block. 3391 PHINode *BCResumeVal = 3392 PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", 3393 LoopScalarPreHeader->getTerminator()); 3394 // Copy original phi DL over to the new one. 3395 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); 3396 Value *&EndValue = IVEndValues[OrigPhi]; 3397 Value *EndValueFromAdditionalBypass = AdditionalBypass.second; 3398 if (OrigPhi == OldInduction) { 3399 // We know what the end value is. 3400 EndValue = VectorTripCount; 3401 } else { 3402 IRBuilder<> B(L->getLoopPreheader()->getTerminator()); 3403 3404 // Fast-math-flags propagate from the original induction instruction. 3405 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) 3406 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); 3407 3408 Type *StepType = II.getStep()->getType(); 3409 Instruction::CastOps CastOp = 3410 CastInst::getCastOpcode(VectorTripCount, true, StepType, true); 3411 Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd"); 3412 Value *Step = 3413 CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint()); 3414 EndValue = emitTransformedIndex(B, CRD, Step, II); 3415 EndValue->setName("ind.end"); 3416 3417 // Compute the end value for the additional bypass (if applicable). 
3418 if (AdditionalBypass.first) { 3419 B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt())); 3420 CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true, 3421 StepType, true); 3422 Value *Step = 3423 CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint()); 3424 CRD = 3425 B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd"); 3426 EndValueFromAdditionalBypass = emitTransformedIndex(B, CRD, Step, II); 3427 EndValueFromAdditionalBypass->setName("ind.end"); 3428 } 3429 } 3430 // The new PHI merges the original incoming value, in case of a bypass, 3431 // or the value at the end of the vectorized loop. 3432 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); 3433 3434 // Fix the scalar body counter (PHI node). 3435 // The old induction's phi node in the scalar body needs the truncated 3436 // value. 3437 for (BasicBlock *BB : LoopBypassBlocks) 3438 BCResumeVal->addIncoming(II.getStartValue(), BB); 3439 3440 if (AdditionalBypass.first) 3441 BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first, 3442 EndValueFromAdditionalBypass); 3443 3444 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); 3445 } 3446 } 3447 3448 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L, 3449 MDNode *OrigLoopID) { 3450 assert(L && "Expected valid loop."); 3451 3452 // The trip counts should be cached by now. 3453 Value *Count = getOrCreateTripCount(L); 3454 Value *VectorTripCount = getOrCreateVectorTripCount(L); 3455 3456 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3457 3458 // Add a check in the middle block to see if we have completed 3459 // all of the iterations in the first vector loop. Three cases: 3460 // 1) If we require a scalar epilogue, there is no conditional branch as 3461 // we unconditionally branch to the scalar preheader. Do nothing. 3462 // 2) If (N - N%VF) == N, then we *don't* need to run the remainder. 3463 // Thus if tail is to be folded, we know we don't need to run the 3464 // remainder and we can use the previous value for the condition (true). 3465 // 3) Otherwise, construct a runtime check. 3466 if (!Cost->requiresScalarEpilogue(VF) && !Cost->foldTailByMasking()) { 3467 Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, 3468 Count, VectorTripCount, "cmp.n", 3469 LoopMiddleBlock->getTerminator()); 3470 3471 // Here we use the same DebugLoc as the scalar loop latch terminator instead 3472 // of the corresponding compare because they may have ended up with 3473 // different line numbers and we want to avoid awkward line stepping while 3474 // debugging. Eg. if the compare has got a line number inside the loop. 3475 CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3476 cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN); 3477 } 3478 3479 // Get ready to start creating new instructions into the vectorized body. 3480 assert(LoopVectorPreHeader == L->getLoopPreheader() && 3481 "Inconsistent vector loop preheader"); 3482 Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); 3483 3484 #ifdef EXPENSIVE_CHECKS 3485 assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 3486 LI->verify(*DT); 3487 #endif 3488 3489 return LoopVectorPreHeader; 3490 } 3491 3492 std::pair<BasicBlock *, Value *> 3493 InnerLoopVectorizer::createVectorizedLoopSkeleton() { 3494 /* 3495 In this function we generate a new loop. The new loop will contain 3496 the vectorized instructions while the old loop will continue to run the 3497 scalar remainder. 
3498 3499 [ ] <-- loop iteration number check. 3500 / | 3501 / v 3502 | [ ] <-- vector loop bypass (may consist of multiple blocks). 3503 | / | 3504 | / v 3505 || [ ] <-- vector pre header. 3506 |/ | 3507 | v 3508 | [ ] \ 3509 | [ ]_| <-- vector loop. 3510 | | 3511 | v 3512 \ -[ ] <--- middle-block. 3513 \/ | 3514 /\ v 3515 | ->[ ] <--- new preheader. 3516 | | 3517 (opt) v <-- edge from middle to exit iff epilogue is not required. 3518 | [ ] \ 3519 | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue). 3520 \ | 3521 \ v 3522 >[ ] <-- exit block(s). 3523 ... 3524 */ 3525 3526 // Get the metadata of the original loop before it gets modified. 3527 MDNode *OrigLoopID = OrigLoop->getLoopID(); 3528 3529 // Workaround! Compute the trip count of the original loop and cache it 3530 // before we start modifying the CFG. This code has a systemic problem 3531 // wherein it tries to run analysis over partially constructed IR; this is 3532 // wrong, and not simply for SCEV. The trip count of the original loop 3533 // simply happens to be prone to hitting this in practice. In theory, we 3534 // can hit the same issue for any SCEV, or ValueTracking query done during 3535 // mutation. See PR49900. 3536 getOrCreateTripCount(OrigLoop); 3537 3538 // Create an empty vector loop, and prepare basic blocks for the runtime 3539 // checks. 3540 Loop *Lp = createVectorLoopSkeleton(""); 3541 3542 // Now, compare the new count to zero. If it is zero skip the vector loop and 3543 // jump to the scalar loop. This check also covers the case where the 3544 // backedge-taken count is uint##_max: adding one to it will overflow leading 3545 // to an incorrect trip count of zero. In this (rare) case we will also jump 3546 // to the scalar loop. 3547 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader); 3548 3549 // Generate the code to check any assumptions that we've made for SCEV 3550 // expressions. 3551 emitSCEVChecks(Lp, LoopScalarPreHeader); 3552 3553 // Generate the code that checks in runtime if arrays overlap. We put the 3554 // checks into a separate block to make the more common case of few elements 3555 // faster. 3556 emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 3557 3558 createHeaderBranch(Lp); 3559 3560 // Emit phis for the new starting index of the scalar loop. 3561 createInductionResumeValues(Lp); 3562 3563 return {completeLoopSkeleton(Lp, OrigLoopID), nullptr}; 3564 } 3565 3566 // Fix up external users of the induction variable. At this point, we are 3567 // in LCSSA form, with all external PHIs that use the IV having one input value, 3568 // coming from the remainder loop. We need those PHIs to also have a correct 3569 // value for the IV when arriving directly from the middle block. 3570 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, 3571 const InductionDescriptor &II, 3572 Value *CountRoundDown, Value *EndValue, 3573 BasicBlock *MiddleBlock) { 3574 // There are two kinds of external IV usages - those that use the value 3575 // computed in the last iteration (the PHI) and those that use the penultimate 3576 // value (the value that feeds into the phi from the loop latch). 3577 // We allow both, but they, obviously, have different values. 3578 3579 assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block"); 3580 3581 DenseMap<Value *, Value *> MissingVals; 3582 3583 // An external user of the last iteration's value should see the value that 3584 // the remainder loop uses to initialize its own IV. 
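  // For example (illustrative):
  //
  //   for (i = 0; i < n; ++i) { ... }
  //   use(i);
  //
  // The LCSSA phi feeding use(i) takes the post-increment value of the IV.
  // When the middle block branches directly to the exit, that phi must see
  // EndValue, the same value the remainder loop would have resumed from.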
3585 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch()); 3586 for (User *U : PostInc->users()) { 3587 Instruction *UI = cast<Instruction>(U); 3588 if (!OrigLoop->contains(UI)) { 3589 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3590 MissingVals[UI] = EndValue; 3591 } 3592 } 3593 3594 // An external user of the penultimate value need to see EndValue - Step. 3595 // The simplest way to get this is to recompute it from the constituent SCEVs, 3596 // that is Start + (Step * (CRD - 1)). 3597 for (User *U : OrigPhi->users()) { 3598 auto *UI = cast<Instruction>(U); 3599 if (!OrigLoop->contains(UI)) { 3600 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3601 3602 IRBuilder<> B(MiddleBlock->getTerminator()); 3603 3604 // Fast-math-flags propagate from the original induction instruction. 3605 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) 3606 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); 3607 3608 Value *CountMinusOne = B.CreateSub( 3609 CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1)); 3610 Value *CMO = 3611 !II.getStep()->getType()->isIntegerTy() 3612 ? B.CreateCast(Instruction::SIToFP, CountMinusOne, 3613 II.getStep()->getType()) 3614 : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType()); 3615 CMO->setName("cast.cmo"); 3616 3617 Value *Step = CreateStepValue(II.getStep(), *PSE.getSE(), 3618 LoopVectorBody->getTerminator()); 3619 Value *Escape = emitTransformedIndex(B, CMO, Step, II); 3620 Escape->setName("ind.escape"); 3621 MissingVals[UI] = Escape; 3622 } 3623 } 3624 3625 for (auto &I : MissingVals) { 3626 PHINode *PHI = cast<PHINode>(I.first); 3627 // One corner case we have to handle is two IVs "chasing" each-other, 3628 // that is %IV2 = phi [...], [ %IV1, %latch ] 3629 // In this case, if IV1 has an external use, we need to avoid adding both 3630 // "last value of IV1" and "penultimate value of IV2". So, verify that we 3631 // don't already have an incoming value for the middle block. 3632 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) 3633 PHI->addIncoming(I.second, MiddleBlock); 3634 } 3635 } 3636 3637 namespace { 3638 3639 struct CSEDenseMapInfo { 3640 static bool canHandle(const Instruction *I) { 3641 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || 3642 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); 3643 } 3644 3645 static inline Instruction *getEmptyKey() { 3646 return DenseMapInfo<Instruction *>::getEmptyKey(); 3647 } 3648 3649 static inline Instruction *getTombstoneKey() { 3650 return DenseMapInfo<Instruction *>::getTombstoneKey(); 3651 } 3652 3653 static unsigned getHashValue(const Instruction *I) { 3654 assert(canHandle(I) && "Unknown instruction!"); 3655 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), 3656 I->value_op_end())); 3657 } 3658 3659 static bool isEqual(const Instruction *LHS, const Instruction *RHS) { 3660 if (LHS == getEmptyKey() || RHS == getEmptyKey() || 3661 LHS == getTombstoneKey() || RHS == getTombstoneKey()) 3662 return LHS == RHS; 3663 return LHS->isIdenticalTo(RHS); 3664 } 3665 }; 3666 3667 } // end anonymous namespace 3668 3669 ///Perform cse of induction variable instructions. 3670 static void cse(BasicBlock *BB) { 3671 // Perform simple cse. 
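  // For example (illustrative, value names are made up): if two identical
  // instructions such as
  //   %idx = extractelement <4 x i32> %vec.iv, i32 0
  // were emitted, the map below replaces the second with the first and the
  // duplicate is erased.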
3672 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 3673 for (Instruction &In : llvm::make_early_inc_range(*BB)) { 3674 if (!CSEDenseMapInfo::canHandle(&In)) 3675 continue; 3676 3677 // Check if we can replace this instruction with any of the 3678 // visited instructions. 3679 if (Instruction *V = CSEMap.lookup(&In)) { 3680 In.replaceAllUsesWith(V); 3681 In.eraseFromParent(); 3682 continue; 3683 } 3684 3685 CSEMap[&In] = &In; 3686 } 3687 } 3688 3689 InstructionCost 3690 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF, 3691 bool &NeedToScalarize) const { 3692 Function *F = CI->getCalledFunction(); 3693 Type *ScalarRetTy = CI->getType(); 3694 SmallVector<Type *, 4> Tys, ScalarTys; 3695 for (auto &ArgOp : CI->args()) 3696 ScalarTys.push_back(ArgOp->getType()); 3697 3698 // Estimate cost of scalarized vector call. The source operands are assumed 3699 // to be vectors, so we need to extract individual elements from there, 3700 // execute VF scalar calls, and then gather the result into the vector return 3701 // value. 3702 InstructionCost ScalarCallCost = 3703 TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput); 3704 if (VF.isScalar()) 3705 return ScalarCallCost; 3706 3707 // Compute corresponding vector type for return value and arguments. 3708 Type *RetTy = ToVectorTy(ScalarRetTy, VF); 3709 for (Type *ScalarTy : ScalarTys) 3710 Tys.push_back(ToVectorTy(ScalarTy, VF)); 3711 3712 // Compute costs of unpacking argument values for the scalar calls and 3713 // packing the return values to a vector. 3714 InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF); 3715 3716 InstructionCost Cost = 3717 ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost; 3718 3719 // If we can't emit a vector call for this function, then the currently found 3720 // cost is the cost we need to return. 3721 NeedToScalarize = true; 3722 VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 3723 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); 3724 3725 if (!TLI || CI->isNoBuiltin() || !VecFunc) 3726 return Cost; 3727 3728 // If the corresponding vector cost is cheaper, return its cost. 
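  // That is, NeedToScalarize stays true and the scalarized estimate
  // (ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost) is kept
  // unless the vector library call costed below is strictly cheaper.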
3729 InstructionCost VectorCallCost = 3730 TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput); 3731 if (VectorCallCost < Cost) { 3732 NeedToScalarize = false; 3733 Cost = VectorCallCost; 3734 } 3735 return Cost; 3736 } 3737 3738 static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) { 3739 if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy())) 3740 return Elt; 3741 return VectorType::get(Elt, VF); 3742 } 3743 3744 InstructionCost 3745 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, 3746 ElementCount VF) const { 3747 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 3748 assert(ID && "Expected intrinsic call!"); 3749 Type *RetTy = MaybeVectorizeType(CI->getType(), VF); 3750 FastMathFlags FMF; 3751 if (auto *FPMO = dyn_cast<FPMathOperator>(CI)) 3752 FMF = FPMO->getFastMathFlags(); 3753 3754 SmallVector<const Value *> Arguments(CI->args()); 3755 FunctionType *FTy = CI->getCalledFunction()->getFunctionType(); 3756 SmallVector<Type *> ParamTys; 3757 std::transform(FTy->param_begin(), FTy->param_end(), 3758 std::back_inserter(ParamTys), 3759 [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); }); 3760 3761 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF, 3762 dyn_cast<IntrinsicInst>(CI)); 3763 return TTI.getIntrinsicInstrCost(CostAttrs, 3764 TargetTransformInfo::TCK_RecipThroughput); 3765 } 3766 3767 static Type *smallestIntegerVectorType(Type *T1, Type *T2) { 3768 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3769 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3770 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; 3771 } 3772 3773 static Type *largestIntegerVectorType(Type *T1, Type *T2) { 3774 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3775 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3776 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; 3777 } 3778 3779 void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) { 3780 // For every instruction `I` in MinBWs, truncate the operands, create a 3781 // truncated version of `I` and reextend its result. InstCombine runs 3782 // later and will remove any ext/trunc pairs. 3783 SmallPtrSet<Value *, 4> Erased; 3784 for (const auto &KV : Cost->getMinimalBitwidths()) { 3785 // If the value wasn't vectorized, we must maintain the original scalar 3786 // type. The absence of the value from State indicates that it 3787 // wasn't vectorized. 3788 // FIXME: Should not rely on getVPValue at this point. 3789 VPValue *Def = State.Plan->getVPValue(KV.first, true); 3790 if (!State.hasAnyVectorValue(Def)) 3791 continue; 3792 for (unsigned Part = 0; Part < UF; ++Part) { 3793 Value *I = State.get(Def, Part); 3794 if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I)) 3795 continue; 3796 Type *OriginalTy = I->getType(); 3797 Type *ScalarTruncatedTy = 3798 IntegerType::get(OriginalTy->getContext(), KV.second); 3799 auto *TruncatedTy = VectorType::get( 3800 ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getElementCount()); 3801 if (TruncatedTy == OriginalTy) 3802 continue; 3803 3804 IRBuilder<> B(cast<Instruction>(I)); 3805 auto ShrinkOperand = [&](Value *V) -> Value * { 3806 if (auto *ZI = dyn_cast<ZExtInst>(V)) 3807 if (ZI->getSrcTy() == TruncatedTy) 3808 return ZI->getOperand(0); 3809 return B.CreateZExtOrTrunc(V, TruncatedTy); 3810 }; 3811 3812 // The actual instruction modification depends on the instruction type, 3813 // unfortunately. 
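      // For example (illustrative, KV.second == 8, VF = 4): a binary operator
      // on <4 x i32> whose result only needs 8 bits is rebuilt as
      //   %a.tr = trunc <4 x i32> %a to <4 x i8>
      //   %b.tr = trunc <4 x i32> %b to <4 x i8>
      //   %r.tr = add <4 x i8> %a.tr, %b.tr
      //   %r    = zext <4 x i8> %r.tr to <4 x i32>
      // leaving it to InstCombine to remove the redundant ext/trunc pairs.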
3814 Value *NewI = nullptr; 3815 if (auto *BO = dyn_cast<BinaryOperator>(I)) { 3816 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)), 3817 ShrinkOperand(BO->getOperand(1))); 3818 3819 // Any wrapping introduced by shrinking this operation shouldn't be 3820 // considered undefined behavior. So, we can't unconditionally copy 3821 // arithmetic wrapping flags to NewI. 3822 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false); 3823 } else if (auto *CI = dyn_cast<ICmpInst>(I)) { 3824 NewI = 3825 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)), 3826 ShrinkOperand(CI->getOperand(1))); 3827 } else if (auto *SI = dyn_cast<SelectInst>(I)) { 3828 NewI = B.CreateSelect(SI->getCondition(), 3829 ShrinkOperand(SI->getTrueValue()), 3830 ShrinkOperand(SI->getFalseValue())); 3831 } else if (auto *CI = dyn_cast<CastInst>(I)) { 3832 switch (CI->getOpcode()) { 3833 default: 3834 llvm_unreachable("Unhandled cast!"); 3835 case Instruction::Trunc: 3836 NewI = ShrinkOperand(CI->getOperand(0)); 3837 break; 3838 case Instruction::SExt: 3839 NewI = B.CreateSExtOrTrunc( 3840 CI->getOperand(0), 3841 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3842 break; 3843 case Instruction::ZExt: 3844 NewI = B.CreateZExtOrTrunc( 3845 CI->getOperand(0), 3846 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3847 break; 3848 } 3849 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { 3850 auto Elements0 = 3851 cast<VectorType>(SI->getOperand(0)->getType())->getElementCount(); 3852 auto *O0 = B.CreateZExtOrTrunc( 3853 SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0)); 3854 auto Elements1 = 3855 cast<VectorType>(SI->getOperand(1)->getType())->getElementCount(); 3856 auto *O1 = B.CreateZExtOrTrunc( 3857 SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1)); 3858 3859 NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask()); 3860 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) { 3861 // Don't do anything with the operands, just extend the result. 3862 continue; 3863 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { 3864 auto Elements = 3865 cast<VectorType>(IE->getOperand(0)->getType())->getElementCount(); 3866 auto *O0 = B.CreateZExtOrTrunc( 3867 IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 3868 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); 3869 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); 3870 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { 3871 auto Elements = 3872 cast<VectorType>(EE->getOperand(0)->getType())->getElementCount(); 3873 auto *O0 = B.CreateZExtOrTrunc( 3874 EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 3875 NewI = B.CreateExtractElement(O0, EE->getOperand(2)); 3876 } else { 3877 // If we don't know what to do, be conservative and don't do anything. 3878 continue; 3879 } 3880 3881 // Lastly, extend the result. 3882 NewI->takeName(cast<Instruction>(I)); 3883 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); 3884 I->replaceAllUsesWith(Res); 3885 cast<Instruction>(I)->eraseFromParent(); 3886 Erased.insert(I); 3887 State.reset(Def, Res, Part); 3888 } 3889 } 3890 3891 // We'll have created a bunch of ZExts that are now parentless. Clean up. 3892 for (const auto &KV : Cost->getMinimalBitwidths()) { 3893 // If the value wasn't vectorized, we must maintain the original scalar 3894 // type. The absence of the value from State indicates that it 3895 // wasn't vectorized. 3896 // FIXME: Should not rely on getVPValue at this point. 
3897 VPValue *Def = State.Plan->getVPValue(KV.first, true); 3898 if (!State.hasAnyVectorValue(Def)) 3899 continue; 3900 for (unsigned Part = 0; Part < UF; ++Part) { 3901 Value *I = State.get(Def, Part); 3902 ZExtInst *Inst = dyn_cast<ZExtInst>(I); 3903 if (Inst && Inst->use_empty()) { 3904 Value *NewI = Inst->getOperand(0); 3905 Inst->eraseFromParent(); 3906 State.reset(Def, NewI, Part); 3907 } 3908 } 3909 } 3910 } 3911 3912 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) { 3913 // Insert truncates and extends for any truncated instructions as hints to 3914 // InstCombine. 3915 if (VF.isVector()) 3916 truncateToMinimalBitwidths(State); 3917 3918 // Fix widened non-induction PHIs by setting up the PHI operands. 3919 if (OrigPHIsToFix.size()) { 3920 assert(EnableVPlanNativePath && 3921 "Unexpected non-induction PHIs for fixup in non VPlan-native path"); 3922 fixNonInductionPHIs(State); 3923 } 3924 3925 // At this point every instruction in the original loop is widened to a 3926 // vector form. Now we need to fix the recurrences in the loop. These PHI 3927 // nodes are currently empty because we did not want to introduce cycles. 3928 // This is the second stage of vectorizing recurrences. 3929 fixCrossIterationPHIs(State); 3930 3931 // Forget the original basic block. 3932 PSE.getSE()->forgetLoop(OrigLoop); 3933 3934 // If we inserted an edge from the middle block to the unique exit block, 3935 // update uses outside the loop (phis) to account for the newly inserted 3936 // edge. 3937 if (!Cost->requiresScalarEpilogue(VF)) { 3938 // Fix-up external users of the induction variables. 3939 for (auto &Entry : Legal->getInductionVars()) 3940 fixupIVUsers(Entry.first, Entry.second, 3941 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)), 3942 IVEndValues[Entry.first], LoopMiddleBlock); 3943 3944 fixLCSSAPHIs(State); 3945 } 3946 3947 for (Instruction *PI : PredicatedInstructions) 3948 sinkScalarOperands(&*PI); 3949 3950 // Remove redundant induction instructions. 3951 cse(LoopVectorBody); 3952 3953 // Set/update profile weights for the vector and remainder loops as original 3954 // loop iterations are now distributed among them. Note that original loop 3955 // represented by LoopScalarBody becomes remainder loop after vectorization. 3956 // 3957 // For cases like foldTailByMasking() and requiresScalarEpiloque() we may 3958 // end up getting slightly roughened result but that should be OK since 3959 // profile is not inherently precise anyway. Note also possible bypass of 3960 // vector code caused by legality checks is ignored, assigning all the weight 3961 // to the vector loop, optimistically. 3962 // 3963 // For scalable vectorization we can't know at compile time how many iterations 3964 // of the loop are handled in one vector iteration, so instead assume a pessimistic 3965 // vscale of '1'. 3966 setProfileInfoAfterUnrolling( 3967 LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody), 3968 LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF); 3969 } 3970 3971 void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) { 3972 // In order to support recurrences we need to be able to vectorize Phi nodes. 3973 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 3974 // stage #2: We now need to fix the recurrences by adding incoming edges to 3975 // the currently empty PHI nodes. 
At this point every instruction in the 3976 // original loop is widened to a vector form so we can use them to construct 3977 // the incoming edges. 3978 VPBasicBlock *Header = State.Plan->getEntry()->getEntryBasicBlock(); 3979 for (VPRecipeBase &R : Header->phis()) { 3980 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) 3981 fixReduction(ReductionPhi, State); 3982 else if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R)) 3983 fixFirstOrderRecurrence(FOR, State); 3984 } 3985 } 3986 3987 void InnerLoopVectorizer::fixFirstOrderRecurrence( 3988 VPFirstOrderRecurrencePHIRecipe *PhiR, VPTransformState &State) { 3989 // This is the second phase of vectorizing first-order recurrences. An 3990 // overview of the transformation is described below. Suppose we have the 3991 // following loop. 3992 // 3993 // for (int i = 0; i < n; ++i) 3994 // b[i] = a[i] - a[i - 1]; 3995 // 3996 // There is a first-order recurrence on "a". For this loop, the shorthand 3997 // scalar IR looks like: 3998 // 3999 // scalar.ph: 4000 // s_init = a[-1] 4001 // br scalar.body 4002 // 4003 // scalar.body: 4004 // i = phi [0, scalar.ph], [i+1, scalar.body] 4005 // s1 = phi [s_init, scalar.ph], [s2, scalar.body] 4006 // s2 = a[i] 4007 // b[i] = s2 - s1 4008 // br cond, scalar.body, ... 4009 // 4010 // In this example, s1 is a recurrence because it's value depends on the 4011 // previous iteration. In the first phase of vectorization, we created a 4012 // vector phi v1 for s1. We now complete the vectorization and produce the 4013 // shorthand vector IR shown below (for VF = 4, UF = 1). 4014 // 4015 // vector.ph: 4016 // v_init = vector(..., ..., ..., a[-1]) 4017 // br vector.body 4018 // 4019 // vector.body 4020 // i = phi [0, vector.ph], [i+4, vector.body] 4021 // v1 = phi [v_init, vector.ph], [v2, vector.body] 4022 // v2 = a[i, i+1, i+2, i+3]; 4023 // v3 = vector(v1(3), v2(0, 1, 2)) 4024 // b[i, i+1, i+2, i+3] = v2 - v3 4025 // br cond, vector.body, middle.block 4026 // 4027 // middle.block: 4028 // x = v2(3) 4029 // br scalar.ph 4030 // 4031 // scalar.ph: 4032 // s_init = phi [x, middle.block], [a[-1], otherwise] 4033 // br scalar.body 4034 // 4035 // After execution completes the vector loop, we extract the next value of 4036 // the recurrence (x) to use as the initial value in the scalar loop. 4037 4038 // Extract the last vector element in the middle block. This will be the 4039 // initial value for the recurrence when jumping to the scalar loop. 4040 VPValue *PreviousDef = PhiR->getBackedgeValue(); 4041 Value *Incoming = State.get(PreviousDef, UF - 1); 4042 auto *ExtractForScalar = Incoming; 4043 auto *IdxTy = Builder.getInt32Ty(); 4044 if (VF.isVector()) { 4045 auto *One = ConstantInt::get(IdxTy, 1); 4046 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4047 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF); 4048 auto *LastIdx = Builder.CreateSub(RuntimeVF, One); 4049 ExtractForScalar = Builder.CreateExtractElement(ExtractForScalar, LastIdx, 4050 "vector.recur.extract"); 4051 } 4052 // Extract the second last element in the middle block if the 4053 // Phi is used outside the loop. We need to extract the phi itself 4054 // and not the last element (the phi update in the current iteration). This 4055 // will be the value when jumping to the exit block from the LoopMiddleBlock, 4056 // when the scalar loop is not run at all. 
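  // Continuing the VF = 4, UF = 1 example above (illustrative): the resume
  // value extracted for the scalar loop is v2(3), whereas a phi used outside
  // the loop needs v2(2), the value the scalar phi would have held in the
  // final iteration.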
4057 Value *ExtractForPhiUsedOutsideLoop = nullptr;
4058 if (VF.isVector()) {
4059 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
4060 auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2));
4061 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
4062 Incoming, Idx, "vector.recur.extract.for.phi");
4063 } else if (UF > 1)
4064 // When the loop is unrolled without vectorizing, initialize
4065 // ExtractForPhiUsedOutsideLoop with the value just prior to the unrolled
4066 // value of `Incoming`. This is analogous to the vectorized case above:
4067 // extracting the second last element when VF > 1.
4068 ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2);
4069
4070 // Fix the initial value of the original recurrence in the scalar loop.
4071 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
4072 PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
4073 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
4074 auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue();
4075 for (auto *BB : predecessors(LoopScalarPreHeader)) {
4076 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
4077 Start->addIncoming(Incoming, BB);
4078 }
4079
4080 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
4081 Phi->setName("scalar.recur");
4082
4083 // Finally, fix users of the recurrence outside the loop. The users will need
4084 // either the last value of the scalar recurrence or the last value of the
4085 // vector recurrence we extracted in the middle block. Since the loop is in
4086 // LCSSA form, we just need to find all the phi nodes for the original scalar
4087 // recurrence in the exit block, and then add an edge for the middle block.
4088 // Note that LCSSA does not imply single entry when the original scalar loop
4089 // had multiple exiting edges (as we always run the last iteration in the
4090 // scalar epilogue); in that case, there is no edge from the middle block to
4091 // the exit block, and thus no phis need to be updated.
4092 if (!Cost->requiresScalarEpilogue(VF))
4093 for (PHINode &LCSSAPhi : LoopExitBlock->phis())
4094 if (llvm::is_contained(LCSSAPhi.incoming_values(), Phi))
4095 LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
4096 }
4097
4098 void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
4099 VPTransformState &State) {
4100 PHINode *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
4101 // Get its reduction variable descriptor.
4102 assert(Legal->isReductionVariable(OrigPhi) &&
4103 "Unable to find the reduction variable");
4104 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
4105
4106 RecurKind RK = RdxDesc.getRecurrenceKind();
4107 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
4108 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
4109 setDebugLocFromInst(ReductionStartValue);
4110
4111 VPValue *LoopExitInstDef = PhiR->getBackedgeValue();
4112 // This is the vector-clone of the value that leaves the loop.
4113 Type *VecTy = State.get(LoopExitInstDef, 0)->getType();
4114
4115 // Wrap flags are in general invalid after vectorization; clear them.
4116 clearReductionWrapFlags(RdxDesc, State);
4117
4118 // Before each round, move the insertion point right between
4119 // the PHIs and the values we are going to write.
4120 // This allows us to write both PHINodes and the extractelement
4121 // instructions.
4122 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 4123 4124 setDebugLocFromInst(LoopExitInst); 4125 4126 Type *PhiTy = OrigPhi->getType(); 4127 // If tail is folded by masking, the vector value to leave the loop should be 4128 // a Select choosing between the vectorized LoopExitInst and vectorized Phi, 4129 // instead of the former. For an inloop reduction the reduction will already 4130 // be predicated, and does not need to be handled here. 4131 if (Cost->foldTailByMasking() && !PhiR->isInLoop()) { 4132 for (unsigned Part = 0; Part < UF; ++Part) { 4133 Value *VecLoopExitInst = State.get(LoopExitInstDef, Part); 4134 Value *Sel = nullptr; 4135 for (User *U : VecLoopExitInst->users()) { 4136 if (isa<SelectInst>(U)) { 4137 assert(!Sel && "Reduction exit feeding two selects"); 4138 Sel = U; 4139 } else 4140 assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select"); 4141 } 4142 assert(Sel && "Reduction exit feeds no select"); 4143 State.reset(LoopExitInstDef, Sel, Part); 4144 4145 // If the target can create a predicated operator for the reduction at no 4146 // extra cost in the loop (for example a predicated vadd), it can be 4147 // cheaper for the select to remain in the loop than be sunk out of it, 4148 // and so use the select value for the phi instead of the old 4149 // LoopExitValue. 4150 if (PreferPredicatedReductionSelect || 4151 TTI->preferPredicatedReductionSelect( 4152 RdxDesc.getOpcode(), PhiTy, 4153 TargetTransformInfo::ReductionFlags())) { 4154 auto *VecRdxPhi = 4155 cast<PHINode>(State.get(PhiR, Part)); 4156 VecRdxPhi->setIncomingValueForBlock( 4157 LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel); 4158 } 4159 } 4160 } 4161 4162 // If the vector reduction can be performed in a smaller type, we truncate 4163 // then extend the loop exit value to enable InstCombine to evaluate the 4164 // entire expression in the smaller type. 4165 if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) { 4166 assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!"); 4167 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); 4168 Builder.SetInsertPoint( 4169 LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator()); 4170 VectorParts RdxParts(UF); 4171 for (unsigned Part = 0; Part < UF; ++Part) { 4172 RdxParts[Part] = State.get(LoopExitInstDef, Part); 4173 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4174 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy) 4175 : Builder.CreateZExt(Trunc, VecTy); 4176 for (User *U : llvm::make_early_inc_range(RdxParts[Part]->users())) 4177 if (U != Trunc) { 4178 U->replaceUsesOfWith(RdxParts[Part], Extnd); 4179 RdxParts[Part] = Extnd; 4180 } 4181 } 4182 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 4183 for (unsigned Part = 0; Part < UF; ++Part) { 4184 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4185 State.reset(LoopExitInstDef, RdxParts[Part], Part); 4186 } 4187 } 4188 4189 // Reduce all of the unrolled parts into a single vector. 4190 Value *ReducedPartRdx = State.get(LoopExitInstDef, 0); 4191 unsigned Op = RecurrenceDescriptor::getOpcode(RK); 4192 4193 // The middle block terminator has already been assigned a DebugLoc here (the 4194 // OrigLoop's single latch terminator). 
We want the whole middle block to 4195 // appear to execute on this line because: (a) it is all compiler generated, 4196 // (b) these instructions are always executed after evaluating the latch 4197 // conditional branch, and (c) other passes may add new predecessors which 4198 // terminate on this line. This is the easiest way to ensure we don't 4199 // accidentally cause an extra step back into the loop while debugging. 4200 setDebugLocFromInst(LoopMiddleBlock->getTerminator()); 4201 if (PhiR->isOrdered()) 4202 ReducedPartRdx = State.get(LoopExitInstDef, UF - 1); 4203 else { 4204 // Floating-point operations should have some FMF to enable the reduction. 4205 IRBuilderBase::FastMathFlagGuard FMFG(Builder); 4206 Builder.setFastMathFlags(RdxDesc.getFastMathFlags()); 4207 for (unsigned Part = 1; Part < UF; ++Part) { 4208 Value *RdxPart = State.get(LoopExitInstDef, Part); 4209 if (Op != Instruction::ICmp && Op != Instruction::FCmp) { 4210 ReducedPartRdx = Builder.CreateBinOp( 4211 (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx"); 4212 } else if (RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK)) 4213 ReducedPartRdx = createSelectCmpOp(Builder, ReductionStartValue, RK, 4214 ReducedPartRdx, RdxPart); 4215 else 4216 ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart); 4217 } 4218 } 4219 4220 // Create the reduction after the loop. Note that inloop reductions create the 4221 // target reduction in the loop using a Reduction recipe. 4222 if (VF.isVector() && !PhiR->isInLoop()) { 4223 ReducedPartRdx = 4224 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, OrigPhi); 4225 // If the reduction can be performed in a smaller type, we need to extend 4226 // the reduction to the wider type before we branch to the original loop. 4227 if (PhiTy != RdxDesc.getRecurrenceType()) 4228 ReducedPartRdx = RdxDesc.isSigned() 4229 ? Builder.CreateSExt(ReducedPartRdx, PhiTy) 4230 : Builder.CreateZExt(ReducedPartRdx, PhiTy); 4231 } 4232 4233 PHINode *ResumePhi = 4234 dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue()); 4235 4236 // Create a phi node that merges control-flow from the backedge-taken check 4237 // block and the middle block. 4238 PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx", 4239 LoopScalarPreHeader->getTerminator()); 4240 4241 // If we are fixing reductions in the epilogue loop then we should already 4242 // have created a bc.merge.rdx Phi after the main vector body. Ensure that 4243 // we carry over the incoming values correctly. 4244 for (auto *Incoming : predecessors(LoopScalarPreHeader)) { 4245 if (Incoming == LoopMiddleBlock) 4246 BCBlockPhi->addIncoming(ReducedPartRdx, Incoming); 4247 else if (ResumePhi && llvm::is_contained(ResumePhi->blocks(), Incoming)) 4248 BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming), 4249 Incoming); 4250 else 4251 BCBlockPhi->addIncoming(ReductionStartValue, Incoming); 4252 } 4253 4254 // Set the resume value for this reduction 4255 ReductionResumeValues.insert({&RdxDesc, BCBlockPhi}); 4256 4257 // Now, we need to fix the users of the reduction variable 4258 // inside and outside of the scalar remainder loop. 4259 4260 // We know that the loop is in LCSSA form. We need to update the PHI nodes 4261 // in the exit blocks. See comment on analogous loop in 4262 // fixFirstOrderRecurrence for a more complete explaination of the logic. 
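// As an illustrative shorthand (names approximate, the exact IR depends on
// the target and recurrence kind), for an integer add reduction with VF = 4
// and UF = 2 the reduction fix-up produces roughly:
//
//   middle.block:
//     %bin.rdx = add <4 x i32> %rdx.part0, %rdx.part1
//     %rdx = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %bin.rdx)
//
//   scalar.ph:
//     %bc.merge.rdx = phi i32 [ %rdx, %middle.block ], [ %start, ... ]
//
// with the exit-block LCSSA phi and the scalar loop's header phi updated to
// use %rdx and %bc.merge.rdx, respectively.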
4263 if (!Cost->requiresScalarEpilogue(VF)) 4264 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) 4265 if (llvm::is_contained(LCSSAPhi.incoming_values(), LoopExitInst)) 4266 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock); 4267 4268 // Fix the scalar loop reduction variable with the incoming reduction sum 4269 // from the vector body and from the backedge value. 4270 int IncomingEdgeBlockIdx = 4271 OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch()); 4272 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); 4273 // Pick the other block. 4274 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); 4275 OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); 4276 OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); 4277 } 4278 4279 void InnerLoopVectorizer::clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc, 4280 VPTransformState &State) { 4281 RecurKind RK = RdxDesc.getRecurrenceKind(); 4282 if (RK != RecurKind::Add && RK != RecurKind::Mul) 4283 return; 4284 4285 Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr(); 4286 assert(LoopExitInstr && "null loop exit instruction"); 4287 SmallVector<Instruction *, 8> Worklist; 4288 SmallPtrSet<Instruction *, 8> Visited; 4289 Worklist.push_back(LoopExitInstr); 4290 Visited.insert(LoopExitInstr); 4291 4292 while (!Worklist.empty()) { 4293 Instruction *Cur = Worklist.pop_back_val(); 4294 if (isa<OverflowingBinaryOperator>(Cur)) 4295 for (unsigned Part = 0; Part < UF; ++Part) { 4296 // FIXME: Should not rely on getVPValue at this point. 4297 Value *V = State.get(State.Plan->getVPValue(Cur, true), Part); 4298 cast<Instruction>(V)->dropPoisonGeneratingFlags(); 4299 } 4300 4301 for (User *U : Cur->users()) { 4302 Instruction *UI = cast<Instruction>(U); 4303 if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) && 4304 Visited.insert(UI).second) 4305 Worklist.push_back(UI); 4306 } 4307 } 4308 } 4309 4310 void InnerLoopVectorizer::fixLCSSAPHIs(VPTransformState &State) { 4311 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 4312 if (LCSSAPhi.getBasicBlockIndex(LoopMiddleBlock) != -1) 4313 // Some phis were already hand updated by the reduction and recurrence 4314 // code above, leave them alone. 4315 continue; 4316 4317 auto *IncomingValue = LCSSAPhi.getIncomingValue(0); 4318 // Non-instruction incoming values will have only one value. 4319 4320 VPLane Lane = VPLane::getFirstLane(); 4321 if (isa<Instruction>(IncomingValue) && 4322 !Cost->isUniformAfterVectorization(cast<Instruction>(IncomingValue), 4323 VF)) 4324 Lane = VPLane::getLastLaneForVF(VF); 4325 4326 // Can be a loop invariant incoming value or the last scalar value to be 4327 // extracted from the vectorized loop. 4328 // FIXME: Should not rely on getVPValue at this point. 4329 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4330 Value *lastIncomingValue = 4331 OrigLoop->isLoopInvariant(IncomingValue) 4332 ? IncomingValue 4333 : State.get(State.Plan->getVPValue(IncomingValue, true), 4334 VPIteration(UF - 1, Lane)); 4335 LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock); 4336 } 4337 } 4338 4339 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { 4340 // The basic block and loop containing the predicated instruction. 4341 auto *PredBB = PredInst->getParent(); 4342 auto *VectorLoop = LI->getLoopFor(PredBB); 4343 4344 // Initialize a worklist with the operands of the predicated instruction. 
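// For example (illustrative): if a scalarized, predicated store has already
// been sunk into its own conditional block, the getelementptr that computes
// its address can be sunk here as well, provided all of the GEP's uses are
// inside that block.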
4345 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end()); 4346 4347 // Holds instructions that we need to analyze again. An instruction may be 4348 // reanalyzed if we don't yet know if we can sink it or not. 4349 SmallVector<Instruction *, 8> InstsToReanalyze; 4350 4351 // Returns true if a given use occurs in the predicated block. Phi nodes use 4352 // their operands in their corresponding predecessor blocks. 4353 auto isBlockOfUsePredicated = [&](Use &U) -> bool { 4354 auto *I = cast<Instruction>(U.getUser()); 4355 BasicBlock *BB = I->getParent(); 4356 if (auto *Phi = dyn_cast<PHINode>(I)) 4357 BB = Phi->getIncomingBlock( 4358 PHINode::getIncomingValueNumForOperand(U.getOperandNo())); 4359 return BB == PredBB; 4360 }; 4361 4362 // Iteratively sink the scalarized operands of the predicated instruction 4363 // into the block we created for it. When an instruction is sunk, it's 4364 // operands are then added to the worklist. The algorithm ends after one pass 4365 // through the worklist doesn't sink a single instruction. 4366 bool Changed; 4367 do { 4368 // Add the instructions that need to be reanalyzed to the worklist, and 4369 // reset the changed indicator. 4370 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end()); 4371 InstsToReanalyze.clear(); 4372 Changed = false; 4373 4374 while (!Worklist.empty()) { 4375 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val()); 4376 4377 // We can't sink an instruction if it is a phi node, is not in the loop, 4378 // or may have side effects. 4379 if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) || 4380 I->mayHaveSideEffects()) 4381 continue; 4382 4383 // If the instruction is already in PredBB, check if we can sink its 4384 // operands. In that case, VPlan's sinkScalarOperands() succeeded in 4385 // sinking the scalar instruction I, hence it appears in PredBB; but it 4386 // may have failed to sink I's operands (recursively), which we try 4387 // (again) here. 4388 if (I->getParent() == PredBB) { 4389 Worklist.insert(I->op_begin(), I->op_end()); 4390 continue; 4391 } 4392 4393 // It's legal to sink the instruction if all its uses occur in the 4394 // predicated block. Otherwise, there's nothing to do yet, and we may 4395 // need to reanalyze the instruction. 4396 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) { 4397 InstsToReanalyze.push_back(I); 4398 continue; 4399 } 4400 4401 // Move the instruction to the beginning of the predicated block, and add 4402 // it's operands to the worklist. 4403 I->moveBefore(&*PredBB->getFirstInsertionPt()); 4404 Worklist.insert(I->op_begin(), I->op_end()); 4405 4406 // The sinking may have enabled other instructions to be sunk, so we will 4407 // need to iterate. 4408 Changed = true; 4409 } 4410 } while (Changed); 4411 } 4412 4413 void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) { 4414 for (PHINode *OrigPhi : OrigPHIsToFix) { 4415 VPWidenPHIRecipe *VPPhi = 4416 cast<VPWidenPHIRecipe>(State.Plan->getVPValue(OrigPhi)); 4417 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0)); 4418 // Make sure the builder has a valid insert point. 
4419 Builder.SetInsertPoint(NewPhi); 4420 for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) { 4421 VPValue *Inc = VPPhi->getIncomingValue(i); 4422 VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i); 4423 NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]); 4424 } 4425 } 4426 } 4427 4428 bool InnerLoopVectorizer::useOrderedReductions( 4429 const RecurrenceDescriptor &RdxDesc) { 4430 return Cost->useOrderedReductions(RdxDesc); 4431 } 4432 4433 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, 4434 VPWidenPHIRecipe *PhiR, 4435 VPTransformState &State) { 4436 PHINode *P = cast<PHINode>(PN); 4437 if (EnableVPlanNativePath) { 4438 // Currently we enter here in the VPlan-native path for non-induction 4439 // PHIs where all control flow is uniform. We simply widen these PHIs. 4440 // Create a vector phi with no operands - the vector phi operands will be 4441 // set at the end of vector code generation. 4442 Type *VecTy = (State.VF.isScalar()) 4443 ? PN->getType() 4444 : VectorType::get(PN->getType(), State.VF); 4445 Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi"); 4446 State.set(PhiR, VecPhi, 0); 4447 OrigPHIsToFix.push_back(P); 4448 4449 return; 4450 } 4451 4452 assert(PN->getParent() == OrigLoop->getHeader() && 4453 "Non-header phis should have been handled elsewhere"); 4454 4455 // In order to support recurrences we need to be able to vectorize Phi nodes. 4456 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 4457 // stage #1: We create a new vector PHI node with no incoming edges. We'll use 4458 // this value when we vectorize all of the instructions that use the PHI. 4459 4460 assert(!Legal->isReductionVariable(P) && 4461 "reductions should be handled elsewhere"); 4462 4463 setDebugLocFromInst(P); 4464 4465 // This PHINode must be an induction variable. 4466 // Make sure that we know about it. 4467 assert(Legal->getInductionVars().count(P) && "Not an induction variable"); 4468 4469 InductionDescriptor II = Legal->getInductionVars().lookup(P); 4470 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 4471 4472 auto *IVR = PhiR->getParent()->getPlan()->getCanonicalIV(); 4473 PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0)); 4474 4475 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 4476 // which can be found from the original scalar operations. 4477 switch (II.getKind()) { 4478 case InductionDescriptor::IK_NoInduction: 4479 llvm_unreachable("Unknown induction"); 4480 case InductionDescriptor::IK_IntInduction: 4481 case InductionDescriptor::IK_FpInduction: 4482 llvm_unreachable("Integer/fp induction is handled elsewhere."); 4483 case InductionDescriptor::IK_PtrInduction: { 4484 // Handle the pointer induction variable case. 4485 assert(P->getType()->isPointerTy() && "Unexpected type."); 4486 4487 if (Cost->isScalarAfterVectorization(P, State.VF)) { 4488 // This is the normalized GEP that starts counting at zero. 4489 Value *PtrInd = 4490 Builder.CreateSExtOrTrunc(CanonicalIV, II.getStep()->getType()); 4491 // Determine the number of scalars we need to generate for each unroll 4492 // iteration. If the instruction is uniform, we only need to generate the 4493 // first lane. Otherwise, we generate all VF values. 4494 bool IsUniform = vputils::onlyFirstLaneUsed(PhiR); 4495 assert((IsUniform || !State.VF.isScalable()) && 4496 "Cannot scalarize a scalable VF"); 4497 unsigned Lanes = IsUniform ? 
1 : State.VF.getFixedValue(); 4498 4499 for (unsigned Part = 0; Part < UF; ++Part) { 4500 Value *PartStart = 4501 createStepForVF(Builder, PtrInd->getType(), VF, Part); 4502 4503 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 4504 Value *Idx = Builder.CreateAdd( 4505 PartStart, ConstantInt::get(PtrInd->getType(), Lane)); 4506 Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); 4507 4508 Value *Step = CreateStepValue(II.getStep(), *PSE.getSE(), 4509 State.CFG.PrevBB->getTerminator()); 4510 Value *SclrGep = emitTransformedIndex(Builder, GlobalIdx, Step, II); 4511 SclrGep->setName("next.gep"); 4512 State.set(PhiR, SclrGep, VPIteration(Part, Lane)); 4513 } 4514 } 4515 return; 4516 } 4517 assert(isa<SCEVConstant>(II.getStep()) && 4518 "Induction step not a SCEV constant!"); 4519 Type *PhiType = II.getStep()->getType(); 4520 4521 // Build a pointer phi 4522 Value *ScalarStartValue = PhiR->getStartValue()->getLiveInIRValue(); 4523 Type *ScStValueType = ScalarStartValue->getType(); 4524 PHINode *NewPointerPhi = 4525 PHINode::Create(ScStValueType, 2, "pointer.phi", CanonicalIV); 4526 NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader); 4527 4528 // A pointer induction, performed by using a gep 4529 BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 4530 Instruction *InductionLoc = LoopLatch->getTerminator(); 4531 const SCEV *ScalarStep = II.getStep(); 4532 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 4533 Value *ScalarStepValue = 4534 Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc); 4535 Value *RuntimeVF = getRuntimeVF(Builder, PhiType, VF); 4536 Value *NumUnrolledElems = 4537 Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF)); 4538 Value *InductionGEP = GetElementPtrInst::Create( 4539 II.getElementType(), NewPointerPhi, 4540 Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind", 4541 InductionLoc); 4542 NewPointerPhi->addIncoming(InductionGEP, LoopLatch); 4543 4544 // Create UF many actual address geps that use the pointer 4545 // phi as base and a vectorized version of the step value 4546 // (<step*0, ..., step*N>) as offset. 4547 for (unsigned Part = 0; Part < State.UF; ++Part) { 4548 Type *VecPhiType = VectorType::get(PhiType, State.VF); 4549 Value *StartOffsetScalar = 4550 Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part)); 4551 Value *StartOffset = 4552 Builder.CreateVectorSplat(State.VF, StartOffsetScalar); 4553 // Create a vector of consecutive numbers from zero to VF. 4554 StartOffset = 4555 Builder.CreateAdd(StartOffset, Builder.CreateStepVector(VecPhiType)); 4556 4557 Value *GEP = Builder.CreateGEP( 4558 II.getElementType(), NewPointerPhi, 4559 Builder.CreateMul( 4560 StartOffset, Builder.CreateVectorSplat(State.VF, ScalarStepValue), 4561 "vector.gep")); 4562 State.set(PhiR, GEP, Part); 4563 } 4564 } 4565 } 4566 } 4567 4568 /// A helper function for checking whether an integer division-related 4569 /// instruction may divide by zero (in which case it must be predicated if 4570 /// executed conditionally in the scalar code). 4571 /// TODO: It may be worthwhile to generalize and check isKnownNonZero(). 4572 /// Non-zero divisors that are non compile-time constants will not be 4573 /// converted into multiplication, so we will still end up scalarizing 4574 /// the division, but can do so w/o predication. 
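/// For example (illustrative): in 'if (b[i] != 0) q[i] = a[i] / b[i];' the
/// divisor is loop-varying, so the scalarized sdiv must remain predicated,
/// whereas a division by a non-zero constant such as 'a[i] / 7' can be
/// scalarized unconditionally.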
4575 static bool mayDivideByZero(Instruction &I) { 4576 assert((I.getOpcode() == Instruction::UDiv || 4577 I.getOpcode() == Instruction::SDiv || 4578 I.getOpcode() == Instruction::URem || 4579 I.getOpcode() == Instruction::SRem) && 4580 "Unexpected instruction"); 4581 Value *Divisor = I.getOperand(1); 4582 auto *CInt = dyn_cast<ConstantInt>(Divisor); 4583 return !CInt || CInt->isZero(); 4584 } 4585 4586 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, 4587 VPUser &ArgOperands, 4588 VPTransformState &State) { 4589 assert(!isa<DbgInfoIntrinsic>(I) && 4590 "DbgInfoIntrinsic should have been dropped during VPlan construction"); 4591 setDebugLocFromInst(&I); 4592 4593 Module *M = I.getParent()->getParent()->getParent(); 4594 auto *CI = cast<CallInst>(&I); 4595 4596 SmallVector<Type *, 4> Tys; 4597 for (Value *ArgOperand : CI->args()) 4598 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue())); 4599 4600 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 4601 4602 // The flag shows whether we use Intrinsic or a usual Call for vectorized 4603 // version of the instruction. 4604 // Is it beneficial to perform intrinsic call compared to lib call? 4605 bool NeedToScalarize = false; 4606 InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); 4607 InstructionCost IntrinsicCost = ID ? Cost->getVectorIntrinsicCost(CI, VF) : 0; 4608 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 4609 assert((UseVectorIntrinsic || !NeedToScalarize) && 4610 "Instruction should be scalarized elsewhere."); 4611 assert((IntrinsicCost.isValid() || CallCost.isValid()) && 4612 "Either the intrinsic cost or vector call cost must be valid"); 4613 4614 for (unsigned Part = 0; Part < UF; ++Part) { 4615 SmallVector<Type *, 2> TysForDecl = {CI->getType()}; 4616 SmallVector<Value *, 4> Args; 4617 for (auto &I : enumerate(ArgOperands.operands())) { 4618 // Some intrinsics have a scalar argument - don't replace it with a 4619 // vector. 4620 Value *Arg; 4621 if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index())) 4622 Arg = State.get(I.value(), Part); 4623 else { 4624 Arg = State.get(I.value(), VPIteration(0, 0)); 4625 if (hasVectorInstrinsicOverloadedScalarOpd(ID, I.index())) 4626 TysForDecl.push_back(Arg->getType()); 4627 } 4628 Args.push_back(Arg); 4629 } 4630 4631 Function *VectorF; 4632 if (UseVectorIntrinsic) { 4633 // Use vector version of the intrinsic. 4634 if (VF.isVector()) 4635 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); 4636 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); 4637 assert(VectorF && "Can't retrieve vector intrinsic."); 4638 } else { 4639 // Use vector version of the function call. 4640 const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 4641 #ifndef NDEBUG 4642 assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr && 4643 "Can't create vector function."); 4644 #endif 4645 VectorF = VFDatabase(*CI).getVectorizedFunction(Shape); 4646 } 4647 SmallVector<OperandBundleDef, 1> OpBundles; 4648 CI->getOperandBundlesAsDefs(OpBundles); 4649 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles); 4650 4651 if (isa<FPMathOperator>(V)) 4652 V->copyFastMathFlags(CI); 4653 4654 State.set(Def, V, Part); 4655 addMetadata(V, &I); 4656 } 4657 } 4658 4659 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { 4660 // We should not collect Scalars more than once per VF. 
Right now, this
4661 // function is called from collectUniformsAndScalars(), which already does
4662 // this check. Collecting Scalars for VF=1 does not make any sense.
4663 assert(VF.isVector() && Scalars.find(VF) == Scalars.end() &&
4664 "This function should not be visited twice for the same VF");
4665
4666 SmallSetVector<Instruction *, 8> Worklist;
4667
4668 // These sets are used to seed the analysis with pointers used by memory
4669 // accesses that will remain scalar.
4670 SmallSetVector<Instruction *, 8> ScalarPtrs;
4671 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
4672 auto *Latch = TheLoop->getLoopLatch();
4673
4674 // A helper that returns true if the use of Ptr by MemAccess will be scalar.
4675 // The pointer operands of loads and stores will be scalar as long as the
4676 // memory access is not a gather or scatter operation. The value operand of a
4677 // store will remain scalar if the store is scalarized.
4678 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
4679 InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
4680 assert(WideningDecision != CM_Unknown &&
4681 "Widening decision should be ready at this moment");
4682 if (auto *Store = dyn_cast<StoreInst>(MemAccess))
4683 if (Ptr == Store->getValueOperand())
4684 return WideningDecision == CM_Scalarize;
4685 assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
4686 "Ptr is neither a value nor a pointer operand");
4687 return WideningDecision != CM_GatherScatter;
4688 };
4689
4690 // A helper that returns true if the given value is a bitcast or
4691 // getelementptr instruction contained in the loop.
4692 auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
4693 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
4694 isa<GetElementPtrInst>(V)) &&
4695 !TheLoop->isLoopInvariant(V);
4696 };
4697
4698 // A helper that evaluates a memory access's use of a pointer. If the use will
4699 // be a scalar use and the pointer is only used by memory accesses, we place
4700 // the pointer in ScalarPtrs. Otherwise, the pointer is placed in
4701 // PossibleNonScalarPtrs.
4702 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
4703 // We only care about bitcast and getelementptr instructions contained in
4704 // the loop.
4705 if (!isLoopVaryingBitCastOrGEP(Ptr))
4706 return;
4707
4708 // If the pointer has already been identified as scalar (e.g., if it was
4709 // also identified as uniform), there's nothing to do.
4710 auto *I = cast<Instruction>(Ptr);
4711 if (Worklist.count(I))
4712 return;
4713
4714 // If the use of the pointer will be a scalar use, and all users of the
4715 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
4716 // place the pointer in PossibleNonScalarPtrs.
4717 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
4718 return isa<LoadInst>(U) || isa<StoreInst>(U);
4719 }))
4720 ScalarPtrs.insert(I);
4721 else
4722 PossibleNonScalarPtrs.insert(I);
4723 };
4724
4725 // We seed the scalars analysis with two classes of instructions: (1)
4726 // instructions marked uniform-after-vectorization and (2) bitcast,
4727 // getelementptr and (pointer) phi instructions used by memory accesses
4728 // requiring a scalar use.
4729 //
4730 // (1) Add to the worklist all instructions that have been identified as
4731 // uniform-after-vectorization.
4732 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end()); 4733 4734 // (2) Add to the worklist all bitcast and getelementptr instructions used by 4735 // memory accesses requiring a scalar use. The pointer operands of loads and 4736 // stores will be scalar as long as the memory accesses is not a gather or 4737 // scatter operation. The value operand of a store will remain scalar if the 4738 // store is scalarized. 4739 for (auto *BB : TheLoop->blocks()) 4740 for (auto &I : *BB) { 4741 if (auto *Load = dyn_cast<LoadInst>(&I)) { 4742 evaluatePtrUse(Load, Load->getPointerOperand()); 4743 } else if (auto *Store = dyn_cast<StoreInst>(&I)) { 4744 evaluatePtrUse(Store, Store->getPointerOperand()); 4745 evaluatePtrUse(Store, Store->getValueOperand()); 4746 } 4747 } 4748 for (auto *I : ScalarPtrs) 4749 if (!PossibleNonScalarPtrs.count(I)) { 4750 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n"); 4751 Worklist.insert(I); 4752 } 4753 4754 // Insert the forced scalars. 4755 // FIXME: Currently widenPHIInstruction() often creates a dead vector 4756 // induction variable when the PHI user is scalarized. 4757 auto ForcedScalar = ForcedScalars.find(VF); 4758 if (ForcedScalar != ForcedScalars.end()) 4759 for (auto *I : ForcedScalar->second) 4760 Worklist.insert(I); 4761 4762 // Expand the worklist by looking through any bitcasts and getelementptr 4763 // instructions we've already identified as scalar. This is similar to the 4764 // expansion step in collectLoopUniforms(); however, here we're only 4765 // expanding to include additional bitcasts and getelementptr instructions. 4766 unsigned Idx = 0; 4767 while (Idx != Worklist.size()) { 4768 Instruction *Dst = Worklist[Idx++]; 4769 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0))) 4770 continue; 4771 auto *Src = cast<Instruction>(Dst->getOperand(0)); 4772 if (llvm::all_of(Src->users(), [&](User *U) -> bool { 4773 auto *J = cast<Instruction>(U); 4774 return !TheLoop->contains(J) || Worklist.count(J) || 4775 ((isa<LoadInst>(J) || isa<StoreInst>(J)) && 4776 isScalarUse(J, Src)); 4777 })) { 4778 Worklist.insert(Src); 4779 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); 4780 } 4781 } 4782 4783 // An induction variable will remain scalar if all users of the induction 4784 // variable and induction variable update remain scalar. 4785 for (auto &Induction : Legal->getInductionVars()) { 4786 auto *Ind = Induction.first; 4787 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4788 4789 // If tail-folding is applied, the primary induction variable will be used 4790 // to feed a vector compare. 4791 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking()) 4792 continue; 4793 4794 // Returns true if \p Indvar is a pointer induction that is used directly by 4795 // load/store instruction \p I. 4796 auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar, 4797 Instruction *I) { 4798 return Induction.second.getKind() == 4799 InductionDescriptor::IK_PtrInduction && 4800 (isa<LoadInst>(I) || isa<StoreInst>(I)) && 4801 Indvar == getLoadStorePointerOperand(I) && isScalarUse(I, Indvar); 4802 }; 4803 4804 // Determine if all users of the induction variable are scalar after 4805 // vectorization. 
4806 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 4807 auto *I = cast<Instruction>(U); 4808 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 4809 IsDirectLoadStoreFromPtrIndvar(Ind, I); 4810 }); 4811 if (!ScalarInd) 4812 continue; 4813 4814 // Determine if all users of the induction variable update instruction are 4815 // scalar after vectorization. 4816 auto ScalarIndUpdate = 4817 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 4818 auto *I = cast<Instruction>(U); 4819 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 4820 IsDirectLoadStoreFromPtrIndvar(IndUpdate, I); 4821 }); 4822 if (!ScalarIndUpdate) 4823 continue; 4824 4825 // The induction variable and its update instruction will remain scalar. 4826 Worklist.insert(Ind); 4827 Worklist.insert(IndUpdate); 4828 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 4829 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 4830 << "\n"); 4831 } 4832 4833 Scalars[VF].insert(Worklist.begin(), Worklist.end()); 4834 } 4835 4836 bool LoopVectorizationCostModel::isScalarWithPredication( 4837 Instruction *I, ElementCount VF) const { 4838 if (!blockNeedsPredicationForAnyReason(I->getParent())) 4839 return false; 4840 switch(I->getOpcode()) { 4841 default: 4842 break; 4843 case Instruction::Load: 4844 case Instruction::Store: { 4845 if (!Legal->isMaskRequired(I)) 4846 return false; 4847 auto *Ptr = getLoadStorePointerOperand(I); 4848 auto *Ty = getLoadStoreType(I); 4849 Type *VTy = Ty; 4850 if (VF.isVector()) 4851 VTy = VectorType::get(Ty, VF); 4852 const Align Alignment = getLoadStoreAlignment(I); 4853 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) || 4854 TTI.isLegalMaskedGather(VTy, Alignment)) 4855 : !(isLegalMaskedStore(Ty, Ptr, Alignment) || 4856 TTI.isLegalMaskedScatter(VTy, Alignment)); 4857 } 4858 case Instruction::UDiv: 4859 case Instruction::SDiv: 4860 case Instruction::SRem: 4861 case Instruction::URem: 4862 return mayDivideByZero(*I); 4863 } 4864 return false; 4865 } 4866 4867 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( 4868 Instruction *I, ElementCount VF) { 4869 assert(isAccessInterleaved(I) && "Expecting interleaved access."); 4870 assert(getWideningDecision(I, VF) == CM_Unknown && 4871 "Decision should not be set yet."); 4872 auto *Group = getInterleavedAccessGroup(I); 4873 assert(Group && "Must have a group."); 4874 4875 // If the instruction's allocated size doesn't equal it's type size, it 4876 // requires padding and will be scalarized. 4877 auto &DL = I->getModule()->getDataLayout(); 4878 auto *ScalarTy = getLoadStoreType(I); 4879 if (hasIrregularType(ScalarTy, DL)) 4880 return false; 4881 4882 // Check if masking is required. 4883 // A Group may need masking for one of two reasons: it resides in a block that 4884 // needs predication, or it was decided to use masking to deal with gaps 4885 // (either a gap at the end of a load-access that may result in a speculative 4886 // load, or any gaps in a store-access). 
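// For example (illustrative): a load group {A[2*i], A[2*i+1]} in a block
// guarded by 'if (c[i])' needs a masked interleaved load, and a store group
// that writes only two members of an interleave factor of three has a gap
// and likewise requires masking.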
4887 bool PredicatedAccessRequiresMasking = 4888 blockNeedsPredicationForAnyReason(I->getParent()) && 4889 Legal->isMaskRequired(I); 4890 bool LoadAccessWithGapsRequiresEpilogMasking = 4891 isa<LoadInst>(I) && Group->requiresScalarEpilogue() && 4892 !isScalarEpilogueAllowed(); 4893 bool StoreAccessWithGapsRequiresMasking = 4894 isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()); 4895 if (!PredicatedAccessRequiresMasking && 4896 !LoadAccessWithGapsRequiresEpilogMasking && 4897 !StoreAccessWithGapsRequiresMasking) 4898 return true; 4899 4900 // If masked interleaving is required, we expect that the user/target had 4901 // enabled it, because otherwise it either wouldn't have been created or 4902 // it should have been invalidated by the CostModel. 4903 assert(useMaskedInterleavedAccesses(TTI) && 4904 "Masked interleave-groups for predicated accesses are not enabled."); 4905 4906 if (Group->isReverse()) 4907 return false; 4908 4909 auto *Ty = getLoadStoreType(I); 4910 const Align Alignment = getLoadStoreAlignment(I); 4911 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment) 4912 : TTI.isLegalMaskedStore(Ty, Alignment); 4913 } 4914 4915 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( 4916 Instruction *I, ElementCount VF) { 4917 // Get and ensure we have a valid memory instruction. 4918 assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction"); 4919 4920 auto *Ptr = getLoadStorePointerOperand(I); 4921 auto *ScalarTy = getLoadStoreType(I); 4922 4923 // In order to be widened, the pointer should be consecutive, first of all. 4924 if (!Legal->isConsecutivePtr(ScalarTy, Ptr)) 4925 return false; 4926 4927 // If the instruction is a store located in a predicated block, it will be 4928 // scalarized. 4929 if (isScalarWithPredication(I, VF)) 4930 return false; 4931 4932 // If the instruction's allocated size doesn't equal it's type size, it 4933 // requires padding and will be scalarized. 4934 auto &DL = I->getModule()->getDataLayout(); 4935 if (hasIrregularType(ScalarTy, DL)) 4936 return false; 4937 4938 return true; 4939 } 4940 4941 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { 4942 // We should not collect Uniforms more than once per VF. Right now, 4943 // this function is called from collectUniformsAndScalars(), which 4944 // already does this check. Collecting Uniforms for VF=1 does not make any 4945 // sense. 4946 4947 assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() && 4948 "This function should not be visited twice for the same VF"); 4949 4950 // Visit the list of Uniforms. If we'll not find any uniform value, we'll 4951 // not analyze again. Uniforms.count(VF) will return 1. 4952 Uniforms[VF].clear(); 4953 4954 // We now know that the loop is vectorizable! 4955 // Collect instructions inside the loop that will remain uniform after 4956 // vectorization. 4957 4958 // Global values, params and instructions outside of current loop are out of 4959 // scope. 4960 auto isOutOfScope = [&](Value *V) -> bool { 4961 Instruction *I = dyn_cast<Instruction>(V); 4962 return (!I || !TheLoop->contains(I)); 4963 }; 4964 4965 // Worklist containing uniform instructions demanding lane 0. 4966 SetVector<Instruction *> Worklist; 4967 BasicBlock *Latch = TheLoop->getLoopLatch(); 4968 4969 // Add uniform instructions demanding lane 0 to the worklist. 
Instructions 4970 // that are scalar with predication must not be considered uniform after 4971 // vectorization, because that would create an erroneous replicating region 4972 // where only a single instance out of VF should be formed. 4973 // TODO: optimize such seldom cases if found important, see PR40816. 4974 auto addToWorklistIfAllowed = [&](Instruction *I) -> void { 4975 if (isOutOfScope(I)) { 4976 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: " 4977 << *I << "\n"); 4978 return; 4979 } 4980 if (isScalarWithPredication(I, VF)) { 4981 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " 4982 << *I << "\n"); 4983 return; 4984 } 4985 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n"); 4986 Worklist.insert(I); 4987 }; 4988 4989 // Start with the conditional branch. If the branch condition is an 4990 // instruction contained in the loop that is only used by the branch, it is 4991 // uniform. 4992 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 4993 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) 4994 addToWorklistIfAllowed(Cmp); 4995 4996 auto isUniformDecision = [&](Instruction *I, ElementCount VF) { 4997 InstWidening WideningDecision = getWideningDecision(I, VF); 4998 assert(WideningDecision != CM_Unknown && 4999 "Widening decision should be ready at this moment"); 5000 5001 // A uniform memory op is itself uniform. We exclude uniform stores 5002 // here as they demand the last lane, not the first one. 5003 if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) { 5004 assert(WideningDecision == CM_Scalarize); 5005 return true; 5006 } 5007 5008 return (WideningDecision == CM_Widen || 5009 WideningDecision == CM_Widen_Reverse || 5010 WideningDecision == CM_Interleave); 5011 }; 5012 5013 5014 // Returns true if Ptr is the pointer operand of a memory access instruction 5015 // I, and I is known to not require scalarization. 5016 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { 5017 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); 5018 }; 5019 5020 // Holds a list of values which are known to have at least one uniform use. 5021 // Note that there may be other uses which aren't uniform. A "uniform use" 5022 // here is something which only demands lane 0 of the unrolled iterations; 5023 // it does not imply that all lanes produce the same value (e.g. this is not 5024 // the usual meaning of uniform) 5025 SetVector<Value *> HasUniformUse; 5026 5027 // Scan the loop for instructions which are either a) known to have only 5028 // lane 0 demanded or b) are uses which demand only lane 0 of their operand. 5029 for (auto *BB : TheLoop->blocks()) 5030 for (auto &I : *BB) { 5031 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) { 5032 switch (II->getIntrinsicID()) { 5033 case Intrinsic::sideeffect: 5034 case Intrinsic::experimental_noalias_scope_decl: 5035 case Intrinsic::assume: 5036 case Intrinsic::lifetime_start: 5037 case Intrinsic::lifetime_end: 5038 if (TheLoop->hasLoopInvariantOperands(&I)) 5039 addToWorklistIfAllowed(&I); 5040 break; 5041 default: 5042 break; 5043 } 5044 } 5045 5046 // ExtractValue instructions must be uniform, because the operands are 5047 // known to be loop-invariant. 
5048 if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) { 5049 assert(isOutOfScope(EVI->getAggregateOperand()) && 5050 "Expected aggregate value to be loop invariant"); 5051 addToWorklistIfAllowed(EVI); 5052 continue; 5053 } 5054 5055 // If there's no pointer operand, there's nothing to do. 5056 auto *Ptr = getLoadStorePointerOperand(&I); 5057 if (!Ptr) 5058 continue; 5059 5060 // A uniform memory op is itself uniform. We exclude uniform stores 5061 // here as they demand the last lane, not the first one. 5062 if (isa<LoadInst>(I) && Legal->isUniformMemOp(I)) 5063 addToWorklistIfAllowed(&I); 5064 5065 if (isUniformDecision(&I, VF)) { 5066 assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check"); 5067 HasUniformUse.insert(Ptr); 5068 } 5069 } 5070 5071 // Add to the worklist any operands which have *only* uniform (e.g. lane 0 5072 // demanding) users. Since loops are assumed to be in LCSSA form, this 5073 // disallows uses outside the loop as well. 5074 for (auto *V : HasUniformUse) { 5075 if (isOutOfScope(V)) 5076 continue; 5077 auto *I = cast<Instruction>(V); 5078 auto UsersAreMemAccesses = 5079 llvm::all_of(I->users(), [&](User *U) -> bool { 5080 return isVectorizedMemAccessUse(cast<Instruction>(U), V); 5081 }); 5082 if (UsersAreMemAccesses) 5083 addToWorklistIfAllowed(I); 5084 } 5085 5086 // Expand Worklist in topological order: whenever a new instruction 5087 // is added , its users should be already inside Worklist. It ensures 5088 // a uniform instruction will only be used by uniform instructions. 5089 unsigned idx = 0; 5090 while (idx != Worklist.size()) { 5091 Instruction *I = Worklist[idx++]; 5092 5093 for (auto OV : I->operand_values()) { 5094 // isOutOfScope operands cannot be uniform instructions. 5095 if (isOutOfScope(OV)) 5096 continue; 5097 // First order recurrence Phi's should typically be considered 5098 // non-uniform. 5099 auto *OP = dyn_cast<PHINode>(OV); 5100 if (OP && Legal->isFirstOrderRecurrence(OP)) 5101 continue; 5102 // If all the users of the operand are uniform, then add the 5103 // operand into the uniform worklist. 5104 auto *OI = cast<Instruction>(OV); 5105 if (llvm::all_of(OI->users(), [&](User *U) -> bool { 5106 auto *J = cast<Instruction>(U); 5107 return Worklist.count(J) || isVectorizedMemAccessUse(J, OI); 5108 })) 5109 addToWorklistIfAllowed(OI); 5110 } 5111 } 5112 5113 // For an instruction to be added into Worklist above, all its users inside 5114 // the loop should also be in Worklist. However, this condition cannot be 5115 // true for phi nodes that form a cyclic dependence. We must process phi 5116 // nodes separately. An induction variable will remain uniform if all users 5117 // of the induction variable and induction variable update remain uniform. 5118 // The code below handles both pointer and non-pointer induction variables. 5119 for (auto &Induction : Legal->getInductionVars()) { 5120 auto *Ind = Induction.first; 5121 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 5122 5123 // Determine if all users of the induction variable are uniform after 5124 // vectorization. 5125 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 5126 auto *I = cast<Instruction>(U); 5127 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 5128 isVectorizedMemAccessUse(I, Ind); 5129 }); 5130 if (!UniformInd) 5131 continue; 5132 5133 // Determine if all users of the induction variable update instruction are 5134 // uniform after vectorization. 
5135 auto UniformIndUpdate = 5136 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 5137 auto *I = cast<Instruction>(U); 5138 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 5139 isVectorizedMemAccessUse(I, IndUpdate); 5140 }); 5141 if (!UniformIndUpdate) 5142 continue; 5143 5144 // The induction variable and its update instruction will remain uniform. 5145 addToWorklistIfAllowed(Ind); 5146 addToWorklistIfAllowed(IndUpdate); 5147 } 5148 5149 Uniforms[VF].insert(Worklist.begin(), Worklist.end()); 5150 } 5151 5152 bool LoopVectorizationCostModel::runtimeChecksRequired() { 5153 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); 5154 5155 if (Legal->getRuntimePointerChecking()->Need) { 5156 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz", 5157 "runtime pointer checks needed. Enable vectorization of this " 5158 "loop with '#pragma clang loop vectorize(enable)' when " 5159 "compiling with -Os/-Oz", 5160 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5161 return true; 5162 } 5163 5164 if (!PSE.getPredicate().isAlwaysTrue()) { 5165 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz", 5166 "runtime SCEV checks needed. Enable vectorization of this " 5167 "loop with '#pragma clang loop vectorize(enable)' when " 5168 "compiling with -Os/-Oz", 5169 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5170 return true; 5171 } 5172 5173 // FIXME: Avoid specializing for stride==1 instead of bailing out. 5174 if (!Legal->getLAI()->getSymbolicStrides().empty()) { 5175 reportVectorizationFailure("Runtime stride check for small trip count", 5176 "runtime stride == 1 checks needed. Enable vectorization of " 5177 "this loop without such check by compiling with -Os/-Oz", 5178 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5179 return true; 5180 } 5181 5182 return false; 5183 } 5184 5185 ElementCount 5186 LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) { 5187 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) 5188 return ElementCount::getScalable(0); 5189 5190 if (Hints->isScalableVectorizationDisabled()) { 5191 reportVectorizationInfo("Scalable vectorization is explicitly disabled", 5192 "ScalableVectorizationDisabled", ORE, TheLoop); 5193 return ElementCount::getScalable(0); 5194 } 5195 5196 LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n"); 5197 5198 auto MaxScalableVF = ElementCount::getScalable( 5199 std::numeric_limits<ElementCount::ScalarTy>::max()); 5200 5201 // Test that the loop-vectorizer can legalize all operations for this MaxVF. 5202 // FIXME: While for scalable vectors this is currently sufficient, this should 5203 // be replaced by a more detailed mechanism that filters out specific VFs, 5204 // instead of invalidating vectorization for a whole set of VFs based on the 5205 // MaxVF. 5206 5207 // Disable scalable vectorization if the loop contains unsupported reductions. 5208 if (!canVectorizeReductions(MaxScalableVF)) { 5209 reportVectorizationInfo( 5210 "Scalable vectorization not supported for the reduction " 5211 "operations found in this loop.", 5212 "ScalableVFUnfeasible", ORE, TheLoop); 5213 return ElementCount::getScalable(0); 5214 } 5215 5216 // Disable scalable vectorization if the loop contains any instructions 5217 // with element types not supported for scalable vectors. 
5218 if (any_of(ElementTypesInLoop, [&](Type *Ty) { 5219 return !Ty->isVoidTy() && 5220 !this->TTI.isElementTypeLegalForScalableVector(Ty); 5221 })) { 5222 reportVectorizationInfo("Scalable vectorization is not supported " 5223 "for all element types found in this loop.", 5224 "ScalableVFUnfeasible", ORE, TheLoop); 5225 return ElementCount::getScalable(0); 5226 } 5227 5228 if (Legal->isSafeForAnyVectorWidth()) 5229 return MaxScalableVF; 5230 5231 // Limit MaxScalableVF by the maximum safe dependence distance. 5232 Optional<unsigned> MaxVScale = TTI.getMaxVScale(); 5233 if (!MaxVScale && TheFunction->hasFnAttribute(Attribute::VScaleRange)) 5234 MaxVScale = 5235 TheFunction->getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax(); 5236 MaxScalableVF = ElementCount::getScalable( 5237 MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0); 5238 if (!MaxScalableVF) 5239 reportVectorizationInfo( 5240 "Max legal vector width too small, scalable vectorization " 5241 "unfeasible.", 5242 "ScalableVFUnfeasible", ORE, TheLoop); 5243 5244 return MaxScalableVF; 5245 } 5246 5247 FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF( 5248 unsigned ConstTripCount, ElementCount UserVF, bool FoldTailByMasking) { 5249 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); 5250 unsigned SmallestType, WidestType; 5251 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); 5252 5253 // Get the maximum safe dependence distance in bits computed by LAA. 5254 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from 5255 // the memory accesses that is most restrictive (involved in the smallest 5256 // dependence distance). 5257 unsigned MaxSafeElements = 5258 PowerOf2Floor(Legal->getMaxSafeVectorWidthInBits() / WidestType); 5259 5260 auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements); 5261 auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements); 5262 5263 LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF 5264 << ".\n"); 5265 LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF 5266 << ".\n"); 5267 5268 // First analyze the UserVF, fall back if the UserVF should be ignored. 5269 if (UserVF) { 5270 auto MaxSafeUserVF = 5271 UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF; 5272 5273 if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) { 5274 // If `VF=vscale x N` is safe, then so is `VF=N` 5275 if (UserVF.isScalable()) 5276 return FixedScalableVFPair( 5277 ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF); 5278 else 5279 return UserVF; 5280 } 5281 5282 assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF)); 5283 5284 // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it 5285 // is better to ignore the hint and let the compiler choose a suitable VF. 
5286 if (!UserVF.isScalable()) { 5287 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 5288 << " is unsafe, clamping to max safe VF=" 5289 << MaxSafeFixedVF << ".\n"); 5290 ORE->emit([&]() { 5291 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 5292 TheLoop->getStartLoc(), 5293 TheLoop->getHeader()) 5294 << "User-specified vectorization factor " 5295 << ore::NV("UserVectorizationFactor", UserVF) 5296 << " is unsafe, clamping to maximum safe vectorization factor " 5297 << ore::NV("VectorizationFactor", MaxSafeFixedVF); 5298 }); 5299 return MaxSafeFixedVF; 5300 } 5301 5302 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) { 5303 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 5304 << " is ignored because scalable vectors are not " 5305 "available.\n"); 5306 ORE->emit([&]() { 5307 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 5308 TheLoop->getStartLoc(), 5309 TheLoop->getHeader()) 5310 << "User-specified vectorization factor " 5311 << ore::NV("UserVectorizationFactor", UserVF) 5312 << " is ignored because the target does not support scalable " 5313 "vectors. The compiler will pick a more suitable value."; 5314 }); 5315 } else { 5316 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 5317 << " is unsafe. Ignoring scalable UserVF.\n"); 5318 ORE->emit([&]() { 5319 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 5320 TheLoop->getStartLoc(), 5321 TheLoop->getHeader()) 5322 << "User-specified vectorization factor " 5323 << ore::NV("UserVectorizationFactor", UserVF) 5324 << " is unsafe. Ignoring the hint to let the compiler pick a " 5325 "more suitable value."; 5326 }); 5327 } 5328 } 5329 5330 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType 5331 << " / " << WidestType << " bits.\n"); 5332 5333 FixedScalableVFPair Result(ElementCount::getFixed(1), 5334 ElementCount::getScalable(0)); 5335 if (auto MaxVF = 5336 getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType, 5337 MaxSafeFixedVF, FoldTailByMasking)) 5338 Result.FixedVF = MaxVF; 5339 5340 if (auto MaxVF = 5341 getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType, 5342 MaxSafeScalableVF, FoldTailByMasking)) 5343 if (MaxVF.isScalable()) { 5344 Result.ScalableVF = MaxVF; 5345 LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF 5346 << "\n"); 5347 } 5348 5349 return Result; 5350 } 5351 5352 FixedScalableVFPair 5353 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { 5354 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { 5355 // TODO: It may by useful to do since it's still likely to be dynamically 5356 // uniform if the target can skip. 5357 reportVectorizationFailure( 5358 "Not inserting runtime ptr check for divergent target", 5359 "runtime pointer checks needed. 
Not enabled for divergent target",
5360 "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
5361 return FixedScalableVFPair::getNone();
5362 }
5363
5364 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5365 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
5366 if (TC == 1) {
5367 reportVectorizationFailure("Single iteration (non) loop",
5368 "loop trip count is one, irrelevant for vectorization",
5369 "SingleIterationLoop", ORE, TheLoop);
5370 return FixedScalableVFPair::getNone();
5371 }
5372
5373 switch (ScalarEpilogueStatus) {
5374 case CM_ScalarEpilogueAllowed:
5375 return computeFeasibleMaxVF(TC, UserVF, false);
5376 case CM_ScalarEpilogueNotAllowedUsePredicate:
5377 LLVM_FALLTHROUGH;
5378 case CM_ScalarEpilogueNotNeededUsePredicate:
5379 LLVM_DEBUG(
5380 dbgs() << "LV: vector predicate hint/switch found.\n"
5381 << "LV: Not allowing scalar epilogue, creating predicated "
5382 << "vector loop.\n");
5383 break;
5384 case CM_ScalarEpilogueNotAllowedLowTripLoop:
5385 // fallthrough as a special case of OptForSize
5386 case CM_ScalarEpilogueNotAllowedOptSize:
5387 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
5388 LLVM_DEBUG(
5389 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
5390 else
5391 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
5392 << "count.\n");
5393
5394 // Bail if runtime checks are required, which are not good when optimizing
5395 // for size.
5396 if (runtimeChecksRequired())
5397 return FixedScalableVFPair::getNone();
5398
5399 break;
5400 }
5401
5402 // The only loops we can vectorize without a scalar epilogue are loops with
5403 // a bottom-test and a single exiting block. We'd have to handle the fact
5404 // that not every instruction executes on the last iteration. This will
5405 // require a lane mask which varies through the vector loop body. (TODO)
5406 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
5407 // If there was a tail-folding hint/switch, but we can't fold the tail by
5408 // masking, fall back to a vectorization with a scalar epilogue.
5409 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5410 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5411 "scalar epilogue instead.\n");
5412 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5413 return computeFeasibleMaxVF(TC, UserVF, false);
5414 }
5415 return FixedScalableVFPair::getNone();
5416 }
5417
5418 // Now try tail folding.
5419
5420 // Invalidate interleave groups that require an epilogue if we can't mask
5421 // the interleave-group.
5422 if (!useMaskedInterleavedAccesses(TTI)) {
5423 assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
5424 "No decisions should have been taken at this point");
5425 // Note: There is no need to invalidate any cost modeling decisions here, as
5426 // none were taken so far.
5427 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
5428 }
5429
5430 FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF, true);
5431 // Avoid tail folding if the trip count is known to be a multiple of any VF
5432 // we chose.
5433 // FIXME: The condition below pessimizes the case for fixed-width vectors,
5434 // when scalable VFs are also candidates for vectorization.
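// For example (illustrative): with a known trip count of 128, a fixed MaxVF
// of 8 and a user interleave count of 2, MaxVFtimesIC is 16 and
// 128 % 16 == 0, so no tail remains and tail folding is unnecessary.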
5435 if (MaxFactors.FixedVF.isVector() && !MaxFactors.ScalableVF) { 5436 ElementCount MaxFixedVF = MaxFactors.FixedVF; 5437 assert((UserVF.isNonZero() || isPowerOf2_32(MaxFixedVF.getFixedValue())) && 5438 "MaxFixedVF must be a power of 2"); 5439 unsigned MaxVFtimesIC = UserIC ? MaxFixedVF.getFixedValue() * UserIC 5440 : MaxFixedVF.getFixedValue(); 5441 ScalarEvolution *SE = PSE.getSE(); 5442 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 5443 const SCEV *ExitCount = SE->getAddExpr( 5444 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 5445 const SCEV *Rem = SE->getURemExpr( 5446 SE->applyLoopGuards(ExitCount, TheLoop), 5447 SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC)); 5448 if (Rem->isZero()) { 5449 // Accept MaxFixedVF if we do not have a tail. 5450 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); 5451 return MaxFactors; 5452 } 5453 } 5454 5455 // For scalable vectors don't use tail folding for low trip counts or 5456 // optimizing for code size. We only permit this if the user has explicitly 5457 // requested it. 5458 if (ScalarEpilogueStatus != CM_ScalarEpilogueNotNeededUsePredicate && 5459 ScalarEpilogueStatus != CM_ScalarEpilogueNotAllowedUsePredicate && 5460 MaxFactors.ScalableVF.isVector()) 5461 MaxFactors.ScalableVF = ElementCount::getScalable(0); 5462 5463 // If we don't know the precise trip count, or if the trip count that we 5464 // found modulo the vectorization factor is not zero, try to fold the tail 5465 // by masking. 5466 // FIXME: look for a smaller MaxVF that does divide TC rather than masking. 5467 if (Legal->prepareToFoldTailByMasking()) { 5468 FoldTailByMasking = true; 5469 return MaxFactors; 5470 } 5471 5472 // If there was a tail-folding hint/switch, but we can't fold the tail by 5473 // masking, fallback to a vectorization with a scalar epilogue. 5474 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { 5475 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " 5476 "scalar epilogue instead.\n"); 5477 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 5478 return MaxFactors; 5479 } 5480 5481 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) { 5482 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n"); 5483 return FixedScalableVFPair::getNone(); 5484 } 5485 5486 if (TC == 0) { 5487 reportVectorizationFailure( 5488 "Unable to calculate the loop count due to complex control flow", 5489 "unable to calculate the loop count due to complex control flow", 5490 "UnknownLoopCountComplexCFG", ORE, TheLoop); 5491 return FixedScalableVFPair::getNone(); 5492 } 5493 5494 reportVectorizationFailure( 5495 "Cannot optimize for size and vectorize at the same time.", 5496 "cannot optimize for size and vectorize at the same time. " 5497 "Enable vectorization of this loop with '#pragma clang loop " 5498 "vectorize(enable)' when compiling with -Os/-Oz", 5499 "NoTailLoopWithOptForSize", ORE, TheLoop); 5500 return FixedScalableVFPair::getNone(); 5501 } 5502 5503 ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( 5504 unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType, 5505 const ElementCount &MaxSafeVF, bool FoldTailByMasking) { 5506 bool ComputeScalableMaxVF = MaxSafeVF.isScalable(); 5507 TypeSize WidestRegister = TTI.getRegisterBitWidth( 5508 ComputeScalableMaxVF ? 
TargetTransformInfo::RGK_ScalableVector 5509 : TargetTransformInfo::RGK_FixedWidthVector); 5510 5511 // Convenience function to return the minimum of two ElementCounts. 5512 auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) { 5513 assert((LHS.isScalable() == RHS.isScalable()) && 5514 "Scalable flags must match"); 5515 return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS; 5516 }; 5517 5518 // Ensure MaxVF is a power of 2; the dependence distance bound may not be. 5519 // Note that both WidestRegister and WidestType may not be a powers of 2. 5520 auto MaxVectorElementCount = ElementCount::get( 5521 PowerOf2Floor(WidestRegister.getKnownMinSize() / WidestType), 5522 ComputeScalableMaxVF); 5523 MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF); 5524 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " 5525 << (MaxVectorElementCount * WidestType) << " bits.\n"); 5526 5527 if (!MaxVectorElementCount) { 5528 LLVM_DEBUG(dbgs() << "LV: The target has no " 5529 << (ComputeScalableMaxVF ? "scalable" : "fixed") 5530 << " vector registers.\n"); 5531 return ElementCount::getFixed(1); 5532 } 5533 5534 const auto TripCountEC = ElementCount::getFixed(ConstTripCount); 5535 if (ConstTripCount && 5536 ElementCount::isKnownLE(TripCountEC, MaxVectorElementCount) && 5537 (!FoldTailByMasking || isPowerOf2_32(ConstTripCount))) { 5538 // If loop trip count (TC) is known at compile time there is no point in 5539 // choosing VF greater than TC (as done in the loop below). Select maximum 5540 // power of two which doesn't exceed TC. 5541 // If MaxVectorElementCount is scalable, we only fall back on a fixed VF 5542 // when the TC is less than or equal to the known number of lanes. 5543 auto ClampedConstTripCount = PowerOf2Floor(ConstTripCount); 5544 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not " 5545 "exceeding the constant trip count: " 5546 << ClampedConstTripCount << "\n"); 5547 return ElementCount::getFixed(ClampedConstTripCount); 5548 } 5549 5550 ElementCount MaxVF = MaxVectorElementCount; 5551 if (TTI.shouldMaximizeVectorBandwidth() || 5552 (MaximizeBandwidth && isScalarEpilogueAllowed())) { 5553 auto MaxVectorElementCountMaxBW = ElementCount::get( 5554 PowerOf2Floor(WidestRegister.getKnownMinSize() / SmallestType), 5555 ComputeScalableMaxVF); 5556 MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF); 5557 5558 // Collect all viable vectorization factors larger than the default MaxVF 5559 // (i.e. MaxVectorElementCount). 5560 SmallVector<ElementCount, 8> VFs; 5561 for (ElementCount VS = MaxVectorElementCount * 2; 5562 ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2) 5563 VFs.push_back(VS); 5564 5565 // For each VF calculate its register usage. 5566 auto RUs = calculateRegisterUsage(VFs); 5567 5568 // Select the largest VF which doesn't require more registers than existing 5569 // ones. 
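// For example, if MaxVectorElementCount is 4 and MaxVectorElementCountMaxBW
// is 16, the candidate VFs collected above are 8 and 16; the loop below walks
// them from widest to narrowest and keeps the first one whose estimated
// register usage fits in every register class of the target.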
5570 for (int i = RUs.size() - 1; i >= 0; --i) { 5571 bool Selected = true; 5572 for (auto &pair : RUs[i].MaxLocalUsers) { 5573 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5574 if (pair.second > TargetNumRegisters) 5575 Selected = false; 5576 } 5577 if (Selected) { 5578 MaxVF = VFs[i]; 5579 break; 5580 } 5581 } 5582 if (ElementCount MinVF = 5583 TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) { 5584 if (ElementCount::isKnownLT(MaxVF, MinVF)) { 5585 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF 5586 << ") with target's minimum: " << MinVF << '\n'); 5587 MaxVF = MinVF; 5588 } 5589 } 5590 } 5591 return MaxVF; 5592 } 5593 5594 Optional<unsigned> LoopVectorizationCostModel::getVScaleForTuning() const { 5595 if (TheFunction->hasFnAttribute(Attribute::VScaleRange)) { 5596 auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange); 5597 auto Min = Attr.getVScaleRangeMin(); 5598 auto Max = Attr.getVScaleRangeMax(); 5599 if (Max && Min == Max) 5600 return Max; 5601 } 5602 5603 return TTI.getVScaleForTuning(); 5604 } 5605 5606 bool LoopVectorizationCostModel::isMoreProfitable( 5607 const VectorizationFactor &A, const VectorizationFactor &B) const { 5608 InstructionCost CostA = A.Cost; 5609 InstructionCost CostB = B.Cost; 5610 5611 unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop); 5612 5613 if (!A.Width.isScalable() && !B.Width.isScalable() && FoldTailByMasking && 5614 MaxTripCount) { 5615 // If we are folding the tail and the trip count is a known (possibly small) 5616 // constant, the trip count will be rounded up to an integer number of 5617 // iterations. The total cost will be PerIterationCost*ceil(TripCount/VF), 5618 // which we compare directly. When not folding the tail, the total cost will 5619 // be PerIterationCost*floor(TC/VF) + Scalar remainder cost, and so is 5620 // approximated with the per-lane cost below instead of using the tripcount 5621 // as here. 5622 auto RTCostA = CostA * divideCeil(MaxTripCount, A.Width.getFixedValue()); 5623 auto RTCostB = CostB * divideCeil(MaxTripCount, B.Width.getFixedValue()); 5624 return RTCostA < RTCostB; 5625 } 5626 5627 // Improve estimate for the vector width if it is scalable. 5628 unsigned EstimatedWidthA = A.Width.getKnownMinValue(); 5629 unsigned EstimatedWidthB = B.Width.getKnownMinValue(); 5630 if (Optional<unsigned> VScale = getVScaleForTuning()) { 5631 if (A.Width.isScalable()) 5632 EstimatedWidthA *= VScale.getValue(); 5633 if (B.Width.isScalable()) 5634 EstimatedWidthB *= VScale.getValue(); 5635 } 5636 5637 // Assume vscale may be larger than 1 (or the value being tuned for), 5638 // so that scalable vectorization is slightly favorable over fixed-width 5639 // vectorization. 
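// For example, comparing A = vscale x 4 against B = 8 with equal
// per-iteration costs and an assumed vscale of 2, both sides of the
// comparison below are equal, and the <= makes the scalable factor win the
// tie.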
5640 if (A.Width.isScalable() && !B.Width.isScalable()) 5641 return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA); 5642 5643 // To avoid the need for FP division: 5644 // (CostA / A.Width) < (CostB / B.Width) 5645 // <=> (CostA * B.Width) < (CostB * A.Width) 5646 return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA); 5647 } 5648 5649 VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor( 5650 const ElementCountSet &VFCandidates) { 5651 InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first; 5652 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n"); 5653 assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop"); 5654 assert(VFCandidates.count(ElementCount::getFixed(1)) && 5655 "Expected Scalar VF to be a candidate"); 5656 5657 const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost); 5658 VectorizationFactor ChosenFactor = ScalarCost; 5659 5660 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; 5661 if (ForceVectorization && VFCandidates.size() > 1) { 5662 // Ignore scalar width, because the user explicitly wants vectorization. 5663 // Initialize cost to max so that VF = 2 is, at least, chosen during cost 5664 // evaluation. 5665 ChosenFactor.Cost = InstructionCost::getMax(); 5666 } 5667 5668 SmallVector<InstructionVFPair> InvalidCosts; 5669 for (const auto &i : VFCandidates) { 5670 // The cost for scalar VF=1 is already calculated, so ignore it. 5671 if (i.isScalar()) 5672 continue; 5673 5674 VectorizationCostTy C = expectedCost(i, &InvalidCosts); 5675 VectorizationFactor Candidate(i, C.first); 5676 5677 #ifndef NDEBUG 5678 unsigned AssumedMinimumVscale = 1; 5679 if (Optional<unsigned> VScale = getVScaleForTuning()) 5680 AssumedMinimumVscale = VScale.getValue(); 5681 unsigned Width = 5682 Candidate.Width.isScalable() 5683 ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale 5684 : Candidate.Width.getFixedValue(); 5685 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i 5686 << " costs: " << (Candidate.Cost / Width)); 5687 if (i.isScalable()) 5688 LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of " 5689 << AssumedMinimumVscale << ")"); 5690 LLVM_DEBUG(dbgs() << ".\n"); 5691 #endif 5692 5693 if (!C.second && !ForceVectorization) { 5694 LLVM_DEBUG( 5695 dbgs() << "LV: Not considering vector loop of width " << i 5696 << " because it will not generate any vector instructions.\n"); 5697 continue; 5698 } 5699 5700 // If profitable add it to ProfitableVF list. 5701 if (isMoreProfitable(Candidate, ScalarCost)) 5702 ProfitableVFs.push_back(Candidate); 5703 5704 if (isMoreProfitable(Candidate, ChosenFactor)) 5705 ChosenFactor = Candidate; 5706 } 5707 5708 // Emit a report of VFs with invalid costs in the loop. 5709 if (!InvalidCosts.empty()) { 5710 // Group the remarks per instruction, keeping the instruction order from 5711 // InvalidCosts. 5712 std::map<Instruction *, unsigned> Numbering; 5713 unsigned I = 0; 5714 for (auto &Pair : InvalidCosts) 5715 if (!Numbering.count(Pair.first)) 5716 Numbering[Pair.first] = I++; 5717 5718 // Sort the list, first on instruction(number) then on VF. 
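// For example, [(load, vf2), (store, vf1), (load, vf1)] is numbered
// load -> 0, store -> 1 in order of first appearance and sorts to
// [(load, vf1), (load, vf2), (store, vf1)].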
5719 llvm::sort(InvalidCosts, 5720 [&Numbering](InstructionVFPair &A, InstructionVFPair &B) { 5721 if (Numbering[A.first] != Numbering[B.first]) 5722 return Numbering[A.first] < Numbering[B.first]; 5723 ElementCountComparator ECC; 5724 return ECC(A.second, B.second); 5725 }); 5726 5727 // For a list of ordered instruction-vf pairs: 5728 // [(load, vf1), (load, vf2), (store, vf1)] 5729 // Group the instructions together to emit separate remarks for: 5730 // load (vf1, vf2) 5731 // store (vf1) 5732 auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts); 5733 auto Subset = ArrayRef<InstructionVFPair>(); 5734 do { 5735 if (Subset.empty()) 5736 Subset = Tail.take_front(1); 5737 5738 Instruction *I = Subset.front().first; 5739 5740 // If the next instruction is different, or if there are no other pairs, 5741 // emit a remark for the collated subset. e.g. 5742 // [(load, vf1), (load, vf2))] 5743 // to emit: 5744 // remark: invalid costs for 'load' at VF=(vf, vf2) 5745 if (Subset == Tail || Tail[Subset.size()].first != I) { 5746 std::string OutString; 5747 raw_string_ostream OS(OutString); 5748 assert(!Subset.empty() && "Unexpected empty range"); 5749 OS << "Instruction with invalid costs prevented vectorization at VF=("; 5750 for (auto &Pair : Subset) 5751 OS << (Pair.second == Subset.front().second ? "" : ", ") 5752 << Pair.second; 5753 OS << "):"; 5754 if (auto *CI = dyn_cast<CallInst>(I)) 5755 OS << " call to " << CI->getCalledFunction()->getName(); 5756 else 5757 OS << " " << I->getOpcodeName(); 5758 OS.flush(); 5759 reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I); 5760 Tail = Tail.drop_front(Subset.size()); 5761 Subset = {}; 5762 } else 5763 // Grow the subset by one element 5764 Subset = Tail.take_front(Subset.size() + 1); 5765 } while (!Tail.empty()); 5766 } 5767 5768 if (!EnableCondStoresVectorization && NumPredStores) { 5769 reportVectorizationFailure("There are conditional stores.", 5770 "store that is conditionally executed prevents vectorization", 5771 "ConditionalStore", ORE, TheLoop); 5772 ChosenFactor = ScalarCost; 5773 } 5774 5775 LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() && 5776 ChosenFactor.Cost >= ScalarCost.Cost) dbgs() 5777 << "LV: Vectorization seems to be not beneficial, " 5778 << "but was forced by a user.\n"); 5779 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n"); 5780 return ChosenFactor; 5781 } 5782 5783 bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization( 5784 const Loop &L, ElementCount VF) const { 5785 // Cross iteration phis such as reductions need special handling and are 5786 // currently unsupported. 5787 if (any_of(L.getHeader()->phis(), 5788 [&](PHINode &Phi) { return Legal->isFirstOrderRecurrence(&Phi); })) 5789 return false; 5790 5791 // Phis with uses outside of the loop require special handling and are 5792 // currently unsupported. 5793 for (auto &Entry : Legal->getInductionVars()) { 5794 // Look for uses of the value of the induction at the last iteration. 5795 Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch()); 5796 for (User *U : PostInc->users()) 5797 if (!L.contains(cast<Instruction>(U))) 5798 return false; 5799 // Look for uses of penultimate value of the induction. 5800 for (User *U : Entry.first->users()) 5801 if (!L.contains(cast<Instruction>(U))) 5802 return false; 5803 } 5804 5805 // Induction variables that are widened require special handling that is 5806 // currently not supported. 
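// In other words, each induction must either be scalar after vectorization
// or profitable to scalarize; an induction whose value is needed as a vector
// (e.g. because it is stored as data) would be widened and disqualifies the
// loop in the check below.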
5807 if (any_of(Legal->getInductionVars(), [&](auto &Entry) {
5808 return !(this->isScalarAfterVectorization(Entry.first, VF) ||
5809 this->isProfitableToScalarize(Entry.first, VF));
5810 }))
5811 return false;
5812
5813 // Epilogue vectorization code has not been audited to ensure it handles
5814 // non-latch exits properly. It may be fine, but it needs to be audited and
5815 // tested.
5816 if (L.getExitingBlock() != L.getLoopLatch())
5817 return false;
5818
5819 return true;
5820 }
5821
5822 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
5823 const ElementCount VF) const {
5824 // FIXME: We need a much better cost-model to take different parameters such
5825 // as register pressure, code size increase and cost of extra branches into
5826 // account. For now we apply a very crude heuristic and only consider loops
5827 // with vectorization factors larger than a certain value.
5828 // We also consider epilogue vectorization unprofitable for targets that don't
5829 // consider interleaving beneficial (e.g. MVE).
5830 if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1)
5831 return false;
5832 // FIXME: We should consider changing the threshold for scalable
5833 // vectors to take VScaleForTuning into account.
5834 if (VF.getKnownMinValue() >= EpilogueVectorizationMinVF)
5835 return true;
5836 return false;
5837 }
5838
5839 VectorizationFactor
5840 LoopVectorizationCostModel::selectEpilogueVectorizationFactor(
5841 const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) {
5842 VectorizationFactor Result = VectorizationFactor::Disabled();
5843 if (!EnableEpilogueVectorization) {
5844 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";);
5845 return Result;
5846 }
5847
5848 if (!isScalarEpilogueAllowed()) {
5849 LLVM_DEBUG(
5850 dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is "
5851 "allowed.\n";);
5852 return Result;
5853 }
5854
5855 // Not really a cost consideration, but check for unsupported cases here to
5856 // simplify the logic.
5857 if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) {
5858 LLVM_DEBUG(
5859 dbgs() << "LEV: Unable to vectorize epilogue because the loop is "
5860 "not a supported candidate.\n";);
5861 return Result;
5862 }
5863
5864 if (EpilogueVectorizationForceVF > 1) {
5865 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";);
5866 ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF);
5867 if (LVP.hasPlanWithVF(ForcedEC))
5868 return {ForcedEC, 0};
5869 else {
5870 LLVM_DEBUG(
5871 dbgs()
5872 << "LEV: Epilogue vectorization forced factor is not viable.\n";);
5873 return Result;
5874 }
5875 }
5876
5877 if (TheLoop->getHeader()->getParent()->hasOptSize() ||
5878 TheLoop->getHeader()->getParent()->hasMinSize()) {
5879 LLVM_DEBUG(
5880 dbgs()
5881 << "LEV: Epilogue vectorization skipped due to opt for size.\n";);
5882 return Result;
5883 }
5884
5885 if (!isEpilogueVectorizationProfitable(MainLoopVF)) {
5886 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
5887 "this loop\n");
5888 return Result;
5889 }
5890
5891 // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know
5892 // the main loop handles 8 lanes per iteration. We could still benefit from
5893 // vectorizing the epilogue loop with VF=4.
5894 ElementCount EstimatedRuntimeVF = MainLoopVF; 5895 if (MainLoopVF.isScalable()) { 5896 EstimatedRuntimeVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue()); 5897 if (Optional<unsigned> VScale = getVScaleForTuning()) 5898 EstimatedRuntimeVF *= VScale.getValue(); 5899 } 5900 5901 for (auto &NextVF : ProfitableVFs) 5902 if (((!NextVF.Width.isScalable() && MainLoopVF.isScalable() && 5903 ElementCount::isKnownLT(NextVF.Width, EstimatedRuntimeVF)) || 5904 ElementCount::isKnownLT(NextVF.Width, MainLoopVF)) && 5905 (Result.Width.isScalar() || isMoreProfitable(NextVF, Result)) && 5906 LVP.hasPlanWithVF(NextVF.Width)) 5907 Result = NextVF; 5908 5909 if (Result != VectorizationFactor::Disabled()) 5910 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = " 5911 << Result.Width << "\n";); 5912 return Result; 5913 } 5914 5915 std::pair<unsigned, unsigned> 5916 LoopVectorizationCostModel::getSmallestAndWidestTypes() { 5917 unsigned MinWidth = -1U; 5918 unsigned MaxWidth = 8; 5919 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 5920 // For in-loop reductions, no element types are added to ElementTypesInLoop 5921 // if there are no loads/stores in the loop. In this case, check through the 5922 // reduction variables to determine the maximum width. 5923 if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) { 5924 // Reset MaxWidth so that we can find the smallest type used by recurrences 5925 // in the loop. 5926 MaxWidth = -1U; 5927 for (auto &PhiDescriptorPair : Legal->getReductionVars()) { 5928 const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second; 5929 // When finding the min width used by the recurrence we need to account 5930 // for casts on the input operands of the recurrence. 5931 MaxWidth = std::min<unsigned>( 5932 MaxWidth, std::min<unsigned>( 5933 RdxDesc.getMinWidthCastToRecurrenceTypeInBits(), 5934 RdxDesc.getRecurrenceType()->getScalarSizeInBits())); 5935 } 5936 } else { 5937 for (Type *T : ElementTypesInLoop) { 5938 MinWidth = std::min<unsigned>( 5939 MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize()); 5940 MaxWidth = std::max<unsigned>( 5941 MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize()); 5942 } 5943 } 5944 return {MinWidth, MaxWidth}; 5945 } 5946 5947 void LoopVectorizationCostModel::collectElementTypesForWidening() { 5948 ElementTypesInLoop.clear(); 5949 // For each block. 5950 for (BasicBlock *BB : TheLoop->blocks()) { 5951 // For each instruction in the loop. 5952 for (Instruction &I : BB->instructionsWithoutDebug()) { 5953 Type *T = I.getType(); 5954 5955 // Skip ignored values. 5956 if (ValuesToIgnore.count(&I)) 5957 continue; 5958 5959 // Only examine Loads, Stores and PHINodes. 5960 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I)) 5961 continue; 5962 5963 // Examine PHI nodes that are reduction variables. Update the type to 5964 // account for the recurrence type. 5965 if (auto *PN = dyn_cast<PHINode>(&I)) { 5966 if (!Legal->isReductionVariable(PN)) 5967 continue; 5968 const RecurrenceDescriptor &RdxDesc = 5969 Legal->getReductionVars().find(PN)->second; 5970 if (PreferInLoopReductions || useOrderedReductions(RdxDesc) || 5971 TTI.preferInLoopReduction(RdxDesc.getOpcode(), 5972 RdxDesc.getRecurrenceType(), 5973 TargetTransformInfo::ReductionFlags())) 5974 continue; 5975 T = RdxDesc.getRecurrenceType(); 5976 } 5977 5978 // Examine the stored values. 
5979 if (auto *ST = dyn_cast<StoreInst>(&I)) 5980 T = ST->getValueOperand()->getType(); 5981 5982 assert(T->isSized() && 5983 "Expected the load/store/recurrence type to be sized"); 5984 5985 ElementTypesInLoop.insert(T); 5986 } 5987 } 5988 } 5989 5990 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, 5991 unsigned LoopCost) { 5992 // -- The interleave heuristics -- 5993 // We interleave the loop in order to expose ILP and reduce the loop overhead. 5994 // There are many micro-architectural considerations that we can't predict 5995 // at this level. For example, frontend pressure (on decode or fetch) due to 5996 // code size, or the number and capabilities of the execution ports. 5997 // 5998 // We use the following heuristics to select the interleave count: 5999 // 1. If the code has reductions, then we interleave to break the cross 6000 // iteration dependency. 6001 // 2. If the loop is really small, then we interleave to reduce the loop 6002 // overhead. 6003 // 3. We don't interleave if we think that we will spill registers to memory 6004 // due to the increased register pressure. 6005 6006 if (!isScalarEpilogueAllowed()) 6007 return 1; 6008 6009 // We used the distance for the interleave count. 6010 if (Legal->getMaxSafeDepDistBytes() != -1U) 6011 return 1; 6012 6013 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); 6014 const bool HasReductions = !Legal->getReductionVars().empty(); 6015 // Do not interleave loops with a relatively small known or estimated trip 6016 // count. But we will interleave when InterleaveSmallLoopScalarReduction is 6017 // enabled, and the code has scalar reductions(HasReductions && VF = 1), 6018 // because with the above conditions interleaving can expose ILP and break 6019 // cross iteration dependences for reductions. 6020 if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) && 6021 !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar())) 6022 return 1; 6023 6024 RegisterUsage R = calculateRegisterUsage({VF})[0]; 6025 // We divide by these constants so assume that we have at least one 6026 // instruction that uses at least one register. 6027 for (auto& pair : R.MaxLocalUsers) { 6028 pair.second = std::max(pair.second, 1U); 6029 } 6030 6031 // We calculate the interleave count using the following formula. 6032 // Subtract the number of loop invariants from the number of available 6033 // registers. These registers are used by all of the interleaved instances. 6034 // Next, divide the remaining registers by the number of registers that is 6035 // required by the loop, in order to estimate how many parallel instances 6036 // fit without causing spills. All of this is rounded down if necessary to be 6037 // a power of two. We want power of two interleave count to simplify any 6038 // addressing operations or alignment considerations. 6039 // We also want power of two interleave counts to ensure that the induction 6040 // variable of the vector loop wraps to zero, when tail is folded by masking; 6041 // this currently happens when OptForSize, in which case IC is set to 1 above. 
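// For example, with 32 registers in a class, 2 of them tied up by
// loop-invariant values and a maximum local usage of 6, the loop below
// computes floor((32 - 2) / 6) = 5 concurrent instances and rounds that down
// to an interleave count of 4 for this register class.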
6042 unsigned IC = UINT_MAX; 6043 6044 for (auto& pair : R.MaxLocalUsers) { 6045 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 6046 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters 6047 << " registers of " 6048 << TTI.getRegisterClassName(pair.first) << " register class\n"); 6049 if (VF.isScalar()) { 6050 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) 6051 TargetNumRegisters = ForceTargetNumScalarRegs; 6052 } else { 6053 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) 6054 TargetNumRegisters = ForceTargetNumVectorRegs; 6055 } 6056 unsigned MaxLocalUsers = pair.second; 6057 unsigned LoopInvariantRegs = 0; 6058 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) 6059 LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; 6060 6061 unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers); 6062 // Don't count the induction variable as interleaved. 6063 if (EnableIndVarRegisterHeur) { 6064 TmpIC = 6065 PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) / 6066 std::max(1U, (MaxLocalUsers - 1))); 6067 } 6068 6069 IC = std::min(IC, TmpIC); 6070 } 6071 6072 // Clamp the interleave ranges to reasonable counts. 6073 unsigned MaxInterleaveCount = 6074 TTI.getMaxInterleaveFactor(VF.getKnownMinValue()); 6075 6076 // Check if the user has overridden the max. 6077 if (VF.isScalar()) { 6078 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) 6079 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; 6080 } else { 6081 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) 6082 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; 6083 } 6084 6085 // If trip count is known or estimated compile time constant, limit the 6086 // interleave count to be less than the trip count divided by VF, provided it 6087 // is at least 1. 6088 // 6089 // For scalable vectors we can't know if interleaving is beneficial. It may 6090 // not be beneficial for small loops if none of the lanes in the second vector 6091 // iterations is enabled. However, for larger loops, there is likely to be a 6092 // similar benefit as for fixed-width vectors. For now, we choose to leave 6093 // the InterleaveCount as if vscale is '1', although if some information about 6094 // the vector is known (e.g. min vector size), we can make a better decision. 6095 if (BestKnownTC) { 6096 MaxInterleaveCount = 6097 std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount); 6098 // Make sure MaxInterleaveCount is greater than 0. 6099 MaxInterleaveCount = std::max(1u, MaxInterleaveCount); 6100 } 6101 6102 assert(MaxInterleaveCount > 0 && 6103 "Maximum interleave count must be greater than 0"); 6104 6105 // Clamp the calculated IC to be between the 1 and the max interleave count 6106 // that the target and trip count allows. 6107 if (IC > MaxInterleaveCount) 6108 IC = MaxInterleaveCount; 6109 else 6110 // Make sure IC is greater than 0. 6111 IC = std::max(1u, IC); 6112 6113 assert(IC > 0 && "Interleave count must be greater than 0."); 6114 6115 // If we did not calculate the cost for VF (because the user selected the VF) 6116 // then we calculate the cost of VF here. 
6117 if (LoopCost == 0) { 6118 InstructionCost C = expectedCost(VF).first; 6119 assert(C.isValid() && "Expected to have chosen a VF with valid cost"); 6120 LoopCost = *C.getValue(); 6121 } 6122 6123 assert(LoopCost && "Non-zero loop cost expected"); 6124 6125 // Interleave if we vectorized this loop and there is a reduction that could 6126 // benefit from interleaving. 6127 if (VF.isVector() && HasReductions) { 6128 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); 6129 return IC; 6130 } 6131 6132 // For any scalar loop that either requires runtime checks or predication we 6133 // are better off leaving this to the unroller. Note that if we've already 6134 // vectorized the loop we will have done the runtime check and so interleaving 6135 // won't require further checks. 6136 bool ScalarInterleavingRequiresPredication = 6137 (VF.isScalar() && any_of(TheLoop->blocks(), [this](BasicBlock *BB) { 6138 return Legal->blockNeedsPredication(BB); 6139 })); 6140 bool ScalarInterleavingRequiresRuntimePointerCheck = 6141 (VF.isScalar() && Legal->getRuntimePointerChecking()->Need); 6142 6143 // We want to interleave small loops in order to reduce the loop overhead and 6144 // potentially expose ILP opportunities. 6145 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n' 6146 << "LV: IC is " << IC << '\n' 6147 << "LV: VF is " << VF << '\n'); 6148 const bool AggressivelyInterleaveReductions = 6149 TTI.enableAggressiveInterleaving(HasReductions); 6150 if (!ScalarInterleavingRequiresRuntimePointerCheck && 6151 !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) { 6152 // We assume that the cost overhead is 1 and we use the cost model 6153 // to estimate the cost of the loop and interleave until the cost of the 6154 // loop overhead is about 5% of the cost of the loop. 6155 unsigned SmallIC = 6156 std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost)); 6157 6158 // Interleave until store/load ports (estimated by max interleave count) are 6159 // saturated. 6160 unsigned NumStores = Legal->getNumStores(); 6161 unsigned NumLoads = Legal->getNumLoads(); 6162 unsigned StoresIC = IC / (NumStores ? NumStores : 1); 6163 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1); 6164 6165 // There is little point in interleaving for reductions containing selects 6166 // and compares when VF=1 since it may just create more overhead than it's 6167 // worth for loops with small trip counts. This is because we still have to 6168 // do the final reduction after the loop. 6169 bool HasSelectCmpReductions = 6170 HasReductions && 6171 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 6172 const RecurrenceDescriptor &RdxDesc = Reduction.second; 6173 return RecurrenceDescriptor::isSelectCmpRecurrenceKind( 6174 RdxDesc.getRecurrenceKind()); 6175 }); 6176 if (HasSelectCmpReductions) { 6177 LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n"); 6178 return 1; 6179 } 6180 6181 // If we have a scalar reduction (vector reductions are already dealt with 6182 // by this point), we can increase the critical path length if the loop 6183 // we're interleaving is inside another loop. For tree-wise reductions 6184 // set the limit to 2, and for ordered reductions it's best to disable 6185 // interleaving entirely. 
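// For example, with IC = 8 and a tree-wise scalar reduction nested inside an
// outer loop, the SmallIC, StoresIC and LoadsIC candidates are all clamped
// below to MaxNestedScalarReductionIC, i.e. to 2 per the limit described
// above.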
6186 if (HasReductions && TheLoop->getLoopDepth() > 1) {
6187 bool HasOrderedReductions =
6188 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
6189 const RecurrenceDescriptor &RdxDesc = Reduction.second;
6190 return RdxDesc.isOrdered();
6191 });
6192 if (HasOrderedReductions) {
6193 LLVM_DEBUG(
6194 dbgs() << "LV: Not interleaving scalar ordered reductions.\n");
6195 return 1;
6196 }
6197
6198 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
6199 SmallIC = std::min(SmallIC, F);
6200 StoresIC = std::min(StoresIC, F);
6201 LoadsIC = std::min(LoadsIC, F);
6202 }
6203
6204 if (EnableLoadStoreRuntimeInterleave &&
6205 std::max(StoresIC, LoadsIC) > SmallIC) {
6206 LLVM_DEBUG(
6207 dbgs() << "LV: Interleaving to saturate store or load ports.\n");
6208 return std::max(StoresIC, LoadsIC);
6209 }
6210
6211 // If there are scalar reductions and TTI has enabled aggressive
6212 // interleaving for reductions, we will interleave to expose ILP.
6213 if (InterleaveSmallLoopScalarReduction && VF.isScalar() &&
6214 AggressivelyInterleaveReductions) {
6215 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
6216 // Interleave no less than SmallIC but not as aggressive as the normal IC
6217 // to satisfy the rare situation when resources are too limited.
6218 return std::max(IC / 2, SmallIC);
6219 } else {
6220 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
6221 return SmallIC;
6222 }
6223 }
6224
6225 // Interleave if this is a large loop (small loops are already dealt with by
6226 // this point) that could benefit from interleaving.
6227 if (AggressivelyInterleaveReductions) {
6228 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
6229 return IC;
6230 }
6231
6232 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
6233 return 1;
6234 }
6235
6236 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
6237 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
6238 // This function calculates the register usage by measuring the highest number
6239 // of values that are alive at a single location. Obviously, this is a very
6240 // rough estimation. We scan the loop in topological order and assign a
6241 // number to each instruction. We use RPO to ensure that defs are
6242 // met before their users. We assume that each instruction that has in-loop
6243 // users starts an interval. We record every time that an in-loop value is
6244 // used, so we have a list of the first and last occurrences of each
6245 // instruction. Next, we transpose this data structure into a multi-map that
6246 // holds the list of intervals that *end* at a specific location. This
6247 // multi-map allows us to perform a linear search. We scan the instructions
6248 // linearly and record each time that a new interval starts, by placing it in
6249 // a set. If we find this value in the multi-map then we remove it from the
6250 // set. The max register usage is the maximum size of the set.
6251 // We also search for instructions that are defined outside the loop, but are
6252 // used inside the loop. We need this number separately from the max-interval
6253 // usage number because when we unroll, loop-invariant values do not take
6254 // more registers.
6255 LoopBlocksDFS DFS(TheLoop);
6256 DFS.perform(LI);
6257
6258 RegisterUsage RU;
6259
6260 // Each 'key' in the map opens a new interval. The values
6261 // of the map are the index of the 'last seen' usage of the
6262 // instruction that is the key.
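// For example, if the last in-loop user of a value is the instruction at
// index 6, the value's end point is recorded as 7 and the linear scan below
// drops it from the open-interval set when it reaches index 7.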
6263 using IntervalMap = DenseMap<Instruction *, unsigned>; 6264 6265 // Maps instruction to its index. 6266 SmallVector<Instruction *, 64> IdxToInstr; 6267 // Marks the end of each interval. 6268 IntervalMap EndPoint; 6269 // Saves the list of instruction indices that are used in the loop. 6270 SmallPtrSet<Instruction *, 8> Ends; 6271 // Saves the list of values that are used in the loop but are 6272 // defined outside the loop, such as arguments and constants. 6273 SmallPtrSet<Value *, 8> LoopInvariants; 6274 6275 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 6276 for (Instruction &I : BB->instructionsWithoutDebug()) { 6277 IdxToInstr.push_back(&I); 6278 6279 // Save the end location of each USE. 6280 for (Value *U : I.operands()) { 6281 auto *Instr = dyn_cast<Instruction>(U); 6282 6283 // Ignore non-instruction values such as arguments, constants, etc. 6284 if (!Instr) 6285 continue; 6286 6287 // If this instruction is outside the loop then record it and continue. 6288 if (!TheLoop->contains(Instr)) { 6289 LoopInvariants.insert(Instr); 6290 continue; 6291 } 6292 6293 // Overwrite previous end points. 6294 EndPoint[Instr] = IdxToInstr.size(); 6295 Ends.insert(Instr); 6296 } 6297 } 6298 } 6299 6300 // Saves the list of intervals that end with the index in 'key'. 6301 using InstrList = SmallVector<Instruction *, 2>; 6302 DenseMap<unsigned, InstrList> TransposeEnds; 6303 6304 // Transpose the EndPoints to a list of values that end at each index. 6305 for (auto &Interval : EndPoint) 6306 TransposeEnds[Interval.second].push_back(Interval.first); 6307 6308 SmallPtrSet<Instruction *, 8> OpenIntervals; 6309 SmallVector<RegisterUsage, 8> RUs(VFs.size()); 6310 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size()); 6311 6312 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); 6313 6314 // A lambda that gets the register usage for the given type and VF. 6315 const auto &TTICapture = TTI; 6316 auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned { 6317 if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty)) 6318 return 0; 6319 InstructionCost::CostType RegUsage = 6320 *TTICapture.getRegUsageForType(VectorType::get(Ty, VF)).getValue(); 6321 assert(RegUsage >= 0 && RegUsage <= std::numeric_limits<unsigned>::max() && 6322 "Nonsensical values for register usage."); 6323 return RegUsage; 6324 }; 6325 6326 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { 6327 Instruction *I = IdxToInstr[i]; 6328 6329 // Remove all of the instructions that end at this location. 6330 InstrList &List = TransposeEnds[i]; 6331 for (Instruction *ToRemove : List) 6332 OpenIntervals.erase(ToRemove); 6333 6334 // Ignore instructions that are never used within the loop. 6335 if (!Ends.count(I)) 6336 continue; 6337 6338 // Skip ignored values. 6339 if (ValuesToIgnore.count(I)) 6340 continue; 6341 6342 // For each VF find the maximum usage of registers. 6343 for (unsigned j = 0, e = VFs.size(); j < e; ++j) { 6344 // Count the number of live intervals. 6345 SmallMapVector<unsigned, unsigned, 4> RegUsage; 6346 6347 if (VFs[j].isScalar()) { 6348 for (auto Inst : OpenIntervals) { 6349 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6350 if (RegUsage.find(ClassID) == RegUsage.end()) 6351 RegUsage[ClassID] = 1; 6352 else 6353 RegUsage[ClassID] += 1; 6354 } 6355 } else { 6356 collectUniformsAndScalars(VFs[j]); 6357 for (auto Inst : OpenIntervals) { 6358 // Skip ignored values for VF > 1. 
6359 if (VecValuesToIgnore.count(Inst)) 6360 continue; 6361 if (isScalarAfterVectorization(Inst, VFs[j])) { 6362 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6363 if (RegUsage.find(ClassID) == RegUsage.end()) 6364 RegUsage[ClassID] = 1; 6365 else 6366 RegUsage[ClassID] += 1; 6367 } else { 6368 unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType()); 6369 if (RegUsage.find(ClassID) == RegUsage.end()) 6370 RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]); 6371 else 6372 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); 6373 } 6374 } 6375 } 6376 6377 for (auto& pair : RegUsage) { 6378 if (MaxUsages[j].find(pair.first) != MaxUsages[j].end()) 6379 MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second); 6380 else 6381 MaxUsages[j][pair.first] = pair.second; 6382 } 6383 } 6384 6385 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " 6386 << OpenIntervals.size() << '\n'); 6387 6388 // Add the current instruction to the list of open intervals. 6389 OpenIntervals.insert(I); 6390 } 6391 6392 for (unsigned i = 0, e = VFs.size(); i < e; ++i) { 6393 SmallMapVector<unsigned, unsigned, 4> Invariant; 6394 6395 for (auto Inst : LoopInvariants) { 6396 unsigned Usage = 6397 VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]); 6398 unsigned ClassID = 6399 TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType()); 6400 if (Invariant.find(ClassID) == Invariant.end()) 6401 Invariant[ClassID] = Usage; 6402 else 6403 Invariant[ClassID] += Usage; 6404 } 6405 6406 LLVM_DEBUG({ 6407 dbgs() << "LV(REG): VF = " << VFs[i] << '\n'; 6408 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size() 6409 << " item\n"; 6410 for (const auto &pair : MaxUsages[i]) { 6411 dbgs() << "LV(REG): RegisterClass: " 6412 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6413 << " registers\n"; 6414 } 6415 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size() 6416 << " item\n"; 6417 for (const auto &pair : Invariant) { 6418 dbgs() << "LV(REG): RegisterClass: " 6419 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6420 << " registers\n"; 6421 } 6422 }); 6423 6424 RU.LoopInvariantRegs = Invariant; 6425 RU.MaxLocalUsers = MaxUsages[i]; 6426 RUs[i] = RU; 6427 } 6428 6429 return RUs; 6430 } 6431 6432 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I, 6433 ElementCount VF) { 6434 // TODO: Cost model for emulated masked load/store is completely 6435 // broken. This hack guides the cost model to use an artificially 6436 // high enough value to practically disable vectorization with such 6437 // operations, except where previously deployed legality hack allowed 6438 // using very low cost values. This is to avoid regressions coming simply 6439 // from moving "masked load/store" check from legality to cost model. 6440 // Masked Load/Gather emulation was previously never allowed. 6441 // Limited number of Masked Store/Scatter emulation was allowed. 6442 assert(isPredicatedInst(I, VF) && "Expecting a scalar emulated instruction"); 6443 return isa<LoadInst>(I) || 6444 (isa<StoreInst>(I) && 6445 NumPredStores > NumberOfStoresToPredicate); 6446 } 6447 6448 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) { 6449 // If we aren't vectorizing the loop, or if we've already collected the 6450 // instructions to scalarize, there's nothing to do. 
Collection may already 6451 // have occurred if we have a user-selected VF and are now computing the 6452 // expected cost for interleaving. 6453 if (VF.isScalar() || VF.isZero() || 6454 InstsToScalarize.find(VF) != InstsToScalarize.end()) 6455 return; 6456 6457 // Initialize a mapping for VF in InstsToScalalarize. If we find that it's 6458 // not profitable to scalarize any instructions, the presence of VF in the 6459 // map will indicate that we've analyzed it already. 6460 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF]; 6461 6462 // Find all the instructions that are scalar with predication in the loop and 6463 // determine if it would be better to not if-convert the blocks they are in. 6464 // If so, we also record the instructions to scalarize. 6465 for (BasicBlock *BB : TheLoop->blocks()) { 6466 if (!blockNeedsPredicationForAnyReason(BB)) 6467 continue; 6468 for (Instruction &I : *BB) 6469 if (isScalarWithPredication(&I, VF)) { 6470 ScalarCostsTy ScalarCosts; 6471 // Do not apply discount if scalable, because that would lead to 6472 // invalid scalarization costs. 6473 // Do not apply discount logic if hacked cost is needed 6474 // for emulated masked memrefs. 6475 if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I, VF) && 6476 computePredInstDiscount(&I, ScalarCosts, VF) >= 0) 6477 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end()); 6478 // Remember that BB will remain after vectorization. 6479 PredicatedBBsAfterVectorization.insert(BB); 6480 } 6481 } 6482 } 6483 6484 int LoopVectorizationCostModel::computePredInstDiscount( 6485 Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) { 6486 assert(!isUniformAfterVectorization(PredInst, VF) && 6487 "Instruction marked uniform-after-vectorization will be predicated"); 6488 6489 // Initialize the discount to zero, meaning that the scalar version and the 6490 // vector version cost the same. 6491 InstructionCost Discount = 0; 6492 6493 // Holds instructions to analyze. The instructions we visit are mapped in 6494 // ScalarCosts. Those instructions are the ones that would be scalarized if 6495 // we find that the scalar version costs less. 6496 SmallVector<Instruction *, 8> Worklist; 6497 6498 // Returns true if the given instruction can be scalarized. 6499 auto canBeScalarized = [&](Instruction *I) -> bool { 6500 // We only attempt to scalarize instructions forming a single-use chain 6501 // from the original predicated block that would otherwise be vectorized. 6502 // Although not strictly necessary, we give up on instructions we know will 6503 // already be scalar to avoid traversing chains that are unlikely to be 6504 // beneficial. 6505 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() || 6506 isScalarAfterVectorization(I, VF)) 6507 return false; 6508 6509 // If the instruction is scalar with predication, it will be analyzed 6510 // separately. We ignore it within the context of PredInst. 6511 if (isScalarWithPredication(I, VF)) 6512 return false; 6513 6514 // If any of the instruction's operands are uniform after vectorization, 6515 // the instruction cannot be scalarized. This prevents, for example, a 6516 // masked load from being scalarized. 6517 // 6518 // We assume we will only emit a value for lane zero of an instruction 6519 // marked uniform after vectorization, rather than VF identical values. 6520 // Thus, if we scalarize an instruction that uses a uniform, we would 6521 // create uses of values corresponding to the lanes we aren't emitting code 6522 // for. 
This behavior can be changed by allowing getScalarValue to clone 6523 // the lane zero values for uniforms rather than asserting. 6524 for (Use &U : I->operands()) 6525 if (auto *J = dyn_cast<Instruction>(U.get())) 6526 if (isUniformAfterVectorization(J, VF)) 6527 return false; 6528 6529 // Otherwise, we can scalarize the instruction. 6530 return true; 6531 }; 6532 6533 // Compute the expected cost discount from scalarizing the entire expression 6534 // feeding the predicated instruction. We currently only consider expressions 6535 // that are single-use instruction chains. 6536 Worklist.push_back(PredInst); 6537 while (!Worklist.empty()) { 6538 Instruction *I = Worklist.pop_back_val(); 6539 6540 // If we've already analyzed the instruction, there's nothing to do. 6541 if (ScalarCosts.find(I) != ScalarCosts.end()) 6542 continue; 6543 6544 // Compute the cost of the vector instruction. Note that this cost already 6545 // includes the scalarization overhead of the predicated instruction. 6546 InstructionCost VectorCost = getInstructionCost(I, VF).first; 6547 6548 // Compute the cost of the scalarized instruction. This cost is the cost of 6549 // the instruction as if it wasn't if-converted and instead remained in the 6550 // predicated block. We will scale this cost by block probability after 6551 // computing the scalarization overhead. 6552 InstructionCost ScalarCost = 6553 VF.getFixedValue() * 6554 getInstructionCost(I, ElementCount::getFixed(1)).first; 6555 6556 // Compute the scalarization overhead of needed insertelement instructions 6557 // and phi nodes. 6558 if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) { 6559 ScalarCost += TTI.getScalarizationOverhead( 6560 cast<VectorType>(ToVectorTy(I->getType(), VF)), 6561 APInt::getAllOnes(VF.getFixedValue()), true, false); 6562 ScalarCost += 6563 VF.getFixedValue() * 6564 TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput); 6565 } 6566 6567 // Compute the scalarization overhead of needed extractelement 6568 // instructions. For each of the instruction's operands, if the operand can 6569 // be scalarized, add it to the worklist; otherwise, account for the 6570 // overhead. 6571 for (Use &U : I->operands()) 6572 if (auto *J = dyn_cast<Instruction>(U.get())) { 6573 assert(VectorType::isValidElementType(J->getType()) && 6574 "Instruction has non-scalar type"); 6575 if (canBeScalarized(J)) 6576 Worklist.push_back(J); 6577 else if (needsExtract(J, VF)) { 6578 ScalarCost += TTI.getScalarizationOverhead( 6579 cast<VectorType>(ToVectorTy(J->getType(), VF)), 6580 APInt::getAllOnes(VF.getFixedValue()), false, true); 6581 } 6582 } 6583 6584 // Scale the total scalar cost by block probability. 6585 ScalarCost /= getReciprocalPredBlockProb(); 6586 6587 // Compute the discount. A non-negative discount means the vector version 6588 // of the instruction costs more, and scalarizing would be beneficial. 6589 Discount += VectorCost - ScalarCost; 6590 ScalarCosts[I] = ScalarCost; 6591 } 6592 6593 return *Discount.getValue(); 6594 } 6595 6596 LoopVectorizationCostModel::VectorizationCostTy 6597 LoopVectorizationCostModel::expectedCost( 6598 ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) { 6599 VectorizationCostTy Cost; 6600 6601 // For each block. 6602 for (BasicBlock *BB : TheLoop->blocks()) { 6603 VectorizationCostTy BlockCost; 6604 6605 // For each instruction in the old loop. 6606 for (Instruction &I : BB->instructionsWithoutDebug()) { 6607 // Skip ignored values. 
6608 if (ValuesToIgnore.count(&I) || 6609 (VF.isVector() && VecValuesToIgnore.count(&I))) 6610 continue; 6611 6612 VectorizationCostTy C = getInstructionCost(&I, VF); 6613 6614 // Check if we should override the cost. 6615 if (C.first.isValid() && 6616 ForceTargetInstructionCost.getNumOccurrences() > 0) 6617 C.first = InstructionCost(ForceTargetInstructionCost); 6618 6619 // Keep a list of instructions with invalid costs. 6620 if (Invalid && !C.first.isValid()) 6621 Invalid->emplace_back(&I, VF); 6622 6623 BlockCost.first += C.first; 6624 BlockCost.second |= C.second; 6625 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first 6626 << " for VF " << VF << " For instruction: " << I 6627 << '\n'); 6628 } 6629 6630 // If we are vectorizing a predicated block, it will have been 6631 // if-converted. This means that the block's instructions (aside from 6632 // stores and instructions that may divide by zero) will now be 6633 // unconditionally executed. For the scalar case, we may not always execute 6634 // the predicated block, if it is an if-else block. Thus, scale the block's 6635 // cost by the probability of executing it. blockNeedsPredication from 6636 // Legal is used so as to not include all blocks in tail folded loops. 6637 if (VF.isScalar() && Legal->blockNeedsPredication(BB)) 6638 BlockCost.first /= getReciprocalPredBlockProb(); 6639 6640 Cost.first += BlockCost.first; 6641 Cost.second |= BlockCost.second; 6642 } 6643 6644 return Cost; 6645 } 6646 6647 /// Gets Address Access SCEV after verifying that the access pattern 6648 /// is loop invariant except the induction variable dependence. 6649 /// 6650 /// This SCEV can be sent to the Target in order to estimate the address 6651 /// calculation cost. 6652 static const SCEV *getAddressAccessSCEV( 6653 Value *Ptr, 6654 LoopVectorizationLegality *Legal, 6655 PredicatedScalarEvolution &PSE, 6656 const Loop *TheLoop) { 6657 6658 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); 6659 if (!Gep) 6660 return nullptr; 6661 6662 // We are looking for a gep with all loop invariant indices except for one 6663 // which should be an induction variable. 6664 auto SE = PSE.getSE(); 6665 unsigned NumOperands = Gep->getNumOperands(); 6666 for (unsigned i = 1; i < NumOperands; ++i) { 6667 Value *Opd = Gep->getOperand(i); 6668 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 6669 !Legal->isInductionVariable(Opd)) 6670 return nullptr; 6671 } 6672 6673 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. 6674 return PSE.getSCEV(Ptr); 6675 } 6676 6677 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { 6678 return Legal->hasStride(I->getOperand(0)) || 6679 Legal->hasStride(I->getOperand(1)); 6680 } 6681 6682 InstructionCost 6683 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, 6684 ElementCount VF) { 6685 assert(VF.isVector() && 6686 "Scalarization cost of instruction implies vectorization."); 6687 if (VF.isScalable()) 6688 return InstructionCost::getInvalid(); 6689 6690 Type *ValTy = getLoadStoreType(I); 6691 auto SE = PSE.getSE(); 6692 6693 unsigned AS = getLoadStoreAddressSpace(I); 6694 Value *Ptr = getLoadStorePointerOperand(I); 6695 Type *PtrTy = ToVectorTy(Ptr->getType(), VF); 6696 // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost` 6697 // that it is being called from this specific place. 
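// As a rough sketch of the costing below: a scalarized load at VF = 4 pays
// for 4 address computations, 4 scalar loads and the insert/extract
// scalarization overhead; if it is predicated, that sum is scaled down by the
// block probability and the i1 extract plus branch costs are added on top.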
6698 6699 // Figure out whether the access is strided and get the stride value 6700 // if it's known in compile time 6701 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); 6702 6703 // Get the cost of the scalar memory instruction and address computation. 6704 InstructionCost Cost = 6705 VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); 6706 6707 // Don't pass *I here, since it is scalar but will actually be part of a 6708 // vectorized loop where the user of it is a vectorized instruction. 6709 const Align Alignment = getLoadStoreAlignment(I); 6710 Cost += VF.getKnownMinValue() * 6711 TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, 6712 AS, TTI::TCK_RecipThroughput); 6713 6714 // Get the overhead of the extractelement and insertelement instructions 6715 // we might create due to scalarization. 6716 Cost += getScalarizationOverhead(I, VF); 6717 6718 // If we have a predicated load/store, it will need extra i1 extracts and 6719 // conditional branches, but may not be executed for each vector lane. Scale 6720 // the cost by the probability of executing the predicated block. 6721 if (isPredicatedInst(I, VF)) { 6722 Cost /= getReciprocalPredBlockProb(); 6723 6724 // Add the cost of an i1 extract and a branch 6725 auto *Vec_i1Ty = 6726 VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF); 6727 Cost += TTI.getScalarizationOverhead( 6728 Vec_i1Ty, APInt::getAllOnes(VF.getKnownMinValue()), 6729 /*Insert=*/false, /*Extract=*/true); 6730 Cost += TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput); 6731 6732 if (useEmulatedMaskMemRefHack(I, VF)) 6733 // Artificially setting to a high enough value to practically disable 6734 // vectorization with such operations. 6735 Cost = 3000000; 6736 } 6737 6738 return Cost; 6739 } 6740 6741 InstructionCost 6742 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, 6743 ElementCount VF) { 6744 Type *ValTy = getLoadStoreType(I); 6745 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6746 Value *Ptr = getLoadStorePointerOperand(I); 6747 unsigned AS = getLoadStoreAddressSpace(I); 6748 int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr); 6749 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6750 6751 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6752 "Stride should be 1 or -1 for consecutive memory access"); 6753 const Align Alignment = getLoadStoreAlignment(I); 6754 InstructionCost Cost = 0; 6755 if (Legal->isMaskRequired(I)) 6756 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6757 CostKind); 6758 else 6759 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6760 CostKind, I); 6761 6762 bool Reverse = ConsecutiveStride < 0; 6763 if (Reverse) 6764 Cost += 6765 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0); 6766 return Cost; 6767 } 6768 6769 InstructionCost 6770 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, 6771 ElementCount VF) { 6772 assert(Legal->isUniformMemOp(*I)); 6773 6774 Type *ValTy = getLoadStoreType(I); 6775 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6776 const Align Alignment = getLoadStoreAlignment(I); 6777 unsigned AS = getLoadStoreAddressSpace(I); 6778 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6779 if (isa<LoadInst>(I)) { 6780 return TTI.getAddressComputationCost(ValTy) + 6781 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS, 6782 CostKind) + 6783 
TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); 6784 } 6785 StoreInst *SI = cast<StoreInst>(I); 6786 6787 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); 6788 return TTI.getAddressComputationCost(ValTy) + 6789 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, 6790 CostKind) + 6791 (isLoopInvariantStoreValue 6792 ? 0 6793 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, 6794 VF.getKnownMinValue() - 1)); 6795 } 6796 6797 InstructionCost 6798 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, 6799 ElementCount VF) { 6800 Type *ValTy = getLoadStoreType(I); 6801 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6802 const Align Alignment = getLoadStoreAlignment(I); 6803 const Value *Ptr = getLoadStorePointerOperand(I); 6804 6805 return TTI.getAddressComputationCost(VectorTy) + 6806 TTI.getGatherScatterOpCost( 6807 I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment, 6808 TargetTransformInfo::TCK_RecipThroughput, I); 6809 } 6810 6811 InstructionCost 6812 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, 6813 ElementCount VF) { 6814 // TODO: Once we have support for interleaving with scalable vectors 6815 // we can calculate the cost properly here. 6816 if (VF.isScalable()) 6817 return InstructionCost::getInvalid(); 6818 6819 Type *ValTy = getLoadStoreType(I); 6820 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6821 unsigned AS = getLoadStoreAddressSpace(I); 6822 6823 auto Group = getInterleavedAccessGroup(I); 6824 assert(Group && "Fail to get an interleaved access group."); 6825 6826 unsigned InterleaveFactor = Group->getFactor(); 6827 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); 6828 6829 // Holds the indices of existing members in the interleaved group. 6830 SmallVector<unsigned, 4> Indices; 6831 for (unsigned IF = 0; IF < InterleaveFactor; IF++) 6832 if (Group->getMember(IF)) 6833 Indices.push_back(IF); 6834 6835 // Calculate the cost of the whole interleaved group. 6836 bool UseMaskForGaps = 6837 (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) || 6838 (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor())); 6839 InstructionCost Cost = TTI.getInterleavedMemoryOpCost( 6840 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(), 6841 AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps); 6842 6843 if (Group->isReverse()) { 6844 // TODO: Add support for reversed masked interleaved access. 6845 assert(!Legal->isMaskRequired(I) && 6846 "Reverse masked interleaved access not supported."); 6847 Cost += 6848 Group->getNumMembers() * 6849 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0); 6850 } 6851 return Cost; 6852 } 6853 6854 Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost( 6855 Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) { 6856 using namespace llvm::PatternMatch; 6857 // Early exit for no inloop reductions 6858 if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty)) 6859 return None; 6860 auto *VectorTy = cast<VectorType>(Ty); 6861 6862 // We are looking for a pattern of, and finding the minimal acceptable cost: 6863 // reduce(mul(ext(A), ext(B))) or 6864 // reduce(mul(A, B)) or 6865 // reduce(ext(A)) or 6866 // reduce(A). 6867 // The basic idea is that we walk down the tree to do that, finding the root 6868 // reduction instruction in InLoopReductionImmediateChains. 
From there we find
  // the pattern of mul/ext and test the cost of the entire pattern vs the cost
  // of the components. If the reduction cost is lower, we return it for the
  // reduction instruction and 0 for the other instructions in the pattern. If
  // it is not, we return an invalid cost specifying that the original cost
  // method should be used.
  Instruction *RetI = I;
  if (match(RetI, m_ZExtOrSExt(m_Value()))) {
    if (!RetI->hasOneUser())
      return None;
    RetI = RetI->user_back();
  }
  if (match(RetI, m_Mul(m_Value(), m_Value())) &&
      RetI->user_back()->getOpcode() == Instruction::Add) {
    if (!RetI->hasOneUser())
      return None;
    RetI = RetI->user_back();
  }

  // Test if the found instruction is a reduction, and if not return an invalid
  // cost specifying the parent to use the original cost modelling.
  if (!InLoopReductionImmediateChains.count(RetI))
    return None;

  // Find the reduction this chain is a part of and calculate the basic cost of
  // the reduction on its own.
  Instruction *LastChain = InLoopReductionImmediateChains[RetI];
  Instruction *ReductionPhi = LastChain;
  while (!isa<PHINode>(ReductionPhi))
    ReductionPhi = InLoopReductionImmediateChains[ReductionPhi];

  const RecurrenceDescriptor &RdxDesc =
      Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second;

  InstructionCost BaseCost = TTI.getArithmeticReductionCost(
      RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind);

  // For a call to the llvm.fmuladd intrinsic we need to add the cost of a
  // normal fmul instruction to the cost of the fadd reduction.
  if (RdxDesc.getRecurrenceKind() == RecurKind::FMulAdd)
    BaseCost +=
        TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind);

  // If we're using ordered reductions then we can just return the base cost
  // here, since getArithmeticReductionCost calculates the full ordered
  // reduction cost when FP reassociation is not allowed.
  if (useOrderedReductions(RdxDesc))
    return BaseCost;

  // Get the operand that was not the reduction chain and match it to one of
  // the patterns, returning the better cost if it is found.
  Instruction *RedOp = RetI->getOperand(1) == LastChain
                           ? dyn_cast<Instruction>(RetI->getOperand(0))
                           : dyn_cast<Instruction>(RetI->getOperand(1));

  VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy);

  Instruction *Op0, *Op1;
  if (RedOp &&
      match(RedOp,
            m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) &&
      match(Op0, m_ZExtOrSExt(m_Value())) &&
      Op0->getOpcode() == Op1->getOpcode() &&
      Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
      !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) &&
      (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {

    // Matched reduce(ext(mul(ext(A), ext(B))))
    // Note that the extend opcodes all need to match, or if A==B they will
    // have been converted to zext(mul(sext(A), sext(A))) as it is known
    // positive, which is equally fine.
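    // Illustrative sketch only; the types and costs below are hypothetical and
    // not taken from any particular target. For a source loop such as
    //   for (i) Sum += (int)A[i] * (int)B[i];   // A and B are arrays of i8
    // the chain is add-reduce(sext(mul(sext(A), sext(B)))), and the comparison
    // further down is effectively
    //   RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost
    // With, say, RedCost = 2, ExtCost = MulCost = Ext2Cost = 1 and
    // BaseCost = 4, the fused extended multiply-add reduction (2) beats the
    // piecewise cost (8), so 2 is returned for the reduction instruction and
    // 0 for the other instructions of the pattern.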
6939 bool IsUnsigned = isa<ZExtInst>(Op0); 6940 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy); 6941 auto *MulType = VectorType::get(Op0->getType(), VectorTy); 6942 6943 InstructionCost ExtCost = 6944 TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType, 6945 TTI::CastContextHint::None, CostKind, Op0); 6946 InstructionCost MulCost = 6947 TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind); 6948 InstructionCost Ext2Cost = 6949 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType, 6950 TTI::CastContextHint::None, CostKind, RedOp); 6951 6952 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 6953 /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 6954 CostKind); 6955 6956 if (RedCost.isValid() && 6957 RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost) 6958 return I == RetI ? RedCost : 0; 6959 } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) && 6960 !TheLoop->isLoopInvariant(RedOp)) { 6961 // Matched reduce(ext(A)) 6962 bool IsUnsigned = isa<ZExtInst>(RedOp); 6963 auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy); 6964 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 6965 /*IsMLA=*/false, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 6966 CostKind); 6967 6968 InstructionCost ExtCost = 6969 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType, 6970 TTI::CastContextHint::None, CostKind, RedOp); 6971 if (RedCost.isValid() && RedCost < BaseCost + ExtCost) 6972 return I == RetI ? RedCost : 0; 6973 } else if (RedOp && 6974 match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) { 6975 if (match(Op0, m_ZExtOrSExt(m_Value())) && 6976 Op0->getOpcode() == Op1->getOpcode() && 6977 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) { 6978 bool IsUnsigned = isa<ZExtInst>(Op0); 6979 Type *Op0Ty = Op0->getOperand(0)->getType(); 6980 Type *Op1Ty = Op1->getOperand(0)->getType(); 6981 Type *LargestOpTy = 6982 Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty 6983 : Op0Ty; 6984 auto *ExtType = VectorType::get(LargestOpTy, VectorTy); 6985 6986 // Matched reduce(mul(ext(A), ext(B))), where the two ext may be of 6987 // different sizes. We take the largest type as the ext to reduce, and add 6988 // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))). 6989 InstructionCost ExtCost0 = TTI.getCastInstrCost( 6990 Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy), 6991 TTI::CastContextHint::None, CostKind, Op0); 6992 InstructionCost ExtCost1 = TTI.getCastInstrCost( 6993 Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy), 6994 TTI::CastContextHint::None, CostKind, Op1); 6995 InstructionCost MulCost = 6996 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 6997 6998 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 6999 /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 7000 CostKind); 7001 InstructionCost ExtraExtCost = 0; 7002 if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) { 7003 Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1; 7004 ExtraExtCost = TTI.getCastInstrCost( 7005 ExtraExtOp->getOpcode(), ExtType, 7006 VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy), 7007 TTI::CastContextHint::None, CostKind, ExtraExtOp); 7008 } 7009 7010 if (RedCost.isValid() && 7011 (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost)) 7012 return I == RetI ? 
RedCost : 0; 7013 } else if (!match(I, m_ZExtOrSExt(m_Value()))) { 7014 // Matched reduce(mul()) 7015 InstructionCost MulCost = 7016 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 7017 7018 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 7019 /*IsMLA=*/true, true, RdxDesc.getRecurrenceType(), VectorTy, 7020 CostKind); 7021 7022 if (RedCost.isValid() && RedCost < MulCost + BaseCost) 7023 return I == RetI ? RedCost : 0; 7024 } 7025 } 7026 7027 return I == RetI ? Optional<InstructionCost>(BaseCost) : None; 7028 } 7029 7030 InstructionCost 7031 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, 7032 ElementCount VF) { 7033 // Calculate scalar cost only. Vectorization cost should be ready at this 7034 // moment. 7035 if (VF.isScalar()) { 7036 Type *ValTy = getLoadStoreType(I); 7037 const Align Alignment = getLoadStoreAlignment(I); 7038 unsigned AS = getLoadStoreAddressSpace(I); 7039 7040 return TTI.getAddressComputationCost(ValTy) + 7041 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, 7042 TTI::TCK_RecipThroughput, I); 7043 } 7044 return getWideningCost(I, VF); 7045 } 7046 7047 LoopVectorizationCostModel::VectorizationCostTy 7048 LoopVectorizationCostModel::getInstructionCost(Instruction *I, 7049 ElementCount VF) { 7050 // If we know that this instruction will remain uniform, check the cost of 7051 // the scalar version. 7052 if (isUniformAfterVectorization(I, VF)) 7053 VF = ElementCount::getFixed(1); 7054 7055 if (VF.isVector() && isProfitableToScalarize(I, VF)) 7056 return VectorizationCostTy(InstsToScalarize[VF][I], false); 7057 7058 // Forced scalars do not have any scalarization overhead. 7059 auto ForcedScalar = ForcedScalars.find(VF); 7060 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) { 7061 auto InstSet = ForcedScalar->second; 7062 if (InstSet.count(I)) 7063 return VectorizationCostTy( 7064 (getInstructionCost(I, ElementCount::getFixed(1)).first * 7065 VF.getKnownMinValue()), 7066 false); 7067 } 7068 7069 Type *VectorTy; 7070 InstructionCost C = getInstructionCost(I, VF, VectorTy); 7071 7072 bool TypeNotScalarized = false; 7073 if (VF.isVector() && VectorTy->isVectorTy()) { 7074 unsigned NumParts = TTI.getNumberOfParts(VectorTy); 7075 if (NumParts) 7076 TypeNotScalarized = NumParts < VF.getKnownMinValue(); 7077 else 7078 C = InstructionCost::getInvalid(); 7079 } 7080 return VectorizationCostTy(C, TypeNotScalarized); 7081 } 7082 7083 InstructionCost 7084 LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, 7085 ElementCount VF) const { 7086 7087 // There is no mechanism yet to create a scalable scalarization loop, 7088 // so this is currently Invalid. 7089 if (VF.isScalable()) 7090 return InstructionCost::getInvalid(); 7091 7092 if (VF.isScalar()) 7093 return 0; 7094 7095 InstructionCost Cost = 0; 7096 Type *RetTy = ToVectorTy(I->getType(), VF); 7097 if (!RetTy->isVoidTy() && 7098 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) 7099 Cost += TTI.getScalarizationOverhead( 7100 cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()), true, 7101 false); 7102 7103 // Some targets keep addresses scalar. 7104 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) 7105 return Cost; 7106 7107 // Some targets support efficient element stores. 7108 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore()) 7109 return Cost; 7110 7111 // Collect operands to consider. 7112 CallInst *CI = dyn_cast<CallInst>(I); 7113 Instruction::op_range Ops = CI ? 
CI->args() : I->operands(); 7114 7115 // Skip operands that do not require extraction/scalarization and do not incur 7116 // any overhead. 7117 SmallVector<Type *> Tys; 7118 for (auto *V : filterExtractingOperands(Ops, VF)) 7119 Tys.push_back(MaybeVectorizeType(V->getType(), VF)); 7120 return Cost + TTI.getOperandsScalarizationOverhead( 7121 filterExtractingOperands(Ops, VF), Tys); 7122 } 7123 7124 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { 7125 if (VF.isScalar()) 7126 return; 7127 NumPredStores = 0; 7128 for (BasicBlock *BB : TheLoop->blocks()) { 7129 // For each instruction in the old loop. 7130 for (Instruction &I : *BB) { 7131 Value *Ptr = getLoadStorePointerOperand(&I); 7132 if (!Ptr) 7133 continue; 7134 7135 // TODO: We should generate better code and update the cost model for 7136 // predicated uniform stores. Today they are treated as any other 7137 // predicated store (see added test cases in 7138 // invariant-store-vectorization.ll). 7139 if (isa<StoreInst>(&I) && isScalarWithPredication(&I, VF)) 7140 NumPredStores++; 7141 7142 if (Legal->isUniformMemOp(I)) { 7143 // TODO: Avoid replicating loads and stores instead of 7144 // relying on instcombine to remove them. 7145 // Load: Scalar load + broadcast 7146 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract 7147 InstructionCost Cost; 7148 if (isa<StoreInst>(&I) && VF.isScalable() && 7149 isLegalGatherOrScatter(&I, VF)) { 7150 Cost = getGatherScatterCost(&I, VF); 7151 setWideningDecision(&I, VF, CM_GatherScatter, Cost); 7152 } else { 7153 assert((isa<LoadInst>(&I) || !VF.isScalable()) && 7154 "Cannot yet scalarize uniform stores"); 7155 Cost = getUniformMemOpCost(&I, VF); 7156 setWideningDecision(&I, VF, CM_Scalarize, Cost); 7157 } 7158 continue; 7159 } 7160 7161 // We assume that widening is the best solution when possible. 7162 if (memoryInstructionCanBeWidened(&I, VF)) { 7163 InstructionCost Cost = getConsecutiveMemOpCost(&I, VF); 7164 int ConsecutiveStride = Legal->isConsecutivePtr( 7165 getLoadStoreType(&I), getLoadStorePointerOperand(&I)); 7166 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 7167 "Expected consecutive stride."); 7168 InstWidening Decision = 7169 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; 7170 setWideningDecision(&I, VF, Decision, Cost); 7171 continue; 7172 } 7173 7174 // Choose between Interleaving, Gather/Scatter or Scalarization. 7175 InstructionCost InterleaveCost = InstructionCost::getInvalid(); 7176 unsigned NumAccesses = 1; 7177 if (isAccessInterleaved(&I)) { 7178 auto Group = getInterleavedAccessGroup(&I); 7179 assert(Group && "Fail to get an interleaved access group."); 7180 7181 // Make one decision for the whole group. 7182 if (getWideningDecision(&I, VF) != CM_Unknown) 7183 continue; 7184 7185 NumAccesses = Group->getNumMembers(); 7186 if (interleavedAccessCanBeWidened(&I, VF)) 7187 InterleaveCost = getInterleaveGroupCost(&I, VF); 7188 } 7189 7190 InstructionCost GatherScatterCost = 7191 isLegalGatherOrScatter(&I, VF) 7192 ? getGatherScatterCost(&I, VF) * NumAccesses 7193 : InstructionCost::getInvalid(); 7194 7195 InstructionCost ScalarizationCost = 7196 getMemInstScalarizationCost(&I, VF) * NumAccesses; 7197 7198 // Choose better solution for the current VF, 7199 // write down this decision and use it during vectorization. 
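      // Illustrative numbers only (hypothetical, not from any target): if the
      // candidate costs were InterleaveCost = 8, GatherScatterCost = 12 and
      // ScalarizationCost = 20 for this group, the comparisons below would
      // pick CM_Interleave with Cost = 8. An invalid InterleaveCost (e.g. for
      // a scalable VF) compares as more expensive than any valid cost, so the
      // decision then falls through to gather/scatter or scalarization.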
7200 InstructionCost Cost; 7201 InstWidening Decision; 7202 if (InterleaveCost <= GatherScatterCost && 7203 InterleaveCost < ScalarizationCost) { 7204 Decision = CM_Interleave; 7205 Cost = InterleaveCost; 7206 } else if (GatherScatterCost < ScalarizationCost) { 7207 Decision = CM_GatherScatter; 7208 Cost = GatherScatterCost; 7209 } else { 7210 Decision = CM_Scalarize; 7211 Cost = ScalarizationCost; 7212 } 7213 // If the instructions belongs to an interleave group, the whole group 7214 // receives the same decision. The whole group receives the cost, but 7215 // the cost will actually be assigned to one instruction. 7216 if (auto Group = getInterleavedAccessGroup(&I)) 7217 setWideningDecision(Group, VF, Decision, Cost); 7218 else 7219 setWideningDecision(&I, VF, Decision, Cost); 7220 } 7221 } 7222 7223 // Make sure that any load of address and any other address computation 7224 // remains scalar unless there is gather/scatter support. This avoids 7225 // inevitable extracts into address registers, and also has the benefit of 7226 // activating LSR more, since that pass can't optimize vectorized 7227 // addresses. 7228 if (TTI.prefersVectorizedAddressing()) 7229 return; 7230 7231 // Start with all scalar pointer uses. 7232 SmallPtrSet<Instruction *, 8> AddrDefs; 7233 for (BasicBlock *BB : TheLoop->blocks()) 7234 for (Instruction &I : *BB) { 7235 Instruction *PtrDef = 7236 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 7237 if (PtrDef && TheLoop->contains(PtrDef) && 7238 getWideningDecision(&I, VF) != CM_GatherScatter) 7239 AddrDefs.insert(PtrDef); 7240 } 7241 7242 // Add all instructions used to generate the addresses. 7243 SmallVector<Instruction *, 4> Worklist; 7244 append_range(Worklist, AddrDefs); 7245 while (!Worklist.empty()) { 7246 Instruction *I = Worklist.pop_back_val(); 7247 for (auto &Op : I->operands()) 7248 if (auto *InstOp = dyn_cast<Instruction>(Op)) 7249 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) && 7250 AddrDefs.insert(InstOp).second) 7251 Worklist.push_back(InstOp); 7252 } 7253 7254 for (auto *I : AddrDefs) { 7255 if (isa<LoadInst>(I)) { 7256 // Setting the desired widening decision should ideally be handled in 7257 // by cost functions, but since this involves the task of finding out 7258 // if the loaded register is involved in an address computation, it is 7259 // instead changed here when we know this is the case. 7260 InstWidening Decision = getWideningDecision(I, VF); 7261 if (Decision == CM_Widen || Decision == CM_Widen_Reverse) 7262 // Scalarize a widened load of address. 7263 setWideningDecision( 7264 I, VF, CM_Scalarize, 7265 (VF.getKnownMinValue() * 7266 getMemoryInstructionCost(I, ElementCount::getFixed(1)))); 7267 else if (auto Group = getInterleavedAccessGroup(I)) { 7268 // Scalarize an interleave group of address loads. 7269 for (unsigned I = 0; I < Group->getFactor(); ++I) { 7270 if (Instruction *Member = Group->getMember(I)) 7271 setWideningDecision( 7272 Member, VF, CM_Scalarize, 7273 (VF.getKnownMinValue() * 7274 getMemoryInstructionCost(Member, ElementCount::getFixed(1)))); 7275 } 7276 } 7277 } else 7278 // Make sure I gets scalarized and a cost estimate without 7279 // scalarization overhead. 
7280 ForcedScalars[VF].insert(I); 7281 } 7282 } 7283 7284 InstructionCost 7285 LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, 7286 Type *&VectorTy) { 7287 Type *RetTy = I->getType(); 7288 if (canTruncateToMinimalBitwidth(I, VF)) 7289 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); 7290 auto SE = PSE.getSE(); 7291 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 7292 7293 auto hasSingleCopyAfterVectorization = [this](Instruction *I, 7294 ElementCount VF) -> bool { 7295 if (VF.isScalar()) 7296 return true; 7297 7298 auto Scalarized = InstsToScalarize.find(VF); 7299 assert(Scalarized != InstsToScalarize.end() && 7300 "VF not yet analyzed for scalarization profitability"); 7301 return !Scalarized->second.count(I) && 7302 llvm::all_of(I->users(), [&](User *U) { 7303 auto *UI = cast<Instruction>(U); 7304 return !Scalarized->second.count(UI); 7305 }); 7306 }; 7307 (void) hasSingleCopyAfterVectorization; 7308 7309 if (isScalarAfterVectorization(I, VF)) { 7310 // With the exception of GEPs and PHIs, after scalarization there should 7311 // only be one copy of the instruction generated in the loop. This is 7312 // because the VF is either 1, or any instructions that need scalarizing 7313 // have already been dealt with by the the time we get here. As a result, 7314 // it means we don't have to multiply the instruction cost by VF. 7315 assert(I->getOpcode() == Instruction::GetElementPtr || 7316 I->getOpcode() == Instruction::PHI || 7317 (I->getOpcode() == Instruction::BitCast && 7318 I->getType()->isPointerTy()) || 7319 hasSingleCopyAfterVectorization(I, VF)); 7320 VectorTy = RetTy; 7321 } else 7322 VectorTy = ToVectorTy(RetTy, VF); 7323 7324 // TODO: We need to estimate the cost of intrinsic calls. 7325 switch (I->getOpcode()) { 7326 case Instruction::GetElementPtr: 7327 // We mark this instruction as zero-cost because the cost of GEPs in 7328 // vectorized code depends on whether the corresponding memory instruction 7329 // is scalarized or not. Therefore, we handle GEPs with the memory 7330 // instruction cost. 7331 return 0; 7332 case Instruction::Br: { 7333 // In cases of scalarized and predicated instructions, there will be VF 7334 // predicated blocks in the vectorized loop. Each branch around these 7335 // blocks requires also an extract of its vector compare i1 element. 7336 bool ScalarPredicatedBB = false; 7337 BranchInst *BI = cast<BranchInst>(I); 7338 if (VF.isVector() && BI->isConditional() && 7339 (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) || 7340 PredicatedBBsAfterVectorization.count(BI->getSuccessor(1)))) 7341 ScalarPredicatedBB = true; 7342 7343 if (ScalarPredicatedBB) { 7344 // Not possible to scalarize scalable vector with predicated instructions. 7345 if (VF.isScalable()) 7346 return InstructionCost::getInvalid(); 7347 // Return cost for branches around scalarized and predicated blocks. 7348 auto *Vec_i1Ty = 7349 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); 7350 return ( 7351 TTI.getScalarizationOverhead( 7352 Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()), false, true) + 7353 (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue())); 7354 } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar()) 7355 // The back-edge branch will remain, as will all scalar branches. 7356 return TTI.getCFInstrCost(Instruction::Br, CostKind); 7357 else 7358 // This branch will be eliminated by if-conversion. 
7359 return 0; 7360 // Note: We currently assume zero cost for an unconditional branch inside 7361 // a predicated block since it will become a fall-through, although we 7362 // may decide in the future to call TTI for all branches. 7363 } 7364 case Instruction::PHI: { 7365 auto *Phi = cast<PHINode>(I); 7366 7367 // First-order recurrences are replaced by vector shuffles inside the loop. 7368 // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. 7369 if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi)) 7370 return TTI.getShuffleCost( 7371 TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy), 7372 None, VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1)); 7373 7374 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 7375 // converted into select instructions. We require N - 1 selects per phi 7376 // node, where N is the number of incoming values. 7377 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) 7378 return (Phi->getNumIncomingValues() - 1) * 7379 TTI.getCmpSelInstrCost( 7380 Instruction::Select, ToVectorTy(Phi->getType(), VF), 7381 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF), 7382 CmpInst::BAD_ICMP_PREDICATE, CostKind); 7383 7384 return TTI.getCFInstrCost(Instruction::PHI, CostKind); 7385 } 7386 case Instruction::UDiv: 7387 case Instruction::SDiv: 7388 case Instruction::URem: 7389 case Instruction::SRem: 7390 // If we have a predicated instruction, it may not be executed for each 7391 // vector lane. Get the scalarization cost and scale this amount by the 7392 // probability of executing the predicated block. If the instruction is not 7393 // predicated, we fall through to the next case. 7394 if (VF.isVector() && isScalarWithPredication(I, VF)) { 7395 InstructionCost Cost = 0; 7396 7397 // These instructions have a non-void type, so account for the phi nodes 7398 // that we will create. This cost is likely to be zero. The phi node 7399 // cost, if any, should be scaled by the block probability because it 7400 // models a copy at the end of each predicated block. 7401 Cost += VF.getKnownMinValue() * 7402 TTI.getCFInstrCost(Instruction::PHI, CostKind); 7403 7404 // The cost of the non-predicated instruction. 7405 Cost += VF.getKnownMinValue() * 7406 TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind); 7407 7408 // The cost of insertelement and extractelement instructions needed for 7409 // scalarization. 7410 Cost += getScalarizationOverhead(I, VF); 7411 7412 // Scale the cost by the probability of executing the predicated blocks. 7413 // This assumes the predicated block for each vector lane is equally 7414 // likely. 7415 return Cost / getReciprocalPredBlockProb(); 7416 } 7417 LLVM_FALLTHROUGH; 7418 case Instruction::Add: 7419 case Instruction::FAdd: 7420 case Instruction::Sub: 7421 case Instruction::FSub: 7422 case Instruction::Mul: 7423 case Instruction::FMul: 7424 case Instruction::FDiv: 7425 case Instruction::FRem: 7426 case Instruction::Shl: 7427 case Instruction::LShr: 7428 case Instruction::AShr: 7429 case Instruction::And: 7430 case Instruction::Or: 7431 case Instruction::Xor: { 7432 // Since we will replace the stride by 1 the multiplication should go away. 
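    // (Illustrative, hypothetical example: for an access like A[i * Stride],
    // where loop versioning lets us assume the symbolic Stride is 1, the index
    // multiply is matched by isStrideMul below and modeled as free, since it
    // folds away once the stride is replaced by 1.)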
    if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
      return 0;

    // Detect reduction patterns
    if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
      return *RedCost;

    // Certain instructions can be cheaper to vectorize if they have a constant
    // second vector operand. One example of this is shifts on x86.
    Value *Op2 = I->getOperand(1);
    TargetTransformInfo::OperandValueProperties Op2VP;
    TargetTransformInfo::OperandValueKind Op2VK =
        TTI.getOperandInfo(Op2, Op2VP);
    if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
      Op2VK = TargetTransformInfo::OK_UniformValue;

    SmallVector<const Value *, 4> Operands(I->operand_values());
    return TTI.getArithmeticInstrCost(
        I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue,
        Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
  }
  case Instruction::FNeg: {
    return TTI.getArithmeticInstrCost(
        I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue,
        TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None,
        TargetTransformInfo::OP_None, I->getOperand(0), I);
  }
  case Instruction::Select: {
    SelectInst *SI = cast<SelectInst>(I);
    const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
    bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));

    const Value *Op0, *Op1;
    using namespace llvm::PatternMatch;
    if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) ||
                        match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) {
      // select x, y, false --> x & y
      // select x, true, y --> x | y
      TTI::OperandValueProperties Op1VP = TTI::OP_None;
      TTI::OperandValueProperties Op2VP = TTI::OP_None;
      TTI::OperandValueKind Op1VK = TTI::getOperandInfo(Op0, Op1VP);
      TTI::OperandValueKind Op2VK = TTI::getOperandInfo(Op1, Op2VP);
      assert(Op0->getType()->getScalarSizeInBits() == 1 &&
             Op1->getType()->getScalarSizeInBits() == 1);

      SmallVector<const Value *, 2> Operands{Op0, Op1};
      return TTI.getArithmeticInstrCost(
          match(I, m_LogicalOr()) ?
Instruction::Or : Instruction::And, VectorTy, 7481 CostKind, Op1VK, Op2VK, Op1VP, Op2VP, Operands, I); 7482 } 7483 7484 Type *CondTy = SI->getCondition()->getType(); 7485 if (!ScalarCond) 7486 CondTy = VectorType::get(CondTy, VF); 7487 7488 CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE; 7489 if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition())) 7490 Pred = Cmp->getPredicate(); 7491 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred, 7492 CostKind, I); 7493 } 7494 case Instruction::ICmp: 7495 case Instruction::FCmp: { 7496 Type *ValTy = I->getOperand(0)->getType(); 7497 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 7498 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 7499 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); 7500 VectorTy = ToVectorTy(ValTy, VF); 7501 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, 7502 cast<CmpInst>(I)->getPredicate(), CostKind, 7503 I); 7504 } 7505 case Instruction::Store: 7506 case Instruction::Load: { 7507 ElementCount Width = VF; 7508 if (Width.isVector()) { 7509 InstWidening Decision = getWideningDecision(I, Width); 7510 assert(Decision != CM_Unknown && 7511 "CM decision should be taken at this point"); 7512 if (Decision == CM_Scalarize) 7513 Width = ElementCount::getFixed(1); 7514 } 7515 VectorTy = ToVectorTy(getLoadStoreType(I), Width); 7516 return getMemoryInstructionCost(I, VF); 7517 } 7518 case Instruction::BitCast: 7519 if (I->getType()->isPointerTy()) 7520 return 0; 7521 LLVM_FALLTHROUGH; 7522 case Instruction::ZExt: 7523 case Instruction::SExt: 7524 case Instruction::FPToUI: 7525 case Instruction::FPToSI: 7526 case Instruction::FPExt: 7527 case Instruction::PtrToInt: 7528 case Instruction::IntToPtr: 7529 case Instruction::SIToFP: 7530 case Instruction::UIToFP: 7531 case Instruction::Trunc: 7532 case Instruction::FPTrunc: { 7533 // Computes the CastContextHint from a Load/Store instruction. 7534 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint { 7535 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 7536 "Expected a load or a store!"); 7537 7538 if (VF.isScalar() || !TheLoop->contains(I)) 7539 return TTI::CastContextHint::Normal; 7540 7541 switch (getWideningDecision(I, VF)) { 7542 case LoopVectorizationCostModel::CM_GatherScatter: 7543 return TTI::CastContextHint::GatherScatter; 7544 case LoopVectorizationCostModel::CM_Interleave: 7545 return TTI::CastContextHint::Interleave; 7546 case LoopVectorizationCostModel::CM_Scalarize: 7547 case LoopVectorizationCostModel::CM_Widen: 7548 return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked 7549 : TTI::CastContextHint::Normal; 7550 case LoopVectorizationCostModel::CM_Widen_Reverse: 7551 return TTI::CastContextHint::Reversed; 7552 case LoopVectorizationCostModel::CM_Unknown: 7553 llvm_unreachable("Instr did not go through cost modelling?"); 7554 } 7555 7556 llvm_unreachable("Unhandled case!"); 7557 }; 7558 7559 unsigned Opcode = I->getOpcode(); 7560 TTI::CastContextHint CCH = TTI::CastContextHint::None; 7561 // For Trunc, the context is the only user, which must be a StoreInst. 7562 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) { 7563 if (I->hasOneUse()) 7564 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin())) 7565 CCH = ComputeCCH(Store); 7566 } 7567 // For Z/Sext, the context is the operand, which must be a LoadInst. 
7568 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || 7569 Opcode == Instruction::FPExt) { 7570 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0))) 7571 CCH = ComputeCCH(Load); 7572 } 7573 7574 // We optimize the truncation of induction variables having constant 7575 // integer steps. The cost of these truncations is the same as the scalar 7576 // operation. 7577 if (isOptimizableIVTruncate(I, VF)) { 7578 auto *Trunc = cast<TruncInst>(I); 7579 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 7580 Trunc->getSrcTy(), CCH, CostKind, Trunc); 7581 } 7582 7583 // Detect reduction patterns 7584 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7585 return *RedCost; 7586 7587 Type *SrcScalarTy = I->getOperand(0)->getType(); 7588 Type *SrcVecTy = 7589 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; 7590 if (canTruncateToMinimalBitwidth(I, VF)) { 7591 // This cast is going to be shrunk. This may remove the cast or it might 7592 // turn it into slightly different cast. For example, if MinBW == 16, 7593 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". 7594 // 7595 // Calculate the modified src and dest types. 7596 Type *MinVecTy = VectorTy; 7597 if (Opcode == Instruction::Trunc) { 7598 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); 7599 VectorTy = 7600 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7601 } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) { 7602 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); 7603 VectorTy = 7604 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7605 } 7606 } 7607 7608 return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); 7609 } 7610 case Instruction::Call: { 7611 if (RecurrenceDescriptor::isFMulAddIntrinsic(I)) 7612 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7613 return *RedCost; 7614 bool NeedToScalarize; 7615 CallInst *CI = cast<CallInst>(I); 7616 InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize); 7617 if (getVectorIntrinsicIDForCall(CI, TLI)) { 7618 InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF); 7619 return std::min(CallCost, IntrinsicCost); 7620 } 7621 return CallCost; 7622 } 7623 case Instruction::ExtractValue: 7624 return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput); 7625 case Instruction::Alloca: 7626 // We cannot easily widen alloca to a scalable alloca, as 7627 // the result would need to be a vector of pointers. 7628 if (VF.isScalable()) 7629 return InstructionCost::getInvalid(); 7630 LLVM_FALLTHROUGH; 7631 default: 7632 // This opcode is unknown. Assume that it is the same as 'mul'. 7633 return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 7634 } // end of switch. 
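  // Worked example (hypothetical, for illustration only): for a plain 32-bit
  // `add` with VF = 4, the arithmetic case above boils down to
  //   TTI.getArithmeticInstrCost(Instruction::Add, <4 x i32>, CostKind);
  // the VectorizationCostTy wrapper defined earlier then compares
  // TTI.getNumberOfParts(<4 x i32>) against the VF to decide whether the type
  // was effectively scalarized.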
7635 } 7636 7637 char LoopVectorize::ID = 0; 7638 7639 static const char lv_name[] = "Loop Vectorization"; 7640 7641 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 7642 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 7643 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) 7644 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 7645 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) 7646 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 7647 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 7648 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 7649 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 7650 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 7651 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) 7652 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) 7653 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 7654 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 7655 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy) 7656 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 7657 7658 namespace llvm { 7659 7660 Pass *createLoopVectorizePass() { return new LoopVectorize(); } 7661 7662 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced, 7663 bool VectorizeOnlyWhenForced) { 7664 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced); 7665 } 7666 7667 } // end namespace llvm 7668 7669 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { 7670 // Check if the pointer operand of a load or store instruction is 7671 // consecutive. 7672 if (auto *Ptr = getLoadStorePointerOperand(Inst)) 7673 return Legal->isConsecutivePtr(getLoadStoreType(Inst), Ptr); 7674 return false; 7675 } 7676 7677 void LoopVectorizationCostModel::collectValuesToIgnore() { 7678 // Ignore ephemeral values. 7679 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); 7680 7681 // Ignore type-promoting instructions we identified during reduction 7682 // detection. 7683 for (auto &Reduction : Legal->getReductionVars()) { 7684 const RecurrenceDescriptor &RedDes = Reduction.second; 7685 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); 7686 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7687 } 7688 // Ignore type-casting instructions we identified during induction 7689 // detection. 7690 for (auto &Induction : Legal->getInductionVars()) { 7691 const InductionDescriptor &IndDes = Induction.second; 7692 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 7693 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7694 } 7695 } 7696 7697 void LoopVectorizationCostModel::collectInLoopReductions() { 7698 for (auto &Reduction : Legal->getReductionVars()) { 7699 PHINode *Phi = Reduction.first; 7700 const RecurrenceDescriptor &RdxDesc = Reduction.second; 7701 7702 // We don't collect reductions that are type promoted (yet). 7703 if (RdxDesc.getRecurrenceType() != Phi->getType()) 7704 continue; 7705 7706 // If the target would prefer this reduction to happen "in-loop", then we 7707 // want to record it as such. 7708 unsigned Opcode = RdxDesc.getOpcode(); 7709 if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) && 7710 !TTI.preferInLoopReduction(Opcode, Phi->getType(), 7711 TargetTransformInfo::ReductionFlags())) 7712 continue; 7713 7714 // Check that we can correctly put the reductions into the loop, by 7715 // finding the chain of operations that leads from the phi to the loop 7716 // exit value. 
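    // Illustrative example (hypothetical IR names): for a chain like
    //   %sum      = phi i32 [ 0, %preheader ], [ %sum.next, %latch ]
    //   %mul      = mul i32 %a, %b
    //   %sum.next = add i32 %sum, %mul
    // getReductionOpChain would typically return { %sum.next }, and the loop
    // below records InLoopReductionImmediateChains[%sum.next] = %sum so the
    // cost model can walk back from any chain element to its reduction phi.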
7717 SmallVector<Instruction *, 4> ReductionOperations = 7718 RdxDesc.getReductionOpChain(Phi, TheLoop); 7719 bool InLoop = !ReductionOperations.empty(); 7720 if (InLoop) { 7721 InLoopReductionChains[Phi] = ReductionOperations; 7722 // Add the elements to InLoopReductionImmediateChains for cost modelling. 7723 Instruction *LastChain = Phi; 7724 for (auto *I : ReductionOperations) { 7725 InLoopReductionImmediateChains[I] = LastChain; 7726 LastChain = I; 7727 } 7728 } 7729 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop") 7730 << " reduction for phi: " << *Phi << "\n"); 7731 } 7732 } 7733 7734 // TODO: we could return a pair of values that specify the max VF and 7735 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of 7736 // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment 7737 // doesn't have a cost model that can choose which plan to execute if 7738 // more than one is generated. 7739 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits, 7740 LoopVectorizationCostModel &CM) { 7741 unsigned WidestType; 7742 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes(); 7743 return WidestVectorRegBits / WidestType; 7744 } 7745 7746 VectorizationFactor 7747 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { 7748 assert(!UserVF.isScalable() && "scalable vectors not yet supported"); 7749 ElementCount VF = UserVF; 7750 // Outer loop handling: They may require CFG and instruction level 7751 // transformations before even evaluating whether vectorization is profitable. 7752 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 7753 // the vectorization pipeline. 7754 if (!OrigLoop->isInnermost()) { 7755 // If the user doesn't provide a vectorization factor, determine a 7756 // reasonable one. 7757 if (UserVF.isZero()) { 7758 VF = ElementCount::getFixed(determineVPlanVF( 7759 TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector) 7760 .getFixedSize(), 7761 CM)); 7762 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n"); 7763 7764 // Make sure we have a VF > 1 for stress testing. 7765 if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) { 7766 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: " 7767 << "overriding computed VF.\n"); 7768 VF = ElementCount::getFixed(4); 7769 } 7770 } 7771 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 7772 assert(isPowerOf2_32(VF.getKnownMinValue()) && 7773 "VF needs to be a power of two"); 7774 LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "") 7775 << "VF " << VF << " to build VPlans.\n"); 7776 buildVPlans(VF, VF); 7777 7778 // For VPlan build stress testing, we bail out after VPlan construction. 7779 if (VPlanBuildStressTest) 7780 return VectorizationFactor::Disabled(); 7781 7782 return {VF, 0 /*Cost*/}; 7783 } 7784 7785 LLVM_DEBUG( 7786 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the " 7787 "VPlan-native path.\n"); 7788 return VectorizationFactor::Disabled(); 7789 } 7790 7791 Optional<VectorizationFactor> 7792 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { 7793 assert(OrigLoop->isInnermost() && "Inner loop expected."); 7794 FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC); 7795 if (!MaxFactors) // Cases that should not to be vectorized nor interleaved. 7796 return None; 7797 7798 // Invalidate interleave groups if all blocks of loop will be predicated. 
7799 if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) && 7800 !useMaskedInterleavedAccesses(*TTI)) { 7801 LLVM_DEBUG( 7802 dbgs() 7803 << "LV: Invalidate all interleaved groups due to fold-tail by masking " 7804 "which requires masked-interleaved support.\n"); 7805 if (CM.InterleaveInfo.invalidateGroups()) 7806 // Invalidating interleave groups also requires invalidating all decisions 7807 // based on them, which includes widening decisions and uniform and scalar 7808 // values. 7809 CM.invalidateCostModelingDecisions(); 7810 } 7811 7812 ElementCount MaxUserVF = 7813 UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF; 7814 bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF); 7815 if (!UserVF.isZero() && UserVFIsLegal) { 7816 assert(isPowerOf2_32(UserVF.getKnownMinValue()) && 7817 "VF needs to be a power of two"); 7818 // Collect the instructions (and their associated costs) that will be more 7819 // profitable to scalarize. 7820 if (CM.selectUserVectorizationFactor(UserVF)) { 7821 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); 7822 CM.collectInLoopReductions(); 7823 buildVPlansWithVPRecipes(UserVF, UserVF); 7824 LLVM_DEBUG(printPlans(dbgs())); 7825 return {{UserVF, 0}}; 7826 } else 7827 reportVectorizationInfo("UserVF ignored because of invalid costs.", 7828 "InvalidCost", ORE, OrigLoop); 7829 } 7830 7831 // Populate the set of Vectorization Factor Candidates. 7832 ElementCountSet VFCandidates; 7833 for (auto VF = ElementCount::getFixed(1); 7834 ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2) 7835 VFCandidates.insert(VF); 7836 for (auto VF = ElementCount::getScalable(1); 7837 ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2) 7838 VFCandidates.insert(VF); 7839 7840 for (const auto &VF : VFCandidates) { 7841 // Collect Uniform and Scalar instructions after vectorization with VF. 7842 CM.collectUniformsAndScalars(VF); 7843 7844 // Collect the instructions (and their associated costs) that will be more 7845 // profitable to scalarize. 7846 if (VF.isVector()) 7847 CM.collectInstsToScalarize(VF); 7848 } 7849 7850 CM.collectInLoopReductions(); 7851 buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF); 7852 buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF); 7853 7854 LLVM_DEBUG(printPlans(dbgs())); 7855 if (!MaxFactors.hasVector()) 7856 return VectorizationFactor::Disabled(); 7857 7858 // Select the optimal vectorization factor. 7859 auto SelectedVF = CM.selectVectorizationFactor(VFCandidates); 7860 7861 // Check if it is profitable to vectorize with runtime checks. 
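  // Worked example (hypothetical numbers): with a selected VF of 4 and 40
  // required runtime pointer checks against a runtime-check threshold of 8,
  // the code below emits the CantReorderMemOps remark and disables
  // vectorization unless the hints allow reordering; exceeding the separate
  // pragma threshold disables it regardless of the reordering hint.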
7862 unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks(); 7863 if (SelectedVF.Width.getKnownMinValue() > 1 && NumRuntimePointerChecks) { 7864 bool PragmaThresholdReached = 7865 NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold; 7866 bool ThresholdReached = 7867 NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold; 7868 if ((ThresholdReached && !Hints.allowReordering()) || 7869 PragmaThresholdReached) { 7870 ORE->emit([&]() { 7871 return OptimizationRemarkAnalysisAliasing( 7872 DEBUG_TYPE, "CantReorderMemOps", OrigLoop->getStartLoc(), 7873 OrigLoop->getHeader()) 7874 << "loop not vectorized: cannot prove it is safe to reorder " 7875 "memory operations"; 7876 }); 7877 LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n"); 7878 Hints.emitRemarkWithHints(); 7879 return VectorizationFactor::Disabled(); 7880 } 7881 } 7882 return SelectedVF; 7883 } 7884 7885 VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const { 7886 assert(count_if(VPlans, 7887 [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) == 7888 1 && 7889 "Best VF has not a single VPlan."); 7890 7891 for (const VPlanPtr &Plan : VPlans) { 7892 if (Plan->hasVF(VF)) 7893 return *Plan.get(); 7894 } 7895 llvm_unreachable("No plan found!"); 7896 } 7897 7898 static void AddRuntimeUnrollDisableMetaData(Loop *L) { 7899 SmallVector<Metadata *, 4> MDs; 7900 // Reserve first location for self reference to the LoopID metadata node. 7901 MDs.push_back(nullptr); 7902 bool IsUnrollMetadata = false; 7903 MDNode *LoopID = L->getLoopID(); 7904 if (LoopID) { 7905 // First find existing loop unrolling disable metadata. 7906 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { 7907 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); 7908 if (MD) { 7909 const auto *S = dyn_cast<MDString>(MD->getOperand(0)); 7910 IsUnrollMetadata = 7911 S && S->getString().startswith("llvm.loop.unroll.disable"); 7912 } 7913 MDs.push_back(LoopID->getOperand(i)); 7914 } 7915 } 7916 7917 if (!IsUnrollMetadata) { 7918 // Add runtime unroll disable metadata. 7919 LLVMContext &Context = L->getHeader()->getContext(); 7920 SmallVector<Metadata *, 1> DisableOperands; 7921 DisableOperands.push_back( 7922 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 7923 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 7924 MDs.push_back(DisableNode); 7925 MDNode *NewLoopID = MDNode::get(Context, MDs); 7926 // Set operand 0 to refer to the loop id itself. 7927 NewLoopID->replaceOperandWith(0, NewLoopID); 7928 L->setLoopID(NewLoopID); 7929 } 7930 } 7931 7932 void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF, 7933 VPlan &BestVPlan, 7934 InnerLoopVectorizer &ILV, 7935 DominatorTree *DT) { 7936 LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF << ", UF=" << BestUF 7937 << '\n'); 7938 7939 // Perform the actual loop transformation. 7940 7941 // 1. Create a new empty loop. Unlink the old loop and connect the new one. 7942 VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan}; 7943 Value *CanonicalIVStartValue; 7944 std::tie(State.CFG.PrevBB, CanonicalIVStartValue) = 7945 ILV.createVectorizedLoopSkeleton(); 7946 ILV.collectPoisonGeneratingRecipes(State); 7947 7948 ILV.printDebugTracesAtStart(); 7949 7950 //===------------------------------------------------===// 7951 // 7952 // Notice: any optimization or new instruction that go 7953 // into the code below should also be implemented in 7954 // the cost-model. 
7955 // 7956 //===------------------------------------------------===// 7957 7958 // 2. Copy and widen instructions from the old loop into the new loop. 7959 BestVPlan.prepareToExecute(ILV.getOrCreateTripCount(nullptr), 7960 ILV.getOrCreateVectorTripCount(nullptr), 7961 CanonicalIVStartValue, State); 7962 BestVPlan.execute(&State); 7963 7964 // Keep all loop hints from the original loop on the vector loop (we'll 7965 // replace the vectorizer-specific hints below). 7966 MDNode *OrigLoopID = OrigLoop->getLoopID(); 7967 7968 Optional<MDNode *> VectorizedLoopID = 7969 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 7970 LLVMLoopVectorizeFollowupVectorized}); 7971 7972 Loop *L = LI->getLoopFor(State.CFG.PrevBB); 7973 if (VectorizedLoopID.hasValue()) 7974 L->setLoopID(VectorizedLoopID.getValue()); 7975 else { 7976 // Keep all loop hints from the original loop on the vector loop (we'll 7977 // replace the vectorizer-specific hints below). 7978 if (MDNode *LID = OrigLoop->getLoopID()) 7979 L->setLoopID(LID); 7980 7981 LoopVectorizeHints Hints(L, true, *ORE); 7982 Hints.setAlreadyVectorized(); 7983 } 7984 // Disable runtime unrolling when vectorizing the epilogue loop. 7985 if (CanonicalIVStartValue) 7986 AddRuntimeUnrollDisableMetaData(L); 7987 7988 // 3. Fix the vectorized code: take care of header phi's, live-outs, 7989 // predication, updating analyses. 7990 ILV.fixVectorizedLoop(State); 7991 7992 ILV.printDebugTracesAtEnd(); 7993 } 7994 7995 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 7996 void LoopVectorizationPlanner::printPlans(raw_ostream &O) { 7997 for (const auto &Plan : VPlans) 7998 if (PrintVPlansInDotFormat) 7999 Plan->printDOT(O); 8000 else 8001 Plan->print(O); 8002 } 8003 #endif 8004 8005 void LoopVectorizationPlanner::collectTriviallyDeadInstructions( 8006 SmallPtrSetImpl<Instruction *> &DeadInstructions) { 8007 8008 // We create new control-flow for the vectorized loop, so the original exit 8009 // conditions will be dead after vectorization if it's only used by the 8010 // terminator 8011 SmallVector<BasicBlock*> ExitingBlocks; 8012 OrigLoop->getExitingBlocks(ExitingBlocks); 8013 for (auto *BB : ExitingBlocks) { 8014 auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0)); 8015 if (!Cmp || !Cmp->hasOneUse()) 8016 continue; 8017 8018 // TODO: we should introduce a getUniqueExitingBlocks on Loop 8019 if (!DeadInstructions.insert(Cmp).second) 8020 continue; 8021 8022 // The operands of the icmp is often a dead trunc, used by IndUpdate. 8023 // TODO: can recurse through operands in general 8024 for (Value *Op : Cmp->operands()) { 8025 if (isa<TruncInst>(Op) && Op->hasOneUse()) 8026 DeadInstructions.insert(cast<Instruction>(Op)); 8027 } 8028 } 8029 8030 // We create new "steps" for induction variable updates to which the original 8031 // induction variables map. An original update instruction will be dead if 8032 // all its users except the induction variable are dead. 8033 auto *Latch = OrigLoop->getLoopLatch(); 8034 for (auto &Induction : Legal->getInductionVars()) { 8035 PHINode *Ind = Induction.first; 8036 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 8037 8038 // If the tail is to be folded by masking, the primary induction variable, 8039 // if exists, isn't dead: it will be used for masking. Don't kill it. 
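    // (Illustrative, hypothetical case: for `for (i = 0; i < n; ++i)` the
    // update `i.next = i + 1` would normally become dead once the exit compare
    // feeding the latch branch is dead, but with tail folding `i.next` is
    // still needed to form the mask of active lanes, so it is skipped here and
    // kept alive.)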
8040 if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction()) 8041 continue; 8042 8043 if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 8044 return U == Ind || DeadInstructions.count(cast<Instruction>(U)); 8045 })) 8046 DeadInstructions.insert(IndUpdate); 8047 } 8048 } 8049 8050 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } 8051 8052 //===--------------------------------------------------------------------===// 8053 // EpilogueVectorizerMainLoop 8054 //===--------------------------------------------------------------------===// 8055 8056 /// This function is partially responsible for generating the control flow 8057 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 8058 std::pair<BasicBlock *, Value *> 8059 EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { 8060 MDNode *OrigLoopID = OrigLoop->getLoopID(); 8061 Loop *Lp = createVectorLoopSkeleton(""); 8062 8063 // Generate the code to check the minimum iteration count of the vector 8064 // epilogue (see below). 8065 EPI.EpilogueIterationCountCheck = 8066 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, true); 8067 EPI.EpilogueIterationCountCheck->setName("iter.check"); 8068 8069 // Generate the code to check any assumptions that we've made for SCEV 8070 // expressions. 8071 EPI.SCEVSafetyCheck = emitSCEVChecks(Lp, LoopScalarPreHeader); 8072 8073 // Generate the code that checks at runtime if arrays overlap. We put the 8074 // checks into a separate block to make the more common case of few elements 8075 // faster. 8076 EPI.MemSafetyCheck = emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 8077 8078 // Generate the iteration count check for the main loop, *after* the check 8079 // for the epilogue loop, so that the path-length is shorter for the case 8080 // that goes directly through the vector epilogue. The longer-path length for 8081 // the main loop is compensated for, by the gain from vectorizing the larger 8082 // trip count. Note: the branch will get updated later on when we vectorize 8083 // the epilogue. 8084 EPI.MainLoopIterationCountCheck = 8085 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false); 8086 8087 // Generate the induction variable. 8088 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 8089 EPI.VectorTripCount = CountRoundDown; 8090 createHeaderBranch(Lp); 8091 8092 // Skip induction resume value creation here because they will be created in 8093 // the second pass. If we created them here, they wouldn't be used anyway, 8094 // because the vplan in the second pass still contains the inductions from the 8095 // original loop. 
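  // (For orientation, an illustrative summary rather than normative
  // documentation: this first pass emits iter.check, the SCEV and memory
  // runtime checks and the main vector loop skeleton; the second pass,
  // EpilogueVectorizerEpilogueLoop below, emits vec.epilog.iter.check and the
  // vec.epilog.resume.val phi that carries the main loop's vector trip count
  // into the epilogue.)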
8096 8097 return {completeLoopSkeleton(Lp, OrigLoopID), nullptr}; 8098 } 8099 8100 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { 8101 LLVM_DEBUG({ 8102 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n" 8103 << "Main Loop VF:" << EPI.MainLoopVF 8104 << ", Main Loop UF:" << EPI.MainLoopUF 8105 << ", Epilogue Loop VF:" << EPI.EpilogueVF 8106 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 8107 }); 8108 } 8109 8110 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() { 8111 DEBUG_WITH_TYPE(VerboseDebug, { 8112 dbgs() << "intermediate fn:\n" 8113 << *OrigLoop->getHeader()->getParent() << "\n"; 8114 }); 8115 } 8116 8117 BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck( 8118 Loop *L, BasicBlock *Bypass, bool ForEpilogue) { 8119 assert(L && "Expected valid Loop."); 8120 assert(Bypass && "Expected valid bypass basic block."); 8121 ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF; 8122 unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF; 8123 Value *Count = getOrCreateTripCount(L); 8124 // Reuse existing vector loop preheader for TC checks. 8125 // Note that new preheader block is generated for vector loop. 8126 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 8127 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 8128 8129 // Generate code to check if the loop's trip count is less than VF * UF of the 8130 // main vector loop. 8131 auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF : VF) ? 8132 ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 8133 8134 Value *CheckMinIters = Builder.CreateICmp( 8135 P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor), 8136 "min.iters.check"); 8137 8138 if (!ForEpilogue) 8139 TCCheckBlock->setName("vector.main.loop.iter.check"); 8140 8141 // Create new preheader for vector loop. 8142 LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), 8143 DT, LI, nullptr, "vector.ph"); 8144 8145 if (ForEpilogue) { 8146 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 8147 DT->getNode(Bypass)->getIDom()) && 8148 "TC check is expected to dominate Bypass"); 8149 8150 // Update dominator for Bypass & LoopExit. 8151 DT->changeImmediateDominator(Bypass, TCCheckBlock); 8152 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF)) 8153 // For loops with multiple exits, there's no edge from the middle block 8154 // to exit blocks (as the epilogue must run) and thus no need to update 8155 // the immediate dominator of the exit blocks. 8156 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 8157 8158 LoopBypassBlocks.push_back(TCCheckBlock); 8159 8160 // Save the trip count so we don't have to regenerate it in the 8161 // vec.epilog.iter.check. This is safe to do because the trip count 8162 // generated here dominates the vector epilog iter check. 8163 EPI.TripCount = Count; 8164 } 8165 8166 ReplaceInstWithInst( 8167 TCCheckBlock->getTerminator(), 8168 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 8169 8170 return TCCheckBlock; 8171 } 8172 8173 //===--------------------------------------------------------------------===// 8174 // EpilogueVectorizerEpilogueLoop 8175 //===--------------------------------------------------------------------===// 8176 8177 /// This function is partially responsible for generating the control flow 8178 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 
8179 std::pair<BasicBlock *, Value *> 8180 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { 8181 MDNode *OrigLoopID = OrigLoop->getLoopID(); 8182 Loop *Lp = createVectorLoopSkeleton("vec.epilog."); 8183 8184 // Now, compare the remaining count and if there aren't enough iterations to 8185 // execute the vectorized epilogue skip to the scalar part. 8186 BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader; 8187 VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check"); 8188 LoopVectorPreHeader = 8189 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 8190 LI, nullptr, "vec.epilog.ph"); 8191 emitMinimumVectorEpilogueIterCountCheck(Lp, LoopScalarPreHeader, 8192 VecEpilogueIterationCountCheck); 8193 8194 // Adjust the control flow taking the state info from the main loop 8195 // vectorization into account. 8196 assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck && 8197 "expected this to be saved from the previous pass."); 8198 EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith( 8199 VecEpilogueIterationCountCheck, LoopVectorPreHeader); 8200 8201 DT->changeImmediateDominator(LoopVectorPreHeader, 8202 EPI.MainLoopIterationCountCheck); 8203 8204 EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith( 8205 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8206 8207 if (EPI.SCEVSafetyCheck) 8208 EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith( 8209 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8210 if (EPI.MemSafetyCheck) 8211 EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith( 8212 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8213 8214 DT->changeImmediateDominator( 8215 VecEpilogueIterationCountCheck, 8216 VecEpilogueIterationCountCheck->getSinglePredecessor()); 8217 8218 DT->changeImmediateDominator(LoopScalarPreHeader, 8219 EPI.EpilogueIterationCountCheck); 8220 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF)) 8221 // If there is an epilogue which must run, there's no edge from the 8222 // middle block to exit blocks and thus no need to update the immediate 8223 // dominator of the exit blocks. 8224 DT->changeImmediateDominator(LoopExitBlock, 8225 EPI.EpilogueIterationCountCheck); 8226 8227 // Keep track of bypass blocks, as they feed start values to the induction 8228 // phis in the scalar loop preheader. 8229 if (EPI.SCEVSafetyCheck) 8230 LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck); 8231 if (EPI.MemSafetyCheck) 8232 LoopBypassBlocks.push_back(EPI.MemSafetyCheck); 8233 LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck); 8234 8235 // The vec.epilog.iter.check block may contain Phi nodes from reductions which 8236 // merge control-flow from the latch block and the middle block. Update the 8237 // incoming values here and move the Phi into the preheader. 
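  // Sketch of the rewrite performed below (value names are hypothetical): a
  // merge phi such as
  //   %rdx = phi i32 [ %rdx.main, %pred ], [ %rdx.init, %iter.check ]
  // has the incoming block %pred (the single predecessor of
  // vec.epilog.iter.check) replaced by vec.epilog.iter.check itself, loses the
  // incoming values coming from the bypass blocks, and is then moved into
  // vec.epilog.ph (the new preheader).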
8238 SmallVector<PHINode *, 4> PhisInBlock; 8239 for (PHINode &Phi : VecEpilogueIterationCountCheck->phis()) 8240 PhisInBlock.push_back(&Phi); 8241 8242 for (PHINode *Phi : PhisInBlock) { 8243 Phi->replaceIncomingBlockWith( 8244 VecEpilogueIterationCountCheck->getSinglePredecessor(), 8245 VecEpilogueIterationCountCheck); 8246 Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck); 8247 if (EPI.SCEVSafetyCheck) 8248 Phi->removeIncomingValue(EPI.SCEVSafetyCheck); 8249 if (EPI.MemSafetyCheck) 8250 Phi->removeIncomingValue(EPI.MemSafetyCheck); 8251 Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHI()); 8252 } 8253 8254 // Generate a resume induction for the vector epilogue and put it in the 8255 // vector epilogue preheader 8256 Type *IdxTy = Legal->getWidestInductionType(); 8257 PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val", 8258 LoopVectorPreHeader->getFirstNonPHI()); 8259 EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck); 8260 EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0), 8261 EPI.MainLoopIterationCountCheck); 8262 8263 // Generate the induction variable. 8264 createHeaderBranch(Lp); 8265 8266 // Generate induction resume values. These variables save the new starting 8267 // indexes for the scalar loop. They are used to test if there are any tail 8268 // iterations left once the vector loop has completed. 8269 // Note that when the vectorized epilogue is skipped due to iteration count 8270 // check, then the resume value for the induction variable comes from 8271 // the trip count of the main vector loop, hence passing the AdditionalBypass 8272 // argument. 8273 createInductionResumeValues(Lp, {VecEpilogueIterationCountCheck, 8274 EPI.VectorTripCount} /* AdditionalBypass */); 8275 8276 return {completeLoopSkeleton(Lp, OrigLoopID), EPResumeVal}; 8277 } 8278 8279 BasicBlock * 8280 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck( 8281 Loop *L, BasicBlock *Bypass, BasicBlock *Insert) { 8282 8283 assert(EPI.TripCount && 8284 "Expected trip count to have been safed in the first pass."); 8285 assert( 8286 (!isa<Instruction>(EPI.TripCount) || 8287 DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) && 8288 "saved trip count does not dominate insertion point."); 8289 Value *TC = EPI.TripCount; 8290 IRBuilder<> Builder(Insert->getTerminator()); 8291 Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining"); 8292 8293 // Generate code to check if the loop's trip count is less than VF * UF of the 8294 // vector epilogue loop. 8295 auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF) ? 
8296 ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 8297 8298 Value *CheckMinIters = 8299 Builder.CreateICmp(P, Count, 8300 createStepForVF(Builder, Count->getType(), 8301 EPI.EpilogueVF, EPI.EpilogueUF), 8302 "min.epilog.iters.check"); 8303 8304 ReplaceInstWithInst( 8305 Insert->getTerminator(), 8306 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 8307 8308 LoopBypassBlocks.push_back(Insert); 8309 return Insert; 8310 } 8311 8312 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() { 8313 LLVM_DEBUG({ 8314 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n" 8315 << "Epilogue Loop VF:" << EPI.EpilogueVF 8316 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 8317 }); 8318 } 8319 8320 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() { 8321 DEBUG_WITH_TYPE(VerboseDebug, { 8322 dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n"; 8323 }); 8324 } 8325 8326 bool LoopVectorizationPlanner::getDecisionAndClampRange( 8327 const std::function<bool(ElementCount)> &Predicate, VFRange &Range) { 8328 assert(!Range.isEmpty() && "Trying to test an empty VF range."); 8329 bool PredicateAtRangeStart = Predicate(Range.Start); 8330 8331 for (ElementCount TmpVF = Range.Start * 2; 8332 ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2) 8333 if (Predicate(TmpVF) != PredicateAtRangeStart) { 8334 Range.End = TmpVF; 8335 break; 8336 } 8337 8338 return PredicateAtRangeStart; 8339 } 8340 8341 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, 8342 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range 8343 /// of VF's starting at a given VF and extending it as much as possible. Each 8344 /// vectorization decision can potentially shorten this sub-range during 8345 /// buildVPlan(). 8346 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF, 8347 ElementCount MaxVF) { 8348 auto MaxVFPlusOne = MaxVF.getWithIncrement(1); 8349 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { 8350 VFRange SubRange = {VF, MaxVFPlusOne}; 8351 VPlans.push_back(buildVPlan(SubRange)); 8352 VF = SubRange.End; 8353 } 8354 } 8355 8356 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, 8357 VPlanPtr &Plan) { 8358 assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); 8359 8360 // Look for cached value. 8361 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 8362 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); 8363 if (ECEntryIt != EdgeMaskCache.end()) 8364 return ECEntryIt->second; 8365 8366 VPValue *SrcMask = createBlockInMask(Src, Plan); 8367 8368 // The terminator has to be a branch inst! 8369 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 8370 assert(BI && "Unexpected terminator found"); 8371 8372 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) 8373 return EdgeMaskCache[Edge] = SrcMask; 8374 8375 // If source is an exiting block, we know the exit edge is dynamically dead 8376 // in the vector loop, and thus we don't need to restrict the mask. Avoid 8377 // adding uses of an otherwise potentially dead instruction. 
8378 if (OrigLoop->isLoopExiting(Src)) 8379 return EdgeMaskCache[Edge] = SrcMask; 8380 8381 VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition()); 8382 assert(EdgeMask && "No Edge Mask found for condition"); 8383 8384 if (BI->getSuccessor(0) != Dst) 8385 EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc()); 8386 8387 if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND. 8388 // The condition is 'SrcMask && EdgeMask', which is equivalent to 8389 // 'select i1 SrcMask, i1 EdgeMask, i1 false'. 8390 // The select version does not introduce new UB if SrcMask is false and 8391 // EdgeMask is poison. Using 'and' here introduces undefined behavior. 8392 VPValue *False = Plan->getOrAddVPValue( 8393 ConstantInt::getFalse(BI->getCondition()->getType())); 8394 EdgeMask = 8395 Builder.createSelect(SrcMask, EdgeMask, False, BI->getDebugLoc()); 8396 } 8397 8398 return EdgeMaskCache[Edge] = EdgeMask; 8399 } 8400 8401 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { 8402 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 8403 8404 // Look for cached value. 8405 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); 8406 if (BCEntryIt != BlockMaskCache.end()) 8407 return BCEntryIt->second; 8408 8409 // All-one mask is modelled as no-mask following the convention for masked 8410 // load/store/gather/scatter. Initialize BlockMask to no-mask. 8411 VPValue *BlockMask = nullptr; 8412 8413 if (OrigLoop->getHeader() == BB) { 8414 if (!CM.blockNeedsPredicationForAnyReason(BB)) 8415 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. 8416 8417 // Introduce the early-exit compare IV <= BTC to form header block mask. 8418 // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by 8419 // constructing the desired canonical IV in the header block as its first 8420 // non-phi instructions. 8421 assert(CM.foldTailByMasking() && "must fold the tail"); 8422 VPBasicBlock *HeaderVPBB = Plan->getEntry()->getEntryBasicBlock(); 8423 auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi(); 8424 auto *IV = new VPWidenCanonicalIVRecipe(Plan->getCanonicalIV()); 8425 HeaderVPBB->insert(IV, HeaderVPBB->getFirstNonPhi()); 8426 8427 VPBuilder::InsertPointGuard Guard(Builder); 8428 Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint); 8429 if (CM.TTI.emitGetActiveLaneMask()) { 8430 VPValue *TC = Plan->getOrCreateTripCount(); 8431 BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC}); 8432 } else { 8433 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); 8434 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); 8435 } 8436 return BlockMaskCache[BB] = BlockMask; 8437 } 8438 8439 // This is the block mask. We OR all incoming edges. 8440 for (auto *Predecessor : predecessors(BB)) { 8441 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); 8442 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too. 8443 return BlockMaskCache[BB] = EdgeMask; 8444 8445 if (!BlockMask) { // BlockMask has its initialized nullptr value. 
8446 BlockMask = EdgeMask; 8447 continue; 8448 } 8449 8450 BlockMask = Builder.createOr(BlockMask, EdgeMask, {}); 8451 } 8452 8453 return BlockMaskCache[BB] = BlockMask; 8454 } 8455 8456 VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, 8457 ArrayRef<VPValue *> Operands, 8458 VFRange &Range, 8459 VPlanPtr &Plan) { 8460 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 8461 "Must be called with either a load or store"); 8462 8463 auto willWiden = [&](ElementCount VF) -> bool { 8464 if (VF.isScalar()) 8465 return false; 8466 LoopVectorizationCostModel::InstWidening Decision = 8467 CM.getWideningDecision(I, VF); 8468 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 8469 "CM decision should be taken at this point."); 8470 if (Decision == LoopVectorizationCostModel::CM_Interleave) 8471 return true; 8472 if (CM.isScalarAfterVectorization(I, VF) || 8473 CM.isProfitableToScalarize(I, VF)) 8474 return false; 8475 return Decision != LoopVectorizationCostModel::CM_Scalarize; 8476 }; 8477 8478 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8479 return nullptr; 8480 8481 VPValue *Mask = nullptr; 8482 if (Legal->isMaskRequired(I)) 8483 Mask = createBlockInMask(I->getParent(), Plan); 8484 8485 // Determine if the pointer operand of the access is either consecutive or 8486 // reverse consecutive. 8487 LoopVectorizationCostModel::InstWidening Decision = 8488 CM.getWideningDecision(I, Range.Start); 8489 bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse; 8490 bool Consecutive = 8491 Reverse || Decision == LoopVectorizationCostModel::CM_Widen; 8492 8493 if (LoadInst *Load = dyn_cast<LoadInst>(I)) 8494 return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask, 8495 Consecutive, Reverse); 8496 8497 StoreInst *Store = cast<StoreInst>(I); 8498 return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0], 8499 Mask, Consecutive, Reverse); 8500 } 8501 8502 static VPWidenIntOrFpInductionRecipe * 8503 createWidenInductionRecipe(PHINode *Phi, Instruction *PhiOrTrunc, 8504 VPValue *Start, const InductionDescriptor &IndDesc, 8505 LoopVectorizationCostModel &CM, Loop &OrigLoop, 8506 VFRange &Range) { 8507 // Returns true if an instruction \p I should be scalarized instead of 8508 // vectorized for the chosen vectorization factor. 8509 auto ShouldScalarizeInstruction = [&CM](Instruction *I, ElementCount VF) { 8510 return CM.isScalarAfterVectorization(I, VF) || 8511 CM.isProfitableToScalarize(I, VF); 8512 }; 8513 8514 bool NeedsScalarIV = LoopVectorizationPlanner::getDecisionAndClampRange( 8515 [&](ElementCount VF) { 8516 // Returns true if we should generate a scalar version of \p IV. 
8517 if (ShouldScalarizeInstruction(PhiOrTrunc, VF)) 8518 return true; 8519 auto isScalarInst = [&](User *U) -> bool { 8520 auto *I = cast<Instruction>(U); 8521 return OrigLoop.contains(I) && ShouldScalarizeInstruction(I, VF); 8522 }; 8523 return any_of(PhiOrTrunc->users(), isScalarInst); 8524 }, 8525 Range); 8526 bool NeedsScalarIVOnly = LoopVectorizationPlanner::getDecisionAndClampRange( 8527 [&](ElementCount VF) { 8528 return ShouldScalarizeInstruction(PhiOrTrunc, VF); 8529 }, 8530 Range); 8531 assert(IndDesc.getStartValue() == 8532 Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader())); 8533 if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) { 8534 return new VPWidenIntOrFpInductionRecipe(Phi, Start, IndDesc, TruncI, 8535 NeedsScalarIV, !NeedsScalarIVOnly); 8536 } 8537 assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here"); 8538 return new VPWidenIntOrFpInductionRecipe(Phi, Start, IndDesc, NeedsScalarIV, 8539 !NeedsScalarIVOnly); 8540 } 8541 8542 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionPHI( 8543 PHINode *Phi, ArrayRef<VPValue *> Operands, VFRange &Range) const { 8544 8545 // Check if this is an integer or fp induction. If so, build the recipe that 8546 // produces its scalar and vector values. 8547 if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi)) 8548 return createWidenInductionRecipe(Phi, Phi, Operands[0], *II, CM, *OrigLoop, 8549 Range); 8550 8551 return nullptr; 8552 } 8553 8554 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate( 8555 TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range, 8556 VPlan &Plan) const { 8557 // Optimize the special case where the source is a constant integer 8558 // induction variable. Notice that we can only optimize the 'trunc' case 8559 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and 8560 // (c) other casts depend on pointer size. 8561 8562 // Determine whether \p K is a truncation based on an induction variable that 8563 // can be optimized. 8564 auto isOptimizableIVTruncate = 8565 [&](Instruction *K) -> std::function<bool(ElementCount)> { 8566 return [=](ElementCount VF) -> bool { 8567 return CM.isOptimizableIVTruncate(K, VF); 8568 }; 8569 }; 8570 8571 if (LoopVectorizationPlanner::getDecisionAndClampRange( 8572 isOptimizableIVTruncate(I), Range)) { 8573 8574 auto *Phi = cast<PHINode>(I->getOperand(0)); 8575 const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi); 8576 VPValue *Start = Plan.getOrAddVPValue(II.getStartValue()); 8577 return createWidenInductionRecipe(Phi, I, Start, II, CM, *OrigLoop, Range); 8578 } 8579 return nullptr; 8580 } 8581 8582 VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi, 8583 ArrayRef<VPValue *> Operands, 8584 VPlanPtr &Plan) { 8585 // If all incoming values are equal, the incoming VPValue can be used directly 8586 // instead of creating a new VPBlendRecipe. 8587 VPValue *FirstIncoming = Operands[0]; 8588 if (all_of(Operands, [FirstIncoming](const VPValue *Inc) { 8589 return FirstIncoming == Inc; 8590 })) { 8591 return Operands[0]; 8592 } 8593 8594 // We know that all PHIs in non-header blocks are converted into selects, so 8595 // we don't have to worry about the insertion order and we can just use the 8596 // builder. At this point we generate the predication tree. There may be 8597 // duplications since this is a simple recursive scan, but future 8598 // optimizations will clean it up. 
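  // Illustrative sketch of the operand layout built below: for a phi
  //   %p = phi [ %a, %bb0 ], [ %b, %bb1 ]
  // the VPBlendRecipe gets the operands {%a, mask(%bb0 -> parent), %b,
  // mask(%bb1 -> parent)}, i.e. each incoming value followed by the mask of
  // its incoming edge; the mask is only omitted when it is known to be
  // all-one, which the assert below restricts to single-incoming phis.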
8599 SmallVector<VPValue *, 2> OperandsWithMask; 8600 unsigned NumIncoming = Phi->getNumIncomingValues(); 8601 8602 for (unsigned In = 0; In < NumIncoming; In++) { 8603 VPValue *EdgeMask = 8604 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan); 8605 assert((EdgeMask || NumIncoming == 1) && 8606 "Multiple predecessors with one having a full mask"); 8607 OperandsWithMask.push_back(Operands[In]); 8608 if (EdgeMask) 8609 OperandsWithMask.push_back(EdgeMask); 8610 } 8611 return toVPRecipeResult(new VPBlendRecipe(Phi, OperandsWithMask)); 8612 } 8613 8614 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, 8615 ArrayRef<VPValue *> Operands, 8616 VFRange &Range) const { 8617 8618 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8619 [this, CI](ElementCount VF) { 8620 return CM.isScalarWithPredication(CI, VF); 8621 }, 8622 Range); 8623 8624 if (IsPredicated) 8625 return nullptr; 8626 8627 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8628 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || 8629 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect || 8630 ID == Intrinsic::pseudoprobe || 8631 ID == Intrinsic::experimental_noalias_scope_decl)) 8632 return nullptr; 8633 8634 auto willWiden = [&](ElementCount VF) -> bool { 8635 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8636 // The following case may be scalarized depending on the VF. 8637 // The flag shows whether we use Intrinsic or a usual Call for vectorized 8638 // version of the instruction. 8639 // Is it beneficial to perform intrinsic call compared to lib call? 8640 bool NeedToScalarize = false; 8641 InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); 8642 InstructionCost IntrinsicCost = ID ? CM.getVectorIntrinsicCost(CI, VF) : 0; 8643 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 8644 return UseVectorIntrinsic || !NeedToScalarize; 8645 }; 8646 8647 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8648 return nullptr; 8649 8650 ArrayRef<VPValue *> Ops = Operands.take_front(CI->arg_size()); 8651 return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end())); 8652 } 8653 8654 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { 8655 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) && 8656 !isa<StoreInst>(I) && "Instruction should have been handled earlier"); 8657 // Instruction should be widened, unless it is scalar after vectorization, 8658 // scalarization is profitable or it is predicated. 
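  // A small worked example of the range clamping (illustrative): if the range
  // currently covers VF = 2, 4, 8 and 16, and WillScalarize is false for
  // 2, 4 and 8 but true for 16, then getDecisionAndClampRange clamps the range
  // to end at 16 and returns false, so this returns true for VF = 2, 4 and 8;
  // the next sub-range, starting at VF = 16, takes the scalarization path.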
8659 auto WillScalarize = [this, I](ElementCount VF) -> bool { 8660 return CM.isScalarAfterVectorization(I, VF) || 8661 CM.isProfitableToScalarize(I, VF) || 8662 CM.isScalarWithPredication(I, VF); 8663 }; 8664 return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize, 8665 Range); 8666 } 8667 8668 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, 8669 ArrayRef<VPValue *> Operands) const { 8670 auto IsVectorizableOpcode = [](unsigned Opcode) { 8671 switch (Opcode) { 8672 case Instruction::Add: 8673 case Instruction::And: 8674 case Instruction::AShr: 8675 case Instruction::BitCast: 8676 case Instruction::FAdd: 8677 case Instruction::FCmp: 8678 case Instruction::FDiv: 8679 case Instruction::FMul: 8680 case Instruction::FNeg: 8681 case Instruction::FPExt: 8682 case Instruction::FPToSI: 8683 case Instruction::FPToUI: 8684 case Instruction::FPTrunc: 8685 case Instruction::FRem: 8686 case Instruction::FSub: 8687 case Instruction::ICmp: 8688 case Instruction::IntToPtr: 8689 case Instruction::LShr: 8690 case Instruction::Mul: 8691 case Instruction::Or: 8692 case Instruction::PtrToInt: 8693 case Instruction::SDiv: 8694 case Instruction::Select: 8695 case Instruction::SExt: 8696 case Instruction::Shl: 8697 case Instruction::SIToFP: 8698 case Instruction::SRem: 8699 case Instruction::Sub: 8700 case Instruction::Trunc: 8701 case Instruction::UDiv: 8702 case Instruction::UIToFP: 8703 case Instruction::URem: 8704 case Instruction::Xor: 8705 case Instruction::ZExt: 8706 return true; 8707 } 8708 return false; 8709 }; 8710 8711 if (!IsVectorizableOpcode(I->getOpcode())) 8712 return nullptr; 8713 8714 // Success: widen this instruction. 8715 return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end())); 8716 } 8717 8718 void VPRecipeBuilder::fixHeaderPhis() { 8719 BasicBlock *OrigLatch = OrigLoop->getLoopLatch(); 8720 for (VPHeaderPHIRecipe *R : PhisToFix) { 8721 auto *PN = cast<PHINode>(R->getUnderlyingValue()); 8722 VPRecipeBase *IncR = 8723 getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch))); 8724 R->addOperand(IncR->getVPSingleValue()); 8725 } 8726 } 8727 8728 VPBasicBlock *VPRecipeBuilder::handleReplication( 8729 Instruction *I, VFRange &Range, VPBasicBlock *VPBB, 8730 VPlanPtr &Plan) { 8731 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( 8732 [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); }, 8733 Range); 8734 8735 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8736 [&](ElementCount VF) { return CM.isPredicatedInst(I, VF, IsUniform); }, 8737 Range); 8738 8739 // Even if the instruction is not marked as uniform, there are certain 8740 // intrinsic calls that can be effectively treated as such, so we check for 8741 // them here. Conservatively, we only do this for scalable vectors, since 8742 // for fixed-width VFs we can always fall back on full scalarization. 8743 if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) { 8744 switch (cast<IntrinsicInst>(I)->getIntrinsicID()) { 8745 case Intrinsic::assume: 8746 case Intrinsic::lifetime_start: 8747 case Intrinsic::lifetime_end: 8748 // For scalable vectors if one of the operands is variant then we still 8749 // want to mark as uniform, which will generate one instruction for just 8750 // the first lane of the vector. We can't scalarize the call in the same 8751 // way as for fixed-width vectors because we don't know how many lanes 8752 // there are. 
8753 // 8754 // The reasons for doing it this way for scalable vectors are: 8755 // 1. For the assume intrinsic generating the instruction for the first 8756 // lane is still be better than not generating any at all. For 8757 // example, the input may be a splat across all lanes. 8758 // 2. For the lifetime start/end intrinsics the pointer operand only 8759 // does anything useful when the input comes from a stack object, 8760 // which suggests it should always be uniform. For non-stack objects 8761 // the effect is to poison the object, which still allows us to 8762 // remove the call. 8763 IsUniform = true; 8764 break; 8765 default: 8766 break; 8767 } 8768 } 8769 8770 auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()), 8771 IsUniform, IsPredicated); 8772 setRecipe(I, Recipe); 8773 Plan->addVPValue(I, Recipe); 8774 8775 // Find if I uses a predicated instruction. If so, it will use its scalar 8776 // value. Avoid hoisting the insert-element which packs the scalar value into 8777 // a vector value, as that happens iff all users use the vector value. 8778 for (VPValue *Op : Recipe->operands()) { 8779 auto *PredR = dyn_cast_or_null<VPPredInstPHIRecipe>(Op->getDef()); 8780 if (!PredR) 8781 continue; 8782 auto *RepR = 8783 cast_or_null<VPReplicateRecipe>(PredR->getOperand(0)->getDef()); 8784 assert(RepR->isPredicated() && 8785 "expected Replicate recipe to be predicated"); 8786 RepR->setAlsoPack(false); 8787 } 8788 8789 // Finalize the recipe for Instr, first if it is not predicated. 8790 if (!IsPredicated) { 8791 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n"); 8792 VPBB->appendRecipe(Recipe); 8793 return VPBB; 8794 } 8795 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); 8796 8797 VPBlockBase *SingleSucc = VPBB->getSingleSuccessor(); 8798 assert(SingleSucc && "VPBB must have a single successor when handling " 8799 "predicated replication."); 8800 VPBlockUtils::disconnectBlocks(VPBB, SingleSucc); 8801 // Record predicated instructions for above packing optimizations. 8802 VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan); 8803 VPBlockUtils::insertBlockAfter(Region, VPBB); 8804 auto *RegSucc = new VPBasicBlock(); 8805 VPBlockUtils::insertBlockAfter(RegSucc, Region); 8806 VPBlockUtils::connectBlocks(RegSucc, SingleSucc); 8807 return RegSucc; 8808 } 8809 8810 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr, 8811 VPRecipeBase *PredRecipe, 8812 VPlanPtr &Plan) { 8813 // Instructions marked for predication are replicated and placed under an 8814 // if-then construct to prevent side-effects. 8815 8816 // Generate recipes to compute the block mask for this region. 8817 VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan); 8818 8819 // Build the triangular if-then region. 8820 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str(); 8821 assert(Instr->getParent() && "Predicated instruction not in any basic block"); 8822 auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask); 8823 auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); 8824 auto *PHIRecipe = Instr->getType()->isVoidTy() 8825 ? 
nullptr 8826 : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr)); 8827 if (PHIRecipe) { 8828 Plan->removeVPValueFor(Instr); 8829 Plan->addVPValue(Instr, PHIRecipe); 8830 } 8831 auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); 8832 auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); 8833 VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true); 8834 8835 // Note: first set Entry as region entry and then connect successors starting 8836 // from it in order, to propagate the "parent" of each VPBasicBlock. 8837 VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry); 8838 VPBlockUtils::connectBlocks(Pred, Exit); 8839 8840 return Region; 8841 } 8842 8843 VPRecipeOrVPValueTy 8844 VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, 8845 ArrayRef<VPValue *> Operands, 8846 VFRange &Range, VPlanPtr &Plan) { 8847 // First, check for specific widening recipes that deal with calls, memory 8848 // operations, inductions and Phi nodes. 8849 if (auto *CI = dyn_cast<CallInst>(Instr)) 8850 return toVPRecipeResult(tryToWidenCall(CI, Operands, Range)); 8851 8852 if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr)) 8853 return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan)); 8854 8855 VPRecipeBase *Recipe; 8856 if (auto Phi = dyn_cast<PHINode>(Instr)) { 8857 if (Phi->getParent() != OrigLoop->getHeader()) 8858 return tryToBlend(Phi, Operands, Plan); 8859 if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, Range))) 8860 return toVPRecipeResult(Recipe); 8861 8862 VPHeaderPHIRecipe *PhiRecipe = nullptr; 8863 if (Legal->isReductionVariable(Phi) || Legal->isFirstOrderRecurrence(Phi)) { 8864 VPValue *StartV = Operands[0]; 8865 if (Legal->isReductionVariable(Phi)) { 8866 const RecurrenceDescriptor &RdxDesc = 8867 Legal->getReductionVars().find(Phi)->second; 8868 assert(RdxDesc.getRecurrenceStartValue() == 8869 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); 8870 PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV, 8871 CM.isInLoopReduction(Phi), 8872 CM.useOrderedReductions(RdxDesc)); 8873 } else { 8874 PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV); 8875 } 8876 8877 // Record the incoming value from the backedge, so we can add the incoming 8878 // value from the backedge after all recipes have been created. 8879 recordRecipeOf(cast<Instruction>( 8880 Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch()))); 8881 PhisToFix.push_back(PhiRecipe); 8882 } else { 8883 // TODO: record backedge value for remaining pointer induction phis. 
8884 assert(Phi->getType()->isPointerTy() && 8885 "only pointer phis should be handled here"); 8886 assert(Legal->getInductionVars().count(Phi) && 8887 "Not an induction variable"); 8888 InductionDescriptor II = Legal->getInductionVars().lookup(Phi); 8889 VPValue *Start = Plan->getOrAddVPValue(II.getStartValue()); 8890 PhiRecipe = new VPWidenPHIRecipe(Phi, Start); 8891 } 8892 8893 return toVPRecipeResult(PhiRecipe); 8894 } 8895 8896 if (isa<TruncInst>(Instr) && 8897 (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Operands, 8898 Range, *Plan))) 8899 return toVPRecipeResult(Recipe); 8900 8901 if (!shouldWiden(Instr, Range)) 8902 return nullptr; 8903 8904 if (auto GEP = dyn_cast<GetElementPtrInst>(Instr)) 8905 return toVPRecipeResult(new VPWidenGEPRecipe( 8906 GEP, make_range(Operands.begin(), Operands.end()), OrigLoop)); 8907 8908 if (auto *SI = dyn_cast<SelectInst>(Instr)) { 8909 bool InvariantCond = 8910 PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop); 8911 return toVPRecipeResult(new VPWidenSelectRecipe( 8912 *SI, make_range(Operands.begin(), Operands.end()), InvariantCond)); 8913 } 8914 8915 return toVPRecipeResult(tryToWiden(Instr, Operands)); 8916 } 8917 8918 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, 8919 ElementCount MaxVF) { 8920 assert(OrigLoop->isInnermost() && "Inner loop expected."); 8921 8922 // Collect instructions from the original loop that will become trivially dead 8923 // in the vectorized loop. We don't need to vectorize these instructions. For 8924 // example, original induction update instructions can become dead because we 8925 // separately emit induction "steps" when generating code for the new loop. 8926 // Similarly, we create a new latch condition when setting up the structure 8927 // of the new loop, so the old one can become dead. 8928 SmallPtrSet<Instruction *, 4> DeadInstructions; 8929 collectTriviallyDeadInstructions(DeadInstructions); 8930 8931 // Add assume instructions we need to drop to DeadInstructions, to prevent 8932 // them from being added to the VPlan. 8933 // TODO: We only need to drop assumes in blocks that get flattend. If the 8934 // control flow is preserved, we should keep them. 8935 auto &ConditionalAssumes = Legal->getConditionalAssumes(); 8936 DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end()); 8937 8938 MapVector<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter(); 8939 // Dead instructions do not need sinking. Remove them from SinkAfter. 8940 for (Instruction *I : DeadInstructions) 8941 SinkAfter.erase(I); 8942 8943 // Cannot sink instructions after dead instructions (there won't be any 8944 // recipes for them). Instead, find the first non-dead previous instruction. 
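  // For example (illustrative): if some instruction was recorded to be sunk
  // after %t, but %t is in DeadInstructions (say it is an induction update we
  // re-emit separately), the loop below walks backwards from %t via
  // getPrevNode() until it reaches a live instruction and uses that as the
  // new sink target.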
8945 for (auto &P : Legal->getSinkAfter()) { 8946 Instruction *SinkTarget = P.second; 8947 Instruction *FirstInst = &*SinkTarget->getParent()->begin(); 8948 (void)FirstInst; 8949 while (DeadInstructions.contains(SinkTarget)) { 8950 assert( 8951 SinkTarget != FirstInst && 8952 "Must find a live instruction (at least the one feeding the " 8953 "first-order recurrence PHI) before reaching beginning of the block"); 8954 SinkTarget = SinkTarget->getPrevNode(); 8955 assert(SinkTarget != P.first && 8956 "sink source equals target, no sinking required"); 8957 } 8958 P.second = SinkTarget; 8959 } 8960 8961 auto MaxVFPlusOne = MaxVF.getWithIncrement(1); 8962 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { 8963 VFRange SubRange = {VF, MaxVFPlusOne}; 8964 VPlans.push_back( 8965 buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter)); 8966 VF = SubRange.End; 8967 } 8968 } 8969 8970 // Add a VPCanonicalIVPHIRecipe starting at 0 to the header, a 8971 // CanonicalIVIncrement{NUW} VPInstruction to increment it by VF * UF and a 8972 // BranchOnCount VPInstruction to the latch. 8973 static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL, 8974 bool HasNUW, bool IsVPlanNative) { 8975 Value *StartIdx = ConstantInt::get(IdxTy, 0); 8976 auto *StartV = Plan.getOrAddVPValue(StartIdx); 8977 8978 auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL); 8979 VPRegionBlock *TopRegion = Plan.getVectorLoopRegion(); 8980 VPBasicBlock *Header = TopRegion->getEntryBasicBlock(); 8981 if (IsVPlanNative) 8982 Header = cast<VPBasicBlock>(Header->getSingleSuccessor()); 8983 Header->insert(CanonicalIVPHI, Header->begin()); 8984 8985 auto *CanonicalIVIncrement = 8986 new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementNUW 8987 : VPInstruction::CanonicalIVIncrement, 8988 {CanonicalIVPHI}, DL); 8989 CanonicalIVPHI->addOperand(CanonicalIVIncrement); 8990 8991 VPBasicBlock *EB = TopRegion->getExitBasicBlock(); 8992 if (IsVPlanNative) { 8993 EB = cast<VPBasicBlock>(EB->getSinglePredecessor()); 8994 EB->setCondBit(nullptr); 8995 } 8996 EB->appendRecipe(CanonicalIVIncrement); 8997 8998 auto *BranchOnCount = 8999 new VPInstruction(VPInstruction::BranchOnCount, 9000 {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL); 9001 EB->appendRecipe(BranchOnCount); 9002 } 9003 9004 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( 9005 VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions, 9006 const MapVector<Instruction *, Instruction *> &SinkAfter) { 9007 9008 SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups; 9009 9010 VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder); 9011 9012 // --------------------------------------------------------------------------- 9013 // Pre-construction: record ingredients whose recipes we'll need to further 9014 // process after constructing the initial VPlan. 9015 // --------------------------------------------------------------------------- 9016 9017 // Mark instructions we'll need to sink later and their targets as 9018 // ingredients whose recipe we'll need to record. 
9019 for (auto &Entry : SinkAfter) { 9020 RecipeBuilder.recordRecipeOf(Entry.first); 9021 RecipeBuilder.recordRecipeOf(Entry.second); 9022 } 9023 for (auto &Reduction : CM.getInLoopReductionChains()) { 9024 PHINode *Phi = Reduction.first; 9025 RecurKind Kind = 9026 Legal->getReductionVars().find(Phi)->second.getRecurrenceKind(); 9027 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second; 9028 9029 RecipeBuilder.recordRecipeOf(Phi); 9030 for (auto &R : ReductionOperations) { 9031 RecipeBuilder.recordRecipeOf(R); 9032 // For min/max reducitons, where we have a pair of icmp/select, we also 9033 // need to record the ICmp recipe, so it can be removed later. 9034 assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) && 9035 "Only min/max recurrences allowed for inloop reductions"); 9036 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) 9037 RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0))); 9038 } 9039 } 9040 9041 // For each interleave group which is relevant for this (possibly trimmed) 9042 // Range, add it to the set of groups to be later applied to the VPlan and add 9043 // placeholders for its members' Recipes which we'll be replacing with a 9044 // single VPInterleaveRecipe. 9045 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) { 9046 auto applyIG = [IG, this](ElementCount VF) -> bool { 9047 return (VF.isVector() && // Query is illegal for VF == 1 9048 CM.getWideningDecision(IG->getInsertPos(), VF) == 9049 LoopVectorizationCostModel::CM_Interleave); 9050 }; 9051 if (!getDecisionAndClampRange(applyIG, Range)) 9052 continue; 9053 InterleaveGroups.insert(IG); 9054 for (unsigned i = 0; i < IG->getFactor(); i++) 9055 if (Instruction *Member = IG->getMember(i)) 9056 RecipeBuilder.recordRecipeOf(Member); 9057 }; 9058 9059 // --------------------------------------------------------------------------- 9060 // Build initial VPlan: Scan the body of the loop in a topological order to 9061 // visit each basic block after having visited its predecessor basic blocks. 9062 // --------------------------------------------------------------------------- 9063 9064 // Create initial VPlan skeleton, with separate header and latch blocks. 9065 VPBasicBlock *HeaderVPBB = new VPBasicBlock(); 9066 VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch"); 9067 VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB); 9068 auto *TopRegion = new VPRegionBlock(HeaderVPBB, LatchVPBB, "vector loop"); 9069 auto Plan = std::make_unique<VPlan>(TopRegion); 9070 9071 Instruction *DLInst = 9072 getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()); 9073 addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), 9074 DLInst ? DLInst->getDebugLoc() : DebugLoc(), 9075 !CM.foldTailByMasking(), false); 9076 9077 // Scan the body of the loop in a topological order to visit each basic block 9078 // after having visited its predecessor basic blocks. 9079 LoopBlocksDFS DFS(OrigLoop); 9080 DFS.perform(LI); 9081 9082 VPBasicBlock *VPBB = HeaderVPBB; 9083 SmallVector<VPWidenIntOrFpInductionRecipe *> InductionsToMove; 9084 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 9085 // Relevant instructions from basic block BB will be grouped into VPRecipe 9086 // ingredients and fill a new VPBasicBlock. 9087 unsigned VPBBsForBB = 0; 9088 VPBB->setName(BB->getName()); 9089 Builder.setInsertPoint(VPBB); 9090 9091 // Introduce each ingredient into VPlan. 9092 // TODO: Model and preserve debug instrinsics in VPlan. 
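    // Note (illustrative example): for a header phi such as
    //   %iv = phi i64 [ 0, %preheader ], [ %iv.next, %latch ]
    // only the start value incoming from the preheader (0) is collected as an
    // operand below; the backedge value is wired up later, e.g. by
    // fixHeaderPhis() for reduction and first-order recurrence phis.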
9093 for (Instruction &I : BB->instructionsWithoutDebug()) { 9094 Instruction *Instr = &I; 9095 9096 // First filter out irrelevant instructions, to ensure no recipes are 9097 // built for them. 9098 if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr)) 9099 continue; 9100 9101 SmallVector<VPValue *, 4> Operands; 9102 auto *Phi = dyn_cast<PHINode>(Instr); 9103 if (Phi && Phi->getParent() == OrigLoop->getHeader()) { 9104 Operands.push_back(Plan->getOrAddVPValue( 9105 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()))); 9106 } else { 9107 auto OpRange = Plan->mapToVPValues(Instr->operands()); 9108 Operands = {OpRange.begin(), OpRange.end()}; 9109 } 9110 if (auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe( 9111 Instr, Operands, Range, Plan)) { 9112 // If Instr can be simplified to an existing VPValue, use it. 9113 if (RecipeOrValue.is<VPValue *>()) { 9114 auto *VPV = RecipeOrValue.get<VPValue *>(); 9115 Plan->addVPValue(Instr, VPV); 9116 // If the re-used value is a recipe, register the recipe for the 9117 // instruction, in case the recipe for Instr needs to be recorded. 9118 if (auto *R = dyn_cast_or_null<VPRecipeBase>(VPV->getDef())) 9119 RecipeBuilder.setRecipe(Instr, R); 9120 continue; 9121 } 9122 // Otherwise, add the new recipe. 9123 VPRecipeBase *Recipe = RecipeOrValue.get<VPRecipeBase *>(); 9124 for (auto *Def : Recipe->definedValues()) { 9125 auto *UV = Def->getUnderlyingValue(); 9126 Plan->addVPValue(UV, Def); 9127 } 9128 9129 if (isa<VPWidenIntOrFpInductionRecipe>(Recipe) && 9130 HeaderVPBB->getFirstNonPhi() != VPBB->end()) { 9131 // Keep track of VPWidenIntOrFpInductionRecipes not in the phi section 9132 // of the header block. That can happen for truncates of induction 9133 // variables. Those recipes are moved to the phi section of the header 9134 // block after applying SinkAfter, which relies on the original 9135 // position of the trunc. 9136 assert(isa<TruncInst>(Instr)); 9137 InductionsToMove.push_back( 9138 cast<VPWidenIntOrFpInductionRecipe>(Recipe)); 9139 } 9140 RecipeBuilder.setRecipe(Instr, Recipe); 9141 VPBB->appendRecipe(Recipe); 9142 continue; 9143 } 9144 9145 // Otherwise, if all widening options failed, Instruction is to be 9146 // replicated. This may create a successor for VPBB. 9147 VPBasicBlock *NextVPBB = 9148 RecipeBuilder.handleReplication(Instr, Range, VPBB, Plan); 9149 if (NextVPBB != VPBB) { 9150 VPBB = NextVPBB; 9151 VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++) 9152 : ""); 9153 } 9154 } 9155 9156 VPBlockUtils::insertBlockAfter(new VPBasicBlock(), VPBB); 9157 VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor()); 9158 } 9159 9160 // Fold the last, empty block into its predecessor. 9161 VPBB = VPBlockUtils::tryToMergeBlockIntoPredecessor(VPBB); 9162 assert(VPBB && "expected to fold last (empty) block"); 9163 // After here, VPBB should not be used. 9164 VPBB = nullptr; 9165 9166 assert(isa<VPRegionBlock>(Plan->getEntry()) && 9167 !Plan->getEntry()->getEntryBasicBlock()->empty() && 9168 "entry block must be set to a VPRegionBlock having a non-empty entry " 9169 "VPBasicBlock"); 9170 RecipeBuilder.fixHeaderPhis(); 9171 9172 // --------------------------------------------------------------------------- 9173 // Transform initial VPlan: Apply previously taken decisions, in order, to 9174 // bring the VPlan to its final state. 9175 // --------------------------------------------------------------------------- 9176 9177 // Apply Sink-After legal constraints. 
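  // (Illustrative note) These constraints typically stem from first-order
  // recurrences (see the assert above about the instruction feeding the
  // first-order recurrence PHI): a recipe may have to be re-ordered to after
  // the recipe producing the recurrence's previous value. The code below
  // performs the move, also handling the case where either recipe lives
  // inside a predicated replicate region.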
9178 auto GetReplicateRegion = [](VPRecipeBase *R) -> VPRegionBlock * { 9179 auto *Region = dyn_cast_or_null<VPRegionBlock>(R->getParent()->getParent()); 9180 if (Region && Region->isReplicator()) { 9181 assert(Region->getNumSuccessors() == 1 && 9182 Region->getNumPredecessors() == 1 && "Expected SESE region!"); 9183 assert(R->getParent()->size() == 1 && 9184 "A recipe in an original replicator region must be the only " 9185 "recipe in its block"); 9186 return Region; 9187 } 9188 return nullptr; 9189 }; 9190 for (auto &Entry : SinkAfter) { 9191 VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first); 9192 VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second); 9193 9194 auto *TargetRegion = GetReplicateRegion(Target); 9195 auto *SinkRegion = GetReplicateRegion(Sink); 9196 if (!SinkRegion) { 9197 // If the sink source is not a replicate region, sink the recipe directly. 9198 if (TargetRegion) { 9199 // The target is in a replication region, make sure to move Sink to 9200 // the block after it, not into the replication region itself. 9201 VPBasicBlock *NextBlock = 9202 cast<VPBasicBlock>(TargetRegion->getSuccessors().front()); 9203 Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi()); 9204 } else 9205 Sink->moveAfter(Target); 9206 continue; 9207 } 9208 9209 // The sink source is in a replicate region. Unhook the region from the CFG. 9210 auto *SinkPred = SinkRegion->getSinglePredecessor(); 9211 auto *SinkSucc = SinkRegion->getSingleSuccessor(); 9212 VPBlockUtils::disconnectBlocks(SinkPred, SinkRegion); 9213 VPBlockUtils::disconnectBlocks(SinkRegion, SinkSucc); 9214 VPBlockUtils::connectBlocks(SinkPred, SinkSucc); 9215 9216 if (TargetRegion) { 9217 // The target recipe is also in a replicate region, move the sink region 9218 // after the target region. 9219 auto *TargetSucc = TargetRegion->getSingleSuccessor(); 9220 VPBlockUtils::disconnectBlocks(TargetRegion, TargetSucc); 9221 VPBlockUtils::connectBlocks(TargetRegion, SinkRegion); 9222 VPBlockUtils::connectBlocks(SinkRegion, TargetSucc); 9223 } else { 9224 // The sink source is in a replicate region, we need to move the whole 9225 // replicate region, which should only contain a single recipe in the 9226 // main block. 9227 auto *SplitBlock = 9228 Target->getParent()->splitAt(std::next(Target->getIterator())); 9229 9230 auto *SplitPred = SplitBlock->getSinglePredecessor(); 9231 9232 VPBlockUtils::disconnectBlocks(SplitPred, SplitBlock); 9233 VPBlockUtils::connectBlocks(SplitPred, SinkRegion); 9234 VPBlockUtils::connectBlocks(SinkRegion, SplitBlock); 9235 } 9236 } 9237 9238 VPlanTransforms::removeRedundantCanonicalIVs(*Plan); 9239 VPlanTransforms::removeRedundantInductionCasts(*Plan); 9240 9241 // Now that sink-after is done, move induction recipes for optimized truncates 9242 // to the phi section of the header block. 9243 for (VPWidenIntOrFpInductionRecipe *Ind : InductionsToMove) 9244 Ind->moveBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi()); 9245 9246 // Adjust the recipes for any inloop reductions. 9247 adjustRecipesForReductions(cast<VPBasicBlock>(TopRegion->getExit()), Plan, 9248 RecipeBuilder, Range.Start); 9249 9250 // Introduce a recipe to combine the incoming and previous values of a 9251 // first-order recurrence. 
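  // (Illustrative) For VF=4, FirstOrderRecurrenceSplice(%phi, %backedge.value)
  // is conceptually a vector splice that concatenates the last lane of the
  // recurrence's previous value with the first three lanes of the current
  // backedge value; users of the scalar recurrence are rewired to this splice
  // below.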
9252 for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) { 9253 auto *RecurPhi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R); 9254 if (!RecurPhi) 9255 continue; 9256 9257 VPRecipeBase *PrevRecipe = RecurPhi->getBackedgeRecipe(); 9258 VPBasicBlock *InsertBlock = PrevRecipe->getParent(); 9259 auto *Region = GetReplicateRegion(PrevRecipe); 9260 if (Region) 9261 InsertBlock = cast<VPBasicBlock>(Region->getSingleSuccessor()); 9262 if (Region || PrevRecipe->isPhi()) 9263 Builder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi()); 9264 else 9265 Builder.setInsertPoint(InsertBlock, std::next(PrevRecipe->getIterator())); 9266 9267 auto *RecurSplice = cast<VPInstruction>( 9268 Builder.createNaryOp(VPInstruction::FirstOrderRecurrenceSplice, 9269 {RecurPhi, RecurPhi->getBackedgeValue()})); 9270 9271 RecurPhi->replaceAllUsesWith(RecurSplice); 9272 // Set the first operand of RecurSplice to RecurPhi again, after replacing 9273 // all users. 9274 RecurSplice->setOperand(0, RecurPhi); 9275 } 9276 9277 // Interleave memory: for each Interleave Group we marked earlier as relevant 9278 // for this VPlan, replace the Recipes widening its memory instructions with a 9279 // single VPInterleaveRecipe at its insertion point. 9280 for (auto IG : InterleaveGroups) { 9281 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( 9282 RecipeBuilder.getRecipe(IG->getInsertPos())); 9283 SmallVector<VPValue *, 4> StoredValues; 9284 for (unsigned i = 0; i < IG->getFactor(); ++i) 9285 if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) { 9286 auto *StoreR = 9287 cast<VPWidenMemoryInstructionRecipe>(RecipeBuilder.getRecipe(SI)); 9288 StoredValues.push_back(StoreR->getStoredValue()); 9289 } 9290 9291 auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues, 9292 Recipe->getMask()); 9293 VPIG->insertBefore(Recipe); 9294 unsigned J = 0; 9295 for (unsigned i = 0; i < IG->getFactor(); ++i) 9296 if (Instruction *Member = IG->getMember(i)) { 9297 if (!Member->getType()->isVoidTy()) { 9298 VPValue *OriginalV = Plan->getVPValue(Member); 9299 Plan->removeVPValueFor(Member); 9300 Plan->addVPValue(Member, VPIG->getVPValue(J)); 9301 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J)); 9302 J++; 9303 } 9304 RecipeBuilder.getRecipe(Member)->eraseFromParent(); 9305 } 9306 } 9307 9308 // From this point onwards, VPlan-to-VPlan transformations may change the plan 9309 // in ways that accessing values using original IR values is incorrect. 9310 Plan->disableValue2VPValue(); 9311 9312 VPlanTransforms::sinkScalarOperands(*Plan); 9313 VPlanTransforms::mergeReplicateRegions(*Plan); 9314 9315 std::string PlanName; 9316 raw_string_ostream RSO(PlanName); 9317 ElementCount VF = Range.Start; 9318 Plan->addVF(VF); 9319 RSO << "Initial VPlan for VF={" << VF; 9320 for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) { 9321 Plan->addVF(VF); 9322 RSO << "," << VF; 9323 } 9324 RSO << "},UF>=1"; 9325 RSO.flush(); 9326 Plan->setName(PlanName); 9327 9328 // Fold Exit block into its predecessor if possible. 9329 // TODO: Fold block earlier once all VPlan transforms properly maintain a 9330 // VPBasicBlock as exit. 
9331 VPBlockUtils::tryToMergeBlockIntoPredecessor(TopRegion->getExit()); 9332 9333 assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid"); 9334 return Plan; 9335 } 9336 9337 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { 9338 // Outer loop handling: They may require CFG and instruction level 9339 // transformations before even evaluating whether vectorization is profitable. 9340 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 9341 // the vectorization pipeline. 9342 assert(!OrigLoop->isInnermost()); 9343 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 9344 9345 // Create new empty VPlan 9346 auto Plan = std::make_unique<VPlan>(); 9347 9348 // Build hierarchical CFG 9349 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan); 9350 HCFGBuilder.buildHierarchicalCFG(); 9351 9352 for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End); 9353 VF *= 2) 9354 Plan->addVF(VF); 9355 9356 if (EnableVPlanPredication) { 9357 VPlanPredicator VPP(*Plan); 9358 VPP.predicate(); 9359 9360 // Avoid running transformation to recipes until masked code generation in 9361 // VPlan-native path is in place. 9362 return Plan; 9363 } 9364 9365 SmallPtrSet<Instruction *, 1> DeadInstructions; 9366 VPlanTransforms::VPInstructionsToVPRecipes( 9367 OrigLoop, Plan, 9368 [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); }, 9369 DeadInstructions, *PSE.getSE()); 9370 9371 addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DebugLoc(), 9372 true, true); 9373 return Plan; 9374 } 9375 9376 // Adjust the recipes for reductions. For in-loop reductions the chain of 9377 // instructions leading from the loop exit instr to the phi need to be converted 9378 // to reductions, with one operand being vector and the other being the scalar 9379 // reduction chain. For other reductions, a select is introduced between the phi 9380 // and live-out recipes when folding the tail. 9381 void LoopVectorizationPlanner::adjustRecipesForReductions( 9382 VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, 9383 ElementCount MinVF) { 9384 for (auto &Reduction : CM.getInLoopReductionChains()) { 9385 PHINode *Phi = Reduction.first; 9386 const RecurrenceDescriptor &RdxDesc = 9387 Legal->getReductionVars().find(Phi)->second; 9388 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second; 9389 9390 if (MinVF.isScalar() && !CM.useOrderedReductions(RdxDesc)) 9391 continue; 9392 9393 // ReductionOperations are orders top-down from the phi's use to the 9394 // LoopExitValue. We keep a track of the previous item (the Chain) to tell 9395 // which of the two operands will remain scalar and which will be reduced. 9396 // For minmax the chain will be the select instructions. 9397 Instruction *Chain = Phi; 9398 for (Instruction *R : ReductionOperations) { 9399 VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R); 9400 RecurKind Kind = RdxDesc.getRecurrenceKind(); 9401 9402 VPValue *ChainOp = Plan->getVPValue(Chain); 9403 unsigned FirstOpId; 9404 assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) && 9405 "Only min/max recurrences allowed for inloop reductions"); 9406 // Recognize a call to the llvm.fmuladd intrinsic. 
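      // (Illustrative) An in-loop reduction of the form
      //   %acc.next = call float @llvm.fmuladd.f32(float %a, float %b,
      //                                            float %acc)
      // is handled below by first emitting a widened fmul of %a and %b and
      // then feeding that product into the fadd reduction recipe as its
      // vector operand, while the accumulator continues as the scalar chain.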
9407 bool IsFMulAdd = (Kind == RecurKind::FMulAdd); 9408 assert((!IsFMulAdd || RecurrenceDescriptor::isFMulAddIntrinsic(R)) && 9409 "Expected instruction to be a call to the llvm.fmuladd intrinsic"); 9410 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9411 assert(isa<VPWidenSelectRecipe>(WidenRecipe) && 9412 "Expected to replace a VPWidenSelectSC"); 9413 FirstOpId = 1; 9414 } else { 9415 assert((MinVF.isScalar() || isa<VPWidenRecipe>(WidenRecipe) || 9416 (IsFMulAdd && isa<VPWidenCallRecipe>(WidenRecipe))) && 9417 "Expected to replace a VPWidenSC"); 9418 FirstOpId = 0; 9419 } 9420 unsigned VecOpId = 9421 R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId; 9422 VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId)); 9423 9424 auto *CondOp = CM.foldTailByMasking() 9425 ? RecipeBuilder.createBlockInMask(R->getParent(), Plan) 9426 : nullptr; 9427 9428 if (IsFMulAdd) { 9429 // If the instruction is a call to the llvm.fmuladd intrinsic then we 9430 // need to create an fmul recipe to use as the vector operand for the 9431 // fadd reduction. 9432 VPInstruction *FMulRecipe = new VPInstruction( 9433 Instruction::FMul, {VecOp, Plan->getVPValue(R->getOperand(1))}); 9434 FMulRecipe->setFastMathFlags(R->getFastMathFlags()); 9435 WidenRecipe->getParent()->insert(FMulRecipe, 9436 WidenRecipe->getIterator()); 9437 VecOp = FMulRecipe; 9438 } 9439 VPReductionRecipe *RedRecipe = 9440 new VPReductionRecipe(&RdxDesc, R, ChainOp, VecOp, CondOp, TTI); 9441 WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe); 9442 Plan->removeVPValueFor(R); 9443 Plan->addVPValue(R, RedRecipe); 9444 WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator()); 9445 WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe); 9446 WidenRecipe->eraseFromParent(); 9447 9448 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9449 VPRecipeBase *CompareRecipe = 9450 RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0))); 9451 assert(isa<VPWidenRecipe>(CompareRecipe) && 9452 "Expected to replace a VPWidenSC"); 9453 assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 && 9454 "Expected no remaining users"); 9455 CompareRecipe->eraseFromParent(); 9456 } 9457 Chain = R; 9458 } 9459 } 9460 9461 // If tail is folded by masking, introduce selects between the phi 9462 // and the live-out instruction of each reduction, at the beginning of the 9463 // dedicated latch block. 
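  // (Illustrative) For a reduction phi %red.phi with backedge value %red.next
  // this emits, at the start of the latch block, roughly
  //   select <header block mask>, %red.next, %red.phi
  // so that lanes disabled by tail folding keep their previous partial value.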
9464 if (CM.foldTailByMasking()) { 9465 Builder.setInsertPoint(LatchVPBB, LatchVPBB->begin()); 9466 for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) { 9467 VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R); 9468 if (!PhiR || PhiR->isInLoop()) 9469 continue; 9470 VPValue *Cond = 9471 RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan); 9472 VPValue *Red = PhiR->getBackedgeValue(); 9473 assert(cast<VPRecipeBase>(Red->getDef())->getParent() != LatchVPBB && 9474 "reduction recipe must be defined before latch"); 9475 Builder.createNaryOp(Instruction::Select, {Cond, Red, PhiR}); 9476 } 9477 } 9478 } 9479 9480 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 9481 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, 9482 VPSlotTracker &SlotTracker) const { 9483 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; 9484 IG->getInsertPos()->printAsOperand(O, false); 9485 O << ", "; 9486 getAddr()->printAsOperand(O, SlotTracker); 9487 VPValue *Mask = getMask(); 9488 if (Mask) { 9489 O << ", "; 9490 Mask->printAsOperand(O, SlotTracker); 9491 } 9492 9493 unsigned OpIdx = 0; 9494 for (unsigned i = 0; i < IG->getFactor(); ++i) { 9495 if (!IG->getMember(i)) 9496 continue; 9497 if (getNumStoreOperands() > 0) { 9498 O << "\n" << Indent << " store "; 9499 getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker); 9500 O << " to index " << i; 9501 } else { 9502 O << "\n" << Indent << " "; 9503 getVPValue(OpIdx)->printAsOperand(O, SlotTracker); 9504 O << " = load from index " << i; 9505 } 9506 ++OpIdx; 9507 } 9508 } 9509 #endif 9510 9511 void VPWidenCallRecipe::execute(VPTransformState &State) { 9512 State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this, 9513 *this, State); 9514 } 9515 9516 void VPWidenSelectRecipe::execute(VPTransformState &State) { 9517 auto &I = *cast<SelectInst>(getUnderlyingInstr()); 9518 State.ILV->setDebugLocFromInst(&I); 9519 9520 // The condition can be loop invariant but still defined inside the 9521 // loop. This means that we can't just use the original 'cond' value. 9522 // We have to take the 'vectorized' value and pick the first lane. 9523 // Instcombine will make this a no-op. 9524 auto *InvarCond = 9525 InvariantCond ? State.get(getOperand(0), VPIteration(0, 0)) : nullptr; 9526 9527 for (unsigned Part = 0; Part < State.UF; ++Part) { 9528 Value *Cond = InvarCond ? 
InvarCond : State.get(getOperand(0), Part); 9529 Value *Op0 = State.get(getOperand(1), Part); 9530 Value *Op1 = State.get(getOperand(2), Part); 9531 Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1); 9532 State.set(this, Sel, Part); 9533 State.ILV->addMetadata(Sel, &I); 9534 } 9535 } 9536 9537 void VPWidenRecipe::execute(VPTransformState &State) { 9538 auto &I = *cast<Instruction>(getUnderlyingValue()); 9539 auto &Builder = State.Builder; 9540 switch (I.getOpcode()) { 9541 case Instruction::Call: 9542 case Instruction::Br: 9543 case Instruction::PHI: 9544 case Instruction::GetElementPtr: 9545 case Instruction::Select: 9546 llvm_unreachable("This instruction is handled by a different recipe."); 9547 case Instruction::UDiv: 9548 case Instruction::SDiv: 9549 case Instruction::SRem: 9550 case Instruction::URem: 9551 case Instruction::Add: 9552 case Instruction::FAdd: 9553 case Instruction::Sub: 9554 case Instruction::FSub: 9555 case Instruction::FNeg: 9556 case Instruction::Mul: 9557 case Instruction::FMul: 9558 case Instruction::FDiv: 9559 case Instruction::FRem: 9560 case Instruction::Shl: 9561 case Instruction::LShr: 9562 case Instruction::AShr: 9563 case Instruction::And: 9564 case Instruction::Or: 9565 case Instruction::Xor: { 9566 // Just widen unops and binops. 9567 State.ILV->setDebugLocFromInst(&I); 9568 9569 for (unsigned Part = 0; Part < State.UF; ++Part) { 9570 SmallVector<Value *, 2> Ops; 9571 for (VPValue *VPOp : operands()) 9572 Ops.push_back(State.get(VPOp, Part)); 9573 9574 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); 9575 9576 if (auto *VecOp = dyn_cast<Instruction>(V)) { 9577 VecOp->copyIRFlags(&I); 9578 9579 // If the instruction is vectorized and was in a basic block that needed 9580 // predication, we can't propagate poison-generating flags (nuw/nsw, 9581 // exact, etc.). The control flow has been linearized and the 9582 // instruction is no longer guarded by the predicate, which could make 9583 // the flag properties to no longer hold. 9584 if (State.MayGeneratePoisonRecipes.contains(this)) 9585 VecOp->dropPoisonGeneratingFlags(); 9586 } 9587 9588 // Use this vector value for all users of the original instruction. 9589 State.set(this, V, Part); 9590 State.ILV->addMetadata(V, &I); 9591 } 9592 9593 break; 9594 } 9595 case Instruction::ICmp: 9596 case Instruction::FCmp: { 9597 // Widen compares. Generate vector compares. 9598 bool FCmp = (I.getOpcode() == Instruction::FCmp); 9599 auto *Cmp = cast<CmpInst>(&I); 9600 State.ILV->setDebugLocFromInst(Cmp); 9601 for (unsigned Part = 0; Part < State.UF; ++Part) { 9602 Value *A = State.get(getOperand(0), Part); 9603 Value *B = State.get(getOperand(1), Part); 9604 Value *C = nullptr; 9605 if (FCmp) { 9606 // Propagate fast math flags. 
9607 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 9608 Builder.setFastMathFlags(Cmp->getFastMathFlags()); 9609 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); 9610 } else { 9611 C = Builder.CreateICmp(Cmp->getPredicate(), A, B); 9612 } 9613 State.set(this, C, Part); 9614 State.ILV->addMetadata(C, &I); 9615 } 9616 9617 break; 9618 } 9619 9620 case Instruction::ZExt: 9621 case Instruction::SExt: 9622 case Instruction::FPToUI: 9623 case Instruction::FPToSI: 9624 case Instruction::FPExt: 9625 case Instruction::PtrToInt: 9626 case Instruction::IntToPtr: 9627 case Instruction::SIToFP: 9628 case Instruction::UIToFP: 9629 case Instruction::Trunc: 9630 case Instruction::FPTrunc: 9631 case Instruction::BitCast: { 9632 auto *CI = cast<CastInst>(&I); 9633 State.ILV->setDebugLocFromInst(CI); 9634 9635 /// Vectorize casts. 9636 Type *DestTy = (State.VF.isScalar()) 9637 ? CI->getType() 9638 : VectorType::get(CI->getType(), State.VF); 9639 9640 for (unsigned Part = 0; Part < State.UF; ++Part) { 9641 Value *A = State.get(getOperand(0), Part); 9642 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); 9643 State.set(this, Cast, Part); 9644 State.ILV->addMetadata(Cast, &I); 9645 } 9646 break; 9647 } 9648 default: 9649 // This instruction is not vectorized by simple widening. 9650 LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); 9651 llvm_unreachable("Unhandled instruction!"); 9652 } // end of switch. 9653 } 9654 9655 void VPWidenGEPRecipe::execute(VPTransformState &State) { 9656 auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr()); 9657 // Construct a vector GEP by widening the operands of the scalar GEP as 9658 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP 9659 // results in a vector of pointers when at least one operand of the GEP 9660 // is vector-typed. Thus, to keep the representation compact, we only use 9661 // vector-typed operands for loop-varying values. 9662 9663 if (State.VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { 9664 // If we are vectorizing, but the GEP has only loop-invariant operands, 9665 // the GEP we build (by only using vector-typed operands for 9666 // loop-varying values) would be a scalar pointer. Thus, to ensure we 9667 // produce a vector of pointers, we need to either arbitrarily pick an 9668 // operand to broadcast, or broadcast a clone of the original GEP. 9669 // Here, we broadcast a clone of the original. 9670 // 9671 // TODO: If at some point we decide to scalarize instructions having 9672 // loop-invariant operands, this special case will no longer be 9673 // required. We would add the scalarization decision to 9674 // collectLoopScalars() and teach getVectorValue() to broadcast 9675 // the lane-zero scalar value. 9676 auto *Clone = State.Builder.Insert(GEP->clone()); 9677 for (unsigned Part = 0; Part < State.UF; ++Part) { 9678 Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, Clone); 9679 State.set(this, EntryPart, Part); 9680 State.ILV->addMetadata(EntryPart, GEP); 9681 } 9682 } else { 9683 // If the GEP has at least one loop-varying operand, we are sure to 9684 // produce a vector of pointers. But if we are only unrolling, we want 9685 // to produce a scalar GEP for each unroll part. Thus, the GEP we 9686 // produce with the code below will be scalar (if VF == 1) or vector 9687 // (otherwise). Note that for the unroll-only case, we still maintain 9688 // values in the vector mapping with initVector, as we do for other 9689 // instructions. 
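    // Illustrative example (with made-up IR names): for a GEP such as
    //   %gep = getelementptr inbounds float, float* %A, i64 %i
    // whose base %A is loop-invariant and whose index is loop-varying, the
    // code below keeps the base scalar and feeds the widened index, so with a
    // vector VF each part becomes
    //   %vgep = getelementptr float, float* %A, <VF x i64> %vec.index
    // i.e. a <VF x float*> vector of pointers.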
9690 for (unsigned Part = 0; Part < State.UF; ++Part) { 9691 // The pointer operand of the new GEP. If it's loop-invariant, we 9692 // won't broadcast it. 9693 auto *Ptr = IsPtrLoopInvariant 9694 ? State.get(getOperand(0), VPIteration(0, 0)) 9695 : State.get(getOperand(0), Part); 9696 9697 // Collect all the indices for the new GEP. If any index is 9698 // loop-invariant, we won't broadcast it. 9699 SmallVector<Value *, 4> Indices; 9700 for (unsigned I = 1, E = getNumOperands(); I < E; I++) { 9701 VPValue *Operand = getOperand(I); 9702 if (IsIndexLoopInvariant[I - 1]) 9703 Indices.push_back(State.get(Operand, VPIteration(0, 0))); 9704 else 9705 Indices.push_back(State.get(Operand, Part)); 9706 } 9707 9708 // If the GEP instruction is vectorized and was in a basic block that 9709 // needed predication, we can't propagate the poison-generating 'inbounds' 9710 // flag. The control flow has been linearized and the GEP is no longer 9711 // guarded by the predicate, which could make the 'inbounds' properties to 9712 // no longer hold. 9713 bool IsInBounds = 9714 GEP->isInBounds() && State.MayGeneratePoisonRecipes.count(this) == 0; 9715 9716 // Create the new GEP. Note that this GEP may be a scalar if VF == 1, 9717 // but it should be a vector, otherwise. 9718 auto *NewGEP = IsInBounds 9719 ? State.Builder.CreateInBoundsGEP( 9720 GEP->getSourceElementType(), Ptr, Indices) 9721 : State.Builder.CreateGEP(GEP->getSourceElementType(), 9722 Ptr, Indices); 9723 assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) && 9724 "NewGEP is not a pointer vector"); 9725 State.set(this, NewGEP, Part); 9726 State.ILV->addMetadata(NewGEP, GEP); 9727 } 9728 } 9729 } 9730 9731 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { 9732 assert(!State.Instance && "Int or FP induction being replicated."); 9733 auto *CanonicalIV = State.get(getParent()->getPlan()->getCanonicalIV(), 0); 9734 State.ILV->widenIntOrFpInduction(IV, this, State, CanonicalIV); 9735 } 9736 9737 void VPWidenPHIRecipe::execute(VPTransformState &State) { 9738 State.ILV->widenPHIInstruction(cast<PHINode>(getUnderlyingValue()), this, 9739 State); 9740 } 9741 9742 void VPBlendRecipe::execute(VPTransformState &State) { 9743 State.ILV->setDebugLocFromInst(Phi, &State.Builder); 9744 // We know that all PHIs in non-header blocks are converted into 9745 // selects, so we don't have to worry about the insertion order and we 9746 // can just use the builder. 9747 // At this point we generate the predication tree. There may be 9748 // duplications since this is a simple recursive scan, but future 9749 // optimizations will clean it up. 9750 9751 unsigned NumIncoming = getNumIncomingValues(); 9752 9753 // Generate a sequence of selects of the form: 9754 // SELECT(Mask3, In3, 9755 // SELECT(Mask2, In2, 9756 // SELECT(Mask1, In1, 9757 // In0))) 9758 // Note that Mask0 is never used: lanes for which no path reaches this phi and 9759 // are essentially undef are taken from In0. 9760 InnerLoopVectorizer::VectorParts Entry(State.UF); 9761 for (unsigned In = 0; In < NumIncoming; ++In) { 9762 for (unsigned Part = 0; Part < State.UF; ++Part) { 9763 // We might have single edge PHIs (blocks) - use an identity 9764 // 'select' for the first PHI operand. 9765 Value *In0 = State.get(getIncomingValue(In), Part); 9766 if (In == 0) 9767 Entry[Part] = In0; // Initialize with the first incoming value. 9768 else { 9769 // Select between the current value and the previous incoming edge 9770 // based on the incoming mask. 
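        // getMask(In) is the mask guarding the In'th incoming value: lanes
        // where it is true take the value just loaded into In0, all other
        // lanes keep whatever has been accumulated in Entry[Part] so far.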
9771 Value *Cond = State.get(getMask(In), Part); 9772 Entry[Part] = 9773 State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi"); 9774 } 9775 } 9776 } 9777 for (unsigned Part = 0; Part < State.UF; ++Part) 9778 State.set(this, Entry[Part], Part); 9779 } 9780 9781 void VPInterleaveRecipe::execute(VPTransformState &State) { 9782 assert(!State.Instance && "Interleave group being replicated."); 9783 State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(), 9784 getStoredValues(), getMask()); 9785 } 9786 9787 void VPReductionRecipe::execute(VPTransformState &State) { 9788 assert(!State.Instance && "Reduction being replicated."); 9789 Value *PrevInChain = State.get(getChainOp(), 0); 9790 RecurKind Kind = RdxDesc->getRecurrenceKind(); 9791 bool IsOrdered = State.ILV->useOrderedReductions(*RdxDesc); 9792 // Propagate the fast-math flags carried by the underlying instruction. 9793 IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder); 9794 State.Builder.setFastMathFlags(RdxDesc->getFastMathFlags()); 9795 for (unsigned Part = 0; Part < State.UF; ++Part) { 9796 Value *NewVecOp = State.get(getVecOp(), Part); 9797 if (VPValue *Cond = getCondOp()) { 9798 Value *NewCond = State.get(Cond, Part); 9799 VectorType *VecTy = cast<VectorType>(NewVecOp->getType()); 9800 Value *Iden = RdxDesc->getRecurrenceIdentity( 9801 Kind, VecTy->getElementType(), RdxDesc->getFastMathFlags()); 9802 Value *IdenVec = 9803 State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden); 9804 Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec); 9805 NewVecOp = Select; 9806 } 9807 Value *NewRed; 9808 Value *NextInChain; 9809 if (IsOrdered) { 9810 if (State.VF.isVector()) 9811 NewRed = createOrderedReduction(State.Builder, *RdxDesc, NewVecOp, 9812 PrevInChain); 9813 else 9814 NewRed = State.Builder.CreateBinOp( 9815 (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), PrevInChain, 9816 NewVecOp); 9817 PrevInChain = NewRed; 9818 } else { 9819 PrevInChain = State.get(getChainOp(), Part); 9820 NewRed = createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp); 9821 } 9822 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9823 NextInChain = 9824 createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(), 9825 NewRed, PrevInChain); 9826 } else if (IsOrdered) 9827 NextInChain = NewRed; 9828 else 9829 NextInChain = State.Builder.CreateBinOp( 9830 (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), NewRed, 9831 PrevInChain); 9832 State.set(this, NextInChain, Part); 9833 } 9834 } 9835 9836 void VPReplicateRecipe::execute(VPTransformState &State) { 9837 if (State.Instance) { // Generate a single instance. 9838 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector"); 9839 State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *State.Instance, 9840 IsPredicated, State); 9841 // Insert scalar instance packing it into a vector. 9842 if (AlsoPack && State.VF.isVector()) { 9843 // If we're constructing lane 0, initialize to start from poison. 
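      // packScalarIntoVectorValue below then inserts this instance's scalar
      // result into that vector at the current lane, so each lane builds on
      // the vector produced for the previous lane of the same unroll part.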
      if (State.Instance->Lane.isFirstLane()) {
        assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
        Value *Poison = PoisonValue::get(
            VectorType::get(getUnderlyingValue()->getType(), State.VF));
        State.set(this, Poison, State.Instance->Part);
      }
      State.ILV->packScalarIntoVectorValue(this, *State.Instance, State);
    }
    return;
  }

  // Generate scalar instances for all VF lanes of all UF parts, unless the
  // instruction is uniform, in which case generate only the first lane for
  // each of the UF parts.
  unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();
  assert((!State.VF.isScalable() || IsUniform) &&
         "Can't scalarize a scalable vector");
  for (unsigned Part = 0; Part < State.UF; ++Part)
    for (unsigned Lane = 0; Lane < EndLane; ++Lane)
      State.ILV->scalarizeInstruction(getUnderlyingInstr(), this,
                                      VPIteration(Part, Lane), IsPredicated,
                                      State);
}

void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Branch on Mask works only on single instance.");

  unsigned Part = State.Instance->Part;
  unsigned Lane = State.Instance->Lane.getKnownLane();

  Value *ConditionBit = nullptr;
  VPValue *BlockInMask = getMask();
  if (BlockInMask) {
    ConditionBit = State.get(BlockInMask, Part);
    if (ConditionBit->getType()->isVectorTy())
      ConditionBit = State.Builder.CreateExtractElement(
          ConditionBit, State.Builder.getInt32(Lane));
  } else // Block in mask is all-one.
    ConditionBit = State.Builder.getTrue();

  // Replace the temporary unreachable terminator with a new conditional
  // branch, whose two destinations will be set later when they are created.
  auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
  assert(isa<UnreachableInst>(CurrentTerminator) &&
         "Expected to replace unreachable terminator with conditional branch.");
  auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
  CondBr->setSuccessor(0, nullptr);
  ReplaceInstWithInst(CurrentTerminator, CondBr);
}

void VPPredInstPHIRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Predicated instruction PHI works per instance.");
  Instruction *ScalarPredInst =
      cast<Instruction>(State.get(getOperand(0), *State.Instance));
  BasicBlock *PredicatedBB = ScalarPredInst->getParent();
  BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
  assert(PredicatingBB && "Predicated block has no single predecessor.");
  assert(isa<VPReplicateRecipe>(getOperand(0)) &&
         "operand must be VPReplicateRecipe");

  // By current pack/unpack logic we need to generate only a single phi node:
  // if a vector value for the predicated instruction exists at this point it
  // means the instruction has vector users only, and a phi for the vector
  // value is needed. In this case the recipe of the predicated instruction is
  // marked to also do that packing, thereby "hoisting" the insert-element
  // sequence. Otherwise, a phi node for the scalar value is needed.
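  // Illustration with hypothetical IR names: in the vector case the generated
  // phi merges the unmodified vector coming from the predicating block with
  // the vector that received the new element in the predicated block, e.g.
  //   %vphi = phi <4 x i32> [ %vec, %pred.check ], [ %vec.insert, %pred.body ]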
9910 unsigned Part = State.Instance->Part; 9911 if (State.hasVectorValue(getOperand(0), Part)) { 9912 Value *VectorValue = State.get(getOperand(0), Part); 9913 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); 9914 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); 9915 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. 9916 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. 9917 if (State.hasVectorValue(this, Part)) 9918 State.reset(this, VPhi, Part); 9919 else 9920 State.set(this, VPhi, Part); 9921 // NOTE: Currently we need to update the value of the operand, so the next 9922 // predicated iteration inserts its generated value in the correct vector. 9923 State.reset(getOperand(0), VPhi, Part); 9924 } else { 9925 Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType(); 9926 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); 9927 Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()), 9928 PredicatingBB); 9929 Phi->addIncoming(ScalarPredInst, PredicatedBB); 9930 if (State.hasScalarValue(this, *State.Instance)) 9931 State.reset(this, Phi, *State.Instance); 9932 else 9933 State.set(this, Phi, *State.Instance); 9934 // NOTE: Currently we need to update the value of the operand, so the next 9935 // predicated iteration inserts its generated value in the correct vector. 9936 State.reset(getOperand(0), Phi, *State.Instance); 9937 } 9938 } 9939 9940 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { 9941 VPValue *StoredValue = isStore() ? getStoredValue() : nullptr; 9942 9943 // Attempt to issue a wide load. 9944 LoadInst *LI = dyn_cast<LoadInst>(&Ingredient); 9945 StoreInst *SI = dyn_cast<StoreInst>(&Ingredient); 9946 9947 assert((LI || SI) && "Invalid Load/Store instruction"); 9948 assert((!SI || StoredValue) && "No stored value provided for widened store"); 9949 assert((!LI || !StoredValue) && "Stored value provided for widened load"); 9950 9951 Type *ScalarDataTy = getLoadStoreType(&Ingredient); 9952 9953 auto *DataTy = VectorType::get(ScalarDataTy, State.VF); 9954 const Align Alignment = getLoadStoreAlignment(&Ingredient); 9955 bool CreateGatherScatter = !Consecutive; 9956 9957 auto &Builder = State.Builder; 9958 InnerLoopVectorizer::VectorParts BlockInMaskParts(State.UF); 9959 bool isMaskRequired = getMask(); 9960 if (isMaskRequired) 9961 for (unsigned Part = 0; Part < State.UF; ++Part) 9962 BlockInMaskParts[Part] = State.get(getMask(), Part); 9963 9964 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { 9965 // Calculate the pointer for the specific unroll-part. 9966 GetElementPtrInst *PartPtr = nullptr; 9967 9968 bool InBounds = false; 9969 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) 9970 InBounds = gep->isInBounds(); 9971 if (Reverse) { 9972 // If the address is consecutive but reversed, then the 9973 // wide store needs to start at the last vector element. 
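      // Worked example of the computation below: for a fixed-width VF of 4
      // and Part == 1, RunTimeVF is 4, NumElt is -4 and LastLane is -3, so the
      // two GEPs offset the pointer by -7 elements in total; the vector
      // reverse applied afterwards restores the lane order expected by the
      // reversed scalar loop.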
9974 // RunTimeVF = VScale * VF.getKnownMinValue() 9975 // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue() 9976 Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), State.VF); 9977 // NumElt = -Part * RunTimeVF 9978 Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF); 9979 // LastLane = 1 - RunTimeVF 9980 Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF); 9981 PartPtr = 9982 cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt)); 9983 PartPtr->setIsInBounds(InBounds); 9984 PartPtr = cast<GetElementPtrInst>( 9985 Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane)); 9986 PartPtr->setIsInBounds(InBounds); 9987 if (isMaskRequired) // Reverse of a null all-one mask is a null mask. 9988 BlockInMaskParts[Part] = 9989 Builder.CreateVectorReverse(BlockInMaskParts[Part], "reverse"); 9990 } else { 9991 Value *Increment = 9992 createStepForVF(Builder, Builder.getInt32Ty(), State.VF, Part); 9993 PartPtr = cast<GetElementPtrInst>( 9994 Builder.CreateGEP(ScalarDataTy, Ptr, Increment)); 9995 PartPtr->setIsInBounds(InBounds); 9996 } 9997 9998 unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); 9999 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); 10000 }; 10001 10002 // Handle Stores: 10003 if (SI) { 10004 State.ILV->setDebugLocFromInst(SI); 10005 10006 for (unsigned Part = 0; Part < State.UF; ++Part) { 10007 Instruction *NewSI = nullptr; 10008 Value *StoredVal = State.get(StoredValue, Part); 10009 if (CreateGatherScatter) { 10010 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 10011 Value *VectorGep = State.get(getAddr(), Part); 10012 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, 10013 MaskPart); 10014 } else { 10015 if (Reverse) { 10016 // If we store to reverse consecutive memory locations, then we need 10017 // to reverse the order of elements in the stored value. 10018 StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse"); 10019 // We don't want to update the value in the map as it might be used in 10020 // another expression. So don't call resetVectorValue(StoredVal). 10021 } 10022 auto *VecPtr = 10023 CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0))); 10024 if (isMaskRequired) 10025 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, 10026 BlockInMaskParts[Part]); 10027 else 10028 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); 10029 } 10030 State.ILV->addMetadata(NewSI, SI); 10031 } 10032 return; 10033 } 10034 10035 // Handle loads. 10036 assert(LI && "Must have a load instruction"); 10037 State.ILV->setDebugLocFromInst(LI); 10038 for (unsigned Part = 0; Part < State.UF; ++Part) { 10039 Value *NewLI; 10040 if (CreateGatherScatter) { 10041 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 10042 Value *VectorGep = State.get(getAddr(), Part); 10043 NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart, 10044 nullptr, "wide.masked.gather"); 10045 State.ILV->addMetadata(NewLI, LI); 10046 } else { 10047 auto *VecPtr = 10048 CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0))); 10049 if (isMaskRequired) 10050 NewLI = Builder.CreateMaskedLoad( 10051 DataTy, VecPtr, Alignment, BlockInMaskParts[Part], 10052 PoisonValue::get(DataTy), "wide.masked.load"); 10053 else 10054 NewLI = 10055 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); 10056 10057 // Add metadata to the load, but setVectorValue to the reverse shuffle. 
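      // For fixed-width VFs the vector reverse below is emitted as a
      // shufflevector with mask <VF-1, ..., 1, 0>. The metadata is attached to
      // the wide load itself, while users of this recipe see the reversed
      // value because State is updated to the shuffle result.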
10058 State.ILV->addMetadata(NewLI, LI); 10059 if (Reverse) 10060 NewLI = Builder.CreateVectorReverse(NewLI, "reverse"); 10061 } 10062 10063 State.set(this, NewLI, Part); 10064 } 10065 } 10066 10067 // Determine how to lower the scalar epilogue, which depends on 1) optimising 10068 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing 10069 // predication, and 4) a TTI hook that analyses whether the loop is suitable 10070 // for predication. 10071 static ScalarEpilogueLowering getScalarEpilogueLowering( 10072 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, 10073 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, 10074 AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, 10075 LoopVectorizationLegality &LVL) { 10076 // 1) OptSize takes precedence over all other options, i.e. if this is set, 10077 // don't look at hints or options, and don't request a scalar epilogue. 10078 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from 10079 // LoopAccessInfo (due to code dependency and not being able to reliably get 10080 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection 10081 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without 10082 // versioning when the vectorization is forced, unlike hasOptSize. So revert 10083 // back to the old way and vectorize with versioning when forced. See D81345.) 10084 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, 10085 PGSOQueryType::IRPass) && 10086 Hints.getForce() != LoopVectorizeHints::FK_Enabled)) 10087 return CM_ScalarEpilogueNotAllowedOptSize; 10088 10089 // 2) If set, obey the directives 10090 if (PreferPredicateOverEpilogue.getNumOccurrences()) { 10091 switch (PreferPredicateOverEpilogue) { 10092 case PreferPredicateTy::ScalarEpilogue: 10093 return CM_ScalarEpilogueAllowed; 10094 case PreferPredicateTy::PredicateElseScalarEpilogue: 10095 return CM_ScalarEpilogueNotNeededUsePredicate; 10096 case PreferPredicateTy::PredicateOrDontVectorize: 10097 return CM_ScalarEpilogueNotAllowedUsePredicate; 10098 }; 10099 } 10100 10101 // 3) If set, obey the hints 10102 switch (Hints.getPredicate()) { 10103 case LoopVectorizeHints::FK_Enabled: 10104 return CM_ScalarEpilogueNotNeededUsePredicate; 10105 case LoopVectorizeHints::FK_Disabled: 10106 return CM_ScalarEpilogueAllowed; 10107 }; 10108 10109 // 4) if the TTI hook indicates this is profitable, request predication. 10110 if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, 10111 LVL.getLAI())) 10112 return CM_ScalarEpilogueNotNeededUsePredicate; 10113 10114 return CM_ScalarEpilogueAllowed; 10115 } 10116 10117 Value *VPTransformState::get(VPValue *Def, unsigned Part) { 10118 // If Values have been set for this Def return the one relevant for \p Part. 10119 if (hasVectorValue(Def, Part)) 10120 return Data.PerPartOutput[Def][Part]; 10121 10122 if (!hasScalarValue(Def, {Part, 0})) { 10123 Value *IRV = Def->getLiveInIRValue(); 10124 Value *B = ILV->getBroadcastInstrs(IRV); 10125 set(Def, B, Part); 10126 return B; 10127 } 10128 10129 Value *ScalarValue = get(Def, {Part, 0}); 10130 // If we aren't vectorizing, we can just copy the scalar map values over 10131 // to the vector map. 10132 if (VF.isScalar()) { 10133 set(Def, ScalarValue, Part); 10134 return ScalarValue; 10135 } 10136 10137 auto *RepR = dyn_cast<VPReplicateRecipe>(Def); 10138 bool IsUniform = RepR && RepR->isUniform(); 10139 10140 unsigned LastLane = IsUniform ? 
0 : VF.getKnownMinValue() - 1; 10141 // Check if there is a scalar value for the selected lane. 10142 if (!hasScalarValue(Def, {Part, LastLane})) { 10143 // At the moment, VPWidenIntOrFpInductionRecipes can also be uniform. 10144 assert(isa<VPWidenIntOrFpInductionRecipe>(Def->getDef()) && 10145 "unexpected recipe found to be invariant"); 10146 IsUniform = true; 10147 LastLane = 0; 10148 } 10149 10150 auto *LastInst = cast<Instruction>(get(Def, {Part, LastLane})); 10151 // Set the insert point after the last scalarized instruction or after the 10152 // last PHI, if LastInst is a PHI. This ensures the insertelement sequence 10153 // will directly follow the scalar definitions. 10154 auto OldIP = Builder.saveIP(); 10155 auto NewIP = 10156 isa<PHINode>(LastInst) 10157 ? BasicBlock::iterator(LastInst->getParent()->getFirstNonPHI()) 10158 : std::next(BasicBlock::iterator(LastInst)); 10159 Builder.SetInsertPoint(&*NewIP); 10160 10161 // However, if we are vectorizing, we need to construct the vector values. 10162 // If the value is known to be uniform after vectorization, we can just 10163 // broadcast the scalar value corresponding to lane zero for each unroll 10164 // iteration. Otherwise, we construct the vector values using 10165 // insertelement instructions. Since the resulting vectors are stored in 10166 // State, we will only generate the insertelements once. 10167 Value *VectorValue = nullptr; 10168 if (IsUniform) { 10169 VectorValue = ILV->getBroadcastInstrs(ScalarValue); 10170 set(Def, VectorValue, Part); 10171 } else { 10172 // Initialize packing with insertelements to start from undef. 10173 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 10174 Value *Undef = PoisonValue::get(VectorType::get(LastInst->getType(), VF)); 10175 set(Def, Undef, Part); 10176 for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane) 10177 ILV->packScalarIntoVectorValue(Def, {Part, Lane}, *this); 10178 VectorValue = get(Def, Part); 10179 } 10180 Builder.restoreIP(OldIP); 10181 return VectorValue; 10182 } 10183 10184 // Process the loop in the VPlan-native vectorization path. This path builds 10185 // VPlan upfront in the vectorization pipeline, which allows to apply 10186 // VPlan-to-VPlan transformations from the very beginning without modifying the 10187 // input LLVM IR. 10188 static bool processLoopInVPlanNativePath( 10189 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, 10190 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, 10191 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, 10192 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, 10193 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints, 10194 LoopVectorizationRequirements &Requirements) { 10195 10196 if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) { 10197 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n"); 10198 return false; 10199 } 10200 assert(EnableVPlanNativePath && "VPlan-native path is disabled."); 10201 Function *F = L->getHeader()->getParent(); 10202 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); 10203 10204 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 10205 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL); 10206 10207 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, 10208 &Hints, IAI); 10209 // Use the planner for outer loop vectorization. 10210 // TODO: CM is not used at this point inside the planner. 
Turn CM into an 10211 // optional argument if we don't need it in the future. 10212 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints, 10213 Requirements, ORE); 10214 10215 // Get user vectorization factor. 10216 ElementCount UserVF = Hints.getWidth(); 10217 10218 CM.collectElementTypesForWidening(); 10219 10220 // Plan how to best vectorize, return the best VF and its cost. 10221 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF); 10222 10223 // If we are stress testing VPlan builds, do not attempt to generate vector 10224 // code. Masked vector code generation support will follow soon. 10225 // Also, do not attempt to vectorize if no vector code will be produced. 10226 if (VPlanBuildStressTest || EnableVPlanPredication || 10227 VectorizationFactor::Disabled() == VF) 10228 return false; 10229 10230 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); 10231 10232 { 10233 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, 10234 F->getParent()->getDataLayout()); 10235 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL, 10236 &CM, BFI, PSI, Checks); 10237 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" 10238 << L->getHeader()->getParent()->getName() << "\"\n"); 10239 LVP.executePlan(VF.Width, 1, BestPlan, LB, DT); 10240 } 10241 10242 // Mark the loop as already vectorized to avoid vectorizing again. 10243 Hints.setAlreadyVectorized(); 10244 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 10245 return true; 10246 } 10247 10248 // Emit a remark if there are stores to floats that required a floating point 10249 // extension. If the vectorized loop was generated with floating point there 10250 // will be a performance penalty from the conversion overhead and the change in 10251 // the vector width. 10252 static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) { 10253 SmallVector<Instruction *, 4> Worklist; 10254 for (BasicBlock *BB : L->getBlocks()) { 10255 for (Instruction &Inst : *BB) { 10256 if (auto *S = dyn_cast<StoreInst>(&Inst)) { 10257 if (S->getValueOperand()->getType()->isFloatTy()) 10258 Worklist.push_back(S); 10259 } 10260 } 10261 } 10262 10263 // Traverse the floating point stores upwards searching, for floating point 10264 // conversions. 10265 SmallPtrSet<const Instruction *, 4> Visited; 10266 SmallPtrSet<const Instruction *, 4> EmittedRemark; 10267 while (!Worklist.empty()) { 10268 auto *I = Worklist.pop_back_val(); 10269 if (!L->contains(I)) 10270 continue; 10271 if (!Visited.insert(I).second) 10272 continue; 10273 10274 // Emit a remark if the floating point store required a floating 10275 // point conversion. 10276 // TODO: More work could be done to identify the root cause such as a 10277 // constant or a function return type and point the user to it. 10278 if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second) 10279 ORE->emit([&]() { 10280 return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision", 10281 I->getDebugLoc(), L->getHeader()) 10282 << "floating point conversion changes vector width. 
" 10283 << "Mixed floating point precision requires an up/down " 10284 << "cast that will negatively impact performance."; 10285 }); 10286 10287 for (Use &Op : I->operands()) 10288 if (auto *OpI = dyn_cast<Instruction>(Op)) 10289 Worklist.push_back(OpI); 10290 } 10291 } 10292 10293 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts) 10294 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced || 10295 !EnableLoopInterleaving), 10296 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced || 10297 !EnableLoopVectorization) {} 10298 10299 bool LoopVectorizePass::processLoop(Loop *L) { 10300 assert((EnableVPlanNativePath || L->isInnermost()) && 10301 "VPlan-native path is not enabled. Only process inner loops."); 10302 10303 #ifndef NDEBUG 10304 const std::string DebugLocStr = getDebugLocString(L); 10305 #endif /* NDEBUG */ 10306 10307 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \"" 10308 << L->getHeader()->getParent()->getName() << "\" from " 10309 << DebugLocStr << "\n"); 10310 10311 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI); 10312 10313 LLVM_DEBUG( 10314 dbgs() << "LV: Loop hints:" 10315 << " force=" 10316 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled 10317 ? "disabled" 10318 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled 10319 ? "enabled" 10320 : "?")) 10321 << " width=" << Hints.getWidth() 10322 << " interleave=" << Hints.getInterleave() << "\n"); 10323 10324 // Function containing loop 10325 Function *F = L->getHeader()->getParent(); 10326 10327 // Looking at the diagnostic output is the only way to determine if a loop 10328 // was vectorized (other than looking at the IR or machine code), so it 10329 // is important to generate an optimization remark for each loop. Most of 10330 // these messages are generated as OptimizationRemarkAnalysis. Remarks 10331 // generated as OptimizationRemark and OptimizationRemarkMissed are 10332 // less verbose reporting vectorized loops and unvectorized loops that may 10333 // benefit from vectorization, respectively. 10334 10335 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) { 10336 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); 10337 return false; 10338 } 10339 10340 PredicatedScalarEvolution PSE(*SE, *L); 10341 10342 // Check if it is legal to vectorize the loop. 10343 LoopVectorizationRequirements Requirements; 10344 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE, 10345 &Requirements, &Hints, DB, AC, BFI, PSI); 10346 if (!LVL.canVectorize(EnableVPlanNativePath)) { 10347 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); 10348 Hints.emitRemarkWithHints(); 10349 return false; 10350 } 10351 10352 // Check the function attributes and profiles to find out if this function 10353 // should be optimized for size. 10354 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 10355 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL); 10356 10357 // Entrance to the VPlan-native vectorization path. Outer loops are processed 10358 // here. They may require CFG and instruction level transformations before 10359 // even evaluating whether vectorization is profitable. Since we cannot modify 10360 // the incoming IR, we need to build VPlan upfront in the vectorization 10361 // pipeline. 
10362 if (!L->isInnermost()) 10363 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, 10364 ORE, BFI, PSI, Hints, Requirements); 10365 10366 assert(L->isInnermost() && "Inner loop expected."); 10367 10368 // Check the loop for a trip count threshold: vectorize loops with a tiny trip 10369 // count by optimizing for size, to minimize overheads. 10370 auto ExpectedTC = getSmallBestKnownTC(*SE, L); 10371 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) { 10372 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " 10373 << "This loop is worth vectorizing only if no scalar " 10374 << "iteration overheads are incurred."); 10375 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) 10376 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); 10377 else { 10378 LLVM_DEBUG(dbgs() << "\n"); 10379 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; 10380 } 10381 } 10382 10383 // Check the function attributes to see if implicit floats are allowed. 10384 // FIXME: This check doesn't seem possibly correct -- what if the loop is 10385 // an integer loop and the vector instructions selected are purely integer 10386 // vector instructions? 10387 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { 10388 reportVectorizationFailure( 10389 "Can't vectorize when the NoImplicitFloat attribute is used", 10390 "loop not vectorized due to NoImplicitFloat attribute", 10391 "NoImplicitFloat", ORE, L); 10392 Hints.emitRemarkWithHints(); 10393 return false; 10394 } 10395 10396 // Check if the target supports potentially unsafe FP vectorization. 10397 // FIXME: Add a check for the type of safety issue (denormal, signaling) 10398 // for the target we're vectorizing for, to make sure none of the 10399 // additional fp-math flags can help. 10400 if (Hints.isPotentiallyUnsafe() && 10401 TTI->isFPVectorizationPotentiallyUnsafe()) { 10402 reportVectorizationFailure( 10403 "Potentially unsafe FP op prevents vectorization", 10404 "loop not vectorized due to unsafe FP support.", 10405 "UnsafeFP", ORE, L); 10406 Hints.emitRemarkWithHints(); 10407 return false; 10408 } 10409 10410 bool AllowOrderedReductions; 10411 // If the flag is set, use that instead and override the TTI behaviour. 10412 if (ForceOrderedReductions.getNumOccurrences() > 0) 10413 AllowOrderedReductions = ForceOrderedReductions; 10414 else 10415 AllowOrderedReductions = TTI->enableOrderedReductions(); 10416 if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) { 10417 ORE->emit([&]() { 10418 auto *ExactFPMathInst = Requirements.getExactFPInst(); 10419 return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps", 10420 ExactFPMathInst->getDebugLoc(), 10421 ExactFPMathInst->getParent()) 10422 << "loop not vectorized: cannot prove it is safe to reorder " 10423 "floating-point operations"; 10424 }); 10425 LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to " 10426 "reorder floating-point operations\n"); 10427 Hints.emitRemarkWithHints(); 10428 return false; 10429 } 10430 10431 bool UseInterleaved = TTI->enableInterleavedAccessVectorization(); 10432 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI()); 10433 10434 // If an override option has been passed in for interleaved accesses, use it. 10435 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0) 10436 UseInterleaved = EnableInterleavedMemAccesses; 10437 10438 // Analyze interleaved memory accesses. 
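  // Illustrative example (hypothetical source loop): a loop that reads both
  // A[2 * i] and A[2 * i + 1] in each iteration forms a load group with
  // factor 2, which the analysis below can later lower to a single wide load
  // plus shuffles instead of two strided accesses.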
  if (UseInterleaved) {
    IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
  }

  // Use the cost model.
  LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
                                F, &Hints, IAI);
  CM.collectValuesToIgnore();
  CM.collectElementTypesForWidening();

  // Use the planner for vectorization.
  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints,
                               Requirements, ORE);

  // Get user vectorization factor and interleave count.
  ElementCount UserVF = Hints.getWidth();
  unsigned UserIC = Hints.getInterleave();

  // Plan how to best vectorize, return the best VF and its cost.
  Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);

  VectorizationFactor VF = VectorizationFactor::Disabled();
  unsigned IC = 1;

  if (MaybeVF) {
    VF = *MaybeVF;
    // Select the interleave count.
    IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue());
  }

  // Identify the diagnostic messages that should be produced.
  std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
  bool VectorizeLoop = true, InterleaveLoop = true;
  if (VF.Width.isScalar()) {
    LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
    VecDiagMsg = std::make_pair(
        "VectorizationNotBeneficial",
        "the cost-model indicates that vectorization is not beneficial");
    VectorizeLoop = false;
  }

  if (!MaybeVF && UserIC > 1) {
    // Tell the user interleaving was avoided up-front, despite being
    // explicitly requested.
    LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
                         "interleaving should be avoided up front\n");
    IntDiagMsg = std::make_pair(
        "InterleavingAvoided",
        "Ignoring UserIC, because interleaving was avoided up front");
    InterleaveLoop = false;
  } else if (IC == 1 && UserIC <= 1) {
    // Tell the user interleaving is not beneficial.
    LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
    IntDiagMsg = std::make_pair(
        "InterleavingNotBeneficial",
        "the cost-model indicates that interleaving is not beneficial");
    InterleaveLoop = false;
    if (UserIC == 1) {
      IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
      IntDiagMsg.second +=
          " and is explicitly disabled or interleave count is set to 1";
    }
  } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it is explicitly disabled.
    LLVM_DEBUG(
        dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
    IntDiagMsg = std::make_pair(
        "InterleavingBeneficialButDisabled",
        "the cost-model indicates that interleaving is beneficial "
        "but is explicitly disabled or interleave count is set to 1");
    InterleaveLoop = false;
  }

  // Override IC if user provided an interleave count.
  IC = UserIC > 0 ? UserIC : IC;

  // Emit diagnostic messages, if any.
  const char *VAPassName = Hints.vectorizeAnalysisPassName();
  if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
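    // Both missed-optimization remarks are emitted so the user can see why
    // vectorization and interleaving were each rejected before we bail out.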
    ORE->emit([&]() {
      return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
    ORE->emit([&]() {
      return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
    return false;
  } else if (!VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
  } else if (VectorizeLoop && !InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
  } else if (VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
  }

  bool DisableRuntimeUnroll = false;
  MDNode *OrigLoopID = L->getLoopID();
  {
    // Optimistically generate runtime checks. Drop them if they turn out to
    // not be profitable. Limit the scope of Checks, so the cleanup happens
    // immediately after vector code generation is done.
    GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
                             F->getParent()->getDataLayout());
    if (!VF.Width.isScalar() || IC > 1)
      Checks.Create(L, *LVL.getLAI(), PSE.getPredicate());

    using namespace ore;
    if (!VectorizeLoop) {
      assert(IC > 1 && "interleave count should not be 1 or 0");
      // If we decided that it is not profitable to vectorize the loop, then
      // interleave it.
      InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
                                 &CM, BFI, PSI, Checks);

      VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
      LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT);

      ORE->emit([&]() {
        return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
                                  L->getHeader())
               << "interleaved loop (interleaved count: "
               << NV("InterleaveCount", IC) << ")";
      });
    } else {
      // If we decided that it is *legal* to vectorize the loop, then do it.

      // Consider vectorizing the epilogue too if it's profitable.
      VectorizationFactor EpilogueVF =
          CM.selectEpilogueVectorizationFactor(VF.Width, LVP);
      if (EpilogueVF.Width.isVector()) {

        // The first pass vectorizes the main loop and creates a scalar
        // epilogue to be vectorized by executing the plan (potentially with a
        // different factor) again shortly afterwards.
        EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1);
        EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
                                           EPI, &LVL, &CM, BFI, PSI, Checks);

        VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF);
        LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV,
                        DT);
        ++LoopsVectorized;

        simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
        formLCSSARecursively(*L, *DT, LI, SE);

        // Second pass vectorizes the epilogue and adjusts the control flow
        // edges from the first pass.
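        // The same EPI object is reused for this second pass: its main-loop
        // fields are overwritten with the epilogue VF/UF below, so the
        // epilogue vectorizer sees them as the factors to generate code for.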
10604 EPI.MainLoopVF = EPI.EpilogueVF; 10605 EPI.MainLoopUF = EPI.EpilogueUF; 10606 EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC, 10607 ORE, EPI, &LVL, &CM, BFI, PSI, 10608 Checks); 10609 10610 VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF); 10611 10612 // Ensure that the start values for any VPReductionPHIRecipes are 10613 // updated before vectorising the epilogue loop. 10614 VPBasicBlock *Header = BestEpiPlan.getEntry()->getEntryBasicBlock(); 10615 for (VPRecipeBase &R : Header->phis()) { 10616 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) { 10617 if (auto *Resume = MainILV.getReductionResumeValue( 10618 ReductionPhi->getRecurrenceDescriptor())) { 10619 VPValue *StartVal = new VPValue(Resume); 10620 BestEpiPlan.addExternalDef(StartVal); 10621 ReductionPhi->setOperand(0, StartVal); 10622 } 10623 } 10624 } 10625 10626 LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV, 10627 DT); 10628 ++LoopsEpilogueVectorized; 10629 10630 if (!MainILV.areSafetyChecksAdded()) 10631 DisableRuntimeUnroll = true; 10632 } else { 10633 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC, 10634 &LVL, &CM, BFI, PSI, Checks); 10635 10636 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); 10637 LVP.executePlan(VF.Width, IC, BestPlan, LB, DT); 10638 ++LoopsVectorized; 10639 10640 // Add metadata to disable runtime unrolling a scalar loop when there 10641 // are no runtime checks about strides and memory. A scalar loop that is 10642 // rarely used is not worth unrolling. 10643 if (!LB.areSafetyChecksAdded()) 10644 DisableRuntimeUnroll = true; 10645 } 10646 // Report the vectorization decision. 10647 ORE->emit([&]() { 10648 return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(), 10649 L->getHeader()) 10650 << "vectorized loop (vectorization width: " 10651 << NV("VectorizationFactor", VF.Width) 10652 << ", interleaved count: " << NV("InterleaveCount", IC) << ")"; 10653 }); 10654 } 10655 10656 if (ORE->allowExtraAnalysis(LV_NAME)) 10657 checkMixedPrecision(L, ORE); 10658 } 10659 10660 Optional<MDNode *> RemainderLoopID = 10661 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 10662 LLVMLoopVectorizeFollowupEpilogue}); 10663 if (RemainderLoopID.hasValue()) { 10664 L->setLoopID(RemainderLoopID.getValue()); 10665 } else { 10666 if (DisableRuntimeUnroll) 10667 AddRuntimeUnrollDisableMetaData(L); 10668 10669 // Mark the loop as already vectorized to avoid vectorizing again. 10670 Hints.setAlreadyVectorized(); 10671 } 10672 10673 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 10674 return true; 10675 } 10676 10677 LoopVectorizeResult LoopVectorizePass::runImpl( 10678 Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_, 10679 DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_, 10680 DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_, 10681 std::function<const LoopAccessInfo &(Loop &)> &GetLAA_, 10682 OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) { 10683 SE = &SE_; 10684 LI = &LI_; 10685 TTI = &TTI_; 10686 DT = &DT_; 10687 BFI = &BFI_; 10688 TLI = TLI_; 10689 AA = &AA_; 10690 AC = &AC_; 10691 GetLAA = &GetLAA_; 10692 DB = &DB_; 10693 ORE = &ORE_; 10694 PSI = PSI_; 10695 10696 // Don't attempt if 10697 // 1. the target claims to have no vector registers, and 10698 // 2. interleaving won't help ILP. 
  //
  // The second condition is necessary because, even if the target has no
  // vector registers, loop vectorization may still enable scalar
  // interleaving.
  if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
      TTI->getMaxInterleaveFactor(1) < 2)
    return LoopVectorizeResult(false, false);

  bool Changed = false, CFGChanged = false;

  // The vectorizer requires loops to be in simplified form.
  // Since simplification may add new inner loops, it has to run before the
  // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
  // vectorized.
  for (auto &L : *LI)
    Changed |= CFGChanged |=
        simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);

  // Build up a worklist of inner-loops to vectorize. This is necessary as
  // the act of vectorizing or partially unrolling a loop creates new loops
  // and can invalidate iterators across the loops.
  SmallVector<Loop *, 8> Worklist;

  for (Loop *L : *LI)
    collectSupportedLoops(*L, LI, ORE, Worklist);

  LoopsAnalyzed += Worklist.size();

  // Now walk the identified inner loops.
  while (!Worklist.empty()) {
    Loop *L = Worklist.pop_back_val();

    // For the inner loops we actually process, form LCSSA to simplify the
    // transform.
    Changed |= formLCSSARecursively(*L, *DT, LI, SE);

    Changed |= CFGChanged |= processLoop(L);
  }

  // Process each loop nest in the function.
  return LoopVectorizeResult(Changed, CFGChanged);
}

PreservedAnalyses LoopVectorizePass::run(Function &F,
                                         FunctionAnalysisManager &AM) {
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &LI = AM.getResult<LoopAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &AA = AM.getResult<AAManager>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);

  auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
      [&](Loop &L) -> const LoopAccessInfo & {
    LoopStandardAnalysisResults AR = {AA,  AC,  DT,      LI,      SE,
                                      TLI, TTI, nullptr, nullptr, nullptr};
    return LAM.getResult<LoopAccessAnalysis>(L, AR);
  };
  auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
  ProfileSummaryInfo *PSI =
      MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  LoopVectorizeResult Result =
      runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
  if (!Result.MadeAnyChange)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;

  // We currently do not preserve LoopInfo/dominator analyses with outer loop
  // vectorization. Until this is addressed, mark these analyses as preserved
  // only for the non-VPlan-native path.
  // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
  if (!EnableVPlanNativePath) {
    PA.preserve<LoopAnalysis>();
    PA.preserve<DominatorTreeAnalysis>();
  }

  if (Result.MadeCFGChange) {
    // Making CFG changes likely means a loop got vectorized. Indicate that
    // extra simplification passes should be run.
    // TODO: MadeCFGChange is not a perfect proxy. Extra passes should only
    // be run if runtime checks have been added.
    AM.getResult<ShouldRunExtraVectorPasses>(F);
    PA.preserve<ShouldRunExtraVectorPasses>();
  } else {
    PA.preserveSet<CFGAnalyses>();
  }
  return PA;
}

void LoopVectorizePass::printPipeline(
    raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
  static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
      OS, MapClassName2PassName);

  OS << "<";
  OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
  OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
  OS << ">";
}
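// With the default flag values the options above are printed as
// "<no-interleave-forced-only;no-vectorize-forced-only;>", appended to the
// pass name emitted by the base-class printPipeline (typically rendered as
// something like "loop-vectorize<no-interleave-forced-only;
// no-vectorize-forced-only;>" when dumping the pass pipeline).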