//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
//
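// As an illustration only (a sketch of the transformation, not code taken
// from this pass): a scalar loop such as
//
//   for (int i = 0; i < n; ++i)
//     a[i] = b[i] + 42;
//
// is conceptually rewritten so that each 'wide' iteration processes VF
// consecutive elements and the induction variable advances by VF, with a
// scalar remainder loop handling the last n % VF iterations:
//
//   for (i = 0; i + 4 <= n; i += 4)            // wide iterations, VF = 4
//     a[i:i+3] = b[i:i+3] + <42, 42, 42, 42>;  // one wide load/add/store
//   for (; i < n; ++i)                         // scalar epilogue (remainder)
//     a[i] = b[i] + 42;
//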
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
//  Data for SIMD
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "LoopVectorizationPlanner.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanHCFGBuilder.h"
#include "VPlanPredicator.h"
#include "VPlanTransforms.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/InstructionCost.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <functional>
#include <iterator>
#include <limits>
#include <memory>
#include <string>
#include <tuple>
#include <utility>

using namespace llvm;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

#ifndef NDEBUG
const char VerboseDebug[] = DEBUG_TYPE "-verbose";
#endif

/// @{
/// Metadata attribute names
const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
const char LLVMLoopVectorizeFollowupVectorized[] =
    "llvm.loop.vectorize.followup_vectorized";
const char LLVMLoopVectorizeFollowupEpilogue[] =
    "llvm.loop.vectorize.followup_epilogue";
/// @}

STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");

static cl::opt<bool> EnableEpilogueVectorization(
    "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
    cl::desc("Enable vectorization of epilogue loops."));

static cl::opt<unsigned> EpilogueVectorizationForceVF(
    "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
    cl::desc("When epilogue vectorization is enabled, and a value greater than "
             "1 is specified, forces the given VF for all applicable epilogue "
             "loops."));

static cl::opt<unsigned> EpilogueVectorizationMinVF(
    "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
    cl::desc("Only loops with vectorization factor equal to or larger than "
             "the specified value are considered for epilogue vectorization."));

/// Loops with a known constant trip count below this number are vectorized
/// only if no scalar iteration overheads are incurred.
static cl::opt<unsigned> TinyTripCountVectorThreshold(
    "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
    cl::desc("Loops with a constant trip count that is smaller than this "
             "value are vectorized only if no scalar iteration overheads "
             "are incurred."));

static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold(
    "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
    cl::desc("The maximum allowed number of runtime memory checks with a "
             "vectorize(enable) pragma."));

// Option prefer-predicate-over-epilogue indicates that an epilogue is
// undesired and that predication is preferred; the enum below lists all
// options. I.e., the vectorizer will try to fold the tail loop (epilogue)
// into the vector body and predicate the instructions accordingly.
// If tail-folding fails, there are different fallback strategies depending on
// these values:
namespace PreferPredicateTy {
enum Option {
  ScalarEpilogue = 0,
  PredicateElseScalarEpilogue,
  PredicateOrDontVectorize
};
} // namespace PreferPredicateTy

static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
    "prefer-predicate-over-epilogue",
    cl::init(PreferPredicateTy::ScalarEpilogue),
    cl::Hidden,
    cl::desc("Tail-folding and predication preferences over creating a scalar "
             "epilogue loop."),
    cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
                          "scalar-epilogue",
                          "Don't tail-predicate loops, create scalar epilogue"),
               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
                          "predicate-else-scalar-epilogue",
                          "Prefer tail-folding, create scalar epilogue if tail "
                          "folding fails."),
               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
                          "predicate-dont-vectorize",
                          "Prefer tail-folding, don't attempt vectorization if "
                          "tail-folding fails.")));
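// Illustrative sketch only (not the exact IR this pass emits): with
// tail-folding, a loop with trip count n and VF = 4 keeps no scalar
// remainder. Instead, each wide memory operation is predicated with a lane
// mask so the final, partial iteration cannot touch elements past n:
//
//   for (i = 0; i < round_up(n, 4); i += 4) {
//     mask = (i + <0,1,2,3>) < n;           // e.g. n = 10, i = 8 -> <1,1,0,0>
//     v    = masked.load(&b[i], mask);
//     masked.store(v + 42, &a[i], mask);
//   }
//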
static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in the loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));

static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
             "below this number"));

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

static cl::opt<bool> ForceTargetSupportsScalableVectors(
    "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
    cl::desc(
        "Pretend that scalable vectors are supported, even if the target does "
        "not support them. This flag should only be used for testing."));

static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// Interleave small loops with scalar reductions.
static cl::opt<bool> InterleaveSmallLoopScalarReduction(
    "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
    cl::desc("Enable interleaving for loops with small iteration counts that "
             "contain scalar reductions to expose ILP."));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

static cl::opt<bool>
    PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
                           cl::Hidden,
                           cl::desc("Prefer in-loop vector reductions, "
                                    "overriding the target's preference."));

static cl::opt<bool> ForceOrderedReductions(
    "force-ordered-reductions", cl::init(false), cl::Hidden,
    cl::desc("Enable the vectorization of loops with in-order (strict) "
             "FP reductions"));

static cl::opt<bool> PreferPredicatedReductionSelect(
    "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
    cl::desc(
        "Prefer predicating a reduction operation over an after loop select."));

cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));

// FIXME: Remove this switch once we have divergence analysis. Currently we
// assume divergent non-backedge branches when this switch is true.
cl::opt<bool> EnableVPlanPredication(
    "enable-vplan-predication", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path predicator with "
             "support for outer loop vectorization."));

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

cl::opt<bool> PrintVPlansInDotFormat(
    "vplan-print-in-dot-format", cl::init(false), cl::Hidden,
    cl::desc("Use dot format instead of plain text when dumping VPlans"));

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type.
static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
  // Determine if an array of N elements of type Ty is "bitcast compatible"
  // with a <N x Ty> vector.
  // This is only true if there is no padding between the array elements.
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}

/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

/// Returns "best known" trip count for the specified loop \p L as defined by
/// the following procedure:
///   1) Returns exact trip count if it is known.
///   2) Returns expected trip count according to profile data if any.
///   3) Returns upper bound estimate if it is known.
///   4) Returns None if all of the above failed.
static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
  // Check if exact trip count is known.
  if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
    return ExpectedTC;

  // Check if there is an expected trip count available from profile data.
  if (LoopVectorizeWithBlockFrequency)
    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
      return EstimatedTC;

  // Check if upper bound estimate is known.
  if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
    return ExpectedTC;

  return None;
}

// Forward declare GeneratedRTChecks.
class GeneratedRTChecks;

namespace llvm {

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found for a given vectorization factor.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                      ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
        PSI(PSI), RTChecks(RTChecks) {
    // Query this against the original loop and save it here because the
    // profile of the original loop header may change as the transformation
    // happens.
    OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
        OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
  }

  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop that will contain vectorized instructions later
  /// on, while the old loop will be used as the scalar remainder. Control flow
  /// is generated around the vectorized (and scalar epilogue) loops consisting
  /// of various checks and bypasses. Return the pre-header block of the new
  /// loop.
  /// In the case of epilogue vectorization, this function is overridden to
  /// handle the more complex control flow around the loops.
  virtual BasicBlock *createVectorizedLoopSkeleton();

  /// Widen a single instruction within the innermost loop.
  void widenInstruction(Instruction &I, VPValue *Def, VPUser &Operands,
                        VPTransformState &State);

  /// Widen a single call instruction within the innermost loop.
  void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
                            VPTransformState &State);

  /// Widen a single select instruction within the innermost loop.
  void widenSelectInstruction(SelectInst &I, VPValue *VPDef, VPUser &Operands,
                              bool InvariantCond, VPTransformState &State);

  /// Fix the vectorized code, taking care of header phis, live-outs, and more.
  void fixVectorizedLoop(VPTransformState &State);

  // Return true if any runtime check is added.
  bool areSafetyChecksAdded() { return AddedSafetyChecks; }

  /// A type for vectorized values in the new loop. Each value from the
  /// original loop, when vectorized, is represented by UF vector values in the
  /// new unrolled loop, where UF is the unroll factor.
  using VectorParts = SmallVector<Value *, 2>;

  /// Vectorize a single GetElementPtrInst based on information gathered and
  /// decisions taken during planning.
  void widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, VPUser &Indices,
                unsigned UF, ElementCount VF, bool IsPtrLoopInvariant,
                SmallBitVector &IsIndexLoopInvariant, VPTransformState &State);

  /// Vectorize a single first-order recurrence or pointer induction PHINode in
  /// a block. This method handles the induction variable canonicalization. It
  /// supports both VF = 1 for unrolled loops and arbitrary length vectors.
  void widenPHIInstruction(Instruction *PN, VPWidenPHIRecipe *PhiR,
                           VPTransformState &State);

  /// A helper function to scalarize a single Instruction in the innermost
  /// loop. Generates a scalar instance for the part and lane given by \p
  /// Instance. Uses the VPValue operands from \p Operands instead of \p
  /// Instr's operands.
  void scalarizeInstruction(Instruction *Instr, VPValue *Def, VPUser &Operands,
                            const VPIteration &Instance, bool IfPredicateInstr,
                            VPTransformState &State);

  /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
  /// is provided, the integer induction variable will first be truncated to
  /// the corresponding type.
  void widenIntOrFpInduction(PHINode *IV, Value *Start, TruncInst *Trunc,
                             VPValue *Def, VPValue *CastDef,
                             VPTransformState &State);

  /// Construct the vector value of a scalarized value \p V one lane at a time.
  void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance,
                                 VPTransformState &State);

  /// Try to vectorize interleaved access group \p Group with the base address
  /// given in \p Addr, optionally masking the vector operations if \p
  /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
  /// values in the vectorized loop.
  void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
                                ArrayRef<VPValue *> VPDefs,
                                VPTransformState &State, VPValue *Addr,
                                ArrayRef<VPValue *> StoredValues,
                                VPValue *BlockInMask = nullptr);

  /// Vectorize Load and Store instructions with the base address given in \p
  /// Addr, optionally masking the vector operations if \p BlockInMask is
  /// non-null. Use \p State to translate given VPValues to IR values in the
  /// vectorized loop.
  void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
                                  VPValue *Def, VPValue *Addr,
                                  VPValue *StoredValue, VPValue *BlockInMask,
                                  bool ConsecutiveStride, bool Reverse);

  /// Set the debug location in the builder using the debug location in \p V.
  /// If \p CustomBuilder is None then the class member's Builder is used.
  void setDebugLocFromInst(const Value *V,
                           Optional<IRBuilder<> *> CustomBuilder = None);

  /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
  void fixNonInductionPHIs(VPTransformState &State);

  /// Returns true if the reordering of FP operations is not allowed, but we
  /// are able to vectorize with strict in-order reductions for the given
  /// RdxDesc.
  bool useOrderedReductions(RecurrenceDescriptor &RdxDesc);

  /// Create a broadcast instruction. This method generates a broadcast
  /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, ...
  /// This is needed because each iteration in the loop corresponds to a SIMD
  /// element.
  virtual Value *getBroadcastInstrs(Value *V);

protected:
  friend class LoopVectorizationPlanner;

  /// A small list of PHINodes.
  using PhiVector = SmallVector<PHINode *, 4>;

  /// A type for scalarized values in the new loop. Each value from the
  /// original loop, when scalarized, is represented by UF x VF scalar values
  /// in the new unrolled loop, where UF is the unroll factor and VF is the
  /// vectorization factor.
  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;

  /// Set up the values of the IVs correctly when exiting the vector loop.
  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                    Value *CountRoundDown, Value *EndValue,
                    BasicBlock *MiddleBlock);

  /// Create a new induction variable inside L.
  PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
                                   Value *Step, Instruction *DL);

  /// Handle all cross-iteration phis in the header.
  void fixCrossIterationPHIs(VPTransformState &State);

  /// Create the exit value of first order recurrences in the middle block and
  /// update their users.
  void fixFirstOrderRecurrence(VPWidenPHIRecipe *PhiR, VPTransformState &State);

  /// Create code for the loop exit value of the reduction.
  void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State);

  /// Clear NSW/NUW flags from reduction instructions if necessary.
  void clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc,
                               VPTransformState &State);

  /// Fixup the LCSSA phi nodes in the unique exit block. This simply
  /// means we need to add the appropriate incoming value from the middle
  /// block as exiting edges from the scalar epilogue loop (if present) are
  /// already in place, and we exit the vector loop exclusively to the middle
  /// block.
  void fixLCSSAPHIs(VPTransformState &State);

  /// Iteratively sink the scalarized operands of a predicated instruction into
  /// the block that was created for it.
  void sinkScalarOperands(Instruction *PredInst);

  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
  /// represented as.
  void truncateToMinimalBitwidths(VPTransformState &State);

  /// This function adds
  /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
  /// to each vector element of Val. The sequence starts at \p StartIdx.
  /// \p Opcode is relevant for FP induction variables.
  virtual Value *
  getStepVector(Value *Val, Value *StartIdx, Value *Step,
                Instruction::BinaryOps Opcode = Instruction::BinaryOpsEnd);
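  // Illustrative example only (not normative): for an integer IV with
  // Val = <i32 4, 4, 4, 4>, StartIdx = 0 and Step = 2, the returned vector is
  // <i32 4, 6, 8, 10>, i.e. Val + <0*2, 1*2, 2*2, 3*2>. For FP inductions the
  // per-lane offsets are combined with Val using \p Opcode (e.g. fadd) rather
  // than an integer add.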
  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
  /// variable on which to base the steps, \p Step is the size of the step, and
  /// \p EntryVal is the value from the original loop that maps to the steps.
  /// Note that \p EntryVal doesn't have to be an induction variable - it
  /// can also be a truncate instruction.
  void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
                        const InductionDescriptor &ID, VPValue *Def,
                        VPValue *CastDef, VPTransformState &State);

  /// Create a vector induction phi node based on an existing scalar one. \p
  /// EntryVal is the value from the original loop that maps to the vector phi
  /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
  /// truncate instruction, instead of widening the original IV, we widen a
  /// version of the IV truncated to \p EntryVal's type.
  void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
                                       Value *Step, Value *Start,
                                       Instruction *EntryVal, VPValue *Def,
                                       VPValue *CastDef,
                                       VPTransformState &State);

  /// Returns true if an instruction \p I should be scalarized instead of
  /// vectorized for the chosen vectorization factor.
  bool shouldScalarizeInstruction(Instruction *I) const;

  /// Returns true if we should generate a scalar version of \p IV.
  bool needsScalarInduction(Instruction *IV) const;

  /// If there is a cast involved in the induction variable \p ID, which should
  /// be ignored in the vectorized loop body, this function records the
  /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
  /// cast. We had already proved that the casted Phi is equal to the uncasted
  /// Phi in the vectorized loop (under a runtime guard), and therefore
  /// there is no need to vectorize the cast - the same value can be used in
  /// the vector loop for both the Phi and the cast.
  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
  /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
  ///
  /// \p EntryVal is the value from the original loop that maps to the vector
  /// phi node and is used to distinguish what is the IV currently being
  /// processed - original one (if \p EntryVal is a phi corresponding to the
  /// original IV) or the "newly-created" one based on the proof mentioned
  /// above (see also buildScalarSteps() and createVectorIntOrFpInductionPHI()).
  /// In the latter case \p EntryVal is a TruncInst and we must not record
  /// anything for that IV, but it's error-prone to expect callers of this
  /// routine to care about that, hence this explicit parameter.
  void recordVectorLoopValueForInductionCast(
      const InductionDescriptor &ID, const Instruction *EntryVal,
      Value *VectorLoopValue, VPValue *CastDef, VPTransformState &State,
      unsigned Part, unsigned Lane = UINT_MAX);

  /// Generate a shuffle sequence that will reverse the vector Vec.
  virtual Value *reverseVector(Value *Vec);

  /// Returns (and creates if needed) the original loop trip count.
  Value *getOrCreateTripCount(Loop *NewLoop);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(Loop *NewLoop);
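  // For illustration (assuming no tail-folding): the vector trip count is the
  // original trip count rounded down to a multiple of VF * UF, i.e.
  // TC - (TC % (VF * UF)). For example, with TC = 100, VF = 8 and UF = 2 the
  // vector loop covers 96 iterations' worth of work and the remaining 4
  // iterations run in the scalar epilogue.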
  /// Returns a bitcasted value to the requested vector type.
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                const DataLayout &DL);

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct. Returns the block containing the checks or
  /// nullptr if no checks have been added.
  BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  /// Returns the block containing the checks or nullptr if no checks have been
  /// added.
  BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);

  /// Compute the transformed value of Index at offset StartValue using step
  /// StepValue.
  /// For integer induction, returns StartValue + Index * StepValue.
  /// For pointer induction, returns StartValue[Index * StepValue].
  /// FIXME: The newly created binary instructions should contain nsw/nuw
  /// flags, which can be found from the original scalar operations.
  Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
                              const DataLayout &DL,
                              const InductionDescriptor &ID) const;

  /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
  /// vector loop preheader, middle block and scalar preheader. Also
  /// allocate a loop object for the new vector loop and return it.
  Loop *createVectorLoopSkeleton(StringRef Prefix);

  /// Create new phi nodes for the induction variables to resume iteration
  /// count in the scalar epilogue, from where the vectorized loop left off
  /// (given by \p VectorTripCount).
  /// In cases where the loop skeleton is more complicated (e.g. epilogue
  /// vectorization) and the resume values can come from an additional bypass
  /// block, the \p AdditionalBypass pair provides information about the bypass
  /// block and the end value on the edge from bypass to this loop.
  void createInductionResumeValues(
      Loop *L, Value *VectorTripCount,
      std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});

  /// Complete the loop skeleton by adding debug MDs, creating appropriate
  /// conditional branches in the middle block, preparing the builder and
  /// running the verifier. Take in the vector loop \p L as argument, and
  /// return the preheader of the completed vector loop.
  BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);

  /// Add additional metadata to \p To that was not present on \p Orig.
  ///
  /// Currently this is used to add the noalias annotations based on the
  /// inserted memchecks. Use this for instructions that are *cloned* into the
  /// vector loop.
  void addNewMetadata(Instruction *To, const Instruction *Orig);

  /// Add metadata from one instruction to another.
  ///
  /// This includes both the original MDs from \p From and additional ones
  /// (\see addNewMetadata). Use this for *newly created* instructions in the
  /// vector loop.
  void addMetadata(Instruction *To, Instruction *From);

  /// Similar to the previous function but it adds the metadata to a
  /// vector of instructions.
  void addMetadata(ArrayRef<Value *> To, Instruction *From);

  /// Allow subclasses to override and print debug traces before/after vplan
  /// execution, when trace information is requested.
  virtual void printDebugTracesAtStart(){};
  virtual void printDebugTracesAtEnd(){};

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Alias Analysis.
  AAResults *AA;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// LoopVersioning. It's only set up (non-null) if memchecks were
  /// used.
  ///
  /// This is currently only used to add no-alias metadata based on the
  /// memchecks. The actual versioning is performed manually.
  std::unique_ptr<LoopVersioning> LVer;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  ElementCount VF;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use.
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle block between the vector and the scalar loop.
  BasicBlock *LoopMiddleBlock;

  /// The unique ExitBlock of the scalar loop if one exists. Note that
  /// there can be multiple exiting edges reaching this block.
  BasicBlock *LoopExitBlock;

  /// The vector loop body.
  BasicBlock *LoopVectorBody;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// The new induction variable which was added to the new block.
  PHINode *Induction = nullptr;

  /// The induction variable of the old basic block.
  PHINode *OldInduction = nullptr;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF)).
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
  // fixed up at the end of vector code generation.
  SmallVector<PHINode *, 8> OrigPHIsToFix;

  /// BFI and PSI are used to check for profile guided size optimizations.
  BlockFrequencyInfo *BFI;
  ProfileSummaryInfo *PSI;

  // Whether this loop should be optimized for size based on profile guided
  // size optimizations.
  bool OptForSizeBasedOnProfile;

  /// Structure to hold information about generated runtime checks, responsible
  /// for cleaning up the checks if vectorization turns out to be unprofitable.
  GeneratedRTChecks &RTChecks;
};

class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                    ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            ElementCount::getFixed(1), UnrollFactor, LVL, CM,
                            BFI, PSI, Check) {}

private:
  Value *getBroadcastInstrs(Value *V) override;
  Value *getStepVector(
      Value *Val, Value *StartIdx, Value *Step,
      Instruction::BinaryOps Opcode = Instruction::BinaryOpsEnd) override;
  Value *reverseVector(Value *Vec) override;
};

/// Encapsulate information regarding vectorization of a loop and its epilogue.
/// This information is meant to be updated and used across two stages of
/// epilogue vectorization.
struct EpilogueLoopVectorizationInfo {
  ElementCount MainLoopVF = ElementCount::getFixed(0);
  unsigned MainLoopUF = 0;
  ElementCount EpilogueVF = ElementCount::getFixed(0);
  unsigned EpilogueUF = 0;
  BasicBlock *MainLoopIterationCountCheck = nullptr;
  BasicBlock *EpilogueIterationCountCheck = nullptr;
  BasicBlock *SCEVSafetyCheck = nullptr;
  BasicBlock *MemSafetyCheck = nullptr;
  Value *TripCount = nullptr;
  Value *VectorTripCount = nullptr;

  EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF,
                                ElementCount EVF, unsigned EUF)
      : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) {
    assert(EUF == 1 &&
           "A high UF for the epilogue loop is likely not beneficial.");
  }
};

/// An extension of the inner loop vectorizer that creates a skeleton for a
/// vectorized loop that has its epilogue (residual) also vectorized.
/// The idea is to run the vplan on a given loop twice, first to set up the
/// skeleton and vectorize the main loop, and second to complete the skeleton
/// from the first step and vectorize the epilogue. This is achieved by
/// deriving two concrete strategy classes from this base class and invoking
/// them in succession from the loop vectorizer planner.
class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
public:
  InnerLoopAndEpilogueVectorizer(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI,
                            Checks),
        EPI(EPI) {}

  // Override this function to handle the more complex control flow around the
  // three loops.
  BasicBlock *createVectorizedLoopSkeleton() final override {
    return createEpilogueVectorizedLoopSkeleton();
  }

  /// The interface for creating a vectorized skeleton using one of two
  /// different strategies, each corresponding to one execution of the vplan
  /// as described above.
  virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0;

  /// Holds and updates state information required to vectorize the main loop
  /// and its epilogue in two separate passes. This setup helps us avoid
  /// regenerating and recomputing runtime safety checks. It also helps us to
  /// shorten the iteration-count-check path length for the cases where the
  /// iteration count of the loop is so small that the main vector loop is
  /// completely skipped.
  EpilogueLoopVectorizationInfo &EPI;
};

/// A specialized derived class of inner loop vectorizer that performs
/// vectorization of *main* loops in the process of vectorizing loops and their
/// epilogues.
class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerMainLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Check)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Check) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *main loop* strategy (i.e. the first pass of vplan execution).
  BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check once for the main loop (when \p
  /// ForEpilogue is false) and once for the epilogue loop (when \p
  /// ForEpilogue is true).
  BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass,
                                             bool ForEpilogue);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};

// A specialized derived class of inner loop vectorizer that performs
// vectorization of *epilogue* loops in the process of vectorizing loops and
// their epilogues.
class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerEpilogueLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Checks) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *epilogue loop* strategy (i.e. the second pass of vplan execution).
  BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check after the main vector loop has
  /// finished to see if there are any iterations left to execute by either
  /// the vector epilogue or the scalar epilogue.
  BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L,
                                                      BasicBlock *Bypass,
                                                      BasicBlock *Insert);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};
} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its
/// operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return I;

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I;

  for (Use &Op : I->operands()) {
    if (Instruction *OpInst = dyn_cast<Instruction>(Op))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }

  return I;
}

void InnerLoopVectorizer::setDebugLocFromInst(
    const Value *V, Optional<IRBuilder<> *> CustomBuilder) {
  IRBuilder<> *B = (CustomBuilder == None) ? &Builder : *CustomBuilder;
  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(V)) {
    const DILocation *DIL = Inst->getDebugLoc();

    // When an FSDiscriminator is enabled, we don't need to add the multiply
    // factors to the discriminators.
    if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
        !isa<DbgInfoIntrinsic>(Inst) && !EnableFSDiscriminator) {
      // FIXME: For scalable vectors, assume vscale=1.
      auto NewDIL =
          DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
      if (NewDIL)
        B->SetCurrentDebugLocation(NewDIL.getValue());
      else
        LLVM_DEBUG(dbgs()
                   << "Failed to create new discriminator: "
                   << DIL->getFilename() << " Line: " << DIL->getLine());
    } else
      B->SetCurrentDebugLocation(DIL);
  } else
    B->SetCurrentDebugLocation(DebugLoc());
}

/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
/// is passed, the message relates to that particular instruction.
#ifndef NDEBUG
static void debugVectorizationMessage(const StringRef Prefix,
                                      const StringRef DebugMsg,
                                      Instruction *I) {
  dbgs() << "LV: " << Prefix << DebugMsg;
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}
#endif

/// Create an analysis remark that explains why vectorization failed
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
/// RemarkName is the identifier for the remark. If \p I is passed it is an
/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
/// the location of the remark. \return the remark object that can be
/// streamed to.
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
                                                   StringRef RemarkName,
                                                   Loop *TheLoop,
                                                   Instruction *I) {
  Value *CodeRegion = TheLoop->getHeader();
  DebugLoc DL = TheLoop->getStartLoc();

  if (I) {
    CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, fall back to
    // using the loop's.
    if (I->getDebugLoc())
      DL = I->getDebugLoc();
  }

  return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
}

/// Return a value for Step multiplied by VF.
static Value *createStepForVF(IRBuilder<> &B, Constant *Step,
                              ElementCount VF) {
  assert(isa<ConstantInt>(Step) && "Expected an integer step");
  Constant *StepVal = ConstantInt::get(
      Step->getType(),
      cast<ConstantInt>(Step)->getSExtValue() * VF.getKnownMinValue());
  return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
}

namespace llvm {

/// Return the runtime value for VF.
Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF) {
  Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue());
  return VF.isScalable() ? B.CreateVScale(EC) : EC;
}

static Value *getRuntimeVFAsFloat(IRBuilder<> &B, Type *FTy, ElementCount VF) {
  assert(FTy->isFloatingPointTy() && "Expected floating point type!");
  Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits());
  Value *RuntimeVF = getRuntimeVF(B, IntTy, VF);
  return B.CreateUIToFP(RuntimeVF, FTy);
}

void reportVectorizationFailure(const StringRef DebugMsg,
                                const StringRef OREMsg, const StringRef ORETag,
                                OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                                Instruction *I) {
  LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(
      createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
      << "loop not vectorized: " << OREMsg);
}

void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
                             OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                             Instruction *I) {
  LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(
      createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
      << Msg);
}

} // end namespace llvm

#ifndef NDEBUG
/// \return string containing a file name and a line # for the given loop.
static std::string getDebugLocString(const Loop *L) {
  std::string Result;
  if (L) {
    raw_string_ostream OS(Result);
    if (const DebugLoc LoopDbgLoc = L->getStartLoc())
      LoopDbgLoc.print(OS);
    else
      // Just print the module name.
      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
    OS.flush();
  }
  return Result;
}
#endif

void InnerLoopVectorizer::addNewMetadata(Instruction *To,
                                         const Instruction *Orig) {
  // If the loop was versioned with memchecks, add the corresponding no-alias
  // metadata.
  if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
    LVer->annotateInstWithNoAlias(To, Orig);
}

void InnerLoopVectorizer::addMetadata(Instruction *To,
                                      Instruction *From) {
  propagateMetadata(To, From);
  addNewMetadata(To, From);
}

void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
                                      Instruction *From) {
  for (Value *V : To) {
    if (Instruction *I = dyn_cast<Instruction>(V))
      addMetadata(I, From);
  }
}

namespace llvm {

// Loop vectorization cost-model hints how the scalar epilogue loop should be
// lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorization with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate,

  // Directive indicating we must either tail fold or not vectorize.
  CM_ScalarEpilogueNotAllowedUsePredicate
};

/// ElementCountComparator creates a total ordering for ElementCount
/// for the purposes of using it in a set structure.
struct ElementCountComparator {
  bool operator()(const ElementCount &LHS, const ElementCount &RHS) const {
    return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) <
           std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue());
  }
};
using ElementCountSet = SmallSet<ElementCount, 16, ElementCountComparator>;

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen for a
/// number of reasons. In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
class LoopVectorizationCostModel {
public:
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {}

  /// \return An upper bound for the vectorization factors (both fixed and
  /// scalable). If the factors are 0, vectorization and interleaving should be
  /// avoided up front.
  FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
  bool runtimeChecksRequired();

  /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every VF in \p CandidateVFs. If UserVF is not ZERO
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
  VectorizationFactor
  selectVectorizationFactor(const ElementCountSet &CandidateVFs);

  VectorizationFactor
  selectEpilogueVectorizationFactor(const ElementCount MaxVF,
                                    const LoopVectorizationPlanner &LVP);

  /// Setup cost-based decisions for user vectorization factor.
  /// \return true if the UserVF is a feasible VF to be chosen.
  bool selectUserVectorizationFactor(ElementCount UserVF) {
    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);
    return expectedCost(UserVF).first.isValid();
  }

  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// \return The desired interleave count.
  /// If interleave count has been specified by metadata it will be returned.
  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
  /// are the selected vectorization factor and the cost of the selected VF.
  unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);

  /// A memory access instruction may be vectorized in more than one way; its
  /// form after vectorization depends on cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decision map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with the widening decision in order to
  /// avoid redundant calculations.
  void setCostBasedWideningDecision(ElementCount VF);

  /// A struct that represents some properties of the register usage
  /// of a loop.
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
    /// Holds the maximum number of concurrent live intervals in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
  };

  /// \return Returns information about the register usages of the loop for the
  /// given vectorization factors.
  SmallVector<RegisterUsage, 8>
  calculateRegisterUsage(ArrayRef<ElementCount> VFs);

  /// Collect values we want to ignore in the cost model.
  void collectValuesToIgnore();

  /// Collect all element types in the loop for which widening is needed.
  void collectElementTypesForWidening();

  /// Split reductions into those that happen in the loop, and those that
  /// happen outside. In-loop reductions are collected into
  /// InLoopReductionChains.
  void collectInLoopReductions();

  /// Returns true if we should use strict in-order reductions for the given
  /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
  /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
  /// of FP operations.
  bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) {
    return !Hints->allowReordering() && RdxDesc.isOrdered();
  }

  /// \returns The smallest bitwidth each instruction can be represented with.
  /// The vector equivalents of these instructions should be truncated to this
  /// type.
  const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
    return MinBWs;
  }
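  // Illustrative example only (not tied to a specific test): if an i32 add in
  // the loop feeds nothing but an i8 store, MinBWs may record a minimal
  // bitwidth of 8 for the add, so the widened add can be emitted on
  // <VF x i8> instead of <VF x i32>, reducing the number of vector registers
  // needed for that chain.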
1339 bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
1340 assert(VF.isVector() &&
1341 "Profitable to scalarize relevant only for VF > 1.");
1342
1343 // Cost model is not run in the VPlan-native path - return conservative
1344 // result until this changes.
1345 if (EnableVPlanNativePath)
1346 return false;
1347
1348 auto Scalars = InstsToScalarize.find(VF);
1349 assert(Scalars != InstsToScalarize.end() &&
1350 "VF not yet analyzed for scalarization profitability");
1351 return Scalars->second.find(I) != Scalars->second.end();
1352 }
1353
1354 /// Returns true if \p I is known to be uniform after vectorization.
1355 bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
1356 if (VF.isScalar())
1357 return true;
1358
1359 // Cost model is not run in the VPlan-native path - return conservative
1360 // result until this changes.
1361 if (EnableVPlanNativePath)
1362 return false;
1363
1364 auto UniformsPerVF = Uniforms.find(VF);
1365 assert(UniformsPerVF != Uniforms.end() &&
1366 "VF not yet analyzed for uniformity");
1367 return UniformsPerVF->second.count(I);
1368 }
1369
1370 /// Returns true if \p I is known to be scalar after vectorization.
1371 bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
1372 if (VF.isScalar())
1373 return true;
1374
1375 // Cost model is not run in the VPlan-native path - return conservative
1376 // result until this changes.
1377 if (EnableVPlanNativePath)
1378 return false;
1379
1380 auto ScalarsPerVF = Scalars.find(VF);
1381 assert(ScalarsPerVF != Scalars.end() &&
1382 "Scalar values are not calculated for VF");
1383 return ScalarsPerVF->second.count(I);
1384 }
1385
1386 /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1387 /// for vectorization factor \p VF.
1388 bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
1389 return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
1390 !isProfitableToScalarize(I, VF) &&
1391 !isScalarAfterVectorization(I, VF);
1392 }
1393
1394 /// Decision that was taken during cost calculation for a memory instruction.
1395 enum InstWidening {
1396 CM_Unknown,
1397 CM_Widen, // For consecutive accesses with stride +1.
1398 CM_Widen_Reverse, // For consecutive accesses with stride -1.
1399 CM_Interleave,
1400 CM_GatherScatter,
1401 CM_Scalarize
1402 };
1403
1404 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1405 /// instruction \p I and vector width \p VF.
1406 void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
1407 InstructionCost Cost) {
1408 assert(VF.isVector() && "Expected VF >=2");
1409 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1410 }
1411
1412 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1413 /// interleaving group \p Grp and vector width \p VF.
1414 void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1415 ElementCount VF, InstWidening W,
1416 InstructionCost Cost) {
1417 assert(VF.isVector() && "Expected VF >=2");
1418 /// Broadcast this decision to all instructions inside the group.
1419 /// But the cost will be assigned to one instruction only.
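/// For example, for a group {A[i], A[i+2]} whose insert position is A[i], the
/// group cost \p Cost is recorded for A[i] and a cost of zero for A[i+2].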
1420 for (unsigned i = 0; i < Grp->getFactor(); ++i) { 1421 if (auto *I = Grp->getMember(i)) { 1422 if (Grp->getInsertPos() == I) 1423 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost); 1424 else 1425 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0); 1426 } 1427 } 1428 } 1429 1430 /// Return the cost model decision for the given instruction \p I and vector 1431 /// width \p VF. Return CM_Unknown if this instruction did not pass 1432 /// through the cost modeling. 1433 InstWidening getWideningDecision(Instruction *I, ElementCount VF) const { 1434 assert(VF.isVector() && "Expected VF to be a vector VF"); 1435 // Cost model is not run in the VPlan-native path - return conservative 1436 // result until this changes. 1437 if (EnableVPlanNativePath) 1438 return CM_GatherScatter; 1439 1440 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF); 1441 auto Itr = WideningDecisions.find(InstOnVF); 1442 if (Itr == WideningDecisions.end()) 1443 return CM_Unknown; 1444 return Itr->second.first; 1445 } 1446 1447 /// Return the vectorization cost for the given instruction \p I and vector 1448 /// width \p VF. 1449 InstructionCost getWideningCost(Instruction *I, ElementCount VF) { 1450 assert(VF.isVector() && "Expected VF >=2"); 1451 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF); 1452 assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() && 1453 "The cost is not calculated"); 1454 return WideningDecisions[InstOnVF].second; 1455 } 1456 1457 /// Return True if instruction \p I is an optimizable truncate whose operand 1458 /// is an induction variable. Such a truncate will be removed by adding a new 1459 /// induction variable with the destination type. 1460 bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) { 1461 // If the instruction is not a truncate, return false. 1462 auto *Trunc = dyn_cast<TruncInst>(I); 1463 if (!Trunc) 1464 return false; 1465 1466 // Get the source and destination types of the truncate. 1467 Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF); 1468 Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF); 1469 1470 // If the truncate is free for the given types, return false. Replacing a 1471 // free truncate with an induction variable would add an induction variable 1472 // update instruction to each iteration of the loop. We exclude from this 1473 // check the primary induction variable since it will need an update 1474 // instruction regardless. 1475 Value *Op = Trunc->getOperand(0); 1476 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy)) 1477 return false; 1478 1479 // If the truncated value is not an induction variable, return false. 1480 return Legal->isInductionPhi(Op); 1481 } 1482 1483 /// Collects the instructions to scalarize for each predicated instruction in 1484 /// the loop. 1485 void collectInstsToScalarize(ElementCount VF); 1486 1487 /// Collect Uniform and Scalar values for the given \p VF. 1488 /// The sets depend on CM decision for Load/Store instructions 1489 /// that may be vectorized as interleave, gather-scatter or scalarized. 1490 void collectUniformsAndScalars(ElementCount VF) { 1491 // Do the analysis once. 
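// Scalar VFs need no analysis; results for vector VFs are cached in the
// Uniforms and Scalars maps.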
1492 if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end()) 1493 return; 1494 setCostBasedWideningDecision(VF); 1495 collectLoopUniforms(VF); 1496 collectLoopScalars(VF); 1497 } 1498 1499 /// Returns true if the target machine supports masked store operation 1500 /// for the given \p DataType and kind of access to \p Ptr. 1501 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const { 1502 return Legal->isConsecutivePtr(DataType, Ptr) && 1503 TTI.isLegalMaskedStore(DataType, Alignment); 1504 } 1505 1506 /// Returns true if the target machine supports masked load operation 1507 /// for the given \p DataType and kind of access to \p Ptr. 1508 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const { 1509 return Legal->isConsecutivePtr(DataType, Ptr) && 1510 TTI.isLegalMaskedLoad(DataType, Alignment); 1511 } 1512 1513 /// Returns true if the target machine can represent \p V as a masked gather 1514 /// or scatter operation. 1515 bool isLegalGatherOrScatter(Value *V) { 1516 bool LI = isa<LoadInst>(V); 1517 bool SI = isa<StoreInst>(V); 1518 if (!LI && !SI) 1519 return false; 1520 auto *Ty = getLoadStoreType(V); 1521 Align Align = getLoadStoreAlignment(V); 1522 return (LI && TTI.isLegalMaskedGather(Ty, Align)) || 1523 (SI && TTI.isLegalMaskedScatter(Ty, Align)); 1524 } 1525 1526 /// Returns true if the target machine supports all of the reduction 1527 /// variables found for the given VF. 1528 bool canVectorizeReductions(ElementCount VF) const { 1529 return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 1530 const RecurrenceDescriptor &RdxDesc = Reduction.second; 1531 return TTI.isLegalToVectorizeReduction(RdxDesc, VF); 1532 })); 1533 } 1534 1535 /// Returns true if \p I is an instruction that will be scalarized with 1536 /// predication. Such instructions include conditional stores and 1537 /// instructions that may divide by zero. 1538 /// If a non-zero VF has been calculated, we check if I will be scalarized 1539 /// predication for that VF. 1540 bool isScalarWithPredication(Instruction *I) const; 1541 1542 // Returns true if \p I is an instruction that will be predicated either 1543 // through scalar predication or masked load/store or masked gather/scatter. 1544 // Superset of instructions that return true for isScalarWithPredication. 1545 bool isPredicatedInst(Instruction *I) { 1546 if (!blockNeedsPredication(I->getParent())) 1547 return false; 1548 // Loads and stores that need some form of masked operation are predicated 1549 // instructions. 1550 if (isa<LoadInst>(I) || isa<StoreInst>(I)) 1551 return Legal->isMaskRequired(I); 1552 return isScalarWithPredication(I); 1553 } 1554 1555 /// Returns true if \p I is a memory instruction with consecutive memory 1556 /// access that can be widened. 1557 bool 1558 memoryInstructionCanBeWidened(Instruction *I, 1559 ElementCount VF = ElementCount::getFixed(1)); 1560 1561 /// Returns true if \p I is a memory instruction in an interleaved-group 1562 /// of memory accesses that can be vectorized with wide vector loads/stores 1563 /// and shuffles. 1564 bool 1565 interleavedAccessCanBeWidened(Instruction *I, 1566 ElementCount VF = ElementCount::getFixed(1)); 1567 1568 /// Check if \p Instr belongs to any interleaved access group. 1569 bool isAccessInterleaved(Instruction *Instr) { 1570 return InterleaveInfo.isInterleaved(Instr); 1571 } 1572 1573 /// Get the interleaved access group that \p Instr belongs to. 
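/// Returns nullptr if \p Instr does not belong to any interleave group.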
1574 const InterleaveGroup<Instruction> * 1575 getInterleavedAccessGroup(Instruction *Instr) { 1576 return InterleaveInfo.getInterleaveGroup(Instr); 1577 } 1578 1579 /// Returns true if we're required to use a scalar epilogue for at least 1580 /// the final iteration of the original loop. 1581 bool requiresScalarEpilogue(ElementCount VF) const { 1582 if (!isScalarEpilogueAllowed()) 1583 return false; 1584 // If we might exit from anywhere but the latch, must run the exiting 1585 // iteration in scalar form. 1586 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) 1587 return true; 1588 return VF.isVector() && InterleaveInfo.requiresScalarEpilogue(); 1589 } 1590 1591 /// Returns true if a scalar epilogue is not allowed due to optsize or a 1592 /// loop hint annotation. 1593 bool isScalarEpilogueAllowed() const { 1594 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed; 1595 } 1596 1597 /// Returns true if all loop blocks should be masked to fold tail loop. 1598 bool foldTailByMasking() const { return FoldTailByMasking; } 1599 1600 bool blockNeedsPredication(BasicBlock *BB) const { 1601 return foldTailByMasking() || Legal->blockNeedsPredication(BB); 1602 } 1603 1604 /// A SmallMapVector to store the InLoop reduction op chains, mapping phi 1605 /// nodes to the chain of instructions representing the reductions. Uses a 1606 /// MapVector to ensure deterministic iteration order. 1607 using ReductionChainMap = 1608 SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>; 1609 1610 /// Return the chain of instructions representing an inloop reduction. 1611 const ReductionChainMap &getInLoopReductionChains() const { 1612 return InLoopReductionChains; 1613 } 1614 1615 /// Returns true if the Phi is part of an inloop reduction. 1616 bool isInLoopReduction(PHINode *Phi) const { 1617 return InLoopReductionChains.count(Phi); 1618 } 1619 1620 /// Estimate cost of an intrinsic call instruction CI if it were vectorized 1621 /// with factor VF. Return the cost of the instruction, including 1622 /// scalarization overhead if it's needed. 1623 InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const; 1624 1625 /// Estimate cost of a call instruction CI if it were vectorized with factor 1626 /// VF. Return the cost of the instruction, including scalarization overhead 1627 /// if it's needed. The flag NeedToScalarize shows if the call needs to be 1628 /// scalarized - 1629 /// i.e. either vector version isn't available, or is too expensive. 1630 InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF, 1631 bool &NeedToScalarize) const; 1632 1633 /// Returns true if the per-lane cost of VectorizationFactor A is lower than 1634 /// that of B. 1635 bool isMoreProfitable(const VectorizationFactor &A, 1636 const VectorizationFactor &B) const; 1637 1638 /// Invalidates decisions already taken by the cost model. 1639 void invalidateCostModelingDecisions() { 1640 WideningDecisions.clear(); 1641 Uniforms.clear(); 1642 Scalars.clear(); 1643 } 1644 1645 private: 1646 unsigned NumPredStores = 0; 1647 1648 /// \return An upper bound for the vectorization factors for both 1649 /// fixed and scalable vectorization, where the minimum-known number of 1650 /// elements is a power-of-2 larger than zero. If scalable vectorization is 1651 /// disabled or unsupported, then the scalable part will be equal to 1652 /// ElementCount::getScalable(0). 
1653 FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount,
1654 ElementCount UserVF);
1655
1656 /// \return the maximized element count based on the target's vector
1657 /// registers and the loop trip-count, but limited to a maximum safe VF.
1658 /// This is a helper function of computeFeasibleMaxVF.
1659 /// FIXME: MaxSafeVF is currently passed by reference to avoid some obscure
1660 /// issue that occurred on one of the buildbots which cannot be reproduced
1661 /// without having access to the proprietary compiler (see comments on
1662 /// D98509). The issue is currently under investigation and this workaround
1663 /// will be removed as soon as possible.
1664 ElementCount getMaximizedVFForTarget(unsigned ConstTripCount,
1665 unsigned SmallestType,
1666 unsigned WidestType,
1667 const ElementCount &MaxSafeVF);
1668
1669 /// \return the maximum legal scalable VF, based on the safe max number
1670 /// of elements.
1671 ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
1672
1673 /// The vectorization cost is a combination of the cost itself and a boolean
1674 /// indicating whether any of the contributing operations will actually
1675 /// operate on vector values after type legalization in the backend. If this
1676 /// latter value is false, then all operations will be scalarized (i.e. no
1677 /// vectorization has actually taken place).
1678 using VectorizationCostTy = std::pair<InstructionCost, bool>;
1679
1680 /// Returns the expected execution cost. The unit of the cost does
1681 /// not matter because we use the 'cost' units to compare different
1682 /// vector widths. The cost that is returned is *not* normalized by
1683 /// the factor width. If \p Invalid is not nullptr, this function
1684 /// will add a pair(Instruction*, ElementCount) to \p Invalid for
1685 /// each instruction that has an Invalid cost for the given VF.
1686 using InstructionVFPair = std::pair<Instruction *, ElementCount>;
1687 VectorizationCostTy
1688 expectedCost(ElementCount VF,
1689 SmallVectorImpl<InstructionVFPair> *Invalid = nullptr);
1690
1691 /// Returns the execution time cost of an instruction for a given vector
1692 /// width. Vector width of one means scalar.
1693 VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
1694
1695 /// The cost-computation logic from getInstructionCost which provides
1696 /// the vector type as an output parameter.
1697 InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
1698 Type *&VectorTy);
1699
1700 /// Return the cost of instructions in an inloop reduction pattern, if I is
1701 /// part of that pattern.
1702 Optional<InstructionCost>
1703 getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
1704 TTI::TargetCostKind CostKind);
1705
1706 /// Calculate the vectorization cost of memory instruction \p I.
1707 InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1708
1709 /// The cost computation for a scalarized memory instruction.
1710 InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1711
1712 /// The cost computation for an interleaving group of memory instructions.
1713 InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1714
1715 /// The cost computation for a Gather/Scatter instruction.
1716 InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1717
1718 /// The cost computation for widening instruction \p I with consecutive
1719 /// memory access.
1720 InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1721
1722 /// The cost calculation for a Load/Store instruction \p I with a uniform pointer -
1723 /// Load: scalar load + broadcast.
1724 /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1725 /// element)
1726 InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
1727
1728 /// Estimate the overhead of scalarizing an instruction. This is a
1729 /// convenience wrapper for the type-based getScalarizationOverhead API.
1730 InstructionCost getScalarizationOverhead(Instruction *I,
1731 ElementCount VF) const;
1732
1733 /// Returns whether the instruction is a load or store and will be emitted
1734 /// as a vector operation.
1735 bool isConsecutiveLoadOrStore(Instruction *I);
1736
1737 /// Returns true if an artificially high cost for emulated masked memrefs
1738 /// should be used.
1739 bool useEmulatedMaskMemRefHack(Instruction *I);
1740
1741 /// Map of scalar integer values to the smallest bitwidth they can be legally
1742 /// represented as. The vector equivalents of these values should be truncated
1743 /// to this type.
1744 MapVector<Instruction *, uint64_t> MinBWs;
1745
1746 /// A type representing the costs for instructions if they were to be
1747 /// scalarized rather than vectorized. The entries are Instruction-Cost
1748 /// pairs.
1749 using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
1750
1751 /// A set containing all BasicBlocks that are known to be present after
1752 /// vectorization as predicated blocks.
1753 SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1754
1755 /// Records whether it is allowed to have the original scalar loop execute at
1756 /// least once. This may be needed as a fallback loop in case runtime
1757 /// aliasing/dependence checks fail, or to handle the tail/remainder
1758 /// iterations when the trip count is unknown or is not a multiple of the VF,
1759 /// or as a peel-loop to handle gaps in interleave-groups.
1760 /// Under optsize and when the trip count is very small, we don't allow any
1761 /// iterations to execute in the scalar loop.
1762 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1763
1764 /// All blocks of the loop are to be masked to fold the tail of the scalar iterations.
1765 bool FoldTailByMasking = false;
1766
1767 /// A map holding scalar costs for different vectorization factors. The
1768 /// presence of a cost for an instruction in the mapping indicates that the
1769 /// instruction will be scalarized when vectorizing with the associated
1770 /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1771 DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
1772
1773 /// Holds the instructions known to be uniform after vectorization.
1774 /// The data is collected per VF.
1775 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1776
1777 /// Holds the instructions known to be scalar after vectorization.
1778 /// The data is collected per VF.
1779 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
1780
1781 /// Holds the instructions (address computations) that are forced to be
1782 /// scalarized.
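/// The data is collected per VF.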
1783 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars; 1784 1785 /// PHINodes of the reductions that should be expanded in-loop along with 1786 /// their associated chains of reduction operations, in program order from top 1787 /// (PHI) to bottom 1788 ReductionChainMap InLoopReductionChains; 1789 1790 /// A Map of inloop reduction operations and their immediate chain operand. 1791 /// FIXME: This can be removed once reductions can be costed correctly in 1792 /// vplan. This was added to allow quick lookup to the inloop operations, 1793 /// without having to loop through InLoopReductionChains. 1794 DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains; 1795 1796 /// Returns the expected difference in cost from scalarizing the expression 1797 /// feeding a predicated instruction \p PredInst. The instructions to 1798 /// scalarize and their scalar costs are collected in \p ScalarCosts. A 1799 /// non-negative return value implies the expression will be scalarized. 1800 /// Currently, only single-use chains are considered for scalarization. 1801 int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts, 1802 ElementCount VF); 1803 1804 /// Collect the instructions that are uniform after vectorization. An 1805 /// instruction is uniform if we represent it with a single scalar value in 1806 /// the vectorized loop corresponding to each vector iteration. Examples of 1807 /// uniform instructions include pointer operands of consecutive or 1808 /// interleaved memory accesses. Note that although uniformity implies an 1809 /// instruction will be scalar, the reverse is not true. In general, a 1810 /// scalarized instruction will be represented by VF scalar values in the 1811 /// vectorized loop, each corresponding to an iteration of the original 1812 /// scalar loop. 1813 void collectLoopUniforms(ElementCount VF); 1814 1815 /// Collect the instructions that are scalar after vectorization. An 1816 /// instruction is scalar if it is known to be uniform or will be scalarized 1817 /// during vectorization. Non-uniform scalarized instructions will be 1818 /// represented by VF values in the vectorized loop, each corresponding to an 1819 /// iteration of the original scalar loop. 1820 void collectLoopScalars(ElementCount VF); 1821 1822 /// Keeps cost model vectorization decision and cost for instructions. 1823 /// Right now it is used for memory instructions only. 1824 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>, 1825 std::pair<InstWidening, InstructionCost>>; 1826 1827 DecisionList WideningDecisions; 1828 1829 /// Returns true if \p V is expected to be vectorized and it needs to be 1830 /// extracted. 1831 bool needsExtract(Value *V, ElementCount VF) const { 1832 Instruction *I = dyn_cast<Instruction>(V); 1833 if (VF.isScalar() || !I || !TheLoop->contains(I) || 1834 TheLoop->isLoopInvariant(I)) 1835 return false; 1836 1837 // Assume we can vectorize V (and hence we need extraction) if the 1838 // scalars are not computed yet. This can happen, because it is called 1839 // via getScalarizationOverhead from setCostBasedWideningDecision, before 1840 // the scalars are collected. That should be a safe assumption in most 1841 // cases, because we check if the operands have vectorizable types 1842 // beforehand in LoopVectorizationLegality. 1843 return Scalars.find(VF) == Scalars.end() || 1844 !isScalarAfterVectorization(I, VF); 1845 }; 1846 1847 /// Returns a range containing only operands needing to be extracted. 
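/// An operand needs extraction if it is defined inside the loop and is
/// expected to be vectorized (see needsExtract above).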
1848 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, 1849 ElementCount VF) const { 1850 return SmallVector<Value *, 4>(make_filter_range( 1851 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); 1852 } 1853 1854 /// Determines if we have the infrastructure to vectorize loop \p L and its 1855 /// epilogue, assuming the main loop is vectorized by \p VF. 1856 bool isCandidateForEpilogueVectorization(const Loop &L, 1857 const ElementCount VF) const; 1858 1859 /// Returns true if epilogue vectorization is considered profitable, and 1860 /// false otherwise. 1861 /// \p VF is the vectorization factor chosen for the original loop. 1862 bool isEpilogueVectorizationProfitable(const ElementCount VF) const; 1863 1864 public: 1865 /// The loop that we evaluate. 1866 Loop *TheLoop; 1867 1868 /// Predicated scalar evolution analysis. 1869 PredicatedScalarEvolution &PSE; 1870 1871 /// Loop Info analysis. 1872 LoopInfo *LI; 1873 1874 /// Vectorization legality. 1875 LoopVectorizationLegality *Legal; 1876 1877 /// Vector target information. 1878 const TargetTransformInfo &TTI; 1879 1880 /// Target Library Info. 1881 const TargetLibraryInfo *TLI; 1882 1883 /// Demanded bits analysis. 1884 DemandedBits *DB; 1885 1886 /// Assumption cache. 1887 AssumptionCache *AC; 1888 1889 /// Interface to emit optimization remarks. 1890 OptimizationRemarkEmitter *ORE; 1891 1892 const Function *TheFunction; 1893 1894 /// Loop Vectorize Hint. 1895 const LoopVectorizeHints *Hints; 1896 1897 /// The interleave access information contains groups of interleaved accesses 1898 /// with the same stride and close to each other. 1899 InterleavedAccessInfo &InterleaveInfo; 1900 1901 /// Values to ignore in the cost model. 1902 SmallPtrSet<const Value *, 16> ValuesToIgnore; 1903 1904 /// Values to ignore in the cost model when VF > 1. 1905 SmallPtrSet<const Value *, 16> VecValuesToIgnore; 1906 1907 /// All element types found in the loop. 1908 SmallPtrSet<Type *, 16> ElementTypesInLoop; 1909 1910 /// Profitable vector factors. 1911 SmallVector<VectorizationFactor, 8> ProfitableVFs; 1912 }; 1913 } // end namespace llvm 1914 1915 /// Helper struct to manage generating runtime checks for vectorization. 1916 /// 1917 /// The runtime checks are created up-front in temporary blocks to allow better 1918 /// estimating the cost and un-linked from the existing IR. After deciding to 1919 /// vectorize, the checks are moved back. If deciding not to vectorize, the 1920 /// temporary blocks are completely removed. 1921 class GeneratedRTChecks { 1922 /// Basic block which contains the generated SCEV checks, if any. 1923 BasicBlock *SCEVCheckBlock = nullptr; 1924 1925 /// The value representing the result of the generated SCEV checks. If it is 1926 /// nullptr, either no SCEV checks have been generated or they have been used. 1927 Value *SCEVCheckCond = nullptr; 1928 1929 /// Basic block which contains the generated memory runtime checks, if any. 1930 BasicBlock *MemCheckBlock = nullptr; 1931 1932 /// The value representing the result of the generated memory runtime checks. 1933 /// If it is nullptr, either no memory runtime checks have been generated or 1934 /// they have been used. 
1935 Value *MemRuntimeCheckCond = nullptr;
1936
1937 DominatorTree *DT;
1938 LoopInfo *LI;
1939
1940 SCEVExpander SCEVExp;
1941 SCEVExpander MemCheckExp;
1942
1943 public:
1944 GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
1945 const DataLayout &DL)
1946 : DT(DT), LI(LI), SCEVExp(SE, DL, "scev.check"),
1947 MemCheckExp(SE, DL, "scev.check") {}
1948
1949 /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
1950 /// accurately estimate the cost of the runtime checks. The blocks are
1951 /// un-linked from the IR and are added back during vector code generation. If
1952 /// there is no vector code generation, the check blocks are removed
1953 /// completely.
1954 void Create(Loop *L, const LoopAccessInfo &LAI,
1955 const SCEVUnionPredicate &UnionPred) {
1956
1957 BasicBlock *LoopHeader = L->getHeader();
1958 BasicBlock *Preheader = L->getLoopPreheader();
1959
1960 // Use SplitBlock to create blocks for SCEV & memory runtime checks to
1961 // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
1962 // may be used by SCEVExpander. The blocks will be un-linked from their
1963 // predecessors and removed from LI & DT at the end of the function.
1964 if (!UnionPred.isAlwaysTrue()) {
1965 SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
1966 nullptr, "vector.scevcheck");
1967
1968 SCEVCheckCond = SCEVExp.expandCodeForPredicate(
1969 &UnionPred, SCEVCheckBlock->getTerminator());
1970 }
1971
1972 const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
1973 if (RtPtrChecking.Need) {
1974 auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
1975 MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
1976 "vector.memcheck");
1977
1978 MemRuntimeCheckCond =
1979 addRuntimeChecks(MemCheckBlock->getTerminator(), L,
1980 RtPtrChecking.getChecks(), MemCheckExp);
1981 assert(MemRuntimeCheckCond &&
1982 "no RT checks generated although RtPtrChecking "
1983 "claimed checks are required");
1984 }
1985
1986 if (!MemCheckBlock && !SCEVCheckBlock)
1987 return;
1988
1989 // Unhook the temporary blocks with the checks and update the various places
1990 // accordingly.
1991 if (SCEVCheckBlock)
1992 SCEVCheckBlock->replaceAllUsesWith(Preheader);
1993 if (MemCheckBlock)
1994 MemCheckBlock->replaceAllUsesWith(Preheader);
1995
1996 if (SCEVCheckBlock) {
1997 SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
1998 new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
1999 Preheader->getTerminator()->eraseFromParent();
2000 }
2001 if (MemCheckBlock) {
2002 MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
2003 new UnreachableInst(Preheader->getContext(), MemCheckBlock);
2004 Preheader->getTerminator()->eraseFromParent();
2005 }
2006
2007 DT->changeImmediateDominator(LoopHeader, Preheader);
2008 if (MemCheckBlock) {
2009 DT->eraseNode(MemCheckBlock);
2010 LI->removeBlock(MemCheckBlock);
2011 }
2012 if (SCEVCheckBlock) {
2013 DT->eraseNode(SCEVCheckBlock);
2014 LI->removeBlock(SCEVCheckBlock);
2015 }
2016 }
2017
2018 /// Remove the created SCEV & memory runtime check blocks & instructions, if
2019 /// unused.
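/// Helper values expanded for the checks are cleaned up via the
/// SCEVExpanderCleaners.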
2020 ~GeneratedRTChecks() { 2021 SCEVExpanderCleaner SCEVCleaner(SCEVExp, *DT); 2022 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp, *DT); 2023 if (!SCEVCheckCond) 2024 SCEVCleaner.markResultUsed(); 2025 2026 if (!MemRuntimeCheckCond) 2027 MemCheckCleaner.markResultUsed(); 2028 2029 if (MemRuntimeCheckCond) { 2030 auto &SE = *MemCheckExp.getSE(); 2031 // Memory runtime check generation creates compares that use expanded 2032 // values. Remove them before running the SCEVExpanderCleaners. 2033 for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) { 2034 if (MemCheckExp.isInsertedInstruction(&I)) 2035 continue; 2036 SE.forgetValue(&I); 2037 SE.eraseValueFromMap(&I); 2038 I.eraseFromParent(); 2039 } 2040 } 2041 MemCheckCleaner.cleanup(); 2042 SCEVCleaner.cleanup(); 2043 2044 if (SCEVCheckCond) 2045 SCEVCheckBlock->eraseFromParent(); 2046 if (MemRuntimeCheckCond) 2047 MemCheckBlock->eraseFromParent(); 2048 } 2049 2050 /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and 2051 /// adjusts the branches to branch to the vector preheader or \p Bypass, 2052 /// depending on the generated condition. 2053 BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass, 2054 BasicBlock *LoopVectorPreHeader, 2055 BasicBlock *LoopExitBlock) { 2056 if (!SCEVCheckCond) 2057 return nullptr; 2058 if (auto *C = dyn_cast<ConstantInt>(SCEVCheckCond)) 2059 if (C->isZero()) 2060 return nullptr; 2061 2062 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2063 2064 BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock); 2065 // Create new preheader for vector loop. 2066 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 2067 PL->addBasicBlockToLoop(SCEVCheckBlock, *LI); 2068 2069 SCEVCheckBlock->getTerminator()->eraseFromParent(); 2070 SCEVCheckBlock->moveBefore(LoopVectorPreHeader); 2071 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2072 SCEVCheckBlock); 2073 2074 DT->addNewBlock(SCEVCheckBlock, Pred); 2075 DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock); 2076 2077 ReplaceInstWithInst( 2078 SCEVCheckBlock->getTerminator(), 2079 BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheckCond)); 2080 // Mark the check as used, to prevent it from being removed during cleanup. 2081 SCEVCheckCond = nullptr; 2082 return SCEVCheckBlock; 2083 } 2084 2085 /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts 2086 /// the branches to branch to the vector preheader or \p Bypass, depending on 2087 /// the generated condition. 2088 BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass, 2089 BasicBlock *LoopVectorPreHeader) { 2090 // Check if we generated code that checks in runtime if arrays overlap. 2091 if (!MemRuntimeCheckCond) 2092 return nullptr; 2093 2094 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2095 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2096 MemCheckBlock); 2097 2098 DT->addNewBlock(MemCheckBlock, Pred); 2099 DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock); 2100 MemCheckBlock->moveBefore(LoopVectorPreHeader); 2101 2102 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 2103 PL->addBasicBlockToLoop(MemCheckBlock, *LI); 2104 2105 ReplaceInstWithInst( 2106 MemCheckBlock->getTerminator(), 2107 BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond)); 2108 MemCheckBlock->getTerminator()->setDebugLoc( 2109 Pred->getTerminator()->getDebugLoc()); 2110 2111 // Mark the check as used, to prevent it from being removed during cleanup. 
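// (The destructor only erases MemCheckBlock while MemRuntimeCheckCond is
// still set.)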
2112 MemRuntimeCheckCond = nullptr; 2113 return MemCheckBlock; 2114 } 2115 }; 2116 2117 // Return true if \p OuterLp is an outer loop annotated with hints for explicit 2118 // vectorization. The loop needs to be annotated with #pragma omp simd 2119 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the 2120 // vector length information is not provided, vectorization is not considered 2121 // explicit. Interleave hints are not allowed either. These limitations will be 2122 // relaxed in the future. 2123 // Please, note that we are currently forced to abuse the pragma 'clang 2124 // vectorize' semantics. This pragma provides *auto-vectorization hints* 2125 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' 2126 // provides *explicit vectorization hints* (LV can bypass legal checks and 2127 // assume that vectorization is legal). However, both hints are implemented 2128 // using the same metadata (llvm.loop.vectorize, processed by 2129 // LoopVectorizeHints). This will be fixed in the future when the native IR 2130 // representation for pragma 'omp simd' is introduced. 2131 static bool isExplicitVecOuterLoop(Loop *OuterLp, 2132 OptimizationRemarkEmitter *ORE) { 2133 assert(!OuterLp->isInnermost() && "This is not an outer loop"); 2134 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); 2135 2136 // Only outer loops with an explicit vectorization hint are supported. 2137 // Unannotated outer loops are ignored. 2138 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) 2139 return false; 2140 2141 Function *Fn = OuterLp->getHeader()->getParent(); 2142 if (!Hints.allowVectorization(Fn, OuterLp, 2143 true /*VectorizeOnlyWhenForced*/)) { 2144 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); 2145 return false; 2146 } 2147 2148 if (Hints.getInterleave() > 1) { 2149 // TODO: Interleave support is future work. 2150 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " 2151 "outer loops.\n"); 2152 Hints.emitRemarkWithHints(); 2153 return false; 2154 } 2155 2156 return true; 2157 } 2158 2159 static void collectSupportedLoops(Loop &L, LoopInfo *LI, 2160 OptimizationRemarkEmitter *ORE, 2161 SmallVectorImpl<Loop *> &V) { 2162 // Collect inner loops and outer loops without irreducible control flow. For 2163 // now, only collect outer loops that have explicit vectorization hints. If we 2164 // are stress testing the VPlan H-CFG construction, we collect the outermost 2165 // loop of every loop nest. 2166 if (L.isInnermost() || VPlanBuildStressTest || 2167 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { 2168 LoopBlocksRPO RPOT(&L); 2169 RPOT.perform(LI); 2170 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { 2171 V.push_back(&L); 2172 // TODO: Collect inner loops inside marked outer loops in case 2173 // vectorization fails for the outer loop. Do not invoke 2174 // 'containsIrreducibleCFG' again for inner loops when the outer loop is 2175 // already known to be reducible. We can use an inherited attribute for 2176 // that. 2177 return; 2178 } 2179 } 2180 for (Loop *InnerL : L) 2181 collectSupportedLoops(*InnerL, LI, ORE, V); 2182 } 2183 2184 namespace { 2185 2186 /// The LoopVectorize Pass. 
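/// Legacy pass-manager wrapper that runs LoopVectorizePass (the Impl member)
/// on each function.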
2187 struct LoopVectorize : public FunctionPass { 2188 /// Pass identification, replacement for typeid 2189 static char ID; 2190 2191 LoopVectorizePass Impl; 2192 2193 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false, 2194 bool VectorizeOnlyWhenForced = false) 2195 : FunctionPass(ID), 2196 Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) { 2197 initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); 2198 } 2199 2200 bool runOnFunction(Function &F) override { 2201 if (skipFunction(F)) 2202 return false; 2203 2204 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 2205 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 2206 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 2207 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 2208 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); 2209 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); 2210 auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr; 2211 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); 2212 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 2213 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); 2214 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); 2215 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 2216 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 2217 2218 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 2219 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; 2220 2221 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC, 2222 GetLAA, *ORE, PSI).MadeAnyChange; 2223 } 2224 2225 void getAnalysisUsage(AnalysisUsage &AU) const override { 2226 AU.addRequired<AssumptionCacheTracker>(); 2227 AU.addRequired<BlockFrequencyInfoWrapperPass>(); 2228 AU.addRequired<DominatorTreeWrapperPass>(); 2229 AU.addRequired<LoopInfoWrapperPass>(); 2230 AU.addRequired<ScalarEvolutionWrapperPass>(); 2231 AU.addRequired<TargetTransformInfoWrapperPass>(); 2232 AU.addRequired<AAResultsWrapperPass>(); 2233 AU.addRequired<LoopAccessLegacyAnalysis>(); 2234 AU.addRequired<DemandedBitsWrapperPass>(); 2235 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 2236 AU.addRequired<InjectTLIMappingsLegacy>(); 2237 2238 // We currently do not preserve loopinfo/dominator analyses with outer loop 2239 // vectorization. Until this is addressed, mark these analyses as preserved 2240 // only for non-VPlan-native path. 2241 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 2242 if (!EnableVPlanNativePath) { 2243 AU.addPreserved<LoopInfoWrapperPass>(); 2244 AU.addPreserved<DominatorTreeWrapperPass>(); 2245 } 2246 2247 AU.addPreserved<BasicAAWrapperPass>(); 2248 AU.addPreserved<GlobalsAAWrapperPass>(); 2249 AU.addRequired<ProfileSummaryInfoWrapperPass>(); 2250 } 2251 }; 2252 2253 } // end anonymous namespace 2254 2255 //===----------------------------------------------------------------------===// 2256 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 2257 // LoopVectorizationCostModel and LoopVectorizationPlanner. 2258 //===----------------------------------------------------------------------===// 2259 2260 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { 2261 // We need to place the broadcast of invariant variables outside the loop, 2262 // but only if it's proven safe to do so. Else, broadcast will be inside 2263 // vector loop body. 
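// Hoisting is safe when V is loop-invariant and its definition dominates the
// new preheader.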
2264 Instruction *Instr = dyn_cast<Instruction>(V); 2265 bool SafeToHoist = OrigLoop->isLoopInvariant(V) && 2266 (!Instr || 2267 DT->dominates(Instr->getParent(), LoopVectorPreHeader)); 2268 // Place the code for broadcasting invariant variables in the new preheader. 2269 IRBuilder<>::InsertPointGuard Guard(Builder); 2270 if (SafeToHoist) 2271 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2272 2273 // Broadcast the scalar into all locations in the vector. 2274 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); 2275 2276 return Shuf; 2277 } 2278 2279 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( 2280 const InductionDescriptor &II, Value *Step, Value *Start, 2281 Instruction *EntryVal, VPValue *Def, VPValue *CastDef, 2282 VPTransformState &State) { 2283 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 2284 "Expected either an induction phi-node or a truncate of it!"); 2285 2286 // Construct the initial value of the vector IV in the vector loop preheader 2287 auto CurrIP = Builder.saveIP(); 2288 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2289 if (isa<TruncInst>(EntryVal)) { 2290 assert(Start->getType()->isIntegerTy() && 2291 "Truncation requires an integer type"); 2292 auto *TruncType = cast<IntegerType>(EntryVal->getType()); 2293 Step = Builder.CreateTrunc(Step, TruncType); 2294 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); 2295 } 2296 2297 Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0); 2298 Value *SplatStart = Builder.CreateVectorSplat(VF, Start); 2299 Value *SteppedStart = 2300 getStepVector(SplatStart, Zero, Step, II.getInductionOpcode()); 2301 2302 // We create vector phi nodes for both integer and floating-point induction 2303 // variables. Here, we determine the kind of arithmetic we will perform. 2304 Instruction::BinaryOps AddOp; 2305 Instruction::BinaryOps MulOp; 2306 if (Step->getType()->isIntegerTy()) { 2307 AddOp = Instruction::Add; 2308 MulOp = Instruction::Mul; 2309 } else { 2310 AddOp = II.getInductionOpcode(); 2311 MulOp = Instruction::FMul; 2312 } 2313 2314 // Multiply the vectorization factor by the step using integer or 2315 // floating-point arithmetic as appropriate. 2316 Type *StepType = Step->getType(); 2317 Value *RuntimeVF; 2318 if (Step->getType()->isFloatingPointTy()) 2319 RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, VF); 2320 else 2321 RuntimeVF = getRuntimeVF(Builder, StepType, VF); 2322 Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF); 2323 2324 // Create a vector splat to use in the induction update. 2325 // 2326 // FIXME: If the step is non-constant, we create the vector splat with 2327 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't 2328 // handle a constant vector splat. 2329 Value *SplatVF = isa<Constant>(Mul) 2330 ? ConstantVector::getSplat(VF, cast<Constant>(Mul)) 2331 : Builder.CreateVectorSplat(VF, Mul); 2332 Builder.restoreIP(CurrIP); 2333 2334 // We may need to add the step a number of times, depending on the unroll 2335 // factor. The last of those goes into the PHI. 
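// E.g. with UF == 2: part 0 uses vec.ind, part 1 uses vec.ind + VF * Step,
// and the backedge value vec.ind.next is vec.ind + 2 * VF * Step.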
2336 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", 2337 &*LoopVectorBody->getFirstInsertionPt()); 2338 VecInd->setDebugLoc(EntryVal->getDebugLoc()); 2339 Instruction *LastInduction = VecInd; 2340 for (unsigned Part = 0; Part < UF; ++Part) { 2341 State.set(Def, LastInduction, Part); 2342 2343 if (isa<TruncInst>(EntryVal)) 2344 addMetadata(LastInduction, EntryVal); 2345 recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, CastDef, 2346 State, Part); 2347 2348 LastInduction = cast<Instruction>( 2349 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")); 2350 LastInduction->setDebugLoc(EntryVal->getDebugLoc()); 2351 } 2352 2353 // Move the last step to the end of the latch block. This ensures consistent 2354 // placement of all induction updates. 2355 auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 2356 auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator()); 2357 auto *ICmp = cast<Instruction>(Br->getCondition()); 2358 LastInduction->moveBefore(ICmp); 2359 LastInduction->setName("vec.ind.next"); 2360 2361 VecInd->addIncoming(SteppedStart, LoopVectorPreHeader); 2362 VecInd->addIncoming(LastInduction, LoopVectorLatch); 2363 } 2364 2365 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const { 2366 return Cost->isScalarAfterVectorization(I, VF) || 2367 Cost->isProfitableToScalarize(I, VF); 2368 } 2369 2370 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const { 2371 if (shouldScalarizeInstruction(IV)) 2372 return true; 2373 auto isScalarInst = [&](User *U) -> bool { 2374 auto *I = cast<Instruction>(U); 2375 return (OrigLoop->contains(I) && shouldScalarizeInstruction(I)); 2376 }; 2377 return llvm::any_of(IV->users(), isScalarInst); 2378 } 2379 2380 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast( 2381 const InductionDescriptor &ID, const Instruction *EntryVal, 2382 Value *VectorLoopVal, VPValue *CastDef, VPTransformState &State, 2383 unsigned Part, unsigned Lane) { 2384 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 2385 "Expected either an induction phi-node or a truncate of it!"); 2386 2387 // This induction variable is not the phi from the original loop but the 2388 // newly-created IV based on the proof that casted Phi is equal to the 2389 // uncasted Phi in the vectorized loop (under a runtime guard possibly). It 2390 // re-uses the same InductionDescriptor that original IV uses but we don't 2391 // have to do any recording in this case - that is done when original IV is 2392 // processed. 2393 if (isa<TruncInst>(EntryVal)) 2394 return; 2395 2396 const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts(); 2397 if (Casts.empty()) 2398 return; 2399 // Only the first Cast instruction in the Casts vector is of interest. 2400 // The rest of the Casts (if exist) have no uses outside the 2401 // induction update chain itself. 
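// A Lane of UINT_MAX records the whole per-part value; any other Lane records
// only that scalar lane.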
2402 if (Lane < UINT_MAX) 2403 State.set(CastDef, VectorLoopVal, VPIteration(Part, Lane)); 2404 else 2405 State.set(CastDef, VectorLoopVal, Part); 2406 } 2407 2408 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start, 2409 TruncInst *Trunc, VPValue *Def, 2410 VPValue *CastDef, 2411 VPTransformState &State) { 2412 assert((IV->getType()->isIntegerTy() || IV != OldInduction) && 2413 "Primary induction variable must have an integer type"); 2414 2415 auto II = Legal->getInductionVars().find(IV); 2416 assert(II != Legal->getInductionVars().end() && "IV is not an induction"); 2417 2418 auto ID = II->second; 2419 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); 2420 2421 // The value from the original loop to which we are mapping the new induction 2422 // variable. 2423 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; 2424 2425 auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 2426 2427 // Generate code for the induction step. Note that induction steps are 2428 // required to be loop-invariant 2429 auto CreateStepValue = [&](const SCEV *Step) -> Value * { 2430 assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) && 2431 "Induction step should be loop invariant"); 2432 if (PSE.getSE()->isSCEVable(IV->getType())) { 2433 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 2434 return Exp.expandCodeFor(Step, Step->getType(), 2435 LoopVectorPreHeader->getTerminator()); 2436 } 2437 return cast<SCEVUnknown>(Step)->getValue(); 2438 }; 2439 2440 // The scalar value to broadcast. This is derived from the canonical 2441 // induction variable. If a truncation type is given, truncate the canonical 2442 // induction variable and step. Otherwise, derive these values from the 2443 // induction descriptor. 2444 auto CreateScalarIV = [&](Value *&Step) -> Value * { 2445 Value *ScalarIV = Induction; 2446 if (IV != OldInduction) { 2447 ScalarIV = IV->getType()->isIntegerTy() 2448 ? Builder.CreateSExtOrTrunc(Induction, IV->getType()) 2449 : Builder.CreateCast(Instruction::SIToFP, Induction, 2450 IV->getType()); 2451 ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID); 2452 ScalarIV->setName("offset.idx"); 2453 } 2454 if (Trunc) { 2455 auto *TruncType = cast<IntegerType>(Trunc->getType()); 2456 assert(Step->getType()->isIntegerTy() && 2457 "Truncation requires an integer step"); 2458 ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType); 2459 Step = Builder.CreateTrunc(Step, TruncType); 2460 } 2461 return ScalarIV; 2462 }; 2463 2464 // Create the vector values from the scalar IV, in the absence of creating a 2465 // vector IV. 2466 auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) { 2467 Value *Broadcasted = getBroadcastInstrs(ScalarIV); 2468 for (unsigned Part = 0; Part < UF; ++Part) { 2469 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2470 Value *StartIdx; 2471 if (Step->getType()->isFloatingPointTy()) 2472 StartIdx = getRuntimeVFAsFloat(Builder, Step->getType(), VF * Part); 2473 else 2474 StartIdx = getRuntimeVF(Builder, Step->getType(), VF * Part); 2475 2476 Value *EntryPart = 2477 getStepVector(Broadcasted, StartIdx, Step, ID.getInductionOpcode()); 2478 State.set(Def, EntryPart, Part); 2479 if (Trunc) 2480 addMetadata(EntryPart, Trunc); 2481 recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, CastDef, 2482 State, Part); 2483 } 2484 }; 2485 2486 // Fast-math-flags propagate from the original induction instruction. 
2487 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 2488 if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp())) 2489 Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags()); 2490 2491 // Now do the actual transformations, and start with creating the step value. 2492 Value *Step = CreateStepValue(ID.getStep()); 2493 if (VF.isZero() || VF.isScalar()) { 2494 Value *ScalarIV = CreateScalarIV(Step); 2495 CreateSplatIV(ScalarIV, Step); 2496 return; 2497 } 2498 2499 // Determine if we want a scalar version of the induction variable. This is 2500 // true if the induction variable itself is not widened, or if it has at 2501 // least one user in the loop that is not widened. 2502 auto NeedsScalarIV = needsScalarInduction(EntryVal); 2503 if (!NeedsScalarIV) { 2504 createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef, 2505 State); 2506 return; 2507 } 2508 2509 // Try to create a new independent vector induction variable. If we can't 2510 // create the phi node, we will splat the scalar induction variable in each 2511 // loop iteration. 2512 if (!shouldScalarizeInstruction(EntryVal)) { 2513 createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef, 2514 State); 2515 Value *ScalarIV = CreateScalarIV(Step); 2516 // Create scalar steps that can be used by instructions we will later 2517 // scalarize. Note that the addition of the scalar steps will not increase 2518 // the number of instructions in the loop in the common case prior to 2519 // InstCombine. We will be trading one vector extract for each scalar step. 2520 buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State); 2521 return; 2522 } 2523 2524 // All IV users are scalar instructions, so only emit a scalar IV, not a 2525 // vectorised IV. Except when we tail-fold, then the splat IV feeds the 2526 // predicate used by the masked loads/stores. 2527 Value *ScalarIV = CreateScalarIV(Step); 2528 if (!Cost->isScalarEpilogueAllowed()) 2529 CreateSplatIV(ScalarIV, Step); 2530 buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State); 2531 } 2532 2533 Value *InnerLoopVectorizer::getStepVector(Value *Val, Value *StartIdx, 2534 Value *Step, 2535 Instruction::BinaryOps BinOp) { 2536 // Create and check the types. 2537 auto *ValVTy = cast<VectorType>(Val->getType()); 2538 ElementCount VLen = ValVTy->getElementCount(); 2539 2540 Type *STy = Val->getType()->getScalarType(); 2541 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && 2542 "Induction Step must be an integer or FP"); 2543 assert(Step->getType() == STy && "Step has wrong type"); 2544 2545 SmallVector<Constant *, 8> Indices; 2546 2547 // Create a vector of consecutive numbers from zero to VF. 2548 VectorType *InitVecValVTy = ValVTy; 2549 Type *InitVecValSTy = STy; 2550 if (STy->isFloatingPointTy()) { 2551 InitVecValSTy = 2552 IntegerType::get(STy->getContext(), STy->getScalarSizeInBits()); 2553 InitVecValVTy = VectorType::get(InitVecValSTy, VLen); 2554 } 2555 Value *InitVec = Builder.CreateStepVector(InitVecValVTy); 2556 2557 // Splat the StartIdx 2558 Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx); 2559 2560 if (STy->isIntegerTy()) { 2561 InitVec = Builder.CreateAdd(InitVec, StartIdxSplat); 2562 Step = Builder.CreateVectorSplat(VLen, Step); 2563 assert(Step->getType() == Val->getType() && "Invalid step vec"); 2564 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 2565 // which can be found from the original scalar operations. 
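// The result is Val + (StartIdx + <0, 1, ..., VLen-1>) * Step.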
2566 Step = Builder.CreateMul(InitVec, Step); 2567 return Builder.CreateAdd(Val, Step, "induction"); 2568 } 2569 2570 // Floating point induction. 2571 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && 2572 "Binary Opcode should be specified for FP induction"); 2573 InitVec = Builder.CreateUIToFP(InitVec, ValVTy); 2574 InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat); 2575 2576 Step = Builder.CreateVectorSplat(VLen, Step); 2577 Value *MulOp = Builder.CreateFMul(InitVec, Step); 2578 return Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); 2579 } 2580 2581 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, 2582 Instruction *EntryVal, 2583 const InductionDescriptor &ID, 2584 VPValue *Def, VPValue *CastDef, 2585 VPTransformState &State) { 2586 // We shouldn't have to build scalar steps if we aren't vectorizing. 2587 assert(VF.isVector() && "VF should be greater than one"); 2588 // Get the value type and ensure it and the step have the same integer type. 2589 Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); 2590 assert(ScalarIVTy == Step->getType() && 2591 "Val and Step should have the same type"); 2592 2593 // We build scalar steps for both integer and floating-point induction 2594 // variables. Here, we determine the kind of arithmetic we will perform. 2595 Instruction::BinaryOps AddOp; 2596 Instruction::BinaryOps MulOp; 2597 if (ScalarIVTy->isIntegerTy()) { 2598 AddOp = Instruction::Add; 2599 MulOp = Instruction::Mul; 2600 } else { 2601 AddOp = ID.getInductionOpcode(); 2602 MulOp = Instruction::FMul; 2603 } 2604 2605 // Determine the number of scalars we need to generate for each unroll 2606 // iteration. If EntryVal is uniform, we only need to generate the first 2607 // lane. Otherwise, we generate all VF values. 2608 bool IsUniform = 2609 Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF); 2610 unsigned Lanes = IsUniform ? 1 : VF.getKnownMinValue(); 2611 // Compute the scalar steps and save the results in State. 2612 Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(), 2613 ScalarIVTy->getScalarSizeInBits()); 2614 Type *VecIVTy = nullptr; 2615 Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr; 2616 if (!IsUniform && VF.isScalable()) { 2617 VecIVTy = VectorType::get(ScalarIVTy, VF); 2618 UnitStepVec = Builder.CreateStepVector(VectorType::get(IntStepTy, VF)); 2619 SplatStep = Builder.CreateVectorSplat(VF, Step); 2620 SplatIV = Builder.CreateVectorSplat(VF, ScalarIV); 2621 } 2622 2623 for (unsigned Part = 0; Part < UF; ++Part) { 2624 Value *StartIdx0 = 2625 createStepForVF(Builder, ConstantInt::get(IntStepTy, Part), VF); 2626 2627 if (!IsUniform && VF.isScalable()) { 2628 auto *SplatStartIdx = Builder.CreateVectorSplat(VF, StartIdx0); 2629 auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec); 2630 if (ScalarIVTy->isFloatingPointTy()) 2631 InitVec = Builder.CreateSIToFP(InitVec, VecIVTy); 2632 auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep); 2633 auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul); 2634 State.set(Def, Add, Part); 2635 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State, 2636 Part); 2637 // It's useful to record the lane values too for the known minimum number 2638 // of elements so we do those below. This improves the code quality when 2639 // trying to extract the first element, for example. 
2640 } 2641 2642 if (ScalarIVTy->isFloatingPointTy()) 2643 StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy); 2644 2645 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 2646 Value *StartIdx = Builder.CreateBinOp( 2647 AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane)); 2648 // The step returned by `createStepForVF` is a runtime-evaluated value 2649 // when VF is scalable. Otherwise, it should be folded into a Constant. 2650 assert((VF.isScalable() || isa<Constant>(StartIdx)) && 2651 "Expected StartIdx to be folded to a constant when VF is not " 2652 "scalable"); 2653 auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step); 2654 auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul); 2655 State.set(Def, Add, VPIteration(Part, Lane)); 2656 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State, 2657 Part, Lane); 2658 } 2659 } 2660 } 2661 2662 void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def, 2663 const VPIteration &Instance, 2664 VPTransformState &State) { 2665 Value *ScalarInst = State.get(Def, Instance); 2666 Value *VectorValue = State.get(Def, Instance.Part); 2667 VectorValue = Builder.CreateInsertElement( 2668 VectorValue, ScalarInst, 2669 Instance.Lane.getAsRuntimeExpr(State.Builder, VF)); 2670 State.set(Def, VectorValue, Instance.Part); 2671 } 2672 2673 Value *InnerLoopVectorizer::reverseVector(Value *Vec) { 2674 assert(Vec->getType()->isVectorTy() && "Invalid type"); 2675 return Builder.CreateVectorReverse(Vec, "reverse"); 2676 } 2677 2678 // Return whether we allow using masked interleave-groups (for dealing with 2679 // strided loads/stores that reside in predicated blocks, or for dealing 2680 // with gaps). 2681 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { 2682 // If an override option has been passed in for interleaved accesses, use it. 2683 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) 2684 return EnableMaskedInterleavedMemAccesses; 2685 2686 return TTI.enableMaskedInterleavedAccessVectorization(); 2687 } 2688 2689 // Try to vectorize the interleave group that \p Instr belongs to. 2690 // 2691 // E.g. Translate following interleaved load group (factor = 3): 2692 // for (i = 0; i < N; i+=3) { 2693 // R = Pic[i]; // Member of index 0 2694 // G = Pic[i+1]; // Member of index 1 2695 // B = Pic[i+2]; // Member of index 2 2696 // ... // do something to R, G, B 2697 // } 2698 // To: 2699 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2700 // %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements 2701 // %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements 2702 // %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements 2703 // 2704 // Or translate following interleaved store group (factor = 3): 2705 // for (i = 0; i < N; i+=3) { 2706 // ... 
do something to R, G, B 2707 // Pic[i] = R; // Member of index 0 2708 // Pic[i+1] = G; // Member of index 1 2709 // Pic[i+2] = B; // Member of index 2 2710 // } 2711 // To: 2712 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> 2713 // %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u> 2714 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, 2715 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements 2716 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B 2717 void InnerLoopVectorizer::vectorizeInterleaveGroup( 2718 const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs, 2719 VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues, 2720 VPValue *BlockInMask) { 2721 Instruction *Instr = Group->getInsertPos(); 2722 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2723 2724 // Prepare for the vector type of the interleaved load/store. 2725 Type *ScalarTy = getLoadStoreType(Instr); 2726 unsigned InterleaveFactor = Group->getFactor(); 2727 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2728 auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor); 2729 2730 // Prepare for the new pointers. 2731 SmallVector<Value *, 2> AddrParts; 2732 unsigned Index = Group->getIndex(Instr); 2733 2734 // TODO: extend the masked interleaved-group support to reversed access. 2735 assert((!BlockInMask || !Group->isReverse()) && 2736 "Reversed masked interleave-group not supported."); 2737 2738 // If the group is reverse, adjust the index to refer to the last vector lane 2739 // instead of the first. We adjust the index from the first vector lane, 2740 // rather than directly getting the pointer for lane VF - 1, because the 2741 // pointer operand of the interleaved access is supposed to be uniform. For 2742 // uniform instructions, we're only required to generate a value for the 2743 // first vector lane in each unroll iteration. 2744 if (Group->isReverse()) 2745 Index += (VF.getKnownMinValue() - 1) * Group->getFactor(); 2746 2747 for (unsigned Part = 0; Part < UF; Part++) { 2748 Value *AddrPart = State.get(Addr, VPIteration(Part, 0)); 2749 setDebugLocFromInst(AddrPart); 2750 2751 // Notice current instruction could be any index. Need to adjust the address 2752 // to the member of index 0. 2753 // 2754 // E.g. a = A[i+1]; // Member of index 1 (Current instruction) 2755 // b = A[i]; // Member of index 0 2756 // Current pointer is pointed to A[i+1], adjust it to A[i]. 2757 // 2758 // E.g. A[i+1] = a; // Member of index 1 2759 // A[i] = b; // Member of index 0 2760 // A[i+2] = c; // Member of index 2 (Current instruction) 2761 // Current pointer is pointed to A[i+2], adjust it to A[i]. 2762 2763 bool InBounds = false; 2764 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts())) 2765 InBounds = gep->isInBounds(); 2766 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index)); 2767 cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds); 2768 2769 // Cast to the vector pointer type. 
2770 unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace(); 2771 Type *PtrTy = VecTy->getPointerTo(AddressSpace); 2772 AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy)); 2773 } 2774 2775 setDebugLocFromInst(Instr); 2776 Value *PoisonVec = PoisonValue::get(VecTy); 2777 2778 Value *MaskForGaps = nullptr; 2779 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) { 2780 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); 2781 assert(MaskForGaps && "Mask for Gaps is required but it is null"); 2782 } 2783 2784 // Vectorize the interleaved load group. 2785 if (isa<LoadInst>(Instr)) { 2786 // For each unroll part, create a wide load for the group. 2787 SmallVector<Value *, 2> NewLoads; 2788 for (unsigned Part = 0; Part < UF; Part++) { 2789 Instruction *NewLoad; 2790 if (BlockInMask || MaskForGaps) { 2791 assert(useMaskedInterleavedAccesses(*TTI) && 2792 "masked interleaved groups are not allowed."); 2793 Value *GroupMask = MaskForGaps; 2794 if (BlockInMask) { 2795 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2796 Value *ShuffledMask = Builder.CreateShuffleVector( 2797 BlockInMaskPart, 2798 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2799 "interleaved.mask"); 2800 GroupMask = MaskForGaps 2801 ? Builder.CreateBinOp(Instruction::And, ShuffledMask, 2802 MaskForGaps) 2803 : ShuffledMask; 2804 } 2805 NewLoad = 2806 Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(), 2807 GroupMask, PoisonVec, "wide.masked.vec"); 2808 } 2809 else 2810 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], 2811 Group->getAlign(), "wide.vec"); 2812 Group->addMetadata(NewLoad); 2813 NewLoads.push_back(NewLoad); 2814 } 2815 2816 // For each member in the group, shuffle out the appropriate data from the 2817 // wide loads. 2818 unsigned J = 0; 2819 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2820 Instruction *Member = Group->getMember(I); 2821 2822 // Skip the gaps in the group. 2823 if (!Member) 2824 continue; 2825 2826 auto StrideMask = 2827 createStrideMask(I, InterleaveFactor, VF.getKnownMinValue()); 2828 for (unsigned Part = 0; Part < UF; Part++) { 2829 Value *StridedVec = Builder.CreateShuffleVector( 2830 NewLoads[Part], StrideMask, "strided.vec"); 2831 2832 // If this member has different type, cast the result type. 2833 if (Member->getType() != ScalarTy) { 2834 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2835 VectorType *OtherVTy = VectorType::get(Member->getType(), VF); 2836 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); 2837 } 2838 2839 if (Group->isReverse()) 2840 StridedVec = reverseVector(StridedVec); 2841 2842 State.set(VPDefs[J], StridedVec, Part); 2843 } 2844 ++J; 2845 } 2846 return; 2847 } 2848 2849 // The sub vector type for current instruction. 2850 auto *SubVT = VectorType::get(ScalarTy, VF); 2851 2852 // Vectorize the interleaved store group. 2853 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); 2854 assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) && 2855 "masked interleaved groups are not allowed."); 2856 assert((!MaskForGaps || !VF.isScalable()) && 2857 "masking gaps for scalable vectors is not yet supported."); 2858 for (unsigned Part = 0; Part < UF; Part++) { 2859 // Collect the stored vector from each member. 
2860 SmallVector<Value *, 4> StoredVecs; 2861 for (unsigned i = 0; i < InterleaveFactor; i++) { 2862 assert((Group->getMember(i) || MaskForGaps) && 2863 "Fail to get a member from an interleaved store group"); 2864 Instruction *Member = Group->getMember(i); 2865 2866 // Skip the gaps in the group. 2867 if (!Member) { 2868 Value *Undef = PoisonValue::get(SubVT); 2869 StoredVecs.push_back(Undef); 2870 continue; 2871 } 2872 2873 Value *StoredVec = State.get(StoredValues[i], Part); 2874 2875 if (Group->isReverse()) 2876 StoredVec = reverseVector(StoredVec); 2877 2878 // If this member has different type, cast it to a unified type. 2879 2880 if (StoredVec->getType() != SubVT) 2881 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL); 2882 2883 StoredVecs.push_back(StoredVec); 2884 } 2885 2886 // Concatenate all vectors into a wide vector. 2887 Value *WideVec = concatenateVectors(Builder, StoredVecs); 2888 2889 // Interleave the elements in the wide vector. 2890 Value *IVec = Builder.CreateShuffleVector( 2891 WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor), 2892 "interleaved.vec"); 2893 2894 Instruction *NewStoreInstr; 2895 if (BlockInMask || MaskForGaps) { 2896 Value *GroupMask = MaskForGaps; 2897 if (BlockInMask) { 2898 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2899 Value *ShuffledMask = Builder.CreateShuffleVector( 2900 BlockInMaskPart, 2901 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2902 "interleaved.mask"); 2903 GroupMask = MaskForGaps ? Builder.CreateBinOp(Instruction::And, 2904 ShuffledMask, MaskForGaps) 2905 : ShuffledMask; 2906 } 2907 NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part], 2908 Group->getAlign(), GroupMask); 2909 } else 2910 NewStoreInstr = 2911 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign()); 2912 2913 Group->addMetadata(NewStoreInstr); 2914 } 2915 } 2916 2917 void InnerLoopVectorizer::vectorizeMemoryInstruction( 2918 Instruction *Instr, VPTransformState &State, VPValue *Def, VPValue *Addr, 2919 VPValue *StoredValue, VPValue *BlockInMask, bool ConsecutiveStride, 2920 bool Reverse) { 2921 // Attempt to issue a wide load. 2922 LoadInst *LI = dyn_cast<LoadInst>(Instr); 2923 StoreInst *SI = dyn_cast<StoreInst>(Instr); 2924 2925 assert((LI || SI) && "Invalid Load/Store instruction"); 2926 assert((!SI || StoredValue) && "No stored value provided for widened store"); 2927 assert((!LI || !StoredValue) && "Stored value provided for widened load"); 2928 2929 Type *ScalarDataTy = getLoadStoreType(Instr); 2930 2931 auto *DataTy = VectorType::get(ScalarDataTy, VF); 2932 const Align Alignment = getLoadStoreAlignment(Instr); 2933 bool CreateGatherScatter = !ConsecutiveStride; 2934 2935 VectorParts BlockInMaskParts(UF); 2936 bool isMaskRequired = BlockInMask; 2937 if (isMaskRequired) 2938 for (unsigned Part = 0; Part < UF; ++Part) 2939 BlockInMaskParts[Part] = State.get(BlockInMask, Part); 2940 2941 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { 2942 // Calculate the pointer for the specific unroll-part. 2943 GetElementPtrInst *PartPtr = nullptr; 2944 2945 bool InBounds = false; 2946 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) 2947 InBounds = gep->isInBounds(); 2948 if (Reverse) { 2949 // If the address is consecutive but reversed, then the 2950 // wide store needs to start at the last vector element. 
2951 // RunTimeVF = VScale * VF.getKnownMinValue() 2952 // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue() 2953 Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), VF); 2954 // NumElt = -Part * RunTimeVF 2955 Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF); 2956 // LastLane = 1 - RunTimeVF 2957 Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF); 2958 PartPtr = 2959 cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt)); 2960 PartPtr->setIsInBounds(InBounds); 2961 PartPtr = cast<GetElementPtrInst>( 2962 Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane)); 2963 PartPtr->setIsInBounds(InBounds); 2964 if (isMaskRequired) // Reverse of a null all-one mask is a null mask. 2965 BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]); 2966 } else { 2967 Value *Increment = createStepForVF(Builder, Builder.getInt32(Part), VF); 2968 PartPtr = cast<GetElementPtrInst>( 2969 Builder.CreateGEP(ScalarDataTy, Ptr, Increment)); 2970 PartPtr->setIsInBounds(InBounds); 2971 } 2972 2973 unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); 2974 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); 2975 }; 2976 2977 // Handle Stores: 2978 if (SI) { 2979 setDebugLocFromInst(SI); 2980 2981 for (unsigned Part = 0; Part < UF; ++Part) { 2982 Instruction *NewSI = nullptr; 2983 Value *StoredVal = State.get(StoredValue, Part); 2984 if (CreateGatherScatter) { 2985 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 2986 Value *VectorGep = State.get(Addr, Part); 2987 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, 2988 MaskPart); 2989 } else { 2990 if (Reverse) { 2991 // If we store to reverse consecutive memory locations, then we need 2992 // to reverse the order of elements in the stored value. 2993 StoredVal = reverseVector(StoredVal); 2994 // We don't want to update the value in the map as it might be used in 2995 // another expression. So don't call resetVectorValue(StoredVal). 2996 } 2997 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0))); 2998 if (isMaskRequired) 2999 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, 3000 BlockInMaskParts[Part]); 3001 else 3002 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); 3003 } 3004 addMetadata(NewSI, SI); 3005 } 3006 return; 3007 } 3008 3009 // Handle loads. 3010 assert(LI && "Must have a load instruction"); 3011 setDebugLocFromInst(LI); 3012 for (unsigned Part = 0; Part < UF; ++Part) { 3013 Value *NewLI; 3014 if (CreateGatherScatter) { 3015 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 3016 Value *VectorGep = State.get(Addr, Part); 3017 NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart, 3018 nullptr, "wide.masked.gather"); 3019 addMetadata(NewLI, LI); 3020 } else { 3021 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0))); 3022 if (isMaskRequired) 3023 NewLI = Builder.CreateMaskedLoad( 3024 DataTy, VecPtr, Alignment, BlockInMaskParts[Part], 3025 PoisonValue::get(DataTy), "wide.masked.load"); 3026 else 3027 NewLI = 3028 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); 3029 3030 // Add metadata to the load, but setVectorValue to the reverse shuffle. 
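      // For example (illustrative IR, VF = 4, hypothetical value names), a
      // reverse consecutive load handled below becomes:
      //   %wide.load = load <4 x i32>, <4 x i32>* %ptr
      //   %reverse   = shufflevector <4 x i32> %wide.load, <4 x i32> poison,
      //                              <4 x i32> <i32 3, i32 2, i32 1, i32 0>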
3031 addMetadata(NewLI, LI); 3032 if (Reverse) 3033 NewLI = reverseVector(NewLI); 3034 } 3035 3036 State.set(Def, NewLI, Part); 3037 } 3038 } 3039 3040 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPValue *Def, 3041 VPUser &User, 3042 const VPIteration &Instance, 3043 bool IfPredicateInstr, 3044 VPTransformState &State) { 3045 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 3046 3047 // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for 3048 // the first lane and part. 3049 if (isa<NoAliasScopeDeclInst>(Instr)) 3050 if (!Instance.isFirstIteration()) 3051 return; 3052 3053 setDebugLocFromInst(Instr); 3054 3055 // Does this instruction return a value ? 3056 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 3057 3058 Instruction *Cloned = Instr->clone(); 3059 if (!IsVoidRetTy) 3060 Cloned->setName(Instr->getName() + ".cloned"); 3061 3062 State.Builder.SetInsertPoint(Builder.GetInsertBlock(), 3063 Builder.GetInsertPoint()); 3064 // Replace the operands of the cloned instructions with their scalar 3065 // equivalents in the new loop. 3066 for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) { 3067 auto *Operand = dyn_cast<Instruction>(Instr->getOperand(op)); 3068 auto InputInstance = Instance; 3069 if (!Operand || !OrigLoop->contains(Operand) || 3070 (Cost->isUniformAfterVectorization(Operand, State.VF))) 3071 InputInstance.Lane = VPLane::getFirstLane(); 3072 auto *NewOp = State.get(User.getOperand(op), InputInstance); 3073 Cloned->setOperand(op, NewOp); 3074 } 3075 addNewMetadata(Cloned, Instr); 3076 3077 // Place the cloned scalar in the new loop. 3078 Builder.Insert(Cloned); 3079 3080 State.set(Def, Cloned, Instance); 3081 3082 // If we just cloned a new assumption, add it the assumption cache. 3083 if (auto *II = dyn_cast<AssumeInst>(Cloned)) 3084 AC->registerAssumption(II); 3085 3086 // End if-block. 3087 if (IfPredicateInstr) 3088 PredicatedInstructions.push_back(Cloned); 3089 } 3090 3091 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start, 3092 Value *End, Value *Step, 3093 Instruction *DL) { 3094 BasicBlock *Header = L->getHeader(); 3095 BasicBlock *Latch = L->getLoopLatch(); 3096 // As we're just creating this loop, it's possible no latch exists 3097 // yet. If so, use the header as this will be a single block loop. 3098 if (!Latch) 3099 Latch = Header; 3100 3101 IRBuilder<> B(&*Header->getFirstInsertionPt()); 3102 Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction); 3103 setDebugLocFromInst(OldInst, &B); 3104 auto *Induction = B.CreatePHI(Start->getType(), 2, "index"); 3105 3106 B.SetInsertPoint(Latch->getTerminator()); 3107 setDebugLocFromInst(OldInst, &B); 3108 3109 // Create i+1 and fill the PHINode. 3110 // 3111 // If the tail is not folded, we know that End - Start >= Step (either 3112 // statically or through the minimum iteration checks). We also know that both 3113 // Start % Step == 0 and End % Step == 0. We exit the vector loop if %IV + 3114 // %Step == %End. Hence we must exit the loop before %IV + %Step unsigned 3115 // overflows and we can mark the induction increment as NUW. 3116 Value *Next = B.CreateAdd(Induction, Step, "index.next", 3117 /*NUW=*/!Cost->foldTailByMasking(), /*NSW=*/false); 3118 Induction->addIncoming(Start, L->getLoopPreheader()); 3119 Induction->addIncoming(Next, Latch); 3120 // Create the compare. 
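  // In shorthand (illustrative, hypothetical value names), the control flow
  // assembled here is:
  //   %index      = phi i64 [ %start, %preheader ], [ %index.next, %latch ]
  //   %index.next = add nuw i64 %index, %step   ; nuw only if tail not folded
  //   %cmp        = icmp eq i64 %index.next, %end
  //   br i1 %cmp, label %exit, label %header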
3121 Value *ICmp = B.CreateICmpEQ(Next, End); 3122 B.CreateCondBr(ICmp, L->getUniqueExitBlock(), Header); 3123 3124 // Now we have two terminators. Remove the old one from the block. 3125 Latch->getTerminator()->eraseFromParent(); 3126 3127 return Induction; 3128 } 3129 3130 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { 3131 if (TripCount) 3132 return TripCount; 3133 3134 assert(L && "Create Trip Count for null loop."); 3135 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 3136 // Find the loop boundaries. 3137 ScalarEvolution *SE = PSE.getSE(); 3138 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 3139 assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && 3140 "Invalid loop count"); 3141 3142 Type *IdxTy = Legal->getWidestInductionType(); 3143 assert(IdxTy && "No type for induction"); 3144 3145 // The exit count might have the type of i64 while the phi is i32. This can 3146 // happen if we have an induction variable that is sign extended before the 3147 // compare. The only way that we get a backedge taken count is that the 3148 // induction variable was signed and as such will not overflow. In such a case 3149 // truncation is legal. 3150 if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) > 3151 IdxTy->getPrimitiveSizeInBits()) 3152 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy); 3153 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); 3154 3155 // Get the total trip count from the count by adding 1. 3156 const SCEV *ExitCount = SE->getAddExpr( 3157 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 3158 3159 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); 3160 3161 // Expand the trip count and place the new instructions in the preheader. 3162 // Notice that the pre-header does not change, only the loop body. 3163 SCEVExpander Exp(*SE, DL, "induction"); 3164 3165 // Count holds the overall loop count (N). 3166 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), 3167 L->getLoopPreheader()->getTerminator()); 3168 3169 if (TripCount->getType()->isPointerTy()) 3170 TripCount = 3171 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", 3172 L->getLoopPreheader()->getTerminator()); 3173 3174 return TripCount; 3175 } 3176 3177 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { 3178 if (VectorTripCount) 3179 return VectorTripCount; 3180 3181 Value *TC = getOrCreateTripCount(L); 3182 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 3183 3184 Type *Ty = TC->getType(); 3185 // This is where we can make the step a runtime constant. 3186 Value *Step = createStepForVF(Builder, ConstantInt::get(Ty, UF), VF); 3187 3188 // If the tail is to be folded by masking, round the number of iterations N 3189 // up to a multiple of Step instead of rounding down. This is done by first 3190 // adding Step-1 and then rounding down. Note that it's ok if this addition 3191 // overflows: the vector induction variable will eventually wrap to zero given 3192 // that it starts at zero and its Step is a power of two; the loop will then 3193 // exit, with the last early-exit vector comparison also producing all-true. 
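  // For example, with VF * UF = 4 and a trip count of 10, the rounding below
  // yields 10 + 3 = 13, so the vector trip count becomes 13 - (13 % 4) = 12
  // and the final, partially masked vector iteration covers the tail.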
3194 if (Cost->foldTailByMasking()) { 3195 assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && 3196 "VF*UF must be a power of 2 when folding tail by masking"); 3197 assert(!VF.isScalable() && 3198 "Tail folding not yet supported for scalable vectors"); 3199 TC = Builder.CreateAdd( 3200 TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up"); 3201 } 3202 3203 // Now we need to generate the expression for the part of the loop that the 3204 // vectorized body will execute. This is equal to N - (N % Step) if scalar 3205 // iterations are not required for correctness, or N - Step, otherwise. Step 3206 // is equal to the vectorization factor (number of SIMD elements) times the 3207 // unroll factor (number of SIMD instructions). 3208 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); 3209 3210 // There are cases where we *must* run at least one iteration in the remainder 3211 // loop. See the cost model for when this can happen. If the step evenly 3212 // divides the trip count, we set the remainder to be equal to the step. If 3213 // the step does not evenly divide the trip count, no adjustment is necessary 3214 // since there will already be scalar iterations. Note that the minimum 3215 // iterations check ensures that N >= Step. 3216 if (Cost->requiresScalarEpilogue(VF)) { 3217 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); 3218 R = Builder.CreateSelect(IsZero, Step, R); 3219 } 3220 3221 VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); 3222 3223 return VectorTripCount; 3224 } 3225 3226 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, 3227 const DataLayout &DL) { 3228 // Verify that V is a vector type with same number of elements as DstVTy. 3229 auto *DstFVTy = cast<FixedVectorType>(DstVTy); 3230 unsigned VF = DstFVTy->getNumElements(); 3231 auto *SrcVecTy = cast<FixedVectorType>(V->getType()); 3232 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); 3233 Type *SrcElemTy = SrcVecTy->getElementType(); 3234 Type *DstElemTy = DstFVTy->getElementType(); 3235 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 3236 "Vector elements must have same size"); 3237 3238 // Do a direct cast if element types are castable. 3239 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 3240 return Builder.CreateBitOrPointerCast(V, DstFVTy); 3241 } 3242 // V cannot be directly casted to desired vector type. 3243 // May happen when V is a floating point vector but DstVTy is a vector of 3244 // pointers or vice-versa. Handle this using a two-step bitcast using an 3245 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 3246 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 3247 "Only one type should be a pointer type"); 3248 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 3249 "Only one type should be a floating point type"); 3250 Type *IntTy = 3251 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 3252 auto *VecIntTy = FixedVectorType::get(IntTy, VF); 3253 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 3254 return Builder.CreateBitOrPointerCast(CastVal, DstFVTy); 3255 } 3256 3257 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, 3258 BasicBlock *Bypass) { 3259 Value *Count = getOrCreateTripCount(L); 3260 // Reuse existing vector loop preheader for TC checks. 3261 // Note that new preheader block is generated for vector loop. 
3262 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 3263 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 3264 3265 // Generate code to check if the loop's trip count is less than VF * UF, or 3266 // equal to it in case a scalar epilogue is required; this implies that the 3267 // vector trip count is zero. This check also covers the case where adding one 3268 // to the backedge-taken count overflowed leading to an incorrect trip count 3269 // of zero. In this case we will also jump to the scalar loop. 3270 auto P = Cost->requiresScalarEpilogue(VF) ? ICmpInst::ICMP_ULE 3271 : ICmpInst::ICMP_ULT; 3272 3273 // If tail is to be folded, vector loop takes care of all iterations. 3274 Value *CheckMinIters = Builder.getFalse(); 3275 if (!Cost->foldTailByMasking()) { 3276 Value *Step = 3277 createStepForVF(Builder, ConstantInt::get(Count->getType(), UF), VF); 3278 CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check"); 3279 } 3280 // Create new preheader for vector loop. 3281 LoopVectorPreHeader = 3282 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, 3283 "vector.ph"); 3284 3285 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 3286 DT->getNode(Bypass)->getIDom()) && 3287 "TC check is expected to dominate Bypass"); 3288 3289 // Update dominator for Bypass & LoopExit (if needed). 3290 DT->changeImmediateDominator(Bypass, TCCheckBlock); 3291 if (!Cost->requiresScalarEpilogue(VF)) 3292 // If there is an epilogue which must run, there's no edge from the 3293 // middle block to exit blocks and thus no need to update the immediate 3294 // dominator of the exit blocks. 3295 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 3296 3297 ReplaceInstWithInst( 3298 TCCheckBlock->getTerminator(), 3299 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 3300 LoopBypassBlocks.push_back(TCCheckBlock); 3301 } 3302 3303 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { 3304 3305 BasicBlock *const SCEVCheckBlock = 3306 RTChecks.emitSCEVChecks(L, Bypass, LoopVectorPreHeader, LoopExitBlock); 3307 if (!SCEVCheckBlock) 3308 return nullptr; 3309 3310 assert(!(SCEVCheckBlock->getParent()->hasOptSize() || 3311 (OptForSizeBasedOnProfile && 3312 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) && 3313 "Cannot SCEV check stride or overflow when optimizing for size"); 3314 3315 3316 // Update dominator only if this is first RT check. 3317 if (LoopBypassBlocks.empty()) { 3318 DT->changeImmediateDominator(Bypass, SCEVCheckBlock); 3319 if (!Cost->requiresScalarEpilogue(VF)) 3320 // If there is an epilogue which must run, there's no edge from the 3321 // middle block to exit blocks and thus no need to update the immediate 3322 // dominator of the exit blocks. 3323 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock); 3324 } 3325 3326 LoopBypassBlocks.push_back(SCEVCheckBlock); 3327 AddedSafetyChecks = true; 3328 return SCEVCheckBlock; 3329 } 3330 3331 BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, 3332 BasicBlock *Bypass) { 3333 // VPlan-native path does not do any analysis for runtime checks currently. 3334 if (EnableVPlanNativePath) 3335 return nullptr; 3336 3337 BasicBlock *const MemCheckBlock = 3338 RTChecks.emitMemRuntimeChecks(L, Bypass, LoopVectorPreHeader); 3339 3340 // Check if we generated code that checks in runtime if arrays overlap. We put 3341 // the checks into a separate block to make the more common case of few 3342 // elements faster. 
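  // As a sketch: for a loop like 'a[i] = b[i] + c[i]', the emitted checks
  // verify at run time that the range written through 'a' does not overlap
  // the ranges read through 'b' or 'c'; if they might, the bypass branch
  // falls back to the scalar loop.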
3343 if (!MemCheckBlock) 3344 return nullptr; 3345 3346 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) { 3347 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && 3348 "Cannot emit memory checks when optimizing for size, unless forced " 3349 "to vectorize."); 3350 ORE->emit([&]() { 3351 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize", 3352 L->getStartLoc(), L->getHeader()) 3353 << "Code-size may be reduced by not forcing " 3354 "vectorization, or by source-code modifications " 3355 "eliminating the need for runtime checks " 3356 "(e.g., adding 'restrict')."; 3357 }); 3358 } 3359 3360 LoopBypassBlocks.push_back(MemCheckBlock); 3361 3362 AddedSafetyChecks = true; 3363 3364 // We currently don't use LoopVersioning for the actual loop cloning but we 3365 // still use it to add the noalias metadata. 3366 LVer = std::make_unique<LoopVersioning>( 3367 *Legal->getLAI(), 3368 Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, 3369 DT, PSE.getSE()); 3370 LVer->prepareNoAliasMetadata(); 3371 return MemCheckBlock; 3372 } 3373 3374 Value *InnerLoopVectorizer::emitTransformedIndex( 3375 IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL, 3376 const InductionDescriptor &ID) const { 3377 3378 SCEVExpander Exp(*SE, DL, "induction"); 3379 auto Step = ID.getStep(); 3380 auto StartValue = ID.getStartValue(); 3381 assert(Index->getType()->getScalarType() == Step->getType() && 3382 "Index scalar type does not match StepValue type"); 3383 3384 // Note: the IR at this point is broken. We cannot use SE to create any new 3385 // SCEV and then expand it, hoping that SCEV's simplification will give us 3386 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may 3387 // lead to various SCEV crashes. So all we can do is to use builder and rely 3388 // on InstCombine for future simplifications. Here we handle some trivial 3389 // cases only. 3390 auto CreateAdd = [&B](Value *X, Value *Y) { 3391 assert(X->getType() == Y->getType() && "Types don't match!"); 3392 if (auto *CX = dyn_cast<ConstantInt>(X)) 3393 if (CX->isZero()) 3394 return Y; 3395 if (auto *CY = dyn_cast<ConstantInt>(Y)) 3396 if (CY->isZero()) 3397 return X; 3398 return B.CreateAdd(X, Y); 3399 }; 3400 3401 // We allow X to be a vector type, in which case Y will potentially be 3402 // splatted into a vector with the same element count. 3403 auto CreateMul = [&B](Value *X, Value *Y) { 3404 assert(X->getType()->getScalarType() == Y->getType() && 3405 "Types don't match!"); 3406 if (auto *CX = dyn_cast<ConstantInt>(X)) 3407 if (CX->isOne()) 3408 return Y; 3409 if (auto *CY = dyn_cast<ConstantInt>(Y)) 3410 if (CY->isOne()) 3411 return X; 3412 VectorType *XVTy = dyn_cast<VectorType>(X->getType()); 3413 if (XVTy && !isa<VectorType>(Y->getType())) 3414 Y = B.CreateVectorSplat(XVTy->getElementCount(), Y); 3415 return B.CreateMul(X, Y); 3416 }; 3417 3418 // Get a suitable insert point for SCEV expansion. For blocks in the vector 3419 // loop, choose the end of the vector loop header (=LoopVectorBody), because 3420 // the DomTree is not kept up-to-date for additional blocks generated in the 3421 // vector loop. By using the header as insertion point, we guarantee that the 3422 // expanded instructions dominate all their uses. 
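  // In shorthand (illustrative only), the transformed index computed by the
  // switch below is:
  //   IK_IntInduction: StartValue + Index * Step
  //   IK_PtrInduction: getelementptr ElementType, StartValue, Index * Step
  //   IK_FpInduction:  StartValue fadd/fsub (Index fmul Step)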
  auto GetInsertPoint = [this, &B]() {
    BasicBlock *InsertBB = B.GetInsertPoint()->getParent();
    if (InsertBB != LoopVectorBody &&
        LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB))
      return LoopVectorBody->getTerminator();
    return &*B.GetInsertPoint();
  };

  switch (ID.getKind()) {
  case InductionDescriptor::IK_IntInduction: {
    assert(!isa<VectorType>(Index->getType()) &&
           "Vector indices not supported for integer inductions yet");
    assert(Index->getType() == StartValue->getType() &&
           "Index type does not match StartValue type");
    if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
      return B.CreateSub(StartValue, Index);
    auto *Offset = CreateMul(
        Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()));
    return CreateAdd(StartValue, Offset);
  }
  case InductionDescriptor::IK_PtrInduction: {
    assert(isa<SCEVConstant>(Step) &&
           "Expected constant step for pointer induction");
    return B.CreateGEP(
        ID.getElementType(), StartValue,
        CreateMul(Index,
                  Exp.expandCodeFor(Step, Index->getType()->getScalarType(),
                                    GetInsertPoint())));
  }
  case InductionDescriptor::IK_FpInduction: {
    assert(!isa<VectorType>(Index->getType()) &&
           "Vector indices not supported for FP inductions yet");
    assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
    auto InductionBinOp = ID.getInductionBinOp();
    assert(InductionBinOp &&
           (InductionBinOp->getOpcode() == Instruction::FAdd ||
            InductionBinOp->getOpcode() == Instruction::FSub) &&
           "Original bin op should be defined for FP induction");

    Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
    Value *MulExp = B.CreateFMul(StepValue, Index);
    return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
                         "induction");
  }
  case InductionDescriptor::IK_NoInduction:
    return nullptr;
  }
  llvm_unreachable("invalid enum");
}

Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
  LoopScalarBody = OrigLoop->getHeader();
  LoopVectorPreHeader = OrigLoop->getLoopPreheader();
  assert(LoopVectorPreHeader && "Invalid loop structure");
  LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
  assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF)) &&
         "multiple exit loop without required epilogue?");

  LoopMiddleBlock =
      SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
                 LI, nullptr, Twine(Prefix) + "middle.block");
  LoopScalarPreHeader =
      SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
                 nullptr, Twine(Prefix) + "scalar.ph");

  auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();

  // Set up the middle block terminator. Two cases:
  // 1) If we know that we must execute the scalar epilogue, emit an
  //    unconditional branch.
  // 2) Otherwise, we must have a single unique exit block (due to how we
  //    implement the multiple exit case). In this case, set up a conditional
  //    branch from the middle block to the loop scalar preheader, and the
  //    exit block. completeLoopSkeleton will update the condition to use an
  //    iteration check, if required to decide whether to execute the remainder.
  BranchInst *BrInst = Cost->requiresScalarEpilogue(VF) ?
3499 BranchInst::Create(LoopScalarPreHeader) : 3500 BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, 3501 Builder.getTrue()); 3502 BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3503 ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst); 3504 3505 // We intentionally don't let SplitBlock to update LoopInfo since 3506 // LoopVectorBody should belong to another loop than LoopVectorPreHeader. 3507 // LoopVectorBody is explicitly added to the correct place few lines later. 3508 LoopVectorBody = 3509 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 3510 nullptr, nullptr, Twine(Prefix) + "vector.body"); 3511 3512 // Update dominator for loop exit. 3513 if (!Cost->requiresScalarEpilogue(VF)) 3514 // If there is an epilogue which must run, there's no edge from the 3515 // middle block to exit blocks and thus no need to update the immediate 3516 // dominator of the exit blocks. 3517 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); 3518 3519 // Create and register the new vector loop. 3520 Loop *Lp = LI->AllocateLoop(); 3521 Loop *ParentLoop = OrigLoop->getParentLoop(); 3522 3523 // Insert the new loop into the loop nest and register the new basic blocks 3524 // before calling any utilities such as SCEV that require valid LoopInfo. 3525 if (ParentLoop) { 3526 ParentLoop->addChildLoop(Lp); 3527 } else { 3528 LI->addTopLevelLoop(Lp); 3529 } 3530 Lp->addBasicBlockToLoop(LoopVectorBody, *LI); 3531 return Lp; 3532 } 3533 3534 void InnerLoopVectorizer::createInductionResumeValues( 3535 Loop *L, Value *VectorTripCount, 3536 std::pair<BasicBlock *, Value *> AdditionalBypass) { 3537 assert(VectorTripCount && L && "Expected valid arguments"); 3538 assert(((AdditionalBypass.first && AdditionalBypass.second) || 3539 (!AdditionalBypass.first && !AdditionalBypass.second)) && 3540 "Inconsistent information about additional bypass."); 3541 // We are going to resume the execution of the scalar loop. 3542 // Go over all of the induction variables that we found and fix the 3543 // PHIs that are left in the scalar version of the loop. 3544 // The starting values of PHI nodes depend on the counter of the last 3545 // iteration in the vectorized loop. 3546 // If we come from a bypass edge then we need to start from the original 3547 // start value. 3548 for (auto &InductionEntry : Legal->getInductionVars()) { 3549 PHINode *OrigPhi = InductionEntry.first; 3550 InductionDescriptor II = InductionEntry.second; 3551 3552 // Create phi nodes to merge from the backedge-taken check block. 3553 PHINode *BCResumeVal = 3554 PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", 3555 LoopScalarPreHeader->getTerminator()); 3556 // Copy original phi DL over to the new one. 3557 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); 3558 Value *&EndValue = IVEndValues[OrigPhi]; 3559 Value *EndValueFromAdditionalBypass = AdditionalBypass.second; 3560 if (OrigPhi == OldInduction) { 3561 // We know what the end value is. 3562 EndValue = VectorTripCount; 3563 } else { 3564 IRBuilder<> B(L->getLoopPreheader()->getTerminator()); 3565 3566 // Fast-math-flags propagate from the original induction instruction. 
3567 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) 3568 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); 3569 3570 Type *StepType = II.getStep()->getType(); 3571 Instruction::CastOps CastOp = 3572 CastInst::getCastOpcode(VectorTripCount, true, StepType, true); 3573 Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd"); 3574 const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout(); 3575 EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3576 EndValue->setName("ind.end"); 3577 3578 // Compute the end value for the additional bypass (if applicable). 3579 if (AdditionalBypass.first) { 3580 B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt())); 3581 CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true, 3582 StepType, true); 3583 CRD = 3584 B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd"); 3585 EndValueFromAdditionalBypass = 3586 emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3587 EndValueFromAdditionalBypass->setName("ind.end"); 3588 } 3589 } 3590 // The new PHI merges the original incoming value, in case of a bypass, 3591 // or the value at the end of the vectorized loop. 3592 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); 3593 3594 // Fix the scalar body counter (PHI node). 3595 // The old induction's phi node in the scalar body needs the truncated 3596 // value. 3597 for (BasicBlock *BB : LoopBypassBlocks) 3598 BCResumeVal->addIncoming(II.getStartValue(), BB); 3599 3600 if (AdditionalBypass.first) 3601 BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first, 3602 EndValueFromAdditionalBypass); 3603 3604 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); 3605 } 3606 } 3607 3608 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L, 3609 MDNode *OrigLoopID) { 3610 assert(L && "Expected valid loop."); 3611 3612 // The trip counts should be cached by now. 3613 Value *Count = getOrCreateTripCount(L); 3614 Value *VectorTripCount = getOrCreateVectorTripCount(L); 3615 3616 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3617 3618 // Add a check in the middle block to see if we have completed 3619 // all of the iterations in the first vector loop. Three cases: 3620 // 1) If we require a scalar epilogue, there is no conditional branch as 3621 // we unconditionally branch to the scalar preheader. Do nothing. 3622 // 2) If (N - N%VF) == N, then we *don't* need to run the remainder. 3623 // Thus if tail is to be folded, we know we don't need to run the 3624 // remainder and we can use the previous value for the condition (true). 3625 // 3) Otherwise, construct a runtime check. 3626 if (!Cost->requiresScalarEpilogue(VF) && !Cost->foldTailByMasking()) { 3627 Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, 3628 Count, VectorTripCount, "cmp.n", 3629 LoopMiddleBlock->getTerminator()); 3630 3631 // Here we use the same DebugLoc as the scalar loop latch terminator instead 3632 // of the corresponding compare because they may have ended up with 3633 // different line numbers and we want to avoid awkward line stepping while 3634 // debugging. Eg. if the compare has got a line number inside the loop. 3635 CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3636 cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN); 3637 } 3638 3639 // Get ready to start creating new instructions into the vectorized body. 
3640 assert(LoopVectorPreHeader == L->getLoopPreheader() && 3641 "Inconsistent vector loop preheader"); 3642 Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); 3643 3644 Optional<MDNode *> VectorizedLoopID = 3645 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 3646 LLVMLoopVectorizeFollowupVectorized}); 3647 if (VectorizedLoopID.hasValue()) { 3648 L->setLoopID(VectorizedLoopID.getValue()); 3649 3650 // Do not setAlreadyVectorized if loop attributes have been defined 3651 // explicitly. 3652 return LoopVectorPreHeader; 3653 } 3654 3655 // Keep all loop hints from the original loop on the vector loop (we'll 3656 // replace the vectorizer-specific hints below). 3657 if (MDNode *LID = OrigLoop->getLoopID()) 3658 L->setLoopID(LID); 3659 3660 LoopVectorizeHints Hints(L, true, *ORE); 3661 Hints.setAlreadyVectorized(); 3662 3663 #ifdef EXPENSIVE_CHECKS 3664 assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 3665 LI->verify(*DT); 3666 #endif 3667 3668 return LoopVectorPreHeader; 3669 } 3670 3671 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { 3672 /* 3673 In this function we generate a new loop. The new loop will contain 3674 the vectorized instructions while the old loop will continue to run the 3675 scalar remainder. 3676 3677 [ ] <-- loop iteration number check. 3678 / | 3679 / v 3680 | [ ] <-- vector loop bypass (may consist of multiple blocks). 3681 | / | 3682 | / v 3683 || [ ] <-- vector pre header. 3684 |/ | 3685 | v 3686 | [ ] \ 3687 | [ ]_| <-- vector loop. 3688 | | 3689 | v 3690 \ -[ ] <--- middle-block. 3691 \/ | 3692 /\ v 3693 | ->[ ] <--- new preheader. 3694 | | 3695 (opt) v <-- edge from middle to exit iff epilogue is not required. 3696 | [ ] \ 3697 | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue). 3698 \ | 3699 \ v 3700 >[ ] <-- exit block(s). 3701 ... 3702 */ 3703 3704 // Get the metadata of the original loop before it gets modified. 3705 MDNode *OrigLoopID = OrigLoop->getLoopID(); 3706 3707 // Workaround! Compute the trip count of the original loop and cache it 3708 // before we start modifying the CFG. This code has a systemic problem 3709 // wherein it tries to run analysis over partially constructed IR; this is 3710 // wrong, and not simply for SCEV. The trip count of the original loop 3711 // simply happens to be prone to hitting this in practice. In theory, we 3712 // can hit the same issue for any SCEV, or ValueTracking query done during 3713 // mutation. See PR49900. 3714 getOrCreateTripCount(OrigLoop); 3715 3716 // Create an empty vector loop, and prepare basic blocks for the runtime 3717 // checks. 3718 Loop *Lp = createVectorLoopSkeleton(""); 3719 3720 // Now, compare the new count to zero. If it is zero skip the vector loop and 3721 // jump to the scalar loop. This check also covers the case where the 3722 // backedge-taken count is uint##_max: adding one to it will overflow leading 3723 // to an incorrect trip count of zero. In this (rare) case we will also jump 3724 // to the scalar loop. 3725 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader); 3726 3727 // Generate the code to check any assumptions that we've made for SCEV 3728 // expressions. 3729 emitSCEVChecks(Lp, LoopScalarPreHeader); 3730 3731 // Generate the code that checks in runtime if arrays overlap. We put the 3732 // checks into a separate block to make the more common case of few elements 3733 // faster. 
3734 emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 3735 3736 // Some loops have a single integer induction variable, while other loops 3737 // don't. One example is c++ iterators that often have multiple pointer 3738 // induction variables. In the code below we also support a case where we 3739 // don't have a single induction variable. 3740 // 3741 // We try to obtain an induction variable from the original loop as hard 3742 // as possible. However if we don't find one that: 3743 // - is an integer 3744 // - counts from zero, stepping by one 3745 // - is the size of the widest induction variable type 3746 // then we create a new one. 3747 OldInduction = Legal->getPrimaryInduction(); 3748 Type *IdxTy = Legal->getWidestInductionType(); 3749 Value *StartIdx = ConstantInt::get(IdxTy, 0); 3750 // The loop step is equal to the vectorization factor (num of SIMD elements) 3751 // times the unroll factor (num of SIMD instructions). 3752 Builder.SetInsertPoint(&*Lp->getHeader()->getFirstInsertionPt()); 3753 Value *Step = createStepForVF(Builder, ConstantInt::get(IdxTy, UF), VF); 3754 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 3755 Induction = 3756 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 3757 getDebugLocFromInstOrOperands(OldInduction)); 3758 3759 // Emit phis for the new starting index of the scalar loop. 3760 createInductionResumeValues(Lp, CountRoundDown); 3761 3762 return completeLoopSkeleton(Lp, OrigLoopID); 3763 } 3764 3765 // Fix up external users of the induction variable. At this point, we are 3766 // in LCSSA form, with all external PHIs that use the IV having one input value, 3767 // coming from the remainder loop. We need those PHIs to also have a correct 3768 // value for the IV when arriving directly from the middle block. 3769 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, 3770 const InductionDescriptor &II, 3771 Value *CountRoundDown, Value *EndValue, 3772 BasicBlock *MiddleBlock) { 3773 // There are two kinds of external IV usages - those that use the value 3774 // computed in the last iteration (the PHI) and those that use the penultimate 3775 // value (the value that feeds into the phi from the loop latch). 3776 // We allow both, but they, obviously, have different values. 3777 3778 assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block"); 3779 3780 DenseMap<Value *, Value *> MissingVals; 3781 3782 // An external user of the last iteration's value should see the value that 3783 // the remainder loop uses to initialize its own IV. 3784 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch()); 3785 for (User *U : PostInc->users()) { 3786 Instruction *UI = cast<Instruction>(U); 3787 if (!OrigLoop->contains(UI)) { 3788 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3789 MissingVals[UI] = EndValue; 3790 } 3791 } 3792 3793 // An external user of the penultimate value need to see EndValue - Step. 3794 // The simplest way to get this is to recompute it from the constituent SCEVs, 3795 // that is Start + (Step * (CRD - 1)). 3796 for (User *U : OrigPhi->users()) { 3797 auto *UI = cast<Instruction>(U); 3798 if (!OrigLoop->contains(UI)) { 3799 const DataLayout &DL = 3800 OrigLoop->getHeader()->getModule()->getDataLayout(); 3801 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3802 3803 IRBuilder<> B(MiddleBlock->getTerminator()); 3804 3805 // Fast-math-flags propagate from the original induction instruction. 
3806 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) 3807 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); 3808 3809 Value *CountMinusOne = B.CreateSub( 3810 CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1)); 3811 Value *CMO = 3812 !II.getStep()->getType()->isIntegerTy() 3813 ? B.CreateCast(Instruction::SIToFP, CountMinusOne, 3814 II.getStep()->getType()) 3815 : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType()); 3816 CMO->setName("cast.cmo"); 3817 Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II); 3818 Escape->setName("ind.escape"); 3819 MissingVals[UI] = Escape; 3820 } 3821 } 3822 3823 for (auto &I : MissingVals) { 3824 PHINode *PHI = cast<PHINode>(I.first); 3825 // One corner case we have to handle is two IVs "chasing" each-other, 3826 // that is %IV2 = phi [...], [ %IV1, %latch ] 3827 // In this case, if IV1 has an external use, we need to avoid adding both 3828 // "last value of IV1" and "penultimate value of IV2". So, verify that we 3829 // don't already have an incoming value for the middle block. 3830 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) 3831 PHI->addIncoming(I.second, MiddleBlock); 3832 } 3833 } 3834 3835 namespace { 3836 3837 struct CSEDenseMapInfo { 3838 static bool canHandle(const Instruction *I) { 3839 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || 3840 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); 3841 } 3842 3843 static inline Instruction *getEmptyKey() { 3844 return DenseMapInfo<Instruction *>::getEmptyKey(); 3845 } 3846 3847 static inline Instruction *getTombstoneKey() { 3848 return DenseMapInfo<Instruction *>::getTombstoneKey(); 3849 } 3850 3851 static unsigned getHashValue(const Instruction *I) { 3852 assert(canHandle(I) && "Unknown instruction!"); 3853 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), 3854 I->value_op_end())); 3855 } 3856 3857 static bool isEqual(const Instruction *LHS, const Instruction *RHS) { 3858 if (LHS == getEmptyKey() || RHS == getEmptyKey() || 3859 LHS == getTombstoneKey() || RHS == getTombstoneKey()) 3860 return LHS == RHS; 3861 return LHS->isIdenticalTo(RHS); 3862 } 3863 }; 3864 3865 } // end anonymous namespace 3866 3867 ///Perform cse of induction variable instructions. 3868 static void cse(BasicBlock *BB) { 3869 // Perform simple cse. 3870 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 3871 for (Instruction &In : llvm::make_early_inc_range(*BB)) { 3872 if (!CSEDenseMapInfo::canHandle(&In)) 3873 continue; 3874 3875 // Check if we can replace this instruction with any of the 3876 // visited instructions. 3877 if (Instruction *V = CSEMap.lookup(&In)) { 3878 In.replaceAllUsesWith(V); 3879 In.eraseFromParent(); 3880 continue; 3881 } 3882 3883 CSEMap[&In] = &In; 3884 } 3885 } 3886 3887 InstructionCost 3888 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF, 3889 bool &NeedToScalarize) const { 3890 Function *F = CI->getCalledFunction(); 3891 Type *ScalarRetTy = CI->getType(); 3892 SmallVector<Type *, 4> Tys, ScalarTys; 3893 for (auto &ArgOp : CI->args()) 3894 ScalarTys.push_back(ArgOp->getType()); 3895 3896 // Estimate cost of scalarized vector call. The source operands are assumed 3897 // to be vectors, so we need to extract individual elements from there, 3898 // execute VF scalar calls, and then gather the result into the vector return 3899 // value. 
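  // Roughly, the scalarization estimate computed below is
  //   VF * ScalarCallCost + ScalarizationOverhead
  // e.g. (made-up numbers) VF = 4, scalar call cost 10, overhead 6 -> 46.
  // This is then compared against the cost of a vector library call, if one
  // is available.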
3900 InstructionCost ScalarCallCost = 3901 TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput); 3902 if (VF.isScalar()) 3903 return ScalarCallCost; 3904 3905 // Compute corresponding vector type for return value and arguments. 3906 Type *RetTy = ToVectorTy(ScalarRetTy, VF); 3907 for (Type *ScalarTy : ScalarTys) 3908 Tys.push_back(ToVectorTy(ScalarTy, VF)); 3909 3910 // Compute costs of unpacking argument values for the scalar calls and 3911 // packing the return values to a vector. 3912 InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF); 3913 3914 InstructionCost Cost = 3915 ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost; 3916 3917 // If we can't emit a vector call for this function, then the currently found 3918 // cost is the cost we need to return. 3919 NeedToScalarize = true; 3920 VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 3921 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); 3922 3923 if (!TLI || CI->isNoBuiltin() || !VecFunc) 3924 return Cost; 3925 3926 // If the corresponding vector cost is cheaper, return its cost. 3927 InstructionCost VectorCallCost = 3928 TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput); 3929 if (VectorCallCost < Cost) { 3930 NeedToScalarize = false; 3931 Cost = VectorCallCost; 3932 } 3933 return Cost; 3934 } 3935 3936 static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) { 3937 if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy())) 3938 return Elt; 3939 return VectorType::get(Elt, VF); 3940 } 3941 3942 InstructionCost 3943 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, 3944 ElementCount VF) const { 3945 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 3946 assert(ID && "Expected intrinsic call!"); 3947 Type *RetTy = MaybeVectorizeType(CI->getType(), VF); 3948 FastMathFlags FMF; 3949 if (auto *FPMO = dyn_cast<FPMathOperator>(CI)) 3950 FMF = FPMO->getFastMathFlags(); 3951 3952 SmallVector<const Value *> Arguments(CI->args()); 3953 FunctionType *FTy = CI->getCalledFunction()->getFunctionType(); 3954 SmallVector<Type *> ParamTys; 3955 std::transform(FTy->param_begin(), FTy->param_end(), 3956 std::back_inserter(ParamTys), 3957 [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); }); 3958 3959 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF, 3960 dyn_cast<IntrinsicInst>(CI)); 3961 return TTI.getIntrinsicInstrCost(CostAttrs, 3962 TargetTransformInfo::TCK_RecipThroughput); 3963 } 3964 3965 static Type *smallestIntegerVectorType(Type *T1, Type *T2) { 3966 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3967 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3968 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; 3969 } 3970 3971 static Type *largestIntegerVectorType(Type *T1, Type *T2) { 3972 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3973 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3974 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; 3975 } 3976 3977 void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) { 3978 // For every instruction `I` in MinBWs, truncate the operands, create a 3979 // truncated version of `I` and reextend its result. InstCombine runs 3980 // later and will remove any ext/trunc pairs. 
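  // For example (illustrative IR), a <4 x i32> add whose result is known to
  // need only 8 bits is rewritten as:
  //   %x.tr = trunc <4 x i32> %x to <4 x i8>
  //   %y.tr = trunc <4 x i32> %y to <4 x i8>
  //   %s.tr = add <4 x i8> %x.tr, %y.tr
  //   %s    = zext <4 x i8> %s.tr to <4 x i32>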
3981 SmallPtrSet<Value *, 4> Erased; 3982 for (const auto &KV : Cost->getMinimalBitwidths()) { 3983 // If the value wasn't vectorized, we must maintain the original scalar 3984 // type. The absence of the value from State indicates that it 3985 // wasn't vectorized. 3986 // FIXME: Should not rely on getVPValue at this point. 3987 VPValue *Def = State.Plan->getVPValue(KV.first, true); 3988 if (!State.hasAnyVectorValue(Def)) 3989 continue; 3990 for (unsigned Part = 0; Part < UF; ++Part) { 3991 Value *I = State.get(Def, Part); 3992 if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I)) 3993 continue; 3994 Type *OriginalTy = I->getType(); 3995 Type *ScalarTruncatedTy = 3996 IntegerType::get(OriginalTy->getContext(), KV.second); 3997 auto *TruncatedTy = VectorType::get( 3998 ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getElementCount()); 3999 if (TruncatedTy == OriginalTy) 4000 continue; 4001 4002 IRBuilder<> B(cast<Instruction>(I)); 4003 auto ShrinkOperand = [&](Value *V) -> Value * { 4004 if (auto *ZI = dyn_cast<ZExtInst>(V)) 4005 if (ZI->getSrcTy() == TruncatedTy) 4006 return ZI->getOperand(0); 4007 return B.CreateZExtOrTrunc(V, TruncatedTy); 4008 }; 4009 4010 // The actual instruction modification depends on the instruction type, 4011 // unfortunately. 4012 Value *NewI = nullptr; 4013 if (auto *BO = dyn_cast<BinaryOperator>(I)) { 4014 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)), 4015 ShrinkOperand(BO->getOperand(1))); 4016 4017 // Any wrapping introduced by shrinking this operation shouldn't be 4018 // considered undefined behavior. So, we can't unconditionally copy 4019 // arithmetic wrapping flags to NewI. 4020 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false); 4021 } else if (auto *CI = dyn_cast<ICmpInst>(I)) { 4022 NewI = 4023 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)), 4024 ShrinkOperand(CI->getOperand(1))); 4025 } else if (auto *SI = dyn_cast<SelectInst>(I)) { 4026 NewI = B.CreateSelect(SI->getCondition(), 4027 ShrinkOperand(SI->getTrueValue()), 4028 ShrinkOperand(SI->getFalseValue())); 4029 } else if (auto *CI = dyn_cast<CastInst>(I)) { 4030 switch (CI->getOpcode()) { 4031 default: 4032 llvm_unreachable("Unhandled cast!"); 4033 case Instruction::Trunc: 4034 NewI = ShrinkOperand(CI->getOperand(0)); 4035 break; 4036 case Instruction::SExt: 4037 NewI = B.CreateSExtOrTrunc( 4038 CI->getOperand(0), 4039 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 4040 break; 4041 case Instruction::ZExt: 4042 NewI = B.CreateZExtOrTrunc( 4043 CI->getOperand(0), 4044 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 4045 break; 4046 } 4047 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { 4048 auto Elements0 = 4049 cast<VectorType>(SI->getOperand(0)->getType())->getElementCount(); 4050 auto *O0 = B.CreateZExtOrTrunc( 4051 SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0)); 4052 auto Elements1 = 4053 cast<VectorType>(SI->getOperand(1)->getType())->getElementCount(); 4054 auto *O1 = B.CreateZExtOrTrunc( 4055 SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1)); 4056 4057 NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask()); 4058 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) { 4059 // Don't do anything with the operands, just extend the result. 
4060 continue; 4061 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { 4062 auto Elements = 4063 cast<VectorType>(IE->getOperand(0)->getType())->getElementCount(); 4064 auto *O0 = B.CreateZExtOrTrunc( 4065 IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 4066 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); 4067 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); 4068 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { 4069 auto Elements = 4070 cast<VectorType>(EE->getOperand(0)->getType())->getElementCount(); 4071 auto *O0 = B.CreateZExtOrTrunc( 4072 EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 4073 NewI = B.CreateExtractElement(O0, EE->getOperand(2)); 4074 } else { 4075 // If we don't know what to do, be conservative and don't do anything. 4076 continue; 4077 } 4078 4079 // Lastly, extend the result. 4080 NewI->takeName(cast<Instruction>(I)); 4081 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); 4082 I->replaceAllUsesWith(Res); 4083 cast<Instruction>(I)->eraseFromParent(); 4084 Erased.insert(I); 4085 State.reset(Def, Res, Part); 4086 } 4087 } 4088 4089 // We'll have created a bunch of ZExts that are now parentless. Clean up. 4090 for (const auto &KV : Cost->getMinimalBitwidths()) { 4091 // If the value wasn't vectorized, we must maintain the original scalar 4092 // type. The absence of the value from State indicates that it 4093 // wasn't vectorized. 4094 // FIXME: Should not rely on getVPValue at this point. 4095 VPValue *Def = State.Plan->getVPValue(KV.first, true); 4096 if (!State.hasAnyVectorValue(Def)) 4097 continue; 4098 for (unsigned Part = 0; Part < UF; ++Part) { 4099 Value *I = State.get(Def, Part); 4100 ZExtInst *Inst = dyn_cast<ZExtInst>(I); 4101 if (Inst && Inst->use_empty()) { 4102 Value *NewI = Inst->getOperand(0); 4103 Inst->eraseFromParent(); 4104 State.reset(Def, NewI, Part); 4105 } 4106 } 4107 } 4108 } 4109 4110 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) { 4111 // Insert truncates and extends for any truncated instructions as hints to 4112 // InstCombine. 4113 if (VF.isVector()) 4114 truncateToMinimalBitwidths(State); 4115 4116 // Fix widened non-induction PHIs by setting up the PHI operands. 4117 if (OrigPHIsToFix.size()) { 4118 assert(EnableVPlanNativePath && 4119 "Unexpected non-induction PHIs for fixup in non VPlan-native path"); 4120 fixNonInductionPHIs(State); 4121 } 4122 4123 // At this point every instruction in the original loop is widened to a 4124 // vector form. Now we need to fix the recurrences in the loop. These PHI 4125 // nodes are currently empty because we did not want to introduce cycles. 4126 // This is the second stage of vectorizing recurrences. 4127 fixCrossIterationPHIs(State); 4128 4129 // Forget the original basic block. 4130 PSE.getSE()->forgetLoop(OrigLoop); 4131 4132 // If we inserted an edge from the middle block to the unique exit block, 4133 // update uses outside the loop (phis) to account for the newly inserted 4134 // edge. 4135 if (!Cost->requiresScalarEpilogue(VF)) { 4136 // Fix-up external users of the induction variables. 4137 for (auto &Entry : Legal->getInductionVars()) 4138 fixupIVUsers(Entry.first, Entry.second, 4139 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)), 4140 IVEndValues[Entry.first], LoopMiddleBlock); 4141 4142 fixLCSSAPHIs(State); 4143 } 4144 4145 for (Instruction *PI : PredicatedInstructions) 4146 sinkScalarOperands(&*PI); 4147 4148 // Remove redundant induction instructions. 
4149   cse(LoopVectorBody);
4150
4151   // Set/update profile weights for the vector and remainder loops as original
4152   // loop iterations are now distributed among them. Note that original loop
4153   // represented by LoopScalarBody becomes remainder loop after vectorization.
4154   //
4155   // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
4156   // end up with a slightly inaccurate result, but that should be OK since the
4157   // profile is not inherently precise anyway. Note also that a possible bypass
4158   // of the vector code due to legality checks is ignored, optimistically
4159   // assigning all the weight to the vector loop.
4160   //
4161   // For scalable vectorization we can't know at compile time how many
4162   // iterations of the loop are handled in one vector iteration, so instead
4163   // assume a pessimistic vscale of '1'.
4164   setProfileInfoAfterUnrolling(
4165       LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody),
4166       LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF);
4167 }
4168
4169 void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) {
4170   // In order to support recurrences we need to be able to vectorize Phi nodes.
4171   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4172   // stage #2: We now need to fix the recurrences by adding incoming edges to
4173   // the currently empty PHI nodes. At this point every instruction in the
4174   // original loop is widened to a vector form so we can use them to construct
4175   // the incoming edges.
4176   VPBasicBlock *Header = State.Plan->getEntry()->getEntryBasicBlock();
4177   for (VPRecipeBase &R : Header->phis()) {
4178     if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R))
4179       fixReduction(ReductionPhi, State);
4180     else if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
4181       fixFirstOrderRecurrence(FOR, State);
4182   }
4183 }
4184
4185 void InnerLoopVectorizer::fixFirstOrderRecurrence(VPWidenPHIRecipe *PhiR,
4186                                                   VPTransformState &State) {
4187   // This is the second phase of vectorizing first-order recurrences. An
4188   // overview of the transformation is described below. Suppose we have the
4189   // following loop.
4190   //
4191   //   for (int i = 0; i < n; ++i)
4192   //     b[i] = a[i] - a[i - 1];
4193   //
4194   // There is a first-order recurrence on "a". For this loop, the shorthand
4195   // scalar IR looks like:
4196   //
4197   //   scalar.ph:
4198   //     s_init = a[-1]
4199   //     br scalar.body
4200   //
4201   //   scalar.body:
4202   //     i = phi [0, scalar.ph], [i+1, scalar.body]
4203   //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
4204   //     s2 = a[i]
4205   //     b[i] = s2 - s1
4206   //     br cond, scalar.body, ...
4207   //
4208   // In this example, s1 is a recurrence because its value depends on the
4209   // previous iteration. In the first phase of vectorization, we created a
4210   // vector phi v1 for s1. We now complete the vectorization and produce the
4211   // shorthand vector IR shown below (for VF = 4, UF = 1).
4212 // 4213 // vector.ph: 4214 // v_init = vector(..., ..., ..., a[-1]) 4215 // br vector.body 4216 // 4217 // vector.body 4218 // i = phi [0, vector.ph], [i+4, vector.body] 4219 // v1 = phi [v_init, vector.ph], [v2, vector.body] 4220 // v2 = a[i, i+1, i+2, i+3]; 4221 // v3 = vector(v1(3), v2(0, 1, 2)) 4222 // b[i, i+1, i+2, i+3] = v2 - v3 4223 // br cond, vector.body, middle.block 4224 // 4225 // middle.block: 4226 // x = v2(3) 4227 // br scalar.ph 4228 // 4229 // scalar.ph: 4230 // s_init = phi [x, middle.block], [a[-1], otherwise] 4231 // br scalar.body 4232 // 4233 // After execution completes the vector loop, we extract the next value of 4234 // the recurrence (x) to use as the initial value in the scalar loop. 4235 4236 // Extract the last vector element in the middle block. This will be the 4237 // initial value for the recurrence when jumping to the scalar loop. 4238 VPValue *PreviousDef = PhiR->getBackedgeValue(); 4239 Value *Incoming = State.get(PreviousDef, UF - 1); 4240 auto *ExtractForScalar = Incoming; 4241 auto *IdxTy = Builder.getInt32Ty(); 4242 if (VF.isVector()) { 4243 auto *One = ConstantInt::get(IdxTy, 1); 4244 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4245 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF); 4246 auto *LastIdx = Builder.CreateSub(RuntimeVF, One); 4247 ExtractForScalar = Builder.CreateExtractElement(ExtractForScalar, LastIdx, 4248 "vector.recur.extract"); 4249 } 4250 // Extract the second last element in the middle block if the 4251 // Phi is used outside the loop. We need to extract the phi itself 4252 // and not the last element (the phi update in the current iteration). This 4253 // will be the value when jumping to the exit block from the LoopMiddleBlock, 4254 // when the scalar loop is not run at all. 4255 Value *ExtractForPhiUsedOutsideLoop = nullptr; 4256 if (VF.isVector()) { 4257 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF); 4258 auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2)); 4259 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement( 4260 Incoming, Idx, "vector.recur.extract.for.phi"); 4261 } else if (UF > 1) 4262 // When loop is unrolled without vectorizing, initialize 4263 // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value 4264 // of `Incoming`. This is analogous to the vectorized case above: extracting 4265 // the second last element when VF > 1. 4266 ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2); 4267 4268 // Fix the initial value of the original recurrence in the scalar loop. 4269 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin()); 4270 PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue()); 4271 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init"); 4272 auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue(); 4273 for (auto *BB : predecessors(LoopScalarPreHeader)) { 4274 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit; 4275 Start->addIncoming(Incoming, BB); 4276 } 4277 4278 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start); 4279 Phi->setName("scalar.recur"); 4280 4281 // Finally, fix users of the recurrence outside the loop. The users will need 4282 // either the last value of the scalar recurrence or the last value of the 4283 // vector recurrence we extracted in the middle block. Since the loop is in 4284 // LCSSA form, we just need to find all the phi nodes for the original scalar 4285 // recurrence in the exit block, and then add an edge for the middle block. 
4286   // Note that LCSSA does not imply single entry when the original scalar loop
4287   // had multiple exiting edges (as we always run the last iteration in the
4288   // scalar epilogue); in that case, there is no edge from the middle block to
4289   // the exit block, and thus no phis need to be updated.
4290   if (!Cost->requiresScalarEpilogue(VF))
4291     for (PHINode &LCSSAPhi : LoopExitBlock->phis())
4292       if (llvm::is_contained(LCSSAPhi.incoming_values(), Phi))
4293         LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
4294 }
4295
4296 void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
4297                                        VPTransformState &State) {
4298   PHINode *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
4299   // Get its reduction variable descriptor.
4300   assert(Legal->isReductionVariable(OrigPhi) &&
4301          "Unable to find the reduction variable");
4302   const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
4303
4304   RecurKind RK = RdxDesc.getRecurrenceKind();
4305   TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
4306   Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
4307   setDebugLocFromInst(ReductionStartValue);
4308
4309   VPValue *LoopExitInstDef = PhiR->getBackedgeValue();
4310   // This is the vector-clone of the value that leaves the loop.
4311   Type *VecTy = State.get(LoopExitInstDef, 0)->getType();
4312
4313   // Wrap flags are in general invalid after vectorization, clear them.
4314   clearReductionWrapFlags(RdxDesc, State);
4315
4316   // Before each round, move the insertion point right between
4317   // the PHIs and the values we are going to write.
4318   // This allows us to write both PHINodes and the extractelement
4319   // instructions.
4320   Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
4321
4322   setDebugLocFromInst(LoopExitInst);
4323
4324   Type *PhiTy = OrigPhi->getType();
4325   // If the tail is folded by masking, the vector value to leave the loop should
4326   // be a Select choosing between the vectorized LoopExitInst and vectorized Phi,
4327   // instead of the former. For an inloop reduction the reduction will already
4328   // be predicated, and does not need to be handled here.
4329   if (Cost->foldTailByMasking() && !PhiR->isInLoop()) {
4330     for (unsigned Part = 0; Part < UF; ++Part) {
4331       Value *VecLoopExitInst = State.get(LoopExitInstDef, Part);
4332       Value *Sel = nullptr;
4333       for (User *U : VecLoopExitInst->users()) {
4334         if (isa<SelectInst>(U)) {
4335           assert(!Sel && "Reduction exit feeding two selects");
4336           Sel = U;
4337         } else
4338           assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
4339       }
4340       assert(Sel && "Reduction exit feeds no select");
4341       State.reset(LoopExitInstDef, Sel, Part);
4342
4343       // If the target can create a predicated operator for the reduction at no
4344       // extra cost in the loop (for example a predicated vadd), it can be
4345       // cheaper for the select to remain in the loop than be sunk out of it,
4346       // and so use the select value for the phi instead of the old
4347       // LoopExitValue.
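      // For example (an illustrative sketch; the actual mask and value names
      // depend on how the tail was folded), the vectorized loop may contain
      //
      //   %add = add <4 x i32> %vec.phi, %load
      //   %sel = select <4 x i1> %mask, <4 x i32> %add, <4 x i32> %vec.phi
      //
      // and on a target with a predicated add it is cheaper to feed %sel back
      // into %vec.phi than to sink the select out of the loop.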
4348 if (PreferPredicatedReductionSelect || 4349 TTI->preferPredicatedReductionSelect( 4350 RdxDesc.getOpcode(), PhiTy, 4351 TargetTransformInfo::ReductionFlags())) { 4352 auto *VecRdxPhi = 4353 cast<PHINode>(State.get(PhiR, Part)); 4354 VecRdxPhi->setIncomingValueForBlock( 4355 LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel); 4356 } 4357 } 4358 } 4359 4360 // If the vector reduction can be performed in a smaller type, we truncate 4361 // then extend the loop exit value to enable InstCombine to evaluate the 4362 // entire expression in the smaller type. 4363 if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) { 4364 assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!"); 4365 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); 4366 Builder.SetInsertPoint( 4367 LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator()); 4368 VectorParts RdxParts(UF); 4369 for (unsigned Part = 0; Part < UF; ++Part) { 4370 RdxParts[Part] = State.get(LoopExitInstDef, Part); 4371 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4372 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy) 4373 : Builder.CreateZExt(Trunc, VecTy); 4374 for (Value::user_iterator UI = RdxParts[Part]->user_begin(); 4375 UI != RdxParts[Part]->user_end();) 4376 if (*UI != Trunc) { 4377 (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd); 4378 RdxParts[Part] = Extnd; 4379 } else { 4380 ++UI; 4381 } 4382 } 4383 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 4384 for (unsigned Part = 0; Part < UF; ++Part) { 4385 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4386 State.reset(LoopExitInstDef, RdxParts[Part], Part); 4387 } 4388 } 4389 4390 // Reduce all of the unrolled parts into a single vector. 4391 Value *ReducedPartRdx = State.get(LoopExitInstDef, 0); 4392 unsigned Op = RecurrenceDescriptor::getOpcode(RK); 4393 4394 // The middle block terminator has already been assigned a DebugLoc here (the 4395 // OrigLoop's single latch terminator). We want the whole middle block to 4396 // appear to execute on this line because: (a) it is all compiler generated, 4397 // (b) these instructions are always executed after evaluating the latch 4398 // conditional branch, and (c) other passes may add new predecessors which 4399 // terminate on this line. This is the easiest way to ensure we don't 4400 // accidentally cause an extra step back into the loop while debugging. 4401 setDebugLocFromInst(LoopMiddleBlock->getTerminator()); 4402 if (PhiR->isOrdered()) 4403 ReducedPartRdx = State.get(LoopExitInstDef, UF - 1); 4404 else { 4405 // Floating-point operations should have some FMF to enable the reduction. 4406 IRBuilderBase::FastMathFlagGuard FMFG(Builder); 4407 Builder.setFastMathFlags(RdxDesc.getFastMathFlags()); 4408 for (unsigned Part = 1; Part < UF; ++Part) { 4409 Value *RdxPart = State.get(LoopExitInstDef, Part); 4410 if (Op != Instruction::ICmp && Op != Instruction::FCmp) { 4411 ReducedPartRdx = Builder.CreateBinOp( 4412 (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx"); 4413 } else if (RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK)) 4414 ReducedPartRdx = createSelectCmpOp(Builder, ReductionStartValue, RK, 4415 ReducedPartRdx, RdxPart); 4416 else 4417 ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart); 4418 } 4419 } 4420 4421 // Create the reduction after the loop. Note that inloop reductions create the 4422 // target reduction in the loop using a Reduction recipe. 
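  // For example (an illustrative sketch for an integer add reduction with
  // VF = 4, UF = 2; the exact intrinsic depends on the recurrence kind): the
  // two unrolled parts were combined above into a single vector
  //
  //   %bin.rdx = add <4 x i32> %part.1, %part.0
  //
  // and the code below emits the horizontal reduction producing the final
  // scalar, e.g. something like
  //
  //   %rdx = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %bin.rdx)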
4423 if (VF.isVector() && !PhiR->isInLoop()) { 4424 ReducedPartRdx = 4425 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, OrigPhi); 4426 // If the reduction can be performed in a smaller type, we need to extend 4427 // the reduction to the wider type before we branch to the original loop. 4428 if (PhiTy != RdxDesc.getRecurrenceType()) 4429 ReducedPartRdx = RdxDesc.isSigned() 4430 ? Builder.CreateSExt(ReducedPartRdx, PhiTy) 4431 : Builder.CreateZExt(ReducedPartRdx, PhiTy); 4432 } 4433 4434 // Create a phi node that merges control-flow from the backedge-taken check 4435 // block and the middle block. 4436 PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx", 4437 LoopScalarPreHeader->getTerminator()); 4438 for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) 4439 BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]); 4440 BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); 4441 4442 // Now, we need to fix the users of the reduction variable 4443 // inside and outside of the scalar remainder loop. 4444 4445 // We know that the loop is in LCSSA form. We need to update the PHI nodes 4446 // in the exit blocks. See comment on analogous loop in 4447 // fixFirstOrderRecurrence for a more complete explaination of the logic. 4448 if (!Cost->requiresScalarEpilogue(VF)) 4449 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) 4450 if (llvm::is_contained(LCSSAPhi.incoming_values(), LoopExitInst)) 4451 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock); 4452 4453 // Fix the scalar loop reduction variable with the incoming reduction sum 4454 // from the vector body and from the backedge value. 4455 int IncomingEdgeBlockIdx = 4456 OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch()); 4457 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); 4458 // Pick the other block. 4459 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); 4460 OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); 4461 OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); 4462 } 4463 4464 void InnerLoopVectorizer::clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc, 4465 VPTransformState &State) { 4466 RecurKind RK = RdxDesc.getRecurrenceKind(); 4467 if (RK != RecurKind::Add && RK != RecurKind::Mul) 4468 return; 4469 4470 Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr(); 4471 assert(LoopExitInstr && "null loop exit instruction"); 4472 SmallVector<Instruction *, 8> Worklist; 4473 SmallPtrSet<Instruction *, 8> Visited; 4474 Worklist.push_back(LoopExitInstr); 4475 Visited.insert(LoopExitInstr); 4476 4477 while (!Worklist.empty()) { 4478 Instruction *Cur = Worklist.pop_back_val(); 4479 if (isa<OverflowingBinaryOperator>(Cur)) 4480 for (unsigned Part = 0; Part < UF; ++Part) { 4481 // FIXME: Should not rely on getVPValue at this point. 4482 Value *V = State.get(State.Plan->getVPValue(Cur, true), Part); 4483 cast<Instruction>(V)->dropPoisonGeneratingFlags(); 4484 } 4485 4486 for (User *U : Cur->users()) { 4487 Instruction *UI = cast<Instruction>(U); 4488 if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) && 4489 Visited.insert(UI).second) 4490 Worklist.push_back(UI); 4491 } 4492 } 4493 } 4494 4495 void InnerLoopVectorizer::fixLCSSAPHIs(VPTransformState &State) { 4496 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 4497 if (LCSSAPhi.getBasicBlockIndex(LoopMiddleBlock) != -1) 4498 // Some phis were already hand updated by the reduction and recurrence 4499 // code above, leave them alone. 
4500 continue; 4501 4502 auto *IncomingValue = LCSSAPhi.getIncomingValue(0); 4503 // Non-instruction incoming values will have only one value. 4504 4505 VPLane Lane = VPLane::getFirstLane(); 4506 if (isa<Instruction>(IncomingValue) && 4507 !Cost->isUniformAfterVectorization(cast<Instruction>(IncomingValue), 4508 VF)) 4509 Lane = VPLane::getLastLaneForVF(VF); 4510 4511 // Can be a loop invariant incoming value or the last scalar value to be 4512 // extracted from the vectorized loop. 4513 // FIXME: Should not rely on getVPValue at this point. 4514 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4515 Value *lastIncomingValue = 4516 OrigLoop->isLoopInvariant(IncomingValue) 4517 ? IncomingValue 4518 : State.get(State.Plan->getVPValue(IncomingValue, true), 4519 VPIteration(UF - 1, Lane)); 4520 LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock); 4521 } 4522 } 4523 4524 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { 4525 // The basic block and loop containing the predicated instruction. 4526 auto *PredBB = PredInst->getParent(); 4527 auto *VectorLoop = LI->getLoopFor(PredBB); 4528 4529 // Initialize a worklist with the operands of the predicated instruction. 4530 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end()); 4531 4532 // Holds instructions that we need to analyze again. An instruction may be 4533 // reanalyzed if we don't yet know if we can sink it or not. 4534 SmallVector<Instruction *, 8> InstsToReanalyze; 4535 4536 // Returns true if a given use occurs in the predicated block. Phi nodes use 4537 // their operands in their corresponding predecessor blocks. 4538 auto isBlockOfUsePredicated = [&](Use &U) -> bool { 4539 auto *I = cast<Instruction>(U.getUser()); 4540 BasicBlock *BB = I->getParent(); 4541 if (auto *Phi = dyn_cast<PHINode>(I)) 4542 BB = Phi->getIncomingBlock( 4543 PHINode::getIncomingValueNumForOperand(U.getOperandNo())); 4544 return BB == PredBB; 4545 }; 4546 4547 // Iteratively sink the scalarized operands of the predicated instruction 4548 // into the block we created for it. When an instruction is sunk, it's 4549 // operands are then added to the worklist. The algorithm ends after one pass 4550 // through the worklist doesn't sink a single instruction. 4551 bool Changed; 4552 do { 4553 // Add the instructions that need to be reanalyzed to the worklist, and 4554 // reset the changed indicator. 4555 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end()); 4556 InstsToReanalyze.clear(); 4557 Changed = false; 4558 4559 while (!Worklist.empty()) { 4560 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val()); 4561 4562 // We can't sink an instruction if it is a phi node, is not in the loop, 4563 // or may have side effects. 4564 if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) || 4565 I->mayHaveSideEffects()) 4566 continue; 4567 4568 // If the instruction is already in PredBB, check if we can sink its 4569 // operands. In that case, VPlan's sinkScalarOperands() succeeded in 4570 // sinking the scalar instruction I, hence it appears in PredBB; but it 4571 // may have failed to sink I's operands (recursively), which we try 4572 // (again) here. 4573 if (I->getParent() == PredBB) { 4574 Worklist.insert(I->op_begin(), I->op_end()); 4575 continue; 4576 } 4577 4578 // It's legal to sink the instruction if all its uses occur in the 4579 // predicated block. Otherwise, there's nothing to do yet, and we may 4580 // need to reanalyze the instruction. 
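      // For example (illustrative), a scalarized address computation whose
      // only use is a predicated store can be sunk:
      //
      //   pred.store.if:
      //     %addr = getelementptr inbounds i32, i32* %base, i64 %idx ; sunk
      //     store i32 %val, i32* %addr
      //
      // whereas it must stay put, and is queued for reanalysis, while it
      // still has users outside the predicated block.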
4581 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) { 4582 InstsToReanalyze.push_back(I); 4583 continue; 4584 } 4585 4586 // Move the instruction to the beginning of the predicated block, and add 4587 // it's operands to the worklist. 4588 I->moveBefore(&*PredBB->getFirstInsertionPt()); 4589 Worklist.insert(I->op_begin(), I->op_end()); 4590 4591 // The sinking may have enabled other instructions to be sunk, so we will 4592 // need to iterate. 4593 Changed = true; 4594 } 4595 } while (Changed); 4596 } 4597 4598 void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) { 4599 for (PHINode *OrigPhi : OrigPHIsToFix) { 4600 VPWidenPHIRecipe *VPPhi = 4601 cast<VPWidenPHIRecipe>(State.Plan->getVPValue(OrigPhi)); 4602 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0)); 4603 // Make sure the builder has a valid insert point. 4604 Builder.SetInsertPoint(NewPhi); 4605 for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) { 4606 VPValue *Inc = VPPhi->getIncomingValue(i); 4607 VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i); 4608 NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]); 4609 } 4610 } 4611 } 4612 4613 bool InnerLoopVectorizer::useOrderedReductions(RecurrenceDescriptor &RdxDesc) { 4614 return Cost->useOrderedReductions(RdxDesc); 4615 } 4616 4617 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, 4618 VPUser &Operands, unsigned UF, 4619 ElementCount VF, bool IsPtrLoopInvariant, 4620 SmallBitVector &IsIndexLoopInvariant, 4621 VPTransformState &State) { 4622 // Construct a vector GEP by widening the operands of the scalar GEP as 4623 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP 4624 // results in a vector of pointers when at least one operand of the GEP 4625 // is vector-typed. Thus, to keep the representation compact, we only use 4626 // vector-typed operands for loop-varying values. 4627 4628 if (VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { 4629 // If we are vectorizing, but the GEP has only loop-invariant operands, 4630 // the GEP we build (by only using vector-typed operands for 4631 // loop-varying values) would be a scalar pointer. Thus, to ensure we 4632 // produce a vector of pointers, we need to either arbitrarily pick an 4633 // operand to broadcast, or broadcast a clone of the original GEP. 4634 // Here, we broadcast a clone of the original. 4635 // 4636 // TODO: If at some point we decide to scalarize instructions having 4637 // loop-invariant operands, this special case will no longer be 4638 // required. We would add the scalarization decision to 4639 // collectLoopScalars() and teach getVectorValue() to broadcast 4640 // the lane-zero scalar value. 4641 auto *Clone = Builder.Insert(GEP->clone()); 4642 for (unsigned Part = 0; Part < UF; ++Part) { 4643 Value *EntryPart = Builder.CreateVectorSplat(VF, Clone); 4644 State.set(VPDef, EntryPart, Part); 4645 addMetadata(EntryPart, GEP); 4646 } 4647 } else { 4648 // If the GEP has at least one loop-varying operand, we are sure to 4649 // produce a vector of pointers. But if we are only unrolling, we want 4650 // to produce a scalar GEP for each unroll part. Thus, the GEP we 4651 // produce with the code below will be scalar (if VF == 1) or vector 4652 // (otherwise). Note that for the unroll-only case, we still maintain 4653 // values in the vector mapping with initVector, as we do for other 4654 // instructions. 4655 for (unsigned Part = 0; Part < UF; ++Part) { 4656 // The pointer operand of the new GEP. 
If it's loop-invariant, we 4657 // won't broadcast it. 4658 auto *Ptr = IsPtrLoopInvariant 4659 ? State.get(Operands.getOperand(0), VPIteration(0, 0)) 4660 : State.get(Operands.getOperand(0), Part); 4661 4662 // Collect all the indices for the new GEP. If any index is 4663 // loop-invariant, we won't broadcast it. 4664 SmallVector<Value *, 4> Indices; 4665 for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) { 4666 VPValue *Operand = Operands.getOperand(I); 4667 if (IsIndexLoopInvariant[I - 1]) 4668 Indices.push_back(State.get(Operand, VPIteration(0, 0))); 4669 else 4670 Indices.push_back(State.get(Operand, Part)); 4671 } 4672 4673 // Create the new GEP. Note that this GEP may be a scalar if VF == 1, 4674 // but it should be a vector, otherwise. 4675 auto *NewGEP = 4676 GEP->isInBounds() 4677 ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr, 4678 Indices) 4679 : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices); 4680 assert((VF.isScalar() || NewGEP->getType()->isVectorTy()) && 4681 "NewGEP is not a pointer vector"); 4682 State.set(VPDef, NewGEP, Part); 4683 addMetadata(NewGEP, GEP); 4684 } 4685 } 4686 } 4687 4688 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, 4689 VPWidenPHIRecipe *PhiR, 4690 VPTransformState &State) { 4691 PHINode *P = cast<PHINode>(PN); 4692 if (EnableVPlanNativePath) { 4693 // Currently we enter here in the VPlan-native path for non-induction 4694 // PHIs where all control flow is uniform. We simply widen these PHIs. 4695 // Create a vector phi with no operands - the vector phi operands will be 4696 // set at the end of vector code generation. 4697 Type *VecTy = (State.VF.isScalar()) 4698 ? PN->getType() 4699 : VectorType::get(PN->getType(), State.VF); 4700 Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi"); 4701 State.set(PhiR, VecPhi, 0); 4702 OrigPHIsToFix.push_back(P); 4703 4704 return; 4705 } 4706 4707 assert(PN->getParent() == OrigLoop->getHeader() && 4708 "Non-header phis should have been handled elsewhere"); 4709 4710 // In order to support recurrences we need to be able to vectorize Phi nodes. 4711 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 4712 // stage #1: We create a new vector PHI node with no incoming edges. We'll use 4713 // this value when we vectorize all of the instructions that use the PHI. 4714 4715 assert(!Legal->isReductionVariable(P) && 4716 "reductions should be handled elsewhere"); 4717 4718 setDebugLocFromInst(P); 4719 4720 // This PHINode must be an induction variable. 4721 // Make sure that we know about it. 4722 assert(Legal->getInductionVars().count(P) && "Not an induction variable"); 4723 4724 InductionDescriptor II = Legal->getInductionVars().lookup(P); 4725 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 4726 4727 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 4728 // which can be found from the original scalar operations. 4729 switch (II.getKind()) { 4730 case InductionDescriptor::IK_NoInduction: 4731 llvm_unreachable("Unknown induction"); 4732 case InductionDescriptor::IK_IntInduction: 4733 case InductionDescriptor::IK_FpInduction: 4734 llvm_unreachable("Integer/fp induction is handled elsewhere."); 4735 case InductionDescriptor::IK_PtrInduction: { 4736 // Handle the pointer induction variable case. 
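    // For example (illustrative), a loop of the form
    //
    //   for (int i = 0; i < n; ++i, ++p)
    //     *p = ...;
    //
    // has a pointer induction variable p. Below we either emit per-lane
    // scalar GEPs off the induction (when p remains scalar after
    // vectorization) or build a pointer phi plus a step vector to form a
    // vector of pointers.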
4737 assert(P->getType()->isPointerTy() && "Unexpected type."); 4738 4739 if (Cost->isScalarAfterVectorization(P, State.VF)) { 4740 // This is the normalized GEP that starts counting at zero. 4741 Value *PtrInd = 4742 Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType()); 4743 // Determine the number of scalars we need to generate for each unroll 4744 // iteration. If the instruction is uniform, we only need to generate the 4745 // first lane. Otherwise, we generate all VF values. 4746 bool IsUniform = Cost->isUniformAfterVectorization(P, State.VF); 4747 unsigned Lanes = IsUniform ? 1 : State.VF.getKnownMinValue(); 4748 4749 bool NeedsVectorIndex = !IsUniform && VF.isScalable(); 4750 Value *UnitStepVec = nullptr, *PtrIndSplat = nullptr; 4751 if (NeedsVectorIndex) { 4752 Type *VecIVTy = VectorType::get(PtrInd->getType(), VF); 4753 UnitStepVec = Builder.CreateStepVector(VecIVTy); 4754 PtrIndSplat = Builder.CreateVectorSplat(VF, PtrInd); 4755 } 4756 4757 for (unsigned Part = 0; Part < UF; ++Part) { 4758 Value *PartStart = createStepForVF( 4759 Builder, ConstantInt::get(PtrInd->getType(), Part), VF); 4760 4761 if (NeedsVectorIndex) { 4762 // Here we cache the whole vector, which means we can support the 4763 // extraction of any lane. However, in some cases the extractelement 4764 // instruction that is generated for scalar uses of this vector (e.g. 4765 // a load instruction) is not folded away. Therefore we still 4766 // calculate values for the first n lanes to avoid redundant moves 4767 // (when extracting the 0th element) and to produce scalar code (i.e. 4768 // additional add/gep instructions instead of expensive extractelement 4769 // instructions) when extracting higher-order elements. 4770 Value *PartStartSplat = Builder.CreateVectorSplat(VF, PartStart); 4771 Value *Indices = Builder.CreateAdd(PartStartSplat, UnitStepVec); 4772 Value *GlobalIndices = Builder.CreateAdd(PtrIndSplat, Indices); 4773 Value *SclrGep = 4774 emitTransformedIndex(Builder, GlobalIndices, PSE.getSE(), DL, II); 4775 SclrGep->setName("next.gep"); 4776 State.set(PhiR, SclrGep, Part); 4777 } 4778 4779 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 4780 Value *Idx = Builder.CreateAdd( 4781 PartStart, ConstantInt::get(PtrInd->getType(), Lane)); 4782 Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); 4783 Value *SclrGep = 4784 emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II); 4785 SclrGep->setName("next.gep"); 4786 State.set(PhiR, SclrGep, VPIteration(Part, Lane)); 4787 } 4788 } 4789 return; 4790 } 4791 assert(isa<SCEVConstant>(II.getStep()) && 4792 "Induction step not a SCEV constant!"); 4793 Type *PhiType = II.getStep()->getType(); 4794 4795 // Build a pointer phi 4796 Value *ScalarStartValue = II.getStartValue(); 4797 Type *ScStValueType = ScalarStartValue->getType(); 4798 PHINode *NewPointerPhi = 4799 PHINode::Create(ScStValueType, 2, "pointer.phi", Induction); 4800 NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader); 4801 4802 // A pointer induction, performed by using a gep 4803 BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 4804 Instruction *InductionLoc = LoopLatch->getTerminator(); 4805 const SCEV *ScalarStep = II.getStep(); 4806 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 4807 Value *ScalarStepValue = 4808 Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc); 4809 Value *RuntimeVF = getRuntimeVF(Builder, PhiType, VF); 4810 Value *NumUnrolledElems = 4811 Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF)); 4812 Value *InductionGEP 
= GetElementPtrInst::Create( 4813 II.getElementType(), NewPointerPhi, 4814 Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind", 4815 InductionLoc); 4816 NewPointerPhi->addIncoming(InductionGEP, LoopLatch); 4817 4818 // Create UF many actual address geps that use the pointer 4819 // phi as base and a vectorized version of the step value 4820 // (<step*0, ..., step*N>) as offset. 4821 for (unsigned Part = 0; Part < State.UF; ++Part) { 4822 Type *VecPhiType = VectorType::get(PhiType, State.VF); 4823 Value *StartOffsetScalar = 4824 Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part)); 4825 Value *StartOffset = 4826 Builder.CreateVectorSplat(State.VF, StartOffsetScalar); 4827 // Create a vector of consecutive numbers from zero to VF. 4828 StartOffset = 4829 Builder.CreateAdd(StartOffset, Builder.CreateStepVector(VecPhiType)); 4830 4831 Value *GEP = Builder.CreateGEP( 4832 II.getElementType(), NewPointerPhi, 4833 Builder.CreateMul( 4834 StartOffset, Builder.CreateVectorSplat(State.VF, ScalarStepValue), 4835 "vector.gep")); 4836 State.set(PhiR, GEP, Part); 4837 } 4838 } 4839 } 4840 } 4841 4842 /// A helper function for checking whether an integer division-related 4843 /// instruction may divide by zero (in which case it must be predicated if 4844 /// executed conditionally in the scalar code). 4845 /// TODO: It may be worthwhile to generalize and check isKnownNonZero(). 4846 /// Non-zero divisors that are non compile-time constants will not be 4847 /// converted into multiplication, so we will still end up scalarizing 4848 /// the division, but can do so w/o predication. 4849 static bool mayDivideByZero(Instruction &I) { 4850 assert((I.getOpcode() == Instruction::UDiv || 4851 I.getOpcode() == Instruction::SDiv || 4852 I.getOpcode() == Instruction::URem || 4853 I.getOpcode() == Instruction::SRem) && 4854 "Unexpected instruction"); 4855 Value *Divisor = I.getOperand(1); 4856 auto *CInt = dyn_cast<ConstantInt>(Divisor); 4857 return !CInt || CInt->isZero(); 4858 } 4859 4860 void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def, 4861 VPUser &User, 4862 VPTransformState &State) { 4863 switch (I.getOpcode()) { 4864 case Instruction::Call: 4865 case Instruction::Br: 4866 case Instruction::PHI: 4867 case Instruction::GetElementPtr: 4868 case Instruction::Select: 4869 llvm_unreachable("This instruction is handled by a different recipe."); 4870 case Instruction::UDiv: 4871 case Instruction::SDiv: 4872 case Instruction::SRem: 4873 case Instruction::URem: 4874 case Instruction::Add: 4875 case Instruction::FAdd: 4876 case Instruction::Sub: 4877 case Instruction::FSub: 4878 case Instruction::FNeg: 4879 case Instruction::Mul: 4880 case Instruction::FMul: 4881 case Instruction::FDiv: 4882 case Instruction::FRem: 4883 case Instruction::Shl: 4884 case Instruction::LShr: 4885 case Instruction::AShr: 4886 case Instruction::And: 4887 case Instruction::Or: 4888 case Instruction::Xor: { 4889 // Just widen unops and binops. 4890 setDebugLocFromInst(&I); 4891 4892 for (unsigned Part = 0; Part < UF; ++Part) { 4893 SmallVector<Value *, 2> Ops; 4894 for (VPValue *VPOp : User.operands()) 4895 Ops.push_back(State.get(VPOp, Part)); 4896 4897 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); 4898 4899 if (auto *VecOp = dyn_cast<Instruction>(V)) 4900 VecOp->copyIRFlags(&I); 4901 4902 // Use this vector value for all users of the original instruction. 
4903 State.set(Def, V, Part); 4904 addMetadata(V, &I); 4905 } 4906 4907 break; 4908 } 4909 case Instruction::ICmp: 4910 case Instruction::FCmp: { 4911 // Widen compares. Generate vector compares. 4912 bool FCmp = (I.getOpcode() == Instruction::FCmp); 4913 auto *Cmp = cast<CmpInst>(&I); 4914 setDebugLocFromInst(Cmp); 4915 for (unsigned Part = 0; Part < UF; ++Part) { 4916 Value *A = State.get(User.getOperand(0), Part); 4917 Value *B = State.get(User.getOperand(1), Part); 4918 Value *C = nullptr; 4919 if (FCmp) { 4920 // Propagate fast math flags. 4921 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 4922 Builder.setFastMathFlags(Cmp->getFastMathFlags()); 4923 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); 4924 } else { 4925 C = Builder.CreateICmp(Cmp->getPredicate(), A, B); 4926 } 4927 State.set(Def, C, Part); 4928 addMetadata(C, &I); 4929 } 4930 4931 break; 4932 } 4933 4934 case Instruction::ZExt: 4935 case Instruction::SExt: 4936 case Instruction::FPToUI: 4937 case Instruction::FPToSI: 4938 case Instruction::FPExt: 4939 case Instruction::PtrToInt: 4940 case Instruction::IntToPtr: 4941 case Instruction::SIToFP: 4942 case Instruction::UIToFP: 4943 case Instruction::Trunc: 4944 case Instruction::FPTrunc: 4945 case Instruction::BitCast: { 4946 auto *CI = cast<CastInst>(&I); 4947 setDebugLocFromInst(CI); 4948 4949 /// Vectorize casts. 4950 Type *DestTy = 4951 (VF.isScalar()) ? CI->getType() : VectorType::get(CI->getType(), VF); 4952 4953 for (unsigned Part = 0; Part < UF; ++Part) { 4954 Value *A = State.get(User.getOperand(0), Part); 4955 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); 4956 State.set(Def, Cast, Part); 4957 addMetadata(Cast, &I); 4958 } 4959 break; 4960 } 4961 default: 4962 // This instruction is not vectorized by simple widening. 4963 LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); 4964 llvm_unreachable("Unhandled instruction!"); 4965 } // end of switch. 4966 } 4967 4968 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, 4969 VPUser &ArgOperands, 4970 VPTransformState &State) { 4971 assert(!isa<DbgInfoIntrinsic>(I) && 4972 "DbgInfoIntrinsic should have been dropped during VPlan construction"); 4973 setDebugLocFromInst(&I); 4974 4975 Module *M = I.getParent()->getParent()->getParent(); 4976 auto *CI = cast<CallInst>(&I); 4977 4978 SmallVector<Type *, 4> Tys; 4979 for (Value *ArgOperand : CI->args()) 4980 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue())); 4981 4982 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 4983 4984 // The flag shows whether we use Intrinsic or a usual Call for vectorized 4985 // version of the instruction. 4986 // Is it beneficial to perform intrinsic call compared to lib call? 4987 bool NeedToScalarize = false; 4988 InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); 4989 InstructionCost IntrinsicCost = ID ? Cost->getVectorIntrinsicCost(CI, VF) : 0; 4990 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 4991 assert((UseVectorIntrinsic || !NeedToScalarize) && 4992 "Instruction should be scalarized elsewhere."); 4993 assert((IntrinsicCost.isValid() || CallCost.isValid()) && 4994 "Either the intrinsic cost or vector call cost must be valid"); 4995 4996 for (unsigned Part = 0; Part < UF; ++Part) { 4997 SmallVector<Type *, 2> TysForDecl = {CI->getType()}; 4998 SmallVector<Value *, 4> Args; 4999 for (auto &I : enumerate(ArgOperands.operands())) { 5000 // Some intrinsics have a scalar argument - don't replace it with a 5001 // vector. 
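      // (For example, the exponent operand of llvm.powi stays scalar: the
      // widened call takes a <VF x float> base and still a single i32
      // exponent.)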
5002 Value *Arg; 5003 if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index())) 5004 Arg = State.get(I.value(), Part); 5005 else { 5006 Arg = State.get(I.value(), VPIteration(0, 0)); 5007 if (hasVectorInstrinsicOverloadedScalarOpd(ID, I.index())) 5008 TysForDecl.push_back(Arg->getType()); 5009 } 5010 Args.push_back(Arg); 5011 } 5012 5013 Function *VectorF; 5014 if (UseVectorIntrinsic) { 5015 // Use vector version of the intrinsic. 5016 if (VF.isVector()) 5017 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); 5018 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); 5019 assert(VectorF && "Can't retrieve vector intrinsic."); 5020 } else { 5021 // Use vector version of the function call. 5022 const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 5023 #ifndef NDEBUG 5024 assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr && 5025 "Can't create vector function."); 5026 #endif 5027 VectorF = VFDatabase(*CI).getVectorizedFunction(Shape); 5028 } 5029 SmallVector<OperandBundleDef, 1> OpBundles; 5030 CI->getOperandBundlesAsDefs(OpBundles); 5031 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles); 5032 5033 if (isa<FPMathOperator>(V)) 5034 V->copyFastMathFlags(CI); 5035 5036 State.set(Def, V, Part); 5037 addMetadata(V, &I); 5038 } 5039 } 5040 5041 void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, VPValue *VPDef, 5042 VPUser &Operands, 5043 bool InvariantCond, 5044 VPTransformState &State) { 5045 setDebugLocFromInst(&I); 5046 5047 // The condition can be loop invariant but still defined inside the 5048 // loop. This means that we can't just use the original 'cond' value. 5049 // We have to take the 'vectorized' value and pick the first lane. 5050 // Instcombine will make this a no-op. 5051 auto *InvarCond = InvariantCond 5052 ? State.get(Operands.getOperand(0), VPIteration(0, 0)) 5053 : nullptr; 5054 5055 for (unsigned Part = 0; Part < UF; ++Part) { 5056 Value *Cond = 5057 InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part); 5058 Value *Op0 = State.get(Operands.getOperand(1), Part); 5059 Value *Op1 = State.get(Operands.getOperand(2), Part); 5060 Value *Sel = Builder.CreateSelect(Cond, Op0, Op1); 5061 State.set(VPDef, Sel, Part); 5062 addMetadata(Sel, &I); 5063 } 5064 } 5065 5066 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { 5067 // We should not collect Scalars more than once per VF. Right now, this 5068 // function is called from collectUniformsAndScalars(), which already does 5069 // this check. Collecting Scalars for VF=1 does not make any sense. 5070 assert(VF.isVector() && Scalars.find(VF) == Scalars.end() && 5071 "This function should not be visited twice for the same VF"); 5072 5073 SmallSetVector<Instruction *, 8> Worklist; 5074 5075 // These sets are used to seed the analysis with pointers used by memory 5076 // accesses that will remain scalar. 5077 SmallSetVector<Instruction *, 8> ScalarPtrs; 5078 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; 5079 auto *Latch = TheLoop->getLoopLatch(); 5080 5081 // A helper that returns true if the use of Ptr by MemAccess will be scalar. 5082 // The pointer operands of loads and stores will be scalar as long as the 5083 // memory access is not a gather or scatter operation. The value operand of a 5084 // store will remain scalar if the store is scalarized. 
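  // For example (illustrative): if a unit-stride store is widened into a
  // single consecutive vector store, its pointer operand is a scalar use
  // (only the lane-0 address is needed); if the same access were turned into
  // a scatter, a vector of pointers would be required and the use is not
  // scalar.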
5085   auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
5086     InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
5087     assert(WideningDecision != CM_Unknown &&
5088            "Widening decision should be ready at this moment");
5089     if (auto *Store = dyn_cast<StoreInst>(MemAccess))
5090       if (Ptr == Store->getValueOperand())
5091         return WideningDecision == CM_Scalarize;
5092     assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
5093            "Ptr is neither a value nor a pointer operand");
5094     return WideningDecision != CM_GatherScatter;
5095   };
5096
5097   // A helper that returns true if the given value is a bitcast or
5098   // getelementptr instruction contained in the loop.
5099   auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
5100     return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
5101             isa<GetElementPtrInst>(V)) &&
5102            !TheLoop->isLoopInvariant(V);
5103   };
5104
5105   auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) {
5106     if (!isa<PHINode>(Ptr) ||
5107         !Legal->getInductionVars().count(cast<PHINode>(Ptr)))
5108       return false;
5109     auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)];
5110     if (Induction.getKind() != InductionDescriptor::IK_PtrInduction)
5111       return false;
5112     return isScalarUse(MemAccess, Ptr);
5113   };
5114
5115   // A helper that evaluates a memory access's use of a pointer. If the
5116   // pointer is actually the pointer induction of a loop, it is inserted
5117   // into Worklist. If the use will be a scalar use, and the pointer is
5118   // only used by memory accesses, we place the pointer in ScalarPtrs.
5119   // Otherwise, the pointer is placed in PossibleNonScalarPtrs.
5120   auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
5121     if (isScalarPtrInduction(MemAccess, Ptr)) {
5122       Worklist.insert(cast<Instruction>(Ptr));
5123       LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr
5124                         << "\n");
5125
5126       Instruction *Update = cast<Instruction>(
5127           cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch));
5128
5129       // If there is more than one user of Update (Ptr), we shouldn't assume it
5130       // will be scalar after vectorization as other users of the instruction
5131       // may require widening. Otherwise, add it to ScalarPtrs.
5132       if (Update->hasOneUse() && cast<Value>(*Update->user_begin()) == Ptr) {
5133         ScalarPtrs.insert(Update);
5134         return;
5135       }
5136     }
5137     // We only care about bitcast and getelementptr instructions contained in
5138     // the loop.
5139     if (!isLoopVaryingBitCastOrGEP(Ptr))
5140       return;
5141
5142     // If the pointer has already been identified as scalar (e.g., if it was
5143     // also identified as uniform), there's nothing to do.
5144     auto *I = cast<Instruction>(Ptr);
5145     if (Worklist.count(I))
5146       return;
5147
5148     // If the use of the pointer will be a scalar use, and all users of the
5149     // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
5150     // place the pointer in PossibleNonScalarPtrs.
5151     if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
5152           return isa<LoadInst>(U) || isa<StoreInst>(U);
5153         }))
5154       ScalarPtrs.insert(I);
5155     else
5156       PossibleNonScalarPtrs.insert(I);
5157   };
5158
5159   // We seed the scalars analysis with two classes of instructions: (1)
5160   // instructions marked uniform-after-vectorization and (2) bitcast,
5161   // getelementptr and (pointer) phi instructions used by memory accesses
5162   // requiring a scalar use.
5163 // 5164 // (1) Add to the worklist all instructions that have been identified as 5165 // uniform-after-vectorization. 5166 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end()); 5167 5168 // (2) Add to the worklist all bitcast and getelementptr instructions used by 5169 // memory accesses requiring a scalar use. The pointer operands of loads and 5170 // stores will be scalar as long as the memory accesses is not a gather or 5171 // scatter operation. The value operand of a store will remain scalar if the 5172 // store is scalarized. 5173 for (auto *BB : TheLoop->blocks()) 5174 for (auto &I : *BB) { 5175 if (auto *Load = dyn_cast<LoadInst>(&I)) { 5176 evaluatePtrUse(Load, Load->getPointerOperand()); 5177 } else if (auto *Store = dyn_cast<StoreInst>(&I)) { 5178 evaluatePtrUse(Store, Store->getPointerOperand()); 5179 evaluatePtrUse(Store, Store->getValueOperand()); 5180 } 5181 } 5182 for (auto *I : ScalarPtrs) 5183 if (!PossibleNonScalarPtrs.count(I)) { 5184 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n"); 5185 Worklist.insert(I); 5186 } 5187 5188 // Insert the forced scalars. 5189 // FIXME: Currently widenPHIInstruction() often creates a dead vector 5190 // induction variable when the PHI user is scalarized. 5191 auto ForcedScalar = ForcedScalars.find(VF); 5192 if (ForcedScalar != ForcedScalars.end()) 5193 for (auto *I : ForcedScalar->second) 5194 Worklist.insert(I); 5195 5196 // Expand the worklist by looking through any bitcasts and getelementptr 5197 // instructions we've already identified as scalar. This is similar to the 5198 // expansion step in collectLoopUniforms(); however, here we're only 5199 // expanding to include additional bitcasts and getelementptr instructions. 5200 unsigned Idx = 0; 5201 while (Idx != Worklist.size()) { 5202 Instruction *Dst = Worklist[Idx++]; 5203 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0))) 5204 continue; 5205 auto *Src = cast<Instruction>(Dst->getOperand(0)); 5206 if (llvm::all_of(Src->users(), [&](User *U) -> bool { 5207 auto *J = cast<Instruction>(U); 5208 return !TheLoop->contains(J) || Worklist.count(J) || 5209 ((isa<LoadInst>(J) || isa<StoreInst>(J)) && 5210 isScalarUse(J, Src)); 5211 })) { 5212 Worklist.insert(Src); 5213 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); 5214 } 5215 } 5216 5217 // An induction variable will remain scalar if all users of the induction 5218 // variable and induction variable update remain scalar. 5219 for (auto &Induction : Legal->getInductionVars()) { 5220 auto *Ind = Induction.first; 5221 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 5222 5223 // If tail-folding is applied, the primary induction variable will be used 5224 // to feed a vector compare. 5225 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking()) 5226 continue; 5227 5228 // Determine if all users of the induction variable are scalar after 5229 // vectorization. 5230 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 5231 auto *I = cast<Instruction>(U); 5232 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I); 5233 }); 5234 if (!ScalarInd) 5235 continue; 5236 5237 // Determine if all users of the induction variable update instruction are 5238 // scalar after vectorization. 
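    // (Illustrative: for an IV i with update i.next = i + 1, both stay scalar
    // only if every in-loop user of i and i.next, e.g. a scalarized address
    // computation, is itself already in the worklist.)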
5239 auto ScalarIndUpdate = 5240 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 5241 auto *I = cast<Instruction>(U); 5242 return I == Ind || !TheLoop->contains(I) || Worklist.count(I); 5243 }); 5244 if (!ScalarIndUpdate) 5245 continue; 5246 5247 // The induction variable and its update instruction will remain scalar. 5248 Worklist.insert(Ind); 5249 Worklist.insert(IndUpdate); 5250 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 5251 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 5252 << "\n"); 5253 } 5254 5255 Scalars[VF].insert(Worklist.begin(), Worklist.end()); 5256 } 5257 5258 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I) const { 5259 if (!blockNeedsPredication(I->getParent())) 5260 return false; 5261 switch(I->getOpcode()) { 5262 default: 5263 break; 5264 case Instruction::Load: 5265 case Instruction::Store: { 5266 if (!Legal->isMaskRequired(I)) 5267 return false; 5268 auto *Ptr = getLoadStorePointerOperand(I); 5269 auto *Ty = getLoadStoreType(I); 5270 const Align Alignment = getLoadStoreAlignment(I); 5271 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) || 5272 TTI.isLegalMaskedGather(Ty, Alignment)) 5273 : !(isLegalMaskedStore(Ty, Ptr, Alignment) || 5274 TTI.isLegalMaskedScatter(Ty, Alignment)); 5275 } 5276 case Instruction::UDiv: 5277 case Instruction::SDiv: 5278 case Instruction::SRem: 5279 case Instruction::URem: 5280 return mayDivideByZero(*I); 5281 } 5282 return false; 5283 } 5284 5285 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( 5286 Instruction *I, ElementCount VF) { 5287 assert(isAccessInterleaved(I) && "Expecting interleaved access."); 5288 assert(getWideningDecision(I, VF) == CM_Unknown && 5289 "Decision should not be set yet."); 5290 auto *Group = getInterleavedAccessGroup(I); 5291 assert(Group && "Must have a group."); 5292 5293 // If the instruction's allocated size doesn't equal it's type size, it 5294 // requires padding and will be scalarized. 5295 auto &DL = I->getModule()->getDataLayout(); 5296 auto *ScalarTy = getLoadStoreType(I); 5297 if (hasIrregularType(ScalarTy, DL)) 5298 return false; 5299 5300 // Check if masking is required. 5301 // A Group may need masking for one of two reasons: it resides in a block that 5302 // needs predication, or it was decided to use masking to deal with gaps 5303 // (either a gap at the end of a load-access that may result in a speculative 5304 // load, or any gaps in a store-access). 5305 bool PredicatedAccessRequiresMasking = 5306 blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I); 5307 bool LoadAccessWithGapsRequiresEpilogMasking = 5308 isa<LoadInst>(I) && Group->requiresScalarEpilogue() && 5309 !isScalarEpilogueAllowed(); 5310 bool StoreAccessWithGapsRequiresMasking = 5311 isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()); 5312 if (!PredicatedAccessRequiresMasking && 5313 !LoadAccessWithGapsRequiresEpilogMasking && 5314 !StoreAccessWithGapsRequiresMasking) 5315 return true; 5316 5317 // If masked interleaving is required, we expect that the user/target had 5318 // enabled it, because otherwise it either wouldn't have been created or 5319 // it should have been invalidated by the CostModel. 
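  // One way masking arises (illustrative): a factor-2 load group where only
  // A[2*i] is accessed has a gap; the wide load would also touch A[2*i+1] and
  // may read past the end of the allocation in the last iteration, so it
  // needs either a scalar epilogue or a masked (predicated) wide load, as
  // checked above.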
5320 assert(useMaskedInterleavedAccesses(TTI) && 5321 "Masked interleave-groups for predicated accesses are not enabled."); 5322 5323 if (Group->isReverse()) 5324 return false; 5325 5326 auto *Ty = getLoadStoreType(I); 5327 const Align Alignment = getLoadStoreAlignment(I); 5328 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment) 5329 : TTI.isLegalMaskedStore(Ty, Alignment); 5330 } 5331 5332 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( 5333 Instruction *I, ElementCount VF) { 5334 // Get and ensure we have a valid memory instruction. 5335 assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction"); 5336 5337 auto *Ptr = getLoadStorePointerOperand(I); 5338 auto *ScalarTy = getLoadStoreType(I); 5339 5340 // In order to be widened, the pointer should be consecutive, first of all. 5341 if (!Legal->isConsecutivePtr(ScalarTy, Ptr)) 5342 return false; 5343 5344 // If the instruction is a store located in a predicated block, it will be 5345 // scalarized. 5346 if (isScalarWithPredication(I)) 5347 return false; 5348 5349 // If the instruction's allocated size doesn't equal it's type size, it 5350 // requires padding and will be scalarized. 5351 auto &DL = I->getModule()->getDataLayout(); 5352 if (hasIrregularType(ScalarTy, DL)) 5353 return false; 5354 5355 return true; 5356 } 5357 5358 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { 5359 // We should not collect Uniforms more than once per VF. Right now, 5360 // this function is called from collectUniformsAndScalars(), which 5361 // already does this check. Collecting Uniforms for VF=1 does not make any 5362 // sense. 5363 5364 assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() && 5365 "This function should not be visited twice for the same VF"); 5366 5367 // Visit the list of Uniforms. If we'll not find any uniform value, we'll 5368 // not analyze again. Uniforms.count(VF) will return 1. 5369 Uniforms[VF].clear(); 5370 5371 // We now know that the loop is vectorizable! 5372 // Collect instructions inside the loop that will remain uniform after 5373 // vectorization. 5374 5375 // Global values, params and instructions outside of current loop are out of 5376 // scope. 5377 auto isOutOfScope = [&](Value *V) -> bool { 5378 Instruction *I = dyn_cast<Instruction>(V); 5379 return (!I || !TheLoop->contains(I)); 5380 }; 5381 5382 SetVector<Instruction *> Worklist; 5383 BasicBlock *Latch = TheLoop->getLoopLatch(); 5384 5385 // Instructions that are scalar with predication must not be considered 5386 // uniform after vectorization, because that would create an erroneous 5387 // replicating region where only a single instance out of VF should be formed. 5388 // TODO: optimize such seldom cases if found important, see PR40816. 5389 auto addToWorklistIfAllowed = [&](Instruction *I) -> void { 5390 if (isOutOfScope(I)) { 5391 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: " 5392 << *I << "\n"); 5393 return; 5394 } 5395 if (isScalarWithPredication(I)) { 5396 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " 5397 << *I << "\n"); 5398 return; 5399 } 5400 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n"); 5401 Worklist.insert(I); 5402 }; 5403 5404 // Start with the conditional branch. If the branch condition is an 5405 // instruction contained in the loop that is only used by the branch, it is 5406 // uniform. 
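  // (Illustrative: a latch compare such as "icmp eq i64 %iv.next, %n" whose
  // only user is the backedge branch is uniform; if its result also fed,
  // say, a select in the loop body it would not be added here.)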
5407 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 5408 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) 5409 addToWorklistIfAllowed(Cmp); 5410 5411 auto isUniformDecision = [&](Instruction *I, ElementCount VF) { 5412 InstWidening WideningDecision = getWideningDecision(I, VF); 5413 assert(WideningDecision != CM_Unknown && 5414 "Widening decision should be ready at this moment"); 5415 5416 // A uniform memory op is itself uniform. We exclude uniform stores 5417 // here as they demand the last lane, not the first one. 5418 if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) { 5419 assert(WideningDecision == CM_Scalarize); 5420 return true; 5421 } 5422 5423 return (WideningDecision == CM_Widen || 5424 WideningDecision == CM_Widen_Reverse || 5425 WideningDecision == CM_Interleave); 5426 }; 5427 5428 5429 // Returns true if Ptr is the pointer operand of a memory access instruction 5430 // I, and I is known to not require scalarization. 5431 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { 5432 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); 5433 }; 5434 5435 // Holds a list of values which are known to have at least one uniform use. 5436 // Note that there may be other uses which aren't uniform. A "uniform use" 5437 // here is something which only demands lane 0 of the unrolled iterations; 5438 // it does not imply that all lanes produce the same value (e.g. this is not 5439 // the usual meaning of uniform) 5440 SetVector<Value *> HasUniformUse; 5441 5442 // Scan the loop for instructions which are either a) known to have only 5443 // lane 0 demanded or b) are uses which demand only lane 0 of their operand. 5444 for (auto *BB : TheLoop->blocks()) 5445 for (auto &I : *BB) { 5446 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) { 5447 switch (II->getIntrinsicID()) { 5448 case Intrinsic::sideeffect: 5449 case Intrinsic::experimental_noalias_scope_decl: 5450 case Intrinsic::assume: 5451 case Intrinsic::lifetime_start: 5452 case Intrinsic::lifetime_end: 5453 if (TheLoop->hasLoopInvariantOperands(&I)) 5454 addToWorklistIfAllowed(&I); 5455 break; 5456 default: 5457 break; 5458 } 5459 } 5460 5461 // ExtractValue instructions must be uniform, because the operands are 5462 // known to be loop-invariant. 5463 if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) { 5464 assert(isOutOfScope(EVI->getAggregateOperand()) && 5465 "Expected aggregate value to be loop invariant"); 5466 addToWorklistIfAllowed(EVI); 5467 continue; 5468 } 5469 5470 // If there's no pointer operand, there's nothing to do. 5471 auto *Ptr = getLoadStorePointerOperand(&I); 5472 if (!Ptr) 5473 continue; 5474 5475 // A uniform memory op is itself uniform. We exclude uniform stores 5476 // here as they demand the last lane, not the first one. 5477 if (isa<LoadInst>(I) && Legal->isUniformMemOp(I)) 5478 addToWorklistIfAllowed(&I); 5479 5480 if (isUniformDecision(&I, VF)) { 5481 assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check"); 5482 HasUniformUse.insert(Ptr); 5483 } 5484 } 5485 5486 // Add to the worklist any operands which have *only* uniform (e.g. lane 0 5487 // demanding) users. Since loops are assumed to be in LCSSA form, this 5488 // disallows uses outside the loop as well. 
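  // For example (illustrative): a GEP whose only user is a load that will be
  // widened supplies just the lane-0 address to that load, so the GEP itself
  // only needs lane 0 and is a candidate for the worklist below.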
5489 for (auto *V : HasUniformUse) { 5490 if (isOutOfScope(V)) 5491 continue; 5492 auto *I = cast<Instruction>(V); 5493 auto UsersAreMemAccesses = 5494 llvm::all_of(I->users(), [&](User *U) -> bool { 5495 return isVectorizedMemAccessUse(cast<Instruction>(U), V); 5496 }); 5497 if (UsersAreMemAccesses) 5498 addToWorklistIfAllowed(I); 5499 } 5500 5501 // Expand Worklist in topological order: whenever a new instruction 5502 // is added , its users should be already inside Worklist. It ensures 5503 // a uniform instruction will only be used by uniform instructions. 5504 unsigned idx = 0; 5505 while (idx != Worklist.size()) { 5506 Instruction *I = Worklist[idx++]; 5507 5508 for (auto OV : I->operand_values()) { 5509 // isOutOfScope operands cannot be uniform instructions. 5510 if (isOutOfScope(OV)) 5511 continue; 5512 // First order recurrence Phi's should typically be considered 5513 // non-uniform. 5514 auto *OP = dyn_cast<PHINode>(OV); 5515 if (OP && Legal->isFirstOrderRecurrence(OP)) 5516 continue; 5517 // If all the users of the operand are uniform, then add the 5518 // operand into the uniform worklist. 5519 auto *OI = cast<Instruction>(OV); 5520 if (llvm::all_of(OI->users(), [&](User *U) -> bool { 5521 auto *J = cast<Instruction>(U); 5522 return Worklist.count(J) || isVectorizedMemAccessUse(J, OI); 5523 })) 5524 addToWorklistIfAllowed(OI); 5525 } 5526 } 5527 5528 // For an instruction to be added into Worklist above, all its users inside 5529 // the loop should also be in Worklist. However, this condition cannot be 5530 // true for phi nodes that form a cyclic dependence. We must process phi 5531 // nodes separately. An induction variable will remain uniform if all users 5532 // of the induction variable and induction variable update remain uniform. 5533 // The code below handles both pointer and non-pointer induction variables. 5534 for (auto &Induction : Legal->getInductionVars()) { 5535 auto *Ind = Induction.first; 5536 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 5537 5538 // Determine if all users of the induction variable are uniform after 5539 // vectorization. 5540 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 5541 auto *I = cast<Instruction>(U); 5542 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 5543 isVectorizedMemAccessUse(I, Ind); 5544 }); 5545 if (!UniformInd) 5546 continue; 5547 5548 // Determine if all users of the induction variable update instruction are 5549 // uniform after vectorization. 5550 auto UniformIndUpdate = 5551 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 5552 auto *I = cast<Instruction>(U); 5553 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 5554 isVectorizedMemAccessUse(I, IndUpdate); 5555 }); 5556 if (!UniformIndUpdate) 5557 continue; 5558 5559 // The induction variable and its update instruction will remain uniform. 5560 addToWorklistIfAllowed(Ind); 5561 addToWorklistIfAllowed(IndUpdate); 5562 } 5563 5564 Uniforms[VF].insert(Worklist.begin(), Worklist.end()); 5565 } 5566 5567 bool LoopVectorizationCostModel::runtimeChecksRequired() { 5568 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); 5569 5570 if (Legal->getRuntimePointerChecking()->Need) { 5571 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz", 5572 "runtime pointer checks needed. 
Enable vectorization of this " 5573 "loop with '#pragma clang loop vectorize(enable)' when " 5574 "compiling with -Os/-Oz", 5575 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5576 return true; 5577 } 5578 5579 if (!PSE.getUnionPredicate().getPredicates().empty()) { 5580 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz", 5581 "runtime SCEV checks needed. Enable vectorization of this " 5582 "loop with '#pragma clang loop vectorize(enable)' when " 5583 "compiling with -Os/-Oz", 5584 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5585 return true; 5586 } 5587 5588 // FIXME: Avoid specializing for stride==1 instead of bailing out. 5589 if (!Legal->getLAI()->getSymbolicStrides().empty()) { 5590 reportVectorizationFailure("Runtime stride check for small trip count", 5591 "runtime stride == 1 checks needed. Enable vectorization of " 5592 "this loop without such check by compiling with -Os/-Oz", 5593 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5594 return true; 5595 } 5596 5597 return false; 5598 } 5599 5600 ElementCount 5601 LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) { 5602 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) 5603 return ElementCount::getScalable(0); 5604 5605 if (Hints->isScalableVectorizationDisabled()) { 5606 reportVectorizationInfo("Scalable vectorization is explicitly disabled", 5607 "ScalableVectorizationDisabled", ORE, TheLoop); 5608 return ElementCount::getScalable(0); 5609 } 5610 5611 LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n"); 5612 5613 auto MaxScalableVF = ElementCount::getScalable( 5614 std::numeric_limits<ElementCount::ScalarTy>::max()); 5615 5616 // Test that the loop-vectorizer can legalize all operations for this MaxVF. 5617 // FIXME: While for scalable vectors this is currently sufficient, this should 5618 // be replaced by a more detailed mechanism that filters out specific VFs, 5619 // instead of invalidating vectorization for a whole set of VFs based on the 5620 // MaxVF. 5621 5622 // Disable scalable vectorization if the loop contains unsupported reductions. 5623 if (!canVectorizeReductions(MaxScalableVF)) { 5624 reportVectorizationInfo( 5625 "Scalable vectorization not supported for the reduction " 5626 "operations found in this loop.", 5627 "ScalableVFUnfeasible", ORE, TheLoop); 5628 return ElementCount::getScalable(0); 5629 } 5630 5631 // Disable scalable vectorization if the loop contains any instructions 5632 // with element types not supported for scalable vectors. 5633 if (any_of(ElementTypesInLoop, [&](Type *Ty) { 5634 return !Ty->isVoidTy() && 5635 !this->TTI.isElementTypeLegalForScalableVector(Ty); 5636 })) { 5637 reportVectorizationInfo("Scalable vectorization is not supported " 5638 "for all element types found in this loop.", 5639 "ScalableVFUnfeasible", ORE, TheLoop); 5640 return ElementCount::getScalable(0); 5641 } 5642 5643 if (Legal->isSafeForAnyVectorWidth()) 5644 return MaxScalableVF; 5645 5646 // Limit MaxScalableVF by the maximum safe dependence distance. 5647 Optional<unsigned> MaxVScale = TTI.getMaxVScale(); 5648 if (!MaxVScale && TheFunction->hasFnAttribute(Attribute::VScaleRange)) { 5649 unsigned VScaleMax = TheFunction->getFnAttribute(Attribute::VScaleRange) 5650 .getVScaleRangeArgs() 5651 .second; 5652 if (VScaleMax > 0) 5653 MaxVScale = VScaleMax; 5654 } 5655 MaxScalableVF = ElementCount::getScalable( 5656 MaxVScale ? 
(MaxSafeElements / MaxVScale.getValue()) : 0); 5657 if (!MaxScalableVF) 5658 reportVectorizationInfo( 5659 "Max legal vector width too small, scalable vectorization " 5660 "unfeasible.", 5661 "ScalableVFUnfeasible", ORE, TheLoop); 5662 5663 return MaxScalableVF; 5664 } 5665 5666 FixedScalableVFPair 5667 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount, 5668 ElementCount UserVF) { 5669 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); 5670 unsigned SmallestType, WidestType; 5671 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); 5672 5673 // Get the maximum safe dependence distance in bits computed by LAA. 5674 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from 5675 // the memory accesses that is most restrictive (involved in the smallest 5676 // dependence distance). 5677 unsigned MaxSafeElements = 5678 PowerOf2Floor(Legal->getMaxSafeVectorWidthInBits() / WidestType); 5679 5680 auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements); 5681 auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements); 5682 5683 LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF 5684 << ".\n"); 5685 LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF 5686 << ".\n"); 5687 5688 // First analyze the UserVF, fall back if the UserVF should be ignored. 5689 if (UserVF) { 5690 auto MaxSafeUserVF = 5691 UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF; 5692 5693 if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) { 5694 // If `VF=vscale x N` is safe, then so is `VF=N` 5695 if (UserVF.isScalable()) 5696 return FixedScalableVFPair( 5697 ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF); 5698 else 5699 return UserVF; 5700 } 5701 5702 assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF)); 5703 5704 // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it 5705 // is better to ignore the hint and let the compiler choose a suitable VF. 5706 if (!UserVF.isScalable()) { 5707 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 5708 << " is unsafe, clamping to max safe VF=" 5709 << MaxSafeFixedVF << ".\n"); 5710 ORE->emit([&]() { 5711 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 5712 TheLoop->getStartLoc(), 5713 TheLoop->getHeader()) 5714 << "User-specified vectorization factor " 5715 << ore::NV("UserVectorizationFactor", UserVF) 5716 << " is unsafe, clamping to maximum safe vectorization factor " 5717 << ore::NV("VectorizationFactor", MaxSafeFixedVF); 5718 }); 5719 return MaxSafeFixedVF; 5720 } 5721 5722 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) { 5723 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 5724 << " is ignored because scalable vectors are not " 5725 "available.\n"); 5726 ORE->emit([&]() { 5727 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 5728 TheLoop->getStartLoc(), 5729 TheLoop->getHeader()) 5730 << "User-specified vectorization factor " 5731 << ore::NV("UserVectorizationFactor", UserVF) 5732 << " is ignored because the target does not support scalable " 5733 "vectors. The compiler will pick a more suitable value."; 5734 }); 5735 } else { 5736 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 5737 << " is unsafe. 
Ignoring scalable UserVF.\n"); 5738 ORE->emit([&]() { 5739 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 5740 TheLoop->getStartLoc(), 5741 TheLoop->getHeader()) 5742 << "User-specified vectorization factor " 5743 << ore::NV("UserVectorizationFactor", UserVF) 5744 << " is unsafe. Ignoring the hint to let the compiler pick a " 5745 "more suitable value."; 5746 }); 5747 } 5748 } 5749 5750 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType 5751 << " / " << WidestType << " bits.\n"); 5752 5753 FixedScalableVFPair Result(ElementCount::getFixed(1), 5754 ElementCount::getScalable(0)); 5755 if (auto MaxVF = getMaximizedVFForTarget(ConstTripCount, SmallestType, 5756 WidestType, MaxSafeFixedVF)) 5757 Result.FixedVF = MaxVF; 5758 5759 if (auto MaxVF = getMaximizedVFForTarget(ConstTripCount, SmallestType, 5760 WidestType, MaxSafeScalableVF)) 5761 if (MaxVF.isScalable()) { 5762 Result.ScalableVF = MaxVF; 5763 LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF 5764 << "\n"); 5765 } 5766 5767 return Result; 5768 } 5769 5770 FixedScalableVFPair 5771 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { 5772 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { 5773 // TODO: It may by useful to do since it's still likely to be dynamically 5774 // uniform if the target can skip. 5775 reportVectorizationFailure( 5776 "Not inserting runtime ptr check for divergent target", 5777 "runtime pointer checks needed. Not enabled for divergent target", 5778 "CantVersionLoopWithDivergentTarget", ORE, TheLoop); 5779 return FixedScalableVFPair::getNone(); 5780 } 5781 5782 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); 5783 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); 5784 if (TC == 1) { 5785 reportVectorizationFailure("Single iteration (non) loop", 5786 "loop trip count is one, irrelevant for vectorization", 5787 "SingleIterationLoop", ORE, TheLoop); 5788 return FixedScalableVFPair::getNone(); 5789 } 5790 5791 switch (ScalarEpilogueStatus) { 5792 case CM_ScalarEpilogueAllowed: 5793 return computeFeasibleMaxVF(TC, UserVF); 5794 case CM_ScalarEpilogueNotAllowedUsePredicate: 5795 LLVM_FALLTHROUGH; 5796 case CM_ScalarEpilogueNotNeededUsePredicate: 5797 LLVM_DEBUG( 5798 dbgs() << "LV: vector predicate hint/switch found.\n" 5799 << "LV: Not allowing scalar epilogue, creating predicated " 5800 << "vector loop.\n"); 5801 break; 5802 case CM_ScalarEpilogueNotAllowedLowTripLoop: 5803 // fallthrough as a special case of OptForSize 5804 case CM_ScalarEpilogueNotAllowedOptSize: 5805 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize) 5806 LLVM_DEBUG( 5807 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n"); 5808 else 5809 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip " 5810 << "count.\n"); 5811 5812 // Bail if runtime checks are required, which are not good when optimising 5813 // for size. 5814 if (runtimeChecksRequired()) 5815 return FixedScalableVFPair::getNone(); 5816 5817 break; 5818 } 5819 5820 // The only loops we can vectorize without a scalar epilogue, are loops with 5821 // a bottom-test and a single exiting block. We'd have to handle the fact 5822 // that not every instruction executes on the last iteration. This will 5823 // require a lane mask which varies through the vector loop body. 
(TODO)
  if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
    // If there was a tail-folding hint/switch, but we can't fold the tail by
    // masking, fallback to a vectorization with a scalar epilogue.
    if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
      LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
                           "scalar epilogue instead.\n");
      ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
      return computeFeasibleMaxVF(TC, UserVF);
    }
    return FixedScalableVFPair::getNone();
  }

  // Now try the tail folding.

  // Invalidate interleave groups that require an epilogue if we can't mask
  // the interleave-group.
  if (!useMaskedInterleavedAccesses(TTI)) {
    assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
           "No decisions should have been taken at this point");
    // Note: There is no need to invalidate any cost modeling decisions here,
    // as none were taken so far.
    InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
  }

  FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF);
  // Avoid tail folding if the trip count is known to be a multiple of any VF
  // we chose.
  // FIXME: The condition below pessimises the case for fixed-width vectors,
  // when scalable VFs are also candidates for vectorization.
  if (MaxFactors.FixedVF.isVector() && !MaxFactors.ScalableVF) {
    ElementCount MaxFixedVF = MaxFactors.FixedVF;
    assert((UserVF.isNonZero() || isPowerOf2_32(MaxFixedVF.getFixedValue())) &&
           "MaxFixedVF must be a power of 2");
    unsigned MaxVFtimesIC = UserIC ? MaxFixedVF.getFixedValue() * UserIC
                                   : MaxFixedVF.getFixedValue();
    ScalarEvolution *SE = PSE.getSE();
    const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
    const SCEV *ExitCount = SE->getAddExpr(
        BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
    const SCEV *Rem = SE->getURemExpr(
        SE->applyLoopGuards(ExitCount, TheLoop),
        SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
    if (Rem->isZero()) {
      // Accept MaxFixedVF if we do not have a tail.
      LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
      return MaxFactors;
    }
  }

  // For scalable vectors, don't use tail folding as this is currently not yet
  // supported. The code is likely to have ended up here if the tripcount is
  // low, in which case it makes sense not to use scalable vectors.
  if (MaxFactors.ScalableVF.isVector())
    MaxFactors.ScalableVF = ElementCount::getScalable(0);

  // If we don't know the precise trip count, or if the trip count that we
  // found modulo the vectorization factor is not zero, try to fold the tail
  // by masking.
  // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
  if (Legal->prepareToFoldTailByMasking()) {
    FoldTailByMasking = true;
    return MaxFactors;
  }

  // If there was a tail-folding hint/switch, but we can't fold the tail by
  // masking, fallback to a vectorization with a scalar epilogue.
5890 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { 5891 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " 5892 "scalar epilogue instead.\n"); 5893 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 5894 return MaxFactors; 5895 } 5896 5897 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) { 5898 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n"); 5899 return FixedScalableVFPair::getNone(); 5900 } 5901 5902 if (TC == 0) { 5903 reportVectorizationFailure( 5904 "Unable to calculate the loop count due to complex control flow", 5905 "unable to calculate the loop count due to complex control flow", 5906 "UnknownLoopCountComplexCFG", ORE, TheLoop); 5907 return FixedScalableVFPair::getNone(); 5908 } 5909 5910 reportVectorizationFailure( 5911 "Cannot optimize for size and vectorize at the same time.", 5912 "cannot optimize for size and vectorize at the same time. " 5913 "Enable vectorization of this loop with '#pragma clang loop " 5914 "vectorize(enable)' when compiling with -Os/-Oz", 5915 "NoTailLoopWithOptForSize", ORE, TheLoop); 5916 return FixedScalableVFPair::getNone(); 5917 } 5918 5919 ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( 5920 unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType, 5921 const ElementCount &MaxSafeVF) { 5922 bool ComputeScalableMaxVF = MaxSafeVF.isScalable(); 5923 TypeSize WidestRegister = TTI.getRegisterBitWidth( 5924 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector 5925 : TargetTransformInfo::RGK_FixedWidthVector); 5926 5927 // Convenience function to return the minimum of two ElementCounts. 5928 auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) { 5929 assert((LHS.isScalable() == RHS.isScalable()) && 5930 "Scalable flags must match"); 5931 return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS; 5932 }; 5933 5934 // Ensure MaxVF is a power of 2; the dependence distance bound may not be. 5935 // Note that both WidestRegister and WidestType may not be a powers of 2. 5936 auto MaxVectorElementCount = ElementCount::get( 5937 PowerOf2Floor(WidestRegister.getKnownMinSize() / WidestType), 5938 ComputeScalableMaxVF); 5939 MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF); 5940 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " 5941 << (MaxVectorElementCount * WidestType) << " bits.\n"); 5942 5943 if (!MaxVectorElementCount) { 5944 LLVM_DEBUG(dbgs() << "LV: The target has no " 5945 << (ComputeScalableMaxVF ? "scalable" : "fixed") 5946 << " vector registers.\n"); 5947 return ElementCount::getFixed(1); 5948 } 5949 5950 const auto TripCountEC = ElementCount::getFixed(ConstTripCount); 5951 if (ConstTripCount && 5952 ElementCount::isKnownLE(TripCountEC, MaxVectorElementCount) && 5953 isPowerOf2_32(ConstTripCount)) { 5954 // We need to clamp the VF to be the ConstTripCount. There is no point in 5955 // choosing a higher viable VF as done in the loop below. If 5956 // MaxVectorElementCount is scalable, we only fall back on a fixed VF when 5957 // the TC is less than or equal to the known number of lanes. 
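    // Illustrative example: with ConstTripCount == 8 and room for up to 16
    // lanes, there is no point in a VF wider than the trip count itself, so
    // the VF is clamped down to 8 here.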
5958 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: " 5959 << ConstTripCount << "\n"); 5960 return TripCountEC; 5961 } 5962 5963 ElementCount MaxVF = MaxVectorElementCount; 5964 if (TTI.shouldMaximizeVectorBandwidth() || 5965 (MaximizeBandwidth && isScalarEpilogueAllowed())) { 5966 auto MaxVectorElementCountMaxBW = ElementCount::get( 5967 PowerOf2Floor(WidestRegister.getKnownMinSize() / SmallestType), 5968 ComputeScalableMaxVF); 5969 MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF); 5970 5971 // Collect all viable vectorization factors larger than the default MaxVF 5972 // (i.e. MaxVectorElementCount). 5973 SmallVector<ElementCount, 8> VFs; 5974 for (ElementCount VS = MaxVectorElementCount * 2; 5975 ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2) 5976 VFs.push_back(VS); 5977 5978 // For each VF calculate its register usage. 5979 auto RUs = calculateRegisterUsage(VFs); 5980 5981 // Select the largest VF which doesn't require more registers than existing 5982 // ones. 5983 for (int i = RUs.size() - 1; i >= 0; --i) { 5984 bool Selected = true; 5985 for (auto &pair : RUs[i].MaxLocalUsers) { 5986 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5987 if (pair.second > TargetNumRegisters) 5988 Selected = false; 5989 } 5990 if (Selected) { 5991 MaxVF = VFs[i]; 5992 break; 5993 } 5994 } 5995 if (ElementCount MinVF = 5996 TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) { 5997 if (ElementCount::isKnownLT(MaxVF, MinVF)) { 5998 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF 5999 << ") with target's minimum: " << MinVF << '\n'); 6000 MaxVF = MinVF; 6001 } 6002 } 6003 } 6004 return MaxVF; 6005 } 6006 6007 bool LoopVectorizationCostModel::isMoreProfitable( 6008 const VectorizationFactor &A, const VectorizationFactor &B) const { 6009 InstructionCost CostA = A.Cost; 6010 InstructionCost CostB = B.Cost; 6011 6012 unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop); 6013 6014 if (!A.Width.isScalable() && !B.Width.isScalable() && FoldTailByMasking && 6015 MaxTripCount) { 6016 // If we are folding the tail and the trip count is a known (possibly small) 6017 // constant, the trip count will be rounded up to an integer number of 6018 // iterations. The total cost will be PerIterationCost*ceil(TripCount/VF), 6019 // which we compare directly. When not folding the tail, the total cost will 6020 // be PerIterationCost*floor(TC/VF) + Scalar remainder cost, and so is 6021 // approximated with the per-lane cost below instead of using the tripcount 6022 // as here. 6023 auto RTCostA = CostA * divideCeil(MaxTripCount, A.Width.getFixedValue()); 6024 auto RTCostB = CostB * divideCeil(MaxTripCount, B.Width.getFixedValue()); 6025 return RTCostA < RTCostB; 6026 } 6027 6028 // When set to preferred, for now assume vscale may be larger than 1, so 6029 // that scalable vectorization is slightly favorable over fixed-width 6030 // vectorization. 
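  // Illustrative example: comparing A = {vscale x 4, cost 8} with
  // B = {4, cost 8} gives 8 * 4 <= 8 * 4, so under the "preferred" hint the
  // scalable factor wins the tie; the strict '<' used further below would
  // otherwise keep the fixed-width factor.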
6031 if (Hints->isScalableVectorizationPreferred()) 6032 if (A.Width.isScalable() && !B.Width.isScalable()) 6033 return (CostA * B.Width.getKnownMinValue()) <= 6034 (CostB * A.Width.getKnownMinValue()); 6035 6036 // To avoid the need for FP division: 6037 // (CostA / A.Width) < (CostB / B.Width) 6038 // <=> (CostA * B.Width) < (CostB * A.Width) 6039 return (CostA * B.Width.getKnownMinValue()) < 6040 (CostB * A.Width.getKnownMinValue()); 6041 } 6042 6043 VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor( 6044 const ElementCountSet &VFCandidates) { 6045 InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first; 6046 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n"); 6047 assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop"); 6048 assert(VFCandidates.count(ElementCount::getFixed(1)) && 6049 "Expected Scalar VF to be a candidate"); 6050 6051 const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost); 6052 VectorizationFactor ChosenFactor = ScalarCost; 6053 6054 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; 6055 if (ForceVectorization && VFCandidates.size() > 1) { 6056 // Ignore scalar width, because the user explicitly wants vectorization. 6057 // Initialize cost to max so that VF = 2 is, at least, chosen during cost 6058 // evaluation. 6059 ChosenFactor.Cost = InstructionCost::getMax(); 6060 } 6061 6062 SmallVector<InstructionVFPair> InvalidCosts; 6063 for (const auto &i : VFCandidates) { 6064 // The cost for scalar VF=1 is already calculated, so ignore it. 6065 if (i.isScalar()) 6066 continue; 6067 6068 VectorizationCostTy C = expectedCost(i, &InvalidCosts); 6069 VectorizationFactor Candidate(i, C.first); 6070 LLVM_DEBUG( 6071 dbgs() << "LV: Vector loop of width " << i << " costs: " 6072 << (Candidate.Cost / Candidate.Width.getKnownMinValue()) 6073 << (i.isScalable() ? " (assuming a minimum vscale of 1)" : "") 6074 << ".\n"); 6075 6076 if (!C.second && !ForceVectorization) { 6077 LLVM_DEBUG( 6078 dbgs() << "LV: Not considering vector loop of width " << i 6079 << " because it will not generate any vector instructions.\n"); 6080 continue; 6081 } 6082 6083 // If profitable add it to ProfitableVF list. 6084 if (isMoreProfitable(Candidate, ScalarCost)) 6085 ProfitableVFs.push_back(Candidate); 6086 6087 if (isMoreProfitable(Candidate, ChosenFactor)) 6088 ChosenFactor = Candidate; 6089 } 6090 6091 // Emit a report of VFs with invalid costs in the loop. 6092 if (!InvalidCosts.empty()) { 6093 // Group the remarks per instruction, keeping the instruction order from 6094 // InvalidCosts. 6095 std::map<Instruction *, unsigned> Numbering; 6096 unsigned I = 0; 6097 for (auto &Pair : InvalidCosts) 6098 if (!Numbering.count(Pair.first)) 6099 Numbering[Pair.first] = I++; 6100 6101 // Sort the list, first on instruction(number) then on VF. 
6102 llvm::sort(InvalidCosts, 6103 [&Numbering](InstructionVFPair &A, InstructionVFPair &B) { 6104 if (Numbering[A.first] != Numbering[B.first]) 6105 return Numbering[A.first] < Numbering[B.first]; 6106 ElementCountComparator ECC; 6107 return ECC(A.second, B.second); 6108 }); 6109 6110 // For a list of ordered instruction-vf pairs: 6111 // [(load, vf1), (load, vf2), (store, vf1)] 6112 // Group the instructions together to emit separate remarks for: 6113 // load (vf1, vf2) 6114 // store (vf1) 6115 auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts); 6116 auto Subset = ArrayRef<InstructionVFPair>(); 6117 do { 6118 if (Subset.empty()) 6119 Subset = Tail.take_front(1); 6120 6121 Instruction *I = Subset.front().first; 6122 6123 // If the next instruction is different, or if there are no other pairs, 6124 // emit a remark for the collated subset. e.g. 6125 // [(load, vf1), (load, vf2))] 6126 // to emit: 6127 // remark: invalid costs for 'load' at VF=(vf, vf2) 6128 if (Subset == Tail || Tail[Subset.size()].first != I) { 6129 std::string OutString; 6130 raw_string_ostream OS(OutString); 6131 assert(!Subset.empty() && "Unexpected empty range"); 6132 OS << "Instruction with invalid costs prevented vectorization at VF=("; 6133 for (auto &Pair : Subset) 6134 OS << (Pair.second == Subset.front().second ? "" : ", ") 6135 << Pair.second; 6136 OS << "):"; 6137 if (auto *CI = dyn_cast<CallInst>(I)) 6138 OS << " call to " << CI->getCalledFunction()->getName(); 6139 else 6140 OS << " " << I->getOpcodeName(); 6141 OS.flush(); 6142 reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I); 6143 Tail = Tail.drop_front(Subset.size()); 6144 Subset = {}; 6145 } else 6146 // Grow the subset by one element 6147 Subset = Tail.take_front(Subset.size() + 1); 6148 } while (!Tail.empty()); 6149 } 6150 6151 if (!EnableCondStoresVectorization && NumPredStores) { 6152 reportVectorizationFailure("There are conditional stores.", 6153 "store that is conditionally executed prevents vectorization", 6154 "ConditionalStore", ORE, TheLoop); 6155 ChosenFactor = ScalarCost; 6156 } 6157 6158 LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() && 6159 ChosenFactor.Cost >= ScalarCost.Cost) dbgs() 6160 << "LV: Vectorization seems to be not beneficial, " 6161 << "but was forced by a user.\n"); 6162 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n"); 6163 return ChosenFactor; 6164 } 6165 6166 bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization( 6167 const Loop &L, ElementCount VF) const { 6168 // Cross iteration phis such as reductions need special handling and are 6169 // currently unsupported. 6170 if (any_of(L.getHeader()->phis(), [&](PHINode &Phi) { 6171 return Legal->isFirstOrderRecurrence(&Phi) || 6172 Legal->isReductionVariable(&Phi); 6173 })) 6174 return false; 6175 6176 // Phis with uses outside of the loop require special handling and are 6177 // currently unsupported. 6178 for (auto &Entry : Legal->getInductionVars()) { 6179 // Look for uses of the value of the induction at the last iteration. 6180 Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch()); 6181 for (User *U : PostInc->users()) 6182 if (!L.contains(cast<Instruction>(U))) 6183 return false; 6184 // Look for uses of penultimate value of the induction. 6185 for (User *U : Entry.first->users()) 6186 if (!L.contains(cast<Instruction>(U))) 6187 return false; 6188 } 6189 6190 // Induction variables that are widened require special handling that is 6191 // currently not supported. 
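  // For example (illustrative): an induction that is widened into per-lane
  // vector values, rather than kept as a single scalar index, would need its
  // state re-created for the epilogue loop; that handling is missing, so such
  // loops are rejected below.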
  if (any_of(Legal->getInductionVars(), [&](auto &Entry) {
        return !(this->isScalarAfterVectorization(Entry.first, VF) ||
                 this->isProfitableToScalarize(Entry.first, VF));
      }))
    return false;

  // Epilogue vectorization code has not been audited to ensure it handles
  // non-latch exits properly. It may be fine, but it needs to be audited and
  // tested.
  if (L.getExitingBlock() != L.getLoopLatch())
    return false;

  return true;
}

bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
    const ElementCount VF) const {
  // FIXME: We need a much better cost-model to take different parameters such
  // as register pressure, code size increase and cost of extra branches into
  // account. For now we apply a very crude heuristic and only consider loops
  // with vectorization factors larger than a certain value.
  // We also consider epilogue vectorization unprofitable for targets that
  // don't consider interleaving beneficial (e.g. MVE).
  if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1)
    return false;
  if (VF.getFixedValue() >= EpilogueVectorizationMinVF)
    return true;
  return false;
}

VectorizationFactor
LoopVectorizationCostModel::selectEpilogueVectorizationFactor(
    const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) {
  VectorizationFactor Result = VectorizationFactor::Disabled();
  if (!EnableEpilogueVectorization) {
    LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";);
    return Result;
  }

  if (!isScalarEpilogueAllowed()) {
    LLVM_DEBUG(
        dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is "
                  "allowed.\n";);
    return Result;
  }

  // FIXME: This can be fixed for scalable vectors later, because at this stage
  // the LoopVectorizer will only consider vectorizing a loop with scalable
  // vectors when the loop has a hint to enable vectorization for a given VF.
  if (MainLoopVF.isScalable()) {
    LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization for scalable vectors not "
                         "yet supported.\n");
    return Result;
  }

  // Not really a cost consideration, but check for unsupported cases here to
  // simplify the logic.
6249 if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) { 6250 LLVM_DEBUG( 6251 dbgs() << "LEV: Unable to vectorize epilogue because the loop is " 6252 "not a supported candidate.\n";); 6253 return Result; 6254 } 6255 6256 if (EpilogueVectorizationForceVF > 1) { 6257 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";); 6258 ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF); 6259 if (LVP.hasPlanWithVFs({MainLoopVF, ForcedEC})) 6260 return {ForcedEC, 0}; 6261 else { 6262 LLVM_DEBUG( 6263 dbgs() 6264 << "LEV: Epilogue vectorization forced factor is not viable.\n";); 6265 return Result; 6266 } 6267 } 6268 6269 if (TheLoop->getHeader()->getParent()->hasOptSize() || 6270 TheLoop->getHeader()->getParent()->hasMinSize()) { 6271 LLVM_DEBUG( 6272 dbgs() 6273 << "LEV: Epilogue vectorization skipped due to opt for size.\n";); 6274 return Result; 6275 } 6276 6277 if (!isEpilogueVectorizationProfitable(MainLoopVF)) 6278 return Result; 6279 6280 for (auto &NextVF : ProfitableVFs) 6281 if (ElementCount::isKnownLT(NextVF.Width, MainLoopVF) && 6282 (Result.Width.getFixedValue() == 1 || 6283 isMoreProfitable(NextVF, Result)) && 6284 LVP.hasPlanWithVFs({MainLoopVF, NextVF.Width})) 6285 Result = NextVF; 6286 6287 if (Result != VectorizationFactor::Disabled()) 6288 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = " 6289 << Result.Width.getFixedValue() << "\n";); 6290 return Result; 6291 } 6292 6293 std::pair<unsigned, unsigned> 6294 LoopVectorizationCostModel::getSmallestAndWidestTypes() { 6295 unsigned MinWidth = -1U; 6296 unsigned MaxWidth = 8; 6297 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 6298 for (Type *T : ElementTypesInLoop) { 6299 MinWidth = std::min<unsigned>( 6300 MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize()); 6301 MaxWidth = std::max<unsigned>( 6302 MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize()); 6303 } 6304 return {MinWidth, MaxWidth}; 6305 } 6306 6307 void LoopVectorizationCostModel::collectElementTypesForWidening() { 6308 ElementTypesInLoop.clear(); 6309 // For each block. 6310 for (BasicBlock *BB : TheLoop->blocks()) { 6311 // For each instruction in the loop. 6312 for (Instruction &I : BB->instructionsWithoutDebug()) { 6313 Type *T = I.getType(); 6314 6315 // Skip ignored values. 6316 if (ValuesToIgnore.count(&I)) 6317 continue; 6318 6319 // Only examine Loads, Stores and PHINodes. 6320 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I)) 6321 continue; 6322 6323 // Examine PHI nodes that are reduction variables. Update the type to 6324 // account for the recurrence type. 6325 if (auto *PN = dyn_cast<PHINode>(&I)) { 6326 if (!Legal->isReductionVariable(PN)) 6327 continue; 6328 const RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[PN]; 6329 if (PreferInLoopReductions || useOrderedReductions(RdxDesc) || 6330 TTI.preferInLoopReduction(RdxDesc.getOpcode(), 6331 RdxDesc.getRecurrenceType(), 6332 TargetTransformInfo::ReductionFlags())) 6333 continue; 6334 T = RdxDesc.getRecurrenceType(); 6335 } 6336 6337 // Examine the stored values. 6338 if (auto *ST = dyn_cast<StoreInst>(&I)) 6339 T = ST->getValueOperand()->getType(); 6340 6341 // Ignore loaded pointer types and stored pointer types that are not 6342 // vectorizable. 6343 // 6344 // FIXME: The check here attempts to predict whether a load or store will 6345 // be vectorized. We only know this for certain after a VF has 6346 // been selected. 
Here, we assume that if an access can be 6347 // vectorized, it will be. We should also look at extending this 6348 // optimization to non-pointer types. 6349 // 6350 if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) && 6351 !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I)) 6352 continue; 6353 6354 ElementTypesInLoop.insert(T); 6355 } 6356 } 6357 } 6358 6359 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, 6360 unsigned LoopCost) { 6361 // -- The interleave heuristics -- 6362 // We interleave the loop in order to expose ILP and reduce the loop overhead. 6363 // There are many micro-architectural considerations that we can't predict 6364 // at this level. For example, frontend pressure (on decode or fetch) due to 6365 // code size, or the number and capabilities of the execution ports. 6366 // 6367 // We use the following heuristics to select the interleave count: 6368 // 1. If the code has reductions, then we interleave to break the cross 6369 // iteration dependency. 6370 // 2. If the loop is really small, then we interleave to reduce the loop 6371 // overhead. 6372 // 3. We don't interleave if we think that we will spill registers to memory 6373 // due to the increased register pressure. 6374 6375 if (!isScalarEpilogueAllowed()) 6376 return 1; 6377 6378 // We used the distance for the interleave count. 6379 if (Legal->getMaxSafeDepDistBytes() != -1U) 6380 return 1; 6381 6382 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); 6383 const bool HasReductions = !Legal->getReductionVars().empty(); 6384 // Do not interleave loops with a relatively small known or estimated trip 6385 // count. But we will interleave when InterleaveSmallLoopScalarReduction is 6386 // enabled, and the code has scalar reductions(HasReductions && VF = 1), 6387 // because with the above conditions interleaving can expose ILP and break 6388 // cross iteration dependences for reductions. 6389 if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) && 6390 !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar())) 6391 return 1; 6392 6393 RegisterUsage R = calculateRegisterUsage({VF})[0]; 6394 // We divide by these constants so assume that we have at least one 6395 // instruction that uses at least one register. 6396 for (auto& pair : R.MaxLocalUsers) { 6397 pair.second = std::max(pair.second, 1U); 6398 } 6399 6400 // We calculate the interleave count using the following formula. 6401 // Subtract the number of loop invariants from the number of available 6402 // registers. These registers are used by all of the interleaved instances. 6403 // Next, divide the remaining registers by the number of registers that is 6404 // required by the loop, in order to estimate how many parallel instances 6405 // fit without causing spills. All of this is rounded down if necessary to be 6406 // a power of two. We want power of two interleave count to simplify any 6407 // addressing operations or alignment considerations. 6408 // We also want power of two interleave counts to ensure that the induction 6409 // variable of the vector loop wraps to zero, when tail is folded by masking; 6410 // this currently happens when OptForSize, in which case IC is set to 1 above. 
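  // Illustrative example (hypothetical numbers): with 32 registers in a
  // class, 2 of them tied up by loop-invariant values and at most 10 values
  // live at once, the budget is (32 - 2) / 10 = 3 concurrent copies, rounded
  // down to the power of two 2 by PowerOf2Floor below.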
6411 unsigned IC = UINT_MAX; 6412 6413 for (auto& pair : R.MaxLocalUsers) { 6414 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 6415 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters 6416 << " registers of " 6417 << TTI.getRegisterClassName(pair.first) << " register class\n"); 6418 if (VF.isScalar()) { 6419 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) 6420 TargetNumRegisters = ForceTargetNumScalarRegs; 6421 } else { 6422 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) 6423 TargetNumRegisters = ForceTargetNumVectorRegs; 6424 } 6425 unsigned MaxLocalUsers = pair.second; 6426 unsigned LoopInvariantRegs = 0; 6427 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) 6428 LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; 6429 6430 unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers); 6431 // Don't count the induction variable as interleaved. 6432 if (EnableIndVarRegisterHeur) { 6433 TmpIC = 6434 PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) / 6435 std::max(1U, (MaxLocalUsers - 1))); 6436 } 6437 6438 IC = std::min(IC, TmpIC); 6439 } 6440 6441 // Clamp the interleave ranges to reasonable counts. 6442 unsigned MaxInterleaveCount = 6443 TTI.getMaxInterleaveFactor(VF.getKnownMinValue()); 6444 6445 // Check if the user has overridden the max. 6446 if (VF.isScalar()) { 6447 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) 6448 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; 6449 } else { 6450 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) 6451 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; 6452 } 6453 6454 // If trip count is known or estimated compile time constant, limit the 6455 // interleave count to be less than the trip count divided by VF, provided it 6456 // is at least 1. 6457 // 6458 // For scalable vectors we can't know if interleaving is beneficial. It may 6459 // not be beneficial for small loops if none of the lanes in the second vector 6460 // iterations is enabled. However, for larger loops, there is likely to be a 6461 // similar benefit as for fixed-width vectors. For now, we choose to leave 6462 // the InterleaveCount as if vscale is '1', although if some information about 6463 // the vector is known (e.g. min vector size), we can make a better decision. 6464 if (BestKnownTC) { 6465 MaxInterleaveCount = 6466 std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount); 6467 // Make sure MaxInterleaveCount is greater than 0. 6468 MaxInterleaveCount = std::max(1u, MaxInterleaveCount); 6469 } 6470 6471 assert(MaxInterleaveCount > 0 && 6472 "Maximum interleave count must be greater than 0"); 6473 6474 // Clamp the calculated IC to be between the 1 and the max interleave count 6475 // that the target and trip count allows. 6476 if (IC > MaxInterleaveCount) 6477 IC = MaxInterleaveCount; 6478 else 6479 // Make sure IC is greater than 0. 6480 IC = std::max(1u, IC); 6481 6482 assert(IC > 0 && "Interleave count must be greater than 0."); 6483 6484 // If we did not calculate the cost for VF (because the user selected the VF) 6485 // then we calculate the cost of VF here. 
6486 if (LoopCost == 0) { 6487 InstructionCost C = expectedCost(VF).first; 6488 assert(C.isValid() && "Expected to have chosen a VF with valid cost"); 6489 LoopCost = *C.getValue(); 6490 } 6491 6492 assert(LoopCost && "Non-zero loop cost expected"); 6493 6494 // Interleave if we vectorized this loop and there is a reduction that could 6495 // benefit from interleaving. 6496 if (VF.isVector() && HasReductions) { 6497 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); 6498 return IC; 6499 } 6500 6501 // Note that if we've already vectorized the loop we will have done the 6502 // runtime check and so interleaving won't require further checks. 6503 bool InterleavingRequiresRuntimePointerCheck = 6504 (VF.isScalar() && Legal->getRuntimePointerChecking()->Need); 6505 6506 // We want to interleave small loops in order to reduce the loop overhead and 6507 // potentially expose ILP opportunities. 6508 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n' 6509 << "LV: IC is " << IC << '\n' 6510 << "LV: VF is " << VF << '\n'); 6511 const bool AggressivelyInterleaveReductions = 6512 TTI.enableAggressiveInterleaving(HasReductions); 6513 if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) { 6514 // We assume that the cost overhead is 1 and we use the cost model 6515 // to estimate the cost of the loop and interleave until the cost of the 6516 // loop overhead is about 5% of the cost of the loop. 6517 unsigned SmallIC = 6518 std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost)); 6519 6520 // Interleave until store/load ports (estimated by max interleave count) are 6521 // saturated. 6522 unsigned NumStores = Legal->getNumStores(); 6523 unsigned NumLoads = Legal->getNumLoads(); 6524 unsigned StoresIC = IC / (NumStores ? NumStores : 1); 6525 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1); 6526 6527 // There is little point in interleaving for reductions containing selects 6528 // and compares when VF=1 since it may just create more overhead than it's 6529 // worth for loops with small trip counts. This is because we still have to 6530 // do the final reduction after the loop. 6531 bool HasSelectCmpReductions = 6532 HasReductions && 6533 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 6534 const RecurrenceDescriptor &RdxDesc = Reduction.second; 6535 return RecurrenceDescriptor::isSelectCmpRecurrenceKind( 6536 RdxDesc.getRecurrenceKind()); 6537 }); 6538 if (HasSelectCmpReductions) { 6539 LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n"); 6540 return 1; 6541 } 6542 6543 // If we have a scalar reduction (vector reductions are already dealt with 6544 // by this point), we can increase the critical path length if the loop 6545 // we're interleaving is inside another loop. For tree-wise reductions 6546 // set the limit to 2, and for ordered reductions it's best to disable 6547 // interleaving entirely. 
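    // Illustrative example: an inner loop carrying a scalar floating-point
    // sum inside an outer loop would, if interleaved by say 8, pay for a
    // longer cross-copy reduction on every outer-loop iteration; the clamp to
    // MaxNestedScalarReductionIC below bounds that cost, and ordered
    // reductions are not interleaved at all.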
    if (HasReductions && TheLoop->getLoopDepth() > 1) {
      bool HasOrderedReductions =
          any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
            const RecurrenceDescriptor &RdxDesc = Reduction.second;
            return RdxDesc.isOrdered();
          });
      if (HasOrderedReductions) {
        LLVM_DEBUG(
            dbgs() << "LV: Not interleaving scalar ordered reductions.\n");
        return 1;
      }

      unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
      SmallIC = std::min(SmallIC, F);
      StoresIC = std::min(StoresIC, F);
      LoadsIC = std::min(LoadsIC, F);
    }

    if (EnableLoadStoreRuntimeInterleave &&
        std::max(StoresIC, LoadsIC) > SmallIC) {
      LLVM_DEBUG(
          dbgs() << "LV: Interleaving to saturate store or load ports.\n");
      return std::max(StoresIC, LoadsIC);
    }

    // If there are scalar reductions and TTI has enabled aggressive
    // interleaving for reductions, we will interleave to expose ILP.
    if (InterleaveSmallLoopScalarReduction && VF.isScalar() &&
        AggressivelyInterleaveReductions) {
      LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
      // Interleave no less than SmallIC but not as aggressive as the normal IC
      // to satisfy the rare situation when resources are too limited.
      return std::max(IC / 2, SmallIC);
    } else {
      LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
      return SmallIC;
    }
  }

  // Interleave if this is a large loop (small loops are already dealt with by
  // this point) that could benefit from interleaving.
  if (AggressivelyInterleaveReductions) {
    LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
    return IC;
  }

  LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
  return 1;
}

SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
  // This function calculates the register usage by measuring the highest
  // number of values that are alive at a single location. Obviously, this is
  // a very rough estimation. We scan the loop in topological order and
  // assign a number to each instruction. We use RPO to ensure that defs are
  // met before their users. We assume that each instruction that has in-loop
  // users starts an interval. We record every time that an in-loop value is
  // used, so we have a list of the first and last occurrences of each
  // instruction. Next, we transpose this data structure into a multi map that
  // holds the list of intervals that *end* at a specific location. This multi
  // map allows us to perform a linear search. We scan the instructions
  // linearly and record each time that a new interval starts, by placing it
  // in a set. If we find this value in the multi-map then we remove it from
  // the set. The max register usage is the maximum size of the set.
  // We also search for instructions that are defined outside the loop, but
  // are used inside the loop. We need this number separately from the
  // max-interval usage number because when we unroll, loop-invariant values
  // do not take more registers.
  LoopBlocksDFS DFS(TheLoop);
  DFS.perform(LI);

  RegisterUsage RU;

  // Each 'key' in the map opens a new interval. The values
  // of the map are the index of the 'last seen' usage of the
  // instruction that is the key.
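  // Illustrative example: if %a is defined at index 0 and last used at index
  // 3, while %b is defined at index 1 and last used at index 2, their live
  // ranges overlap around indices 1-2, so the estimate counts two
  // simultaneously live values for that register class at that point.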
6625 using IntervalMap = DenseMap<Instruction *, unsigned>; 6626 6627 // Maps instruction to its index. 6628 SmallVector<Instruction *, 64> IdxToInstr; 6629 // Marks the end of each interval. 6630 IntervalMap EndPoint; 6631 // Saves the list of instruction indices that are used in the loop. 6632 SmallPtrSet<Instruction *, 8> Ends; 6633 // Saves the list of values that are used in the loop but are 6634 // defined outside the loop, such as arguments and constants. 6635 SmallPtrSet<Value *, 8> LoopInvariants; 6636 6637 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 6638 for (Instruction &I : BB->instructionsWithoutDebug()) { 6639 IdxToInstr.push_back(&I); 6640 6641 // Save the end location of each USE. 6642 for (Value *U : I.operands()) { 6643 auto *Instr = dyn_cast<Instruction>(U); 6644 6645 // Ignore non-instruction values such as arguments, constants, etc. 6646 if (!Instr) 6647 continue; 6648 6649 // If this instruction is outside the loop then record it and continue. 6650 if (!TheLoop->contains(Instr)) { 6651 LoopInvariants.insert(Instr); 6652 continue; 6653 } 6654 6655 // Overwrite previous end points. 6656 EndPoint[Instr] = IdxToInstr.size(); 6657 Ends.insert(Instr); 6658 } 6659 } 6660 } 6661 6662 // Saves the list of intervals that end with the index in 'key'. 6663 using InstrList = SmallVector<Instruction *, 2>; 6664 DenseMap<unsigned, InstrList> TransposeEnds; 6665 6666 // Transpose the EndPoints to a list of values that end at each index. 6667 for (auto &Interval : EndPoint) 6668 TransposeEnds[Interval.second].push_back(Interval.first); 6669 6670 SmallPtrSet<Instruction *, 8> OpenIntervals; 6671 SmallVector<RegisterUsage, 8> RUs(VFs.size()); 6672 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size()); 6673 6674 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); 6675 6676 // A lambda that gets the register usage for the given type and VF. 6677 const auto &TTICapture = TTI; 6678 auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned { 6679 if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty)) 6680 return 0; 6681 InstructionCost::CostType RegUsage = 6682 *TTICapture.getRegUsageForType(VectorType::get(Ty, VF)).getValue(); 6683 assert(RegUsage >= 0 && RegUsage <= std::numeric_limits<unsigned>::max() && 6684 "Nonsensical values for register usage."); 6685 return RegUsage; 6686 }; 6687 6688 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { 6689 Instruction *I = IdxToInstr[i]; 6690 6691 // Remove all of the instructions that end at this location. 6692 InstrList &List = TransposeEnds[i]; 6693 for (Instruction *ToRemove : List) 6694 OpenIntervals.erase(ToRemove); 6695 6696 // Ignore instructions that are never used within the loop. 6697 if (!Ends.count(I)) 6698 continue; 6699 6700 // Skip ignored values. 6701 if (ValuesToIgnore.count(I)) 6702 continue; 6703 6704 // For each VF find the maximum usage of registers. 6705 for (unsigned j = 0, e = VFs.size(); j < e; ++j) { 6706 // Count the number of live intervals. 6707 SmallMapVector<unsigned, unsigned, 4> RegUsage; 6708 6709 if (VFs[j].isScalar()) { 6710 for (auto Inst : OpenIntervals) { 6711 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6712 if (RegUsage.find(ClassID) == RegUsage.end()) 6713 RegUsage[ClassID] = 1; 6714 else 6715 RegUsage[ClassID] += 1; 6716 } 6717 } else { 6718 collectUniformsAndScalars(VFs[j]); 6719 for (auto Inst : OpenIntervals) { 6720 // Skip ignored values for VF > 1. 
6721 if (VecValuesToIgnore.count(Inst)) 6722 continue; 6723 if (isScalarAfterVectorization(Inst, VFs[j])) { 6724 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6725 if (RegUsage.find(ClassID) == RegUsage.end()) 6726 RegUsage[ClassID] = 1; 6727 else 6728 RegUsage[ClassID] += 1; 6729 } else { 6730 unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType()); 6731 if (RegUsage.find(ClassID) == RegUsage.end()) 6732 RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]); 6733 else 6734 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); 6735 } 6736 } 6737 } 6738 6739 for (auto& pair : RegUsage) { 6740 if (MaxUsages[j].find(pair.first) != MaxUsages[j].end()) 6741 MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second); 6742 else 6743 MaxUsages[j][pair.first] = pair.second; 6744 } 6745 } 6746 6747 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " 6748 << OpenIntervals.size() << '\n'); 6749 6750 // Add the current instruction to the list of open intervals. 6751 OpenIntervals.insert(I); 6752 } 6753 6754 for (unsigned i = 0, e = VFs.size(); i < e; ++i) { 6755 SmallMapVector<unsigned, unsigned, 4> Invariant; 6756 6757 for (auto Inst : LoopInvariants) { 6758 unsigned Usage = 6759 VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]); 6760 unsigned ClassID = 6761 TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType()); 6762 if (Invariant.find(ClassID) == Invariant.end()) 6763 Invariant[ClassID] = Usage; 6764 else 6765 Invariant[ClassID] += Usage; 6766 } 6767 6768 LLVM_DEBUG({ 6769 dbgs() << "LV(REG): VF = " << VFs[i] << '\n'; 6770 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size() 6771 << " item\n"; 6772 for (const auto &pair : MaxUsages[i]) { 6773 dbgs() << "LV(REG): RegisterClass: " 6774 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6775 << " registers\n"; 6776 } 6777 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size() 6778 << " item\n"; 6779 for (const auto &pair : Invariant) { 6780 dbgs() << "LV(REG): RegisterClass: " 6781 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6782 << " registers\n"; 6783 } 6784 }); 6785 6786 RU.LoopInvariantRegs = Invariant; 6787 RU.MaxLocalUsers = MaxUsages[i]; 6788 RUs[i] = RU; 6789 } 6790 6791 return RUs; 6792 } 6793 6794 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){ 6795 // TODO: Cost model for emulated masked load/store is completely 6796 // broken. This hack guides the cost model to use an artificially 6797 // high enough value to practically disable vectorization with such 6798 // operations, except where previously deployed legality hack allowed 6799 // using very low cost values. This is to avoid regressions coming simply 6800 // from moving "masked load/store" check from legality to cost model. 6801 // Masked Load/Gather emulation was previously never allowed. 6802 // Limited number of Masked Store/Scatter emulation was allowed. 6803 assert(isPredicatedInst(I) && 6804 "Expecting a scalar emulated instruction"); 6805 return isa<LoadInst>(I) || 6806 (isa<StoreInst>(I) && 6807 NumPredStores > NumberOfStoresToPredicate); 6808 } 6809 6810 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) { 6811 // If we aren't vectorizing the loop, or if we've already collected the 6812 // instructions to scalarize, there's nothing to do. Collection may already 6813 // have occurred if we have a user-selected VF and are now computing the 6814 // expected cost for interleaving. 
6815 if (VF.isScalar() || VF.isZero() || 6816 InstsToScalarize.find(VF) != InstsToScalarize.end()) 6817 return; 6818 6819 // Initialize a mapping for VF in InstsToScalarize. If we find that it's 6820 // not profitable to scalarize any instructions, the presence of VF in the 6821 // map will indicate that we've analyzed it already. 6822 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF]; 6823 6824 // Find all the instructions that are scalar with predication in the loop and 6825 // determine if it would be better to not if-convert the blocks they are in. 6826 // If so, we also record the instructions to scalarize. 6827 for (BasicBlock *BB : TheLoop->blocks()) { 6828 if (!blockNeedsPredication(BB)) 6829 continue; 6830 for (Instruction &I : *BB) 6831 if (isScalarWithPredication(&I)) { 6832 ScalarCostsTy ScalarCosts; 6833 // Do not apply discount if scalable, because that would lead to 6834 // invalid scalarization costs. 6835 // Do not apply discount logic if hacked cost is needed 6836 // for emulated masked memrefs. 6837 if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I) && 6838 computePredInstDiscount(&I, ScalarCosts, VF) >= 0) 6839 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end()); 6840 // Remember that BB will remain after vectorization. 6841 PredicatedBBsAfterVectorization.insert(BB); 6842 } 6843 } 6844 } 6845 6846 int LoopVectorizationCostModel::computePredInstDiscount( 6847 Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) { 6848 assert(!isUniformAfterVectorization(PredInst, VF) && 6849 "Instruction marked uniform-after-vectorization will be predicated"); 6850 6851 // Initialize the discount to zero, meaning that the scalar version and the 6852 // vector version cost the same. 6853 InstructionCost Discount = 0; 6854 6855 // Holds instructions to analyze. The instructions we visit are mapped in 6856 // ScalarCosts. Those instructions are the ones that would be scalarized if 6857 // we find that the scalar version costs less. 6858 SmallVector<Instruction *, 8> Worklist; 6859 6860 // Returns true if the given instruction can be scalarized. 6861 auto canBeScalarized = [&](Instruction *I) -> bool { 6862 // We only attempt to scalarize instructions forming a single-use chain 6863 // from the original predicated block that would otherwise be vectorized. 6864 // Although not strictly necessary, we give up on instructions we know will 6865 // already be scalar to avoid traversing chains that are unlikely to be 6866 // beneficial. 6867 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() || 6868 isScalarAfterVectorization(I, VF)) 6869 return false; 6870 6871 // If the instruction is scalar with predication, it will be analyzed 6872 // separately. We ignore it within the context of PredInst. 6873 if (isScalarWithPredication(I)) 6874 return false; 6875 6876 // If any of the instruction's operands are uniform after vectorization, 6877 // the instruction cannot be scalarized. This prevents, for example, a 6878 // masked load from being scalarized. 6879 // 6880 // We assume we will only emit a value for lane zero of an instruction 6881 // marked uniform after vectorization, rather than VF identical values. 6882 // Thus, if we scalarize an instruction that uses a uniform, we would 6883 // create uses of values corresponding to the lanes we aren't emitting code 6884 // for. This behavior can be changed by allowing getScalarValue to clone 6885 // the lane zero values for uniforms rather than asserting.
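// For example (an illustrative sketch; %gep is a hypothetical value): if an
// instruction in the chain uses a GEP %gep that is uniform after
// vectorization, only the lane-zero copy of %gep would exist, so scalarizing
// that user for lanes 1..VF-1 would reference values that are never emitted;
// the operand check below therefore rejects such instructions.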
6886 for (Use &U : I->operands()) 6887 if (auto *J = dyn_cast<Instruction>(U.get())) 6888 if (isUniformAfterVectorization(J, VF)) 6889 return false; 6890 6891 // Otherwise, we can scalarize the instruction. 6892 return true; 6893 }; 6894 6895 // Compute the expected cost discount from scalarizing the entire expression 6896 // feeding the predicated instruction. We currently only consider expressions 6897 // that are single-use instruction chains. 6898 Worklist.push_back(PredInst); 6899 while (!Worklist.empty()) { 6900 Instruction *I = Worklist.pop_back_val(); 6901 6902 // If we've already analyzed the instruction, there's nothing to do. 6903 if (ScalarCosts.find(I) != ScalarCosts.end()) 6904 continue; 6905 6906 // Compute the cost of the vector instruction. Note that this cost already 6907 // includes the scalarization overhead of the predicated instruction. 6908 InstructionCost VectorCost = getInstructionCost(I, VF).first; 6909 6910 // Compute the cost of the scalarized instruction. This cost is the cost of 6911 // the instruction as if it wasn't if-converted and instead remained in the 6912 // predicated block. We will scale this cost by block probability after 6913 // computing the scalarization overhead. 6914 InstructionCost ScalarCost = 6915 VF.getFixedValue() * 6916 getInstructionCost(I, ElementCount::getFixed(1)).first; 6917 6918 // Compute the scalarization overhead of needed insertelement instructions 6919 // and phi nodes. 6920 if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) { 6921 ScalarCost += TTI.getScalarizationOverhead( 6922 cast<VectorType>(ToVectorTy(I->getType(), VF)), 6923 APInt::getAllOnes(VF.getFixedValue()), true, false); 6924 ScalarCost += 6925 VF.getFixedValue() * 6926 TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput); 6927 } 6928 6929 // Compute the scalarization overhead of needed extractelement 6930 // instructions. For each of the instruction's operands, if the operand can 6931 // be scalarized, add it to the worklist; otherwise, account for the 6932 // overhead. 6933 for (Use &U : I->operands()) 6934 if (auto *J = dyn_cast<Instruction>(U.get())) { 6935 assert(VectorType::isValidElementType(J->getType()) && 6936 "Instruction has non-scalar type"); 6937 if (canBeScalarized(J)) 6938 Worklist.push_back(J); 6939 else if (needsExtract(J, VF)) { 6940 ScalarCost += TTI.getScalarizationOverhead( 6941 cast<VectorType>(ToVectorTy(J->getType(), VF)), 6942 APInt::getAllOnes(VF.getFixedValue()), false, true); 6943 } 6944 } 6945 6946 // Scale the total scalar cost by block probability. 6947 ScalarCost /= getReciprocalPredBlockProb(); 6948 6949 // Compute the discount. A non-negative discount means the vector version 6950 // of the instruction costs more, and scalarizing would be beneficial. 6951 Discount += VectorCost - ScalarCost; 6952 ScalarCosts[I] = ScalarCost; 6953 } 6954 6955 return *Discount.getValue(); 6956 } 6957 6958 LoopVectorizationCostModel::VectorizationCostTy 6959 LoopVectorizationCostModel::expectedCost( 6960 ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) { 6961 VectorizationCostTy Cost; 6962 6963 // For each block. 6964 for (BasicBlock *BB : TheLoop->blocks()) { 6965 VectorizationCostTy BlockCost; 6966 6967 // For each instruction in the old loop. 6968 for (Instruction &I : BB->instructionsWithoutDebug()) { 6969 // Skip ignored values. 
6970 if (ValuesToIgnore.count(&I) || 6971 (VF.isVector() && VecValuesToIgnore.count(&I))) 6972 continue; 6973 6974 VectorizationCostTy C = getInstructionCost(&I, VF); 6975 6976 // Check if we should override the cost. 6977 if (C.first.isValid() && 6978 ForceTargetInstructionCost.getNumOccurrences() > 0) 6979 C.first = InstructionCost(ForceTargetInstructionCost); 6980 6981 // Keep a list of instructions with invalid costs. 6982 if (Invalid && !C.first.isValid()) 6983 Invalid->emplace_back(&I, VF); 6984 6985 BlockCost.first += C.first; 6986 BlockCost.second |= C.second; 6987 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first 6988 << " for VF " << VF << " For instruction: " << I 6989 << '\n'); 6990 } 6991 6992 // If we are vectorizing a predicated block, it will have been 6993 // if-converted. This means that the block's instructions (aside from 6994 // stores and instructions that may divide by zero) will now be 6995 // unconditionally executed. For the scalar case, we may not always execute 6996 // the predicated block, if it is an if-else block. Thus, scale the block's 6997 // cost by the probability of executing it. blockNeedsPredication from 6998 // Legal is used so as to not include all blocks in tail folded loops. 6999 if (VF.isScalar() && Legal->blockNeedsPredication(BB)) 7000 BlockCost.first /= getReciprocalPredBlockProb(); 7001 7002 Cost.first += BlockCost.first; 7003 Cost.second |= BlockCost.second; 7004 } 7005 7006 return Cost; 7007 } 7008 7009 /// Gets Address Access SCEV after verifying that the access pattern 7010 /// is loop invariant except the induction variable dependence. 7011 /// 7012 /// This SCEV can be sent to the Target in order to estimate the address 7013 /// calculation cost. 7014 static const SCEV *getAddressAccessSCEV( 7015 Value *Ptr, 7016 LoopVectorizationLegality *Legal, 7017 PredicatedScalarEvolution &PSE, 7018 const Loop *TheLoop) { 7019 7020 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); 7021 if (!Gep) 7022 return nullptr; 7023 7024 // We are looking for a gep with all loop invariant indices except for one 7025 // which should be an induction variable. 7026 auto SE = PSE.getSE(); 7027 unsigned NumOperands = Gep->getNumOperands(); 7028 for (unsigned i = 1; i < NumOperands; ++i) { 7029 Value *Opd = Gep->getOperand(i); 7030 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 7031 !Legal->isInductionVariable(Opd)) 7032 return nullptr; 7033 } 7034 7035 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. 7036 return PSE.getSCEV(Ptr); 7037 } 7038 7039 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { 7040 return Legal->hasStride(I->getOperand(0)) || 7041 Legal->hasStride(I->getOperand(1)); 7042 } 7043 7044 InstructionCost 7045 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, 7046 ElementCount VF) { 7047 assert(VF.isVector() && 7048 "Scalarization cost of instruction implies vectorization."); 7049 if (VF.isScalable()) 7050 return InstructionCost::getInvalid(); 7051 7052 Type *ValTy = getLoadStoreType(I); 7053 auto SE = PSE.getSE(); 7054 7055 unsigned AS = getLoadStoreAddressSpace(I); 7056 Value *Ptr = getLoadStorePointerOperand(I); 7057 Type *PtrTy = ToVectorTy(Ptr->getType(), VF); 7058 7059 // Figure out whether the access is strided and get the stride value 7060 // if it's known in compile time 7061 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); 7062 7063 // Get the cost of the scalar memory instruction and address computation. 
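// Illustrative arithmetic only (unit costs assumed, not taken from any real
// target): at VF = 4, with an address-computation cost of 1 and a scalar
// load cost of 1, the base cost below is 4 * 1 + 4 * 1 = 8 plus the
// insert/extract scalarization overhead; for a predicated access that sum is
// then divided by the reciprocal block probability (assumed to be 2 here)
// before the i1 extract and branch costs are added.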
7064 InstructionCost Cost = 7065 VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); 7066 7067 // Don't pass *I here, since it is scalar but will actually be part of a 7068 // vectorized loop where the user of it is a vectorized instruction. 7069 const Align Alignment = getLoadStoreAlignment(I); 7070 Cost += VF.getKnownMinValue() * 7071 TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, 7072 AS, TTI::TCK_RecipThroughput); 7073 7074 // Get the overhead of the extractelement and insertelement instructions 7075 // we might create due to scalarization. 7076 Cost += getScalarizationOverhead(I, VF); 7077 7078 // If we have a predicated load/store, it will need extra i1 extracts and 7079 // conditional branches, but may not be executed for each vector lane. Scale 7080 // the cost by the probability of executing the predicated block. 7081 if (isPredicatedInst(I)) { 7082 Cost /= getReciprocalPredBlockProb(); 7083 7084 // Add the cost of an i1 extract and a branch 7085 auto *Vec_i1Ty = 7086 VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF); 7087 Cost += TTI.getScalarizationOverhead( 7088 Vec_i1Ty, APInt::getAllOnes(VF.getKnownMinValue()), 7089 /*Insert=*/false, /*Extract=*/true); 7090 Cost += TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput); 7091 7092 if (useEmulatedMaskMemRefHack(I)) 7093 // Artificially setting to a high enough value to practically disable 7094 // vectorization with such operations. 7095 Cost = 3000000; 7096 } 7097 7098 return Cost; 7099 } 7100 7101 InstructionCost 7102 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, 7103 ElementCount VF) { 7104 Type *ValTy = getLoadStoreType(I); 7105 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 7106 Value *Ptr = getLoadStorePointerOperand(I); 7107 unsigned AS = getLoadStoreAddressSpace(I); 7108 int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr); 7109 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 7110 7111 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 7112 "Stride should be 1 or -1 for consecutive memory access"); 7113 const Align Alignment = getLoadStoreAlignment(I); 7114 InstructionCost Cost = 0; 7115 if (Legal->isMaskRequired(I)) 7116 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 7117 CostKind); 7118 else 7119 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 7120 CostKind, I); 7121 7122 bool Reverse = ConsecutiveStride < 0; 7123 if (Reverse) 7124 Cost += 7125 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0); 7126 return Cost; 7127 } 7128 7129 InstructionCost 7130 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, 7131 ElementCount VF) { 7132 assert(Legal->isUniformMemOp(*I)); 7133 7134 Type *ValTy = getLoadStoreType(I); 7135 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 7136 const Align Alignment = getLoadStoreAlignment(I); 7137 unsigned AS = getLoadStoreAddressSpace(I); 7138 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 7139 if (isa<LoadInst>(I)) { 7140 return TTI.getAddressComputationCost(ValTy) + 7141 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS, 7142 CostKind) + 7143 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); 7144 } 7145 StoreInst *SI = cast<StoreInst>(I); 7146 7147 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); 7148 return TTI.getAddressComputationCost(ValTy) + 7149 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, 
AS, 7150 CostKind) + 7151 (isLoopInvariantStoreValue 7152 ? 0 7153 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, 7154 VF.getKnownMinValue() - 1)); 7155 } 7156 7157 InstructionCost 7158 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, 7159 ElementCount VF) { 7160 Type *ValTy = getLoadStoreType(I); 7161 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 7162 const Align Alignment = getLoadStoreAlignment(I); 7163 const Value *Ptr = getLoadStorePointerOperand(I); 7164 7165 return TTI.getAddressComputationCost(VectorTy) + 7166 TTI.getGatherScatterOpCost( 7167 I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment, 7168 TargetTransformInfo::TCK_RecipThroughput, I); 7169 } 7170 7171 InstructionCost 7172 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, 7173 ElementCount VF) { 7174 // TODO: Once we have support for interleaving with scalable vectors 7175 // we can calculate the cost properly here. 7176 if (VF.isScalable()) 7177 return InstructionCost::getInvalid(); 7178 7179 Type *ValTy = getLoadStoreType(I); 7180 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 7181 unsigned AS = getLoadStoreAddressSpace(I); 7182 7183 auto Group = getInterleavedAccessGroup(I); 7184 assert(Group && "Fail to get an interleaved access group."); 7185 7186 unsigned InterleaveFactor = Group->getFactor(); 7187 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); 7188 7189 // Holds the indices of existing members in the interleaved group. 7190 SmallVector<unsigned, 4> Indices; 7191 for (unsigned IF = 0; IF < InterleaveFactor; IF++) 7192 if (Group->getMember(IF)) 7193 Indices.push_back(IF); 7194 7195 // Calculate the cost of the whole interleaved group. 7196 bool UseMaskForGaps = 7197 (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) || 7198 (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor())); 7199 InstructionCost Cost = TTI.getInterleavedMemoryOpCost( 7200 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(), 7201 AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps); 7202 7203 if (Group->isReverse()) { 7204 // TODO: Add support for reversed masked interleaved access. 7205 assert(!Legal->isMaskRequired(I) && 7206 "Reverse masked interleaved access not supported."); 7207 Cost += 7208 Group->getNumMembers() * 7209 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0); 7210 } 7211 return Cost; 7212 } 7213 7214 Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost( 7215 Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) { 7216 using namespace llvm::PatternMatch; 7217 // Early exit for no inloop reductions 7218 if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty)) 7219 return None; 7220 auto *VectorTy = cast<VectorType>(Ty); 7221 7222 // We are looking for a pattern of, and finding the minimal acceptable cost: 7223 // reduce(mul(ext(A), ext(B))) or 7224 // reduce(mul(A, B)) or 7225 // reduce(ext(A)) or 7226 // reduce(A). 7227 // The basic idea is that we walk down the tree to do that, finding the root 7228 // reduction instruction in InLoopReductionImmediateChains. From there we find 7229 // the pattern of mul/ext and test the cost of the entire pattern vs the cost 7230 // of the components. If the reduction cost is lower then we return it for the 7231 // reduction instruction and 0 for the other instructions in the pattern. 
If 7232 // it is not, we return an invalid cost specifying the original cost method 7233 // should be used. 7234 Instruction *RetI = I; 7235 if (match(RetI, m_ZExtOrSExt(m_Value()))) { 7236 if (!RetI->hasOneUser()) 7237 return None; 7238 RetI = RetI->user_back(); 7239 } 7240 if (match(RetI, m_Mul(m_Value(), m_Value())) && 7241 RetI->user_back()->getOpcode() == Instruction::Add) { 7242 if (!RetI->hasOneUser()) 7243 return None; 7244 RetI = RetI->user_back(); 7245 } 7246 7247 // Test if the found instruction is a reduction, and if not return an invalid 7248 // cost specifying the parent to use the original cost modelling. 7249 if (!InLoopReductionImmediateChains.count(RetI)) 7250 return None; 7251 7252 // Find the reduction this chain is a part of and calculate the basic cost of 7253 // the reduction on its own. 7254 Instruction *LastChain = InLoopReductionImmediateChains[RetI]; 7255 Instruction *ReductionPhi = LastChain; 7256 while (!isa<PHINode>(ReductionPhi)) 7257 ReductionPhi = InLoopReductionImmediateChains[ReductionPhi]; 7258 7259 const RecurrenceDescriptor &RdxDesc = 7260 Legal->getReductionVars()[cast<PHINode>(ReductionPhi)]; 7261 7262 InstructionCost BaseCost = TTI.getArithmeticReductionCost( 7263 RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind); 7264 7265 // If we're using ordered reductions then we can just return the base cost 7266 // here, since getArithmeticReductionCost calculates the full ordered 7267 // reduction cost when FP reassociation is not allowed. 7268 if (useOrderedReductions(RdxDesc)) 7269 return BaseCost; 7270 7271 // Get the operand that was not the reduction chain and match it to one of the 7272 // patterns, returning the better cost if it is found. 7273 Instruction *RedOp = RetI->getOperand(1) == LastChain 7274 ? dyn_cast<Instruction>(RetI->getOperand(0)) 7275 : dyn_cast<Instruction>(RetI->getOperand(1)); 7276 7277 VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy); 7278 7279 Instruction *Op0, *Op1; 7280 if (RedOp && 7281 match(RedOp, 7282 m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) && 7283 match(Op0, m_ZExtOrSExt(m_Value())) && 7284 Op0->getOpcode() == Op1->getOpcode() && 7285 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() && 7286 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) && 7287 (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) { 7288 7289 // Matched reduce(ext(mul(ext(A), ext(B)))) 7290 // Note that the extend opcodes need to all match, or if A==B they will have 7291 // been converted to zext(mul(sext(A), sext(A))) as it is known positive, 7292 // which is equally fine. 7293 bool IsUnsigned = isa<ZExtInst>(Op0); 7294 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy); 7295 auto *MulType = VectorType::get(Op0->getType(), VectorTy); 7296 7297 InstructionCost ExtCost = 7298 TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType, 7299 TTI::CastContextHint::None, CostKind, Op0); 7300 InstructionCost MulCost = 7301 TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind); 7302 InstructionCost Ext2Cost = 7303 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType, 7304 TTI::CastContextHint::None, CostKind, RedOp); 7305 7306 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 7307 /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 7308 CostKind); 7309 7310 if (RedCost.isValid() && 7311 RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost) 7312 return I == RetI ?
RedCost : 0; 7313 } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) && 7314 !TheLoop->isLoopInvariant(RedOp)) { 7315 // Matched reduce(ext(A)) 7316 bool IsUnsigned = isa<ZExtInst>(RedOp); 7317 auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy); 7318 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 7319 /*IsMLA=*/false, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 7320 CostKind); 7321 7322 InstructionCost ExtCost = 7323 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType, 7324 TTI::CastContextHint::None, CostKind, RedOp); 7325 if (RedCost.isValid() && RedCost < BaseCost + ExtCost) 7326 return I == RetI ? RedCost : 0; 7327 } else if (RedOp && 7328 match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) { 7329 if (match(Op0, m_ZExtOrSExt(m_Value())) && 7330 Op0->getOpcode() == Op1->getOpcode() && 7331 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() && 7332 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) { 7333 bool IsUnsigned = isa<ZExtInst>(Op0); 7334 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy); 7335 // Matched reduce(mul(ext, ext)) 7336 InstructionCost ExtCost = 7337 TTI.getCastInstrCost(Op0->getOpcode(), VectorTy, ExtType, 7338 TTI::CastContextHint::None, CostKind, Op0); 7339 InstructionCost MulCost = 7340 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 7341 7342 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 7343 /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 7344 CostKind); 7345 7346 if (RedCost.isValid() && RedCost < ExtCost * 2 + MulCost + BaseCost) 7347 return I == RetI ? RedCost : 0; 7348 } else if (!match(I, m_ZExtOrSExt(m_Value()))) { 7349 // Matched reduce(mul()) 7350 InstructionCost MulCost = 7351 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 7352 7353 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 7354 /*IsMLA=*/true, true, RdxDesc.getRecurrenceType(), VectorTy, 7355 CostKind); 7356 7357 if (RedCost.isValid() && RedCost < MulCost + BaseCost) 7358 return I == RetI ? RedCost : 0; 7359 } 7360 } 7361 7362 return I == RetI ? Optional<InstructionCost>(BaseCost) : None; 7363 } 7364 7365 InstructionCost 7366 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, 7367 ElementCount VF) { 7368 // Calculate scalar cost only. Vectorization cost should be ready at this 7369 // moment. 7370 if (VF.isScalar()) { 7371 Type *ValTy = getLoadStoreType(I); 7372 const Align Alignment = getLoadStoreAlignment(I); 7373 unsigned AS = getLoadStoreAddressSpace(I); 7374 7375 return TTI.getAddressComputationCost(ValTy) + 7376 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, 7377 TTI::TCK_RecipThroughput, I); 7378 } 7379 return getWideningCost(I, VF); 7380 } 7381 7382 LoopVectorizationCostModel::VectorizationCostTy 7383 LoopVectorizationCostModel::getInstructionCost(Instruction *I, 7384 ElementCount VF) { 7385 // If we know that this instruction will remain uniform, check the cost of 7386 // the scalar version. 7387 if (isUniformAfterVectorization(I, VF)) 7388 VF = ElementCount::getFixed(1); 7389 7390 if (VF.isVector() && isProfitableToScalarize(I, VF)) 7391 return VectorizationCostTy(InstsToScalarize[VF][I], false); 7392 7393 // Forced scalars do not have any scalarization overhead. 
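// For instance (illustrative numbers only): a forced-scalar instruction with
// a scalar cost of 1 is costed as 4 at VF = 4 below, i.e. the scalar cost
// times VF, with no extract/insert overhead added on top.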
7394 auto ForcedScalar = ForcedScalars.find(VF); 7395 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) { 7396 auto InstSet = ForcedScalar->second; 7397 if (InstSet.count(I)) 7398 return VectorizationCostTy( 7399 (getInstructionCost(I, ElementCount::getFixed(1)).first * 7400 VF.getKnownMinValue()), 7401 false); 7402 } 7403 7404 Type *VectorTy; 7405 InstructionCost C = getInstructionCost(I, VF, VectorTy); 7406 7407 bool TypeNotScalarized = 7408 VF.isVector() && VectorTy->isVectorTy() && 7409 TTI.getNumberOfParts(VectorTy) < VF.getKnownMinValue(); 7410 return VectorizationCostTy(C, TypeNotScalarized); 7411 } 7412 7413 InstructionCost 7414 LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, 7415 ElementCount VF) const { 7416 7417 // There is no mechanism yet to create a scalable scalarization loop, 7418 // so this is currently Invalid. 7419 if (VF.isScalable()) 7420 return InstructionCost::getInvalid(); 7421 7422 if (VF.isScalar()) 7423 return 0; 7424 7425 InstructionCost Cost = 0; 7426 Type *RetTy = ToVectorTy(I->getType(), VF); 7427 if (!RetTy->isVoidTy() && 7428 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) 7429 Cost += TTI.getScalarizationOverhead( 7430 cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()), true, 7431 false); 7432 7433 // Some targets keep addresses scalar. 7434 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) 7435 return Cost; 7436 7437 // Some targets support efficient element stores. 7438 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore()) 7439 return Cost; 7440 7441 // Collect operands to consider. 7442 CallInst *CI = dyn_cast<CallInst>(I); 7443 Instruction::op_range Ops = CI ? CI->args() : I->operands(); 7444 7445 // Skip operands that do not require extraction/scalarization and do not incur 7446 // any overhead. 7447 SmallVector<Type *> Tys; 7448 for (auto *V : filterExtractingOperands(Ops, VF)) 7449 Tys.push_back(MaybeVectorizeType(V->getType(), VF)); 7450 return Cost + TTI.getOperandsScalarizationOverhead( 7451 filterExtractingOperands(Ops, VF), Tys); 7452 } 7453 7454 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { 7455 if (VF.isScalar()) 7456 return; 7457 NumPredStores = 0; 7458 for (BasicBlock *BB : TheLoop->blocks()) { 7459 // For each instruction in the old loop. 7460 for (Instruction &I : *BB) { 7461 Value *Ptr = getLoadStorePointerOperand(&I); 7462 if (!Ptr) 7463 continue; 7464 7465 // TODO: We should generate better code and update the cost model for 7466 // predicated uniform stores. Today they are treated as any other 7467 // predicated store (see added test cases in 7468 // invariant-store-vectorization.ll). 7469 if (isa<StoreInst>(&I) && isScalarWithPredication(&I)) 7470 NumPredStores++; 7471 7472 if (Legal->isUniformMemOp(I)) { 7473 // TODO: Avoid replicating loads and stores instead of 7474 // relying on instcombine to remove them. 7475 // Load: Scalar load + broadcast 7476 // Store: Scalar store + isLoopInvariantStoreValue ? 
0 : extract 7477 InstructionCost Cost; 7478 if (isa<StoreInst>(&I) && VF.isScalable() && 7479 isLegalGatherOrScatter(&I)) { 7480 Cost = getGatherScatterCost(&I, VF); 7481 setWideningDecision(&I, VF, CM_GatherScatter, Cost); 7482 } else { 7483 assert((isa<LoadInst>(&I) || !VF.isScalable()) && 7484 "Cannot yet scalarize uniform stores"); 7485 Cost = getUniformMemOpCost(&I, VF); 7486 setWideningDecision(&I, VF, CM_Scalarize, Cost); 7487 } 7488 continue; 7489 } 7490 7491 // We assume that widening is the best solution when possible. 7492 if (memoryInstructionCanBeWidened(&I, VF)) { 7493 InstructionCost Cost = getConsecutiveMemOpCost(&I, VF); 7494 int ConsecutiveStride = Legal->isConsecutivePtr( 7495 getLoadStoreType(&I), getLoadStorePointerOperand(&I)); 7496 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 7497 "Expected consecutive stride."); 7498 InstWidening Decision = 7499 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; 7500 setWideningDecision(&I, VF, Decision, Cost); 7501 continue; 7502 } 7503 7504 // Choose between Interleaving, Gather/Scatter or Scalarization. 7505 InstructionCost InterleaveCost = InstructionCost::getInvalid(); 7506 unsigned NumAccesses = 1; 7507 if (isAccessInterleaved(&I)) { 7508 auto Group = getInterleavedAccessGroup(&I); 7509 assert(Group && "Fail to get an interleaved access group."); 7510 7511 // Make one decision for the whole group. 7512 if (getWideningDecision(&I, VF) != CM_Unknown) 7513 continue; 7514 7515 NumAccesses = Group->getNumMembers(); 7516 if (interleavedAccessCanBeWidened(&I, VF)) 7517 InterleaveCost = getInterleaveGroupCost(&I, VF); 7518 } 7519 7520 InstructionCost GatherScatterCost = 7521 isLegalGatherOrScatter(&I) 7522 ? getGatherScatterCost(&I, VF) * NumAccesses 7523 : InstructionCost::getInvalid(); 7524 7525 InstructionCost ScalarizationCost = 7526 getMemInstScalarizationCost(&I, VF) * NumAccesses; 7527 7528 // Choose better solution for the current VF, 7529 // write down this decision and use it during vectorization. 7530 InstructionCost Cost; 7531 InstWidening Decision; 7532 if (InterleaveCost <= GatherScatterCost && 7533 InterleaveCost < ScalarizationCost) { 7534 Decision = CM_Interleave; 7535 Cost = InterleaveCost; 7536 } else if (GatherScatterCost < ScalarizationCost) { 7537 Decision = CM_GatherScatter; 7538 Cost = GatherScatterCost; 7539 } else { 7540 Decision = CM_Scalarize; 7541 Cost = ScalarizationCost; 7542 } 7543 // If the instructions belongs to an interleave group, the whole group 7544 // receives the same decision. The whole group receives the cost, but 7545 // the cost will actually be assigned to one instruction. 7546 if (auto Group = getInterleavedAccessGroup(&I)) 7547 setWideningDecision(Group, VF, Decision, Cost); 7548 else 7549 setWideningDecision(&I, VF, Decision, Cost); 7550 } 7551 } 7552 7553 // Make sure that any load of address and any other address computation 7554 // remains scalar unless there is gather/scatter support. This avoids 7555 // inevitable extracts into address registers, and also has the benefit of 7556 // activating LSR more, since that pass can't optimize vectorized 7557 // addresses. 7558 if (TTI.prefersVectorizedAddressing()) 7559 return; 7560 7561 // Start with all scalar pointer uses. 
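// Illustrative example (hypothetical IR): given "%p = load i32*, i32** %q"
// followed by "%v = load i32, i32* %p", the loaded pointer %p only feeds an
// address, so it is collected below and its widening decision is later
// changed to CM_Scalarize (non-load address computations go into
// ForcedScalars instead), avoiding vector-to-scalar extracts when forming
// addresses.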
7562 SmallPtrSet<Instruction *, 8> AddrDefs; 7563 for (BasicBlock *BB : TheLoop->blocks()) 7564 for (Instruction &I : *BB) { 7565 Instruction *PtrDef = 7566 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 7567 if (PtrDef && TheLoop->contains(PtrDef) && 7568 getWideningDecision(&I, VF) != CM_GatherScatter) 7569 AddrDefs.insert(PtrDef); 7570 } 7571 7572 // Add all instructions used to generate the addresses. 7573 SmallVector<Instruction *, 4> Worklist; 7574 append_range(Worklist, AddrDefs); 7575 while (!Worklist.empty()) { 7576 Instruction *I = Worklist.pop_back_val(); 7577 for (auto &Op : I->operands()) 7578 if (auto *InstOp = dyn_cast<Instruction>(Op)) 7579 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) && 7580 AddrDefs.insert(InstOp).second) 7581 Worklist.push_back(InstOp); 7582 } 7583 7584 for (auto *I : AddrDefs) { 7585 if (isa<LoadInst>(I)) { 7586 // Setting the desired widening decision should ideally be handled 7587 // by cost functions, but since this involves the task of finding out 7588 // if the loaded register is involved in an address computation, it is 7589 // instead changed here when we know this is the case. 7590 InstWidening Decision = getWideningDecision(I, VF); 7591 if (Decision == CM_Widen || Decision == CM_Widen_Reverse) 7592 // Scalarize a widened load of address. 7593 setWideningDecision( 7594 I, VF, CM_Scalarize, 7595 (VF.getKnownMinValue() * 7596 getMemoryInstructionCost(I, ElementCount::getFixed(1)))); 7597 else if (auto Group = getInterleavedAccessGroup(I)) { 7598 // Scalarize an interleave group of address loads. 7599 for (unsigned I = 0; I < Group->getFactor(); ++I) { 7600 if (Instruction *Member = Group->getMember(I)) 7601 setWideningDecision( 7602 Member, VF, CM_Scalarize, 7603 (VF.getKnownMinValue() * 7604 getMemoryInstructionCost(Member, ElementCount::getFixed(1)))); 7605 } 7606 } 7607 } else 7608 // Make sure I gets scalarized and gets a cost estimate without 7609 // scalarization overhead. 7610 ForcedScalars[VF].insert(I); 7611 } 7612 } 7613 7614 InstructionCost 7615 LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, 7616 Type *&VectorTy) { 7617 Type *RetTy = I->getType(); 7618 if (canTruncateToMinimalBitwidth(I, VF)) 7619 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); 7620 auto SE = PSE.getSE(); 7621 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 7622 7623 auto hasSingleCopyAfterVectorization = [this](Instruction *I, 7624 ElementCount VF) -> bool { 7625 if (VF.isScalar()) 7626 return true; 7627 7628 auto Scalarized = InstsToScalarize.find(VF); 7629 assert(Scalarized != InstsToScalarize.end() && 7630 "VF not yet analyzed for scalarization profitability"); 7631 return !Scalarized->second.count(I) && 7632 llvm::all_of(I->users(), [&](User *U) { 7633 auto *UI = cast<Instruction>(U); 7634 return !Scalarized->second.count(UI); 7635 }); 7636 }; 7637 (void) hasSingleCopyAfterVectorization; 7638 7639 if (isScalarAfterVectorization(I, VF)) { 7640 // With the exception of GEPs and PHIs, after scalarization there should 7641 // only be one copy of the instruction generated in the loop. This is 7642 // because the VF is either 1, or any instructions that need scalarizing 7643 // have already been dealt with by the time we get here. As a result, 7644 // we don't have to multiply the instruction cost by VF.
7645 assert(I->getOpcode() == Instruction::GetElementPtr || 7646 I->getOpcode() == Instruction::PHI || 7647 (I->getOpcode() == Instruction::BitCast && 7648 I->getType()->isPointerTy()) || 7649 hasSingleCopyAfterVectorization(I, VF)); 7650 VectorTy = RetTy; 7651 } else 7652 VectorTy = ToVectorTy(RetTy, VF); 7653 7654 // TODO: We need to estimate the cost of intrinsic calls. 7655 switch (I->getOpcode()) { 7656 case Instruction::GetElementPtr: 7657 // We mark this instruction as zero-cost because the cost of GEPs in 7658 // vectorized code depends on whether the corresponding memory instruction 7659 // is scalarized or not. Therefore, we handle GEPs with the memory 7660 // instruction cost. 7661 return 0; 7662 case Instruction::Br: { 7663 // In cases of scalarized and predicated instructions, there will be VF 7664 // predicated blocks in the vectorized loop. Each branch around these 7665 // blocks requires also an extract of its vector compare i1 element. 7666 bool ScalarPredicatedBB = false; 7667 BranchInst *BI = cast<BranchInst>(I); 7668 if (VF.isVector() && BI->isConditional() && 7669 (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) || 7670 PredicatedBBsAfterVectorization.count(BI->getSuccessor(1)))) 7671 ScalarPredicatedBB = true; 7672 7673 if (ScalarPredicatedBB) { 7674 // Not possible to scalarize scalable vector with predicated instructions. 7675 if (VF.isScalable()) 7676 return InstructionCost::getInvalid(); 7677 // Return cost for branches around scalarized and predicated blocks. 7678 auto *Vec_i1Ty = 7679 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); 7680 return ( 7681 TTI.getScalarizationOverhead( 7682 Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()), false, true) + 7683 (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue())); 7684 } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar()) 7685 // The back-edge branch will remain, as will all scalar branches. 7686 return TTI.getCFInstrCost(Instruction::Br, CostKind); 7687 else 7688 // This branch will be eliminated by if-conversion. 7689 return 0; 7690 // Note: We currently assume zero cost for an unconditional branch inside 7691 // a predicated block since it will become a fall-through, although we 7692 // may decide in the future to call TTI for all branches. 7693 } 7694 case Instruction::PHI: { 7695 auto *Phi = cast<PHINode>(I); 7696 7697 // First-order recurrences are replaced by vector shuffles inside the loop. 7698 // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. 7699 if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi)) 7700 return TTI.getShuffleCost( 7701 TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy), 7702 None, VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1)); 7703 7704 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 7705 // converted into select instructions. We require N - 1 selects per phi 7706 // node, where N is the number of incoming values. 
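// For example (illustrative): a phi in a non-header block merging values
// from three predecessors is costed below as 3 - 1 = 2 vector selects at the
// chosen VF.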
7707 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) 7708 return (Phi->getNumIncomingValues() - 1) * 7709 TTI.getCmpSelInstrCost( 7710 Instruction::Select, ToVectorTy(Phi->getType(), VF), 7711 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF), 7712 CmpInst::BAD_ICMP_PREDICATE, CostKind); 7713 7714 return TTI.getCFInstrCost(Instruction::PHI, CostKind); 7715 } 7716 case Instruction::UDiv: 7717 case Instruction::SDiv: 7718 case Instruction::URem: 7719 case Instruction::SRem: 7720 // If we have a predicated instruction, it may not be executed for each 7721 // vector lane. Get the scalarization cost and scale this amount by the 7722 // probability of executing the predicated block. If the instruction is not 7723 // predicated, we fall through to the next case. 7724 if (VF.isVector() && isScalarWithPredication(I)) { 7725 InstructionCost Cost = 0; 7726 7727 // These instructions have a non-void type, so account for the phi nodes 7728 // that we will create. This cost is likely to be zero. The phi node 7729 // cost, if any, should be scaled by the block probability because it 7730 // models a copy at the end of each predicated block. 7731 Cost += VF.getKnownMinValue() * 7732 TTI.getCFInstrCost(Instruction::PHI, CostKind); 7733 7734 // The cost of the non-predicated instruction. 7735 Cost += VF.getKnownMinValue() * 7736 TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind); 7737 7738 // The cost of insertelement and extractelement instructions needed for 7739 // scalarization. 7740 Cost += getScalarizationOverhead(I, VF); 7741 7742 // Scale the cost by the probability of executing the predicated blocks. 7743 // This assumes the predicated block for each vector lane is equally 7744 // likely. 7745 return Cost / getReciprocalPredBlockProb(); 7746 } 7747 LLVM_FALLTHROUGH; 7748 case Instruction::Add: 7749 case Instruction::FAdd: 7750 case Instruction::Sub: 7751 case Instruction::FSub: 7752 case Instruction::Mul: 7753 case Instruction::FMul: 7754 case Instruction::FDiv: 7755 case Instruction::FRem: 7756 case Instruction::Shl: 7757 case Instruction::LShr: 7758 case Instruction::AShr: 7759 case Instruction::And: 7760 case Instruction::Or: 7761 case Instruction::Xor: { 7762 // Since we will replace the stride by 1, the multiplication should go away. 7763 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) 7764 return 0; 7765 7766 // Detect reduction patterns. 7767 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7768 return *RedCost; 7769 7770 // Certain instructions can be cheaper to vectorize if they have a constant 7771 // second vector operand. One example of this is shifts on x86.
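// For instance (illustrative IR): in "%r = shl i32 %a, 3" the constant
// shift amount is reported by getOperandInfo below as a uniform/constant
// operand, which targets such as x86 typically cost more cheaply than a
// variable-amount vector shift.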
7772 Value *Op2 = I->getOperand(1); 7773 TargetTransformInfo::OperandValueProperties Op2VP; 7774 TargetTransformInfo::OperandValueKind Op2VK = 7775 TTI.getOperandInfo(Op2, Op2VP); 7776 if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) 7777 Op2VK = TargetTransformInfo::OK_UniformValue; 7778 7779 SmallVector<const Value *, 4> Operands(I->operand_values()); 7780 return TTI.getArithmeticInstrCost( 7781 I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, 7782 Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); 7783 } 7784 case Instruction::FNeg: { 7785 return TTI.getArithmeticInstrCost( 7786 I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, 7787 TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None, 7788 TargetTransformInfo::OP_None, I->getOperand(0), I); 7789 } 7790 case Instruction::Select: { 7791 SelectInst *SI = cast<SelectInst>(I); 7792 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 7793 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 7794 7795 const Value *Op0, *Op1; 7796 using namespace llvm::PatternMatch; 7797 if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) || 7798 match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) { 7799 // select x, y, false --> x & y 7800 // select x, true, y --> x | y 7801 TTI::OperandValueProperties Op1VP = TTI::OP_None; 7802 TTI::OperandValueProperties Op2VP = TTI::OP_None; 7803 TTI::OperandValueKind Op1VK = TTI::getOperandInfo(Op0, Op1VP); 7804 TTI::OperandValueKind Op2VK = TTI::getOperandInfo(Op1, Op2VP); 7805 assert(Op0->getType()->getScalarSizeInBits() == 1 && 7806 Op1->getType()->getScalarSizeInBits() == 1); 7807 7808 SmallVector<const Value *, 2> Operands{Op0, Op1}; 7809 return TTI.getArithmeticInstrCost( 7810 match(I, m_LogicalOr()) ? 
Instruction::Or : Instruction::And, VectorTy, 7811 CostKind, Op1VK, Op2VK, Op1VP, Op2VP, Operands, I); 7812 } 7813 7814 Type *CondTy = SI->getCondition()->getType(); 7815 if (!ScalarCond) 7816 CondTy = VectorType::get(CondTy, VF); 7817 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, 7818 CmpInst::BAD_ICMP_PREDICATE, CostKind, I); 7819 } 7820 case Instruction::ICmp: 7821 case Instruction::FCmp: { 7822 Type *ValTy = I->getOperand(0)->getType(); 7823 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 7824 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 7825 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); 7826 VectorTy = ToVectorTy(ValTy, VF); 7827 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, 7828 CmpInst::BAD_ICMP_PREDICATE, CostKind, I); 7829 } 7830 case Instruction::Store: 7831 case Instruction::Load: { 7832 ElementCount Width = VF; 7833 if (Width.isVector()) { 7834 InstWidening Decision = getWideningDecision(I, Width); 7835 assert(Decision != CM_Unknown && 7836 "CM decision should be taken at this point"); 7837 if (Decision == CM_Scalarize) 7838 Width = ElementCount::getFixed(1); 7839 } 7840 VectorTy = ToVectorTy(getLoadStoreType(I), Width); 7841 return getMemoryInstructionCost(I, VF); 7842 } 7843 case Instruction::BitCast: 7844 if (I->getType()->isPointerTy()) 7845 return 0; 7846 LLVM_FALLTHROUGH; 7847 case Instruction::ZExt: 7848 case Instruction::SExt: 7849 case Instruction::FPToUI: 7850 case Instruction::FPToSI: 7851 case Instruction::FPExt: 7852 case Instruction::PtrToInt: 7853 case Instruction::IntToPtr: 7854 case Instruction::SIToFP: 7855 case Instruction::UIToFP: 7856 case Instruction::Trunc: 7857 case Instruction::FPTrunc: { 7858 // Computes the CastContextHint from a Load/Store instruction. 7859 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint { 7860 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 7861 "Expected a load or a store!"); 7862 7863 if (VF.isScalar() || !TheLoop->contains(I)) 7864 return TTI::CastContextHint::Normal; 7865 7866 switch (getWideningDecision(I, VF)) { 7867 case LoopVectorizationCostModel::CM_GatherScatter: 7868 return TTI::CastContextHint::GatherScatter; 7869 case LoopVectorizationCostModel::CM_Interleave: 7870 return TTI::CastContextHint::Interleave; 7871 case LoopVectorizationCostModel::CM_Scalarize: 7872 case LoopVectorizationCostModel::CM_Widen: 7873 return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked 7874 : TTI::CastContextHint::Normal; 7875 case LoopVectorizationCostModel::CM_Widen_Reverse: 7876 return TTI::CastContextHint::Reversed; 7877 case LoopVectorizationCostModel::CM_Unknown: 7878 llvm_unreachable("Instr did not go through cost modelling?"); 7879 } 7880 7881 llvm_unreachable("Unhandled case!"); 7882 }; 7883 7884 unsigned Opcode = I->getOpcode(); 7885 TTI::CastContextHint CCH = TTI::CastContextHint::None; 7886 // For Trunc, the context is the only user, which must be a StoreInst. 7887 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) { 7888 if (I->hasOneUse()) 7889 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin())) 7890 CCH = ComputeCCH(Store); 7891 } 7892 // For Z/Sext, the context is the operand, which must be a LoadInst. 
7893 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || 7894 Opcode == Instruction::FPExt) { 7895 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0))) 7896 CCH = ComputeCCH(Load); 7897 } 7898 7899 // We optimize the truncation of induction variables having constant 7900 // integer steps. The cost of these truncations is the same as the scalar 7901 // operation. 7902 if (isOptimizableIVTruncate(I, VF)) { 7903 auto *Trunc = cast<TruncInst>(I); 7904 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 7905 Trunc->getSrcTy(), CCH, CostKind, Trunc); 7906 } 7907 7908 // Detect reduction patterns 7909 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7910 return *RedCost; 7911 7912 Type *SrcScalarTy = I->getOperand(0)->getType(); 7913 Type *SrcVecTy = 7914 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; 7915 if (canTruncateToMinimalBitwidth(I, VF)) { 7916 // This cast is going to be shrunk. This may remove the cast or it might 7917 // turn it into slightly different cast. For example, if MinBW == 16, 7918 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". 7919 // 7920 // Calculate the modified src and dest types. 7921 Type *MinVecTy = VectorTy; 7922 if (Opcode == Instruction::Trunc) { 7923 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); 7924 VectorTy = 7925 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7926 } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) { 7927 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); 7928 VectorTy = 7929 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7930 } 7931 } 7932 7933 return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); 7934 } 7935 case Instruction::Call: { 7936 bool NeedToScalarize; 7937 CallInst *CI = cast<CallInst>(I); 7938 InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize); 7939 if (getVectorIntrinsicIDForCall(CI, TLI)) { 7940 InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF); 7941 return std::min(CallCost, IntrinsicCost); 7942 } 7943 return CallCost; 7944 } 7945 case Instruction::ExtractValue: 7946 return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput); 7947 case Instruction::Alloca: 7948 // We cannot easily widen alloca to a scalable alloca, as 7949 // the result would need to be a vector of pointers. 7950 if (VF.isScalable()) 7951 return InstructionCost::getInvalid(); 7952 LLVM_FALLTHROUGH; 7953 default: 7954 // This opcode is unknown. Assume that it is the same as 'mul'. 7955 return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 7956 } // end of switch. 
7957 } 7958 7959 char LoopVectorize::ID = 0; 7960 7961 static const char lv_name[] = "Loop Vectorization"; 7962 7963 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 7964 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 7965 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) 7966 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 7967 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) 7968 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 7969 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 7970 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 7971 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 7972 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 7973 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) 7974 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) 7975 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 7976 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 7977 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy) 7978 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 7979 7980 namespace llvm { 7981 7982 Pass *createLoopVectorizePass() { return new LoopVectorize(); } 7983 7984 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced, 7985 bool VectorizeOnlyWhenForced) { 7986 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced); 7987 } 7988 7989 } // end namespace llvm 7990 7991 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { 7992 // Check if the pointer operand of a load or store instruction is 7993 // consecutive. 7994 if (auto *Ptr = getLoadStorePointerOperand(Inst)) 7995 return Legal->isConsecutivePtr(getLoadStoreType(Inst), Ptr); 7996 return false; 7997 } 7998 7999 void LoopVectorizationCostModel::collectValuesToIgnore() { 8000 // Ignore ephemeral values. 8001 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); 8002 8003 // Ignore type-promoting instructions we identified during reduction 8004 // detection. 8005 for (auto &Reduction : Legal->getReductionVars()) { 8006 RecurrenceDescriptor &RedDes = Reduction.second; 8007 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); 8008 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 8009 } 8010 // Ignore type-casting instructions we identified during induction 8011 // detection. 8012 for (auto &Induction : Legal->getInductionVars()) { 8013 InductionDescriptor &IndDes = Induction.second; 8014 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 8015 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 8016 } 8017 } 8018 8019 void LoopVectorizationCostModel::collectInLoopReductions() { 8020 for (auto &Reduction : Legal->getReductionVars()) { 8021 PHINode *Phi = Reduction.first; 8022 RecurrenceDescriptor &RdxDesc = Reduction.second; 8023 8024 // We don't collect reductions that are type promoted (yet). 8025 if (RdxDesc.getRecurrenceType() != Phi->getType()) 8026 continue; 8027 8028 // If the target would prefer this reduction to happen "in-loop", then we 8029 // want to record it as such. 8030 unsigned Opcode = RdxDesc.getOpcode(); 8031 if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) && 8032 !TTI.preferInLoopReduction(Opcode, Phi->getType(), 8033 TargetTransformInfo::ReductionFlags())) 8034 continue; 8035 8036 // Check that we can correctly put the reductions into the loop, by 8037 // finding the chain of operations that leads from the phi to the loop 8038 // exit value. 
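// For example (illustrative): for an integer add reduction with
// "%sum.next = add i32 %sum.phi, %x", the chain found below is just the add,
// and the InLoopReductionImmediateChains mapping lets getReductionPatternCost
// walk from the add back to the reduction phi.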
8039 SmallVector<Instruction *, 4> ReductionOperations = 8040 RdxDesc.getReductionOpChain(Phi, TheLoop); 8041 bool InLoop = !ReductionOperations.empty(); 8042 if (InLoop) { 8043 InLoopReductionChains[Phi] = ReductionOperations; 8044 // Add the elements to InLoopReductionImmediateChains for cost modelling. 8045 Instruction *LastChain = Phi; 8046 for (auto *I : ReductionOperations) { 8047 InLoopReductionImmediateChains[I] = LastChain; 8048 LastChain = I; 8049 } 8050 } 8051 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop") 8052 << " reduction for phi: " << *Phi << "\n"); 8053 } 8054 } 8055 8056 // TODO: we could return a pair of values that specify the max VF and 8057 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of 8058 // `buildVPlans(VF, VF)`. We cannot do it because VPlan at the moment 8059 // doesn't have a cost model that can choose which plan to execute if 8060 // more than one is generated. 8061 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits, 8062 LoopVectorizationCostModel &CM) { 8063 unsigned WidestType; 8064 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes(); 8065 return WidestVectorRegBits / WidestType; 8066 } 8067 8068 VectorizationFactor 8069 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { 8070 assert(!UserVF.isScalable() && "scalable vectors not yet supported"); 8071 ElementCount VF = UserVF; 8072 // Outer loop handling: They may require CFG and instruction level 8073 // transformations before even evaluating whether vectorization is profitable. 8074 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 8075 // the vectorization pipeline. 8076 if (!OrigLoop->isInnermost()) { 8077 // If the user doesn't provide a vectorization factor, determine a 8078 // reasonable one. 8079 if (UserVF.isZero()) { 8080 VF = ElementCount::getFixed(determineVPlanVF( 8081 TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector) 8082 .getFixedSize(), 8083 CM)); 8084 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n"); 8085 8086 // Make sure we have a VF > 1 for stress testing. 8087 if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) { 8088 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: " 8089 << "overriding computed VF.\n"); 8090 VF = ElementCount::getFixed(4); 8091 } 8092 } 8093 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 8094 assert(isPowerOf2_32(VF.getKnownMinValue()) && 8095 "VF needs to be a power of two"); 8096 LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "") 8097 << "VF " << VF << " to build VPlans.\n"); 8098 buildVPlans(VF, VF); 8099 8100 // For VPlan build stress testing, we bail out after VPlan construction. 8101 if (VPlanBuildStressTest) 8102 return VectorizationFactor::Disabled(); 8103 8104 return {VF, 0 /*Cost*/}; 8105 } 8106 8107 LLVM_DEBUG( 8108 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the " 8109 "VPlan-native path.\n"); 8110 return VectorizationFactor::Disabled(); 8111 } 8112 8113 Optional<VectorizationFactor> 8114 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { 8115 assert(OrigLoop->isInnermost() && "Inner loop expected."); 8116 FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC); 8117 if (!MaxFactors) // Cases that should not be vectorized or interleaved. 8118 return None; 8119 8120 // Invalidate interleave groups if all blocks of the loop will be predicated.
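// For example (illustrative): if tail folding requires all blocks to be
// predicated but the target reports no masked-interleaved support, a group
// of strided loads could not be emitted as masked wide loads plus shuffles,
// so the groups (and any decisions based on them) are dropped below.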
8121 if (CM.blockNeedsPredication(OrigLoop->getHeader()) && 8122 !useMaskedInterleavedAccesses(*TTI)) { 8123 LLVM_DEBUG( 8124 dbgs() 8125 << "LV: Invalidate all interleaved groups due to fold-tail by masking " 8126 "which requires masked-interleaved support.\n"); 8127 if (CM.InterleaveInfo.invalidateGroups()) 8128 // Invalidating interleave groups also requires invalidating all decisions 8129 // based on them, which includes widening decisions and uniform and scalar 8130 // values. 8131 CM.invalidateCostModelingDecisions(); 8132 } 8133 8134 ElementCount MaxUserVF = 8135 UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF; 8136 bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF); 8137 if (!UserVF.isZero() && UserVFIsLegal) { 8138 assert(isPowerOf2_32(UserVF.getKnownMinValue()) && 8139 "VF needs to be a power of two"); 8140 // Collect the instructions (and their associated costs) that will be more 8141 // profitable to scalarize. 8142 if (CM.selectUserVectorizationFactor(UserVF)) { 8143 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); 8144 CM.collectInLoopReductions(); 8145 buildVPlansWithVPRecipes(UserVF, UserVF); 8146 LLVM_DEBUG(printPlans(dbgs())); 8147 return {{UserVF, 0}}; 8148 } else 8149 reportVectorizationInfo("UserVF ignored because of invalid costs.", 8150 "InvalidCost", ORE, OrigLoop); 8151 } 8152 8153 // Populate the set of Vectorization Factor Candidates. 8154 ElementCountSet VFCandidates; 8155 for (auto VF = ElementCount::getFixed(1); 8156 ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2) 8157 VFCandidates.insert(VF); 8158 for (auto VF = ElementCount::getScalable(1); 8159 ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2) 8160 VFCandidates.insert(VF); 8161 8162 for (const auto &VF : VFCandidates) { 8163 // Collect Uniform and Scalar instructions after vectorization with VF. 8164 CM.collectUniformsAndScalars(VF); 8165 8166 // Collect the instructions (and their associated costs) that will be more 8167 // profitable to scalarize. 8168 if (VF.isVector()) 8169 CM.collectInstsToScalarize(VF); 8170 } 8171 8172 CM.collectInLoopReductions(); 8173 buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF); 8174 buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF); 8175 8176 LLVM_DEBUG(printPlans(dbgs())); 8177 if (!MaxFactors.hasVector()) 8178 return VectorizationFactor::Disabled(); 8179 8180 // Select the optimal vectorization factor. 8181 auto SelectedVF = CM.selectVectorizationFactor(VFCandidates); 8182 8183 // Check if it is profitable to vectorize with runtime checks. 
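// For instance (illustrative): if the loop needs more runtime pointer-overlap
// checks than the configured threshold, vectorization is abandoned below
// (unless the hints allow reordering), matching the remark that it cannot be
// proven safe to reorder the memory operations cheaply enough.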
8184 unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks(); 8185 if (SelectedVF.Width.getKnownMinValue() > 1 && NumRuntimePointerChecks) { 8186 bool PragmaThresholdReached = 8187 NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold; 8188 bool ThresholdReached = 8189 NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold; 8190 if ((ThresholdReached && !Hints.allowReordering()) || 8191 PragmaThresholdReached) { 8192 ORE->emit([&]() { 8193 return OptimizationRemarkAnalysisAliasing( 8194 DEBUG_TYPE, "CantReorderMemOps", OrigLoop->getStartLoc(), 8195 OrigLoop->getHeader()) 8196 << "loop not vectorized: cannot prove it is safe to reorder " 8197 "memory operations"; 8198 }); 8199 LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n"); 8200 Hints.emitRemarkWithHints(); 8201 return VectorizationFactor::Disabled(); 8202 } 8203 } 8204 return SelectedVF; 8205 } 8206 8207 VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const { 8208 assert(count_if(VPlans, 8209 [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) == 8210 1 && 8211 "Best VF has not a single VPlan."); 8212 8213 for (const VPlanPtr &Plan : VPlans) { 8214 if (Plan->hasVF(VF)) 8215 return *Plan.get(); 8216 } 8217 llvm_unreachable("No plan found!"); 8218 } 8219 8220 void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF, 8221 VPlan &BestVPlan, 8222 InnerLoopVectorizer &ILV, 8223 DominatorTree *DT) { 8224 LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF << ", UF=" << BestUF 8225 << '\n'); 8226 8227 // Perform the actual loop transformation. 8228 8229 // 1. Create a new empty loop. Unlink the old loop and connect the new one. 8230 VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan}; 8231 State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(); 8232 State.TripCount = ILV.getOrCreateTripCount(nullptr); 8233 State.CanonicalIV = ILV.Induction; 8234 8235 ILV.printDebugTracesAtStart(); 8236 8237 //===------------------------------------------------===// 8238 // 8239 // Notice: any optimization or new instruction that go 8240 // into the code below should also be implemented in 8241 // the cost-model. 8242 // 8243 //===------------------------------------------------===// 8244 8245 // 2. Copy and widen instructions from the old loop into the new loop. 8246 BestVPlan.execute(&State); 8247 8248 // 3. Fix the vectorized code: take care of header phi's, live-outs, 8249 // predication, updating analyses. 
8250 ILV.fixVectorizedLoop(State); 8251 8252 ILV.printDebugTracesAtEnd(); 8253 } 8254 8255 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 8256 void LoopVectorizationPlanner::printPlans(raw_ostream &O) { 8257 for (const auto &Plan : VPlans) 8258 if (PrintVPlansInDotFormat) 8259 Plan->printDOT(O); 8260 else 8261 Plan->print(O); 8262 } 8263 #endif 8264 8265 void LoopVectorizationPlanner::collectTriviallyDeadInstructions( 8266 SmallPtrSetImpl<Instruction *> &DeadInstructions) { 8267 8268 // We create new control-flow for the vectorized loop, so the original exit 8269 // conditions will be dead after vectorization if it's only used by the 8270 // terminator 8271 SmallVector<BasicBlock*> ExitingBlocks; 8272 OrigLoop->getExitingBlocks(ExitingBlocks); 8273 for (auto *BB : ExitingBlocks) { 8274 auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0)); 8275 if (!Cmp || !Cmp->hasOneUse()) 8276 continue; 8277 8278 // TODO: we should introduce a getUniqueExitingBlocks on Loop 8279 if (!DeadInstructions.insert(Cmp).second) 8280 continue; 8281 8282 // The operands of the icmp is often a dead trunc, used by IndUpdate. 8283 // TODO: can recurse through operands in general 8284 for (Value *Op : Cmp->operands()) { 8285 if (isa<TruncInst>(Op) && Op->hasOneUse()) 8286 DeadInstructions.insert(cast<Instruction>(Op)); 8287 } 8288 } 8289 8290 // We create new "steps" for induction variable updates to which the original 8291 // induction variables map. An original update instruction will be dead if 8292 // all its users except the induction variable are dead. 8293 auto *Latch = OrigLoop->getLoopLatch(); 8294 for (auto &Induction : Legal->getInductionVars()) { 8295 PHINode *Ind = Induction.first; 8296 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 8297 8298 // If the tail is to be folded by masking, the primary induction variable, 8299 // if exists, isn't dead: it will be used for masking. Don't kill it. 8300 if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction()) 8301 continue; 8302 8303 if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 8304 return U == Ind || DeadInstructions.count(cast<Instruction>(U)); 8305 })) 8306 DeadInstructions.insert(IndUpdate); 8307 8308 // We record as "Dead" also the type-casting instructions we had identified 8309 // during induction analysis. We don't need any handling for them in the 8310 // vectorized loop because we have proven that, under a proper runtime 8311 // test guarding the vectorized loop, the value of the phi, and the casted 8312 // value of the phi, are the same. The last instruction in this casting chain 8313 // will get its scalar/vector/widened def from the scalar/vector/widened def 8314 // of the respective phi node. Any other casts in the induction def-use chain 8315 // have no other uses outside the phi update chain, and will be ignored. 8316 InductionDescriptor &IndDes = Induction.second; 8317 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 8318 DeadInstructions.insert(Casts.begin(), Casts.end()); 8319 } 8320 } 8321 8322 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; } 8323 8324 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } 8325 8326 Value *InnerLoopUnroller::getStepVector(Value *Val, Value *StartIdx, 8327 Value *Step, 8328 Instruction::BinaryOps BinOp) { 8329 // When unrolling and the VF is 1, we only need to add a simple scalar. 
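  // Illustrative example (not taken from any particular loop): for Val = %iv,
  // StartIdx = 2 and Step = 1 this produces the scalar "%iv + 2 * 1" rather
  // than a vector of per-lane steps.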
8330 Type *Ty = Val->getType(); 8331 assert(!Ty->isVectorTy() && "Val must be a scalar"); 8332 8333 if (Ty->isFloatingPointTy()) { 8334 // Floating-point operations inherit FMF via the builder's flags. 8335 Value *MulOp = Builder.CreateFMul(StartIdx, Step); 8336 return Builder.CreateBinOp(BinOp, Val, MulOp); 8337 } 8338 return Builder.CreateAdd(Val, Builder.CreateMul(StartIdx, Step), "induction"); 8339 } 8340 8341 static void AddRuntimeUnrollDisableMetaData(Loop *L) { 8342 SmallVector<Metadata *, 4> MDs; 8343 // Reserve first location for self reference to the LoopID metadata node. 8344 MDs.push_back(nullptr); 8345 bool IsUnrollMetadata = false; 8346 MDNode *LoopID = L->getLoopID(); 8347 if (LoopID) { 8348 // First find existing loop unrolling disable metadata. 8349 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { 8350 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); 8351 if (MD) { 8352 const auto *S = dyn_cast<MDString>(MD->getOperand(0)); 8353 IsUnrollMetadata = 8354 S && S->getString().startswith("llvm.loop.unroll.disable"); 8355 } 8356 MDs.push_back(LoopID->getOperand(i)); 8357 } 8358 } 8359 8360 if (!IsUnrollMetadata) { 8361 // Add runtime unroll disable metadata. 8362 LLVMContext &Context = L->getHeader()->getContext(); 8363 SmallVector<Metadata *, 1> DisableOperands; 8364 DisableOperands.push_back( 8365 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 8366 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 8367 MDs.push_back(DisableNode); 8368 MDNode *NewLoopID = MDNode::get(Context, MDs); 8369 // Set operand 0 to refer to the loop id itself. 8370 NewLoopID->replaceOperandWith(0, NewLoopID); 8371 L->setLoopID(NewLoopID); 8372 } 8373 } 8374 8375 //===--------------------------------------------------------------------===// 8376 // EpilogueVectorizerMainLoop 8377 //===--------------------------------------------------------------------===// 8378 8379 /// This function is partially responsible for generating the control flow 8380 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 8381 BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { 8382 MDNode *OrigLoopID = OrigLoop->getLoopID(); 8383 Loop *Lp = createVectorLoopSkeleton(""); 8384 8385 // Generate the code to check the minimum iteration count of the vector 8386 // epilogue (see below). 8387 EPI.EpilogueIterationCountCheck = 8388 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, true); 8389 EPI.EpilogueIterationCountCheck->setName("iter.check"); 8390 8391 // Generate the code to check any assumptions that we've made for SCEV 8392 // expressions. 8393 EPI.SCEVSafetyCheck = emitSCEVChecks(Lp, LoopScalarPreHeader); 8394 8395 // Generate the code that checks at runtime if arrays overlap. We put the 8396 // checks into a separate block to make the more common case of few elements 8397 // faster. 8398 EPI.MemSafetyCheck = emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 8399 8400 // Generate the iteration count check for the main loop, *after* the check 8401 // for the epilogue loop, so that the path-length is shorter for the case 8402 // that goes directly through the vector epilogue. The longer-path length for 8403 // the main loop is compensated for, by the gain from vectorizing the larger 8404 // trip count. Note: the branch will get updated later on when we vectorize 8405 // the epilogue. 
8406 EPI.MainLoopIterationCountCheck = 8407 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false); 8408 8409 // Generate the induction variable. 8410 OldInduction = Legal->getPrimaryInduction(); 8411 Type *IdxTy = Legal->getWidestInductionType(); 8412 Value *StartIdx = ConstantInt::get(IdxTy, 0); 8413 Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF); 8414 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 8415 EPI.VectorTripCount = CountRoundDown; 8416 Induction = 8417 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 8418 getDebugLocFromInstOrOperands(OldInduction)); 8419 8420 // Skip induction resume value creation here because they will be created in 8421 // the second pass. If we created them here, they wouldn't be used anyway, 8422 // because the vplan in the second pass still contains the inductions from the 8423 // original loop. 8424 8425 return completeLoopSkeleton(Lp, OrigLoopID); 8426 } 8427 8428 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { 8429 LLVM_DEBUG({ 8430 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n" 8431 << "Main Loop VF:" << EPI.MainLoopVF 8432 << ", Main Loop UF:" << EPI.MainLoopUF 8433 << ", Epilogue Loop VF:" << EPI.EpilogueVF 8434 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 8435 }); 8436 } 8437 8438 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() { 8439 DEBUG_WITH_TYPE(VerboseDebug, { 8440 dbgs() << "intermediate fn:\n" << *Induction->getFunction() << "\n"; 8441 }); 8442 } 8443 8444 BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck( 8445 Loop *L, BasicBlock *Bypass, bool ForEpilogue) { 8446 assert(L && "Expected valid Loop."); 8447 assert(Bypass && "Expected valid bypass basic block."); 8448 ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF; 8449 unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF; 8450 Value *Count = getOrCreateTripCount(L); 8451 // Reuse existing vector loop preheader for TC checks. 8452 // Note that new preheader block is generated for vector loop. 8453 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 8454 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 8455 8456 // Generate code to check if the loop's trip count is less than VF * UF of the 8457 // main vector loop. 8458 auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF : VF) ? 8459 ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 8460 8461 Value *CheckMinIters = Builder.CreateICmp( 8462 P, Count, getRuntimeVF(Builder, Count->getType(), VFactor * UFactor), 8463 "min.iters.check"); 8464 8465 if (!ForEpilogue) 8466 TCCheckBlock->setName("vector.main.loop.iter.check"); 8467 8468 // Create new preheader for vector loop. 8469 LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), 8470 DT, LI, nullptr, "vector.ph"); 8471 8472 if (ForEpilogue) { 8473 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 8474 DT->getNode(Bypass)->getIDom()) && 8475 "TC check is expected to dominate Bypass"); 8476 8477 // Update dominator for Bypass & LoopExit. 8478 DT->changeImmediateDominator(Bypass, TCCheckBlock); 8479 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF)) 8480 // For loops with multiple exits, there's no edge from the middle block 8481 // to exit blocks (as the epilogue must run) and thus no need to update 8482 // the immediate dominator of the exit blocks. 
8483 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 8484 8485 LoopBypassBlocks.push_back(TCCheckBlock); 8486 8487 // Save the trip count so we don't have to regenerate it in the 8488 // vec.epilog.iter.check. This is safe to do because the trip count 8489 // generated here dominates the vector epilog iter check. 8490 EPI.TripCount = Count; 8491 } 8492 8493 ReplaceInstWithInst( 8494 TCCheckBlock->getTerminator(), 8495 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 8496 8497 return TCCheckBlock; 8498 } 8499 8500 //===--------------------------------------------------------------------===// 8501 // EpilogueVectorizerEpilogueLoop 8502 //===--------------------------------------------------------------------===// 8503 8504 /// This function is partially responsible for generating the control flow 8505 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 8506 BasicBlock * 8507 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { 8508 MDNode *OrigLoopID = OrigLoop->getLoopID(); 8509 Loop *Lp = createVectorLoopSkeleton("vec.epilog."); 8510 8511 // Now, compare the remaining count and if there aren't enough iterations to 8512 // execute the vectorized epilogue skip to the scalar part. 8513 BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader; 8514 VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check"); 8515 LoopVectorPreHeader = 8516 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 8517 LI, nullptr, "vec.epilog.ph"); 8518 emitMinimumVectorEpilogueIterCountCheck(Lp, LoopScalarPreHeader, 8519 VecEpilogueIterationCountCheck); 8520 8521 // Adjust the control flow taking the state info from the main loop 8522 // vectorization into account. 8523 assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck && 8524 "expected this to be saved from the previous pass."); 8525 EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith( 8526 VecEpilogueIterationCountCheck, LoopVectorPreHeader); 8527 8528 DT->changeImmediateDominator(LoopVectorPreHeader, 8529 EPI.MainLoopIterationCountCheck); 8530 8531 EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith( 8532 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8533 8534 if (EPI.SCEVSafetyCheck) 8535 EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith( 8536 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8537 if (EPI.MemSafetyCheck) 8538 EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith( 8539 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8540 8541 DT->changeImmediateDominator( 8542 VecEpilogueIterationCountCheck, 8543 VecEpilogueIterationCountCheck->getSinglePredecessor()); 8544 8545 DT->changeImmediateDominator(LoopScalarPreHeader, 8546 EPI.EpilogueIterationCountCheck); 8547 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF)) 8548 // If there is an epilogue which must run, there's no edge from the 8549 // middle block to exit blocks and thus no need to update the immediate 8550 // dominator of the exit blocks. 8551 DT->changeImmediateDominator(LoopExitBlock, 8552 EPI.EpilogueIterationCountCheck); 8553 8554 // Keep track of bypass blocks, as they feed start values to the induction 8555 // phis in the scalar loop preheader. 
  if (EPI.SCEVSafetyCheck)
    LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck);
  if (EPI.MemSafetyCheck)
    LoopBypassBlocks.push_back(EPI.MemSafetyCheck);
  LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck);

  // Generate a resume induction for the vector epilogue and put it in the
  // vector epilogue preheader.
  Type *IdxTy = Legal->getWidestInductionType();
  PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val",
                                         LoopVectorPreHeader->getFirstNonPHI());
  EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
  EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
                           EPI.MainLoopIterationCountCheck);

  // Generate the induction variable.
  OldInduction = Legal->getPrimaryInduction();
  Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
  Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF);
  Value *StartIdx = EPResumeVal;
  Induction =
      createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
                              getDebugLocFromInstOrOperands(OldInduction));

  // Generate induction resume values. These variables save the new starting
  // indexes for the scalar loop. They are used to test if there are any tail
  // iterations left once the vector loop has completed.
  // Note that when the vectorized epilogue is skipped due to the iteration
  // count check, the resume value for the induction variable comes from the
  // trip count of the main vector loop, hence passing the AdditionalBypass
  // argument.
  createInductionResumeValues(Lp, CountRoundDown,
                              {VecEpilogueIterationCountCheck,
                               EPI.VectorTripCount} /* AdditionalBypass */);

  AddRuntimeUnrollDisableMetaData(Lp);
  return completeLoopSkeleton(Lp, OrigLoopID);
}

BasicBlock *
EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
    Loop *L, BasicBlock *Bypass, BasicBlock *Insert) {

  assert(EPI.TripCount &&
         "Expected trip count to have been saved in the first pass.");
  assert(
      (!isa<Instruction>(EPI.TripCount) ||
       DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
      "saved trip count does not dominate insertion point.");
  Value *TC = EPI.TripCount;
  IRBuilder<> Builder(Insert->getTerminator());
  Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");

  // Generate code to check if the loop's trip count is less than VF * UF of
  // the vector epilogue loop.
  auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF) ?
8612 ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 8613 8614 Value *CheckMinIters = Builder.CreateICmp( 8615 P, Count, 8616 getRuntimeVF(Builder, Count->getType(), EPI.EpilogueVF * EPI.EpilogueUF), 8617 "min.epilog.iters.check"); 8618 8619 ReplaceInstWithInst( 8620 Insert->getTerminator(), 8621 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 8622 8623 LoopBypassBlocks.push_back(Insert); 8624 return Insert; 8625 } 8626 8627 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() { 8628 LLVM_DEBUG({ 8629 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n" 8630 << "Epilogue Loop VF:" << EPI.EpilogueVF 8631 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 8632 }); 8633 } 8634 8635 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() { 8636 DEBUG_WITH_TYPE(VerboseDebug, { 8637 dbgs() << "final fn:\n" << *Induction->getFunction() << "\n"; 8638 }); 8639 } 8640 8641 bool LoopVectorizationPlanner::getDecisionAndClampRange( 8642 const std::function<bool(ElementCount)> &Predicate, VFRange &Range) { 8643 assert(!Range.isEmpty() && "Trying to test an empty VF range."); 8644 bool PredicateAtRangeStart = Predicate(Range.Start); 8645 8646 for (ElementCount TmpVF = Range.Start * 2; 8647 ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2) 8648 if (Predicate(TmpVF) != PredicateAtRangeStart) { 8649 Range.End = TmpVF; 8650 break; 8651 } 8652 8653 return PredicateAtRangeStart; 8654 } 8655 8656 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, 8657 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range 8658 /// of VF's starting at a given VF and extending it as much as possible. Each 8659 /// vectorization decision can potentially shorten this sub-range during 8660 /// buildVPlan(). 8661 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF, 8662 ElementCount MaxVF) { 8663 auto MaxVFPlusOne = MaxVF.getWithIncrement(1); 8664 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { 8665 VFRange SubRange = {VF, MaxVFPlusOne}; 8666 VPlans.push_back(buildVPlan(SubRange)); 8667 VF = SubRange.End; 8668 } 8669 } 8670 8671 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, 8672 VPlanPtr &Plan) { 8673 assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); 8674 8675 // Look for cached value. 8676 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 8677 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); 8678 if (ECEntryIt != EdgeMaskCache.end()) 8679 return ECEntryIt->second; 8680 8681 VPValue *SrcMask = createBlockInMask(Src, Plan); 8682 8683 // The terminator has to be a branch inst! 8684 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 8685 assert(BI && "Unexpected terminator found"); 8686 8687 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) 8688 return EdgeMaskCache[Edge] = SrcMask; 8689 8690 // If source is an exiting block, we know the exit edge is dynamically dead 8691 // in the vector loop, and thus we don't need to restrict the mask. Avoid 8692 // adding uses of an otherwise potentially dead instruction. 8693 if (OrigLoop->isLoopExiting(Src)) 8694 return EdgeMaskCache[Edge] = SrcMask; 8695 8696 VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition()); 8697 assert(EdgeMask && "No Edge Mask found for condition"); 8698 8699 if (BI->getSuccessor(0) != Dst) 8700 EdgeMask = Builder.createNot(EdgeMask); 8701 8702 if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND. 
8703 // The condition is 'SrcMask && EdgeMask', which is equivalent to 8704 // 'select i1 SrcMask, i1 EdgeMask, i1 false'. 8705 // The select version does not introduce new UB if SrcMask is false and 8706 // EdgeMask is poison. Using 'and' here introduces undefined behavior. 8707 VPValue *False = Plan->getOrAddVPValue( 8708 ConstantInt::getFalse(BI->getCondition()->getType())); 8709 EdgeMask = Builder.createSelect(SrcMask, EdgeMask, False); 8710 } 8711 8712 return EdgeMaskCache[Edge] = EdgeMask; 8713 } 8714 8715 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { 8716 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 8717 8718 // Look for cached value. 8719 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); 8720 if (BCEntryIt != BlockMaskCache.end()) 8721 return BCEntryIt->second; 8722 8723 // All-one mask is modelled as no-mask following the convention for masked 8724 // load/store/gather/scatter. Initialize BlockMask to no-mask. 8725 VPValue *BlockMask = nullptr; 8726 8727 if (OrigLoop->getHeader() == BB) { 8728 if (!CM.blockNeedsPredication(BB)) 8729 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. 8730 8731 // Create the block in mask as the first non-phi instruction in the block. 8732 VPBuilder::InsertPointGuard Guard(Builder); 8733 auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi(); 8734 Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint); 8735 8736 // Introduce the early-exit compare IV <= BTC to form header block mask. 8737 // This is used instead of IV < TC because TC may wrap, unlike BTC. 8738 // Start by constructing the desired canonical IV. 8739 VPValue *IV = nullptr; 8740 if (Legal->getPrimaryInduction()) 8741 IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction()); 8742 else { 8743 auto *IVRecipe = new VPWidenCanonicalIVRecipe(); 8744 Builder.getInsertBlock()->insert(IVRecipe, NewInsertionPoint); 8745 IV = IVRecipe; 8746 } 8747 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); 8748 bool TailFolded = !CM.isScalarEpilogueAllowed(); 8749 8750 if (TailFolded && CM.TTI.emitGetActiveLaneMask()) { 8751 // While ActiveLaneMask is a binary op that consumes the loop tripcount 8752 // as a second argument, we only pass the IV here and extract the 8753 // tripcount from the transform state where codegen of the VP instructions 8754 // happen. 8755 BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV}); 8756 } else { 8757 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); 8758 } 8759 return BlockMaskCache[BB] = BlockMask; 8760 } 8761 8762 // This is the block mask. We OR all incoming edges. 8763 for (auto *Predecessor : predecessors(BB)) { 8764 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); 8765 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too. 8766 return BlockMaskCache[BB] = EdgeMask; 8767 8768 if (!BlockMask) { // BlockMask has its initialized nullptr value. 
8769 BlockMask = EdgeMask; 8770 continue; 8771 } 8772 8773 BlockMask = Builder.createOr(BlockMask, EdgeMask); 8774 } 8775 8776 return BlockMaskCache[BB] = BlockMask; 8777 } 8778 8779 VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, 8780 ArrayRef<VPValue *> Operands, 8781 VFRange &Range, 8782 VPlanPtr &Plan) { 8783 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 8784 "Must be called with either a load or store"); 8785 8786 auto willWiden = [&](ElementCount VF) -> bool { 8787 if (VF.isScalar()) 8788 return false; 8789 LoopVectorizationCostModel::InstWidening Decision = 8790 CM.getWideningDecision(I, VF); 8791 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 8792 "CM decision should be taken at this point."); 8793 if (Decision == LoopVectorizationCostModel::CM_Interleave) 8794 return true; 8795 if (CM.isScalarAfterVectorization(I, VF) || 8796 CM.isProfitableToScalarize(I, VF)) 8797 return false; 8798 return Decision != LoopVectorizationCostModel::CM_Scalarize; 8799 }; 8800 8801 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8802 return nullptr; 8803 8804 VPValue *Mask = nullptr; 8805 if (Legal->isMaskRequired(I)) 8806 Mask = createBlockInMask(I->getParent(), Plan); 8807 8808 // Determine if the pointer operand of the access is either consecutive or 8809 // reverse consecutive. 8810 LoopVectorizationCostModel::InstWidening Decision = 8811 CM.getWideningDecision(I, Range.Start); 8812 bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse; 8813 bool Consecutive = 8814 Reverse || Decision == LoopVectorizationCostModel::CM_Widen; 8815 8816 if (LoadInst *Load = dyn_cast<LoadInst>(I)) 8817 return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask, 8818 Consecutive, Reverse); 8819 8820 StoreInst *Store = cast<StoreInst>(I); 8821 return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0], 8822 Mask, Consecutive, Reverse); 8823 } 8824 8825 VPWidenIntOrFpInductionRecipe * 8826 VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi, 8827 ArrayRef<VPValue *> Operands) const { 8828 // Check if this is an integer or fp induction. If so, build the recipe that 8829 // produces its scalar and vector values. 8830 InductionDescriptor II = Legal->getInductionVars().lookup(Phi); 8831 if (II.getKind() == InductionDescriptor::IK_IntInduction || 8832 II.getKind() == InductionDescriptor::IK_FpInduction) { 8833 assert(II.getStartValue() == 8834 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); 8835 const SmallVectorImpl<Instruction *> &Casts = II.getCastInsts(); 8836 return new VPWidenIntOrFpInductionRecipe( 8837 Phi, Operands[0], Casts.empty() ? nullptr : Casts.front()); 8838 } 8839 8840 return nullptr; 8841 } 8842 8843 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate( 8844 TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range, 8845 VPlan &Plan) const { 8846 // Optimize the special case where the source is a constant integer 8847 // induction variable. Notice that we can only optimize the 'trunc' case 8848 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and 8849 // (c) other casts depend on pointer size. 8850 8851 // Determine whether \p K is a truncation based on an induction variable that 8852 // can be optimized. 
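  // A typical candidate (illustrative) is "%t = trunc i64 %iv to i32", where
  // the cost model may decide to generate the induction directly in the
  // narrower type instead of widening the i64 induction and truncating every
  // element.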
8853 auto isOptimizableIVTruncate = 8854 [&](Instruction *K) -> std::function<bool(ElementCount)> { 8855 return [=](ElementCount VF) -> bool { 8856 return CM.isOptimizableIVTruncate(K, VF); 8857 }; 8858 }; 8859 8860 if (LoopVectorizationPlanner::getDecisionAndClampRange( 8861 isOptimizableIVTruncate(I), Range)) { 8862 8863 InductionDescriptor II = 8864 Legal->getInductionVars().lookup(cast<PHINode>(I->getOperand(0))); 8865 VPValue *Start = Plan.getOrAddVPValue(II.getStartValue()); 8866 return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)), 8867 Start, nullptr, I); 8868 } 8869 return nullptr; 8870 } 8871 8872 VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi, 8873 ArrayRef<VPValue *> Operands, 8874 VPlanPtr &Plan) { 8875 // If all incoming values are equal, the incoming VPValue can be used directly 8876 // instead of creating a new VPBlendRecipe. 8877 VPValue *FirstIncoming = Operands[0]; 8878 if (all_of(Operands, [FirstIncoming](const VPValue *Inc) { 8879 return FirstIncoming == Inc; 8880 })) { 8881 return Operands[0]; 8882 } 8883 8884 // We know that all PHIs in non-header blocks are converted into selects, so 8885 // we don't have to worry about the insertion order and we can just use the 8886 // builder. At this point we generate the predication tree. There may be 8887 // duplications since this is a simple recursive scan, but future 8888 // optimizations will clean it up. 8889 SmallVector<VPValue *, 2> OperandsWithMask; 8890 unsigned NumIncoming = Phi->getNumIncomingValues(); 8891 8892 for (unsigned In = 0; In < NumIncoming; In++) { 8893 VPValue *EdgeMask = 8894 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan); 8895 assert((EdgeMask || NumIncoming == 1) && 8896 "Multiple predecessors with one having a full mask"); 8897 OperandsWithMask.push_back(Operands[In]); 8898 if (EdgeMask) 8899 OperandsWithMask.push_back(EdgeMask); 8900 } 8901 return toVPRecipeResult(new VPBlendRecipe(Phi, OperandsWithMask)); 8902 } 8903 8904 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, 8905 ArrayRef<VPValue *> Operands, 8906 VFRange &Range) const { 8907 8908 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8909 [this, CI](ElementCount VF) { return CM.isScalarWithPredication(CI); }, 8910 Range); 8911 8912 if (IsPredicated) 8913 return nullptr; 8914 8915 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8916 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || 8917 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect || 8918 ID == Intrinsic::pseudoprobe || 8919 ID == Intrinsic::experimental_noalias_scope_decl)) 8920 return nullptr; 8921 8922 auto willWiden = [&](ElementCount VF) -> bool { 8923 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8924 // The following case may be scalarized depending on the VF. 8925 // The flag shows whether we use Intrinsic or a usual Call for vectorized 8926 // version of the instruction. 8927 // Is it beneficial to perform intrinsic call compared to lib call? 8928 bool NeedToScalarize = false; 8929 InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); 8930 InstructionCost IntrinsicCost = ID ? 
CM.getVectorIntrinsicCost(CI, VF) : 0; 8931 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 8932 return UseVectorIntrinsic || !NeedToScalarize; 8933 }; 8934 8935 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8936 return nullptr; 8937 8938 ArrayRef<VPValue *> Ops = Operands.take_front(CI->arg_size()); 8939 return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end())); 8940 } 8941 8942 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { 8943 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) && 8944 !isa<StoreInst>(I) && "Instruction should have been handled earlier"); 8945 // Instruction should be widened, unless it is scalar after vectorization, 8946 // scalarization is profitable or it is predicated. 8947 auto WillScalarize = [this, I](ElementCount VF) -> bool { 8948 return CM.isScalarAfterVectorization(I, VF) || 8949 CM.isProfitableToScalarize(I, VF) || CM.isScalarWithPredication(I); 8950 }; 8951 return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize, 8952 Range); 8953 } 8954 8955 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, 8956 ArrayRef<VPValue *> Operands) const { 8957 auto IsVectorizableOpcode = [](unsigned Opcode) { 8958 switch (Opcode) { 8959 case Instruction::Add: 8960 case Instruction::And: 8961 case Instruction::AShr: 8962 case Instruction::BitCast: 8963 case Instruction::FAdd: 8964 case Instruction::FCmp: 8965 case Instruction::FDiv: 8966 case Instruction::FMul: 8967 case Instruction::FNeg: 8968 case Instruction::FPExt: 8969 case Instruction::FPToSI: 8970 case Instruction::FPToUI: 8971 case Instruction::FPTrunc: 8972 case Instruction::FRem: 8973 case Instruction::FSub: 8974 case Instruction::ICmp: 8975 case Instruction::IntToPtr: 8976 case Instruction::LShr: 8977 case Instruction::Mul: 8978 case Instruction::Or: 8979 case Instruction::PtrToInt: 8980 case Instruction::SDiv: 8981 case Instruction::Select: 8982 case Instruction::SExt: 8983 case Instruction::Shl: 8984 case Instruction::SIToFP: 8985 case Instruction::SRem: 8986 case Instruction::Sub: 8987 case Instruction::Trunc: 8988 case Instruction::UDiv: 8989 case Instruction::UIToFP: 8990 case Instruction::URem: 8991 case Instruction::Xor: 8992 case Instruction::ZExt: 8993 return true; 8994 } 8995 return false; 8996 }; 8997 8998 if (!IsVectorizableOpcode(I->getOpcode())) 8999 return nullptr; 9000 9001 // Success: widen this instruction. 9002 return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end())); 9003 } 9004 9005 void VPRecipeBuilder::fixHeaderPhis() { 9006 BasicBlock *OrigLatch = OrigLoop->getLoopLatch(); 9007 for (VPWidenPHIRecipe *R : PhisToFix) { 9008 auto *PN = cast<PHINode>(R->getUnderlyingValue()); 9009 VPRecipeBase *IncR = 9010 getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch))); 9011 R->addOperand(IncR->getVPSingleValue()); 9012 } 9013 } 9014 9015 VPBasicBlock *VPRecipeBuilder::handleReplication( 9016 Instruction *I, VFRange &Range, VPBasicBlock *VPBB, 9017 VPlanPtr &Plan) { 9018 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( 9019 [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); }, 9020 Range); 9021 9022 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 9023 [&](ElementCount VF) { return CM.isPredicatedInst(I); }, Range); 9024 9025 // Even if the instruction is not marked as uniform, there are certain 9026 // intrinsic calls that can be effectively treated as such, so we check for 9027 // them here. 
// them here.
Conservatively, we only do this for scalable vectors, since 9028 // for fixed-width VFs we can always fall back on full scalarization. 9029 if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) { 9030 switch (cast<IntrinsicInst>(I)->getIntrinsicID()) { 9031 case Intrinsic::assume: 9032 case Intrinsic::lifetime_start: 9033 case Intrinsic::lifetime_end: 9034 // For scalable vectors if one of the operands is variant then we still 9035 // want to mark as uniform, which will generate one instruction for just 9036 // the first lane of the vector. We can't scalarize the call in the same 9037 // way as for fixed-width vectors because we don't know how many lanes 9038 // there are. 9039 // 9040 // The reasons for doing it this way for scalable vectors are: 9041 // 1. For the assume intrinsic generating the instruction for the first 9042 // lane is still be better than not generating any at all. For 9043 // example, the input may be a splat across all lanes. 9044 // 2. For the lifetime start/end intrinsics the pointer operand only 9045 // does anything useful when the input comes from a stack object, 9046 // which suggests it should always be uniform. For non-stack objects 9047 // the effect is to poison the object, which still allows us to 9048 // remove the call. 9049 IsUniform = true; 9050 break; 9051 default: 9052 break; 9053 } 9054 } 9055 9056 auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()), 9057 IsUniform, IsPredicated); 9058 setRecipe(I, Recipe); 9059 Plan->addVPValue(I, Recipe); 9060 9061 // Find if I uses a predicated instruction. If so, it will use its scalar 9062 // value. Avoid hoisting the insert-element which packs the scalar value into 9063 // a vector value, as that happens iff all users use the vector value. 9064 for (VPValue *Op : Recipe->operands()) { 9065 auto *PredR = dyn_cast_or_null<VPPredInstPHIRecipe>(Op->getDef()); 9066 if (!PredR) 9067 continue; 9068 auto *RepR = 9069 cast_or_null<VPReplicateRecipe>(PredR->getOperand(0)->getDef()); 9070 assert(RepR->isPredicated() && 9071 "expected Replicate recipe to be predicated"); 9072 RepR->setAlsoPack(false); 9073 } 9074 9075 // Finalize the recipe for Instr, first if it is not predicated. 9076 if (!IsPredicated) { 9077 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n"); 9078 VPBB->appendRecipe(Recipe); 9079 return VPBB; 9080 } 9081 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); 9082 assert(VPBB->getSuccessors().empty() && 9083 "VPBB has successors when handling predicated replication."); 9084 // Record predicated instructions for above packing optimizations. 9085 VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan); 9086 VPBlockUtils::insertBlockAfter(Region, VPBB); 9087 auto *RegSucc = new VPBasicBlock(); 9088 VPBlockUtils::insertBlockAfter(RegSucc, Region); 9089 return RegSucc; 9090 } 9091 9092 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr, 9093 VPRecipeBase *PredRecipe, 9094 VPlanPtr &Plan) { 9095 // Instructions marked for predication are replicated and placed under an 9096 // if-then construct to prevent side-effects. 9097 9098 // Generate recipes to compute the block mask for this region. 9099 VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan); 9100 9101 // Build the triangular if-then region. 
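  // Roughly, the region built below has the shape (block names follow the
  // ones created here):
  //
  //   pred.<opcode>.entry      - VPBranchOnMaskRecipe on BlockInMask
  //      |          \
  //      |    pred.<opcode>.if - the predicated replicate recipe
  //      |          /
  //   pred.<opcode>.continue   - VPPredInstPHIRecipe, if Instr produces a value
  //
  // When the mask is false the .if block is skipped entirely.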
9102 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str(); 9103 assert(Instr->getParent() && "Predicated instruction not in any basic block"); 9104 auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask); 9105 auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); 9106 auto *PHIRecipe = Instr->getType()->isVoidTy() 9107 ? nullptr 9108 : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr)); 9109 if (PHIRecipe) { 9110 Plan->removeVPValueFor(Instr); 9111 Plan->addVPValue(Instr, PHIRecipe); 9112 } 9113 auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); 9114 auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); 9115 VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true); 9116 9117 // Note: first set Entry as region entry and then connect successors starting 9118 // from it in order, to propagate the "parent" of each VPBasicBlock. 9119 VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry); 9120 VPBlockUtils::connectBlocks(Pred, Exit); 9121 9122 return Region; 9123 } 9124 9125 VPRecipeOrVPValueTy 9126 VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, 9127 ArrayRef<VPValue *> Operands, 9128 VFRange &Range, VPlanPtr &Plan) { 9129 // First, check for specific widening recipes that deal with calls, memory 9130 // operations, inductions and Phi nodes. 9131 if (auto *CI = dyn_cast<CallInst>(Instr)) 9132 return toVPRecipeResult(tryToWidenCall(CI, Operands, Range)); 9133 9134 if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr)) 9135 return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan)); 9136 9137 VPRecipeBase *Recipe; 9138 if (auto Phi = dyn_cast<PHINode>(Instr)) { 9139 if (Phi->getParent() != OrigLoop->getHeader()) 9140 return tryToBlend(Phi, Operands, Plan); 9141 if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands))) 9142 return toVPRecipeResult(Recipe); 9143 9144 VPWidenPHIRecipe *PhiRecipe = nullptr; 9145 if (Legal->isReductionVariable(Phi) || Legal->isFirstOrderRecurrence(Phi)) { 9146 VPValue *StartV = Operands[0]; 9147 if (Legal->isReductionVariable(Phi)) { 9148 RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi]; 9149 assert(RdxDesc.getRecurrenceStartValue() == 9150 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); 9151 PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV, 9152 CM.isInLoopReduction(Phi), 9153 CM.useOrderedReductions(RdxDesc)); 9154 } else { 9155 PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV); 9156 } 9157 9158 // Record the incoming value from the backedge, so we can add the incoming 9159 // value from the backedge after all recipes have been created. 9160 recordRecipeOf(cast<Instruction>( 9161 Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch()))); 9162 PhisToFix.push_back(PhiRecipe); 9163 } else { 9164 // TODO: record start and backedge value for remaining pointer induction 9165 // phis. 
      assert(Phi->getType()->isPointerTy() &&
             "only pointer phis should be handled here");
      PhiRecipe = new VPWidenPHIRecipe(Phi);
    }

    return toVPRecipeResult(PhiRecipe);
  }

  if (isa<TruncInst>(Instr) &&
      (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Operands,
                                               Range, *Plan)))
    return toVPRecipeResult(Recipe);

  if (!shouldWiden(Instr, Range))
    return nullptr;

  if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
    return toVPRecipeResult(new VPWidenGEPRecipe(
        GEP, make_range(Operands.begin(), Operands.end()), OrigLoop));

  if (auto *SI = dyn_cast<SelectInst>(Instr)) {
    bool InvariantCond =
        PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
    return toVPRecipeResult(new VPWidenSelectRecipe(
        *SI, make_range(Operands.begin(), Operands.end()), InvariantCond));
  }

  return toVPRecipeResult(tryToWiden(Instr, Operands));
}

void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
                                                        ElementCount MaxVF) {
  assert(OrigLoop->isInnermost() && "Inner loop expected.");

  // Collect instructions from the original loop that will become trivially
  // dead in the vectorized loop. We don't need to vectorize these
  // instructions. For example, original induction update instructions can
  // become dead because we separately emit induction "steps" when generating
  // code for the new loop. Similarly, we create a new latch condition when
  // setting up the structure of the new loop, so the old one can become dead.
  SmallPtrSet<Instruction *, 4> DeadInstructions;
  collectTriviallyDeadInstructions(DeadInstructions);

  // Add assume instructions we need to drop to DeadInstructions, to prevent
  // them from being added to the VPlan.
  // TODO: We only need to drop assumes in blocks that get flattened. If the
  // control flow is preserved, we should keep them.
  auto &ConditionalAssumes = Legal->getConditionalAssumes();
  DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());

  MapVector<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
  // Dead instructions do not need sinking. Remove them from SinkAfter.
  for (Instruction *I : DeadInstructions)
    SinkAfter.erase(I);

  // Cannot sink instructions after dead instructions (there won't be any
  // recipes for them). Instead, find the first non-dead previous instruction.
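  // E.g. (illustrative) if the recorded target is a dead induction update that
  // only fed the old exit compare, walk backwards from it until a live
  // instruction is found and sink after that one instead.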
  for (auto &P : Legal->getSinkAfter()) {
    Instruction *SinkTarget = P.second;
    Instruction *FirstInst = &*SinkTarget->getParent()->begin();
    (void)FirstInst;
    while (DeadInstructions.contains(SinkTarget)) {
      assert(
          SinkTarget != FirstInst &&
          "Must find a live instruction (at least the one feeding the "
          "first-order recurrence PHI) before reaching beginning of the block");
      SinkTarget = SinkTarget->getPrevNode();
      assert(SinkTarget != P.first &&
             "sink source equals target, no sinking required");
    }
    P.second = SinkTarget;
  }

  auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
  for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
    VFRange SubRange = {VF, MaxVFPlusOne};
    VPlans.push_back(
        buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter));
    VF = SubRange.End;
  }
}

VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
    VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions,
    const MapVector<Instruction *, Instruction *> &SinkAfter) {

  SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;

  VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);

  // ---------------------------------------------------------------------------
  // Pre-construction: record ingredients whose recipes we'll need to further
  // process after constructing the initial VPlan.
  // ---------------------------------------------------------------------------

  // Mark instructions we'll need to sink later and their targets as
  // ingredients whose recipe we'll need to record.
  for (auto &Entry : SinkAfter) {
    RecipeBuilder.recordRecipeOf(Entry.first);
    RecipeBuilder.recordRecipeOf(Entry.second);
  }
  for (auto &Reduction : CM.getInLoopReductionChains()) {
    PHINode *Phi = Reduction.first;
    RecurKind Kind = Legal->getReductionVars()[Phi].getRecurrenceKind();
    const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;

    RecipeBuilder.recordRecipeOf(Phi);
    for (auto &R : ReductionOperations) {
      RecipeBuilder.recordRecipeOf(R);
      // For min/max reductions, where we have a pair of icmp/select, we also
      // need to record the ICmp recipe, so it can be removed later.
      assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
             "Only min/max recurrences allowed for inloop reductions");
      if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
        RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0)));
    }
  }

  // For each interleave group which is relevant for this (possibly trimmed)
  // Range, add it to the set of groups to be later applied to the VPlan and add
  // placeholders for its members' Recipes which we'll be replacing with a
  // single VPInterleaveRecipe.
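  // E.g. (illustrative) loads of a[2*i] and a[2*i+1] form a group with factor
  // 2; the widened-memory recipes of its members are later replaced by one
  // VPInterleaveRecipe, which roughly corresponds to a single wide access plus
  // shuffles.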
9288 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) { 9289 auto applyIG = [IG, this](ElementCount VF) -> bool { 9290 return (VF.isVector() && // Query is illegal for VF == 1 9291 CM.getWideningDecision(IG->getInsertPos(), VF) == 9292 LoopVectorizationCostModel::CM_Interleave); 9293 }; 9294 if (!getDecisionAndClampRange(applyIG, Range)) 9295 continue; 9296 InterleaveGroups.insert(IG); 9297 for (unsigned i = 0; i < IG->getFactor(); i++) 9298 if (Instruction *Member = IG->getMember(i)) 9299 RecipeBuilder.recordRecipeOf(Member); 9300 }; 9301 9302 // --------------------------------------------------------------------------- 9303 // Build initial VPlan: Scan the body of the loop in a topological order to 9304 // visit each basic block after having visited its predecessor basic blocks. 9305 // --------------------------------------------------------------------------- 9306 9307 // Create a dummy pre-entry VPBasicBlock to start building the VPlan. 9308 auto Plan = std::make_unique<VPlan>(); 9309 9310 // Scan the body of the loop in a topological order to visit each basic block 9311 // after having visited its predecessor basic blocks. 9312 LoopBlocksDFS DFS(OrigLoop); 9313 DFS.perform(LI); 9314 9315 VPBasicBlock *VPBB = nullptr; 9316 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 9317 // Relevant instructions from basic block BB will be grouped into VPRecipe 9318 // ingredients and fill a new VPBasicBlock. 9319 unsigned VPBBsForBB = 0; 9320 auto *FirstVPBBForBB = new VPBasicBlock(BB->getName()); 9321 if (VPBB) 9322 VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB); 9323 else 9324 Plan->setEntry(FirstVPBBForBB); 9325 VPBB = FirstVPBBForBB; 9326 Builder.setInsertPoint(VPBB); 9327 9328 // Introduce each ingredient into VPlan. 9329 // TODO: Model and preserve debug instrinsics in VPlan. 9330 for (Instruction &I : BB->instructionsWithoutDebug()) { 9331 Instruction *Instr = &I; 9332 9333 // First filter out irrelevant instructions, to ensure no recipes are 9334 // built for them. 9335 if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr)) 9336 continue; 9337 9338 SmallVector<VPValue *, 4> Operands; 9339 auto *Phi = dyn_cast<PHINode>(Instr); 9340 if (Phi && Phi->getParent() == OrigLoop->getHeader()) { 9341 Operands.push_back(Plan->getOrAddVPValue( 9342 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()))); 9343 } else { 9344 auto OpRange = Plan->mapToVPValues(Instr->operands()); 9345 Operands = {OpRange.begin(), OpRange.end()}; 9346 } 9347 if (auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe( 9348 Instr, Operands, Range, Plan)) { 9349 // If Instr can be simplified to an existing VPValue, use it. 9350 if (RecipeOrValue.is<VPValue *>()) { 9351 auto *VPV = RecipeOrValue.get<VPValue *>(); 9352 Plan->addVPValue(Instr, VPV); 9353 // If the re-used value is a recipe, register the recipe for the 9354 // instruction, in case the recipe for Instr needs to be recorded. 9355 if (auto *R = dyn_cast_or_null<VPRecipeBase>(VPV->getDef())) 9356 RecipeBuilder.setRecipe(Instr, R); 9357 continue; 9358 } 9359 // Otherwise, add the new recipe. 9360 VPRecipeBase *Recipe = RecipeOrValue.get<VPRecipeBase *>(); 9361 for (auto *Def : Recipe->definedValues()) { 9362 auto *UV = Def->getUnderlyingValue(); 9363 Plan->addVPValue(UV, Def); 9364 } 9365 9366 RecipeBuilder.setRecipe(Instr, Recipe); 9367 if (isa<VPWidenIntOrFpInductionRecipe>(Recipe)) { 9368 // Make sure induction recipes are all kept in the header block. 
9369 // VPWidenIntOrFpInductionRecipe may be generated when reaching a 9370 // Trunc of an induction Phi, where Trunc may not be in the header. 9371 auto *Header = Plan->getEntry()->getEntryBasicBlock(); 9372 Header->insert(Recipe, Header->getFirstNonPhi()); 9373 } else 9374 VPBB->appendRecipe(Recipe); 9375 continue; 9376 } 9377 9378 // Otherwise, if all widening options failed, Instruction is to be 9379 // replicated. This may create a successor for VPBB. 9380 VPBasicBlock *NextVPBB = 9381 RecipeBuilder.handleReplication(Instr, Range, VPBB, Plan); 9382 if (NextVPBB != VPBB) { 9383 VPBB = NextVPBB; 9384 VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++) 9385 : ""); 9386 } 9387 } 9388 } 9389 9390 assert(isa<VPBasicBlock>(Plan->getEntry()) && 9391 !Plan->getEntry()->getEntryBasicBlock()->empty() && 9392 "entry block must be set to a non-empty VPBasicBlock"); 9393 RecipeBuilder.fixHeaderPhis(); 9394 9395 // --------------------------------------------------------------------------- 9396 // Transform initial VPlan: Apply previously taken decisions, in order, to 9397 // bring the VPlan to its final state. 9398 // --------------------------------------------------------------------------- 9399 9400 // Apply Sink-After legal constraints. 9401 auto GetReplicateRegion = [](VPRecipeBase *R) -> VPRegionBlock * { 9402 auto *Region = dyn_cast_or_null<VPRegionBlock>(R->getParent()->getParent()); 9403 if (Region && Region->isReplicator()) { 9404 assert(Region->getNumSuccessors() == 1 && 9405 Region->getNumPredecessors() == 1 && "Expected SESE region!"); 9406 assert(R->getParent()->size() == 1 && 9407 "A recipe in an original replicator region must be the only " 9408 "recipe in its block"); 9409 return Region; 9410 } 9411 return nullptr; 9412 }; 9413 for (auto &Entry : SinkAfter) { 9414 VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first); 9415 VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second); 9416 9417 auto *TargetRegion = GetReplicateRegion(Target); 9418 auto *SinkRegion = GetReplicateRegion(Sink); 9419 if (!SinkRegion) { 9420 // If the sink source is not a replicate region, sink the recipe directly. 9421 if (TargetRegion) { 9422 // The target is in a replication region, make sure to move Sink to 9423 // the block after it, not into the replication region itself. 9424 VPBasicBlock *NextBlock = 9425 cast<VPBasicBlock>(TargetRegion->getSuccessors().front()); 9426 Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi()); 9427 } else 9428 Sink->moveAfter(Target); 9429 continue; 9430 } 9431 9432 // The sink source is in a replicate region. Unhook the region from the CFG. 9433 auto *SinkPred = SinkRegion->getSinglePredecessor(); 9434 auto *SinkSucc = SinkRegion->getSingleSuccessor(); 9435 VPBlockUtils::disconnectBlocks(SinkPred, SinkRegion); 9436 VPBlockUtils::disconnectBlocks(SinkRegion, SinkSucc); 9437 VPBlockUtils::connectBlocks(SinkPred, SinkSucc); 9438 9439 if (TargetRegion) { 9440 // The target recipe is also in a replicate region, move the sink region 9441 // after the target region. 9442 auto *TargetSucc = TargetRegion->getSingleSuccessor(); 9443 VPBlockUtils::disconnectBlocks(TargetRegion, TargetSucc); 9444 VPBlockUtils::connectBlocks(TargetRegion, SinkRegion); 9445 VPBlockUtils::connectBlocks(SinkRegion, TargetSucc); 9446 } else { 9447 // The sink source is in a replicate region, we need to move the whole 9448 // replicate region, which should only contain a single recipe in the 9449 // main block. 
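      // This is done by splitting Target's block right after Target and then
      // splicing the sink region between the two halves, roughly:
      //   ... -> (block containing Target) -> SinkRegion -> SplitBlock -> ...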
9450 auto *SplitBlock = 9451 Target->getParent()->splitAt(std::next(Target->getIterator())); 9452 9453 auto *SplitPred = SplitBlock->getSinglePredecessor(); 9454 9455 VPBlockUtils::disconnectBlocks(SplitPred, SplitBlock); 9456 VPBlockUtils::connectBlocks(SplitPred, SinkRegion); 9457 VPBlockUtils::connectBlocks(SinkRegion, SplitBlock); 9458 if (VPBB == SplitPred) 9459 VPBB = SplitBlock; 9460 } 9461 } 9462 9463 // Adjust the recipes for any inloop reductions. 9464 adjustRecipesForReductions(VPBB, Plan, RecipeBuilder, Range.Start); 9465 9466 // Introduce a recipe to combine the incoming and previous values of a 9467 // first-order recurrence. 9468 for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) { 9469 auto *RecurPhi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R); 9470 if (!RecurPhi) 9471 continue; 9472 9473 auto *RecurSplice = cast<VPInstruction>( 9474 Builder.createNaryOp(VPInstruction::FirstOrderRecurrenceSplice, 9475 {RecurPhi, RecurPhi->getBackedgeValue()})); 9476 9477 VPRecipeBase *PrevRecipe = RecurPhi->getBackedgeRecipe(); 9478 if (auto *Region = GetReplicateRegion(PrevRecipe)) { 9479 VPBasicBlock *Succ = cast<VPBasicBlock>(Region->getSingleSuccessor()); 9480 RecurSplice->moveBefore(*Succ, Succ->getFirstNonPhi()); 9481 } else 9482 RecurSplice->moveAfter(PrevRecipe); 9483 RecurPhi->replaceAllUsesWith(RecurSplice); 9484 // Set the first operand of RecurSplice to RecurPhi again, after replacing 9485 // all users. 9486 RecurSplice->setOperand(0, RecurPhi); 9487 } 9488 9489 // Interleave memory: for each Interleave Group we marked earlier as relevant 9490 // for this VPlan, replace the Recipes widening its memory instructions with a 9491 // single VPInterleaveRecipe at its insertion point. 9492 for (auto IG : InterleaveGroups) { 9493 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( 9494 RecipeBuilder.getRecipe(IG->getInsertPos())); 9495 SmallVector<VPValue *, 4> StoredValues; 9496 for (unsigned i = 0; i < IG->getFactor(); ++i) 9497 if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) { 9498 auto *StoreR = 9499 cast<VPWidenMemoryInstructionRecipe>(RecipeBuilder.getRecipe(SI)); 9500 StoredValues.push_back(StoreR->getStoredValue()); 9501 } 9502 9503 auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues, 9504 Recipe->getMask()); 9505 VPIG->insertBefore(Recipe); 9506 unsigned J = 0; 9507 for (unsigned i = 0; i < IG->getFactor(); ++i) 9508 if (Instruction *Member = IG->getMember(i)) { 9509 if (!Member->getType()->isVoidTy()) { 9510 VPValue *OriginalV = Plan->getVPValue(Member); 9511 Plan->removeVPValueFor(Member); 9512 Plan->addVPValue(Member, VPIG->getVPValue(J)); 9513 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J)); 9514 J++; 9515 } 9516 RecipeBuilder.getRecipe(Member)->eraseFromParent(); 9517 } 9518 } 9519 9520 // From this point onwards, VPlan-to-VPlan transformations may change the plan 9521 // in ways that accessing values using original IR values is incorrect. 
  Plan->disableValue2VPValue();

  VPlanTransforms::sinkScalarOperands(*Plan);
  VPlanTransforms::mergeReplicateRegions(*Plan);

  std::string PlanName;
  raw_string_ostream RSO(PlanName);
  ElementCount VF = Range.Start;
  Plan->addVF(VF);
  RSO << "Initial VPlan for VF={" << VF;
  for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) {
    Plan->addVF(VF);
    RSO << "," << VF;
  }
  RSO << "},UF>=1";
  RSO.flush();
  Plan->setName(PlanName);

  return Plan;
}

VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
  // Outer loop handling: outer loops may require CFG and instruction level
  // transformations before even evaluating whether vectorization is
  // profitable. Since we cannot modify the incoming IR, we need to build
  // VPlan upfront in the vectorization pipeline.
  assert(!OrigLoop->isInnermost());
  assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");

  // Create new empty VPlan
  auto Plan = std::make_unique<VPlan>();

  // Build hierarchical CFG
  VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
  HCFGBuilder.buildHierarchicalCFG();

  for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End);
       VF *= 2)
    Plan->addVF(VF);

  if (EnableVPlanPredication) {
    VPlanPredicator VPP(*Plan);
    VPP.predicate();

    // Avoid running the transformation to recipes until masked code generation
    // in the VPlan-native path is in place.
    return Plan;
  }

  SmallPtrSet<Instruction *, 1> DeadInstructions;
  VPlanTransforms::VPInstructionsToVPRecipes(OrigLoop, Plan,
                                             Legal->getInductionVars(),
                                             DeadInstructions, *PSE.getSE());
  return Plan;
}

// Adjust the recipes for reductions. For in-loop reductions the chain of
// instructions leading from the loop exit instr to the phi needs to be
// converted to reductions, with one operand being vector and the other being
// the scalar reduction chain. For other reductions, a select is introduced
// between the phi and live-out recipes when folding the tail.
void LoopVectorizationPlanner::adjustRecipesForReductions(
    VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder,
    ElementCount MinVF) {
  for (auto &Reduction : CM.getInLoopReductionChains()) {
    PHINode *Phi = Reduction.first;
    RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
    const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;

    if (MinVF.isScalar() && !CM.useOrderedReductions(RdxDesc))
      continue;

    // ReductionOperations are ordered top-down from the phi's use to the
    // LoopExitValue. We keep track of the previous item (the Chain) to tell
    // which of the two operands will remain scalar and which will be reduced.
    // For minmax the chain will be the select instructions.
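    // For illustration only (not tied to any particular input): for an
    // integer sum reduction such as
    //   for (i = ...) total += a[i];
    // ReductionOperations holds just the 'add' feeding the phi, so Chain walks
    // phi -> add; the add's phi operand stays on the scalar chain while its
    // other (a[i]) operand is the one that gets reduced.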
    Instruction *Chain = Phi;
    for (Instruction *R : ReductionOperations) {
      VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
      RecurKind Kind = RdxDesc.getRecurrenceKind();

      VPValue *ChainOp = Plan->getVPValue(Chain);
      unsigned FirstOpId;
      assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
             "Only min/max recurrences allowed for inloop reductions");
      if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
        assert(isa<VPWidenSelectRecipe>(WidenRecipe) &&
               "Expected to replace a VPWidenSelectSC");
        FirstOpId = 1;
      } else {
        assert((MinVF.isScalar() || isa<VPWidenRecipe>(WidenRecipe)) &&
               "Expected to replace a VPWidenSC");
        FirstOpId = 0;
      }
      unsigned VecOpId =
          R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
      VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));

      auto *CondOp = CM.foldTailByMasking()
                         ? RecipeBuilder.createBlockInMask(R->getParent(), Plan)
                         : nullptr;
      VPReductionRecipe *RedRecipe =
          new VPReductionRecipe(&RdxDesc, R, ChainOp, VecOp, CondOp, TTI);
      WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe);
      Plan->removeVPValueFor(R);
      Plan->addVPValue(R, RedRecipe);
      WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator());
      WidenRecipe->eraseFromParent();

      if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
        VPRecipeBase *CompareRecipe =
            RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0)));
        assert(isa<VPWidenRecipe>(CompareRecipe) &&
               "Expected to replace a VPWidenSC");
        assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 &&
               "Expected no remaining users");
        CompareRecipe->eraseFromParent();
      }
      Chain = R;
    }
  }

  // If tail is folded by masking, introduce selects between the phi
  // and the live-out instruction of each reduction, at the end of the latch.
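  // A sketch of the select introduced below (value names are illustrative
  // only):
  //   %rdx.select = select <header mask>, <reduction value from latch>, %rdx.phi
  // so lanes that are masked off in the final iteration keep the phi's value.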
9647 if (CM.foldTailByMasking()) { 9648 for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) { 9649 VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R); 9650 if (!PhiR || PhiR->isInLoop()) 9651 continue; 9652 Builder.setInsertPoint(LatchVPBB); 9653 VPValue *Cond = 9654 RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan); 9655 VPValue *Red = PhiR->getBackedgeValue(); 9656 Builder.createNaryOp(Instruction::Select, {Cond, Red, PhiR}); 9657 } 9658 } 9659 } 9660 9661 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 9662 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, 9663 VPSlotTracker &SlotTracker) const { 9664 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; 9665 IG->getInsertPos()->printAsOperand(O, false); 9666 O << ", "; 9667 getAddr()->printAsOperand(O, SlotTracker); 9668 VPValue *Mask = getMask(); 9669 if (Mask) { 9670 O << ", "; 9671 Mask->printAsOperand(O, SlotTracker); 9672 } 9673 9674 unsigned OpIdx = 0; 9675 for (unsigned i = 0; i < IG->getFactor(); ++i) { 9676 if (!IG->getMember(i)) 9677 continue; 9678 if (getNumStoreOperands() > 0) { 9679 O << "\n" << Indent << " store "; 9680 getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker); 9681 O << " to index " << i; 9682 } else { 9683 O << "\n" << Indent << " "; 9684 getVPValue(OpIdx)->printAsOperand(O, SlotTracker); 9685 O << " = load from index " << i; 9686 } 9687 ++OpIdx; 9688 } 9689 } 9690 #endif 9691 9692 void VPWidenCallRecipe::execute(VPTransformState &State) { 9693 State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this, 9694 *this, State); 9695 } 9696 9697 void VPWidenSelectRecipe::execute(VPTransformState &State) { 9698 State.ILV->widenSelectInstruction(*cast<SelectInst>(getUnderlyingInstr()), 9699 this, *this, InvariantCond, State); 9700 } 9701 9702 void VPWidenRecipe::execute(VPTransformState &State) { 9703 State.ILV->widenInstruction(*getUnderlyingInstr(), this, *this, State); 9704 } 9705 9706 void VPWidenGEPRecipe::execute(VPTransformState &State) { 9707 State.ILV->widenGEP(cast<GetElementPtrInst>(getUnderlyingInstr()), this, 9708 *this, State.UF, State.VF, IsPtrLoopInvariant, 9709 IsIndexLoopInvariant, State); 9710 } 9711 9712 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { 9713 assert(!State.Instance && "Int or FP induction being replicated."); 9714 State.ILV->widenIntOrFpInduction(IV, getStartValue()->getLiveInIRValue(), 9715 getTruncInst(), getVPValue(0), 9716 getCastValue(), State); 9717 } 9718 9719 void VPWidenPHIRecipe::execute(VPTransformState &State) { 9720 State.ILV->widenPHIInstruction(cast<PHINode>(getUnderlyingValue()), this, 9721 State); 9722 } 9723 9724 void VPBlendRecipe::execute(VPTransformState &State) { 9725 State.ILV->setDebugLocFromInst(Phi, &State.Builder); 9726 // We know that all PHIs in non-header blocks are converted into 9727 // selects, so we don't have to worry about the insertion order and we 9728 // can just use the builder. 9729 // At this point we generate the predication tree. There may be 9730 // duplications since this is a simple recursive scan, but future 9731 // optimizations will clean it up. 9732 9733 unsigned NumIncoming = getNumIncomingValues(); 9734 9735 // Generate a sequence of selects of the form: 9736 // SELECT(Mask3, In3, 9737 // SELECT(Mask2, In2, 9738 // SELECT(Mask1, In1, 9739 // In0))) 9740 // Note that Mask0 is never used: lanes for which no path reaches this phi and 9741 // are essentially undef are taken from In0. 
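  // As a concrete, purely illustrative instance with three incoming values,
  // the loop below emits:
  //   %p1 = select %Mask1, %In1, %In0
  //   %p2 = select %Mask2, %In2, %p1   ; final blended value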
9742 InnerLoopVectorizer::VectorParts Entry(State.UF); 9743 for (unsigned In = 0; In < NumIncoming; ++In) { 9744 for (unsigned Part = 0; Part < State.UF; ++Part) { 9745 // We might have single edge PHIs (blocks) - use an identity 9746 // 'select' for the first PHI operand. 9747 Value *In0 = State.get(getIncomingValue(In), Part); 9748 if (In == 0) 9749 Entry[Part] = In0; // Initialize with the first incoming value. 9750 else { 9751 // Select between the current value and the previous incoming edge 9752 // based on the incoming mask. 9753 Value *Cond = State.get(getMask(In), Part); 9754 Entry[Part] = 9755 State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi"); 9756 } 9757 } 9758 } 9759 for (unsigned Part = 0; Part < State.UF; ++Part) 9760 State.set(this, Entry[Part], Part); 9761 } 9762 9763 void VPInterleaveRecipe::execute(VPTransformState &State) { 9764 assert(!State.Instance && "Interleave group being replicated."); 9765 State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(), 9766 getStoredValues(), getMask()); 9767 } 9768 9769 void VPReductionRecipe::execute(VPTransformState &State) { 9770 assert(!State.Instance && "Reduction being replicated."); 9771 Value *PrevInChain = State.get(getChainOp(), 0); 9772 RecurKind Kind = RdxDesc->getRecurrenceKind(); 9773 bool IsOrdered = State.ILV->useOrderedReductions(*RdxDesc); 9774 // Propagate the fast-math flags carried by the underlying instruction. 9775 IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder); 9776 State.Builder.setFastMathFlags(RdxDesc->getFastMathFlags()); 9777 for (unsigned Part = 0; Part < State.UF; ++Part) { 9778 Value *NewVecOp = State.get(getVecOp(), Part); 9779 if (VPValue *Cond = getCondOp()) { 9780 Value *NewCond = State.get(Cond, Part); 9781 VectorType *VecTy = cast<VectorType>(NewVecOp->getType()); 9782 Value *Iden = RdxDesc->getRecurrenceIdentity( 9783 Kind, VecTy->getElementType(), RdxDesc->getFastMathFlags()); 9784 Value *IdenVec = 9785 State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden); 9786 Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec); 9787 NewVecOp = Select; 9788 } 9789 Value *NewRed; 9790 Value *NextInChain; 9791 if (IsOrdered) { 9792 if (State.VF.isVector()) 9793 NewRed = createOrderedReduction(State.Builder, *RdxDesc, NewVecOp, 9794 PrevInChain); 9795 else 9796 NewRed = State.Builder.CreateBinOp( 9797 (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), PrevInChain, 9798 NewVecOp); 9799 PrevInChain = NewRed; 9800 } else { 9801 PrevInChain = State.get(getChainOp(), Part); 9802 NewRed = createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp); 9803 } 9804 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9805 NextInChain = 9806 createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(), 9807 NewRed, PrevInChain); 9808 } else if (IsOrdered) 9809 NextInChain = NewRed; 9810 else 9811 NextInChain = State.Builder.CreateBinOp( 9812 (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), NewRed, 9813 PrevInChain); 9814 State.set(this, NextInChain, Part); 9815 } 9816 } 9817 9818 void VPReplicateRecipe::execute(VPTransformState &State) { 9819 if (State.Instance) { // Generate a single instance. 9820 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector"); 9821 State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *this, 9822 *State.Instance, IsPredicated, State); 9823 // Insert scalar instance packing it into a vector. 9824 if (AlsoPack && State.VF.isVector()) { 9825 // If we're constructing lane 0, initialize to start from poison. 
      if (State.Instance->Lane.isFirstLane()) {
        assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
        Value *Poison = PoisonValue::get(
            VectorType::get(getUnderlyingValue()->getType(), State.VF));
        State.set(this, Poison, State.Instance->Part);
      }
      State.ILV->packScalarIntoVectorValue(this, *State.Instance, State);
    }
    return;
  }

  // Generate scalar instances for all VF lanes of all UF parts, unless the
  // instruction is uniform, in which case generate only the first lane for
  // each of the UF parts.
  unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();
  assert((!State.VF.isScalable() || IsUniform) &&
         "Can't scalarize a scalable vector");
  for (unsigned Part = 0; Part < State.UF; ++Part)
    for (unsigned Lane = 0; Lane < EndLane; ++Lane)
      State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *this,
                                      VPIteration(Part, Lane), IsPredicated,
                                      State);
}

void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Branch on Mask works only on single instance.");

  unsigned Part = State.Instance->Part;
  unsigned Lane = State.Instance->Lane.getKnownLane();

  Value *ConditionBit = nullptr;
  VPValue *BlockInMask = getMask();
  if (BlockInMask) {
    ConditionBit = State.get(BlockInMask, Part);
    if (ConditionBit->getType()->isVectorTy())
      ConditionBit = State.Builder.CreateExtractElement(
          ConditionBit, State.Builder.getInt32(Lane));
  } else // Block in mask is all-one.
    ConditionBit = State.Builder.getTrue();

  // Replace the temporary unreachable terminator with a new conditional
  // branch, whose two destinations will be set later when they are created.
  auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
  assert(isa<UnreachableInst>(CurrentTerminator) &&
         "Expected to replace unreachable terminator with conditional branch.");
  auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
  CondBr->setSuccessor(0, nullptr);
  ReplaceInstWithInst(CurrentTerminator, CondBr);
}

void VPPredInstPHIRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Predicated instruction PHI works per instance.");
  Instruction *ScalarPredInst =
      cast<Instruction>(State.get(getOperand(0), *State.Instance));
  BasicBlock *PredicatedBB = ScalarPredInst->getParent();
  BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
  assert(PredicatingBB && "Predicated block has no single predecessor.");
  assert(isa<VPReplicateRecipe>(getOperand(0)) &&
         "operand must be VPReplicateRecipe");

  // By current pack/unpack logic we need to generate only a single phi node:
  // if a vector value for the predicated instruction exists at this point, it
  // means the instruction has vector users only, and a phi for the vector
  // value is needed. In this case the recipe of the predicated instruction is
  // marked to also do that packing, thereby "hoisting" the insert-element
  // sequence. Otherwise, a phi node for the scalar value is needed.
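  // Roughly (value names are illustrative), the vector case below produces:
  //   %vec.phi = phi [ <vector before insert>, %PredicatingBB ],
  //                  [ <vector with inserted lane>, %PredicatedBB ]
  // and the scalar case produces a phi of poison vs. the scalar result.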
9892 unsigned Part = State.Instance->Part; 9893 if (State.hasVectorValue(getOperand(0), Part)) { 9894 Value *VectorValue = State.get(getOperand(0), Part); 9895 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); 9896 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); 9897 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. 9898 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. 9899 if (State.hasVectorValue(this, Part)) 9900 State.reset(this, VPhi, Part); 9901 else 9902 State.set(this, VPhi, Part); 9903 // NOTE: Currently we need to update the value of the operand, so the next 9904 // predicated iteration inserts its generated value in the correct vector. 9905 State.reset(getOperand(0), VPhi, Part); 9906 } else { 9907 Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType(); 9908 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); 9909 Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()), 9910 PredicatingBB); 9911 Phi->addIncoming(ScalarPredInst, PredicatedBB); 9912 if (State.hasScalarValue(this, *State.Instance)) 9913 State.reset(this, Phi, *State.Instance); 9914 else 9915 State.set(this, Phi, *State.Instance); 9916 // NOTE: Currently we need to update the value of the operand, so the next 9917 // predicated iteration inserts its generated value in the correct vector. 9918 State.reset(getOperand(0), Phi, *State.Instance); 9919 } 9920 } 9921 9922 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { 9923 VPValue *StoredValue = isStore() ? getStoredValue() : nullptr; 9924 State.ILV->vectorizeMemoryInstruction( 9925 &Ingredient, State, StoredValue ? nullptr : getVPSingleValue(), getAddr(), 9926 StoredValue, getMask(), Consecutive, Reverse); 9927 } 9928 9929 // Determine how to lower the scalar epilogue, which depends on 1) optimising 9930 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing 9931 // predication, and 4) a TTI hook that analyses whether the loop is suitable 9932 // for predication. 9933 static ScalarEpilogueLowering getScalarEpilogueLowering( 9934 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, 9935 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, 9936 AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, 9937 LoopVectorizationLegality &LVL) { 9938 // 1) OptSize takes precedence over all other options, i.e. if this is set, 9939 // don't look at hints or options, and don't request a scalar epilogue. 9940 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from 9941 // LoopAccessInfo (due to code dependency and not being able to reliably get 9942 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection 9943 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without 9944 // versioning when the vectorization is forced, unlike hasOptSize. So revert 9945 // back to the old way and vectorize with versioning when forced. See D81345.) 
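  // To make the precedence concrete (hypothetical scenario): a function with
  // optsize whose loop also carries a predicate hint returns
  // CM_ScalarEpilogueNotAllowedOptSize from step 1 below; the hint in step 3
  // and the TTI hook in step 4 are never consulted.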
9946 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, 9947 PGSOQueryType::IRPass) && 9948 Hints.getForce() != LoopVectorizeHints::FK_Enabled)) 9949 return CM_ScalarEpilogueNotAllowedOptSize; 9950 9951 // 2) If set, obey the directives 9952 if (PreferPredicateOverEpilogue.getNumOccurrences()) { 9953 switch (PreferPredicateOverEpilogue) { 9954 case PreferPredicateTy::ScalarEpilogue: 9955 return CM_ScalarEpilogueAllowed; 9956 case PreferPredicateTy::PredicateElseScalarEpilogue: 9957 return CM_ScalarEpilogueNotNeededUsePredicate; 9958 case PreferPredicateTy::PredicateOrDontVectorize: 9959 return CM_ScalarEpilogueNotAllowedUsePredicate; 9960 }; 9961 } 9962 9963 // 3) If set, obey the hints 9964 switch (Hints.getPredicate()) { 9965 case LoopVectorizeHints::FK_Enabled: 9966 return CM_ScalarEpilogueNotNeededUsePredicate; 9967 case LoopVectorizeHints::FK_Disabled: 9968 return CM_ScalarEpilogueAllowed; 9969 }; 9970 9971 // 4) if the TTI hook indicates this is profitable, request predication. 9972 if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, 9973 LVL.getLAI())) 9974 return CM_ScalarEpilogueNotNeededUsePredicate; 9975 9976 return CM_ScalarEpilogueAllowed; 9977 } 9978 9979 Value *VPTransformState::get(VPValue *Def, unsigned Part) { 9980 // If Values have been set for this Def return the one relevant for \p Part. 9981 if (hasVectorValue(Def, Part)) 9982 return Data.PerPartOutput[Def][Part]; 9983 9984 if (!hasScalarValue(Def, {Part, 0})) { 9985 Value *IRV = Def->getLiveInIRValue(); 9986 Value *B = ILV->getBroadcastInstrs(IRV); 9987 set(Def, B, Part); 9988 return B; 9989 } 9990 9991 Value *ScalarValue = get(Def, {Part, 0}); 9992 // If we aren't vectorizing, we can just copy the scalar map values over 9993 // to the vector map. 9994 if (VF.isScalar()) { 9995 set(Def, ScalarValue, Part); 9996 return ScalarValue; 9997 } 9998 9999 auto *RepR = dyn_cast<VPReplicateRecipe>(Def); 10000 bool IsUniform = RepR && RepR->isUniform(); 10001 10002 unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1; 10003 // Check if there is a scalar value for the selected lane. 10004 if (!hasScalarValue(Def, {Part, LastLane})) { 10005 // At the moment, VPWidenIntOrFpInductionRecipes can also be uniform. 10006 assert(isa<VPWidenIntOrFpInductionRecipe>(Def->getDef()) && 10007 "unexpected recipe found to be invariant"); 10008 IsUniform = true; 10009 LastLane = 0; 10010 } 10011 10012 auto *LastInst = cast<Instruction>(get(Def, {Part, LastLane})); 10013 // Set the insert point after the last scalarized instruction or after the 10014 // last PHI, if LastInst is a PHI. This ensures the insertelement sequence 10015 // will directly follow the scalar definitions. 10016 auto OldIP = Builder.saveIP(); 10017 auto NewIP = 10018 isa<PHINode>(LastInst) 10019 ? BasicBlock::iterator(LastInst->getParent()->getFirstNonPHI()) 10020 : std::next(BasicBlock::iterator(LastInst)); 10021 Builder.SetInsertPoint(&*NewIP); 10022 10023 // However, if we are vectorizing, we need to construct the vector values. 10024 // If the value is known to be uniform after vectorization, we can just 10025 // broadcast the scalar value corresponding to lane zero for each unroll 10026 // iteration. Otherwise, we construct the vector values using 10027 // insertelement instructions. Since the resulting vectors are stored in 10028 // State, we will only generate the insertelements once. 
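  // For VF = 4 this packing amounts to (illustrative pseudo-IR only):
  //   %v0 = insertelement poison, %s0, i32 0
  //   %v1 = insertelement %v0,    %s1, i32 1
  //   ...
  //   %v3 = insertelement %v2,    %s3, i32 3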
  Value *VectorValue = nullptr;
  if (IsUniform) {
    VectorValue = ILV->getBroadcastInstrs(ScalarValue);
    set(Def, VectorValue, Part);
  } else {
    // Initialize packing with insertelements to start from poison.
    assert(!VF.isScalable() && "VF is assumed to be non scalable.");
    Value *Poison =
        PoisonValue::get(VectorType::get(LastInst->getType(), VF));
    set(Def, Poison, Part);
    for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
      ILV->packScalarIntoVectorValue(Def, {Part, Lane}, *this);
    VectorValue = get(Def, Part);
  }
  Builder.restoreIP(OldIP);
  return VectorValue;
}

// Process the loop in the VPlan-native vectorization path. This path builds
// VPlan upfront in the vectorization pipeline, which allows applying
// VPlan-to-VPlan transformations from the very beginning without modifying the
// input LLVM IR.
static bool processLoopInVPlanNativePath(
    Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
    LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
    TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
    OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
    ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints,
    LoopVectorizationRequirements &Requirements) {

  if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
    LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
    return false;
  }
  assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
  Function *F = L->getHeader()->getParent();
  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());

  ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
      F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);

  LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
                                &Hints, IAI);
  // Use the planner for outer loop vectorization.
  // TODO: CM is not used at this point inside the planner. Turn CM into an
  // optional argument if we don't need it in the future.
  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints,
                               Requirements, ORE);

  // Get user vectorization factor.
  ElementCount UserVF = Hints.getWidth();

  CM.collectElementTypesForWidening();

  // Plan how to best vectorize, return the best VF and its cost.
  const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);

  // If we are stress testing VPlan builds, do not attempt to generate vector
  // code. Masked vector code generation support will follow soon.
  // Also, do not attempt to vectorize if no vector code will be produced.
  if (VPlanBuildStressTest || EnableVPlanPredication ||
      VectorizationFactor::Disabled() == VF)
    return false;

  VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);

  {
    GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
                             F->getParent()->getDataLayout());
    InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
                           &CM, BFI, PSI, Checks);
    LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
                      << L->getHeader()->getParent()->getName() << "\"\n");
    LVP.executePlan(VF.Width, 1, BestPlan, LB, DT);
  }

  // Mark the loop as already vectorized to avoid vectorizing again.
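  // As a rough sketch, this amounts to tagging the loop ID with metadata along
  // the lines of !{!"llvm.loop.isvectorized", i32 1}, which later queries of
  // the hints treat as "already vectorized".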
10105 Hints.setAlreadyVectorized(); 10106 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 10107 return true; 10108 } 10109 10110 // Emit a remark if there are stores to floats that required a floating point 10111 // extension. If the vectorized loop was generated with floating point there 10112 // will be a performance penalty from the conversion overhead and the change in 10113 // the vector width. 10114 static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) { 10115 SmallVector<Instruction *, 4> Worklist; 10116 for (BasicBlock *BB : L->getBlocks()) { 10117 for (Instruction &Inst : *BB) { 10118 if (auto *S = dyn_cast<StoreInst>(&Inst)) { 10119 if (S->getValueOperand()->getType()->isFloatTy()) 10120 Worklist.push_back(S); 10121 } 10122 } 10123 } 10124 10125 // Traverse the floating point stores upwards searching, for floating point 10126 // conversions. 10127 SmallPtrSet<const Instruction *, 4> Visited; 10128 SmallPtrSet<const Instruction *, 4> EmittedRemark; 10129 while (!Worklist.empty()) { 10130 auto *I = Worklist.pop_back_val(); 10131 if (!L->contains(I)) 10132 continue; 10133 if (!Visited.insert(I).second) 10134 continue; 10135 10136 // Emit a remark if the floating point store required a floating 10137 // point conversion. 10138 // TODO: More work could be done to identify the root cause such as a 10139 // constant or a function return type and point the user to it. 10140 if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second) 10141 ORE->emit([&]() { 10142 return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision", 10143 I->getDebugLoc(), L->getHeader()) 10144 << "floating point conversion changes vector width. " 10145 << "Mixed floating point precision requires an up/down " 10146 << "cast that will negatively impact performance."; 10147 }); 10148 10149 for (Use &Op : I->operands()) 10150 if (auto *OpI = dyn_cast<Instruction>(Op)) 10151 Worklist.push_back(OpI); 10152 } 10153 } 10154 10155 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts) 10156 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced || 10157 !EnableLoopInterleaving), 10158 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced || 10159 !EnableLoopVectorization) {} 10160 10161 bool LoopVectorizePass::processLoop(Loop *L) { 10162 assert((EnableVPlanNativePath || L->isInnermost()) && 10163 "VPlan-native path is not enabled. Only process inner loops."); 10164 10165 #ifndef NDEBUG 10166 const std::string DebugLocStr = getDebugLocString(L); 10167 #endif /* NDEBUG */ 10168 10169 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \"" 10170 << L->getHeader()->getParent()->getName() << "\" from " 10171 << DebugLocStr << "\n"); 10172 10173 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE); 10174 10175 LLVM_DEBUG( 10176 dbgs() << "LV: Loop hints:" 10177 << " force=" 10178 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled 10179 ? "disabled" 10180 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled 10181 ? "enabled" 10182 : "?")) 10183 << " width=" << Hints.getWidth() 10184 << " interleave=" << Hints.getInterleave() << "\n"); 10185 10186 // Function containing loop 10187 Function *F = L->getHeader()->getParent(); 10188 10189 // Looking at the diagnostic output is the only way to determine if a loop 10190 // was vectorized (other than looking at the IR or machine code), so it 10191 // is important to generate an optimization remark for each loop. Most of 10192 // these messages are generated as OptimizationRemarkAnalysis. 
Remarks 10193 // generated as OptimizationRemark and OptimizationRemarkMissed are 10194 // less verbose reporting vectorized loops and unvectorized loops that may 10195 // benefit from vectorization, respectively. 10196 10197 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) { 10198 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); 10199 return false; 10200 } 10201 10202 PredicatedScalarEvolution PSE(*SE, *L); 10203 10204 // Check if it is legal to vectorize the loop. 10205 LoopVectorizationRequirements Requirements; 10206 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE, 10207 &Requirements, &Hints, DB, AC, BFI, PSI); 10208 if (!LVL.canVectorize(EnableVPlanNativePath)) { 10209 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); 10210 Hints.emitRemarkWithHints(); 10211 return false; 10212 } 10213 10214 // Check the function attributes and profiles to find out if this function 10215 // should be optimized for size. 10216 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 10217 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL); 10218 10219 // Entrance to the VPlan-native vectorization path. Outer loops are processed 10220 // here. They may require CFG and instruction level transformations before 10221 // even evaluating whether vectorization is profitable. Since we cannot modify 10222 // the incoming IR, we need to build VPlan upfront in the vectorization 10223 // pipeline. 10224 if (!L->isInnermost()) 10225 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, 10226 ORE, BFI, PSI, Hints, Requirements); 10227 10228 assert(L->isInnermost() && "Inner loop expected."); 10229 10230 // Check the loop for a trip count threshold: vectorize loops with a tiny trip 10231 // count by optimizing for size, to minimize overheads. 10232 auto ExpectedTC = getSmallBestKnownTC(*SE, L); 10233 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) { 10234 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " 10235 << "This loop is worth vectorizing only if no scalar " 10236 << "iteration overheads are incurred."); 10237 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) 10238 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); 10239 else { 10240 LLVM_DEBUG(dbgs() << "\n"); 10241 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; 10242 } 10243 } 10244 10245 // Check the function attributes to see if implicit floats are allowed. 10246 // FIXME: This check doesn't seem possibly correct -- what if the loop is 10247 // an integer loop and the vector instructions selected are purely integer 10248 // vector instructions? 10249 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { 10250 reportVectorizationFailure( 10251 "Can't vectorize when the NoImplicitFloat attribute is used", 10252 "loop not vectorized due to NoImplicitFloat attribute", 10253 "NoImplicitFloat", ORE, L); 10254 Hints.emitRemarkWithHints(); 10255 return false; 10256 } 10257 10258 // Check if the target supports potentially unsafe FP vectorization. 10259 // FIXME: Add a check for the type of safety issue (denormal, signaling) 10260 // for the target we're vectorizing for, to make sure none of the 10261 // additional fp-math flags can help. 
10262 if (Hints.isPotentiallyUnsafe() && 10263 TTI->isFPVectorizationPotentiallyUnsafe()) { 10264 reportVectorizationFailure( 10265 "Potentially unsafe FP op prevents vectorization", 10266 "loop not vectorized due to unsafe FP support.", 10267 "UnsafeFP", ORE, L); 10268 Hints.emitRemarkWithHints(); 10269 return false; 10270 } 10271 10272 bool AllowOrderedReductions; 10273 // If the flag is set, use that instead and override the TTI behaviour. 10274 if (ForceOrderedReductions.getNumOccurrences() > 0) 10275 AllowOrderedReductions = ForceOrderedReductions; 10276 else 10277 AllowOrderedReductions = TTI->enableOrderedReductions(); 10278 if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) { 10279 ORE->emit([&]() { 10280 auto *ExactFPMathInst = Requirements.getExactFPInst(); 10281 return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps", 10282 ExactFPMathInst->getDebugLoc(), 10283 ExactFPMathInst->getParent()) 10284 << "loop not vectorized: cannot prove it is safe to reorder " 10285 "floating-point operations"; 10286 }); 10287 LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to " 10288 "reorder floating-point operations\n"); 10289 Hints.emitRemarkWithHints(); 10290 return false; 10291 } 10292 10293 bool UseInterleaved = TTI->enableInterleavedAccessVectorization(); 10294 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI()); 10295 10296 // If an override option has been passed in for interleaved accesses, use it. 10297 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0) 10298 UseInterleaved = EnableInterleavedMemAccesses; 10299 10300 // Analyze interleaved memory accesses. 10301 if (UseInterleaved) { 10302 IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI)); 10303 } 10304 10305 // Use the cost model. 10306 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, 10307 F, &Hints, IAI); 10308 CM.collectValuesToIgnore(); 10309 CM.collectElementTypesForWidening(); 10310 10311 // Use the planner for vectorization. 10312 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints, 10313 Requirements, ORE); 10314 10315 // Get user vectorization factor and interleave count. 10316 ElementCount UserVF = Hints.getWidth(); 10317 unsigned UserIC = Hints.getInterleave(); 10318 10319 // Plan how to best vectorize, return the best VF and its cost. 10320 Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC); 10321 10322 VectorizationFactor VF = VectorizationFactor::Disabled(); 10323 unsigned IC = 1; 10324 10325 if (MaybeVF) { 10326 VF = *MaybeVF; 10327 // Select the interleave count. 10328 IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue()); 10329 } 10330 10331 // Identify the diagnostic messages that should be produced. 10332 std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg; 10333 bool VectorizeLoop = true, InterleaveLoop = true; 10334 if (VF.Width.isScalar()) { 10335 LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n"); 10336 VecDiagMsg = std::make_pair( 10337 "VectorizationNotBeneficial", 10338 "the cost-model indicates that vectorization is not beneficial"); 10339 VectorizeLoop = false; 10340 } 10341 10342 if (!MaybeVF && UserIC > 1) { 10343 // Tell the user interleaving was avoided up-front, despite being explicitly 10344 // requested. 
    LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
                         "interleaving should be avoided up front\n");
    IntDiagMsg = std::make_pair(
        "InterleavingAvoided",
        "Ignoring UserIC, because interleaving was avoided up front");
    InterleaveLoop = false;
  } else if (IC == 1 && UserIC <= 1) {
    // Tell the user interleaving is not beneficial.
    LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
    IntDiagMsg = std::make_pair(
        "InterleavingNotBeneficial",
        "the cost-model indicates that interleaving is not beneficial");
    InterleaveLoop = false;
    if (UserIC == 1) {
      IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
      IntDiagMsg.second +=
          " and is explicitly disabled or interleave count is set to 1";
    }
  } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it is explicitly disabled.
    LLVM_DEBUG(
        dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
    IntDiagMsg = std::make_pair(
        "InterleavingBeneficialButDisabled",
        "the cost-model indicates that interleaving is beneficial "
        "but is explicitly disabled or interleave count is set to 1");
    InterleaveLoop = false;
  }

  // Override IC if user provided an interleave count.
  IC = UserIC > 0 ? UserIC : IC;

  // Emit diagnostic messages, if any.
  const char *VAPassName = Hints.vectorizeAnalysisPassName();
  if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
    ORE->emit([&]() {
      return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
    ORE->emit([&]() {
      return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
    return false;
  } else if (!VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
  } else if (VectorizeLoop && !InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
  } else if (VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
  }

  bool DisableRuntimeUnroll = false;
  MDNode *OrigLoopID = L->getLoopID();
  {
    // Optimistically generate runtime checks. Drop them if they turn out not
    // to be profitable. Limit the scope of Checks, so the cleanup happens
    // immediately after vector code generation is done.
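    // Roughly speaking (illustrative pseudo-IR only), the memory checks test
    // that the accessed ranges do not overlap, e.g.
    //   if ((A.end <= B.start) || (B.end <= A.start)) --> run the vector loop
    // while the SCEV checks validate predicates assumed during analysis.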
10419 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, 10420 F->getParent()->getDataLayout()); 10421 if (!VF.Width.isScalar() || IC > 1) 10422 Checks.Create(L, *LVL.getLAI(), PSE.getUnionPredicate()); 10423 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); 10424 10425 using namespace ore; 10426 if (!VectorizeLoop) { 10427 assert(IC > 1 && "interleave count should not be 1 or 0"); 10428 // If we decided that it is not legal to vectorize the loop, then 10429 // interleave it. 10430 InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, 10431 &CM, BFI, PSI, Checks); 10432 LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT); 10433 10434 ORE->emit([&]() { 10435 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(), 10436 L->getHeader()) 10437 << "interleaved loop (interleaved count: " 10438 << NV("InterleaveCount", IC) << ")"; 10439 }); 10440 } else { 10441 // If we decided that it is *legal* to vectorize the loop, then do it. 10442 10443 // Consider vectorizing the epilogue too if it's profitable. 10444 VectorizationFactor EpilogueVF = 10445 CM.selectEpilogueVectorizationFactor(VF.Width, LVP); 10446 if (EpilogueVF.Width.isVector()) { 10447 10448 // The first pass vectorizes the main loop and creates a scalar epilogue 10449 // to be vectorized by executing the plan (potentially with a different 10450 // factor) again shortly afterwards. 10451 EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1); 10452 EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, 10453 EPI, &LVL, &CM, BFI, PSI, Checks); 10454 10455 LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, BestPlan, MainILV, DT); 10456 ++LoopsVectorized; 10457 10458 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */); 10459 formLCSSARecursively(*L, *DT, LI, SE); 10460 10461 // Second pass vectorizes the epilogue and adjusts the control flow 10462 // edges from the first pass. 10463 EPI.MainLoopVF = EPI.EpilogueVF; 10464 EPI.MainLoopUF = EPI.EpilogueUF; 10465 EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC, 10466 ORE, EPI, &LVL, &CM, BFI, PSI, 10467 Checks); 10468 LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestPlan, EpilogILV, 10469 DT); 10470 ++LoopsEpilogueVectorized; 10471 10472 if (!MainILV.areSafetyChecksAdded()) 10473 DisableRuntimeUnroll = true; 10474 } else { 10475 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC, 10476 &LVL, &CM, BFI, PSI, Checks); 10477 LVP.executePlan(VF.Width, IC, BestPlan, LB, DT); 10478 ++LoopsVectorized; 10479 10480 // Add metadata to disable runtime unrolling a scalar loop when there 10481 // are no runtime checks about strides and memory. A scalar loop that is 10482 // rarely used is not worth unrolling. 10483 if (!LB.areSafetyChecksAdded()) 10484 DisableRuntimeUnroll = true; 10485 } 10486 // Report the vectorization decision. 
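      // For example, the remark emitted below reads something like
      //   "vectorized loop (vectorization width: 4, interleaved count: 2)"
      // for VF.Width == 4 and IC == 2.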
10487 ORE->emit([&]() { 10488 return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(), 10489 L->getHeader()) 10490 << "vectorized loop (vectorization width: " 10491 << NV("VectorizationFactor", VF.Width) 10492 << ", interleaved count: " << NV("InterleaveCount", IC) << ")"; 10493 }); 10494 } 10495 10496 if (ORE->allowExtraAnalysis(LV_NAME)) 10497 checkMixedPrecision(L, ORE); 10498 } 10499 10500 Optional<MDNode *> RemainderLoopID = 10501 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 10502 LLVMLoopVectorizeFollowupEpilogue}); 10503 if (RemainderLoopID.hasValue()) { 10504 L->setLoopID(RemainderLoopID.getValue()); 10505 } else { 10506 if (DisableRuntimeUnroll) 10507 AddRuntimeUnrollDisableMetaData(L); 10508 10509 // Mark the loop as already vectorized to avoid vectorizing again. 10510 Hints.setAlreadyVectorized(); 10511 } 10512 10513 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 10514 return true; 10515 } 10516 10517 LoopVectorizeResult LoopVectorizePass::runImpl( 10518 Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_, 10519 DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_, 10520 DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_, 10521 std::function<const LoopAccessInfo &(Loop &)> &GetLAA_, 10522 OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) { 10523 SE = &SE_; 10524 LI = &LI_; 10525 TTI = &TTI_; 10526 DT = &DT_; 10527 BFI = &BFI_; 10528 TLI = TLI_; 10529 AA = &AA_; 10530 AC = &AC_; 10531 GetLAA = &GetLAA_; 10532 DB = &DB_; 10533 ORE = &ORE_; 10534 PSI = PSI_; 10535 10536 // Don't attempt if 10537 // 1. the target claims to have no vector registers, and 10538 // 2. interleaving won't help ILP. 10539 // 10540 // The second condition is necessary because, even if the target has no 10541 // vector registers, loop vectorization may still enable scalar 10542 // interleaving. 10543 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) && 10544 TTI->getMaxInterleaveFactor(1) < 2) 10545 return LoopVectorizeResult(false, false); 10546 10547 bool Changed = false, CFGChanged = false; 10548 10549 // The vectorizer requires loops to be in simplified form. 10550 // Since simplification may add new inner loops, it has to run before the 10551 // legality and profitability checks. This means running the loop vectorizer 10552 // will simplify all loops, regardless of whether anything end up being 10553 // vectorized. 10554 for (auto &L : *LI) 10555 Changed |= CFGChanged |= 10556 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */); 10557 10558 // Build up a worklist of inner-loops to vectorize. This is necessary as 10559 // the act of vectorizing or partially unrolling a loop creates new loops 10560 // and can invalidate iterators across the loops. 10561 SmallVector<Loop *, 8> Worklist; 10562 10563 for (Loop *L : *LI) 10564 collectSupportedLoops(*L, LI, ORE, Worklist); 10565 10566 LoopsAnalyzed += Worklist.size(); 10567 10568 // Now walk the identified inner loops. 10569 while (!Worklist.empty()) { 10570 Loop *L = Worklist.pop_back_val(); 10571 10572 // For the inner loops we actually process, form LCSSA to simplify the 10573 // transform. 10574 Changed |= formLCSSARecursively(*L, *DT, LI, SE); 10575 10576 Changed |= CFGChanged |= processLoop(L); 10577 } 10578 10579 // Process each loop nest in the function. 
10580 return LoopVectorizeResult(Changed, CFGChanged); 10581 } 10582 10583 PreservedAnalyses LoopVectorizePass::run(Function &F, 10584 FunctionAnalysisManager &AM) { 10585 auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F); 10586 auto &LI = AM.getResult<LoopAnalysis>(F); 10587 auto &TTI = AM.getResult<TargetIRAnalysis>(F); 10588 auto &DT = AM.getResult<DominatorTreeAnalysis>(F); 10589 auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F); 10590 auto &TLI = AM.getResult<TargetLibraryAnalysis>(F); 10591 auto &AA = AM.getResult<AAManager>(F); 10592 auto &AC = AM.getResult<AssumptionAnalysis>(F); 10593 auto &DB = AM.getResult<DemandedBitsAnalysis>(F); 10594 auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F); 10595 10596 auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager(); 10597 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 10598 [&](Loop &L) -> const LoopAccessInfo & { 10599 LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, 10600 TLI, TTI, nullptr, nullptr, nullptr}; 10601 return LAM.getResult<LoopAccessAnalysis>(L, AR); 10602 }; 10603 auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F); 10604 ProfileSummaryInfo *PSI = 10605 MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent()); 10606 LoopVectorizeResult Result = 10607 runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI); 10608 if (!Result.MadeAnyChange) 10609 return PreservedAnalyses::all(); 10610 PreservedAnalyses PA; 10611 10612 // We currently do not preserve loopinfo/dominator analyses with outer loop 10613 // vectorization. Until this is addressed, mark these analyses as preserved 10614 // only for non-VPlan-native path. 10615 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 10616 if (!EnableVPlanNativePath) { 10617 PA.preserve<LoopAnalysis>(); 10618 PA.preserve<DominatorTreeAnalysis>(); 10619 } 10620 if (!Result.MadeCFGChange) 10621 PA.preserveSet<CFGAnalyses>(); 10622 return PA; 10623 } 10624 10625 void LoopVectorizePass::printPipeline( 10626 raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) { 10627 static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline( 10628 OS, MapClassName2PassName); 10629 10630 OS << "<"; 10631 OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;"; 10632 OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;"; 10633 OS << ">"; 10634 } 10635
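// For reference (derived from printPipeline above), the textual pipeline form
// of this pass looks like
//   loop-vectorize<no-interleave-forced-only;no-vectorize-forced-only;>
// with the "no-" prefix dropped for each option that is enabled.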