//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
//  Data for SIMD
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "LoopVectorizationPlanner.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanHCFGBuilder.h"
#include "VPlanPredicator.h"
#include "VPlanTransforms.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/InstructionCost.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <functional>
#include <iterator>
#include <limits>
#include <memory>
#include <string>
#include <tuple>
#include <utility>

using namespace llvm;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

#ifndef NDEBUG
const char VerboseDebug[] = DEBUG_TYPE "-verbose";
#endif

/// @{
/// Metadata attribute names
const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
const char LLVMLoopVectorizeFollowupVectorized[] =
    "llvm.loop.vectorize.followup_vectorized";
const char LLVMLoopVectorizeFollowupEpilogue[] =
    "llvm.loop.vectorize.followup_epilogue";
/// @}

STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");

static cl::opt<bool> EnableEpilogueVectorization(
    "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
    cl::desc("Enable vectorization of epilogue loops."));

static cl::opt<unsigned> EpilogueVectorizationForceVF(
    "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
    cl::desc("When epilogue vectorization is enabled, and a value greater than "
             "1 is specified, forces the given VF for all applicable epilogue "
             "loops."));

static cl::opt<unsigned> EpilogueVectorizationMinVF(
    "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
    cl::desc("Only loops with vectorization factor equal to or larger than "
             "the specified value are considered for epilogue vectorization."));

/// Loops with a known constant trip count below this number are vectorized
/// only if no scalar iteration overheads are incurred.
static cl::opt<unsigned> TinyTripCountVectorThreshold(
    "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
    cl::desc("Loops with a constant trip count that is smaller than this "
             "value are vectorized only if no scalar iteration overheads "
             "are incurred."));

static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold(
    "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
    cl::desc("The maximum allowed number of runtime memory checks with a "
             "vectorize(enable) pragma."));

// Option prefer-predicate-over-epilogue indicates that an epilogue is
// undesired, that predication is preferred, and this lists all options. I.e.,
// the vectorizer will try to fold the tail-loop (epilogue) into the vector
// body and predicate the instructions accordingly. If tail-folding fails,
// there are different fallback strategies depending on these values:
namespace PreferPredicateTy {
enum Option {
  ScalarEpilogue = 0,
  PredicateElseScalarEpilogue,
  PredicateOrDontVectorize
};
} // namespace PreferPredicateTy

static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
    "prefer-predicate-over-epilogue",
    cl::init(PreferPredicateTy::ScalarEpilogue),
    cl::Hidden,
    cl::desc("Tail-folding and predication preferences over creating a scalar "
             "epilogue loop."),
    cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
                          "scalar-epilogue",
                          "Don't tail-predicate loops, create scalar epilogue"),
               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
                          "predicate-else-scalar-epilogue",
                          "prefer tail-folding, create scalar epilogue if tail "
                          "folding fails."),
               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
                          "predicate-dont-vectorize",
                          "prefer tail-folding, don't attempt vectorization if "
                          "tail-folding fails.")));

static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a "
             "loop"));

static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
             "below this number"));

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

static cl::opt<bool> ForceTargetSupportsScalableVectors(
    "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
    cl::desc(
        "Pretend that scalable vectors are supported, even if the target does "
        "not support them. This flag should only be used for testing."));

static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// Interleave small loops with scalar reductions.
static cl::opt<bool> InterleaveSmallLoopScalarReduction(
    "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
    cl::desc("Enable interleaving for loops with small iteration counts that "
             "contain scalar reductions to expose ILP."));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if-predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

static cl::opt<bool>
    PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
                           cl::Hidden,
                           cl::desc("Prefer in-loop vector reductions, "
                                    "overriding the target's preference."));

cl::opt<bool> ForceOrderedReductions(
    "force-ordered-reductions", cl::init(false), cl::Hidden,
    cl::desc("Enable the vectorisation of loops with in-order (strict) "
             "FP reductions"));

static cl::opt<bool> PreferPredicatedReductionSelect(
    "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
    cl::desc(
        "Prefer predicating a reduction operation over an after loop select."));

cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));

// FIXME: Remove this switch once we have divergence analysis. Currently we
// assume divergent non-backedge branches when this switch is true.
cl::opt<bool> EnableVPlanPredication(
    "enable-vplan-predication", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path predicator with "
             "support for outer loop vectorization."));

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

cl::opt<bool> PrintVPlansInDotFormat(
    "vplan-print-in-dot-format", cl::init(false), cl::Hidden,
    cl::desc("Use dot format instead of plain text when dumping VPlans"));

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type.
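/// For example, i1 is irregular: its bit size is 1 but its allocated size is
/// 8 bits, so an array of i1 elements is not bitcast-compatible with an
/// <N x i1> vector.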
static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
  // Determine if an array of N elements of type Ty is "bitcast compatible"
  // with a <N x Ty> vector.
  // This is only true if there is no padding between the array elements.
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}

/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

/// Returns "best known" trip count for the specified loop \p L as defined by
/// the following procedure:
///   1) Returns exact trip count if it is known.
///   2) Returns expected trip count according to profile data if any.
///   3) Returns upper bound estimate if it is known.
///   4) Returns None if all of the above failed.
static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
  // Check if exact trip count is known.
  if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
    return ExpectedTC;

  // Check if there is an expected trip count available from profile data.
  if (LoopVectorizeWithBlockFrequency)
    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
      return EstimatedTC;

  // Check if upper bound estimate is known.
  if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
    return ExpectedTC;

  return None;
}

// Forward declare GeneratedRTChecks.
class GeneratedRTChecks;

namespace llvm {

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found to a given vectorization factor.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                      ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
        PSI(PSI), RTChecks(RTChecks) {
    // Query this against the original loop and save it here because the
    // profile of the original loop header may change as the transformation
    // happens.
    OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
        OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
  }

  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop that will contain vectorized instructions later
  /// on, while the old loop will be used as the scalar remainder. Control flow
  /// is generated around the vectorized (and scalar epilogue) loops consisting
  /// of various checks and bypasses. Return the pre-header block of the new
  /// loop.
  /// In the case of epilogue vectorization, this function is overridden to
  /// handle the more complex control flow around the loops.
  virtual BasicBlock *createVectorizedLoopSkeleton();

  /// Widen a single instruction within the innermost loop.
  void widenInstruction(Instruction &I, VPValue *Def, VPUser &Operands,
                        VPTransformState &State);

  /// Widen a single call instruction within the innermost loop.
  void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
                            VPTransformState &State);

  /// Widen a single select instruction within the innermost loop.
  void widenSelectInstruction(SelectInst &I, VPValue *VPDef, VPUser &Operands,
                              bool InvariantCond, VPTransformState &State);

  /// Fix the vectorized code, taking care of header phis, live-outs, and more.
  void fixVectorizedLoop(VPTransformState &State);

  // Return true if any runtime check is added.
  bool areSafetyChecksAdded() { return AddedSafetyChecks; }

  /// A type for vectorized values in the new loop. Each value from the
  /// original loop, when vectorized, is represented by UF vector values in the
  /// new unrolled loop, where UF is the unroll factor.
  using VectorParts = SmallVector<Value *, 2>;

  /// Vectorize a single GetElementPtrInst based on information gathered and
  /// decisions taken during planning.
  void widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, VPUser &Indices,
                unsigned UF, ElementCount VF, bool IsPtrLoopInvariant,
                SmallBitVector &IsIndexLoopInvariant, VPTransformState &State);

  /// Vectorize a single first-order recurrence or pointer induction PHINode in
  /// a block. This method handles the induction variable canonicalization. It
  /// supports both VF = 1 for unrolled loops and arbitrary length vectors.
  void widenPHIInstruction(Instruction *PN, VPWidenPHIRecipe *PhiR,
                           VPTransformState &State);

  /// A helper function to scalarize a single Instruction in the innermost
  /// loop. Generates a sequence of scalar instances for each lane between \p
  /// MinLane and \p MaxLane, times each part between \p MinPart and \p
  /// MaxPart, inclusive. Uses the VPValue operands from \p Operands instead of
  /// \p Instr's operands.
  void scalarizeInstruction(Instruction *Instr, VPValue *Def, VPUser &Operands,
                            const VPIteration &Instance, bool IfPredicateInstr,
                            VPTransformState &State);

  /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
  /// is provided, the integer induction variable will first be truncated to
  /// the corresponding type.
  void widenIntOrFpInduction(PHINode *IV, Value *Start, TruncInst *Trunc,
                             VPValue *Def, VPValue *CastDef,
                             VPTransformState &State);

  /// Construct the vector value of a scalarized value \p V one lane at a time.
  void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance,
                                 VPTransformState &State);

  /// Try to vectorize interleaved access group \p Group with the base address
  /// given in \p Addr, optionally masking the vector operations if \p
  /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
  /// values in the vectorized loop.
  void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
                                ArrayRef<VPValue *> VPDefs,
                                VPTransformState &State, VPValue *Addr,
                                ArrayRef<VPValue *> StoredValues,
                                VPValue *BlockInMask = nullptr);

  /// Vectorize Load and Store instructions with the base address given in \p
  /// Addr, optionally masking the vector operations if \p BlockInMask is
  /// non-null. Use \p State to translate given VPValues to IR values in the
  /// vectorized loop.
  void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
                                  VPValue *Def, VPValue *Addr,
                                  VPValue *StoredValue, VPValue *BlockInMask);

  /// Set the debug location in the builder \p CustomBuilder using the debug
  /// location in \p V. If \p CustomBuilder is None then it uses the class
  /// member's Builder.
  void setDebugLocFromInst(const Value *V,
                           Optional<IRBuilder<> *> CustomBuilder = None);

  /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
  void fixNonInductionPHIs(VPTransformState &State);

  /// Returns true if the reordering of FP operations is not allowed, but we
  /// are able to vectorize with strict in-order reductions for the given
  /// RdxDesc.
  bool useOrderedReductions(RecurrenceDescriptor &RdxDesc);

  /// Create a broadcast instruction. This method generates a broadcast
  /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, ...
  /// this is needed because each iteration in the loop corresponds to a SIMD
  /// element.
  virtual Value *getBroadcastInstrs(Value *V);

protected:
  friend class LoopVectorizationPlanner;

  /// A small list of PHINodes.
  using PhiVector = SmallVector<PHINode *, 4>;

  /// A type for scalarized values in the new loop. Each value from the
  /// original loop, when scalarized, is represented by UF x VF scalar values
  /// in the new unrolled loop, where UF is the unroll factor and VF is the
  /// vectorization factor.
  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;

  /// Set up the values of the IVs correctly when exiting the vector loop.
  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                    Value *CountRoundDown, Value *EndValue,
                    BasicBlock *MiddleBlock);

  /// Create a new induction variable inside L.
  PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
                                   Value *Step, Instruction *DL);

  /// Handle all cross-iteration phis in the header.
  void fixCrossIterationPHIs(VPTransformState &State);

  /// Create the exit value of first order recurrences in the middle block and
  /// update their users.
  void fixFirstOrderRecurrence(VPWidenPHIRecipe *PhiR, VPTransformState &State);

  /// Create code for the loop exit value of the reduction.
  void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State);

  /// Clear NSW/NUW flags from reduction instructions if necessary.
  void clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc,
                               VPTransformState &State);

  /// Fixup the LCSSA phi nodes in the unique exit block. This simply
  /// means we need to add the appropriate incoming value from the middle
  /// block as exiting edges from the scalar epilogue loop (if present) are
  /// already in place, and we exit the vector loop exclusively to the middle
  /// block.
  void fixLCSSAPHIs(VPTransformState &State);

  /// Iteratively sink the scalarized operands of a predicated instruction into
  /// the block that was created for it.
  void sinkScalarOperands(Instruction *PredInst);

  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
  /// represented as.
  void truncateToMinimalBitwidths(VPTransformState &State);

  /// This function adds
  /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
  /// to each vector element of Val. The sequence starts at StartIdx.
  /// \p Opcode is relevant for FP induction variable.
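  /// For example, with StartIdx = 0 and Step = 2, a four-element Val has
  /// <0, 2, 4, 6> added to it.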
  virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                               Instruction::BinaryOps Opcode =
                                   Instruction::BinaryOpsEnd);

  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
  /// variable on which to base the steps, \p Step is the size of the step, and
  /// \p EntryVal is the value from the original loop that maps to the steps.
  /// Note that \p EntryVal doesn't have to be an induction variable - it
  /// can also be a truncate instruction.
  void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
                        const InductionDescriptor &ID, VPValue *Def,
                        VPValue *CastDef, VPTransformState &State);

  /// Create a vector induction phi node based on an existing scalar one. \p
  /// EntryVal is the value from the original loop that maps to the vector phi
  /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
  /// truncate instruction, instead of widening the original IV, we widen a
  /// version of the IV truncated to \p EntryVal's type.
  void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
                                       Value *Step, Value *Start,
                                       Instruction *EntryVal, VPValue *Def,
                                       VPValue *CastDef,
                                       VPTransformState &State);

  /// Returns true if an instruction \p I should be scalarized instead of
  /// vectorized for the chosen vectorization factor.
  bool shouldScalarizeInstruction(Instruction *I) const;

  /// Returns true if we should generate a scalar version of \p IV.
  bool needsScalarInduction(Instruction *IV) const;

  /// If there is a cast involved in the induction variable \p ID, which should
  /// be ignored in the vectorized loop body, this function records the
  /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
  /// cast. We had already proved that the casted Phi is equal to the uncasted
  /// Phi in the vectorized loop (under a runtime guard), and therefore
  /// there is no need to vectorize the cast - the same value can be used in
  /// the vector loop for both the Phi and the cast.
  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified;
  /// otherwise, \p VectorLoopValue is a widened/vectorized value.
  ///
  /// \p EntryVal is the value from the original loop that maps to the vector
  /// phi node and is used to distinguish what is the IV currently being
  /// processed - original one (if \p EntryVal is a phi corresponding to the
  /// original IV) or the "newly-created" one based on the proof mentioned
  /// above (see also buildScalarSteps() and createVectorIntOrFpInductionPHI()).
  /// In the latter case \p EntryVal is a TruncInst and we must not record
  /// anything for that IV, but it's error-prone to expect callers of this
  /// routine to care about that, hence this explicit parameter.
  void recordVectorLoopValueForInductionCast(
      const InductionDescriptor &ID, const Instruction *EntryVal,
      Value *VectorLoopValue, VPValue *CastDef, VPTransformState &State,
      unsigned Part, unsigned Lane = UINT_MAX);

  /// Generate a shuffle sequence that will reverse the vector Vec.
  virtual Value *reverseVector(Value *Vec);

  /// Returns (and creates if needed) the original loop trip count.
  Value *getOrCreateTripCount(Loop *NewLoop);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(Loop *NewLoop);

  /// Returns a bitcasted value to the requested vector type.
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                const DataLayout &DL);

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct. Returns the block containing the checks or
  /// nullptr if no checks have been added.
  BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  /// Returns the block containing the checks or nullptr if no checks have been
  /// added.
  BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);

  /// Compute the transformed value of Index at offset StartValue using step
  /// StepValue.
  /// For integer induction, returns StartValue + Index * StepValue.
  /// For pointer induction, returns StartValue[Index * StepValue].
  /// FIXME: The newly created binary instructions should contain nsw/nuw
  /// flags, which can be found from the original scalar operations.
  Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
                              const DataLayout &DL,
                              const InductionDescriptor &ID) const;

  /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
  /// vector loop preheader, middle block and scalar preheader. Also
  /// allocate a loop object for the new vector loop and return it.
  Loop *createVectorLoopSkeleton(StringRef Prefix);

  /// Create new phi nodes for the induction variables to resume iteration
  /// count in the scalar epilogue, from where the vectorized loop left off
  /// (given by \p VectorTripCount).
  /// In cases where the loop skeleton is more complicated (e.g. epilogue
  /// vectorization) and the resume values can come from an additional bypass
  /// block, the \p AdditionalBypass pair provides information about the bypass
  /// block and the end value on the edge from bypass to this loop.
  void createInductionResumeValues(
      Loop *L, Value *VectorTripCount,
      std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});

  /// Complete the loop skeleton by adding debug MDs, creating appropriate
  /// conditional branches in the middle block, preparing the builder and
  /// running the verifier. Take in the vector loop \p L as argument, and
  /// return the preheader of the completed vector loop.
  BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);

  /// Add additional metadata to \p To that was not present on \p Orig.
  ///
  /// Currently this is used to add the noalias annotations based on the
  /// inserted memchecks. Use this for instructions that are *cloned* into the
  /// vector loop.
  void addNewMetadata(Instruction *To, const Instruction *Orig);

  /// Add metadata from one instruction to another.
  ///
  /// This includes both the original MDs from \p From and additional ones
  /// (\see addNewMetadata). Use this for *newly created* instructions in the
  /// vector loop.
  void addMetadata(Instruction *To, Instruction *From);

  /// Similar to the previous function but it adds the metadata to a
  /// vector of instructions.
  void addMetadata(ArrayRef<Value *> To, Instruction *From);

  /// Allow subclasses to override and print debug traces before/after vplan
  /// execution, when trace information is requested.
  virtual void printDebugTracesAtStart(){};
  virtual void printDebugTracesAtEnd(){};

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Alias Analysis.
  AAResults *AA;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// LoopVersioning. It's only set up (non-null) if memchecks were
  /// used.
  ///
  /// This is currently only used to add no-alias metadata based on the
  /// memchecks. The actual versioning is performed manually.
  std::unique_ptr<LoopVersioning> LVer;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  ElementCount VF;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use.
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;

  /// The unique ExitBlock of the scalar loop if one exists. Note that
  /// there can be multiple exiting edges reaching this block.
  BasicBlock *LoopExitBlock;

  /// The vector loop body.
  BasicBlock *LoopVectorBody;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// The new Induction variable which was added to the new block.
  PHINode *Induction = nullptr;

  /// The induction variable of the old basic block.
  PHINode *OldInduction = nullptr;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF)).
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
  // fixed up at the end of vector code generation.
  SmallVector<PHINode *, 8> OrigPHIsToFix;

  /// BFI and PSI are used to check for profile guided size optimizations.
  BlockFrequencyInfo *BFI;
  ProfileSummaryInfo *PSI;

  // Whether this loop should be optimized for size based on profile guided
  // size optimizations.
  bool OptForSizeBasedOnProfile;

  /// Structure to hold information about generated runtime checks, responsible
  /// for cleaning the checks, if vectorization turns out unprofitable.
  GeneratedRTChecks &RTChecks;
};

class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                    ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            ElementCount::getFixed(1), UnrollFactor, LVL, CM,
                            BFI, PSI, Check) {}

private:
  Value *getBroadcastInstrs(Value *V) override;
  Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                       Instruction::BinaryOps Opcode =
                           Instruction::BinaryOpsEnd) override;
  Value *reverseVector(Value *Vec) override;
};

/// Encapsulate information regarding vectorization of a loop and its epilogue.
/// This information is meant to be updated and used across two stages of
/// epilogue vectorization.
struct EpilogueLoopVectorizationInfo {
  ElementCount MainLoopVF = ElementCount::getFixed(0);
  unsigned MainLoopUF = 0;
  ElementCount EpilogueVF = ElementCount::getFixed(0);
  unsigned EpilogueUF = 0;
  BasicBlock *MainLoopIterationCountCheck = nullptr;
  BasicBlock *EpilogueIterationCountCheck = nullptr;
  BasicBlock *SCEVSafetyCheck = nullptr;
  BasicBlock *MemSafetyCheck = nullptr;
  Value *TripCount = nullptr;
  Value *VectorTripCount = nullptr;

  EpilogueLoopVectorizationInfo(unsigned MVF, unsigned MUF, unsigned EVF,
                                unsigned EUF)
      : MainLoopVF(ElementCount::getFixed(MVF)), MainLoopUF(MUF),
        EpilogueVF(ElementCount::getFixed(EVF)), EpilogueUF(EUF) {
    assert(EUF == 1 &&
           "A high UF for the epilogue loop is likely not beneficial.");
  }
};

/// An extension of the inner loop vectorizer that creates a skeleton for a
/// vectorized loop that has its epilogue (residual) also vectorized.
/// The idea is to run the vplan on a given loop twice, first to set up the
/// skeleton and vectorize the main loop, and second to complete the skeleton
/// from the first step and vectorize the epilogue. This is achieved by
/// deriving two concrete strategy classes from this base class and invoking
/// them in succession from the loop vectorizer planner.
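/// The two strategies are implemented by EpilogueVectorizerMainLoop and
/// EpilogueVectorizerEpilogueLoop below.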
class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
public:
  InnerLoopAndEpilogueVectorizer(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI,
                            Checks),
        EPI(EPI) {}

  // Override this function to handle the more complex control flow around the
  // three loops.
  BasicBlock *createVectorizedLoopSkeleton() final override {
    return createEpilogueVectorizedLoopSkeleton();
  }

  /// The interface for creating a vectorized skeleton using one of two
  /// different strategies, each corresponding to one execution of the vplan
  /// as described above.
  virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0;

  /// Holds and updates state information required to vectorize the main loop
  /// and its epilogue in two separate passes. This setup helps us avoid
  /// regenerating and recomputing runtime safety checks. It also helps us to
  /// shorten the iteration-count-check path length for the cases where the
  /// iteration count of the loop is so small that the main vector loop is
  /// completely skipped.
  EpilogueLoopVectorizationInfo &EPI;
};

/// A specialized derived class of inner loop vectorizer that performs
/// vectorization of *main* loops in the process of vectorizing loops and their
/// epilogues.
class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerMainLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Check)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Check) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *main loop* strategy (i.e. the first pass of vplan execution).
  BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check once for the main loop (when \p
  /// ForEpilogue is false) and once for the epilogue loop (when \p
  /// ForEpilogue is true).
  BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass,
                                             bool ForEpilogue);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};

// A specialized derived class of inner loop vectorizer that performs
// vectorization of *epilogue* loops in the process of vectorizing loops and
// their epilogues.
class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerEpilogueLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Checks) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *epilogue loop* strategy (i.e. the second pass of vplan execution).
  BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check after the main vector loop has
  /// finished to see if there are any iterations left to execute by either
  /// the vector epilogue or the scalar epilogue.
  BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L,
                                                      BasicBlock *Bypass,
                                                      BasicBlock *Insert);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};
} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its
/// operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return I;

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I;

  for (Use &Op : I->operands()) {
    if (Instruction *OpInst = dyn_cast<Instruction>(Op))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }

  return I;
}

void InnerLoopVectorizer::setDebugLocFromInst(
    const Value *V, Optional<IRBuilder<> *> CustomBuilder) {
  IRBuilder<> *B = (CustomBuilder == None) ? &Builder : *CustomBuilder;
  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(V)) {
    const DILocation *DIL = Inst->getDebugLoc();

    // When a FSDiscriminator is enabled, we don't need to add the multiply
    // factors to the discriminators.
    if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
        !isa<DbgInfoIntrinsic>(Inst) && !EnableFSDiscriminator) {
      // FIXME: For scalable vectors, assume vscale=1.
      auto NewDIL =
          DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
      if (NewDIL)
        B->SetCurrentDebugLocation(NewDIL.getValue());
      else
        LLVM_DEBUG(dbgs()
                   << "Failed to create new discriminator: "
                   << DIL->getFilename() << " Line: " << DIL->getLine());
    } else
      B->SetCurrentDebugLocation(DIL);
  } else
    B->SetCurrentDebugLocation(DebugLoc());
}

/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
/// is passed, the message relates to that particular instruction.
#ifndef NDEBUG
static void debugVectorizationMessage(const StringRef Prefix,
                                      const StringRef DebugMsg,
                                      Instruction *I) {
  dbgs() << "LV: " << Prefix << DebugMsg;
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}
#endif

/// Create an analysis remark that explains why vectorization failed.
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
/// RemarkName is the identifier for the remark. If \p I is passed it is an
/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
/// the location of the remark. \return the remark object that can be
/// streamed to.
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
                                                   StringRef RemarkName,
                                                   Loop *TheLoop,
                                                   Instruction *I) {
  Value *CodeRegion = TheLoop->getHeader();
  DebugLoc DL = TheLoop->getStartLoc();

  if (I) {
    CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, fall back to
    // using the loop's.
    if (I->getDebugLoc())
      DL = I->getDebugLoc();
  }

  return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
}

/// Return a value for Step multiplied by VF.
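/// For a fixed VF this folds to a single integer constant (e.g. Step = 2 with
/// VF = 4 yields 8); for a scalable VF the constant is further multiplied by
/// vscale at runtime.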
static Value *createStepForVF(IRBuilder<> &B, Constant *Step, ElementCount VF) {
  assert(isa<ConstantInt>(Step) && "Expected an integer step");
  Constant *StepVal = ConstantInt::get(
      Step->getType(),
      cast<ConstantInt>(Step)->getSExtValue() * VF.getKnownMinValue());
  return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
}

namespace llvm {

/// Return the runtime value for VF.
Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF) {
  Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue());
  return VF.isScalable() ? B.CreateVScale(EC) : EC;
}

void reportVectorizationFailure(const StringRef DebugMsg,
                                const StringRef OREMsg, const StringRef ORETag,
                                OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                                Instruction *I) {
  LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(
      createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
      << "loop not vectorized: " << OREMsg);
}

void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
                             OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                             Instruction *I) {
  LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(
      createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
      << Msg);
}

} // end namespace llvm

#ifndef NDEBUG
/// \return string containing a file name and a line # for the given loop.
static std::string getDebugLocString(const Loop *L) {
  std::string Result;
  if (L) {
    raw_string_ostream OS(Result);
    if (const DebugLoc LoopDbgLoc = L->getStartLoc())
      LoopDbgLoc.print(OS);
    else
      // Just print the module name.
      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
    OS.flush();
  }
  return Result;
}
#endif

void InnerLoopVectorizer::addNewMetadata(Instruction *To,
                                         const Instruction *Orig) {
  // If the loop was versioned with memchecks, add the corresponding no-alias
  // metadata.
  if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
    LVer->annotateInstWithNoAlias(To, Orig);
}

void InnerLoopVectorizer::addMetadata(Instruction *To,
                                      Instruction *From) {
  propagateMetadata(To, From);
  addNewMetadata(To, From);
}

void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
                                      Instruction *From) {
  for (Value *V : To) {
    if (Instruction *I = dyn_cast<Instruction>(V))
      addMetadata(I, From);
  }
}

namespace llvm {

// Loop vectorization cost-model hints how the scalar epilogue loop should be
// lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorisation with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate,

  // Directive indicating we must either tail fold or not vectorize
  CM_ScalarEpilogueNotAllowedUsePredicate
};

/// ElementCountComparator creates a total ordering for ElementCount
/// for the purposes of using it in a set structure.
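/// Fixed-width VFs order before scalable VFs; within each group, VFs are
/// ordered by their known minimum number of elements.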
VF and LoopCost 1278 /// are the selected vectorization factor and the cost of the selected VF. 1279 unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost); 1280 1281 /// Memory access instruction may be vectorized in more than one way. 1282 /// Form of instruction after vectorization depends on cost. 1283 /// This function takes cost-based decisions for Load/Store instructions 1284 /// and collects them in a map. This decisions map is used for building 1285 /// the lists of loop-uniform and loop-scalar instructions. 1286 /// The calculated cost is saved with widening decision in order to 1287 /// avoid redundant calculations. 1288 void setCostBasedWideningDecision(ElementCount VF); 1289 1290 /// A struct that represents some properties of the register usage 1291 /// of a loop. 1292 struct RegisterUsage { 1293 /// Holds the number of loop invariant values that are used in the loop. 1294 /// The key is ClassID of target-provided register class. 1295 SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs; 1296 /// Holds the maximum number of concurrent live intervals in the loop. 1297 /// The key is ClassID of target-provided register class. 1298 SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers; 1299 }; 1300 1301 /// \return Returns information about the register usages of the loop for the 1302 /// given vectorization factors. 1303 SmallVector<RegisterUsage, 8> 1304 calculateRegisterUsage(ArrayRef<ElementCount> VFs); 1305 1306 /// Collect values we want to ignore in the cost model. 1307 void collectValuesToIgnore(); 1308 1309 /// Collect all element types in the loop for which widening is needed. 1310 void collectElementTypesForWidening(); 1311 1312 /// Split reductions into those that happen in the loop, and those that happen 1313 /// outside. In loop reductions are collected into InLoopReductionChains. 1314 void collectInLoopReductions(); 1315 1316 /// Returns true if we should use strict in-order reductions for the given 1317 /// RdxDesc. This is true if the -enable-strict-reductions flag is passed, 1318 /// the IsOrdered flag of RdxDesc is set and we do not allow reordering 1319 /// of FP operations. 1320 bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) { 1321 return ForceOrderedReductions && !Hints->allowReordering() && 1322 RdxDesc.isOrdered(); 1323 } 1324 1325 /// \returns The smallest bitwidth each instruction can be represented with. 1326 /// The vector equivalents of these instructions should be truncated to this 1327 /// type. 1328 const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const { 1329 return MinBWs; 1330 } 1331 1332 /// \returns True if it is more profitable to scalarize instruction \p I for 1333 /// vectorization factor \p VF. 1334 bool isProfitableToScalarize(Instruction *I, ElementCount VF) const { 1335 assert(VF.isVector() && 1336 "Profitable to scalarize relevant only for VF > 1."); 1337 1338 // Cost model is not run in the VPlan-native path - return conservative 1339 // result until this changes. 1340 if (EnableVPlanNativePath) 1341 return false; 1342 1343 auto Scalars = InstsToScalarize.find(VF); 1344 assert(Scalars != InstsToScalarize.end() && 1345 "VF not yet analyzed for scalarization profitability"); 1346 return Scalars->second.find(I) != Scalars->second.end(); 1347 } 1348 1349 /// Returns true if \p I is known to be uniform after vectorization. 
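  /// For illustration (a simplified example, not the full definition): given a
  /// consecutive access in the scalar loop
  ///   %gep = getelementptr inbounds i32, i32* %A, i64 %iv
  ///   %val = load i32, i32* %gep
  /// the GEP is typically uniform after vectorization, since a single scalar
  /// address per vector iteration feeds the wide load, while %val itself is
  /// widened to a vector value.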
1350 bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
1351 if (VF.isScalar())
1352 return true;
1353
1354 // Cost model is not run in the VPlan-native path - return conservative
1355 // result until this changes.
1356 if (EnableVPlanNativePath)
1357 return false;
1358
1359 auto UniformsPerVF = Uniforms.find(VF);
1360 assert(UniformsPerVF != Uniforms.end() &&
1361 "VF not yet analyzed for uniformity");
1362 return UniformsPerVF->second.count(I);
1363 }
1364
1365 /// Returns true if \p I is known to be scalar after vectorization.
1366 bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
1367 if (VF.isScalar())
1368 return true;
1369
1370 // Cost model is not run in the VPlan-native path - return conservative
1371 // result until this changes.
1372 if (EnableVPlanNativePath)
1373 return false;
1374
1375 auto ScalarsPerVF = Scalars.find(VF);
1376 assert(ScalarsPerVF != Scalars.end() &&
1377 "Scalar values are not calculated for VF");
1378 return ScalarsPerVF->second.count(I);
1379 }
1380
1381 /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1382 /// for vectorization factor \p VF.
1383 bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
1384 return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
1385 !isProfitableToScalarize(I, VF) &&
1386 !isScalarAfterVectorization(I, VF);
1387 }
1388
1389 /// Decision that was taken during cost calculation for memory instruction.
1390 enum InstWidening {
1391 CM_Unknown,
1392 CM_Widen, // For consecutive accesses with stride +1.
1393 CM_Widen_Reverse, // For consecutive accesses with stride -1.
1394 CM_Interleave,
1395 CM_GatherScatter,
1396 CM_Scalarize
1397 };
1398
1399 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1400 /// instruction \p I and vector width \p VF.
1401 void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
1402 InstructionCost Cost) {
1403 assert(VF.isVector() && "Expected VF >=2");
1404 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1405 }
1406
1407 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1408 /// interleaving group \p Grp and vector width \p VF.
1409 void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1410 ElementCount VF, InstWidening W,
1411 InstructionCost Cost) {
1412 assert(VF.isVector() && "Expected VF >=2");
1413 /// Broadcast this decision to all instructions inside the group.
1414 /// But the cost will be assigned to one instruction only.
1415 for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1416 if (auto *I = Grp->getMember(i)) {
1417 if (Grp->getInsertPos() == I)
1418 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1419 else
1420 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1421 }
1422 }
1423 }
1424
1425 /// Return the cost model decision for the given instruction \p I and vector
1426 /// width \p VF. Return CM_Unknown if this instruction did not pass
1427 /// through the cost modeling.
1428 InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
1429 assert(VF.isVector() && "Expected VF to be a vector VF");
1430 // Cost model is not run in the VPlan-native path - return conservative
1431 // result until this changes.
1432 if (EnableVPlanNativePath) 1433 return CM_GatherScatter; 1434 1435 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF); 1436 auto Itr = WideningDecisions.find(InstOnVF); 1437 if (Itr == WideningDecisions.end()) 1438 return CM_Unknown; 1439 return Itr->second.first; 1440 } 1441 1442 /// Return the vectorization cost for the given instruction \p I and vector 1443 /// width \p VF. 1444 InstructionCost getWideningCost(Instruction *I, ElementCount VF) { 1445 assert(VF.isVector() && "Expected VF >=2"); 1446 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF); 1447 assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() && 1448 "The cost is not calculated"); 1449 return WideningDecisions[InstOnVF].second; 1450 } 1451 1452 /// Return True if instruction \p I is an optimizable truncate whose operand 1453 /// is an induction variable. Such a truncate will be removed by adding a new 1454 /// induction variable with the destination type. 1455 bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) { 1456 // If the instruction is not a truncate, return false. 1457 auto *Trunc = dyn_cast<TruncInst>(I); 1458 if (!Trunc) 1459 return false; 1460 1461 // Get the source and destination types of the truncate. 1462 Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF); 1463 Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF); 1464 1465 // If the truncate is free for the given types, return false. Replacing a 1466 // free truncate with an induction variable would add an induction variable 1467 // update instruction to each iteration of the loop. We exclude from this 1468 // check the primary induction variable since it will need an update 1469 // instruction regardless. 1470 Value *Op = Trunc->getOperand(0); 1471 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy)) 1472 return false; 1473 1474 // If the truncated value is not an induction variable, return false. 1475 return Legal->isInductionPhi(Op); 1476 } 1477 1478 /// Collects the instructions to scalarize for each predicated instruction in 1479 /// the loop. 1480 void collectInstsToScalarize(ElementCount VF); 1481 1482 /// Collect Uniform and Scalar values for the given \p VF. 1483 /// The sets depend on CM decision for Load/Store instructions 1484 /// that may be vectorized as interleave, gather-scatter or scalarized. 1485 void collectUniformsAndScalars(ElementCount VF) { 1486 // Do the analysis once. 1487 if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end()) 1488 return; 1489 setCostBasedWideningDecision(VF); 1490 collectLoopUniforms(VF); 1491 collectLoopScalars(VF); 1492 } 1493 1494 /// Returns true if the target machine supports masked store operation 1495 /// for the given \p DataType and kind of access to \p Ptr. 1496 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const { 1497 return Legal->isConsecutivePtr(Ptr) && 1498 TTI.isLegalMaskedStore(DataType, Alignment); 1499 } 1500 1501 /// Returns true if the target machine supports masked load operation 1502 /// for the given \p DataType and kind of access to \p Ptr. 1503 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const { 1504 return Legal->isConsecutivePtr(Ptr) && 1505 TTI.isLegalMaskedLoad(DataType, Alignment); 1506 } 1507 1508 /// Returns true if the target machine can represent \p V as a masked gather 1509 /// or scatter operation. 
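  /// For illustration (simplified), an indirect access such as
  ///   for (i = 0; i < n; ++i) Sum += A[B[i]];
  /// can only be widened as a gather of A, so this returns true only when the
  /// target reports masked gather support for A's element type and alignment.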
1510 bool isLegalGatherOrScatter(Value *V) {
1511 bool LI = isa<LoadInst>(V);
1512 bool SI = isa<StoreInst>(V);
1513 if (!LI && !SI)
1514 return false;
1515 auto *Ty = getLoadStoreType(V);
1516 Align Align = getLoadStoreAlignment(V);
1517 return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
1518 (SI && TTI.isLegalMaskedScatter(Ty, Align));
1519 }
1520
1521 /// Returns true if the target machine supports all of the reduction
1522 /// variables found for the given VF.
1523 bool canVectorizeReductions(ElementCount VF) const {
1524 return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
1525 const RecurrenceDescriptor &RdxDesc = Reduction.second;
1526 return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1527 }));
1528 }
1529
1530 /// Returns true if \p I is an instruction that will be scalarized with
1531 /// predication. Such instructions include conditional stores and
1532 /// instructions that may divide by zero.
1533 /// If a non-zero VF has been calculated, we check if I will be scalarized
1534 /// with predication for that VF.
1535 bool isScalarWithPredication(Instruction *I) const;
1536
1537 // Returns true if \p I is an instruction that will be predicated either
1538 // through scalar predication or masked load/store or masked gather/scatter.
1539 // Superset of instructions that return true for isScalarWithPredication.
1540 bool isPredicatedInst(Instruction *I) {
1541 if (!blockNeedsPredication(I->getParent()))
1542 return false;
1543 // Loads and stores that need some form of masked operation are predicated
1544 // instructions.
1545 if (isa<LoadInst>(I) || isa<StoreInst>(I))
1546 return Legal->isMaskRequired(I);
1547 return isScalarWithPredication(I);
1548 }
1549
1550 /// Returns true if \p I is a memory instruction with consecutive memory
1551 /// access that can be widened.
1552 bool
1553 memoryInstructionCanBeWidened(Instruction *I,
1554 ElementCount VF = ElementCount::getFixed(1));
1555
1556 /// Returns true if \p I is a memory instruction in an interleaved-group
1557 /// of memory accesses that can be vectorized with wide vector loads/stores
1558 /// and shuffles.
1559 bool
1560 interleavedAccessCanBeWidened(Instruction *I,
1561 ElementCount VF = ElementCount::getFixed(1));
1562
1563 /// Check if \p Instr belongs to any interleaved access group.
1564 bool isAccessInterleaved(Instruction *Instr) {
1565 return InterleaveInfo.isInterleaved(Instr);
1566 }
1567
1568 /// Get the interleaved access group that \p Instr belongs to.
1569 const InterleaveGroup<Instruction> *
1570 getInterleavedAccessGroup(Instruction *Instr) {
1571 return InterleaveInfo.getInterleaveGroup(Instr);
1572 }
1573
1574 /// Returns true if we're required to use a scalar epilogue for at least
1575 /// the final iteration of the original loop.
1576 bool requiresScalarEpilogue(ElementCount VF) const {
1577 if (!isScalarEpilogueAllowed())
1578 return false;
1579 // If we might exit from anywhere but the latch, must run the exiting
1580 // iteration in scalar form.
1581 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
1582 return true;
1583 return VF.isVector() && InterleaveInfo.requiresScalarEpilogue();
1584 }
1585
1586 /// Returns true if a scalar epilogue is not allowed due to optsize or a
1587 /// loop hint annotation.
1588 bool isScalarEpilogueAllowed() const {
1589 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1590 }
1591
1592 /// Returns true if all loop blocks should be masked to fold the tail of the
1592 /// loop.
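  /// For illustration (assumed values): with VF = 4 and a trip count of 10,
  /// folding the tail executes the last vector iteration under a lane mask
  /// such as <1, 1, 0, 0> instead of running the two remaining iterations in
  /// a scalar epilogue.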
1593 bool foldTailByMasking() const { return FoldTailByMasking; }
1594
1595 bool blockNeedsPredication(BasicBlock *BB) const {
1596 return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1597 }
1598
1599 /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
1600 /// nodes to the chain of instructions representing the reductions. Uses a
1601 /// MapVector to ensure deterministic iteration order.
1602 using ReductionChainMap =
1603 SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;
1604
1605 /// Return the chain of instructions representing an inloop reduction.
1606 const ReductionChainMap &getInLoopReductionChains() const {
1607 return InLoopReductionChains;
1608 }
1609
1610 /// Returns true if the Phi is part of an inloop reduction.
1611 bool isInLoopReduction(PHINode *Phi) const {
1612 return InLoopReductionChains.count(Phi);
1613 }
1614
1615 /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1616 /// with factor VF. Return the cost of the instruction, including
1617 /// scalarization overhead if it's needed.
1618 InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;
1619
1620 /// Estimate cost of a call instruction CI if it were vectorized with factor
1621 /// VF. Return the cost of the instruction, including scalarization overhead
1622 /// if it's needed. The flag NeedToScalarize shows if the call needs to be
1623 /// scalarized -
1624 /// i.e. either vector version isn't available, or is too expensive.
1625 InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF,
1626 bool &NeedToScalarize) const;
1627
1628 /// Returns true if the per-lane cost of VectorizationFactor A is lower than
1629 /// that of B.
1630 bool isMoreProfitable(const VectorizationFactor &A,
1631 const VectorizationFactor &B) const;
1632
1633 /// Invalidates decisions already taken by the cost model.
1634 void invalidateCostModelingDecisions() {
1635 WideningDecisions.clear();
1636 Uniforms.clear();
1637 Scalars.clear();
1638 }
1639
1640 private:
1641 unsigned NumPredStores = 0;
1642
1643 /// \return An upper bound for the vectorization factors for both
1644 /// fixed and scalable vectorization, where the minimum-known number of
1645 /// elements is a power-of-2 larger than zero. If scalable vectorization is
1646 /// disabled or unsupported, then the scalable part will be equal to
1647 /// ElementCount::getScalable(0).
1648 FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount,
1649 ElementCount UserVF);
1650
1651 /// \return the maximized element count based on the target's vector
1652 /// registers and the loop trip-count, but limited to a maximum safe VF.
1653 /// This is a helper function of computeFeasibleMaxVF.
1654 /// FIXME: MaxSafeVF is currently passed by reference to avoid some obscure
1655 /// issue that occurred on one of the buildbots which cannot be reproduced
1656 /// without having access to the proprietary compiler (see comments on
1657 /// D98509). The issue is currently under investigation and this workaround
1658 /// will be removed as soon as possible.
1659 ElementCount getMaximizedVFForTarget(unsigned ConstTripCount,
1660 unsigned SmallestType,
1661 unsigned WidestType,
1662 const ElementCount &MaxSafeVF);
1663
1664 /// \return the maximum legal scalable VF, based on the safe max number
1665 /// of elements.
1666 ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
1667
1668 /// The vectorization cost is a combination of the cost itself and a boolean
1669 /// indicating whether any of the contributing operations will actually
1670 /// operate on vector values after type legalization in the backend. If this
1671 /// latter value is false, then all operations will be scalarized (i.e. no
1672 /// vectorization has actually taken place).
1673 using VectorizationCostTy = std::pair<InstructionCost, bool>;
1674
1675 /// Returns the expected execution cost. The unit of the cost does
1676 /// not matter because we use the 'cost' units to compare different
1677 /// vector widths. The cost that is returned is *not* normalized by
1678 /// the factor width. If \p Invalid is not nullptr, this function
1679 /// will add a pair(Instruction*, ElementCount) to \p Invalid for
1680 /// each instruction that has an Invalid cost for the given VF.
1681 using InstructionVFPair = std::pair<Instruction *, ElementCount>;
1682 VectorizationCostTy
1683 expectedCost(ElementCount VF,
1684 SmallVectorImpl<InstructionVFPair> *Invalid = nullptr);
1685
1686 /// Returns the execution time cost of an instruction for a given vector
1687 /// width. Vector width of one means scalar.
1688 VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
1689
1690 /// The cost-computation logic from getInstructionCost which provides
1691 /// the vector type as an output parameter.
1692 InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
1693 Type *&VectorTy);
1694
1695 /// Return the cost of instructions in an inloop reduction pattern, if I is
1696 /// part of that pattern.
1697 Optional<InstructionCost>
1698 getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
1699 TTI::TargetCostKind CostKind);
1700
1701 /// Calculate vectorization cost of memory instruction \p I.
1702 InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1703
1704 /// The cost computation for scalarized memory instruction.
1705 InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1706
1707 /// The cost computation for interleaving group of memory instructions.
1708 InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1709
1710 /// The cost computation for Gather/Scatter instruction.
1711 InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1712
1713 /// The cost computation for widening instruction \p I with consecutive
1714 /// memory access.
1715 InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1716
1717 /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1718 /// Load: scalar load + broadcast.
1719 /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1720 /// element)
1721 InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
1722
1723 /// Estimate the overhead of scalarizing an instruction. This is a
1724 /// convenience wrapper for the type-based getScalarizationOverhead API.
1725 InstructionCost getScalarizationOverhead(Instruction *I,
1726 ElementCount VF) const;
1727
1728 /// Returns whether the instruction is a load or store and will be emitted
1729 /// as a vector operation.
1730 bool isConsecutiveLoadOrStore(Instruction *I);
1731
1732 /// Returns true if an artificially high cost for emulated masked memrefs
1733 /// should be used.
1734 bool useEmulatedMaskMemRefHack(Instruction *I);
1735
1736 /// Map of scalar integer values to the smallest bitwidth they can be legally
1737 /// represented as. The vector equivalents of these values should be truncated
1738 /// to this type.
1739 MapVector<Instruction *, uint64_t> MinBWs;
1740
1741 /// A type representing the costs for instructions if they were to be
1742 /// scalarized rather than vectorized. The entries are Instruction-Cost
1743 /// pairs.
1744 using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
1745
1746 /// A set containing all BasicBlocks that are known to be present after
1747 /// vectorization as predicated blocks.
1748 SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1749
1750 /// Records whether it is allowed to have the original scalar loop execute at
1751 /// least once. This may be needed as a fallback loop in case runtime
1752 /// aliasing/dependence checks fail, or to handle the tail/remainder
1753 /// iterations when the trip count is unknown or doesn't divide by the VF,
1754 /// or as a peel-loop to handle gaps in interleave-groups.
1755 /// Under optsize and when the trip count is very small we don't allow any
1756 /// iterations to execute in the scalar loop.
1757 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1758
1759 /// All blocks of loop are to be masked to fold tail of scalar iterations.
1760 bool FoldTailByMasking = false;
1761
1762 /// A map holding scalar costs for different vectorization factors. The
1763 /// presence of a cost for an instruction in the mapping indicates that the
1764 /// instruction will be scalarized when vectorizing with the associated
1765 /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1766 DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
1767
1768 /// Holds the instructions known to be uniform after vectorization.
1769 /// The data is collected per VF.
1770 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1771
1772 /// Holds the instructions known to be scalar after vectorization.
1773 /// The data is collected per VF.
1774 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
1775
1776 /// Holds the instructions (address computations) that are forced to be
1777 /// scalarized.
1778 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1779
1780 /// PHINodes of the reductions that should be expanded in-loop along with
1781 /// their associated chains of reduction operations, in program order from top
1782 /// (PHI) to bottom.
1783 ReductionChainMap InLoopReductionChains;
1784
1785 /// A Map of inloop reduction operations and their immediate chain operand.
1786 /// FIXME: This can be removed once reductions can be costed correctly in
1787 /// vplan. This was added to allow quick lookup to the inloop operations,
1788 /// without having to loop through InLoopReductionChains.
1789 DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
1790
1791 /// Returns the expected difference in cost from scalarizing the expression
1792 /// feeding a predicated instruction \p PredInst. The instructions to
1793 /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1794 /// non-negative return value implies the expression will be scalarized.
1795 /// Currently, only single-use chains are considered for scalarization.
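  /// For illustration (a hypothetical chain): if a predicated udiv is fed by
  /// a single-use add, the discount compares keeping the add as a vector
  /// instruction (plus the extracts needed to feed the scalarized, predicated
  /// udiv) against scalarizing the add together with the udiv; a non-negative
  /// result means the whole chain is expected to be cheaper when scalarized.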
1796 int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts, 1797 ElementCount VF); 1798 1799 /// Collect the instructions that are uniform after vectorization. An 1800 /// instruction is uniform if we represent it with a single scalar value in 1801 /// the vectorized loop corresponding to each vector iteration. Examples of 1802 /// uniform instructions include pointer operands of consecutive or 1803 /// interleaved memory accesses. Note that although uniformity implies an 1804 /// instruction will be scalar, the reverse is not true. In general, a 1805 /// scalarized instruction will be represented by VF scalar values in the 1806 /// vectorized loop, each corresponding to an iteration of the original 1807 /// scalar loop. 1808 void collectLoopUniforms(ElementCount VF); 1809 1810 /// Collect the instructions that are scalar after vectorization. An 1811 /// instruction is scalar if it is known to be uniform or will be scalarized 1812 /// during vectorization. Non-uniform scalarized instructions will be 1813 /// represented by VF values in the vectorized loop, each corresponding to an 1814 /// iteration of the original scalar loop. 1815 void collectLoopScalars(ElementCount VF); 1816 1817 /// Keeps cost model vectorization decision and cost for instructions. 1818 /// Right now it is used for memory instructions only. 1819 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>, 1820 std::pair<InstWidening, InstructionCost>>; 1821 1822 DecisionList WideningDecisions; 1823 1824 /// Returns true if \p V is expected to be vectorized and it needs to be 1825 /// extracted. 1826 bool needsExtract(Value *V, ElementCount VF) const { 1827 Instruction *I = dyn_cast<Instruction>(V); 1828 if (VF.isScalar() || !I || !TheLoop->contains(I) || 1829 TheLoop->isLoopInvariant(I)) 1830 return false; 1831 1832 // Assume we can vectorize V (and hence we need extraction) if the 1833 // scalars are not computed yet. This can happen, because it is called 1834 // via getScalarizationOverhead from setCostBasedWideningDecision, before 1835 // the scalars are collected. That should be a safe assumption in most 1836 // cases, because we check if the operands have vectorizable types 1837 // beforehand in LoopVectorizationLegality. 1838 return Scalars.find(VF) == Scalars.end() || 1839 !isScalarAfterVectorization(I, VF); 1840 }; 1841 1842 /// Returns a range containing only operands needing to be extracted. 1843 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, 1844 ElementCount VF) const { 1845 return SmallVector<Value *, 4>(make_filter_range( 1846 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); 1847 } 1848 1849 /// Determines if we have the infrastructure to vectorize loop \p L and its 1850 /// epilogue, assuming the main loop is vectorized by \p VF. 1851 bool isCandidateForEpilogueVectorization(const Loop &L, 1852 const ElementCount VF) const; 1853 1854 /// Returns true if epilogue vectorization is considered profitable, and 1855 /// false otherwise. 1856 /// \p VF is the vectorization factor chosen for the original loop. 1857 bool isEpilogueVectorizationProfitable(const ElementCount VF) const; 1858 1859 public: 1860 /// The loop that we evaluate. 1861 Loop *TheLoop; 1862 1863 /// Predicated scalar evolution analysis. 1864 PredicatedScalarEvolution &PSE; 1865 1866 /// Loop Info analysis. 1867 LoopInfo *LI; 1868 1869 /// Vectorization legality. 1870 LoopVectorizationLegality *Legal; 1871 1872 /// Vector target information. 
1873 const TargetTransformInfo &TTI;
1874
1875 /// Target Library Info.
1876 const TargetLibraryInfo *TLI;
1877
1878 /// Demanded bits analysis.
1879 DemandedBits *DB;
1880
1881 /// Assumption cache.
1882 AssumptionCache *AC;
1883
1884 /// Interface to emit optimization remarks.
1885 OptimizationRemarkEmitter *ORE;
1886
1887 const Function *TheFunction;
1888
1889 /// Loop Vectorize Hint.
1890 const LoopVectorizeHints *Hints;
1891
1892 /// The interleave access information contains groups of interleaved accesses
1893 /// with the same stride and close to each other.
1894 InterleavedAccessInfo &InterleaveInfo;
1895
1896 /// Values to ignore in the cost model.
1897 SmallPtrSet<const Value *, 16> ValuesToIgnore;
1898
1899 /// Values to ignore in the cost model when VF > 1.
1900 SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1901
1902 /// All element types found in the loop.
1903 SmallPtrSet<Type *, 16> ElementTypesInLoop;
1904
1905 /// Profitable vector factors.
1906 SmallVector<VectorizationFactor, 8> ProfitableVFs;
1907 };
1908 } // end namespace llvm
1909
1910 /// Helper struct to manage generating runtime checks for vectorization.
1911 ///
1912 /// The runtime checks are created up-front in temporary blocks, un-linked from
1913 /// the existing IR, to allow better estimating their cost. After deciding to
1914 /// vectorize, the checks are moved back. If deciding not to vectorize, the
1915 /// temporary blocks are completely removed.
1916 class GeneratedRTChecks {
1917 /// Basic block which contains the generated SCEV checks, if any.
1918 BasicBlock *SCEVCheckBlock = nullptr;
1919
1920 /// The value representing the result of the generated SCEV checks. If it is
1921 /// nullptr, either no SCEV checks have been generated or they have been used.
1922 Value *SCEVCheckCond = nullptr;
1923
1924 /// Basic block which contains the generated memory runtime checks, if any.
1925 BasicBlock *MemCheckBlock = nullptr;
1926
1927 /// The value representing the result of the generated memory runtime checks.
1928 /// If it is nullptr, either no memory runtime checks have been generated or
1929 /// they have been used.
1930 Instruction *MemRuntimeCheckCond = nullptr;
1931
1932 DominatorTree *DT;
1933 LoopInfo *LI;
1934
1935 SCEVExpander SCEVExp;
1936 SCEVExpander MemCheckExp;
1937
1938 public:
1939 GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
1940 const DataLayout &DL)
1941 : DT(DT), LI(LI), SCEVExp(SE, DL, "scev.check"),
1942 MemCheckExp(SE, DL, "scev.check") {}
1943
1944 /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
1945 /// accurately estimate the cost of the runtime checks. The blocks are
1946 /// un-linked from the IR and added back during vector code generation. If
1947 /// there is no vector code generation, the check blocks are removed
1948 /// completely.
1949 void Create(Loop *L, const LoopAccessInfo &LAI,
1950 const SCEVUnionPredicate &UnionPred) {
1951
1952 BasicBlock *LoopHeader = L->getHeader();
1953 BasicBlock *Preheader = L->getLoopPreheader();
1954
1955 // Use SplitBlock to create blocks for SCEV & memory runtime checks to
1956 // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
1957 // may be used by SCEVExpander. The blocks will be un-linked from their
1958 // predecessors and removed from LI & DT at the end of the function.
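    // Illustrative shape right after the splits below (before un-linking),
    // assuming both kinds of checks are generated:
    //   preheader -> vector.scevcheck -> vector.memcheck -> <old successor>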
1959 if (!UnionPred.isAlwaysTrue()) { 1960 SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI, 1961 nullptr, "vector.scevcheck"); 1962 1963 SCEVCheckCond = SCEVExp.expandCodeForPredicate( 1964 &UnionPred, SCEVCheckBlock->getTerminator()); 1965 } 1966 1967 const auto &RtPtrChecking = *LAI.getRuntimePointerChecking(); 1968 if (RtPtrChecking.Need) { 1969 auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader; 1970 MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr, 1971 "vector.memcheck"); 1972 1973 std::tie(std::ignore, MemRuntimeCheckCond) = 1974 addRuntimeChecks(MemCheckBlock->getTerminator(), L, 1975 RtPtrChecking.getChecks(), MemCheckExp); 1976 assert(MemRuntimeCheckCond && 1977 "no RT checks generated although RtPtrChecking " 1978 "claimed checks are required"); 1979 } 1980 1981 if (!MemCheckBlock && !SCEVCheckBlock) 1982 return; 1983 1984 // Unhook the temporary block with the checks, update various places 1985 // accordingly. 1986 if (SCEVCheckBlock) 1987 SCEVCheckBlock->replaceAllUsesWith(Preheader); 1988 if (MemCheckBlock) 1989 MemCheckBlock->replaceAllUsesWith(Preheader); 1990 1991 if (SCEVCheckBlock) { 1992 SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 1993 new UnreachableInst(Preheader->getContext(), SCEVCheckBlock); 1994 Preheader->getTerminator()->eraseFromParent(); 1995 } 1996 if (MemCheckBlock) { 1997 MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 1998 new UnreachableInst(Preheader->getContext(), MemCheckBlock); 1999 Preheader->getTerminator()->eraseFromParent(); 2000 } 2001 2002 DT->changeImmediateDominator(LoopHeader, Preheader); 2003 if (MemCheckBlock) { 2004 DT->eraseNode(MemCheckBlock); 2005 LI->removeBlock(MemCheckBlock); 2006 } 2007 if (SCEVCheckBlock) { 2008 DT->eraseNode(SCEVCheckBlock); 2009 LI->removeBlock(SCEVCheckBlock); 2010 } 2011 } 2012 2013 /// Remove the created SCEV & memory runtime check blocks & instructions, if 2014 /// unused. 2015 ~GeneratedRTChecks() { 2016 SCEVExpanderCleaner SCEVCleaner(SCEVExp, *DT); 2017 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp, *DT); 2018 if (!SCEVCheckCond) 2019 SCEVCleaner.markResultUsed(); 2020 2021 if (!MemRuntimeCheckCond) 2022 MemCheckCleaner.markResultUsed(); 2023 2024 if (MemRuntimeCheckCond) { 2025 auto &SE = *MemCheckExp.getSE(); 2026 // Memory runtime check generation creates compares that use expanded 2027 // values. Remove them before running the SCEVExpanderCleaners. 2028 for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) { 2029 if (MemCheckExp.isInsertedInstruction(&I)) 2030 continue; 2031 SE.forgetValue(&I); 2032 SE.eraseValueFromMap(&I); 2033 I.eraseFromParent(); 2034 } 2035 } 2036 MemCheckCleaner.cleanup(); 2037 SCEVCleaner.cleanup(); 2038 2039 if (SCEVCheckCond) 2040 SCEVCheckBlock->eraseFromParent(); 2041 if (MemRuntimeCheckCond) 2042 MemCheckBlock->eraseFromParent(); 2043 } 2044 2045 /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and 2046 /// adjusts the branches to branch to the vector preheader or \p Bypass, 2047 /// depending on the generated condition. 
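  /// Roughly (illustrative), after this runs the rewired CFG is:
  ///   pred --> vector.scevcheck --(check failed)--> Bypass (scalar loop)
  ///                             --(check passed)--> LoopVectorPreHeader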
2048 BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass, 2049 BasicBlock *LoopVectorPreHeader, 2050 BasicBlock *LoopExitBlock) { 2051 if (!SCEVCheckCond) 2052 return nullptr; 2053 if (auto *C = dyn_cast<ConstantInt>(SCEVCheckCond)) 2054 if (C->isZero()) 2055 return nullptr; 2056 2057 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2058 2059 BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock); 2060 // Create new preheader for vector loop. 2061 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 2062 PL->addBasicBlockToLoop(SCEVCheckBlock, *LI); 2063 2064 SCEVCheckBlock->getTerminator()->eraseFromParent(); 2065 SCEVCheckBlock->moveBefore(LoopVectorPreHeader); 2066 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2067 SCEVCheckBlock); 2068 2069 DT->addNewBlock(SCEVCheckBlock, Pred); 2070 DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock); 2071 2072 ReplaceInstWithInst( 2073 SCEVCheckBlock->getTerminator(), 2074 BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheckCond)); 2075 // Mark the check as used, to prevent it from being removed during cleanup. 2076 SCEVCheckCond = nullptr; 2077 return SCEVCheckBlock; 2078 } 2079 2080 /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts 2081 /// the branches to branch to the vector preheader or \p Bypass, depending on 2082 /// the generated condition. 2083 BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass, 2084 BasicBlock *LoopVectorPreHeader) { 2085 // Check if we generated code that checks in runtime if arrays overlap. 2086 if (!MemRuntimeCheckCond) 2087 return nullptr; 2088 2089 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2090 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2091 MemCheckBlock); 2092 2093 DT->addNewBlock(MemCheckBlock, Pred); 2094 DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock); 2095 MemCheckBlock->moveBefore(LoopVectorPreHeader); 2096 2097 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 2098 PL->addBasicBlockToLoop(MemCheckBlock, *LI); 2099 2100 ReplaceInstWithInst( 2101 MemCheckBlock->getTerminator(), 2102 BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond)); 2103 MemCheckBlock->getTerminator()->setDebugLoc( 2104 Pred->getTerminator()->getDebugLoc()); 2105 2106 // Mark the check as used, to prevent it from being removed during cleanup. 2107 MemRuntimeCheckCond = nullptr; 2108 return MemCheckBlock; 2109 } 2110 }; 2111 2112 // Return true if \p OuterLp is an outer loop annotated with hints for explicit 2113 // vectorization. The loop needs to be annotated with #pragma omp simd 2114 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the 2115 // vector length information is not provided, vectorization is not considered 2116 // explicit. Interleave hints are not allowed either. These limitations will be 2117 // relaxed in the future. 2118 // Please, note that we are currently forced to abuse the pragma 'clang 2119 // vectorize' semantics. This pragma provides *auto-vectorization hints* 2120 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' 2121 // provides *explicit vectorization hints* (LV can bypass legal checks and 2122 // assume that vectorization is legal). However, both hints are implemented 2123 // using the same metadata (llvm.loop.vectorize, processed by 2124 // LoopVectorizeHints). This will be fixed in the future when the native IR 2125 // representation for pragma 'omp simd' is introduced. 
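// For illustration (hypothetical source), an outer loop annotated as
//   #pragma clang loop vectorize(enable) vectorize_width(4)
//   for (int i = 0; i < M; ++i)
//     for (int j = 0; j < N; ++j)
//       A[i][j] += B[i][j];
// is treated here as an explicit vectorization candidate, whereas the same
// annotation without vectorize_width(#) is not.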
2126 static bool isExplicitVecOuterLoop(Loop *OuterLp, 2127 OptimizationRemarkEmitter *ORE) { 2128 assert(!OuterLp->isInnermost() && "This is not an outer loop"); 2129 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); 2130 2131 // Only outer loops with an explicit vectorization hint are supported. 2132 // Unannotated outer loops are ignored. 2133 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) 2134 return false; 2135 2136 Function *Fn = OuterLp->getHeader()->getParent(); 2137 if (!Hints.allowVectorization(Fn, OuterLp, 2138 true /*VectorizeOnlyWhenForced*/)) { 2139 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); 2140 return false; 2141 } 2142 2143 if (Hints.getInterleave() > 1) { 2144 // TODO: Interleave support is future work. 2145 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " 2146 "outer loops.\n"); 2147 Hints.emitRemarkWithHints(); 2148 return false; 2149 } 2150 2151 return true; 2152 } 2153 2154 static void collectSupportedLoops(Loop &L, LoopInfo *LI, 2155 OptimizationRemarkEmitter *ORE, 2156 SmallVectorImpl<Loop *> &V) { 2157 // Collect inner loops and outer loops without irreducible control flow. For 2158 // now, only collect outer loops that have explicit vectorization hints. If we 2159 // are stress testing the VPlan H-CFG construction, we collect the outermost 2160 // loop of every loop nest. 2161 if (L.isInnermost() || VPlanBuildStressTest || 2162 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { 2163 LoopBlocksRPO RPOT(&L); 2164 RPOT.perform(LI); 2165 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { 2166 V.push_back(&L); 2167 // TODO: Collect inner loops inside marked outer loops in case 2168 // vectorization fails for the outer loop. Do not invoke 2169 // 'containsIrreducibleCFG' again for inner loops when the outer loop is 2170 // already known to be reducible. We can use an inherited attribute for 2171 // that. 2172 return; 2173 } 2174 } 2175 for (Loop *InnerL : L) 2176 collectSupportedLoops(*InnerL, LI, ORE, V); 2177 } 2178 2179 namespace { 2180 2181 /// The LoopVectorize Pass. 2182 struct LoopVectorize : public FunctionPass { 2183 /// Pass identification, replacement for typeid 2184 static char ID; 2185 2186 LoopVectorizePass Impl; 2187 2188 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false, 2189 bool VectorizeOnlyWhenForced = false) 2190 : FunctionPass(ID), 2191 Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) { 2192 initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); 2193 } 2194 2195 bool runOnFunction(Function &F) override { 2196 if (skipFunction(F)) 2197 return false; 2198 2199 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 2200 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 2201 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 2202 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 2203 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); 2204 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); 2205 auto *TLI = TLIP ? 
&TLIP->getTLI(F) : nullptr; 2206 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); 2207 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 2208 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); 2209 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); 2210 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 2211 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 2212 2213 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 2214 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; 2215 2216 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC, 2217 GetLAA, *ORE, PSI).MadeAnyChange; 2218 } 2219 2220 void getAnalysisUsage(AnalysisUsage &AU) const override { 2221 AU.addRequired<AssumptionCacheTracker>(); 2222 AU.addRequired<BlockFrequencyInfoWrapperPass>(); 2223 AU.addRequired<DominatorTreeWrapperPass>(); 2224 AU.addRequired<LoopInfoWrapperPass>(); 2225 AU.addRequired<ScalarEvolutionWrapperPass>(); 2226 AU.addRequired<TargetTransformInfoWrapperPass>(); 2227 AU.addRequired<AAResultsWrapperPass>(); 2228 AU.addRequired<LoopAccessLegacyAnalysis>(); 2229 AU.addRequired<DemandedBitsWrapperPass>(); 2230 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 2231 AU.addRequired<InjectTLIMappingsLegacy>(); 2232 2233 // We currently do not preserve loopinfo/dominator analyses with outer loop 2234 // vectorization. Until this is addressed, mark these analyses as preserved 2235 // only for non-VPlan-native path. 2236 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 2237 if (!EnableVPlanNativePath) { 2238 AU.addPreserved<LoopInfoWrapperPass>(); 2239 AU.addPreserved<DominatorTreeWrapperPass>(); 2240 } 2241 2242 AU.addPreserved<BasicAAWrapperPass>(); 2243 AU.addPreserved<GlobalsAAWrapperPass>(); 2244 AU.addRequired<ProfileSummaryInfoWrapperPass>(); 2245 } 2246 }; 2247 2248 } // end anonymous namespace 2249 2250 //===----------------------------------------------------------------------===// 2251 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 2252 // LoopVectorizationCostModel and LoopVectorizationPlanner. 2253 //===----------------------------------------------------------------------===// 2254 2255 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { 2256 // We need to place the broadcast of invariant variables outside the loop, 2257 // but only if it's proven safe to do so. Else, broadcast will be inside 2258 // vector loop body. 2259 Instruction *Instr = dyn_cast<Instruction>(V); 2260 bool SafeToHoist = OrigLoop->isLoopInvariant(V) && 2261 (!Instr || 2262 DT->dominates(Instr->getParent(), LoopVectorPreHeader)); 2263 // Place the code for broadcasting invariant variables in the new preheader. 2264 IRBuilder<>::InsertPointGuard Guard(Builder); 2265 if (SafeToHoist) 2266 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2267 2268 // Broadcast the scalar into all locations in the vector. 
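  // For a fixed VF of 4, CreateVectorSplat below emits roughly:
  //   %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %v, i32 0
  //   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert,
  //                        <4 x i32> poison, <4 x i32> zeroinitializer
  // (illustrative IR; exact names and element type depend on the input value).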
2269 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); 2270 2271 return Shuf; 2272 } 2273 2274 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( 2275 const InductionDescriptor &II, Value *Step, Value *Start, 2276 Instruction *EntryVal, VPValue *Def, VPValue *CastDef, 2277 VPTransformState &State) { 2278 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 2279 "Expected either an induction phi-node or a truncate of it!"); 2280 2281 // Construct the initial value of the vector IV in the vector loop preheader 2282 auto CurrIP = Builder.saveIP(); 2283 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2284 if (isa<TruncInst>(EntryVal)) { 2285 assert(Start->getType()->isIntegerTy() && 2286 "Truncation requires an integer type"); 2287 auto *TruncType = cast<IntegerType>(EntryVal->getType()); 2288 Step = Builder.CreateTrunc(Step, TruncType); 2289 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); 2290 } 2291 Value *SplatStart = Builder.CreateVectorSplat(VF, Start); 2292 Value *SteppedStart = 2293 getStepVector(SplatStart, 0, Step, II.getInductionOpcode()); 2294 2295 // We create vector phi nodes for both integer and floating-point induction 2296 // variables. Here, we determine the kind of arithmetic we will perform. 2297 Instruction::BinaryOps AddOp; 2298 Instruction::BinaryOps MulOp; 2299 if (Step->getType()->isIntegerTy()) { 2300 AddOp = Instruction::Add; 2301 MulOp = Instruction::Mul; 2302 } else { 2303 AddOp = II.getInductionOpcode(); 2304 MulOp = Instruction::FMul; 2305 } 2306 2307 // Multiply the vectorization factor by the step using integer or 2308 // floating-point arithmetic as appropriate. 2309 Type *StepType = Step->getType(); 2310 if (Step->getType()->isFloatingPointTy()) 2311 StepType = IntegerType::get(StepType->getContext(), 2312 StepType->getScalarSizeInBits()); 2313 Value *RuntimeVF = getRuntimeVF(Builder, StepType, VF); 2314 if (Step->getType()->isFloatingPointTy()) 2315 RuntimeVF = Builder.CreateSIToFP(RuntimeVF, Step->getType()); 2316 Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF); 2317 2318 // Create a vector splat to use in the induction update. 2319 // 2320 // FIXME: If the step is non-constant, we create the vector splat with 2321 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't 2322 // handle a constant vector splat. 2323 Value *SplatVF = isa<Constant>(Mul) 2324 ? ConstantVector::getSplat(VF, cast<Constant>(Mul)) 2325 : Builder.CreateVectorSplat(VF, Mul); 2326 Builder.restoreIP(CurrIP); 2327 2328 // We may need to add the step a number of times, depending on the unroll 2329 // factor. The last of those goes into the PHI. 2330 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", 2331 &*LoopVectorBody->getFirstInsertionPt()); 2332 VecInd->setDebugLoc(EntryVal->getDebugLoc()); 2333 Instruction *LastInduction = VecInd; 2334 for (unsigned Part = 0; Part < UF; ++Part) { 2335 State.set(Def, LastInduction, Part); 2336 2337 if (isa<TruncInst>(EntryVal)) 2338 addMetadata(LastInduction, EntryVal); 2339 recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, CastDef, 2340 State, Part); 2341 2342 LastInduction = cast<Instruction>( 2343 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")); 2344 LastInduction->setDebugLoc(EntryVal->getDebugLoc()); 2345 } 2346 2347 // Move the last step to the end of the latch block. This ensures consistent 2348 // placement of all induction updates. 
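  // Illustratively (assuming fixed VF = 4, UF = 1, an integer IV starting at 0
  // with step 1), the vector IV built above looks like:
  //   %vec.ind = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %vector.ph ],
  //                            [ %vec.ind.next, %vector.latch ]
  //   %vec.ind.next = add <4 x i32> %vec.ind, <i32 4, i32 4, i32 4, i32 4>
  // and it is that final add which gets moved into the latch below.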
2349 auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 2350 auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator()); 2351 auto *ICmp = cast<Instruction>(Br->getCondition()); 2352 LastInduction->moveBefore(ICmp); 2353 LastInduction->setName("vec.ind.next"); 2354 2355 VecInd->addIncoming(SteppedStart, LoopVectorPreHeader); 2356 VecInd->addIncoming(LastInduction, LoopVectorLatch); 2357 } 2358 2359 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const { 2360 return Cost->isScalarAfterVectorization(I, VF) || 2361 Cost->isProfitableToScalarize(I, VF); 2362 } 2363 2364 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const { 2365 if (shouldScalarizeInstruction(IV)) 2366 return true; 2367 auto isScalarInst = [&](User *U) -> bool { 2368 auto *I = cast<Instruction>(U); 2369 return (OrigLoop->contains(I) && shouldScalarizeInstruction(I)); 2370 }; 2371 return llvm::any_of(IV->users(), isScalarInst); 2372 } 2373 2374 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast( 2375 const InductionDescriptor &ID, const Instruction *EntryVal, 2376 Value *VectorLoopVal, VPValue *CastDef, VPTransformState &State, 2377 unsigned Part, unsigned Lane) { 2378 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 2379 "Expected either an induction phi-node or a truncate of it!"); 2380 2381 // This induction variable is not the phi from the original loop but the 2382 // newly-created IV based on the proof that casted Phi is equal to the 2383 // uncasted Phi in the vectorized loop (under a runtime guard possibly). It 2384 // re-uses the same InductionDescriptor that original IV uses but we don't 2385 // have to do any recording in this case - that is done when original IV is 2386 // processed. 2387 if (isa<TruncInst>(EntryVal)) 2388 return; 2389 2390 const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts(); 2391 if (Casts.empty()) 2392 return; 2393 // Only the first Cast instruction in the Casts vector is of interest. 2394 // The rest of the Casts (if exist) have no uses outside the 2395 // induction update chain itself. 2396 if (Lane < UINT_MAX) 2397 State.set(CastDef, VectorLoopVal, VPIteration(Part, Lane)); 2398 else 2399 State.set(CastDef, VectorLoopVal, Part); 2400 } 2401 2402 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start, 2403 TruncInst *Trunc, VPValue *Def, 2404 VPValue *CastDef, 2405 VPTransformState &State) { 2406 assert((IV->getType()->isIntegerTy() || IV != OldInduction) && 2407 "Primary induction variable must have an integer type"); 2408 2409 auto II = Legal->getInductionVars().find(IV); 2410 assert(II != Legal->getInductionVars().end() && "IV is not an induction"); 2411 2412 auto ID = II->second; 2413 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); 2414 2415 // The value from the original loop to which we are mapping the new induction 2416 // variable. 2417 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; 2418 2419 auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 2420 2421 // Generate code for the induction step. 
Note that induction steps are 2422 // required to be loop-invariant 2423 auto CreateStepValue = [&](const SCEV *Step) -> Value * { 2424 assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) && 2425 "Induction step should be loop invariant"); 2426 if (PSE.getSE()->isSCEVable(IV->getType())) { 2427 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 2428 return Exp.expandCodeFor(Step, Step->getType(), 2429 LoopVectorPreHeader->getTerminator()); 2430 } 2431 return cast<SCEVUnknown>(Step)->getValue(); 2432 }; 2433 2434 // The scalar value to broadcast. This is derived from the canonical 2435 // induction variable. If a truncation type is given, truncate the canonical 2436 // induction variable and step. Otherwise, derive these values from the 2437 // induction descriptor. 2438 auto CreateScalarIV = [&](Value *&Step) -> Value * { 2439 Value *ScalarIV = Induction; 2440 if (IV != OldInduction) { 2441 ScalarIV = IV->getType()->isIntegerTy() 2442 ? Builder.CreateSExtOrTrunc(Induction, IV->getType()) 2443 : Builder.CreateCast(Instruction::SIToFP, Induction, 2444 IV->getType()); 2445 ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID); 2446 ScalarIV->setName("offset.idx"); 2447 } 2448 if (Trunc) { 2449 auto *TruncType = cast<IntegerType>(Trunc->getType()); 2450 assert(Step->getType()->isIntegerTy() && 2451 "Truncation requires an integer step"); 2452 ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType); 2453 Step = Builder.CreateTrunc(Step, TruncType); 2454 } 2455 return ScalarIV; 2456 }; 2457 2458 // Create the vector values from the scalar IV, in the absence of creating a 2459 // vector IV. 2460 auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) { 2461 Value *Broadcasted = getBroadcastInstrs(ScalarIV); 2462 for (unsigned Part = 0; Part < UF; ++Part) { 2463 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2464 Value *EntryPart = 2465 getStepVector(Broadcasted, VF.getKnownMinValue() * Part, Step, 2466 ID.getInductionOpcode()); 2467 State.set(Def, EntryPart, Part); 2468 if (Trunc) 2469 addMetadata(EntryPart, Trunc); 2470 recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, CastDef, 2471 State, Part); 2472 } 2473 }; 2474 2475 // Fast-math-flags propagate from the original induction instruction. 2476 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 2477 if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp())) 2478 Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags()); 2479 2480 // Now do the actual transformations, and start with creating the step value. 2481 Value *Step = CreateStepValue(ID.getStep()); 2482 if (VF.isZero() || VF.isScalar()) { 2483 Value *ScalarIV = CreateScalarIV(Step); 2484 CreateSplatIV(ScalarIV, Step); 2485 return; 2486 } 2487 2488 // Determine if we want a scalar version of the induction variable. This is 2489 // true if the induction variable itself is not widened, or if it has at 2490 // least one user in the loop that is not widened. 2491 auto NeedsScalarIV = needsScalarInduction(EntryVal); 2492 if (!NeedsScalarIV) { 2493 createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef, 2494 State); 2495 return; 2496 } 2497 2498 // Try to create a new independent vector induction variable. If we can't 2499 // create the phi node, we will splat the scalar induction variable in each 2500 // loop iteration. 
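  // (For orientation: in the first case below we emit both a vector IV and
  //  scalar steps for its scalar users; in the fall-through case only scalar
  //  steps are emitted, plus a splat of the scalar IV when tail-folding needs
  //  it to feed the predicate of masked loads/stores.)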
2501 if (!shouldScalarizeInstruction(EntryVal)) { 2502 createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef, 2503 State); 2504 Value *ScalarIV = CreateScalarIV(Step); 2505 // Create scalar steps that can be used by instructions we will later 2506 // scalarize. Note that the addition of the scalar steps will not increase 2507 // the number of instructions in the loop in the common case prior to 2508 // InstCombine. We will be trading one vector extract for each scalar step. 2509 buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State); 2510 return; 2511 } 2512 2513 // All IV users are scalar instructions, so only emit a scalar IV, not a 2514 // vectorised IV. Except when we tail-fold, then the splat IV feeds the 2515 // predicate used by the masked loads/stores. 2516 Value *ScalarIV = CreateScalarIV(Step); 2517 if (!Cost->isScalarEpilogueAllowed()) 2518 CreateSplatIV(ScalarIV, Step); 2519 buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State); 2520 } 2521 2522 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step, 2523 Instruction::BinaryOps BinOp) { 2524 // Create and check the types. 2525 auto *ValVTy = cast<VectorType>(Val->getType()); 2526 ElementCount VLen = ValVTy->getElementCount(); 2527 2528 Type *STy = Val->getType()->getScalarType(); 2529 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && 2530 "Induction Step must be an integer or FP"); 2531 assert(Step->getType() == STy && "Step has wrong type"); 2532 2533 SmallVector<Constant *, 8> Indices; 2534 2535 // Create a vector of consecutive numbers from zero to VF. 2536 VectorType *InitVecValVTy = ValVTy; 2537 Type *InitVecValSTy = STy; 2538 if (STy->isFloatingPointTy()) { 2539 InitVecValSTy = 2540 IntegerType::get(STy->getContext(), STy->getScalarSizeInBits()); 2541 InitVecValVTy = VectorType::get(InitVecValSTy, VLen); 2542 } 2543 Value *InitVec = Builder.CreateStepVector(InitVecValVTy); 2544 2545 // Add on StartIdx 2546 Value *StartIdxSplat = Builder.CreateVectorSplat( 2547 VLen, ConstantInt::get(InitVecValSTy, StartIdx)); 2548 InitVec = Builder.CreateAdd(InitVec, StartIdxSplat); 2549 2550 if (STy->isIntegerTy()) { 2551 Step = Builder.CreateVectorSplat(VLen, Step); 2552 assert(Step->getType() == Val->getType() && "Invalid step vec"); 2553 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 2554 // which can be found from the original scalar operations. 2555 Step = Builder.CreateMul(InitVec, Step); 2556 return Builder.CreateAdd(Val, Step, "induction"); 2557 } 2558 2559 // Floating point induction. 2560 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && 2561 "Binary Opcode should be specified for FP induction"); 2562 InitVec = Builder.CreateUIToFP(InitVec, ValVTy); 2563 Step = Builder.CreateVectorSplat(VLen, Step); 2564 Value *MulOp = Builder.CreateFMul(InitVec, Step); 2565 return Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); 2566 } 2567 2568 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, 2569 Instruction *EntryVal, 2570 const InductionDescriptor &ID, 2571 VPValue *Def, VPValue *CastDef, 2572 VPTransformState &State) { 2573 // We shouldn't have to build scalar steps if we aren't vectorizing. 2574 assert(VF.isVector() && "VF should be greater than one"); 2575 // Get the value type and ensure it and the step have the same integer type. 
2576 Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); 2577 assert(ScalarIVTy == Step->getType() && 2578 "Val and Step should have the same type"); 2579 2580 // We build scalar steps for both integer and floating-point induction 2581 // variables. Here, we determine the kind of arithmetic we will perform. 2582 Instruction::BinaryOps AddOp; 2583 Instruction::BinaryOps MulOp; 2584 if (ScalarIVTy->isIntegerTy()) { 2585 AddOp = Instruction::Add; 2586 MulOp = Instruction::Mul; 2587 } else { 2588 AddOp = ID.getInductionOpcode(); 2589 MulOp = Instruction::FMul; 2590 } 2591 2592 // Determine the number of scalars we need to generate for each unroll 2593 // iteration. If EntryVal is uniform, we only need to generate the first 2594 // lane. Otherwise, we generate all VF values. 2595 bool IsUniform = 2596 Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF); 2597 unsigned Lanes = IsUniform ? 1 : VF.getKnownMinValue(); 2598 // Compute the scalar steps and save the results in State. 2599 Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(), 2600 ScalarIVTy->getScalarSizeInBits()); 2601 Type *VecIVTy = nullptr; 2602 Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr; 2603 if (!IsUniform && VF.isScalable()) { 2604 VecIVTy = VectorType::get(ScalarIVTy, VF); 2605 UnitStepVec = Builder.CreateStepVector(VectorType::get(IntStepTy, VF)); 2606 SplatStep = Builder.CreateVectorSplat(VF, Step); 2607 SplatIV = Builder.CreateVectorSplat(VF, ScalarIV); 2608 } 2609 2610 for (unsigned Part = 0; Part < UF; ++Part) { 2611 Value *StartIdx0 = 2612 createStepForVF(Builder, ConstantInt::get(IntStepTy, Part), VF); 2613 2614 if (!IsUniform && VF.isScalable()) { 2615 auto *SplatStartIdx = Builder.CreateVectorSplat(VF, StartIdx0); 2616 auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec); 2617 if (ScalarIVTy->isFloatingPointTy()) 2618 InitVec = Builder.CreateSIToFP(InitVec, VecIVTy); 2619 auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep); 2620 auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul); 2621 State.set(Def, Add, Part); 2622 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State, 2623 Part); 2624 // It's useful to record the lane values too for the known minimum number 2625 // of elements so we do those below. This improves the code quality when 2626 // trying to extract the first element, for example. 2627 } 2628 2629 if (ScalarIVTy->isFloatingPointTy()) 2630 StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy); 2631 2632 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 2633 Value *StartIdx = Builder.CreateBinOp( 2634 AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane)); 2635 // The step returned by `createStepForVF` is a runtime-evaluated value 2636 // when VF is scalable. Otherwise, it should be folded into a Constant. 
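      // (Illustration with assumed values: for VF = <vscale x 4> and Part 1,
      //  StartIdx0 above is 4 * vscale, materialized at runtime via
      //  llvm.vscale; for fixed VF = 4 it folds to the constant 4.)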
2637 assert((VF.isScalable() || isa<Constant>(StartIdx)) && 2638 "Expected StartIdx to be folded to a constant when VF is not " 2639 "scalable"); 2640 auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step); 2641 auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul); 2642 State.set(Def, Add, VPIteration(Part, Lane)); 2643 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State, 2644 Part, Lane); 2645 } 2646 } 2647 } 2648 2649 void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def, 2650 const VPIteration &Instance, 2651 VPTransformState &State) { 2652 Value *ScalarInst = State.get(Def, Instance); 2653 Value *VectorValue = State.get(Def, Instance.Part); 2654 VectorValue = Builder.CreateInsertElement( 2655 VectorValue, ScalarInst, 2656 Instance.Lane.getAsRuntimeExpr(State.Builder, VF)); 2657 State.set(Def, VectorValue, Instance.Part); 2658 } 2659 2660 Value *InnerLoopVectorizer::reverseVector(Value *Vec) { 2661 assert(Vec->getType()->isVectorTy() && "Invalid type"); 2662 return Builder.CreateVectorReverse(Vec, "reverse"); 2663 } 2664 2665 // Return whether we allow using masked interleave-groups (for dealing with 2666 // strided loads/stores that reside in predicated blocks, or for dealing 2667 // with gaps). 2668 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { 2669 // If an override option has been passed in for interleaved accesses, use it. 2670 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) 2671 return EnableMaskedInterleavedMemAccesses; 2672 2673 return TTI.enableMaskedInterleavedAccessVectorization(); 2674 } 2675 2676 // Try to vectorize the interleave group that \p Instr belongs to. 2677 // 2678 // E.g. Translate following interleaved load group (factor = 3): 2679 // for (i = 0; i < N; i+=3) { 2680 // R = Pic[i]; // Member of index 0 2681 // G = Pic[i+1]; // Member of index 1 2682 // B = Pic[i+2]; // Member of index 2 2683 // ... // do something to R, G, B 2684 // } 2685 // To: 2686 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2687 // %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements 2688 // %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements 2689 // %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements 2690 // 2691 // Or translate following interleaved store group (factor = 3): 2692 // for (i = 0; i < N; i+=3) { 2693 // ... do something to R, G, B 2694 // Pic[i] = R; // Member of index 0 2695 // Pic[i+1] = G; // Member of index 1 2696 // Pic[i+2] = B; // Member of index 2 2697 // } 2698 // To: 2699 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> 2700 // %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u> 2701 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, 2702 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements 2703 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B 2704 void InnerLoopVectorizer::vectorizeInterleaveGroup( 2705 const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs, 2706 VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues, 2707 VPValue *BlockInMask) { 2708 Instruction *Instr = Group->getInsertPos(); 2709 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2710 2711 // Prepare for the vector type of the interleaved load/store. 
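  // For example, a factor-3 group of i32 accesses vectorized with a fixed VF
  // of 4 (as in the R,G,B example above) uses <12 x i32> as the wide vector
  // type below.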
2712 Type *ScalarTy = getLoadStoreType(Instr); 2713 unsigned InterleaveFactor = Group->getFactor(); 2714 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2715 auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor); 2716 2717 // Prepare for the new pointers. 2718 SmallVector<Value *, 2> AddrParts; 2719 unsigned Index = Group->getIndex(Instr); 2720 2721 // TODO: extend the masked interleaved-group support to reversed access. 2722 assert((!BlockInMask || !Group->isReverse()) && 2723 "Reversed masked interleave-group not supported."); 2724 2725 // If the group is reverse, adjust the index to refer to the last vector lane 2726 // instead of the first. We adjust the index from the first vector lane, 2727 // rather than directly getting the pointer for lane VF - 1, because the 2728 // pointer operand of the interleaved access is supposed to be uniform. For 2729 // uniform instructions, we're only required to generate a value for the 2730 // first vector lane in each unroll iteration. 2731 if (Group->isReverse()) 2732 Index += (VF.getKnownMinValue() - 1) * Group->getFactor(); 2733 2734 for (unsigned Part = 0; Part < UF; Part++) { 2735 Value *AddrPart = State.get(Addr, VPIteration(Part, 0)); 2736 setDebugLocFromInst(AddrPart); 2737 2738 // Notice current instruction could be any index. Need to adjust the address 2739 // to the member of index 0. 2740 // 2741 // E.g. a = A[i+1]; // Member of index 1 (Current instruction) 2742 // b = A[i]; // Member of index 0 2743 // Current pointer is pointed to A[i+1], adjust it to A[i]. 2744 // 2745 // E.g. A[i+1] = a; // Member of index 1 2746 // A[i] = b; // Member of index 0 2747 // A[i+2] = c; // Member of index 2 (Current instruction) 2748 // Current pointer is pointed to A[i+2], adjust it to A[i]. 2749 2750 bool InBounds = false; 2751 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts())) 2752 InBounds = gep->isInBounds(); 2753 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index)); 2754 cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds); 2755 2756 // Cast to the vector pointer type. 2757 unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace(); 2758 Type *PtrTy = VecTy->getPointerTo(AddressSpace); 2759 AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy)); 2760 } 2761 2762 setDebugLocFromInst(Instr); 2763 Value *PoisonVec = PoisonValue::get(VecTy); 2764 2765 Value *MaskForGaps = nullptr; 2766 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) { 2767 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); 2768 assert(MaskForGaps && "Mask for Gaps is required but it is null"); 2769 } 2770 2771 // Vectorize the interleaved load group. 2772 if (isa<LoadInst>(Instr)) { 2773 // For each unroll part, create a wide load for the group. 2774 SmallVector<Value *, 2> NewLoads; 2775 for (unsigned Part = 0; Part < UF; Part++) { 2776 Instruction *NewLoad; 2777 if (BlockInMask || MaskForGaps) { 2778 assert(useMaskedInterleavedAccesses(*TTI) && 2779 "masked interleaved groups are not allowed."); 2780 Value *GroupMask = MaskForGaps; 2781 if (BlockInMask) { 2782 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2783 Value *ShuffledMask = Builder.CreateShuffleVector( 2784 BlockInMaskPart, 2785 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2786 "interleaved.mask"); 2787 GroupMask = MaskForGaps 2788 ? 
Builder.CreateBinOp(Instruction::And, ShuffledMask, 2789 MaskForGaps) 2790 : ShuffledMask; 2791 } 2792 NewLoad = 2793 Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(), 2794 GroupMask, PoisonVec, "wide.masked.vec"); 2795 } 2796 else 2797 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], 2798 Group->getAlign(), "wide.vec"); 2799 Group->addMetadata(NewLoad); 2800 NewLoads.push_back(NewLoad); 2801 } 2802 2803 // For each member in the group, shuffle out the appropriate data from the 2804 // wide loads. 2805 unsigned J = 0; 2806 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2807 Instruction *Member = Group->getMember(I); 2808 2809 // Skip the gaps in the group. 2810 if (!Member) 2811 continue; 2812 2813 auto StrideMask = 2814 createStrideMask(I, InterleaveFactor, VF.getKnownMinValue()); 2815 for (unsigned Part = 0; Part < UF; Part++) { 2816 Value *StridedVec = Builder.CreateShuffleVector( 2817 NewLoads[Part], StrideMask, "strided.vec"); 2818 2819 // If this member has different type, cast the result type. 2820 if (Member->getType() != ScalarTy) { 2821 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2822 VectorType *OtherVTy = VectorType::get(Member->getType(), VF); 2823 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); 2824 } 2825 2826 if (Group->isReverse()) 2827 StridedVec = reverseVector(StridedVec); 2828 2829 State.set(VPDefs[J], StridedVec, Part); 2830 } 2831 ++J; 2832 } 2833 return; 2834 } 2835 2836 // The sub vector type for current instruction. 2837 auto *SubVT = VectorType::get(ScalarTy, VF); 2838 2839 // Vectorize the interleaved store group. 2840 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); 2841 assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) && 2842 "masked interleaved groups are not allowed."); 2843 assert((!MaskForGaps || !VF.isScalable()) && 2844 "masking gaps for scalable vectors is not yet supported."); 2845 for (unsigned Part = 0; Part < UF; Part++) { 2846 // Collect the stored vector from each member. 2847 SmallVector<Value *, 4> StoredVecs; 2848 for (unsigned i = 0; i < InterleaveFactor; i++) { 2849 assert((Group->getMember(i) || MaskForGaps) && 2850 "Fail to get a member from an interleaved store group"); 2851 Instruction *Member = Group->getMember(i); 2852 2853 // Skip the gaps in the group. 2854 if (!Member) { 2855 Value *Undef = PoisonValue::get(SubVT); 2856 StoredVecs.push_back(Undef); 2857 continue; 2858 } 2859 2860 Value *StoredVec = State.get(StoredValues[i], Part); 2861 2862 if (Group->isReverse()) 2863 StoredVec = reverseVector(StoredVec); 2864 2865 // If this member has different type, cast it to a unified type. 2866 2867 if (StoredVec->getType() != SubVT) 2868 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL); 2869 2870 StoredVecs.push_back(StoredVec); 2871 } 2872 2873 // Concatenate all vectors into a wide vector. 2874 Value *WideVec = concatenateVectors(Builder, StoredVecs); 2875 2876 // Interleave the elements in the wide vector. 
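    // E.g., for a fixed VF of 4 and an interleave factor of 3,
    // createInterleaveMask produces <0,4,8,1,5,9,2,6,10,3,7,11>, matching the
    // store example in the comment at the top of this function.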
2877 Value *IVec = Builder.CreateShuffleVector( 2878 WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor), 2879 "interleaved.vec"); 2880 2881 Instruction *NewStoreInstr; 2882 if (BlockInMask || MaskForGaps) { 2883 Value *GroupMask = MaskForGaps; 2884 if (BlockInMask) { 2885 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2886 Value *ShuffledMask = Builder.CreateShuffleVector( 2887 BlockInMaskPart, 2888 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2889 "interleaved.mask"); 2890 GroupMask = MaskForGaps ? Builder.CreateBinOp(Instruction::And, 2891 ShuffledMask, MaskForGaps) 2892 : ShuffledMask; 2893 } 2894 NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part], 2895 Group->getAlign(), GroupMask); 2896 } else 2897 NewStoreInstr = 2898 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign()); 2899 2900 Group->addMetadata(NewStoreInstr); 2901 } 2902 } 2903 2904 void InnerLoopVectorizer::vectorizeMemoryInstruction( 2905 Instruction *Instr, VPTransformState &State, VPValue *Def, VPValue *Addr, 2906 VPValue *StoredValue, VPValue *BlockInMask) { 2907 // Attempt to issue a wide load. 2908 LoadInst *LI = dyn_cast<LoadInst>(Instr); 2909 StoreInst *SI = dyn_cast<StoreInst>(Instr); 2910 2911 assert((LI || SI) && "Invalid Load/Store instruction"); 2912 assert((!SI || StoredValue) && "No stored value provided for widened store"); 2913 assert((!LI || !StoredValue) && "Stored value provided for widened load"); 2914 2915 LoopVectorizationCostModel::InstWidening Decision = 2916 Cost->getWideningDecision(Instr, VF); 2917 assert((Decision == LoopVectorizationCostModel::CM_Widen || 2918 Decision == LoopVectorizationCostModel::CM_Widen_Reverse || 2919 Decision == LoopVectorizationCostModel::CM_GatherScatter) && 2920 "CM decision is not to widen the memory instruction"); 2921 2922 Type *ScalarDataTy = getLoadStoreType(Instr); 2923 2924 auto *DataTy = VectorType::get(ScalarDataTy, VF); 2925 const Align Alignment = getLoadStoreAlignment(Instr); 2926 2927 // Determine if the pointer operand of the access is either consecutive or 2928 // reverse consecutive. 2929 bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse); 2930 bool ConsecutiveStride = 2931 Reverse || (Decision == LoopVectorizationCostModel::CM_Widen); 2932 bool CreateGatherScatter = 2933 (Decision == LoopVectorizationCostModel::CM_GatherScatter); 2934 2935 // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector 2936 // gather/scatter. Otherwise Decision should have been to Scalarize. 2937 assert((ConsecutiveStride || CreateGatherScatter) && 2938 "The instruction should be scalarized"); 2939 (void)ConsecutiveStride; 2940 2941 VectorParts BlockInMaskParts(UF); 2942 bool isMaskRequired = BlockInMask; 2943 if (isMaskRequired) 2944 for (unsigned Part = 0; Part < UF; ++Part) 2945 BlockInMaskParts[Part] = State.get(BlockInMask, Part); 2946 2947 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { 2948 // Calculate the pointer for the specific unroll-part. 2949 GetElementPtrInst *PartPtr = nullptr; 2950 2951 bool InBounds = false; 2952 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) 2953 InBounds = gep->isInBounds(); 2954 if (Reverse) { 2955 // If the address is consecutive but reversed, then the 2956 // wide store needs to start at the last vector element. 
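      // As a worked example with a fixed VF of 4: Part 0 applies the offsets
      // 0 and 1 - 4 = -3, i.e. the pointer is moved back by 3 elements so its
      // 4 lanes cover the last 4 elements; Part 1 additionally steps back by
      // -1 * 4 = -4 elements, and so on (illustrative values).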
2957 // RunTimeVF = VScale * VF.getKnownMinValue() 2958 // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue() 2959 Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), VF); 2960 // NumElt = -Part * RunTimeVF 2961 Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF); 2962 // LastLane = 1 - RunTimeVF 2963 Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF); 2964 PartPtr = 2965 cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt)); 2966 PartPtr->setIsInBounds(InBounds); 2967 PartPtr = cast<GetElementPtrInst>( 2968 Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane)); 2969 PartPtr->setIsInBounds(InBounds); 2970 if (isMaskRequired) // Reverse of a null all-one mask is a null mask. 2971 BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]); 2972 } else { 2973 Value *Increment = createStepForVF(Builder, Builder.getInt32(Part), VF); 2974 PartPtr = cast<GetElementPtrInst>( 2975 Builder.CreateGEP(ScalarDataTy, Ptr, Increment)); 2976 PartPtr->setIsInBounds(InBounds); 2977 } 2978 2979 unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); 2980 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); 2981 }; 2982 2983 // Handle Stores: 2984 if (SI) { 2985 setDebugLocFromInst(SI); 2986 2987 for (unsigned Part = 0; Part < UF; ++Part) { 2988 Instruction *NewSI = nullptr; 2989 Value *StoredVal = State.get(StoredValue, Part); 2990 if (CreateGatherScatter) { 2991 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 2992 Value *VectorGep = State.get(Addr, Part); 2993 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, 2994 MaskPart); 2995 } else { 2996 if (Reverse) { 2997 // If we store to reverse consecutive memory locations, then we need 2998 // to reverse the order of elements in the stored value. 2999 StoredVal = reverseVector(StoredVal); 3000 // We don't want to update the value in the map as it might be used in 3001 // another expression. So don't call resetVectorValue(StoredVal). 3002 } 3003 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0))); 3004 if (isMaskRequired) 3005 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, 3006 BlockInMaskParts[Part]); 3007 else 3008 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); 3009 } 3010 addMetadata(NewSI, SI); 3011 } 3012 return; 3013 } 3014 3015 // Handle loads. 3016 assert(LI && "Must have a load instruction"); 3017 setDebugLocFromInst(LI); 3018 for (unsigned Part = 0; Part < UF; ++Part) { 3019 Value *NewLI; 3020 if (CreateGatherScatter) { 3021 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 3022 Value *VectorGep = State.get(Addr, Part); 3023 NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart, 3024 nullptr, "wide.masked.gather"); 3025 addMetadata(NewLI, LI); 3026 } else { 3027 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0))); 3028 if (isMaskRequired) 3029 NewLI = Builder.CreateMaskedLoad( 3030 DataTy, VecPtr, Alignment, BlockInMaskParts[Part], 3031 PoisonValue::get(DataTy), "wide.masked.load"); 3032 else 3033 NewLI = 3034 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); 3035 3036 // Add metadata to the load, but setVectorValue to the reverse shuffle. 
3037 addMetadata(NewLI, LI); 3038 if (Reverse) 3039 NewLI = reverseVector(NewLI); 3040 } 3041 3042 State.set(Def, NewLI, Part); 3043 } 3044 } 3045 3046 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPValue *Def, 3047 VPUser &User, 3048 const VPIteration &Instance, 3049 bool IfPredicateInstr, 3050 VPTransformState &State) { 3051 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 3052 3053 // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for 3054 // the first lane and part. 3055 if (isa<NoAliasScopeDeclInst>(Instr)) 3056 if (!Instance.isFirstIteration()) 3057 return; 3058 3059 setDebugLocFromInst(Instr); 3060 3061 // Does this instruction return a value ? 3062 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 3063 3064 Instruction *Cloned = Instr->clone(); 3065 if (!IsVoidRetTy) 3066 Cloned->setName(Instr->getName() + ".cloned"); 3067 3068 State.Builder.SetInsertPoint(Builder.GetInsertBlock(), 3069 Builder.GetInsertPoint()); 3070 // Replace the operands of the cloned instructions with their scalar 3071 // equivalents in the new loop. 3072 for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) { 3073 auto *Operand = dyn_cast<Instruction>(Instr->getOperand(op)); 3074 auto InputInstance = Instance; 3075 if (!Operand || !OrigLoop->contains(Operand) || 3076 (Cost->isUniformAfterVectorization(Operand, State.VF))) 3077 InputInstance.Lane = VPLane::getFirstLane(); 3078 auto *NewOp = State.get(User.getOperand(op), InputInstance); 3079 Cloned->setOperand(op, NewOp); 3080 } 3081 addNewMetadata(Cloned, Instr); 3082 3083 // Place the cloned scalar in the new loop. 3084 Builder.Insert(Cloned); 3085 3086 State.set(Def, Cloned, Instance); 3087 3088 // If we just cloned a new assumption, add it the assumption cache. 3089 if (auto *II = dyn_cast<AssumeInst>(Cloned)) 3090 AC->registerAssumption(II); 3091 3092 // End if-block. 3093 if (IfPredicateInstr) 3094 PredicatedInstructions.push_back(Cloned); 3095 } 3096 3097 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start, 3098 Value *End, Value *Step, 3099 Instruction *DL) { 3100 BasicBlock *Header = L->getHeader(); 3101 BasicBlock *Latch = L->getLoopLatch(); 3102 // As we're just creating this loop, it's possible no latch exists 3103 // yet. If so, use the header as this will be a single block loop. 3104 if (!Latch) 3105 Latch = Header; 3106 3107 IRBuilder<> B(&*Header->getFirstInsertionPt()); 3108 Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction); 3109 setDebugLocFromInst(OldInst, &B); 3110 auto *Induction = B.CreatePHI(Start->getType(), 2, "index"); 3111 3112 B.SetInsertPoint(Latch->getTerminator()); 3113 setDebugLocFromInst(OldInst, &B); 3114 3115 // Create i+1 and fill the PHINode. 3116 // 3117 // If the tail is not folded, we know that End - Start >= Step (either 3118 // statically or through the minimum iteration checks). We also know that both 3119 // Start % Step == 0 and End % Step == 0. We exit the vector loop if %IV + 3120 // %Step == %End. Hence we must exit the loop before %IV + %Step unsigned 3121 // overflows and we can mark the induction increment as NUW. 3122 Value *Next = B.CreateAdd(Induction, Step, "index.next", 3123 /*NUW=*/!Cost->foldTailByMasking(), /*NSW=*/false); 3124 Induction->addIncoming(Start, L->getLoopPreheader()); 3125 Induction->addIncoming(Next, Latch); 3126 // Create the compare. 
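  // E.g., with Start = 0, Step = VF * UF = 8 and End = 16, the vector loop
  // runs for index = 0 and index = 8 and exits once index.next reaches 16
  // (illustrative values).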
3127 Value *ICmp = B.CreateICmpEQ(Next, End); 3128 B.CreateCondBr(ICmp, L->getUniqueExitBlock(), Header); 3129 3130 // Now we have two terminators. Remove the old one from the block. 3131 Latch->getTerminator()->eraseFromParent(); 3132 3133 return Induction; 3134 } 3135 3136 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { 3137 if (TripCount) 3138 return TripCount; 3139 3140 assert(L && "Create Trip Count for null loop."); 3141 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 3142 // Find the loop boundaries. 3143 ScalarEvolution *SE = PSE.getSE(); 3144 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 3145 assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && 3146 "Invalid loop count"); 3147 3148 Type *IdxTy = Legal->getWidestInductionType(); 3149 assert(IdxTy && "No type for induction"); 3150 3151 // The exit count might have the type of i64 while the phi is i32. This can 3152 // happen if we have an induction variable that is sign extended before the 3153 // compare. The only way that we get a backedge taken count is that the 3154 // induction variable was signed and as such will not overflow. In such a case 3155 // truncation is legal. 3156 if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) > 3157 IdxTy->getPrimitiveSizeInBits()) 3158 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy); 3159 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); 3160 3161 // Get the total trip count from the count by adding 1. 3162 const SCEV *ExitCount = SE->getAddExpr( 3163 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 3164 3165 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); 3166 3167 // Expand the trip count and place the new instructions in the preheader. 3168 // Notice that the pre-header does not change, only the loop body. 3169 SCEVExpander Exp(*SE, DL, "induction"); 3170 3171 // Count holds the overall loop count (N). 3172 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), 3173 L->getLoopPreheader()->getTerminator()); 3174 3175 if (TripCount->getType()->isPointerTy()) 3176 TripCount = 3177 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", 3178 L->getLoopPreheader()->getTerminator()); 3179 3180 return TripCount; 3181 } 3182 3183 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { 3184 if (VectorTripCount) 3185 return VectorTripCount; 3186 3187 Value *TC = getOrCreateTripCount(L); 3188 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 3189 3190 Type *Ty = TC->getType(); 3191 // This is where we can make the step a runtime constant. 3192 Value *Step = createStepForVF(Builder, ConstantInt::get(Ty, UF), VF); 3193 3194 // If the tail is to be folded by masking, round the number of iterations N 3195 // up to a multiple of Step instead of rounding down. This is done by first 3196 // adding Step-1 and then rounding down. Note that it's ok if this addition 3197 // overflows: the vector induction variable will eventually wrap to zero given 3198 // that it starts at zero and its Step is a power of two; the loop will then 3199 // exit, with the last early-exit vector comparison also producing all-true. 
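  // As a worked example with VF = 4 and UF = 2 (Step = 8) and N = 10: without
  // tail folding, n.mod.vf = 10 % 8 = 2 and n.vec = 8, leaving 2 scalar
  // iterations; with tail folding, N is first rounded up to 10 + 7 = 17, so
  // n.vec = 17 - (17 % 8) = 16 and two masked vector iterations cover all 10
  // elements (illustrative values).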
3200 if (Cost->foldTailByMasking()) { 3201 assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && 3202 "VF*UF must be a power of 2 when folding tail by masking"); 3203 assert(!VF.isScalable() && 3204 "Tail folding not yet supported for scalable vectors"); 3205 TC = Builder.CreateAdd( 3206 TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up"); 3207 } 3208 3209 // Now we need to generate the expression for the part of the loop that the 3210 // vectorized body will execute. This is equal to N - (N % Step) if scalar 3211 // iterations are not required for correctness, or N - Step, otherwise. Step 3212 // is equal to the vectorization factor (number of SIMD elements) times the 3213 // unroll factor (number of SIMD instructions). 3214 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); 3215 3216 // There are cases where we *must* run at least one iteration in the remainder 3217 // loop. See the cost model for when this can happen. If the step evenly 3218 // divides the trip count, we set the remainder to be equal to the step. If 3219 // the step does not evenly divide the trip count, no adjustment is necessary 3220 // since there will already be scalar iterations. Note that the minimum 3221 // iterations check ensures that N >= Step. 3222 if (Cost->requiresScalarEpilogue(VF)) { 3223 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); 3224 R = Builder.CreateSelect(IsZero, Step, R); 3225 } 3226 3227 VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); 3228 3229 return VectorTripCount; 3230 } 3231 3232 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, 3233 const DataLayout &DL) { 3234 // Verify that V is a vector type with same number of elements as DstVTy. 3235 auto *DstFVTy = cast<FixedVectorType>(DstVTy); 3236 unsigned VF = DstFVTy->getNumElements(); 3237 auto *SrcVecTy = cast<FixedVectorType>(V->getType()); 3238 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); 3239 Type *SrcElemTy = SrcVecTy->getElementType(); 3240 Type *DstElemTy = DstFVTy->getElementType(); 3241 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 3242 "Vector elements must have same size"); 3243 3244 // Do a direct cast if element types are castable. 3245 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 3246 return Builder.CreateBitOrPointerCast(V, DstFVTy); 3247 } 3248 // V cannot be directly casted to desired vector type. 3249 // May happen when V is a floating point vector but DstVTy is a vector of 3250 // pointers or vice-versa. Handle this using a two-step bitcast using an 3251 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 3252 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 3253 "Only one type should be a pointer type"); 3254 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 3255 "Only one type should be a floating point type"); 3256 Type *IntTy = 3257 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 3258 auto *VecIntTy = FixedVectorType::get(IntTy, VF); 3259 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 3260 return Builder.CreateBitOrPointerCast(CastVal, DstFVTy); 3261 } 3262 3263 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, 3264 BasicBlock *Bypass) { 3265 Value *Count = getOrCreateTripCount(L); 3266 // Reuse existing vector loop preheader for TC checks. 3267 // Note that new preheader block is generated for vector loop. 
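  // E.g., with VF = 4 and UF = 2, the minimum-iteration check emitted below
  // sends trip counts below 8 (or equal to 8 when a scalar epilogue must run)
  // straight to the scalar loop (illustrative values).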
3268 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 3269 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 3270 3271 // Generate code to check if the loop's trip count is less than VF * UF, or 3272 // equal to it in case a scalar epilogue is required; this implies that the 3273 // vector trip count is zero. This check also covers the case where adding one 3274 // to the backedge-taken count overflowed leading to an incorrect trip count 3275 // of zero. In this case we will also jump to the scalar loop. 3276 auto P = Cost->requiresScalarEpilogue(VF) ? ICmpInst::ICMP_ULE 3277 : ICmpInst::ICMP_ULT; 3278 3279 // If tail is to be folded, vector loop takes care of all iterations. 3280 Value *CheckMinIters = Builder.getFalse(); 3281 if (!Cost->foldTailByMasking()) { 3282 Value *Step = 3283 createStepForVF(Builder, ConstantInt::get(Count->getType(), UF), VF); 3284 CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check"); 3285 } 3286 // Create new preheader for vector loop. 3287 LoopVectorPreHeader = 3288 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, 3289 "vector.ph"); 3290 3291 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 3292 DT->getNode(Bypass)->getIDom()) && 3293 "TC check is expected to dominate Bypass"); 3294 3295 // Update dominator for Bypass & LoopExit (if needed). 3296 DT->changeImmediateDominator(Bypass, TCCheckBlock); 3297 if (!Cost->requiresScalarEpilogue(VF)) 3298 // If there is an epilogue which must run, there's no edge from the 3299 // middle block to exit blocks and thus no need to update the immediate 3300 // dominator of the exit blocks. 3301 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 3302 3303 ReplaceInstWithInst( 3304 TCCheckBlock->getTerminator(), 3305 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 3306 LoopBypassBlocks.push_back(TCCheckBlock); 3307 } 3308 3309 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { 3310 3311 BasicBlock *const SCEVCheckBlock = 3312 RTChecks.emitSCEVChecks(L, Bypass, LoopVectorPreHeader, LoopExitBlock); 3313 if (!SCEVCheckBlock) 3314 return nullptr; 3315 3316 assert(!(SCEVCheckBlock->getParent()->hasOptSize() || 3317 (OptForSizeBasedOnProfile && 3318 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) && 3319 "Cannot SCEV check stride or overflow when optimizing for size"); 3320 3321 3322 // Update dominator only if this is first RT check. 3323 if (LoopBypassBlocks.empty()) { 3324 DT->changeImmediateDominator(Bypass, SCEVCheckBlock); 3325 if (!Cost->requiresScalarEpilogue(VF)) 3326 // If there is an epilogue which must run, there's no edge from the 3327 // middle block to exit blocks and thus no need to update the immediate 3328 // dominator of the exit blocks. 3329 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock); 3330 } 3331 3332 LoopBypassBlocks.push_back(SCEVCheckBlock); 3333 AddedSafetyChecks = true; 3334 return SCEVCheckBlock; 3335 } 3336 3337 BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, 3338 BasicBlock *Bypass) { 3339 // VPlan-native path does not do any analysis for runtime checks currently. 3340 if (EnableVPlanNativePath) 3341 return nullptr; 3342 3343 BasicBlock *const MemCheckBlock = 3344 RTChecks.emitMemRuntimeChecks(L, Bypass, LoopVectorPreHeader); 3345 3346 // Check if we generated code that checks in runtime if arrays overlap. We put 3347 // the checks into a separate block to make the more common case of few 3348 // elements faster. 
3349 if (!MemCheckBlock) 3350 return nullptr; 3351 3352 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) { 3353 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && 3354 "Cannot emit memory checks when optimizing for size, unless forced " 3355 "to vectorize."); 3356 ORE->emit([&]() { 3357 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize", 3358 L->getStartLoc(), L->getHeader()) 3359 << "Code-size may be reduced by not forcing " 3360 "vectorization, or by source-code modifications " 3361 "eliminating the need for runtime checks " 3362 "(e.g., adding 'restrict')."; 3363 }); 3364 } 3365 3366 LoopBypassBlocks.push_back(MemCheckBlock); 3367 3368 AddedSafetyChecks = true; 3369 3370 // We currently don't use LoopVersioning for the actual loop cloning but we 3371 // still use it to add the noalias metadata. 3372 LVer = std::make_unique<LoopVersioning>( 3373 *Legal->getLAI(), 3374 Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, 3375 DT, PSE.getSE()); 3376 LVer->prepareNoAliasMetadata(); 3377 return MemCheckBlock; 3378 } 3379 3380 Value *InnerLoopVectorizer::emitTransformedIndex( 3381 IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL, 3382 const InductionDescriptor &ID) const { 3383 3384 SCEVExpander Exp(*SE, DL, "induction"); 3385 auto Step = ID.getStep(); 3386 auto StartValue = ID.getStartValue(); 3387 assert(Index->getType()->getScalarType() == Step->getType() && 3388 "Index scalar type does not match StepValue type"); 3389 3390 // Note: the IR at this point is broken. We cannot use SE to create any new 3391 // SCEV and then expand it, hoping that SCEV's simplification will give us 3392 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may 3393 // lead to various SCEV crashes. So all we can do is to use builder and rely 3394 // on InstCombine for future simplifications. Here we handle some trivial 3395 // cases only. 3396 auto CreateAdd = [&B](Value *X, Value *Y) { 3397 assert(X->getType() == Y->getType() && "Types don't match!"); 3398 if (auto *CX = dyn_cast<ConstantInt>(X)) 3399 if (CX->isZero()) 3400 return Y; 3401 if (auto *CY = dyn_cast<ConstantInt>(Y)) 3402 if (CY->isZero()) 3403 return X; 3404 return B.CreateAdd(X, Y); 3405 }; 3406 3407 // We allow X to be a vector type, in which case Y will potentially be 3408 // splatted into a vector with the same element count. 3409 auto CreateMul = [&B](Value *X, Value *Y) { 3410 assert(X->getType()->getScalarType() == Y->getType() && 3411 "Types don't match!"); 3412 if (auto *CX = dyn_cast<ConstantInt>(X)) 3413 if (CX->isOne()) 3414 return Y; 3415 if (auto *CY = dyn_cast<ConstantInt>(Y)) 3416 if (CY->isOne()) 3417 return X; 3418 VectorType *XVTy = dyn_cast<VectorType>(X->getType()); 3419 if (XVTy && !isa<VectorType>(Y->getType())) 3420 Y = B.CreateVectorSplat(XVTy->getElementCount(), Y); 3421 return B.CreateMul(X, Y); 3422 }; 3423 3424 // Get a suitable insert point for SCEV expansion. For blocks in the vector 3425 // loop, choose the end of the vector loop header (=LoopVectorBody), because 3426 // the DomTree is not kept up-to-date for additional blocks generated in the 3427 // vector loop. By using the header as insertion point, we guarantee that the 3428 // expanded instructions dominate all their uses. 
  auto GetInsertPoint = [this, &B]() {
    BasicBlock *InsertBB = B.GetInsertPoint()->getParent();
    if (InsertBB != LoopVectorBody &&
        LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB))
      return LoopVectorBody->getTerminator();
    return &*B.GetInsertPoint();
  };

  switch (ID.getKind()) {
  case InductionDescriptor::IK_IntInduction: {
    assert(!isa<VectorType>(Index->getType()) &&
           "Vector indices not supported for integer inductions yet");
    assert(Index->getType() == StartValue->getType() &&
           "Index type does not match StartValue type");
    if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
      return B.CreateSub(StartValue, Index);
    auto *Offset = CreateMul(
        Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()));
    return CreateAdd(StartValue, Offset);
  }
  case InductionDescriptor::IK_PtrInduction: {
    assert(isa<SCEVConstant>(Step) &&
           "Expected constant step for pointer induction");
    return B.CreateGEP(
        StartValue->getType()->getPointerElementType(), StartValue,
        CreateMul(Index,
                  Exp.expandCodeFor(Step, Index->getType()->getScalarType(),
                                    GetInsertPoint())));
  }
  case InductionDescriptor::IK_FpInduction: {
    assert(!isa<VectorType>(Index->getType()) &&
           "Vector indices not supported for FP inductions yet");
    assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
    auto InductionBinOp = ID.getInductionBinOp();
    assert(InductionBinOp &&
           (InductionBinOp->getOpcode() == Instruction::FAdd ||
            InductionBinOp->getOpcode() == Instruction::FSub) &&
           "Original bin op should be defined for FP induction");

    Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
    Value *MulExp = B.CreateFMul(StepValue, Index);
    return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
                         "induction");
  }
  case InductionDescriptor::IK_NoInduction:
    return nullptr;
  }
  llvm_unreachable("invalid enum");
}

Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
  LoopScalarBody = OrigLoop->getHeader();
  LoopVectorPreHeader = OrigLoop->getLoopPreheader();
  assert(LoopVectorPreHeader && "Invalid loop structure");
  LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
  assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF)) &&
         "multiple exit loop without required epilogue?");

  LoopMiddleBlock =
      SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
                 LI, nullptr, Twine(Prefix) + "middle.block");
  LoopScalarPreHeader =
      SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
                 nullptr, Twine(Prefix) + "scalar.ph");

  auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();

  // Set up the middle block terminator. Two cases:
  // 1) If we know that we must execute the scalar epilogue, emit an
  //    unconditional branch.
  // 2) Otherwise, we must have a single unique exit block (due to how we
  //    implement the multiple exit case). In this case, set up a conditional
  //    branch from the middle block to the loop scalar preheader, and the
  //    exit block. completeLoopSkeleton will update the condition to use an
  //    iteration check, if required to decide whether to execute the remainder.
  BranchInst *BrInst = Cost->requiresScalarEpilogue(VF) ?
    BranchInst::Create(LoopScalarPreHeader) :
    BranchInst::Create(LoopExitBlock, LoopScalarPreHeader,
                       Builder.getTrue());
  BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
  ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);

  // We intentionally don't let SplitBlock update LoopInfo since
  // LoopVectorBody should belong to a different loop than LoopVectorPreHeader.
  // LoopVectorBody is explicitly added to the correct place a few lines later.
  LoopVectorBody =
      SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
                 nullptr, nullptr, Twine(Prefix) + "vector.body");

  // Update dominator for loop exit.
  if (!Cost->requiresScalarEpilogue(VF))
    // If there is an epilogue which must run, there's no edge from the
    // middle block to exit blocks and thus no need to update the immediate
    // dominator of the exit blocks.
    DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);

  // Create and register the new vector loop.
  Loop *Lp = LI->AllocateLoop();
  Loop *ParentLoop = OrigLoop->getParentLoop();

  // Insert the new loop into the loop nest and register the new basic blocks
  // before calling any utilities such as SCEV that require valid LoopInfo.
  if (ParentLoop) {
    ParentLoop->addChildLoop(Lp);
  } else {
    LI->addTopLevelLoop(Lp);
  }
  Lp->addBasicBlockToLoop(LoopVectorBody, *LI);
  return Lp;
}

void InnerLoopVectorizer::createInductionResumeValues(
    Loop *L, Value *VectorTripCount,
    std::pair<BasicBlock *, Value *> AdditionalBypass) {
  assert(VectorTripCount && L && "Expected valid arguments");
  assert(((AdditionalBypass.first && AdditionalBypass.second) ||
          (!AdditionalBypass.first && !AdditionalBypass.second)) &&
         "Inconsistent information about additional bypass.");
  // We are going to resume the execution of the scalar loop.
  // Go over all of the induction variables that we found and fix the
  // PHIs that are left in the scalar version of the loop.
  // The starting values of PHI nodes depend on the counter of the last
  // iteration in the vectorized loop.
  // If we come from a bypass edge then we need to start from the original
  // start value.
  for (auto &InductionEntry : Legal->getInductionVars()) {
    PHINode *OrigPhi = InductionEntry.first;
    InductionDescriptor II = InductionEntry.second;

    // Create phi nodes to merge from the backedge-taken check block.
    PHINode *BCResumeVal =
        PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
                        LoopScalarPreHeader->getTerminator());
    // Copy original phi DL over to the new one.
    BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
    Value *&EndValue = IVEndValues[OrigPhi];
    Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
    if (OrigPhi == OldInduction) {
      // We know what the end value is.
      EndValue = VectorTripCount;
    } else {
      IRBuilder<> B(L->getLoopPreheader()->getTerminator());

      // Fast-math-flags propagate from the original induction instruction.
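      // (For illustration: for an integer induction that starts at S with
      // stride C, the end value computed a few lines below is
      // S + C * VectorTripCount, which becomes the scalar loop's resume value
      // when the vector loop is taken.)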
3573 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) 3574 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); 3575 3576 Type *StepType = II.getStep()->getType(); 3577 Instruction::CastOps CastOp = 3578 CastInst::getCastOpcode(VectorTripCount, true, StepType, true); 3579 Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd"); 3580 const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout(); 3581 EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3582 EndValue->setName("ind.end"); 3583 3584 // Compute the end value for the additional bypass (if applicable). 3585 if (AdditionalBypass.first) { 3586 B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt())); 3587 CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true, 3588 StepType, true); 3589 CRD = 3590 B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd"); 3591 EndValueFromAdditionalBypass = 3592 emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3593 EndValueFromAdditionalBypass->setName("ind.end"); 3594 } 3595 } 3596 // The new PHI merges the original incoming value, in case of a bypass, 3597 // or the value at the end of the vectorized loop. 3598 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); 3599 3600 // Fix the scalar body counter (PHI node). 3601 // The old induction's phi node in the scalar body needs the truncated 3602 // value. 3603 for (BasicBlock *BB : LoopBypassBlocks) 3604 BCResumeVal->addIncoming(II.getStartValue(), BB); 3605 3606 if (AdditionalBypass.first) 3607 BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first, 3608 EndValueFromAdditionalBypass); 3609 3610 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); 3611 } 3612 } 3613 3614 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L, 3615 MDNode *OrigLoopID) { 3616 assert(L && "Expected valid loop."); 3617 3618 // The trip counts should be cached by now. 3619 Value *Count = getOrCreateTripCount(L); 3620 Value *VectorTripCount = getOrCreateVectorTripCount(L); 3621 3622 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3623 3624 // Add a check in the middle block to see if we have completed 3625 // all of the iterations in the first vector loop. Three cases: 3626 // 1) If we require a scalar epilogue, there is no conditional branch as 3627 // we unconditionally branch to the scalar preheader. Do nothing. 3628 // 2) If (N - N%VF) == N, then we *don't* need to run the remainder. 3629 // Thus if tail is to be folded, we know we don't need to run the 3630 // remainder and we can use the previous value for the condition (true). 3631 // 3) Otherwise, construct a runtime check. 3632 if (!Cost->requiresScalarEpilogue(VF) && !Cost->foldTailByMasking()) { 3633 Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, 3634 Count, VectorTripCount, "cmp.n", 3635 LoopMiddleBlock->getTerminator()); 3636 3637 // Here we use the same DebugLoc as the scalar loop latch terminator instead 3638 // of the corresponding compare because they may have ended up with 3639 // different line numbers and we want to avoid awkward line stepping while 3640 // debugging. Eg. if the compare has got a line number inside the loop. 3641 CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3642 cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN); 3643 } 3644 3645 // Get ready to start creating new instructions into the vectorized body. 
3646 assert(LoopVectorPreHeader == L->getLoopPreheader() && 3647 "Inconsistent vector loop preheader"); 3648 Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); 3649 3650 Optional<MDNode *> VectorizedLoopID = 3651 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 3652 LLVMLoopVectorizeFollowupVectorized}); 3653 if (VectorizedLoopID.hasValue()) { 3654 L->setLoopID(VectorizedLoopID.getValue()); 3655 3656 // Do not setAlreadyVectorized if loop attributes have been defined 3657 // explicitly. 3658 return LoopVectorPreHeader; 3659 } 3660 3661 // Keep all loop hints from the original loop on the vector loop (we'll 3662 // replace the vectorizer-specific hints below). 3663 if (MDNode *LID = OrigLoop->getLoopID()) 3664 L->setLoopID(LID); 3665 3666 LoopVectorizeHints Hints(L, true, *ORE); 3667 Hints.setAlreadyVectorized(); 3668 3669 #ifdef EXPENSIVE_CHECKS 3670 assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 3671 LI->verify(*DT); 3672 #endif 3673 3674 return LoopVectorPreHeader; 3675 } 3676 3677 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { 3678 /* 3679 In this function we generate a new loop. The new loop will contain 3680 the vectorized instructions while the old loop will continue to run the 3681 scalar remainder. 3682 3683 [ ] <-- loop iteration number check. 3684 / | 3685 / v 3686 | [ ] <-- vector loop bypass (may consist of multiple blocks). 3687 | / | 3688 | / v 3689 || [ ] <-- vector pre header. 3690 |/ | 3691 | v 3692 | [ ] \ 3693 | [ ]_| <-- vector loop. 3694 | | 3695 | v 3696 \ -[ ] <--- middle-block. 3697 \/ | 3698 /\ v 3699 | ->[ ] <--- new preheader. 3700 | | 3701 (opt) v <-- edge from middle to exit iff epilogue is not required. 3702 | [ ] \ 3703 | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue). 3704 \ | 3705 \ v 3706 >[ ] <-- exit block(s). 3707 ... 3708 */ 3709 3710 // Get the metadata of the original loop before it gets modified. 3711 MDNode *OrigLoopID = OrigLoop->getLoopID(); 3712 3713 // Workaround! Compute the trip count of the original loop and cache it 3714 // before we start modifying the CFG. This code has a systemic problem 3715 // wherein it tries to run analysis over partially constructed IR; this is 3716 // wrong, and not simply for SCEV. The trip count of the original loop 3717 // simply happens to be prone to hitting this in practice. In theory, we 3718 // can hit the same issue for any SCEV, or ValueTracking query done during 3719 // mutation. See PR49900. 3720 getOrCreateTripCount(OrigLoop); 3721 3722 // Create an empty vector loop, and prepare basic blocks for the runtime 3723 // checks. 3724 Loop *Lp = createVectorLoopSkeleton(""); 3725 3726 // Now, compare the new count to zero. If it is zero skip the vector loop and 3727 // jump to the scalar loop. This check also covers the case where the 3728 // backedge-taken count is uint##_max: adding one to it will overflow leading 3729 // to an incorrect trip count of zero. In this (rare) case we will also jump 3730 // to the scalar loop. 3731 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader); 3732 3733 // Generate the code to check any assumptions that we've made for SCEV 3734 // expressions. 3735 emitSCEVChecks(Lp, LoopScalarPreHeader); 3736 3737 // Generate the code that checks in runtime if arrays overlap. We put the 3738 // checks into a separate block to make the more common case of few elements 3739 // faster. 
3740 emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 3741 3742 // Some loops have a single integer induction variable, while other loops 3743 // don't. One example is c++ iterators that often have multiple pointer 3744 // induction variables. In the code below we also support a case where we 3745 // don't have a single induction variable. 3746 // 3747 // We try to obtain an induction variable from the original loop as hard 3748 // as possible. However if we don't find one that: 3749 // - is an integer 3750 // - counts from zero, stepping by one 3751 // - is the size of the widest induction variable type 3752 // then we create a new one. 3753 OldInduction = Legal->getPrimaryInduction(); 3754 Type *IdxTy = Legal->getWidestInductionType(); 3755 Value *StartIdx = ConstantInt::get(IdxTy, 0); 3756 // The loop step is equal to the vectorization factor (num of SIMD elements) 3757 // times the unroll factor (num of SIMD instructions). 3758 Builder.SetInsertPoint(&*Lp->getHeader()->getFirstInsertionPt()); 3759 Value *Step = createStepForVF(Builder, ConstantInt::get(IdxTy, UF), VF); 3760 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 3761 Induction = 3762 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 3763 getDebugLocFromInstOrOperands(OldInduction)); 3764 3765 // Emit phis for the new starting index of the scalar loop. 3766 createInductionResumeValues(Lp, CountRoundDown); 3767 3768 return completeLoopSkeleton(Lp, OrigLoopID); 3769 } 3770 3771 // Fix up external users of the induction variable. At this point, we are 3772 // in LCSSA form, with all external PHIs that use the IV having one input value, 3773 // coming from the remainder loop. We need those PHIs to also have a correct 3774 // value for the IV when arriving directly from the middle block. 3775 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, 3776 const InductionDescriptor &II, 3777 Value *CountRoundDown, Value *EndValue, 3778 BasicBlock *MiddleBlock) { 3779 // There are two kinds of external IV usages - those that use the value 3780 // computed in the last iteration (the PHI) and those that use the penultimate 3781 // value (the value that feeds into the phi from the loop latch). 3782 // We allow both, but they, obviously, have different values. 3783 3784 assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block"); 3785 3786 DenseMap<Value *, Value *> MissingVals; 3787 3788 // An external user of the last iteration's value should see the value that 3789 // the remainder loop uses to initialize its own IV. 3790 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch()); 3791 for (User *U : PostInc->users()) { 3792 Instruction *UI = cast<Instruction>(U); 3793 if (!OrigLoop->contains(UI)) { 3794 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3795 MissingVals[UI] = EndValue; 3796 } 3797 } 3798 3799 // An external user of the penultimate value need to see EndValue - Step. 3800 // The simplest way to get this is to recompute it from the constituent SCEVs, 3801 // that is Start + (Step * (CRD - 1)). 3802 for (User *U : OrigPhi->users()) { 3803 auto *UI = cast<Instruction>(U); 3804 if (!OrigLoop->contains(UI)) { 3805 const DataLayout &DL = 3806 OrigLoop->getHeader()->getModule()->getDataLayout(); 3807 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3808 3809 IRBuilder<> B(MiddleBlock->getTerminator()); 3810 3811 // Fast-math-flags propagate from the original induction instruction. 
3812 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) 3813 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); 3814 3815 Value *CountMinusOne = B.CreateSub( 3816 CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1)); 3817 Value *CMO = 3818 !II.getStep()->getType()->isIntegerTy() 3819 ? B.CreateCast(Instruction::SIToFP, CountMinusOne, 3820 II.getStep()->getType()) 3821 : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType()); 3822 CMO->setName("cast.cmo"); 3823 Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II); 3824 Escape->setName("ind.escape"); 3825 MissingVals[UI] = Escape; 3826 } 3827 } 3828 3829 for (auto &I : MissingVals) { 3830 PHINode *PHI = cast<PHINode>(I.first); 3831 // One corner case we have to handle is two IVs "chasing" each-other, 3832 // that is %IV2 = phi [...], [ %IV1, %latch ] 3833 // In this case, if IV1 has an external use, we need to avoid adding both 3834 // "last value of IV1" and "penultimate value of IV2". So, verify that we 3835 // don't already have an incoming value for the middle block. 3836 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) 3837 PHI->addIncoming(I.second, MiddleBlock); 3838 } 3839 } 3840 3841 namespace { 3842 3843 struct CSEDenseMapInfo { 3844 static bool canHandle(const Instruction *I) { 3845 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || 3846 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); 3847 } 3848 3849 static inline Instruction *getEmptyKey() { 3850 return DenseMapInfo<Instruction *>::getEmptyKey(); 3851 } 3852 3853 static inline Instruction *getTombstoneKey() { 3854 return DenseMapInfo<Instruction *>::getTombstoneKey(); 3855 } 3856 3857 static unsigned getHashValue(const Instruction *I) { 3858 assert(canHandle(I) && "Unknown instruction!"); 3859 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), 3860 I->value_op_end())); 3861 } 3862 3863 static bool isEqual(const Instruction *LHS, const Instruction *RHS) { 3864 if (LHS == getEmptyKey() || RHS == getEmptyKey() || 3865 LHS == getTombstoneKey() || RHS == getTombstoneKey()) 3866 return LHS == RHS; 3867 return LHS->isIdenticalTo(RHS); 3868 } 3869 }; 3870 3871 } // end anonymous namespace 3872 3873 ///Perform cse of induction variable instructions. 3874 static void cse(BasicBlock *BB) { 3875 // Perform simple cse. 3876 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 3877 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) { 3878 Instruction *In = &*I++; 3879 3880 if (!CSEDenseMapInfo::canHandle(In)) 3881 continue; 3882 3883 // Check if we can replace this instruction with any of the 3884 // visited instructions. 3885 if (Instruction *V = CSEMap.lookup(In)) { 3886 In->replaceAllUsesWith(V); 3887 In->eraseFromParent(); 3888 continue; 3889 } 3890 3891 CSEMap[In] = In; 3892 } 3893 } 3894 3895 InstructionCost 3896 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF, 3897 bool &NeedToScalarize) const { 3898 Function *F = CI->getCalledFunction(); 3899 Type *ScalarRetTy = CI->getType(); 3900 SmallVector<Type *, 4> Tys, ScalarTys; 3901 for (auto &ArgOp : CI->arg_operands()) 3902 ScalarTys.push_back(ArgOp->getType()); 3903 3904 // Estimate cost of scalarized vector call. The source operands are assumed 3905 // to be vectors, so we need to extract individual elements from there, 3906 // execute VF scalar calls, and then gather the result into the vector return 3907 // value. 
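  // The scalarized estimate below is therefore
  //   ScalarCallCost * VF + scalarization overhead;
  // e.g., with VF = 4, a scalar call cost of 10 and an overhead of 6 this
  // gives 46, which is then compared against the cost of a matching vector
  // library call, if one is available (illustrative numbers).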
3908 InstructionCost ScalarCallCost = 3909 TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput); 3910 if (VF.isScalar()) 3911 return ScalarCallCost; 3912 3913 // Compute corresponding vector type for return value and arguments. 3914 Type *RetTy = ToVectorTy(ScalarRetTy, VF); 3915 for (Type *ScalarTy : ScalarTys) 3916 Tys.push_back(ToVectorTy(ScalarTy, VF)); 3917 3918 // Compute costs of unpacking argument values for the scalar calls and 3919 // packing the return values to a vector. 3920 InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF); 3921 3922 InstructionCost Cost = 3923 ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost; 3924 3925 // If we can't emit a vector call for this function, then the currently found 3926 // cost is the cost we need to return. 3927 NeedToScalarize = true; 3928 VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 3929 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); 3930 3931 if (!TLI || CI->isNoBuiltin() || !VecFunc) 3932 return Cost; 3933 3934 // If the corresponding vector cost is cheaper, return its cost. 3935 InstructionCost VectorCallCost = 3936 TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput); 3937 if (VectorCallCost < Cost) { 3938 NeedToScalarize = false; 3939 Cost = VectorCallCost; 3940 } 3941 return Cost; 3942 } 3943 3944 static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) { 3945 if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy())) 3946 return Elt; 3947 return VectorType::get(Elt, VF); 3948 } 3949 3950 InstructionCost 3951 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, 3952 ElementCount VF) const { 3953 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 3954 assert(ID && "Expected intrinsic call!"); 3955 Type *RetTy = MaybeVectorizeType(CI->getType(), VF); 3956 FastMathFlags FMF; 3957 if (auto *FPMO = dyn_cast<FPMathOperator>(CI)) 3958 FMF = FPMO->getFastMathFlags(); 3959 3960 SmallVector<const Value *> Arguments(CI->arg_begin(), CI->arg_end()); 3961 FunctionType *FTy = CI->getCalledFunction()->getFunctionType(); 3962 SmallVector<Type *> ParamTys; 3963 std::transform(FTy->param_begin(), FTy->param_end(), 3964 std::back_inserter(ParamTys), 3965 [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); }); 3966 3967 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF, 3968 dyn_cast<IntrinsicInst>(CI)); 3969 return TTI.getIntrinsicInstrCost(CostAttrs, 3970 TargetTransformInfo::TCK_RecipThroughput); 3971 } 3972 3973 static Type *smallestIntegerVectorType(Type *T1, Type *T2) { 3974 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3975 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3976 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; 3977 } 3978 3979 static Type *largestIntegerVectorType(Type *T1, Type *T2) { 3980 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3981 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3982 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; 3983 } 3984 3985 void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) { 3986 // For every instruction `I` in MinBWs, truncate the operands, create a 3987 // truncated version of `I` and reextend its result. InstCombine runs 3988 // later and will remove any ext/trunc pairs. 
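  // For example, a <4 x i32> add whose result is known to need only 8 bits
  // becomes: truncate both operands to <4 x i8>, perform the add in i8, and
  // zero-extend the result back to <4 x i32> (illustrative types).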
3989 SmallPtrSet<Value *, 4> Erased; 3990 for (const auto &KV : Cost->getMinimalBitwidths()) { 3991 // If the value wasn't vectorized, we must maintain the original scalar 3992 // type. The absence of the value from State indicates that it 3993 // wasn't vectorized. 3994 VPValue *Def = State.Plan->getVPValue(KV.first); 3995 if (!State.hasAnyVectorValue(Def)) 3996 continue; 3997 for (unsigned Part = 0; Part < UF; ++Part) { 3998 Value *I = State.get(Def, Part); 3999 if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I)) 4000 continue; 4001 Type *OriginalTy = I->getType(); 4002 Type *ScalarTruncatedTy = 4003 IntegerType::get(OriginalTy->getContext(), KV.second); 4004 auto *TruncatedTy = VectorType::get( 4005 ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getElementCount()); 4006 if (TruncatedTy == OriginalTy) 4007 continue; 4008 4009 IRBuilder<> B(cast<Instruction>(I)); 4010 auto ShrinkOperand = [&](Value *V) -> Value * { 4011 if (auto *ZI = dyn_cast<ZExtInst>(V)) 4012 if (ZI->getSrcTy() == TruncatedTy) 4013 return ZI->getOperand(0); 4014 return B.CreateZExtOrTrunc(V, TruncatedTy); 4015 }; 4016 4017 // The actual instruction modification depends on the instruction type, 4018 // unfortunately. 4019 Value *NewI = nullptr; 4020 if (auto *BO = dyn_cast<BinaryOperator>(I)) { 4021 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)), 4022 ShrinkOperand(BO->getOperand(1))); 4023 4024 // Any wrapping introduced by shrinking this operation shouldn't be 4025 // considered undefined behavior. So, we can't unconditionally copy 4026 // arithmetic wrapping flags to NewI. 4027 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false); 4028 } else if (auto *CI = dyn_cast<ICmpInst>(I)) { 4029 NewI = 4030 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)), 4031 ShrinkOperand(CI->getOperand(1))); 4032 } else if (auto *SI = dyn_cast<SelectInst>(I)) { 4033 NewI = B.CreateSelect(SI->getCondition(), 4034 ShrinkOperand(SI->getTrueValue()), 4035 ShrinkOperand(SI->getFalseValue())); 4036 } else if (auto *CI = dyn_cast<CastInst>(I)) { 4037 switch (CI->getOpcode()) { 4038 default: 4039 llvm_unreachable("Unhandled cast!"); 4040 case Instruction::Trunc: 4041 NewI = ShrinkOperand(CI->getOperand(0)); 4042 break; 4043 case Instruction::SExt: 4044 NewI = B.CreateSExtOrTrunc( 4045 CI->getOperand(0), 4046 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 4047 break; 4048 case Instruction::ZExt: 4049 NewI = B.CreateZExtOrTrunc( 4050 CI->getOperand(0), 4051 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 4052 break; 4053 } 4054 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { 4055 auto Elements0 = 4056 cast<VectorType>(SI->getOperand(0)->getType())->getElementCount(); 4057 auto *O0 = B.CreateZExtOrTrunc( 4058 SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0)); 4059 auto Elements1 = 4060 cast<VectorType>(SI->getOperand(1)->getType())->getElementCount(); 4061 auto *O1 = B.CreateZExtOrTrunc( 4062 SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1)); 4063 4064 NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask()); 4065 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) { 4066 // Don't do anything with the operands, just extend the result. 
4067 continue; 4068 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { 4069 auto Elements = 4070 cast<VectorType>(IE->getOperand(0)->getType())->getElementCount(); 4071 auto *O0 = B.CreateZExtOrTrunc( 4072 IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 4073 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); 4074 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); 4075 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { 4076 auto Elements = 4077 cast<VectorType>(EE->getOperand(0)->getType())->getElementCount(); 4078 auto *O0 = B.CreateZExtOrTrunc( 4079 EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 4080 NewI = B.CreateExtractElement(O0, EE->getOperand(2)); 4081 } else { 4082 // If we don't know what to do, be conservative and don't do anything. 4083 continue; 4084 } 4085 4086 // Lastly, extend the result. 4087 NewI->takeName(cast<Instruction>(I)); 4088 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); 4089 I->replaceAllUsesWith(Res); 4090 cast<Instruction>(I)->eraseFromParent(); 4091 Erased.insert(I); 4092 State.reset(Def, Res, Part); 4093 } 4094 } 4095 4096 // We'll have created a bunch of ZExts that are now parentless. Clean up. 4097 for (const auto &KV : Cost->getMinimalBitwidths()) { 4098 // If the value wasn't vectorized, we must maintain the original scalar 4099 // type. The absence of the value from State indicates that it 4100 // wasn't vectorized. 4101 VPValue *Def = State.Plan->getVPValue(KV.first); 4102 if (!State.hasAnyVectorValue(Def)) 4103 continue; 4104 for (unsigned Part = 0; Part < UF; ++Part) { 4105 Value *I = State.get(Def, Part); 4106 ZExtInst *Inst = dyn_cast<ZExtInst>(I); 4107 if (Inst && Inst->use_empty()) { 4108 Value *NewI = Inst->getOperand(0); 4109 Inst->eraseFromParent(); 4110 State.reset(Def, NewI, Part); 4111 } 4112 } 4113 } 4114 } 4115 4116 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) { 4117 // Insert truncates and extends for any truncated instructions as hints to 4118 // InstCombine. 4119 if (VF.isVector()) 4120 truncateToMinimalBitwidths(State); 4121 4122 // Fix widened non-induction PHIs by setting up the PHI operands. 4123 if (OrigPHIsToFix.size()) { 4124 assert(EnableVPlanNativePath && 4125 "Unexpected non-induction PHIs for fixup in non VPlan-native path"); 4126 fixNonInductionPHIs(State); 4127 } 4128 4129 // At this point every instruction in the original loop is widened to a 4130 // vector form. Now we need to fix the recurrences in the loop. These PHI 4131 // nodes are currently empty because we did not want to introduce cycles. 4132 // This is the second stage of vectorizing recurrences. 4133 fixCrossIterationPHIs(State); 4134 4135 // Forget the original basic block. 4136 PSE.getSE()->forgetLoop(OrigLoop); 4137 4138 // If we inserted an edge from the middle block to the unique exit block, 4139 // update uses outside the loop (phis) to account for the newly inserted 4140 // edge. 4141 if (!Cost->requiresScalarEpilogue(VF)) { 4142 // Fix-up external users of the induction variables. 4143 for (auto &Entry : Legal->getInductionVars()) 4144 fixupIVUsers(Entry.first, Entry.second, 4145 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)), 4146 IVEndValues[Entry.first], LoopMiddleBlock); 4147 4148 fixLCSSAPHIs(State); 4149 } 4150 4151 for (Instruction *PI : PredicatedInstructions) 4152 sinkScalarOperands(&*PI); 4153 4154 // Remove redundant induction instructions. 
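// (Illustrative note: unrolling frequently leaves behind identical
// getelementptr/extractelement/insertelement/shufflevector instructions in
// the vector body; cse() keeps the first occurrence of each and rewrites the
// uses of the duplicates.)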
4155 cse(LoopVectorBody);
4156
4157 // Set/update profile weights for the vector and remainder loops as original
4158 // loop iterations are now distributed among them. Note that original loop
4159 // represented by LoopScalarBody becomes remainder loop after vectorization.
4160 //
4161 // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
4162 // end up with a slightly less precise result, but that should be OK since
4163 // profile is not inherently precise anyway. Note also that a possible bypass
4164 // of the vector code due to legality checks is ignored, optimistically
4165 // assigning all the weight to the vector loop.
4166 //
4167 // For scalable vectorization we can't know at compile time how many iterations
4168 // of the loop are handled in one vector iteration, so instead assume a pessimistic
4169 // vscale of '1'.
4170 setProfileInfoAfterUnrolling(
4171 LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody),
4172 LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF);
4173 }
4174
4175 void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) {
4176 // In order to support recurrences we need to be able to vectorize Phi nodes.
4177 // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4178 // stage #2: We now need to fix the recurrences by adding incoming edges to
4179 // the currently empty PHI nodes. At this point every instruction in the
4180 // original loop is widened to a vector form so we can use them to construct
4181 // the incoming edges.
4182 VPBasicBlock *Header = State.Plan->getEntry()->getEntryBasicBlock();
4183 for (VPRecipeBase &R : Header->phis()) {
4184 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R))
4185 fixReduction(ReductionPhi, State);
4186 else if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
4187 fixFirstOrderRecurrence(FOR, State);
4188 }
4189 }
4190
4191 void InnerLoopVectorizer::fixFirstOrderRecurrence(VPWidenPHIRecipe *PhiR,
4192 VPTransformState &State) {
4193 // This is the second phase of vectorizing first-order recurrences. An
4194 // overview of the transformation is described below. Suppose we have the
4195 // following loop.
4196 //
4197 // for (int i = 0; i < n; ++i)
4198 // b[i] = a[i] - a[i - 1];
4199 //
4200 // There is a first-order recurrence on "a". For this loop, the shorthand
4201 // scalar IR looks like:
4202 //
4203 // scalar.ph:
4204 // s_init = a[-1]
4205 // br scalar.body
4206 //
4207 // scalar.body:
4208 // i = phi [0, scalar.ph], [i+1, scalar.body]
4209 // s1 = phi [s_init, scalar.ph], [s2, scalar.body]
4210 // s2 = a[i]
4211 // b[i] = s2 - s1
4212 // br cond, scalar.body, ...
4213 //
4214 // In this example, s1 is a recurrence because its value depends on the
4215 // previous iteration. In the first phase of vectorization, we created a
4216 // vector phi v1 for s1. We now complete the vectorization and produce the
4217 // shorthand vector IR shown below (for VF = 4, UF = 1).
4218 // 4219 // vector.ph: 4220 // v_init = vector(..., ..., ..., a[-1]) 4221 // br vector.body 4222 // 4223 // vector.body 4224 // i = phi [0, vector.ph], [i+4, vector.body] 4225 // v1 = phi [v_init, vector.ph], [v2, vector.body] 4226 // v2 = a[i, i+1, i+2, i+3]; 4227 // v3 = vector(v1(3), v2(0, 1, 2)) 4228 // b[i, i+1, i+2, i+3] = v2 - v3 4229 // br cond, vector.body, middle.block 4230 // 4231 // middle.block: 4232 // x = v2(3) 4233 // br scalar.ph 4234 // 4235 // scalar.ph: 4236 // s_init = phi [x, middle.block], [a[-1], otherwise] 4237 // br scalar.body 4238 // 4239 // After execution completes the vector loop, we extract the next value of 4240 // the recurrence (x) to use as the initial value in the scalar loop. 4241 4242 // Extract the last vector element in the middle block. This will be the 4243 // initial value for the recurrence when jumping to the scalar loop. 4244 VPValue *PreviousDef = PhiR->getBackedgeValue(); 4245 Value *Incoming = State.get(PreviousDef, UF - 1); 4246 auto *ExtractForScalar = Incoming; 4247 auto *IdxTy = Builder.getInt32Ty(); 4248 if (VF.isVector()) { 4249 auto *One = ConstantInt::get(IdxTy, 1); 4250 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4251 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF); 4252 auto *LastIdx = Builder.CreateSub(RuntimeVF, One); 4253 ExtractForScalar = Builder.CreateExtractElement(ExtractForScalar, LastIdx, 4254 "vector.recur.extract"); 4255 } 4256 // Extract the second last element in the middle block if the 4257 // Phi is used outside the loop. We need to extract the phi itself 4258 // and not the last element (the phi update in the current iteration). This 4259 // will be the value when jumping to the exit block from the LoopMiddleBlock, 4260 // when the scalar loop is not run at all. 4261 Value *ExtractForPhiUsedOutsideLoop = nullptr; 4262 if (VF.isVector()) { 4263 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF); 4264 auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2)); 4265 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement( 4266 Incoming, Idx, "vector.recur.extract.for.phi"); 4267 } else if (UF > 1) 4268 // When loop is unrolled without vectorizing, initialize 4269 // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value 4270 // of `Incoming`. This is analogous to the vectorized case above: extracting 4271 // the second last element when VF > 1. 4272 ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2); 4273 4274 // Fix the initial value of the original recurrence in the scalar loop. 4275 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin()); 4276 PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue()); 4277 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init"); 4278 auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue(); 4279 for (auto *BB : predecessors(LoopScalarPreHeader)) { 4280 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit; 4281 Start->addIncoming(Incoming, BB); 4282 } 4283 4284 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start); 4285 Phi->setName("scalar.recur"); 4286 4287 // Finally, fix users of the recurrence outside the loop. The users will need 4288 // either the last value of the scalar recurrence or the last value of the 4289 // vector recurrence we extracted in the middle block. Since the loop is in 4290 // LCSSA form, we just need to find all the phi nodes for the original scalar 4291 // recurrence in the exit block, and then add an edge for the middle block. 
4292 // Note that LCSSA does not imply single entry when the original scalar loop
4293 // had multiple exiting edges (as we always run the last iteration in the
4294 // scalar epilogue); in that case, there is no edge from middle to exit and
4295 // thus no phis that need to be updated.
4296 if (!Cost->requiresScalarEpilogue(VF))
4297 for (PHINode &LCSSAPhi : LoopExitBlock->phis())
4298 if (any_of(LCSSAPhi.incoming_values(),
4299 [Phi](Value *V) { return V == Phi; }))
4300 LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
4301 }
4302
4303 void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
4304 VPTransformState &State) {
4305 PHINode *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
4306 // Get its reduction variable descriptor.
4307 assert(Legal->isReductionVariable(OrigPhi) &&
4308 "Unable to find the reduction variable");
4309 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
4310
4311 RecurKind RK = RdxDesc.getRecurrenceKind();
4312 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
4313 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
4314 setDebugLocFromInst(ReductionStartValue);
4315
4316 VPValue *LoopExitInstDef = PhiR->getBackedgeValue();
4317 // This is the vector-clone of the value that leaves the loop.
4318 Type *VecTy = State.get(LoopExitInstDef, 0)->getType();
4319
4320 // Wrap flags are in general invalid after vectorization, clear them.
4321 clearReductionWrapFlags(RdxDesc, State);
4322
4323 // Before each round, move the insertion point right between
4324 // the PHIs and the values we are going to write.
4325 // This allows us to write both PHINodes and the extractelement
4326 // instructions.
4327 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
4328
4329 setDebugLocFromInst(LoopExitInst);
4330
4331 Type *PhiTy = OrigPhi->getType();
4332 // If tail is folded by masking, the vector value to leave the loop should be
4333 // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
4334 // instead of the former. For an inloop reduction the reduction will already
4335 // be predicated, and does not need to be handled here.
4336 if (Cost->foldTailByMasking() && !PhiR->isInLoop()) {
4337 for (unsigned Part = 0; Part < UF; ++Part) {
4338 Value *VecLoopExitInst = State.get(LoopExitInstDef, Part);
4339 Value *Sel = nullptr;
4340 for (User *U : VecLoopExitInst->users()) {
4341 if (isa<SelectInst>(U)) {
4342 assert(!Sel && "Reduction exit feeding two selects");
4343 Sel = U;
4344 } else
4345 assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
4346 }
4347 assert(Sel && "Reduction exit feeds no select");
4348 State.reset(LoopExitInstDef, Sel, Part);
4349
4350 // If the target can create a predicated operator for the reduction at no
4351 // extra cost in the loop (for example a predicated vadd), it can be
4352 // cheaper for the select to remain in the loop than be sunk out of it,
4353 // and so use the select value for the phi instead of the old
4354 // LoopExitValue.
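// (That is, the vector reduction phi then takes the select, rather than the
// unpredicated reduction operation, as its backedge value.)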
4355 if (PreferPredicatedReductionSelect || 4356 TTI->preferPredicatedReductionSelect( 4357 RdxDesc.getOpcode(), PhiTy, 4358 TargetTransformInfo::ReductionFlags())) { 4359 auto *VecRdxPhi = 4360 cast<PHINode>(State.get(PhiR->getVPSingleValue(), Part)); 4361 VecRdxPhi->setIncomingValueForBlock( 4362 LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel); 4363 } 4364 } 4365 } 4366 4367 // If the vector reduction can be performed in a smaller type, we truncate 4368 // then extend the loop exit value to enable InstCombine to evaluate the 4369 // entire expression in the smaller type. 4370 if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) { 4371 assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!"); 4372 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); 4373 Builder.SetInsertPoint( 4374 LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator()); 4375 VectorParts RdxParts(UF); 4376 for (unsigned Part = 0; Part < UF; ++Part) { 4377 RdxParts[Part] = State.get(LoopExitInstDef, Part); 4378 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4379 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy) 4380 : Builder.CreateZExt(Trunc, VecTy); 4381 for (Value::user_iterator UI = RdxParts[Part]->user_begin(); 4382 UI != RdxParts[Part]->user_end();) 4383 if (*UI != Trunc) { 4384 (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd); 4385 RdxParts[Part] = Extnd; 4386 } else { 4387 ++UI; 4388 } 4389 } 4390 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 4391 for (unsigned Part = 0; Part < UF; ++Part) { 4392 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4393 State.reset(LoopExitInstDef, RdxParts[Part], Part); 4394 } 4395 } 4396 4397 // Reduce all of the unrolled parts into a single vector. 4398 Value *ReducedPartRdx = State.get(LoopExitInstDef, 0); 4399 unsigned Op = RecurrenceDescriptor::getOpcode(RK); 4400 4401 // The middle block terminator has already been assigned a DebugLoc here (the 4402 // OrigLoop's single latch terminator). We want the whole middle block to 4403 // appear to execute on this line because: (a) it is all compiler generated, 4404 // (b) these instructions are always executed after evaluating the latch 4405 // conditional branch, and (c) other passes may add new predecessors which 4406 // terminate on this line. This is the easiest way to ensure we don't 4407 // accidentally cause an extra step back into the loop while debugging. 4408 setDebugLocFromInst(LoopMiddleBlock->getTerminator()); 4409 if (PhiR->isOrdered()) 4410 ReducedPartRdx = State.get(LoopExitInstDef, UF - 1); 4411 else { 4412 // Floating-point operations should have some FMF to enable the reduction. 4413 IRBuilderBase::FastMathFlagGuard FMFG(Builder); 4414 Builder.setFastMathFlags(RdxDesc.getFastMathFlags()); 4415 for (unsigned Part = 1; Part < UF; ++Part) { 4416 Value *RdxPart = State.get(LoopExitInstDef, Part); 4417 if (Op != Instruction::ICmp && Op != Instruction::FCmp) { 4418 ReducedPartRdx = Builder.CreateBinOp( 4419 (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx"); 4420 } else { 4421 ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart); 4422 } 4423 } 4424 } 4425 4426 // Create the reduction after the loop. Note that inloop reductions create the 4427 // target reduction in the loop using a Reduction recipe. 
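// As an illustrative sketch, for an integer add reduction with VF = 4 the
// call below produces something like:
//   %rdx = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %bin.rdx)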
4428 if (VF.isVector() && !PhiR->isInLoop()) {
4429 ReducedPartRdx =
4430 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx);
4431 // If the reduction can be performed in a smaller type, we need to extend
4432 // the reduction to the wider type before we branch to the original loop.
4433 if (PhiTy != RdxDesc.getRecurrenceType())
4434 ReducedPartRdx = RdxDesc.isSigned()
4435 ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
4436 : Builder.CreateZExt(ReducedPartRdx, PhiTy);
4437 }
4438
4439 // Create a phi node that merges control-flow from the backedge-taken check
4440 // block and the middle block.
4441 PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx",
4442 LoopScalarPreHeader->getTerminator());
4443 for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
4444 BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
4445 BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
4446
4447 // Now, we need to fix the users of the reduction variable
4448 // inside and outside of the scalar remainder loop.
4449
4450 // We know that the loop is in LCSSA form. We need to update the PHI nodes
4451 // in the exit blocks. See comment on analogous loop in
4452 // fixFirstOrderRecurrence for a more complete explanation of the logic.
4453 if (!Cost->requiresScalarEpilogue(VF))
4454 for (PHINode &LCSSAPhi : LoopExitBlock->phis())
4455 if (any_of(LCSSAPhi.incoming_values(),
4456 [LoopExitInst](Value *V) { return V == LoopExitInst; }))
4457 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
4458
4459 // Fix the scalar loop reduction variable with the incoming reduction sum
4460 // from the vector body and from the backedge value.
4461 int IncomingEdgeBlockIdx =
4462 OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch());
4463 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
4464 // Pick the other block.
4465 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
4466 OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
4467 OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
4468 }
4469
4470 void InnerLoopVectorizer::clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc,
4471 VPTransformState &State) {
4472 RecurKind RK = RdxDesc.getRecurrenceKind();
4473 if (RK != RecurKind::Add && RK != RecurKind::Mul)
4474 return;
4475
4476 Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr();
4477 assert(LoopExitInstr && "null loop exit instruction");
4478 SmallVector<Instruction *, 8> Worklist;
4479 SmallPtrSet<Instruction *, 8> Visited;
4480 Worklist.push_back(LoopExitInstr);
4481 Visited.insert(LoopExitInstr);
4482
4483 while (!Worklist.empty()) {
4484 Instruction *Cur = Worklist.pop_back_val();
4485 if (isa<OverflowingBinaryOperator>(Cur))
4486 for (unsigned Part = 0; Part < UF; ++Part) {
4487 Value *V = State.get(State.Plan->getVPValue(Cur), Part);
4488 cast<Instruction>(V)->dropPoisonGeneratingFlags();
4489 }
4490
4491 for (User *U : Cur->users()) {
4492 Instruction *UI = cast<Instruction>(U);
4493 if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) &&
4494 Visited.insert(UI).second)
4495 Worklist.push_back(UI);
4496 }
4497 }
4498 }
4499
4500 void InnerLoopVectorizer::fixLCSSAPHIs(VPTransformState &State) {
4501 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
4502 if (LCSSAPhi.getBasicBlockIndex(LoopMiddleBlock) != -1)
4503 // Some phis were already hand updated by the reduction and recurrence
4504 // code above, leave them alone.
4505 continue; 4506 4507 auto *IncomingValue = LCSSAPhi.getIncomingValue(0); 4508 // Non-instruction incoming values will have only one value. 4509 4510 VPLane Lane = VPLane::getFirstLane(); 4511 if (isa<Instruction>(IncomingValue) && 4512 !Cost->isUniformAfterVectorization(cast<Instruction>(IncomingValue), 4513 VF)) 4514 Lane = VPLane::getLastLaneForVF(VF); 4515 4516 // Can be a loop invariant incoming value or the last scalar value to be 4517 // extracted from the vectorized loop. 4518 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4519 Value *lastIncomingValue = 4520 OrigLoop->isLoopInvariant(IncomingValue) 4521 ? IncomingValue 4522 : State.get(State.Plan->getVPValue(IncomingValue), 4523 VPIteration(UF - 1, Lane)); 4524 LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock); 4525 } 4526 } 4527 4528 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { 4529 // The basic block and loop containing the predicated instruction. 4530 auto *PredBB = PredInst->getParent(); 4531 auto *VectorLoop = LI->getLoopFor(PredBB); 4532 4533 // Initialize a worklist with the operands of the predicated instruction. 4534 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end()); 4535 4536 // Holds instructions that we need to analyze again. An instruction may be 4537 // reanalyzed if we don't yet know if we can sink it or not. 4538 SmallVector<Instruction *, 8> InstsToReanalyze; 4539 4540 // Returns true if a given use occurs in the predicated block. Phi nodes use 4541 // their operands in their corresponding predecessor blocks. 4542 auto isBlockOfUsePredicated = [&](Use &U) -> bool { 4543 auto *I = cast<Instruction>(U.getUser()); 4544 BasicBlock *BB = I->getParent(); 4545 if (auto *Phi = dyn_cast<PHINode>(I)) 4546 BB = Phi->getIncomingBlock( 4547 PHINode::getIncomingValueNumForOperand(U.getOperandNo())); 4548 return BB == PredBB; 4549 }; 4550 4551 // Iteratively sink the scalarized operands of the predicated instruction 4552 // into the block we created for it. When an instruction is sunk, it's 4553 // operands are then added to the worklist. The algorithm ends after one pass 4554 // through the worklist doesn't sink a single instruction. 4555 bool Changed; 4556 do { 4557 // Add the instructions that need to be reanalyzed to the worklist, and 4558 // reset the changed indicator. 4559 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end()); 4560 InstsToReanalyze.clear(); 4561 Changed = false; 4562 4563 while (!Worklist.empty()) { 4564 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val()); 4565 4566 // We can't sink an instruction if it is a phi node, is not in the loop, 4567 // or may have side effects. 4568 if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) || 4569 I->mayHaveSideEffects()) 4570 continue; 4571 4572 // If the instruction is already in PredBB, check if we can sink its 4573 // operands. In that case, VPlan's sinkScalarOperands() succeeded in 4574 // sinking the scalar instruction I, hence it appears in PredBB; but it 4575 // may have failed to sink I's operands (recursively), which we try 4576 // (again) here. 4577 if (I->getParent() == PredBB) { 4578 Worklist.insert(I->op_begin(), I->op_end()); 4579 continue; 4580 } 4581 4582 // It's legal to sink the instruction if all its uses occur in the 4583 // predicated block. Otherwise, there's nothing to do yet, and we may 4584 // need to reanalyze the instruction. 
4585 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) { 4586 InstsToReanalyze.push_back(I); 4587 continue; 4588 } 4589 4590 // Move the instruction to the beginning of the predicated block, and add 4591 // it's operands to the worklist. 4592 I->moveBefore(&*PredBB->getFirstInsertionPt()); 4593 Worklist.insert(I->op_begin(), I->op_end()); 4594 4595 // The sinking may have enabled other instructions to be sunk, so we will 4596 // need to iterate. 4597 Changed = true; 4598 } 4599 } while (Changed); 4600 } 4601 4602 void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) { 4603 for (PHINode *OrigPhi : OrigPHIsToFix) { 4604 VPWidenPHIRecipe *VPPhi = 4605 cast<VPWidenPHIRecipe>(State.Plan->getVPValue(OrigPhi)); 4606 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0)); 4607 // Make sure the builder has a valid insert point. 4608 Builder.SetInsertPoint(NewPhi); 4609 for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) { 4610 VPValue *Inc = VPPhi->getIncomingValue(i); 4611 VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i); 4612 NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]); 4613 } 4614 } 4615 } 4616 4617 bool InnerLoopVectorizer::useOrderedReductions(RecurrenceDescriptor &RdxDesc) { 4618 return Cost->useOrderedReductions(RdxDesc); 4619 } 4620 4621 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, 4622 VPUser &Operands, unsigned UF, 4623 ElementCount VF, bool IsPtrLoopInvariant, 4624 SmallBitVector &IsIndexLoopInvariant, 4625 VPTransformState &State) { 4626 // Construct a vector GEP by widening the operands of the scalar GEP as 4627 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP 4628 // results in a vector of pointers when at least one operand of the GEP 4629 // is vector-typed. Thus, to keep the representation compact, we only use 4630 // vector-typed operands for loop-varying values. 4631 4632 if (VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { 4633 // If we are vectorizing, but the GEP has only loop-invariant operands, 4634 // the GEP we build (by only using vector-typed operands for 4635 // loop-varying values) would be a scalar pointer. Thus, to ensure we 4636 // produce a vector of pointers, we need to either arbitrarily pick an 4637 // operand to broadcast, or broadcast a clone of the original GEP. 4638 // Here, we broadcast a clone of the original. 4639 // 4640 // TODO: If at some point we decide to scalarize instructions having 4641 // loop-invariant operands, this special case will no longer be 4642 // required. We would add the scalarization decision to 4643 // collectLoopScalars() and teach getVectorValue() to broadcast 4644 // the lane-zero scalar value. 4645 auto *Clone = Builder.Insert(GEP->clone()); 4646 for (unsigned Part = 0; Part < UF; ++Part) { 4647 Value *EntryPart = Builder.CreateVectorSplat(VF, Clone); 4648 State.set(VPDef, EntryPart, Part); 4649 addMetadata(EntryPart, GEP); 4650 } 4651 } else { 4652 // If the GEP has at least one loop-varying operand, we are sure to 4653 // produce a vector of pointers. But if we are only unrolling, we want 4654 // to produce a scalar GEP for each unroll part. Thus, the GEP we 4655 // produce with the code below will be scalar (if VF == 1) or vector 4656 // (otherwise). Note that for the unroll-only case, we still maintain 4657 // values in the vector mapping with initVector, as we do for other 4658 // instructions. 4659 for (unsigned Part = 0; Part < UF; ++Part) { 4660 // The pointer operand of the new GEP. 
If it's loop-invariant, we 4661 // won't broadcast it. 4662 auto *Ptr = IsPtrLoopInvariant 4663 ? State.get(Operands.getOperand(0), VPIteration(0, 0)) 4664 : State.get(Operands.getOperand(0), Part); 4665 4666 // Collect all the indices for the new GEP. If any index is 4667 // loop-invariant, we won't broadcast it. 4668 SmallVector<Value *, 4> Indices; 4669 for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) { 4670 VPValue *Operand = Operands.getOperand(I); 4671 if (IsIndexLoopInvariant[I - 1]) 4672 Indices.push_back(State.get(Operand, VPIteration(0, 0))); 4673 else 4674 Indices.push_back(State.get(Operand, Part)); 4675 } 4676 4677 // Create the new GEP. Note that this GEP may be a scalar if VF == 1, 4678 // but it should be a vector, otherwise. 4679 auto *NewGEP = 4680 GEP->isInBounds() 4681 ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr, 4682 Indices) 4683 : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices); 4684 assert((VF.isScalar() || NewGEP->getType()->isVectorTy()) && 4685 "NewGEP is not a pointer vector"); 4686 State.set(VPDef, NewGEP, Part); 4687 addMetadata(NewGEP, GEP); 4688 } 4689 } 4690 } 4691 4692 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, 4693 VPWidenPHIRecipe *PhiR, 4694 VPTransformState &State) { 4695 PHINode *P = cast<PHINode>(PN); 4696 if (EnableVPlanNativePath) { 4697 // Currently we enter here in the VPlan-native path for non-induction 4698 // PHIs where all control flow is uniform. We simply widen these PHIs. 4699 // Create a vector phi with no operands - the vector phi operands will be 4700 // set at the end of vector code generation. 4701 Type *VecTy = (State.VF.isScalar()) 4702 ? PN->getType() 4703 : VectorType::get(PN->getType(), State.VF); 4704 Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi"); 4705 State.set(PhiR, VecPhi, 0); 4706 OrigPHIsToFix.push_back(P); 4707 4708 return; 4709 } 4710 4711 assert(PN->getParent() == OrigLoop->getHeader() && 4712 "Non-header phis should have been handled elsewhere"); 4713 4714 // In order to support recurrences we need to be able to vectorize Phi nodes. 4715 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 4716 // stage #1: We create a new vector PHI node with no incoming edges. We'll use 4717 // this value when we vectorize all of the instructions that use the PHI. 4718 4719 assert(!Legal->isReductionVariable(P) && 4720 "reductions should be handled elsewhere"); 4721 4722 setDebugLocFromInst(P); 4723 4724 // This PHINode must be an induction variable. 4725 // Make sure that we know about it. 4726 assert(Legal->getInductionVars().count(P) && "Not an induction variable"); 4727 4728 InductionDescriptor II = Legal->getInductionVars().lookup(P); 4729 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 4730 4731 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 4732 // which can be found from the original scalar operations. 4733 switch (II.getKind()) { 4734 case InductionDescriptor::IK_NoInduction: 4735 llvm_unreachable("Unknown induction"); 4736 case InductionDescriptor::IK_IntInduction: 4737 case InductionDescriptor::IK_FpInduction: 4738 llvm_unreachable("Integer/fp induction is handled elsewhere."); 4739 case InductionDescriptor::IK_PtrInduction: { 4740 // Handle the pointer induction variable case. 
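// Rough sketch of the two cases handled below: if the pointer IV is scalar
// after vectorization, per-lane scalar addresses named "next.gep" are emitted
// from the normalized index (induction + Part * VF + Lane) via
// emitTransformedIndex(); otherwise a "pointer.phi" is created and wide GEPs
// provide a vector of pointers per part.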
4741 assert(P->getType()->isPointerTy() && "Unexpected type."); 4742 4743 if (Cost->isScalarAfterVectorization(P, State.VF)) { 4744 // This is the normalized GEP that starts counting at zero. 4745 Value *PtrInd = 4746 Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType()); 4747 // Determine the number of scalars we need to generate for each unroll 4748 // iteration. If the instruction is uniform, we only need to generate the 4749 // first lane. Otherwise, we generate all VF values. 4750 bool IsUniform = Cost->isUniformAfterVectorization(P, State.VF); 4751 unsigned Lanes = IsUniform ? 1 : State.VF.getKnownMinValue(); 4752 4753 bool NeedsVectorIndex = !IsUniform && VF.isScalable(); 4754 Value *UnitStepVec = nullptr, *PtrIndSplat = nullptr; 4755 if (NeedsVectorIndex) { 4756 Type *VecIVTy = VectorType::get(PtrInd->getType(), VF); 4757 UnitStepVec = Builder.CreateStepVector(VecIVTy); 4758 PtrIndSplat = Builder.CreateVectorSplat(VF, PtrInd); 4759 } 4760 4761 for (unsigned Part = 0; Part < UF; ++Part) { 4762 Value *PartStart = createStepForVF( 4763 Builder, ConstantInt::get(PtrInd->getType(), Part), VF); 4764 4765 if (NeedsVectorIndex) { 4766 Value *PartStartSplat = Builder.CreateVectorSplat(VF, PartStart); 4767 Value *Indices = Builder.CreateAdd(PartStartSplat, UnitStepVec); 4768 Value *GlobalIndices = Builder.CreateAdd(PtrIndSplat, Indices); 4769 Value *SclrGep = 4770 emitTransformedIndex(Builder, GlobalIndices, PSE.getSE(), DL, II); 4771 SclrGep->setName("next.gep"); 4772 State.set(PhiR, SclrGep, Part); 4773 // We've cached the whole vector, which means we can support the 4774 // extraction of any lane. 4775 continue; 4776 } 4777 4778 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 4779 Value *Idx = Builder.CreateAdd( 4780 PartStart, ConstantInt::get(PtrInd->getType(), Lane)); 4781 Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); 4782 Value *SclrGep = 4783 emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II); 4784 SclrGep->setName("next.gep"); 4785 State.set(PhiR, SclrGep, VPIteration(Part, Lane)); 4786 } 4787 } 4788 return; 4789 } 4790 assert(isa<SCEVConstant>(II.getStep()) && 4791 "Induction step not a SCEV constant!"); 4792 Type *PhiType = II.getStep()->getType(); 4793 4794 // Build a pointer phi 4795 Value *ScalarStartValue = II.getStartValue(); 4796 Type *ScStValueType = ScalarStartValue->getType(); 4797 PHINode *NewPointerPhi = 4798 PHINode::Create(ScStValueType, 2, "pointer.phi", Induction); 4799 NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader); 4800 4801 // A pointer induction, performed by using a gep 4802 BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 4803 Instruction *InductionLoc = LoopLatch->getTerminator(); 4804 const SCEV *ScalarStep = II.getStep(); 4805 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 4806 Value *ScalarStepValue = 4807 Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc); 4808 Value *RuntimeVF = getRuntimeVF(Builder, PhiType, VF); 4809 Value *NumUnrolledElems = 4810 Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF)); 4811 Value *InductionGEP = GetElementPtrInst::Create( 4812 ScStValueType->getPointerElementType(), NewPointerPhi, 4813 Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind", 4814 InductionLoc); 4815 NewPointerPhi->addIncoming(InductionGEP, LoopLatch); 4816 4817 // Create UF many actual address geps that use the pointer 4818 // phi as base and a vectorized version of the step value 4819 // (<step*0, ..., step*N>) as offset. 
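// For example (illustrative), with VF = 4, UF = 2 and step S, part 0 uses
// element offsets <0,1,2,3> * S and part 1 uses <4,5,6,7> * S, each applied
// to the "pointer.phi" base by the GEP created below.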
4820 for (unsigned Part = 0; Part < State.UF; ++Part) { 4821 Type *VecPhiType = VectorType::get(PhiType, State.VF); 4822 Value *StartOffsetScalar = 4823 Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part)); 4824 Value *StartOffset = 4825 Builder.CreateVectorSplat(State.VF, StartOffsetScalar); 4826 // Create a vector of consecutive numbers from zero to VF. 4827 StartOffset = 4828 Builder.CreateAdd(StartOffset, Builder.CreateStepVector(VecPhiType)); 4829 4830 Value *GEP = Builder.CreateGEP( 4831 ScStValueType->getPointerElementType(), NewPointerPhi, 4832 Builder.CreateMul( 4833 StartOffset, Builder.CreateVectorSplat(State.VF, ScalarStepValue), 4834 "vector.gep")); 4835 State.set(PhiR, GEP, Part); 4836 } 4837 } 4838 } 4839 } 4840 4841 /// A helper function for checking whether an integer division-related 4842 /// instruction may divide by zero (in which case it must be predicated if 4843 /// executed conditionally in the scalar code). 4844 /// TODO: It may be worthwhile to generalize and check isKnownNonZero(). 4845 /// Non-zero divisors that are non compile-time constants will not be 4846 /// converted into multiplication, so we will still end up scalarizing 4847 /// the division, but can do so w/o predication. 4848 static bool mayDivideByZero(Instruction &I) { 4849 assert((I.getOpcode() == Instruction::UDiv || 4850 I.getOpcode() == Instruction::SDiv || 4851 I.getOpcode() == Instruction::URem || 4852 I.getOpcode() == Instruction::SRem) && 4853 "Unexpected instruction"); 4854 Value *Divisor = I.getOperand(1); 4855 auto *CInt = dyn_cast<ConstantInt>(Divisor); 4856 return !CInt || CInt->isZero(); 4857 } 4858 4859 void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def, 4860 VPUser &User, 4861 VPTransformState &State) { 4862 switch (I.getOpcode()) { 4863 case Instruction::Call: 4864 case Instruction::Br: 4865 case Instruction::PHI: 4866 case Instruction::GetElementPtr: 4867 case Instruction::Select: 4868 llvm_unreachable("This instruction is handled by a different recipe."); 4869 case Instruction::UDiv: 4870 case Instruction::SDiv: 4871 case Instruction::SRem: 4872 case Instruction::URem: 4873 case Instruction::Add: 4874 case Instruction::FAdd: 4875 case Instruction::Sub: 4876 case Instruction::FSub: 4877 case Instruction::FNeg: 4878 case Instruction::Mul: 4879 case Instruction::FMul: 4880 case Instruction::FDiv: 4881 case Instruction::FRem: 4882 case Instruction::Shl: 4883 case Instruction::LShr: 4884 case Instruction::AShr: 4885 case Instruction::And: 4886 case Instruction::Or: 4887 case Instruction::Xor: { 4888 // Just widen unops and binops. 4889 setDebugLocFromInst(&I); 4890 4891 for (unsigned Part = 0; Part < UF; ++Part) { 4892 SmallVector<Value *, 2> Ops; 4893 for (VPValue *VPOp : User.operands()) 4894 Ops.push_back(State.get(VPOp, Part)); 4895 4896 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); 4897 4898 if (auto *VecOp = dyn_cast<Instruction>(V)) 4899 VecOp->copyIRFlags(&I); 4900 4901 // Use this vector value for all users of the original instruction. 4902 State.set(Def, V, Part); 4903 addMetadata(V, &I); 4904 } 4905 4906 break; 4907 } 4908 case Instruction::ICmp: 4909 case Instruction::FCmp: { 4910 // Widen compares. Generate vector compares. 
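// E.g. (illustrative) a scalar 'icmp slt i32 %a, %b' is widened here into one
// 'icmp slt <VF x i32>' per unrolled part, and likewise for fcmp with its
// fast-math flags propagated.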
4911 bool FCmp = (I.getOpcode() == Instruction::FCmp); 4912 auto *Cmp = cast<CmpInst>(&I); 4913 setDebugLocFromInst(Cmp); 4914 for (unsigned Part = 0; Part < UF; ++Part) { 4915 Value *A = State.get(User.getOperand(0), Part); 4916 Value *B = State.get(User.getOperand(1), Part); 4917 Value *C = nullptr; 4918 if (FCmp) { 4919 // Propagate fast math flags. 4920 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 4921 Builder.setFastMathFlags(Cmp->getFastMathFlags()); 4922 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); 4923 } else { 4924 C = Builder.CreateICmp(Cmp->getPredicate(), A, B); 4925 } 4926 State.set(Def, C, Part); 4927 addMetadata(C, &I); 4928 } 4929 4930 break; 4931 } 4932 4933 case Instruction::ZExt: 4934 case Instruction::SExt: 4935 case Instruction::FPToUI: 4936 case Instruction::FPToSI: 4937 case Instruction::FPExt: 4938 case Instruction::PtrToInt: 4939 case Instruction::IntToPtr: 4940 case Instruction::SIToFP: 4941 case Instruction::UIToFP: 4942 case Instruction::Trunc: 4943 case Instruction::FPTrunc: 4944 case Instruction::BitCast: { 4945 auto *CI = cast<CastInst>(&I); 4946 setDebugLocFromInst(CI); 4947 4948 /// Vectorize casts. 4949 Type *DestTy = 4950 (VF.isScalar()) ? CI->getType() : VectorType::get(CI->getType(), VF); 4951 4952 for (unsigned Part = 0; Part < UF; ++Part) { 4953 Value *A = State.get(User.getOperand(0), Part); 4954 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); 4955 State.set(Def, Cast, Part); 4956 addMetadata(Cast, &I); 4957 } 4958 break; 4959 } 4960 default: 4961 // This instruction is not vectorized by simple widening. 4962 LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); 4963 llvm_unreachable("Unhandled instruction!"); 4964 } // end of switch. 4965 } 4966 4967 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, 4968 VPUser &ArgOperands, 4969 VPTransformState &State) { 4970 assert(!isa<DbgInfoIntrinsic>(I) && 4971 "DbgInfoIntrinsic should have been dropped during VPlan construction"); 4972 setDebugLocFromInst(&I); 4973 4974 Module *M = I.getParent()->getParent()->getParent(); 4975 auto *CI = cast<CallInst>(&I); 4976 4977 SmallVector<Type *, 4> Tys; 4978 for (Value *ArgOperand : CI->arg_operands()) 4979 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue())); 4980 4981 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 4982 4983 // The flag shows whether we use Intrinsic or a usual Call for vectorized 4984 // version of the instruction. 4985 // Is it beneficial to perform intrinsic call compared to lib call? 4986 bool NeedToScalarize = false; 4987 InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); 4988 InstructionCost IntrinsicCost = ID ? Cost->getVectorIntrinsicCost(CI, VF) : 0; 4989 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 4990 assert((UseVectorIntrinsic || !NeedToScalarize) && 4991 "Instruction should be scalarized elsewhere."); 4992 assert((IntrinsicCost.isValid() || CallCost.isValid()) && 4993 "Either the intrinsic cost or vector call cost must be valid"); 4994 4995 for (unsigned Part = 0; Part < UF; ++Part) { 4996 SmallVector<Type *, 2> TysForDecl = {CI->getType()}; 4997 SmallVector<Value *, 4> Args; 4998 for (auto &I : enumerate(ArgOperands.operands())) { 4999 // Some intrinsics have a scalar argument - don't replace it with a 5000 // vector. 
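// (For example, the integer exponent operand of llvm.powi is such a scalar
// argument and must not be widened.)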
5001 Value *Arg; 5002 if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index())) 5003 Arg = State.get(I.value(), Part); 5004 else { 5005 Arg = State.get(I.value(), VPIteration(0, 0)); 5006 if (hasVectorInstrinsicOverloadedScalarOpd(ID, I.index())) 5007 TysForDecl.push_back(Arg->getType()); 5008 } 5009 Args.push_back(Arg); 5010 } 5011 5012 Function *VectorF; 5013 if (UseVectorIntrinsic) { 5014 // Use vector version of the intrinsic. 5015 if (VF.isVector()) 5016 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); 5017 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); 5018 assert(VectorF && "Can't retrieve vector intrinsic."); 5019 } else { 5020 // Use vector version of the function call. 5021 const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 5022 #ifndef NDEBUG 5023 assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr && 5024 "Can't create vector function."); 5025 #endif 5026 VectorF = VFDatabase(*CI).getVectorizedFunction(Shape); 5027 } 5028 SmallVector<OperandBundleDef, 1> OpBundles; 5029 CI->getOperandBundlesAsDefs(OpBundles); 5030 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles); 5031 5032 if (isa<FPMathOperator>(V)) 5033 V->copyFastMathFlags(CI); 5034 5035 State.set(Def, V, Part); 5036 addMetadata(V, &I); 5037 } 5038 } 5039 5040 void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, VPValue *VPDef, 5041 VPUser &Operands, 5042 bool InvariantCond, 5043 VPTransformState &State) { 5044 setDebugLocFromInst(&I); 5045 5046 // The condition can be loop invariant but still defined inside the 5047 // loop. This means that we can't just use the original 'cond' value. 5048 // We have to take the 'vectorized' value and pick the first lane. 5049 // Instcombine will make this a no-op. 5050 auto *InvarCond = InvariantCond 5051 ? State.get(Operands.getOperand(0), VPIteration(0, 0)) 5052 : nullptr; 5053 5054 for (unsigned Part = 0; Part < UF; ++Part) { 5055 Value *Cond = 5056 InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part); 5057 Value *Op0 = State.get(Operands.getOperand(1), Part); 5058 Value *Op1 = State.get(Operands.getOperand(2), Part); 5059 Value *Sel = Builder.CreateSelect(Cond, Op0, Op1); 5060 State.set(VPDef, Sel, Part); 5061 addMetadata(Sel, &I); 5062 } 5063 } 5064 5065 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { 5066 // We should not collect Scalars more than once per VF. Right now, this 5067 // function is called from collectUniformsAndScalars(), which already does 5068 // this check. Collecting Scalars for VF=1 does not make any sense. 5069 assert(VF.isVector() && Scalars.find(VF) == Scalars.end() && 5070 "This function should not be visited twice for the same VF"); 5071 5072 SmallSetVector<Instruction *, 8> Worklist; 5073 5074 // These sets are used to seed the analysis with pointers used by memory 5075 // accesses that will remain scalar. 5076 SmallSetVector<Instruction *, 8> ScalarPtrs; 5077 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; 5078 auto *Latch = TheLoop->getLoopLatch(); 5079 5080 // A helper that returns true if the use of Ptr by MemAccess will be scalar. 5081 // The pointer operands of loads and stores will be scalar as long as the 5082 // memory access is not a gather or scatter operation. The value operand of a 5083 // store will remain scalar if the store is scalarized. 
5084 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) { 5085 InstWidening WideningDecision = getWideningDecision(MemAccess, VF); 5086 assert(WideningDecision != CM_Unknown && 5087 "Widening decision should be ready at this moment"); 5088 if (auto *Store = dyn_cast<StoreInst>(MemAccess)) 5089 if (Ptr == Store->getValueOperand()) 5090 return WideningDecision == CM_Scalarize; 5091 assert(Ptr == getLoadStorePointerOperand(MemAccess) && 5092 "Ptr is neither a value or pointer operand"); 5093 return WideningDecision != CM_GatherScatter; 5094 }; 5095 5096 // A helper that returns true if the given value is a bitcast or 5097 // getelementptr instruction contained in the loop. 5098 auto isLoopVaryingBitCastOrGEP = [&](Value *V) { 5099 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) || 5100 isa<GetElementPtrInst>(V)) && 5101 !TheLoop->isLoopInvariant(V); 5102 }; 5103 5104 auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) { 5105 if (!isa<PHINode>(Ptr) || 5106 !Legal->getInductionVars().count(cast<PHINode>(Ptr))) 5107 return false; 5108 auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)]; 5109 if (Induction.getKind() != InductionDescriptor::IK_PtrInduction) 5110 return false; 5111 return isScalarUse(MemAccess, Ptr); 5112 }; 5113 5114 // A helper that evaluates a memory access's use of a pointer. If the 5115 // pointer is actually the pointer induction of a loop, it is being 5116 // inserted into Worklist. If the use will be a scalar use, and the 5117 // pointer is only used by memory accesses, we place the pointer in 5118 // ScalarPtrs. Otherwise, the pointer is placed in PossibleNonScalarPtrs. 5119 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) { 5120 if (isScalarPtrInduction(MemAccess, Ptr)) { 5121 Worklist.insert(cast<Instruction>(Ptr)); 5122 LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr 5123 << "\n"); 5124 5125 Instruction *Update = cast<Instruction>( 5126 cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch)); 5127 ScalarPtrs.insert(Update); 5128 return; 5129 } 5130 // We only care about bitcast and getelementptr instructions contained in 5131 // the loop. 5132 if (!isLoopVaryingBitCastOrGEP(Ptr)) 5133 return; 5134 5135 // If the pointer has already been identified as scalar (e.g., if it was 5136 // also identified as uniform), there's nothing to do. 5137 auto *I = cast<Instruction>(Ptr); 5138 if (Worklist.count(I)) 5139 return; 5140 5141 // If the use of the pointer will be a scalar use, and all users of the 5142 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise, 5143 // place the pointer in PossibleNonScalarPtrs. 5144 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) { 5145 return isa<LoadInst>(U) || isa<StoreInst>(U); 5146 })) 5147 ScalarPtrs.insert(I); 5148 else 5149 PossibleNonScalarPtrs.insert(I); 5150 }; 5151 5152 // We seed the scalars analysis with three classes of instructions: (1) 5153 // instructions marked uniform-after-vectorization and (2) bitcast, 5154 // getelementptr and (pointer) phi instructions used by memory accesses 5155 // requiring a scalar use. 5156 // 5157 // (1) Add to the worklist all instructions that have been identified as 5158 // uniform-after-vectorization. 5159 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end()); 5160 5161 // (2) Add to the worklist all bitcast and getelementptr instructions used by 5162 // memory accesses requiring a scalar use. 
The pointer operands of loads and 5163 // stores will be scalar as long as the memory accesses is not a gather or 5164 // scatter operation. The value operand of a store will remain scalar if the 5165 // store is scalarized. 5166 for (auto *BB : TheLoop->blocks()) 5167 for (auto &I : *BB) { 5168 if (auto *Load = dyn_cast<LoadInst>(&I)) { 5169 evaluatePtrUse(Load, Load->getPointerOperand()); 5170 } else if (auto *Store = dyn_cast<StoreInst>(&I)) { 5171 evaluatePtrUse(Store, Store->getPointerOperand()); 5172 evaluatePtrUse(Store, Store->getValueOperand()); 5173 } 5174 } 5175 for (auto *I : ScalarPtrs) 5176 if (!PossibleNonScalarPtrs.count(I)) { 5177 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n"); 5178 Worklist.insert(I); 5179 } 5180 5181 // Insert the forced scalars. 5182 // FIXME: Currently widenPHIInstruction() often creates a dead vector 5183 // induction variable when the PHI user is scalarized. 5184 auto ForcedScalar = ForcedScalars.find(VF); 5185 if (ForcedScalar != ForcedScalars.end()) 5186 for (auto *I : ForcedScalar->second) 5187 Worklist.insert(I); 5188 5189 // Expand the worklist by looking through any bitcasts and getelementptr 5190 // instructions we've already identified as scalar. This is similar to the 5191 // expansion step in collectLoopUniforms(); however, here we're only 5192 // expanding to include additional bitcasts and getelementptr instructions. 5193 unsigned Idx = 0; 5194 while (Idx != Worklist.size()) { 5195 Instruction *Dst = Worklist[Idx++]; 5196 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0))) 5197 continue; 5198 auto *Src = cast<Instruction>(Dst->getOperand(0)); 5199 if (llvm::all_of(Src->users(), [&](User *U) -> bool { 5200 auto *J = cast<Instruction>(U); 5201 return !TheLoop->contains(J) || Worklist.count(J) || 5202 ((isa<LoadInst>(J) || isa<StoreInst>(J)) && 5203 isScalarUse(J, Src)); 5204 })) { 5205 Worklist.insert(Src); 5206 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); 5207 } 5208 } 5209 5210 // An induction variable will remain scalar if all users of the induction 5211 // variable and induction variable update remain scalar. 5212 for (auto &Induction : Legal->getInductionVars()) { 5213 auto *Ind = Induction.first; 5214 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 5215 5216 // If tail-folding is applied, the primary induction variable will be used 5217 // to feed a vector compare. 5218 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking()) 5219 continue; 5220 5221 // Determine if all users of the induction variable are scalar after 5222 // vectorization. 5223 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 5224 auto *I = cast<Instruction>(U); 5225 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I); 5226 }); 5227 if (!ScalarInd) 5228 continue; 5229 5230 // Determine if all users of the induction variable update instruction are 5231 // scalar after vectorization. 5232 auto ScalarIndUpdate = 5233 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 5234 auto *I = cast<Instruction>(U); 5235 return I == Ind || !TheLoop->contains(I) || Worklist.count(I); 5236 }); 5237 if (!ScalarIndUpdate) 5238 continue; 5239 5240 // The induction variable and its update instruction will remain scalar. 
5241 Worklist.insert(Ind); 5242 Worklist.insert(IndUpdate); 5243 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 5244 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 5245 << "\n"); 5246 } 5247 5248 Scalars[VF].insert(Worklist.begin(), Worklist.end()); 5249 } 5250 5251 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I) const { 5252 if (!blockNeedsPredication(I->getParent())) 5253 return false; 5254 switch(I->getOpcode()) { 5255 default: 5256 break; 5257 case Instruction::Load: 5258 case Instruction::Store: { 5259 if (!Legal->isMaskRequired(I)) 5260 return false; 5261 auto *Ptr = getLoadStorePointerOperand(I); 5262 auto *Ty = getLoadStoreType(I); 5263 const Align Alignment = getLoadStoreAlignment(I); 5264 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) || 5265 TTI.isLegalMaskedGather(Ty, Alignment)) 5266 : !(isLegalMaskedStore(Ty, Ptr, Alignment) || 5267 TTI.isLegalMaskedScatter(Ty, Alignment)); 5268 } 5269 case Instruction::UDiv: 5270 case Instruction::SDiv: 5271 case Instruction::SRem: 5272 case Instruction::URem: 5273 return mayDivideByZero(*I); 5274 } 5275 return false; 5276 } 5277 5278 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( 5279 Instruction *I, ElementCount VF) { 5280 assert(isAccessInterleaved(I) && "Expecting interleaved access."); 5281 assert(getWideningDecision(I, VF) == CM_Unknown && 5282 "Decision should not be set yet."); 5283 auto *Group = getInterleavedAccessGroup(I); 5284 assert(Group && "Must have a group."); 5285 5286 // If the instruction's allocated size doesn't equal it's type size, it 5287 // requires padding and will be scalarized. 5288 auto &DL = I->getModule()->getDataLayout(); 5289 auto *ScalarTy = getLoadStoreType(I); 5290 if (hasIrregularType(ScalarTy, DL)) 5291 return false; 5292 5293 // Check if masking is required. 5294 // A Group may need masking for one of two reasons: it resides in a block that 5295 // needs predication, or it was decided to use masking to deal with gaps 5296 // (either a gap at the end of a load-access that may result in a speculative 5297 // load, or any gaps in a store-access). 5298 bool PredicatedAccessRequiresMasking = 5299 Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I); 5300 bool LoadAccessWithGapsRequiresEpilogMasking = 5301 isa<LoadInst>(I) && Group->requiresScalarEpilogue() && 5302 !isScalarEpilogueAllowed(); 5303 bool StoreAccessWithGapsRequiresMasking = 5304 isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()); 5305 if (!PredicatedAccessRequiresMasking && 5306 !LoadAccessWithGapsRequiresEpilogMasking && 5307 !StoreAccessWithGapsRequiresMasking) 5308 return true; 5309 5310 // If masked interleaving is required, we expect that the user/target had 5311 // enabled it, because otherwise it either wouldn't have been created or 5312 // it should have been invalidated by the CostModel. 5313 assert(useMaskedInterleavedAccesses(TTI) && 5314 "Masked interleave-groups for predicated accesses are not enabled."); 5315 5316 auto *Ty = getLoadStoreType(I); 5317 const Align Alignment = getLoadStoreAlignment(I); 5318 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment) 5319 : TTI.isLegalMaskedStore(Ty, Alignment); 5320 } 5321 5322 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( 5323 Instruction *I, ElementCount VF) { 5324 // Get and ensure we have a valid memory instruction. 
5325 LoadInst *LI = dyn_cast<LoadInst>(I); 5326 StoreInst *SI = dyn_cast<StoreInst>(I); 5327 assert((LI || SI) && "Invalid memory instruction"); 5328 5329 auto *Ptr = getLoadStorePointerOperand(I); 5330 5331 // In order to be widened, the pointer should be consecutive, first of all. 5332 if (!Legal->isConsecutivePtr(Ptr)) 5333 return false; 5334 5335 // If the instruction is a store located in a predicated block, it will be 5336 // scalarized. 5337 if (isScalarWithPredication(I)) 5338 return false; 5339 5340 // If the instruction's allocated size doesn't equal it's type size, it 5341 // requires padding and will be scalarized. 5342 auto &DL = I->getModule()->getDataLayout(); 5343 auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType(); 5344 if (hasIrregularType(ScalarTy, DL)) 5345 return false; 5346 5347 return true; 5348 } 5349 5350 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { 5351 // We should not collect Uniforms more than once per VF. Right now, 5352 // this function is called from collectUniformsAndScalars(), which 5353 // already does this check. Collecting Uniforms for VF=1 does not make any 5354 // sense. 5355 5356 assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() && 5357 "This function should not be visited twice for the same VF"); 5358 5359 // Visit the list of Uniforms. If we'll not find any uniform value, we'll 5360 // not analyze again. Uniforms.count(VF) will return 1. 5361 Uniforms[VF].clear(); 5362 5363 // We now know that the loop is vectorizable! 5364 // Collect instructions inside the loop that will remain uniform after 5365 // vectorization. 5366 5367 // Global values, params and instructions outside of current loop are out of 5368 // scope. 5369 auto isOutOfScope = [&](Value *V) -> bool { 5370 Instruction *I = dyn_cast<Instruction>(V); 5371 return (!I || !TheLoop->contains(I)); 5372 }; 5373 5374 SetVector<Instruction *> Worklist; 5375 BasicBlock *Latch = TheLoop->getLoopLatch(); 5376 5377 // Instructions that are scalar with predication must not be considered 5378 // uniform after vectorization, because that would create an erroneous 5379 // replicating region where only a single instance out of VF should be formed. 5380 // TODO: optimize such seldom cases if found important, see PR40816. 5381 auto addToWorklistIfAllowed = [&](Instruction *I) -> void { 5382 if (isOutOfScope(I)) { 5383 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: " 5384 << *I << "\n"); 5385 return; 5386 } 5387 if (isScalarWithPredication(I)) { 5388 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " 5389 << *I << "\n"); 5390 return; 5391 } 5392 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n"); 5393 Worklist.insert(I); 5394 }; 5395 5396 // Start with the conditional branch. If the branch condition is an 5397 // instruction contained in the loop that is only used by the branch, it is 5398 // uniform. 5399 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 5400 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) 5401 addToWorklistIfAllowed(Cmp); 5402 5403 auto isUniformDecision = [&](Instruction *I, ElementCount VF) { 5404 InstWidening WideningDecision = getWideningDecision(I, VF); 5405 assert(WideningDecision != CM_Unknown && 5406 "Widening decision should be ready at this moment"); 5407 5408 // A uniform memory op is itself uniform. We exclude uniform stores 5409 // here as they demand the last lane, not the first one. 
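    // Illustrative example (hypothetical code, not from any test): given
    //   for (i = 0; i < n; ++i) sum += *p;
    // the load of *p reads the same address on every iteration, so only
    // lane 0 of the widened iteration is needed and the load is uniform. A
    // store to *p, by contrast, must produce the value written by the last
    // executed iteration, i.e. it demands the last lane, which is why uniform
    // stores are excluded here.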
5410 if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) { 5411 assert(WideningDecision == CM_Scalarize); 5412 return true; 5413 } 5414 5415 return (WideningDecision == CM_Widen || 5416 WideningDecision == CM_Widen_Reverse || 5417 WideningDecision == CM_Interleave); 5418 }; 5419 5420 5421 // Returns true if Ptr is the pointer operand of a memory access instruction 5422 // I, and I is known to not require scalarization. 5423 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { 5424 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); 5425 }; 5426 5427 // Holds a list of values which are known to have at least one uniform use. 5428 // Note that there may be other uses which aren't uniform. A "uniform use" 5429 // here is something which only demands lane 0 of the unrolled iterations; 5430 // it does not imply that all lanes produce the same value (e.g. this is not 5431 // the usual meaning of uniform) 5432 SetVector<Value *> HasUniformUse; 5433 5434 // Scan the loop for instructions which are either a) known to have only 5435 // lane 0 demanded or b) are uses which demand only lane 0 of their operand. 5436 for (auto *BB : TheLoop->blocks()) 5437 for (auto &I : *BB) { 5438 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) { 5439 switch (II->getIntrinsicID()) { 5440 case Intrinsic::sideeffect: 5441 case Intrinsic::experimental_noalias_scope_decl: 5442 case Intrinsic::assume: 5443 case Intrinsic::lifetime_start: 5444 case Intrinsic::lifetime_end: 5445 if (TheLoop->hasLoopInvariantOperands(&I)) 5446 addToWorklistIfAllowed(&I); 5447 break; 5448 default: 5449 break; 5450 } 5451 } 5452 5453 // ExtractValue instructions must be uniform, because the operands are 5454 // known to be loop-invariant. 5455 if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) { 5456 assert(isOutOfScope(EVI->getAggregateOperand()) && 5457 "Expected aggregate value to be loop invariant"); 5458 addToWorklistIfAllowed(EVI); 5459 continue; 5460 } 5461 5462 // If there's no pointer operand, there's nothing to do. 5463 auto *Ptr = getLoadStorePointerOperand(&I); 5464 if (!Ptr) 5465 continue; 5466 5467 // A uniform memory op is itself uniform. We exclude uniform stores 5468 // here as they demand the last lane, not the first one. 5469 if (isa<LoadInst>(I) && Legal->isUniformMemOp(I)) 5470 addToWorklistIfAllowed(&I); 5471 5472 if (isUniformDecision(&I, VF)) { 5473 assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check"); 5474 HasUniformUse.insert(Ptr); 5475 } 5476 } 5477 5478 // Add to the worklist any operands which have *only* uniform (e.g. lane 0 5479 // demanding) users. Since loops are assumed to be in LCSSA form, this 5480 // disallows uses outside the loop as well. 5481 for (auto *V : HasUniformUse) { 5482 if (isOutOfScope(V)) 5483 continue; 5484 auto *I = cast<Instruction>(V); 5485 auto UsersAreMemAccesses = 5486 llvm::all_of(I->users(), [&](User *U) -> bool { 5487 return isVectorizedMemAccessUse(cast<Instruction>(U), V); 5488 }); 5489 if (UsersAreMemAccesses) 5490 addToWorklistIfAllowed(I); 5491 } 5492 5493 // Expand Worklist in topological order: whenever a new instruction 5494 // is added , its users should be already inside Worklist. It ensures 5495 // a uniform instruction will only be used by uniform instructions. 5496 unsigned idx = 0; 5497 while (idx != Worklist.size()) { 5498 Instruction *I = Worklist[idx++]; 5499 5500 for (auto OV : I->operand_values()) { 5501 // isOutOfScope operands cannot be uniform instructions. 
5502 if (isOutOfScope(OV)) 5503 continue; 5504 // First order recurrence Phi's should typically be considered 5505 // non-uniform. 5506 auto *OP = dyn_cast<PHINode>(OV); 5507 if (OP && Legal->isFirstOrderRecurrence(OP)) 5508 continue; 5509 // If all the users of the operand are uniform, then add the 5510 // operand into the uniform worklist. 5511 auto *OI = cast<Instruction>(OV); 5512 if (llvm::all_of(OI->users(), [&](User *U) -> bool { 5513 auto *J = cast<Instruction>(U); 5514 return Worklist.count(J) || isVectorizedMemAccessUse(J, OI); 5515 })) 5516 addToWorklistIfAllowed(OI); 5517 } 5518 } 5519 5520 // For an instruction to be added into Worklist above, all its users inside 5521 // the loop should also be in Worklist. However, this condition cannot be 5522 // true for phi nodes that form a cyclic dependence. We must process phi 5523 // nodes separately. An induction variable will remain uniform if all users 5524 // of the induction variable and induction variable update remain uniform. 5525 // The code below handles both pointer and non-pointer induction variables. 5526 for (auto &Induction : Legal->getInductionVars()) { 5527 auto *Ind = Induction.first; 5528 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 5529 5530 // Determine if all users of the induction variable are uniform after 5531 // vectorization. 5532 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 5533 auto *I = cast<Instruction>(U); 5534 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 5535 isVectorizedMemAccessUse(I, Ind); 5536 }); 5537 if (!UniformInd) 5538 continue; 5539 5540 // Determine if all users of the induction variable update instruction are 5541 // uniform after vectorization. 5542 auto UniformIndUpdate = 5543 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 5544 auto *I = cast<Instruction>(U); 5545 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 5546 isVectorizedMemAccessUse(I, IndUpdate); 5547 }); 5548 if (!UniformIndUpdate) 5549 continue; 5550 5551 // The induction variable and its update instruction will remain uniform. 5552 addToWorklistIfAllowed(Ind); 5553 addToWorklistIfAllowed(IndUpdate); 5554 } 5555 5556 Uniforms[VF].insert(Worklist.begin(), Worklist.end()); 5557 } 5558 5559 bool LoopVectorizationCostModel::runtimeChecksRequired() { 5560 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); 5561 5562 if (Legal->getRuntimePointerChecking()->Need) { 5563 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz", 5564 "runtime pointer checks needed. Enable vectorization of this " 5565 "loop with '#pragma clang loop vectorize(enable)' when " 5566 "compiling with -Os/-Oz", 5567 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5568 return true; 5569 } 5570 5571 if (!PSE.getUnionPredicate().getPredicates().empty()) { 5572 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz", 5573 "runtime SCEV checks needed. Enable vectorization of this " 5574 "loop with '#pragma clang loop vectorize(enable)' when " 5575 "compiling with -Os/-Oz", 5576 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5577 return true; 5578 } 5579 5580 // FIXME: Avoid specializing for stride==1 instead of bailing out. 5581 if (!Legal->getLAI()->getSymbolicStrides().empty()) { 5582 reportVectorizationFailure("Runtime stride check for small trip count", 5583 "runtime stride == 1 checks needed. 
Enable vectorization of " 5584 "this loop without such check by compiling with -Os/-Oz", 5585 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5586 return true; 5587 } 5588 5589 return false; 5590 } 5591 5592 ElementCount 5593 LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) { 5594 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) { 5595 reportVectorizationInfo( 5596 "Disabling scalable vectorization, because target does not " 5597 "support scalable vectors.", 5598 "ScalableVectorsUnsupported", ORE, TheLoop); 5599 return ElementCount::getScalable(0); 5600 } 5601 5602 if (Hints->isScalableVectorizationDisabled()) { 5603 reportVectorizationInfo("Scalable vectorization is explicitly disabled", 5604 "ScalableVectorizationDisabled", ORE, TheLoop); 5605 return ElementCount::getScalable(0); 5606 } 5607 5608 auto MaxScalableVF = ElementCount::getScalable( 5609 std::numeric_limits<ElementCount::ScalarTy>::max()); 5610 5611 // Test that the loop-vectorizer can legalize all operations for this MaxVF. 5612 // FIXME: While for scalable vectors this is currently sufficient, this should 5613 // be replaced by a more detailed mechanism that filters out specific VFs, 5614 // instead of invalidating vectorization for a whole set of VFs based on the 5615 // MaxVF. 5616 5617 // Disable scalable vectorization if the loop contains unsupported reductions. 5618 if (!canVectorizeReductions(MaxScalableVF)) { 5619 reportVectorizationInfo( 5620 "Scalable vectorization not supported for the reduction " 5621 "operations found in this loop.", 5622 "ScalableVFUnfeasible", ORE, TheLoop); 5623 return ElementCount::getScalable(0); 5624 } 5625 5626 // Disable scalable vectorization if the loop contains any instructions 5627 // with element types not supported for scalable vectors. 5628 if (any_of(ElementTypesInLoop, [&](Type *Ty) { 5629 return !Ty->isVoidTy() && 5630 !this->TTI.isElementTypeLegalForScalableVector(Ty); 5631 })) { 5632 reportVectorizationInfo("Scalable vectorization is not supported " 5633 "for all element types found in this loop.", 5634 "ScalableVFUnfeasible", ORE, TheLoop); 5635 return ElementCount::getScalable(0); 5636 } 5637 5638 if (Legal->isSafeForAnyVectorWidth()) 5639 return MaxScalableVF; 5640 5641 // Limit MaxScalableVF by the maximum safe dependence distance. 5642 Optional<unsigned> MaxVScale = TTI.getMaxVScale(); 5643 MaxScalableVF = ElementCount::getScalable( 5644 MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0); 5645 if (!MaxScalableVF) 5646 reportVectorizationInfo( 5647 "Max legal vector width too small, scalable vectorization " 5648 "unfeasible.", 5649 "ScalableVFUnfeasible", ORE, TheLoop); 5650 5651 return MaxScalableVF; 5652 } 5653 5654 FixedScalableVFPair 5655 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount, 5656 ElementCount UserVF) { 5657 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); 5658 unsigned SmallestType, WidestType; 5659 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); 5660 5661 // Get the maximum safe dependence distance in bits computed by LAA. 5662 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from 5663 // the memory accesses that is most restrictive (involved in the smallest 5664 // dependence distance). 
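  // Worked example with made-up numbers: if LAA reports a maximum safe vector
  // width of 1088 bits and the widest type in the loop is i32, then
  //   MaxSafeElements = PowerOf2Floor(1088 / 32) = PowerOf2Floor(34) = 32,
  // i.e. at most 32 lanes may be processed per wide iteration without
  // violating the dependence that produced the bound.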
5665 unsigned MaxSafeElements = 5666 PowerOf2Floor(Legal->getMaxSafeVectorWidthInBits() / WidestType); 5667 5668 auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements); 5669 auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements); 5670 5671 LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF 5672 << ".\n"); 5673 LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF 5674 << ".\n"); 5675 5676 // First analyze the UserVF, fall back if the UserVF should be ignored. 5677 if (UserVF) { 5678 auto MaxSafeUserVF = 5679 UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF; 5680 5681 if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) { 5682 // If `VF=vscale x N` is safe, then so is `VF=N` 5683 if (UserVF.isScalable()) 5684 return FixedScalableVFPair( 5685 ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF); 5686 else 5687 return UserVF; 5688 } 5689 5690 assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF)); 5691 5692 // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it 5693 // is better to ignore the hint and let the compiler choose a suitable VF. 5694 if (!UserVF.isScalable()) { 5695 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 5696 << " is unsafe, clamping to max safe VF=" 5697 << MaxSafeFixedVF << ".\n"); 5698 ORE->emit([&]() { 5699 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 5700 TheLoop->getStartLoc(), 5701 TheLoop->getHeader()) 5702 << "User-specified vectorization factor " 5703 << ore::NV("UserVectorizationFactor", UserVF) 5704 << " is unsafe, clamping to maximum safe vectorization factor " 5705 << ore::NV("VectorizationFactor", MaxSafeFixedVF); 5706 }); 5707 return MaxSafeFixedVF; 5708 } 5709 5710 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 5711 << " is unsafe. Ignoring scalable UserVF.\n"); 5712 ORE->emit([&]() { 5713 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 5714 TheLoop->getStartLoc(), 5715 TheLoop->getHeader()) 5716 << "User-specified vectorization factor " 5717 << ore::NV("UserVectorizationFactor", UserVF) 5718 << " is unsafe. Ignoring the hint to let the compiler pick a " 5719 "suitable VF."; 5720 }); 5721 } 5722 5723 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType 5724 << " / " << WidestType << " bits.\n"); 5725 5726 FixedScalableVFPair Result(ElementCount::getFixed(1), 5727 ElementCount::getScalable(0)); 5728 if (auto MaxVF = getMaximizedVFForTarget(ConstTripCount, SmallestType, 5729 WidestType, MaxSafeFixedVF)) 5730 Result.FixedVF = MaxVF; 5731 5732 if (auto MaxVF = getMaximizedVFForTarget(ConstTripCount, SmallestType, 5733 WidestType, MaxSafeScalableVF)) 5734 if (MaxVF.isScalable()) { 5735 Result.ScalableVF = MaxVF; 5736 LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF 5737 << "\n"); 5738 } 5739 5740 return Result; 5741 } 5742 5743 FixedScalableVFPair 5744 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { 5745 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { 5746 // TODO: It may by useful to do since it's still likely to be dynamically 5747 // uniform if the target can skip. 5748 reportVectorizationFailure( 5749 "Not inserting runtime ptr check for divergent target", 5750 "runtime pointer checks needed. 
Not enabled for divergent target",
        "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
    return FixedScalableVFPair::getNone();
  }

  unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
  LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
  if (TC == 1) {
    reportVectorizationFailure("Single iteration (non) loop",
        "loop trip count is one, irrelevant for vectorization",
        "SingleIterationLoop", ORE, TheLoop);
    return FixedScalableVFPair::getNone();
  }

  switch (ScalarEpilogueStatus) {
  case CM_ScalarEpilogueAllowed:
    return computeFeasibleMaxVF(TC, UserVF);
  case CM_ScalarEpilogueNotAllowedUsePredicate:
    LLVM_FALLTHROUGH;
  case CM_ScalarEpilogueNotNeededUsePredicate:
    LLVM_DEBUG(
        dbgs() << "LV: vector predicate hint/switch found.\n"
               << "LV: Not allowing scalar epilogue, creating predicated "
               << "vector loop.\n");
    break;
  case CM_ScalarEpilogueNotAllowedLowTripLoop:
    // fallthrough as a special case of OptForSize
  case CM_ScalarEpilogueNotAllowedOptSize:
    if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
      LLVM_DEBUG(
          dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
    else
      LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
                        << "count.\n");

    // Bail if runtime checks are required, which are not good when optimising
    // for size.
    if (runtimeChecksRequired())
      return FixedScalableVFPair::getNone();

    break;
  }

  // The only loops we can vectorize without a scalar epilogue are loops with
  // a bottom-test and a single exiting block. We'd have to handle the fact
  // that not every instruction executes on the last iteration. This will
  // require a lane mask which varies through the vector loop body. (TODO)
  if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
    // If there was a tail-folding hint/switch, but we can't fold the tail by
    // masking, fall back to a vectorization with a scalar epilogue.
    if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
      LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
                           "scalar epilogue instead.\n");
      ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
      return computeFeasibleMaxVF(TC, UserVF);
    }
    return FixedScalableVFPair::getNone();
  }

  // Now try the tail folding.

  // Invalidate interleave groups that require an epilogue if we can't mask
  // the interleave-group.
  if (!useMaskedInterleavedAccesses(TTI)) {
    assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
           "No decisions should have been taken at this point");
    // Note: There is no need to invalidate any cost modeling decisions here,
    // as none were taken so far.
    InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
  }

  FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF);
  // Avoid tail folding if the trip count is known to be a multiple of any VF
  // we chose.
  // FIXME: The condition below pessimises the case for fixed-width vectors,
  // when scalable VFs are also candidates for vectorization.
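  // Worked example with made-up numbers: with MaxFixedVF = 8 and UserIC = 2,
  // MaxVFtimesIC below is 16. If SCEV can prove a trip count of 128, then
  // 128 urem 16 == 0; since every candidate fixed VF is a power of two no
  // larger than 8, the trip count is a multiple of each of them as well, so
  // no tail remains and tail folding can be skipped.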
5826 if (MaxFactors.FixedVF.isVector() && !MaxFactors.ScalableVF) { 5827 ElementCount MaxFixedVF = MaxFactors.FixedVF; 5828 assert((UserVF.isNonZero() || isPowerOf2_32(MaxFixedVF.getFixedValue())) && 5829 "MaxFixedVF must be a power of 2"); 5830 unsigned MaxVFtimesIC = UserIC ? MaxFixedVF.getFixedValue() * UserIC 5831 : MaxFixedVF.getFixedValue(); 5832 ScalarEvolution *SE = PSE.getSE(); 5833 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 5834 const SCEV *ExitCount = SE->getAddExpr( 5835 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 5836 const SCEV *Rem = SE->getURemExpr( 5837 SE->applyLoopGuards(ExitCount, TheLoop), 5838 SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC)); 5839 if (Rem->isZero()) { 5840 // Accept MaxFixedVF if we do not have a tail. 5841 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); 5842 return MaxFactors; 5843 } 5844 } 5845 5846 // For scalable vectors, don't use tail folding as this is currently not yet 5847 // supported. The code is likely to have ended up here if the tripcount is 5848 // low, in which case it makes sense not to use scalable vectors. 5849 if (MaxFactors.ScalableVF.isVector()) 5850 MaxFactors.ScalableVF = ElementCount::getScalable(0); 5851 5852 // If we don't know the precise trip count, or if the trip count that we 5853 // found modulo the vectorization factor is not zero, try to fold the tail 5854 // by masking. 5855 // FIXME: look for a smaller MaxVF that does divide TC rather than masking. 5856 if (Legal->prepareToFoldTailByMasking()) { 5857 FoldTailByMasking = true; 5858 return MaxFactors; 5859 } 5860 5861 // If there was a tail-folding hint/switch, but we can't fold the tail by 5862 // masking, fallback to a vectorization with a scalar epilogue. 5863 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { 5864 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " 5865 "scalar epilogue instead.\n"); 5866 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 5867 return MaxFactors; 5868 } 5869 5870 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) { 5871 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n"); 5872 return FixedScalableVFPair::getNone(); 5873 } 5874 5875 if (TC == 0) { 5876 reportVectorizationFailure( 5877 "Unable to calculate the loop count due to complex control flow", 5878 "unable to calculate the loop count due to complex control flow", 5879 "UnknownLoopCountComplexCFG", ORE, TheLoop); 5880 return FixedScalableVFPair::getNone(); 5881 } 5882 5883 reportVectorizationFailure( 5884 "Cannot optimize for size and vectorize at the same time.", 5885 "cannot optimize for size and vectorize at the same time. " 5886 "Enable vectorization of this loop with '#pragma clang loop " 5887 "vectorize(enable)' when compiling with -Os/-Oz", 5888 "NoTailLoopWithOptForSize", ORE, TheLoop); 5889 return FixedScalableVFPair::getNone(); 5890 } 5891 5892 ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( 5893 unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType, 5894 const ElementCount &MaxSafeVF) { 5895 bool ComputeScalableMaxVF = MaxSafeVF.isScalable(); 5896 TypeSize WidestRegister = TTI.getRegisterBitWidth( 5897 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector 5898 : TargetTransformInfo::RGK_FixedWidthVector); 5899 5900 // Convenience function to return the minimum of two ElementCounts. 
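  // For example, MinVF(fixed 4, fixed 8) yields fixed 4 and
  // MinVF(scalable 2, scalable 4) yields scalable 2; comparing a fixed count
  // against a scalable one is not meaningful, which the assert below rejects.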
  auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
    assert((LHS.isScalable() == RHS.isScalable()) &&
           "Scalable flags must match");
    return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS;
  };

  // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
  // Note that both WidestRegister and WidestType may not be powers of 2.
  auto MaxVectorElementCount = ElementCount::get(
      PowerOf2Floor(WidestRegister.getKnownMinSize() / WidestType),
      ComputeScalableMaxVF);
  MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
  LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
                    << (MaxVectorElementCount * WidestType) << " bits.\n");

  if (!MaxVectorElementCount) {
    LLVM_DEBUG(dbgs() << "LV: The target has no "
                      << (ComputeScalableMaxVF ? "scalable" : "fixed")
                      << " vector registers.\n");
    return ElementCount::getFixed(1);
  }

  const auto TripCountEC = ElementCount::getFixed(ConstTripCount);
  if (ConstTripCount &&
      ElementCount::isKnownLE(TripCountEC, MaxVectorElementCount) &&
      isPowerOf2_32(ConstTripCount)) {
    // We need to clamp the VF to the ConstTripCount. There is no point in
    // choosing a higher viable VF as done in the loop below. If
    // MaxVectorElementCount is scalable, we only fall back on a fixed VF when
    // the TC is less than or equal to the known number of lanes.
    LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
                      << ConstTripCount << "\n");
    return TripCountEC;
  }

  ElementCount MaxVF = MaxVectorElementCount;
  if (TTI.shouldMaximizeVectorBandwidth() ||
      (MaximizeBandwidth && isScalarEpilogueAllowed())) {
    auto MaxVectorElementCountMaxBW = ElementCount::get(
        PowerOf2Floor(WidestRegister.getKnownMinSize() / SmallestType),
        ComputeScalableMaxVF);
    MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);

    // Collect all viable vectorization factors larger than the default MaxVF
    // (i.e. MaxVectorElementCount).
    SmallVector<ElementCount, 8> VFs;
    for (ElementCount VS = MaxVectorElementCount * 2;
         ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2)
      VFs.push_back(VS);

    // For each VF calculate its register usage.
    auto RUs = calculateRegisterUsage(VFs);

    // Select the largest VF which doesn't require more registers than existing
    // ones.
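    // Illustrative example: if the candidate VFs collected above are
    // {32, 64, 128} and the register-usage estimate only exceeds the target's
    // register file at VF = 128, the backwards scan below first rejects 128
    // and then settles on MaxVF = 64.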
5956 for (int i = RUs.size() - 1; i >= 0; --i) { 5957 bool Selected = true; 5958 for (auto &pair : RUs[i].MaxLocalUsers) { 5959 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5960 if (pair.second > TargetNumRegisters) 5961 Selected = false; 5962 } 5963 if (Selected) { 5964 MaxVF = VFs[i]; 5965 break; 5966 } 5967 } 5968 if (ElementCount MinVF = 5969 TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) { 5970 if (ElementCount::isKnownLT(MaxVF, MinVF)) { 5971 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF 5972 << ") with target's minimum: " << MinVF << '\n'); 5973 MaxVF = MinVF; 5974 } 5975 } 5976 } 5977 return MaxVF; 5978 } 5979 5980 bool LoopVectorizationCostModel::isMoreProfitable( 5981 const VectorizationFactor &A, const VectorizationFactor &B) const { 5982 InstructionCost CostA = A.Cost; 5983 InstructionCost CostB = B.Cost; 5984 5985 unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop); 5986 5987 if (!A.Width.isScalable() && !B.Width.isScalable() && FoldTailByMasking && 5988 MaxTripCount) { 5989 // If we are folding the tail and the trip count is a known (possibly small) 5990 // constant, the trip count will be rounded up to an integer number of 5991 // iterations. The total cost will be PerIterationCost*ceil(TripCount/VF), 5992 // which we compare directly. When not folding the tail, the total cost will 5993 // be PerIterationCost*floor(TC/VF) + Scalar remainder cost, and so is 5994 // approximated with the per-lane cost below instead of using the tripcount 5995 // as here. 5996 auto RTCostA = CostA * divideCeil(MaxTripCount, A.Width.getFixedValue()); 5997 auto RTCostB = CostB * divideCeil(MaxTripCount, B.Width.getFixedValue()); 5998 return RTCostA < RTCostB; 5999 } 6000 6001 // When set to preferred, for now assume vscale may be larger than 1, so 6002 // that scalable vectorization is slightly favorable over fixed-width 6003 // vectorization. 6004 if (Hints->isScalableVectorizationPreferred()) 6005 if (A.Width.isScalable() && !B.Width.isScalable()) 6006 return (CostA * B.Width.getKnownMinValue()) <= 6007 (CostB * A.Width.getKnownMinValue()); 6008 6009 // To avoid the need for FP division: 6010 // (CostA / A.Width) < (CostB / B.Width) 6011 // <=> (CostA * B.Width) < (CostB * A.Width) 6012 return (CostA * B.Width.getKnownMinValue()) < 6013 (CostB * A.Width.getKnownMinValue()); 6014 } 6015 6016 VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor( 6017 const ElementCountSet &VFCandidates) { 6018 InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first; 6019 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n"); 6020 assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop"); 6021 assert(VFCandidates.count(ElementCount::getFixed(1)) && 6022 "Expected Scalar VF to be a candidate"); 6023 6024 const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost); 6025 VectorizationFactor ChosenFactor = ScalarCost; 6026 6027 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; 6028 if (ForceVectorization && VFCandidates.size() > 1) { 6029 // Ignore scalar width, because the user explicitly wants vectorization. 6030 // Initialize cost to max so that VF = 2 is, at least, chosen during cost 6031 // evaluation. 
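    // With the cost seeded to the maximum value, the first vector candidate
    // examined below is always treated as an improvement, so some vector VF
    // is chosen even when its computed cost exceeds that of the scalar loop.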
6032 ChosenFactor.Cost = InstructionCost::getMax(); 6033 } 6034 6035 SmallVector<InstructionVFPair> InvalidCosts; 6036 for (const auto &i : VFCandidates) { 6037 // The cost for scalar VF=1 is already calculated, so ignore it. 6038 if (i.isScalar()) 6039 continue; 6040 6041 VectorizationCostTy C = expectedCost(i, &InvalidCosts); 6042 VectorizationFactor Candidate(i, C.first); 6043 LLVM_DEBUG( 6044 dbgs() << "LV: Vector loop of width " << i << " costs: " 6045 << (Candidate.Cost / Candidate.Width.getKnownMinValue()) 6046 << (i.isScalable() ? " (assuming a minimum vscale of 1)" : "") 6047 << ".\n"); 6048 6049 if (!C.second && !ForceVectorization) { 6050 LLVM_DEBUG( 6051 dbgs() << "LV: Not considering vector loop of width " << i 6052 << " because it will not generate any vector instructions.\n"); 6053 continue; 6054 } 6055 6056 // If profitable add it to ProfitableVF list. 6057 if (isMoreProfitable(Candidate, ScalarCost)) 6058 ProfitableVFs.push_back(Candidate); 6059 6060 if (isMoreProfitable(Candidate, ChosenFactor)) 6061 ChosenFactor = Candidate; 6062 } 6063 6064 // Emit a report of VFs with invalid costs in the loop. 6065 if (!InvalidCosts.empty()) { 6066 // Group the remarks per instruction, keeping the instruction order from 6067 // InvalidCosts. 6068 std::map<Instruction *, unsigned> Numbering; 6069 unsigned I = 0; 6070 for (auto &Pair : InvalidCosts) 6071 if (!Numbering.count(Pair.first)) 6072 Numbering[Pair.first] = I++; 6073 6074 // Sort the list, first on instruction(number) then on VF. 6075 llvm::sort(InvalidCosts, 6076 [&Numbering](InstructionVFPair &A, InstructionVFPair &B) { 6077 if (Numbering[A.first] != Numbering[B.first]) 6078 return Numbering[A.first] < Numbering[B.first]; 6079 ElementCountComparator ECC; 6080 return ECC(A.second, B.second); 6081 }); 6082 6083 // For a list of ordered instruction-vf pairs: 6084 // [(load, vf1), (load, vf2), (store, vf1)] 6085 // Group the instructions together to emit separate remarks for: 6086 // load (vf1, vf2) 6087 // store (vf1) 6088 auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts); 6089 auto Subset = ArrayRef<InstructionVFPair>(); 6090 do { 6091 if (Subset.empty()) 6092 Subset = Tail.take_front(1); 6093 6094 Instruction *I = Subset.front().first; 6095 6096 // If the next instruction is different, or if there are no other pairs, 6097 // emit a remark for the collated subset. e.g. 6098 // [(load, vf1), (load, vf2))] 6099 // to emit: 6100 // remark: invalid costs for 'load' at VF=(vf, vf2) 6101 if (Subset == Tail || Tail[Subset.size()].first != I) { 6102 std::string OutString; 6103 raw_string_ostream OS(OutString); 6104 assert(!Subset.empty() && "Unexpected empty range"); 6105 OS << "Instruction with invalid costs prevented vectorization at VF=("; 6106 for (auto &Pair : Subset) 6107 OS << (Pair.second == Subset.front().second ? 
"" : ", ") 6108 << Pair.second; 6109 OS << "):"; 6110 if (auto *CI = dyn_cast<CallInst>(I)) 6111 OS << " call to " << CI->getCalledFunction()->getName(); 6112 else 6113 OS << " " << I->getOpcodeName(); 6114 OS.flush(); 6115 reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I); 6116 Tail = Tail.drop_front(Subset.size()); 6117 Subset = {}; 6118 } else 6119 // Grow the subset by one element 6120 Subset = Tail.take_front(Subset.size() + 1); 6121 } while (!Tail.empty()); 6122 } 6123 6124 if (!EnableCondStoresVectorization && NumPredStores) { 6125 reportVectorizationFailure("There are conditional stores.", 6126 "store that is conditionally executed prevents vectorization", 6127 "ConditionalStore", ORE, TheLoop); 6128 ChosenFactor = ScalarCost; 6129 } 6130 6131 LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() && 6132 ChosenFactor.Cost >= ScalarCost.Cost) dbgs() 6133 << "LV: Vectorization seems to be not beneficial, " 6134 << "but was forced by a user.\n"); 6135 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n"); 6136 return ChosenFactor; 6137 } 6138 6139 bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization( 6140 const Loop &L, ElementCount VF) const { 6141 // Cross iteration phis such as reductions need special handling and are 6142 // currently unsupported. 6143 if (any_of(L.getHeader()->phis(), [&](PHINode &Phi) { 6144 return Legal->isFirstOrderRecurrence(&Phi) || 6145 Legal->isReductionVariable(&Phi); 6146 })) 6147 return false; 6148 6149 // Phis with uses outside of the loop require special handling and are 6150 // currently unsupported. 6151 for (auto &Entry : Legal->getInductionVars()) { 6152 // Look for uses of the value of the induction at the last iteration. 6153 Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch()); 6154 for (User *U : PostInc->users()) 6155 if (!L.contains(cast<Instruction>(U))) 6156 return false; 6157 // Look for uses of penultimate value of the induction. 6158 for (User *U : Entry.first->users()) 6159 if (!L.contains(cast<Instruction>(U))) 6160 return false; 6161 } 6162 6163 // Induction variables that are widened require special handling that is 6164 // currently not supported. 6165 if (any_of(Legal->getInductionVars(), [&](auto &Entry) { 6166 return !(this->isScalarAfterVectorization(Entry.first, VF) || 6167 this->isProfitableToScalarize(Entry.first, VF)); 6168 })) 6169 return false; 6170 6171 // Epilogue vectorization code has not been auditted to ensure it handles 6172 // non-latch exits properly. It may be fine, but it needs auditted and 6173 // tested. 6174 if (L.getExitingBlock() != L.getLoopLatch()) 6175 return false; 6176 6177 return true; 6178 } 6179 6180 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable( 6181 const ElementCount VF) const { 6182 // FIXME: We need a much better cost-model to take different parameters such 6183 // as register pressure, code size increase and cost of extra branches into 6184 // account. For now we apply a very crude heuristic and only consider loops 6185 // with vectorization factors larger than a certain value. 6186 // We also consider epilogue vectorization unprofitable for targets that don't 6187 // consider interleaving beneficial (eg. MVE). 
6188 if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1) 6189 return false; 6190 if (VF.getFixedValue() >= EpilogueVectorizationMinVF) 6191 return true; 6192 return false; 6193 } 6194 6195 VectorizationFactor 6196 LoopVectorizationCostModel::selectEpilogueVectorizationFactor( 6197 const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) { 6198 VectorizationFactor Result = VectorizationFactor::Disabled(); 6199 if (!EnableEpilogueVectorization) { 6200 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";); 6201 return Result; 6202 } 6203 6204 if (!isScalarEpilogueAllowed()) { 6205 LLVM_DEBUG( 6206 dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is " 6207 "allowed.\n";); 6208 return Result; 6209 } 6210 6211 // FIXME: This can be fixed for scalable vectors later, because at this stage 6212 // the LoopVectorizer will only consider vectorizing a loop with scalable 6213 // vectors when the loop has a hint to enable vectorization for a given VF. 6214 if (MainLoopVF.isScalable()) { 6215 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization for scalable vectors not " 6216 "yet supported.\n"); 6217 return Result; 6218 } 6219 6220 // Not really a cost consideration, but check for unsupported cases here to 6221 // simplify the logic. 6222 if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) { 6223 LLVM_DEBUG( 6224 dbgs() << "LEV: Unable to vectorize epilogue because the loop is " 6225 "not a supported candidate.\n";); 6226 return Result; 6227 } 6228 6229 if (EpilogueVectorizationForceVF > 1) { 6230 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";); 6231 if (LVP.hasPlanWithVFs( 6232 {MainLoopVF, ElementCount::getFixed(EpilogueVectorizationForceVF)})) 6233 return {ElementCount::getFixed(EpilogueVectorizationForceVF), 0}; 6234 else { 6235 LLVM_DEBUG( 6236 dbgs() 6237 << "LEV: Epilogue vectorization forced factor is not viable.\n";); 6238 return Result; 6239 } 6240 } 6241 6242 if (TheLoop->getHeader()->getParent()->hasOptSize() || 6243 TheLoop->getHeader()->getParent()->hasMinSize()) { 6244 LLVM_DEBUG( 6245 dbgs() 6246 << "LEV: Epilogue vectorization skipped due to opt for size.\n";); 6247 return Result; 6248 } 6249 6250 if (!isEpilogueVectorizationProfitable(MainLoopVF)) 6251 return Result; 6252 6253 for (auto &NextVF : ProfitableVFs) 6254 if (ElementCount::isKnownLT(NextVF.Width, MainLoopVF) && 6255 (Result.Width.getFixedValue() == 1 || 6256 isMoreProfitable(NextVF, Result)) && 6257 LVP.hasPlanWithVFs({MainLoopVF, NextVF.Width})) 6258 Result = NextVF; 6259 6260 if (Result != VectorizationFactor::Disabled()) 6261 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = " 6262 << Result.Width.getFixedValue() << "\n";); 6263 return Result; 6264 } 6265 6266 std::pair<unsigned, unsigned> 6267 LoopVectorizationCostModel::getSmallestAndWidestTypes() { 6268 unsigned MinWidth = -1U; 6269 unsigned MaxWidth = 8; 6270 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 6271 for (Type *T : ElementTypesInLoop) { 6272 MinWidth = std::min<unsigned>( 6273 MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize()); 6274 MaxWidth = std::max<unsigned>( 6275 MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize()); 6276 } 6277 return {MinWidth, MaxWidth}; 6278 } 6279 6280 void LoopVectorizationCostModel::collectElementTypesForWidening() { 6281 ElementTypesInLoop.clear(); 6282 // For each block. 6283 for (BasicBlock *BB : TheLoop->blocks()) { 6284 // For each instruction in the loop. 
6285 for (Instruction &I : BB->instructionsWithoutDebug()) { 6286 Type *T = I.getType(); 6287 6288 // Skip ignored values. 6289 if (ValuesToIgnore.count(&I)) 6290 continue; 6291 6292 // Only examine Loads, Stores and PHINodes. 6293 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I)) 6294 continue; 6295 6296 // Examine PHI nodes that are reduction variables. Update the type to 6297 // account for the recurrence type. 6298 if (auto *PN = dyn_cast<PHINode>(&I)) { 6299 if (!Legal->isReductionVariable(PN)) 6300 continue; 6301 const RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[PN]; 6302 if (PreferInLoopReductions || useOrderedReductions(RdxDesc) || 6303 TTI.preferInLoopReduction(RdxDesc.getOpcode(), 6304 RdxDesc.getRecurrenceType(), 6305 TargetTransformInfo::ReductionFlags())) 6306 continue; 6307 T = RdxDesc.getRecurrenceType(); 6308 } 6309 6310 // Examine the stored values. 6311 if (auto *ST = dyn_cast<StoreInst>(&I)) 6312 T = ST->getValueOperand()->getType(); 6313 6314 // Ignore loaded pointer types and stored pointer types that are not 6315 // vectorizable. 6316 // 6317 // FIXME: The check here attempts to predict whether a load or store will 6318 // be vectorized. We only know this for certain after a VF has 6319 // been selected. Here, we assume that if an access can be 6320 // vectorized, it will be. We should also look at extending this 6321 // optimization to non-pointer types. 6322 // 6323 if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) && 6324 !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I)) 6325 continue; 6326 6327 ElementTypesInLoop.insert(T); 6328 } 6329 } 6330 } 6331 6332 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, 6333 unsigned LoopCost) { 6334 // -- The interleave heuristics -- 6335 // We interleave the loop in order to expose ILP and reduce the loop overhead. 6336 // There are many micro-architectural considerations that we can't predict 6337 // at this level. For example, frontend pressure (on decode or fetch) due to 6338 // code size, or the number and capabilities of the execution ports. 6339 // 6340 // We use the following heuristics to select the interleave count: 6341 // 1. If the code has reductions, then we interleave to break the cross 6342 // iteration dependency. 6343 // 2. If the loop is really small, then we interleave to reduce the loop 6344 // overhead. 6345 // 3. We don't interleave if we think that we will spill registers to memory 6346 // due to the increased register pressure. 6347 6348 if (!isScalarEpilogueAllowed()) 6349 return 1; 6350 6351 // We used the distance for the interleave count. 6352 if (Legal->getMaxSafeDepDistBytes() != -1U) 6353 return 1; 6354 6355 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); 6356 const bool HasReductions = !Legal->getReductionVars().empty(); 6357 // Do not interleave loops with a relatively small known or estimated trip 6358 // count. But we will interleave when InterleaveSmallLoopScalarReduction is 6359 // enabled, and the code has scalar reductions(HasReductions && VF = 1), 6360 // because with the above conditions interleaving can expose ILP and break 6361 // cross iteration dependences for reductions. 
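  // For instance (illustrative), a scalar reduction loop such as
  //   for (i = 0; i < 100; ++i) s += a[i];
  // carries a dependence through s; interleaving by 2 keeps two partial sums
  // that are combined after the loop, roughly halving the length of the
  // dependence chain even though the trip count is modest.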
6362 if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) && 6363 !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar())) 6364 return 1; 6365 6366 RegisterUsage R = calculateRegisterUsage({VF})[0]; 6367 // We divide by these constants so assume that we have at least one 6368 // instruction that uses at least one register. 6369 for (auto& pair : R.MaxLocalUsers) { 6370 pair.second = std::max(pair.second, 1U); 6371 } 6372 6373 // We calculate the interleave count using the following formula. 6374 // Subtract the number of loop invariants from the number of available 6375 // registers. These registers are used by all of the interleaved instances. 6376 // Next, divide the remaining registers by the number of registers that is 6377 // required by the loop, in order to estimate how many parallel instances 6378 // fit without causing spills. All of this is rounded down if necessary to be 6379 // a power of two. We want power of two interleave count to simplify any 6380 // addressing operations or alignment considerations. 6381 // We also want power of two interleave counts to ensure that the induction 6382 // variable of the vector loop wraps to zero, when tail is folded by masking; 6383 // this currently happens when OptForSize, in which case IC is set to 1 above. 6384 unsigned IC = UINT_MAX; 6385 6386 for (auto& pair : R.MaxLocalUsers) { 6387 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 6388 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters 6389 << " registers of " 6390 << TTI.getRegisterClassName(pair.first) << " register class\n"); 6391 if (VF.isScalar()) { 6392 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) 6393 TargetNumRegisters = ForceTargetNumScalarRegs; 6394 } else { 6395 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) 6396 TargetNumRegisters = ForceTargetNumVectorRegs; 6397 } 6398 unsigned MaxLocalUsers = pair.second; 6399 unsigned LoopInvariantRegs = 0; 6400 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) 6401 LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; 6402 6403 unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers); 6404 // Don't count the induction variable as interleaved. 6405 if (EnableIndVarRegisterHeur) { 6406 TmpIC = 6407 PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) / 6408 std::max(1U, (MaxLocalUsers - 1))); 6409 } 6410 6411 IC = std::min(IC, TmpIC); 6412 } 6413 6414 // Clamp the interleave ranges to reasonable counts. 6415 unsigned MaxInterleaveCount = 6416 TTI.getMaxInterleaveFactor(VF.getKnownMinValue()); 6417 6418 // Check if the user has overridden the max. 6419 if (VF.isScalar()) { 6420 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) 6421 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; 6422 } else { 6423 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) 6424 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; 6425 } 6426 6427 // If trip count is known or estimated compile time constant, limit the 6428 // interleave count to be less than the trip count divided by VF, provided it 6429 // is at least 1. 6430 // 6431 // For scalable vectors we can't know if interleaving is beneficial. It may 6432 // not be beneficial for small loops if none of the lanes in the second vector 6433 // iterations is enabled. However, for larger loops, there is likely to be a 6434 // similar benefit as for fixed-width vectors. 
For now, we choose to leave 6435 // the InterleaveCount as if vscale is '1', although if some information about 6436 // the vector is known (e.g. min vector size), we can make a better decision. 6437 if (BestKnownTC) { 6438 MaxInterleaveCount = 6439 std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount); 6440 // Make sure MaxInterleaveCount is greater than 0. 6441 MaxInterleaveCount = std::max(1u, MaxInterleaveCount); 6442 } 6443 6444 assert(MaxInterleaveCount > 0 && 6445 "Maximum interleave count must be greater than 0"); 6446 6447 // Clamp the calculated IC to be between the 1 and the max interleave count 6448 // that the target and trip count allows. 6449 if (IC > MaxInterleaveCount) 6450 IC = MaxInterleaveCount; 6451 else 6452 // Make sure IC is greater than 0. 6453 IC = std::max(1u, IC); 6454 6455 assert(IC > 0 && "Interleave count must be greater than 0."); 6456 6457 // If we did not calculate the cost for VF (because the user selected the VF) 6458 // then we calculate the cost of VF here. 6459 if (LoopCost == 0) { 6460 InstructionCost C = expectedCost(VF).first; 6461 assert(C.isValid() && "Expected to have chosen a VF with valid cost"); 6462 LoopCost = *C.getValue(); 6463 } 6464 6465 assert(LoopCost && "Non-zero loop cost expected"); 6466 6467 // Interleave if we vectorized this loop and there is a reduction that could 6468 // benefit from interleaving. 6469 if (VF.isVector() && HasReductions) { 6470 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); 6471 return IC; 6472 } 6473 6474 // Note that if we've already vectorized the loop we will have done the 6475 // runtime check and so interleaving won't require further checks. 6476 bool InterleavingRequiresRuntimePointerCheck = 6477 (VF.isScalar() && Legal->getRuntimePointerChecking()->Need); 6478 6479 // We want to interleave small loops in order to reduce the loop overhead and 6480 // potentially expose ILP opportunities. 6481 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n' 6482 << "LV: IC is " << IC << '\n' 6483 << "LV: VF is " << VF << '\n'); 6484 const bool AggressivelyInterleaveReductions = 6485 TTI.enableAggressiveInterleaving(HasReductions); 6486 if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) { 6487 // We assume that the cost overhead is 1 and we use the cost model 6488 // to estimate the cost of the loop and interleave until the cost of the 6489 // loop overhead is about 5% of the cost of the loop. 6490 unsigned SmallIC = 6491 std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost)); 6492 6493 // Interleave until store/load ports (estimated by max interleave count) are 6494 // saturated. 6495 unsigned NumStores = Legal->getNumStores(); 6496 unsigned NumLoads = Legal->getNumLoads(); 6497 unsigned StoresIC = IC / (NumStores ? NumStores : 1); 6498 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1); 6499 6500 // If we have a scalar reduction (vector reductions are already dealt with 6501 // by this point), we can increase the critical path length if the loop 6502 // we're interleaving is inside another loop. For tree-wise reductions 6503 // set the limit to 2, and for ordered reductions it's best to disable 6504 // interleaving entirely. 
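    // For example (illustrative): an inner reduction loop that runs for only
    // a handful of iterations on every outer iteration pays the cost of
    // combining its partial sums once per outer iteration, so a large
    // interleave count can cost more than the ILP it exposes; hence the cap
    // below, and no interleaving at all for ordered reductions.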
    if (HasReductions && TheLoop->getLoopDepth() > 1) {
      bool HasOrderedReductions =
          any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
            const RecurrenceDescriptor &RdxDesc = Reduction.second;
            return RdxDesc.isOrdered();
          });
      if (HasOrderedReductions) {
        LLVM_DEBUG(
            dbgs() << "LV: Not interleaving scalar ordered reductions.\n");
        return 1;
      }

      unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
      SmallIC = std::min(SmallIC, F);
      StoresIC = std::min(StoresIC, F);
      LoadsIC = std::min(LoadsIC, F);
    }

    if (EnableLoadStoreRuntimeInterleave &&
        std::max(StoresIC, LoadsIC) > SmallIC) {
      LLVM_DEBUG(
          dbgs() << "LV: Interleaving to saturate store or load ports.\n");
      return std::max(StoresIC, LoadsIC);
    }

    // If there are scalar reductions and TTI has enabled aggressive
    // interleaving for reductions, we will interleave to expose ILP.
    if (InterleaveSmallLoopScalarReduction && VF.isScalar() &&
        AggressivelyInterleaveReductions) {
      LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
      // Interleave no less than SmallIC but not as aggressive as the normal IC
      // to satisfy the rare situation when resources are too limited.
      return std::max(IC / 2, SmallIC);
    } else {
      LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
      return SmallIC;
    }
  }

  // Interleave if this is a large loop (small loops are already dealt with by
  // this point) that could benefit from interleaving.
  if (AggressivelyInterleaveReductions) {
    LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
    return IC;
  }

  LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
  return 1;
}

SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
  // This function calculates the register usage by measuring the highest number
  // of values that are alive at a single location. Obviously, this is a very
  // rough estimation. We scan the loop in topological order and assign a
  // number to each instruction. We use RPO to ensure that defs are
  // met before their users. We assume that each instruction that has in-loop
  // users starts an interval. We record every time that an in-loop value is
  // used, so we have a list of the first and last occurrences of each
  // instruction. Next, we transpose this data structure into a multi map that
  // holds the list of intervals that *end* at a specific location. This multi
  // map allows us to perform a linear search. We scan the instructions linearly
  // and record each time that a new interval starts, by placing it in a set.
  // If we find this value in the multi-map then we remove it from the set.
  // The max register usage is the maximum size of the set.
  // We also search for instructions that are defined outside the loop, but are
  // used inside the loop. We need this number separately from the max-interval
  // usage number because when we unroll, loop-invariant values do not take
  // more registers.
  LoopBlocksDFS DFS(TheLoop);
  DFS.perform(LI);

  RegisterUsage RU;

  // Each 'key' in the map opens a new interval. The values
  // of the map are the index of the 'last seen' usage of the
  // instruction that is the key.
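  // Tiny worked example (illustrative SSA-style pseudo code, not real IR):
  //   0: %a = load A
  //   1: %b = load B
  //   2: %c = add %a, %b
  //   3: store %c, C
  // %a and %b are last used by the add, so both intervals are still open when
  // index 2 is visited; the peak count for this snippet is therefore two
  // simultaneously live values (tracked per register class, and scaled by the
  // VF for values that become vectors).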
6582 using IntervalMap = DenseMap<Instruction *, unsigned>; 6583 6584 // Maps instruction to its index. 6585 SmallVector<Instruction *, 64> IdxToInstr; 6586 // Marks the end of each interval. 6587 IntervalMap EndPoint; 6588 // Saves the list of instruction indices that are used in the loop. 6589 SmallPtrSet<Instruction *, 8> Ends; 6590 // Saves the list of values that are used in the loop but are 6591 // defined outside the loop, such as arguments and constants. 6592 SmallPtrSet<Value *, 8> LoopInvariants; 6593 6594 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 6595 for (Instruction &I : BB->instructionsWithoutDebug()) { 6596 IdxToInstr.push_back(&I); 6597 6598 // Save the end location of each USE. 6599 for (Value *U : I.operands()) { 6600 auto *Instr = dyn_cast<Instruction>(U); 6601 6602 // Ignore non-instruction values such as arguments, constants, etc. 6603 if (!Instr) 6604 continue; 6605 6606 // If this instruction is outside the loop then record it and continue. 6607 if (!TheLoop->contains(Instr)) { 6608 LoopInvariants.insert(Instr); 6609 continue; 6610 } 6611 6612 // Overwrite previous end points. 6613 EndPoint[Instr] = IdxToInstr.size(); 6614 Ends.insert(Instr); 6615 } 6616 } 6617 } 6618 6619 // Saves the list of intervals that end with the index in 'key'. 6620 using InstrList = SmallVector<Instruction *, 2>; 6621 DenseMap<unsigned, InstrList> TransposeEnds; 6622 6623 // Transpose the EndPoints to a list of values that end at each index. 6624 for (auto &Interval : EndPoint) 6625 TransposeEnds[Interval.second].push_back(Interval.first); 6626 6627 SmallPtrSet<Instruction *, 8> OpenIntervals; 6628 SmallVector<RegisterUsage, 8> RUs(VFs.size()); 6629 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size()); 6630 6631 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); 6632 6633 // A lambda that gets the register usage for the given type and VF. 6634 const auto &TTICapture = TTI; 6635 auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned { 6636 if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty)) 6637 return 0; 6638 InstructionCost::CostType RegUsage = 6639 *TTICapture.getRegUsageForType(VectorType::get(Ty, VF)).getValue(); 6640 assert(RegUsage >= 0 && RegUsage <= std::numeric_limits<unsigned>::max() && 6641 "Nonsensical values for register usage."); 6642 return RegUsage; 6643 }; 6644 6645 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { 6646 Instruction *I = IdxToInstr[i]; 6647 6648 // Remove all of the instructions that end at this location. 6649 InstrList &List = TransposeEnds[i]; 6650 for (Instruction *ToRemove : List) 6651 OpenIntervals.erase(ToRemove); 6652 6653 // Ignore instructions that are never used within the loop. 6654 if (!Ends.count(I)) 6655 continue; 6656 6657 // Skip ignored values. 6658 if (ValuesToIgnore.count(I)) 6659 continue; 6660 6661 // For each VF find the maximum usage of registers. 6662 for (unsigned j = 0, e = VFs.size(); j < e; ++j) { 6663 // Count the number of live intervals. 6664 SmallMapVector<unsigned, unsigned, 4> RegUsage; 6665 6666 if (VFs[j].isScalar()) { 6667 for (auto Inst : OpenIntervals) { 6668 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6669 if (RegUsage.find(ClassID) == RegUsage.end()) 6670 RegUsage[ClassID] = 1; 6671 else 6672 RegUsage[ClassID] += 1; 6673 } 6674 } else { 6675 collectUniformsAndScalars(VFs[j]); 6676 for (auto Inst : OpenIntervals) { 6677 // Skip ignored values for VF > 1. 
6678 if (VecValuesToIgnore.count(Inst)) 6679 continue; 6680 if (isScalarAfterVectorization(Inst, VFs[j])) { 6681 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6682 if (RegUsage.find(ClassID) == RegUsage.end()) 6683 RegUsage[ClassID] = 1; 6684 else 6685 RegUsage[ClassID] += 1; 6686 } else { 6687 unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType()); 6688 if (RegUsage.find(ClassID) == RegUsage.end()) 6689 RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]); 6690 else 6691 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); 6692 } 6693 } 6694 } 6695 6696 for (auto& pair : RegUsage) { 6697 if (MaxUsages[j].find(pair.first) != MaxUsages[j].end()) 6698 MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second); 6699 else 6700 MaxUsages[j][pair.first] = pair.second; 6701 } 6702 } 6703 6704 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " 6705 << OpenIntervals.size() << '\n'); 6706 6707 // Add the current instruction to the list of open intervals. 6708 OpenIntervals.insert(I); 6709 } 6710 6711 for (unsigned i = 0, e = VFs.size(); i < e; ++i) { 6712 SmallMapVector<unsigned, unsigned, 4> Invariant; 6713 6714 for (auto Inst : LoopInvariants) { 6715 unsigned Usage = 6716 VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]); 6717 unsigned ClassID = 6718 TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType()); 6719 if (Invariant.find(ClassID) == Invariant.end()) 6720 Invariant[ClassID] = Usage; 6721 else 6722 Invariant[ClassID] += Usage; 6723 } 6724 6725 LLVM_DEBUG({ 6726 dbgs() << "LV(REG): VF = " << VFs[i] << '\n'; 6727 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size() 6728 << " item\n"; 6729 for (const auto &pair : MaxUsages[i]) { 6730 dbgs() << "LV(REG): RegisterClass: " 6731 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6732 << " registers\n"; 6733 } 6734 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size() 6735 << " item\n"; 6736 for (const auto &pair : Invariant) { 6737 dbgs() << "LV(REG): RegisterClass: " 6738 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6739 << " registers\n"; 6740 } 6741 }); 6742 6743 RU.LoopInvariantRegs = Invariant; 6744 RU.MaxLocalUsers = MaxUsages[i]; 6745 RUs[i] = RU; 6746 } 6747 6748 return RUs; 6749 } 6750 6751 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){ 6752 // TODO: Cost model for emulated masked load/store is completely 6753 // broken. This hack guides the cost model to use an artificially 6754 // high enough value to practically disable vectorization with such 6755 // operations, except where previously deployed legality hack allowed 6756 // using very low cost values. This is to avoid regressions coming simply 6757 // from moving "masked load/store" check from legality to cost model. 6758 // Masked Load/Gather emulation was previously never allowed. 6759 // Limited number of Masked Store/Scatter emulation was allowed. 6760 assert(isPredicatedInst(I) && 6761 "Expecting a scalar emulated instruction"); 6762 return isa<LoadInst>(I) || 6763 (isa<StoreInst>(I) && 6764 NumPredStores > NumberOfStoresToPredicate); 6765 } 6766 6767 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) { 6768 // If we aren't vectorizing the loop, or if we've already collected the 6769 // instructions to scalarize, there's nothing to do. Collection may already 6770 // have occurred if we have a user-selected VF and are now computing the 6771 // expected cost for interleaving. 
6772   if (VF.isScalar() || VF.isZero() ||
6773       InstsToScalarize.find(VF) != InstsToScalarize.end())
6774     return;
6775 
6776   // Initialize a mapping for VF in InstsToScalarize. If we find that it's
6777   // not profitable to scalarize any instructions, the presence of VF in the
6778   // map will indicate that we've analyzed it already.
6779   ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
6780 
6781   // Find all the instructions that are scalar with predication in the loop and
6782   // determine if it would be better to not if-convert the blocks they are in.
6783   // If so, we also record the instructions to scalarize.
6784   for (BasicBlock *BB : TheLoop->blocks()) {
6785     if (!blockNeedsPredication(BB))
6786       continue;
6787     for (Instruction &I : *BB)
6788       if (isScalarWithPredication(&I)) {
6789         ScalarCostsTy ScalarCosts;
6790         // Do not apply discount if scalable, because that would lead to
6791         // invalid scalarization costs.
6792         // Do not apply discount logic if hacked cost is needed
6793         // for emulated masked memrefs.
6794         if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I) &&
6795             computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
6796           ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
6797         // Remember that BB will remain after vectorization.
6798         PredicatedBBsAfterVectorization.insert(BB);
6799       }
6800   }
6801 }
6802 
6803 int LoopVectorizationCostModel::computePredInstDiscount(
6804     Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
6805   assert(!isUniformAfterVectorization(PredInst, VF) &&
6806          "Instruction marked uniform-after-vectorization will be predicated");
6807 
6808   // Initialize the discount to zero, meaning that the scalar version and the
6809   // vector version cost the same.
6810   InstructionCost Discount = 0;
6811 
6812   // Holds instructions to analyze. The instructions we visit are mapped in
6813   // ScalarCosts. Those instructions are the ones that would be scalarized if
6814   // we find that the scalar version costs less.
6815   SmallVector<Instruction *, 8> Worklist;
6816 
6817   // Returns true if the given instruction can be scalarized.
6818   auto canBeScalarized = [&](Instruction *I) -> bool {
6819     // We only attempt to scalarize instructions forming a single-use chain
6820     // from the original predicated block that would otherwise be vectorized.
6821     // Although not strictly necessary, we give up on instructions we know will
6822     // already be scalar to avoid traversing chains that are unlikely to be
6823     // beneficial.
6824     if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
6825         isScalarAfterVectorization(I, VF))
6826       return false;
6827 
6828     // If the instruction is scalar with predication, it will be analyzed
6829     // separately. We ignore it within the context of PredInst.
6830     if (isScalarWithPredication(I))
6831       return false;
6832 
6833     // If any of the instruction's operands are uniform after vectorization,
6834     // the instruction cannot be scalarized. This prevents, for example, a
6835     // masked load from being scalarized.
6836     //
6837     // We assume we will only emit a value for lane zero of an instruction
6838     // marked uniform after vectorization, rather than VF identical values.
6839     // Thus, if we scalarize an instruction that uses a uniform, we would
6840     // create uses of values corresponding to the lanes we aren't emitting code
6841     // for. This behavior can be changed by allowing getScalarValue to clone
6842     // the lane zero values for uniforms rather than asserting.
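    // For example (illustrative IR): if %gep is uniform-after-vectorization
    // and feeds %x = load i32, i32* %gep, only the lane-0 value of %gep will
    // exist, so scalarizing users of %x would reference lanes that are never
    // generated; the operand check below rejects such chains.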
6843 for (Use &U : I->operands()) 6844 if (auto *J = dyn_cast<Instruction>(U.get())) 6845 if (isUniformAfterVectorization(J, VF)) 6846 return false; 6847 6848 // Otherwise, we can scalarize the instruction. 6849 return true; 6850 }; 6851 6852 // Compute the expected cost discount from scalarizing the entire expression 6853 // feeding the predicated instruction. We currently only consider expressions 6854 // that are single-use instruction chains. 6855 Worklist.push_back(PredInst); 6856 while (!Worklist.empty()) { 6857 Instruction *I = Worklist.pop_back_val(); 6858 6859 // If we've already analyzed the instruction, there's nothing to do. 6860 if (ScalarCosts.find(I) != ScalarCosts.end()) 6861 continue; 6862 6863 // Compute the cost of the vector instruction. Note that this cost already 6864 // includes the scalarization overhead of the predicated instruction. 6865 InstructionCost VectorCost = getInstructionCost(I, VF).first; 6866 6867 // Compute the cost of the scalarized instruction. This cost is the cost of 6868 // the instruction as if it wasn't if-converted and instead remained in the 6869 // predicated block. We will scale this cost by block probability after 6870 // computing the scalarization overhead. 6871 InstructionCost ScalarCost = 6872 VF.getFixedValue() * 6873 getInstructionCost(I, ElementCount::getFixed(1)).first; 6874 6875 // Compute the scalarization overhead of needed insertelement instructions 6876 // and phi nodes. 6877 if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) { 6878 ScalarCost += TTI.getScalarizationOverhead( 6879 cast<VectorType>(ToVectorTy(I->getType(), VF)), 6880 APInt::getAllOnesValue(VF.getFixedValue()), true, false); 6881 ScalarCost += 6882 VF.getFixedValue() * 6883 TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput); 6884 } 6885 6886 // Compute the scalarization overhead of needed extractelement 6887 // instructions. For each of the instruction's operands, if the operand can 6888 // be scalarized, add it to the worklist; otherwise, account for the 6889 // overhead. 6890 for (Use &U : I->operands()) 6891 if (auto *J = dyn_cast<Instruction>(U.get())) { 6892 assert(VectorType::isValidElementType(J->getType()) && 6893 "Instruction has non-scalar type"); 6894 if (canBeScalarized(J)) 6895 Worklist.push_back(J); 6896 else if (needsExtract(J, VF)) { 6897 ScalarCost += TTI.getScalarizationOverhead( 6898 cast<VectorType>(ToVectorTy(J->getType(), VF)), 6899 APInt::getAllOnesValue(VF.getFixedValue()), false, true); 6900 } 6901 } 6902 6903 // Scale the total scalar cost by block probability. 6904 ScalarCost /= getReciprocalPredBlockProb(); 6905 6906 // Compute the discount. A non-negative discount means the vector version 6907 // of the instruction costs more, and scalarizing would be beneficial. 6908 Discount += VectorCost - ScalarCost; 6909 ScalarCosts[I] = ScalarCost; 6910 } 6911 6912 return *Discount.getValue(); 6913 } 6914 6915 LoopVectorizationCostModel::VectorizationCostTy 6916 LoopVectorizationCostModel::expectedCost( 6917 ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) { 6918 VectorizationCostTy Cost; 6919 6920 // For each block. 6921 for (BasicBlock *BB : TheLoop->blocks()) { 6922 VectorizationCostTy BlockCost; 6923 6924 // For each instruction in the old loop. 6925 for (Instruction &I : BB->instructionsWithoutDebug()) { 6926 // Skip ignored values. 
6927 if (ValuesToIgnore.count(&I) || 6928 (VF.isVector() && VecValuesToIgnore.count(&I))) 6929 continue; 6930 6931 VectorizationCostTy C = getInstructionCost(&I, VF); 6932 6933 // Check if we should override the cost. 6934 if (C.first.isValid() && 6935 ForceTargetInstructionCost.getNumOccurrences() > 0) 6936 C.first = InstructionCost(ForceTargetInstructionCost); 6937 6938 // Keep a list of instructions with invalid costs. 6939 if (Invalid && !C.first.isValid()) 6940 Invalid->emplace_back(&I, VF); 6941 6942 BlockCost.first += C.first; 6943 BlockCost.second |= C.second; 6944 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first 6945 << " for VF " << VF << " For instruction: " << I 6946 << '\n'); 6947 } 6948 6949 // If we are vectorizing a predicated block, it will have been 6950 // if-converted. This means that the block's instructions (aside from 6951 // stores and instructions that may divide by zero) will now be 6952 // unconditionally executed. For the scalar case, we may not always execute 6953 // the predicated block, if it is an if-else block. Thus, scale the block's 6954 // cost by the probability of executing it. blockNeedsPredication from 6955 // Legal is used so as to not include all blocks in tail folded loops. 6956 if (VF.isScalar() && Legal->blockNeedsPredication(BB)) 6957 BlockCost.first /= getReciprocalPredBlockProb(); 6958 6959 Cost.first += BlockCost.first; 6960 Cost.second |= BlockCost.second; 6961 } 6962 6963 return Cost; 6964 } 6965 6966 /// Gets Address Access SCEV after verifying that the access pattern 6967 /// is loop invariant except the induction variable dependence. 6968 /// 6969 /// This SCEV can be sent to the Target in order to estimate the address 6970 /// calculation cost. 6971 static const SCEV *getAddressAccessSCEV( 6972 Value *Ptr, 6973 LoopVectorizationLegality *Legal, 6974 PredicatedScalarEvolution &PSE, 6975 const Loop *TheLoop) { 6976 6977 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); 6978 if (!Gep) 6979 return nullptr; 6980 6981 // We are looking for a gep with all loop invariant indices except for one 6982 // which should be an induction variable. 6983 auto SE = PSE.getSE(); 6984 unsigned NumOperands = Gep->getNumOperands(); 6985 for (unsigned i = 1; i < NumOperands; ++i) { 6986 Value *Opd = Gep->getOperand(i); 6987 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 6988 !Legal->isInductionVariable(Opd)) 6989 return nullptr; 6990 } 6991 6992 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. 6993 return PSE.getSCEV(Ptr); 6994 } 6995 6996 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { 6997 return Legal->hasStride(I->getOperand(0)) || 6998 Legal->hasStride(I->getOperand(1)); 6999 } 7000 7001 InstructionCost 7002 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, 7003 ElementCount VF) { 7004 assert(VF.isVector() && 7005 "Scalarization cost of instruction implies vectorization."); 7006 if (VF.isScalable()) 7007 return InstructionCost::getInvalid(); 7008 7009 Type *ValTy = getLoadStoreType(I); 7010 auto SE = PSE.getSE(); 7011 7012 unsigned AS = getLoadStoreAddressSpace(I); 7013 Value *Ptr = getLoadStorePointerOperand(I); 7014 Type *PtrTy = ToVectorTy(Ptr->getType(), VF); 7015 7016 // Figure out whether the access is strided and get the stride value 7017 // if it's known in compile time 7018 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); 7019 7020 // Get the cost of the scalar memory instruction and address computation. 
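  // For example (illustrative numbers only): with VF = 4, a scalarized access
  // is modelled as 4 address computations plus 4 scalar loads/stores, with the
  // extract/insert overhead and any predication scaling added below.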
7021 InstructionCost Cost = 7022 VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); 7023 7024 // Don't pass *I here, since it is scalar but will actually be part of a 7025 // vectorized loop where the user of it is a vectorized instruction. 7026 const Align Alignment = getLoadStoreAlignment(I); 7027 Cost += VF.getKnownMinValue() * 7028 TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, 7029 AS, TTI::TCK_RecipThroughput); 7030 7031 // Get the overhead of the extractelement and insertelement instructions 7032 // we might create due to scalarization. 7033 Cost += getScalarizationOverhead(I, VF); 7034 7035 // If we have a predicated load/store, it will need extra i1 extracts and 7036 // conditional branches, but may not be executed for each vector lane. Scale 7037 // the cost by the probability of executing the predicated block. 7038 if (isPredicatedInst(I)) { 7039 Cost /= getReciprocalPredBlockProb(); 7040 7041 // Add the cost of an i1 extract and a branch 7042 auto *Vec_i1Ty = 7043 VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF); 7044 Cost += TTI.getScalarizationOverhead( 7045 Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()), 7046 /*Insert=*/false, /*Extract=*/true); 7047 Cost += TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput); 7048 7049 if (useEmulatedMaskMemRefHack(I)) 7050 // Artificially setting to a high enough value to practically disable 7051 // vectorization with such operations. 7052 Cost = 3000000; 7053 } 7054 7055 return Cost; 7056 } 7057 7058 InstructionCost 7059 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, 7060 ElementCount VF) { 7061 Type *ValTy = getLoadStoreType(I); 7062 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 7063 Value *Ptr = getLoadStorePointerOperand(I); 7064 unsigned AS = getLoadStoreAddressSpace(I); 7065 int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); 7066 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 7067 7068 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 7069 "Stride should be 1 or -1 for consecutive memory access"); 7070 const Align Alignment = getLoadStoreAlignment(I); 7071 InstructionCost Cost = 0; 7072 if (Legal->isMaskRequired(I)) 7073 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 7074 CostKind); 7075 else 7076 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 7077 CostKind, I); 7078 7079 bool Reverse = ConsecutiveStride < 0; 7080 if (Reverse) 7081 Cost += 7082 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0); 7083 return Cost; 7084 } 7085 7086 InstructionCost 7087 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, 7088 ElementCount VF) { 7089 assert(Legal->isUniformMemOp(*I)); 7090 7091 Type *ValTy = getLoadStoreType(I); 7092 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 7093 const Align Alignment = getLoadStoreAlignment(I); 7094 unsigned AS = getLoadStoreAddressSpace(I); 7095 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 7096 if (isa<LoadInst>(I)) { 7097 return TTI.getAddressComputationCost(ValTy) + 7098 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS, 7099 CostKind) + 7100 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); 7101 } 7102 StoreInst *SI = cast<StoreInst>(I); 7103 7104 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); 7105 return TTI.getAddressComputationCost(ValTy) + 7106 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, 
7107 CostKind) + 7108 (isLoopInvariantStoreValue 7109 ? 0 7110 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, 7111 VF.getKnownMinValue() - 1)); 7112 } 7113 7114 InstructionCost 7115 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, 7116 ElementCount VF) { 7117 Type *ValTy = getLoadStoreType(I); 7118 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 7119 const Align Alignment = getLoadStoreAlignment(I); 7120 const Value *Ptr = getLoadStorePointerOperand(I); 7121 7122 return TTI.getAddressComputationCost(VectorTy) + 7123 TTI.getGatherScatterOpCost( 7124 I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment, 7125 TargetTransformInfo::TCK_RecipThroughput, I); 7126 } 7127 7128 InstructionCost 7129 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, 7130 ElementCount VF) { 7131 // TODO: Once we have support for interleaving with scalable vectors 7132 // we can calculate the cost properly here. 7133 if (VF.isScalable()) 7134 return InstructionCost::getInvalid(); 7135 7136 Type *ValTy = getLoadStoreType(I); 7137 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 7138 unsigned AS = getLoadStoreAddressSpace(I); 7139 7140 auto Group = getInterleavedAccessGroup(I); 7141 assert(Group && "Fail to get an interleaved access group."); 7142 7143 unsigned InterleaveFactor = Group->getFactor(); 7144 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); 7145 7146 // Holds the indices of existing members in the interleaved group. 7147 SmallVector<unsigned, 4> Indices; 7148 for (unsigned IF = 0; IF < InterleaveFactor; IF++) 7149 if (Group->getMember(IF)) 7150 Indices.push_back(IF); 7151 7152 // Calculate the cost of the whole interleaved group. 7153 bool UseMaskForGaps = 7154 (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) || 7155 (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor())); 7156 InstructionCost Cost = TTI.getInterleavedMemoryOpCost( 7157 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(), 7158 AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps); 7159 7160 if (Group->isReverse()) { 7161 // TODO: Add support for reversed masked interleaved access. 7162 assert(!Legal->isMaskRequired(I) && 7163 "Reverse masked interleaved access not supported."); 7164 Cost += 7165 Group->getNumMembers() * 7166 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0); 7167 } 7168 return Cost; 7169 } 7170 7171 Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost( 7172 Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) { 7173 using namespace llvm::PatternMatch; 7174 // Early exit for no inloop reductions 7175 if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty)) 7176 return None; 7177 auto *VectorTy = cast<VectorType>(Ty); 7178 7179 // We are looking for a pattern of, and finding the minimal acceptable cost: 7180 // reduce(mul(ext(A), ext(B))) or 7181 // reduce(mul(A, B)) or 7182 // reduce(ext(A)) or 7183 // reduce(A). 7184 // The basic idea is that we walk down the tree to do that, finding the root 7185 // reduction instruction in InLoopReductionImmediateChains. From there we find 7186 // the pattern of mul/ext and test the cost of the entire pattern vs the cost 7187 // of the components. If the reduction cost is lower then we return it for the 7188 // reduction instruction and 0 for the other instructions in the pattern. 
7189 // If it is not, we return an invalid cost specifying the original cost
7190 // method should be used.
7191   Instruction *RetI = I;
7192   if (match(RetI, m_ZExtOrSExt(m_Value()))) {
7193     if (!RetI->hasOneUser())
7194       return None;
7195     RetI = RetI->user_back();
7196   }
7197   if (match(RetI, m_Mul(m_Value(), m_Value())) &&
7198       RetI->user_back()->getOpcode() == Instruction::Add) {
7199     if (!RetI->hasOneUser())
7200       return None;
7201     RetI = RetI->user_back();
7202   }
7203 
7204   // Test if the found instruction is a reduction, and if not return an invalid
7205   // cost specifying the parent to use the original cost modelling.
7206   if (!InLoopReductionImmediateChains.count(RetI))
7207     return None;
7208 
7209   // Find the reduction this chain is a part of and calculate the basic cost of
7210   // the reduction on its own.
7211   Instruction *LastChain = InLoopReductionImmediateChains[RetI];
7212   Instruction *ReductionPhi = LastChain;
7213   while (!isa<PHINode>(ReductionPhi))
7214     ReductionPhi = InLoopReductionImmediateChains[ReductionPhi];
7215 
7216   const RecurrenceDescriptor &RdxDesc =
7217       Legal->getReductionVars()[cast<PHINode>(ReductionPhi)];
7218 
7219   InstructionCost BaseCost = TTI.getArithmeticReductionCost(
7220       RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind);
7221 
7222   // If we're using ordered reductions then we can just return the base cost
7223   // here, since getArithmeticReductionCost calculates the full ordered
7224   // reduction cost when FP reassociation is not allowed.
7225   if (useOrderedReductions(RdxDesc))
7226     return BaseCost;
7227 
7228   // Get the operand that was not the reduction chain and match it to one of the
7229   // patterns, returning the better cost if it is found.
7230   Instruction *RedOp = RetI->getOperand(1) == LastChain
7231                            ? dyn_cast<Instruction>(RetI->getOperand(0))
7232                            : dyn_cast<Instruction>(RetI->getOperand(1));
7233 
7234   VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy);
7235 
7236   Instruction *Op0, *Op1;
7237   if (RedOp &&
7238       match(RedOp,
7239             m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) &&
7240       match(Op0, m_ZExtOrSExt(m_Value())) &&
7241       Op0->getOpcode() == Op1->getOpcode() &&
7242       Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
7243       !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) &&
7244       (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {
7245 
7246     // Matched reduce(ext(mul(ext(A), ext(B))))
7247     // Note that the extend opcodes need to all match, or if A==B they will have
7248     // been converted to zext(mul(sext(A), sext(A))) as it is known positive,
7249     // which is equally fine.
7250     bool IsUnsigned = isa<ZExtInst>(Op0);
7251     auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy);
7252     auto *MulType = VectorType::get(Op0->getType(), VectorTy);
7253 
7254     InstructionCost ExtCost =
7255         TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType,
7256                              TTI::CastContextHint::None, CostKind, Op0);
7257     InstructionCost MulCost =
7258         TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind);
7259     InstructionCost Ext2Cost =
7260         TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType,
7261                              TTI::CastContextHint::None, CostKind, RedOp);
7262 
7263     InstructionCost RedCost = TTI.getExtendedAddReductionCost(
7264         /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
7265         CostKind);
7266 
7267     if (RedCost.isValid() &&
7268         RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost)
7269       return I == RetI ?
RedCost : 0; 7270 } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) && 7271 !TheLoop->isLoopInvariant(RedOp)) { 7272 // Matched reduce(ext(A)) 7273 bool IsUnsigned = isa<ZExtInst>(RedOp); 7274 auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy); 7275 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 7276 /*IsMLA=*/false, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 7277 CostKind); 7278 7279 InstructionCost ExtCost = 7280 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType, 7281 TTI::CastContextHint::None, CostKind, RedOp); 7282 if (RedCost.isValid() && RedCost < BaseCost + ExtCost) 7283 return I == RetI ? RedCost : 0; 7284 } else if (RedOp && 7285 match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) { 7286 if (match(Op0, m_ZExtOrSExt(m_Value())) && 7287 Op0->getOpcode() == Op1->getOpcode() && 7288 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() && 7289 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) { 7290 bool IsUnsigned = isa<ZExtInst>(Op0); 7291 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy); 7292 // Matched reduce(mul(ext, ext)) 7293 InstructionCost ExtCost = 7294 TTI.getCastInstrCost(Op0->getOpcode(), VectorTy, ExtType, 7295 TTI::CastContextHint::None, CostKind, Op0); 7296 InstructionCost MulCost = 7297 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 7298 7299 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 7300 /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 7301 CostKind); 7302 7303 if (RedCost.isValid() && RedCost < ExtCost * 2 + MulCost + BaseCost) 7304 return I == RetI ? RedCost : 0; 7305 } else if (!match(I, m_ZExtOrSExt(m_Value()))) { 7306 // Matched reduce(mul()) 7307 InstructionCost MulCost = 7308 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 7309 7310 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 7311 /*IsMLA=*/true, true, RdxDesc.getRecurrenceType(), VectorTy, 7312 CostKind); 7313 7314 if (RedCost.isValid() && RedCost < MulCost + BaseCost) 7315 return I == RetI ? RedCost : 0; 7316 } 7317 } 7318 7319 return I == RetI ? Optional<InstructionCost>(BaseCost) : None; 7320 } 7321 7322 InstructionCost 7323 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, 7324 ElementCount VF) { 7325 // Calculate scalar cost only. Vectorization cost should be ready at this 7326 // moment. 7327 if (VF.isScalar()) { 7328 Type *ValTy = getLoadStoreType(I); 7329 const Align Alignment = getLoadStoreAlignment(I); 7330 unsigned AS = getLoadStoreAddressSpace(I); 7331 7332 return TTI.getAddressComputationCost(ValTy) + 7333 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, 7334 TTI::TCK_RecipThroughput, I); 7335 } 7336 return getWideningCost(I, VF); 7337 } 7338 7339 LoopVectorizationCostModel::VectorizationCostTy 7340 LoopVectorizationCostModel::getInstructionCost(Instruction *I, 7341 ElementCount VF) { 7342 // If we know that this instruction will remain uniform, check the cost of 7343 // the scalar version. 7344 if (isUniformAfterVectorization(I, VF)) 7345 VF = ElementCount::getFixed(1); 7346 7347 if (VF.isVector() && isProfitableToScalarize(I, VF)) 7348 return VectorizationCostTy(InstsToScalarize[VF][I], false); 7349 7350 // Forced scalars do not have any scalarization overhead. 
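  // Their cost is simply VF copies of the scalar instruction cost; e.g.
  // (illustrative) at VF = 4 a forced-scalar instruction with scalar cost 1
  // is costed as 4 below.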
7351 auto ForcedScalar = ForcedScalars.find(VF); 7352 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) { 7353 auto InstSet = ForcedScalar->second; 7354 if (InstSet.count(I)) 7355 return VectorizationCostTy( 7356 (getInstructionCost(I, ElementCount::getFixed(1)).first * 7357 VF.getKnownMinValue()), 7358 false); 7359 } 7360 7361 Type *VectorTy; 7362 InstructionCost C = getInstructionCost(I, VF, VectorTy); 7363 7364 bool TypeNotScalarized = 7365 VF.isVector() && VectorTy->isVectorTy() && 7366 TTI.getNumberOfParts(VectorTy) < VF.getKnownMinValue(); 7367 return VectorizationCostTy(C, TypeNotScalarized); 7368 } 7369 7370 InstructionCost 7371 LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, 7372 ElementCount VF) const { 7373 7374 // There is no mechanism yet to create a scalable scalarization loop, 7375 // so this is currently Invalid. 7376 if (VF.isScalable()) 7377 return InstructionCost::getInvalid(); 7378 7379 if (VF.isScalar()) 7380 return 0; 7381 7382 InstructionCost Cost = 0; 7383 Type *RetTy = ToVectorTy(I->getType(), VF); 7384 if (!RetTy->isVoidTy() && 7385 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) 7386 Cost += TTI.getScalarizationOverhead( 7387 cast<VectorType>(RetTy), APInt::getAllOnesValue(VF.getKnownMinValue()), 7388 true, false); 7389 7390 // Some targets keep addresses scalar. 7391 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) 7392 return Cost; 7393 7394 // Some targets support efficient element stores. 7395 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore()) 7396 return Cost; 7397 7398 // Collect operands to consider. 7399 CallInst *CI = dyn_cast<CallInst>(I); 7400 Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands(); 7401 7402 // Skip operands that do not require extraction/scalarization and do not incur 7403 // any overhead. 7404 SmallVector<Type *> Tys; 7405 for (auto *V : filterExtractingOperands(Ops, VF)) 7406 Tys.push_back(MaybeVectorizeType(V->getType(), VF)); 7407 return Cost + TTI.getOperandsScalarizationOverhead( 7408 filterExtractingOperands(Ops, VF), Tys); 7409 } 7410 7411 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { 7412 if (VF.isScalar()) 7413 return; 7414 NumPredStores = 0; 7415 for (BasicBlock *BB : TheLoop->blocks()) { 7416 // For each instruction in the old loop. 7417 for (Instruction &I : *BB) { 7418 Value *Ptr = getLoadStorePointerOperand(&I); 7419 if (!Ptr) 7420 continue; 7421 7422 // TODO: We should generate better code and update the cost model for 7423 // predicated uniform stores. Today they are treated as any other 7424 // predicated store (see added test cases in 7425 // invariant-store-vectorization.ll). 7426 if (isa<StoreInst>(&I) && isScalarWithPredication(&I)) 7427 NumPredStores++; 7428 7429 if (Legal->isUniformMemOp(I)) { 7430 // TODO: Avoid replicating loads and stores instead of 7431 // relying on instcombine to remove them. 7432 // Load: Scalar load + broadcast 7433 // Store: Scalar store + isLoopInvariantStoreValue ? 
0 : extract 7434 InstructionCost Cost; 7435 if (isa<StoreInst>(&I) && VF.isScalable() && 7436 isLegalGatherOrScatter(&I)) { 7437 Cost = getGatherScatterCost(&I, VF); 7438 setWideningDecision(&I, VF, CM_GatherScatter, Cost); 7439 } else { 7440 assert((isa<LoadInst>(&I) || !VF.isScalable()) && 7441 "Cannot yet scalarize uniform stores"); 7442 Cost = getUniformMemOpCost(&I, VF); 7443 setWideningDecision(&I, VF, CM_Scalarize, Cost); 7444 } 7445 continue; 7446 } 7447 7448 // We assume that widening is the best solution when possible. 7449 if (memoryInstructionCanBeWidened(&I, VF)) { 7450 InstructionCost Cost = getConsecutiveMemOpCost(&I, VF); 7451 int ConsecutiveStride = 7452 Legal->isConsecutivePtr(getLoadStorePointerOperand(&I)); 7453 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 7454 "Expected consecutive stride."); 7455 InstWidening Decision = 7456 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; 7457 setWideningDecision(&I, VF, Decision, Cost); 7458 continue; 7459 } 7460 7461 // Choose between Interleaving, Gather/Scatter or Scalarization. 7462 InstructionCost InterleaveCost = InstructionCost::getInvalid(); 7463 unsigned NumAccesses = 1; 7464 if (isAccessInterleaved(&I)) { 7465 auto Group = getInterleavedAccessGroup(&I); 7466 assert(Group && "Fail to get an interleaved access group."); 7467 7468 // Make one decision for the whole group. 7469 if (getWideningDecision(&I, VF) != CM_Unknown) 7470 continue; 7471 7472 NumAccesses = Group->getNumMembers(); 7473 if (interleavedAccessCanBeWidened(&I, VF)) 7474 InterleaveCost = getInterleaveGroupCost(&I, VF); 7475 } 7476 7477 InstructionCost GatherScatterCost = 7478 isLegalGatherOrScatter(&I) 7479 ? getGatherScatterCost(&I, VF) * NumAccesses 7480 : InstructionCost::getInvalid(); 7481 7482 InstructionCost ScalarizationCost = 7483 getMemInstScalarizationCost(&I, VF) * NumAccesses; 7484 7485 // Choose better solution for the current VF, 7486 // write down this decision and use it during vectorization. 7487 InstructionCost Cost; 7488 InstWidening Decision; 7489 if (InterleaveCost <= GatherScatterCost && 7490 InterleaveCost < ScalarizationCost) { 7491 Decision = CM_Interleave; 7492 Cost = InterleaveCost; 7493 } else if (GatherScatterCost < ScalarizationCost) { 7494 Decision = CM_GatherScatter; 7495 Cost = GatherScatterCost; 7496 } else { 7497 Decision = CM_Scalarize; 7498 Cost = ScalarizationCost; 7499 } 7500 // If the instructions belongs to an interleave group, the whole group 7501 // receives the same decision. The whole group receives the cost, but 7502 // the cost will actually be assigned to one instruction. 7503 if (auto Group = getInterleavedAccessGroup(&I)) 7504 setWideningDecision(Group, VF, Decision, Cost); 7505 else 7506 setWideningDecision(&I, VF, Decision, Cost); 7507 } 7508 } 7509 7510 // Make sure that any load of address and any other address computation 7511 // remains scalar unless there is gather/scatter support. This avoids 7512 // inevitable extracts into address registers, and also has the benefit of 7513 // activating LSR more, since that pass can't optimize vectorized 7514 // addresses. 7515 if (TTI.prefersVectorizedAddressing()) 7516 return; 7517 7518 // Start with all scalar pointer uses. 
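  // For example (illustrative IR): in
  //   %p = load i32*, i32** %q
  //   %v = load i32, i32* %p
  // %p only feeds an address, so we prefer to keep it (and the computation
  // feeding it) scalar rather than widen it and extract each lane.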
7519   SmallPtrSet<Instruction *, 8> AddrDefs;
7520   for (BasicBlock *BB : TheLoop->blocks())
7521     for (Instruction &I : *BB) {
7522       Instruction *PtrDef =
7523           dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
7524       if (PtrDef && TheLoop->contains(PtrDef) &&
7525           getWideningDecision(&I, VF) != CM_GatherScatter)
7526         AddrDefs.insert(PtrDef);
7527     }
7528 
7529   // Add all instructions used to generate the addresses.
7530   SmallVector<Instruction *, 4> Worklist;
7531   append_range(Worklist, AddrDefs);
7532   while (!Worklist.empty()) {
7533     Instruction *I = Worklist.pop_back_val();
7534     for (auto &Op : I->operands())
7535       if (auto *InstOp = dyn_cast<Instruction>(Op))
7536         if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
7537             AddrDefs.insert(InstOp).second)
7538           Worklist.push_back(InstOp);
7539   }
7540 
7541   for (auto *I : AddrDefs) {
7542     if (isa<LoadInst>(I)) {
7543       // Setting the desired widening decision should ideally be handled by
7544       // cost functions, but since this involves the task of finding out
7545       // if the loaded register is involved in an address computation, it is
7546       // instead changed here when we know this is the case.
7547       InstWidening Decision = getWideningDecision(I, VF);
7548       if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
7549         // Scalarize a widened load of address.
7550         setWideningDecision(
7551             I, VF, CM_Scalarize,
7552             (VF.getKnownMinValue() *
7553              getMemoryInstructionCost(I, ElementCount::getFixed(1))));
7554       else if (auto Group = getInterleavedAccessGroup(I)) {
7555         // Scalarize an interleave group of address loads.
7556         for (unsigned I = 0; I < Group->getFactor(); ++I) {
7557           if (Instruction *Member = Group->getMember(I))
7558             setWideningDecision(
7559                 Member, VF, CM_Scalarize,
7560                 (VF.getKnownMinValue() *
7561                  getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
7562         }
7563       }
7564     } else
7565       // Make sure I gets scalarized and a cost estimate without
7566       // scalarization overhead.
7567       ForcedScalars[VF].insert(I);
7568   }
7569 }
7570 
7571 InstructionCost
7572 LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
7573                                                Type *&VectorTy) {
7574   Type *RetTy = I->getType();
7575   if (canTruncateToMinimalBitwidth(I, VF))
7576     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
7577   auto SE = PSE.getSE();
7578   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
7579 
7580   auto hasSingleCopyAfterVectorization = [this](Instruction *I,
7581                                                 ElementCount VF) -> bool {
7582     if (VF.isScalar())
7583       return true;
7584 
7585     auto Scalarized = InstsToScalarize.find(VF);
7586     assert(Scalarized != InstsToScalarize.end() &&
7587            "VF not yet analyzed for scalarization profitability");
7588     return !Scalarized->second.count(I) &&
7589            llvm::all_of(I->users(), [&](User *U) {
7590              auto *UI = cast<Instruction>(U);
7591              return !Scalarized->second.count(UI);
7592            });
7593   };
7594   (void) hasSingleCopyAfterVectorization;
7595 
7596   if (isScalarAfterVectorization(I, VF)) {
7597     // With the exception of GEPs and PHIs, after scalarization there should
7598     // only be one copy of the instruction generated in the loop. This is
7599     // because the VF is either 1, or any instructions that need scalarizing
7600     // have already been dealt with by the time we get here. As a result,
7601     // it means we don't have to multiply the instruction cost by VF.
7602 assert(I->getOpcode() == Instruction::GetElementPtr || 7603 I->getOpcode() == Instruction::PHI || 7604 (I->getOpcode() == Instruction::BitCast && 7605 I->getType()->isPointerTy()) || 7606 hasSingleCopyAfterVectorization(I, VF)); 7607 VectorTy = RetTy; 7608 } else 7609 VectorTy = ToVectorTy(RetTy, VF); 7610 7611 // TODO: We need to estimate the cost of intrinsic calls. 7612 switch (I->getOpcode()) { 7613 case Instruction::GetElementPtr: 7614 // We mark this instruction as zero-cost because the cost of GEPs in 7615 // vectorized code depends on whether the corresponding memory instruction 7616 // is scalarized or not. Therefore, we handle GEPs with the memory 7617 // instruction cost. 7618 return 0; 7619 case Instruction::Br: { 7620 // In cases of scalarized and predicated instructions, there will be VF 7621 // predicated blocks in the vectorized loop. Each branch around these 7622 // blocks requires also an extract of its vector compare i1 element. 7623 bool ScalarPredicatedBB = false; 7624 BranchInst *BI = cast<BranchInst>(I); 7625 if (VF.isVector() && BI->isConditional() && 7626 (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) || 7627 PredicatedBBsAfterVectorization.count(BI->getSuccessor(1)))) 7628 ScalarPredicatedBB = true; 7629 7630 if (ScalarPredicatedBB) { 7631 // Not possible to scalarize scalable vector with predicated instructions. 7632 if (VF.isScalable()) 7633 return InstructionCost::getInvalid(); 7634 // Return cost for branches around scalarized and predicated blocks. 7635 auto *Vec_i1Ty = 7636 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); 7637 return ( 7638 TTI.getScalarizationOverhead( 7639 Vec_i1Ty, APInt::getAllOnesValue(VF.getFixedValue()), false, 7640 true) + 7641 (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue())); 7642 } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar()) 7643 // The back-edge branch will remain, as will all scalar branches. 7644 return TTI.getCFInstrCost(Instruction::Br, CostKind); 7645 else 7646 // This branch will be eliminated by if-conversion. 7647 return 0; 7648 // Note: We currently assume zero cost for an unconditional branch inside 7649 // a predicated block since it will become a fall-through, although we 7650 // may decide in the future to call TTI for all branches. 7651 } 7652 case Instruction::PHI: { 7653 auto *Phi = cast<PHINode>(I); 7654 7655 // First-order recurrences are replaced by vector shuffles inside the loop. 7656 // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. 7657 if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi)) 7658 return TTI.getShuffleCost( 7659 TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy), 7660 None, VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1)); 7661 7662 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 7663 // converted into select instructions. We require N - 1 selects per phi 7664 // node, where N is the number of incoming values. 
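    // For example (illustrative): an if-converted phi with three incoming
    // values becomes two vector selects, so the cost below is 2 * cost(select).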
7665     if (VF.isVector() && Phi->getParent() != TheLoop->getHeader())
7666       return (Phi->getNumIncomingValues() - 1) *
7667              TTI.getCmpSelInstrCost(
7668                  Instruction::Select, ToVectorTy(Phi->getType(), VF),
7669                  ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
7670                  CmpInst::BAD_ICMP_PREDICATE, CostKind);
7671 
7672     return TTI.getCFInstrCost(Instruction::PHI, CostKind);
7673   }
7674   case Instruction::UDiv:
7675   case Instruction::SDiv:
7676   case Instruction::URem:
7677   case Instruction::SRem:
7678     // If we have a predicated instruction, it may not be executed for each
7679     // vector lane. Get the scalarization cost and scale this amount by the
7680     // probability of executing the predicated block. If the instruction is not
7681     // predicated, we fall through to the next case.
7682     if (VF.isVector() && isScalarWithPredication(I)) {
7683       InstructionCost Cost = 0;
7684 
7685       // These instructions have a non-void type, so account for the phi nodes
7686       // that we will create. This cost is likely to be zero. The phi node
7687       // cost, if any, should be scaled by the block probability because it
7688       // models a copy at the end of each predicated block.
7689       Cost += VF.getKnownMinValue() *
7690               TTI.getCFInstrCost(Instruction::PHI, CostKind);
7691 
7692       // The cost of the non-predicated instruction.
7693       Cost += VF.getKnownMinValue() *
7694               TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind);
7695 
7696       // The cost of insertelement and extractelement instructions needed for
7697       // scalarization.
7698       Cost += getScalarizationOverhead(I, VF);
7699 
7700       // Scale the cost by the probability of executing the predicated blocks.
7701       // This assumes the predicated block for each vector lane is equally
7702       // likely.
7703       return Cost / getReciprocalPredBlockProb();
7704     }
7705     LLVM_FALLTHROUGH;
7706   case Instruction::Add:
7707   case Instruction::FAdd:
7708   case Instruction::Sub:
7709   case Instruction::FSub:
7710   case Instruction::Mul:
7711   case Instruction::FMul:
7712   case Instruction::FDiv:
7713   case Instruction::FRem:
7714   case Instruction::Shl:
7715   case Instruction::LShr:
7716   case Instruction::AShr:
7717   case Instruction::And:
7718   case Instruction::Or:
7719   case Instruction::Xor: {
7720     // Since we will replace the stride by 1 the multiplication should go away.
7721     if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
7722       return 0;
7723 
7724     // Detect reduction patterns
7725     if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
7726       return *RedCost;
7727 
7728     // Certain instructions can be cheaper to vectorize if they have a constant
7729     // second vector operand. One example of this is shifts on x86.
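    // For example (illustrative IR): in
    //   %r = shl <4 x i32> %x, <i32 3, i32 3, i32 3, i32 3>
    // the splat-constant shift amount can often be encoded as an immediate,
    // which getOperandInfo/getArithmeticInstrCost below can reflect.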
7730 Value *Op2 = I->getOperand(1); 7731 TargetTransformInfo::OperandValueProperties Op2VP; 7732 TargetTransformInfo::OperandValueKind Op2VK = 7733 TTI.getOperandInfo(Op2, Op2VP); 7734 if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) 7735 Op2VK = TargetTransformInfo::OK_UniformValue; 7736 7737 SmallVector<const Value *, 4> Operands(I->operand_values()); 7738 return TTI.getArithmeticInstrCost( 7739 I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, 7740 Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); 7741 } 7742 case Instruction::FNeg: { 7743 return TTI.getArithmeticInstrCost( 7744 I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, 7745 TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None, 7746 TargetTransformInfo::OP_None, I->getOperand(0), I); 7747 } 7748 case Instruction::Select: { 7749 SelectInst *SI = cast<SelectInst>(I); 7750 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 7751 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 7752 7753 const Value *Op0, *Op1; 7754 using namespace llvm::PatternMatch; 7755 if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) || 7756 match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) { 7757 // select x, y, false --> x & y 7758 // select x, true, y --> x | y 7759 TTI::OperandValueProperties Op1VP = TTI::OP_None; 7760 TTI::OperandValueProperties Op2VP = TTI::OP_None; 7761 TTI::OperandValueKind Op1VK = TTI::getOperandInfo(Op0, Op1VP); 7762 TTI::OperandValueKind Op2VK = TTI::getOperandInfo(Op1, Op2VP); 7763 assert(Op0->getType()->getScalarSizeInBits() == 1 && 7764 Op1->getType()->getScalarSizeInBits() == 1); 7765 7766 SmallVector<const Value *, 2> Operands{Op0, Op1}; 7767 return TTI.getArithmeticInstrCost( 7768 match(I, m_LogicalOr()) ? 
Instruction::Or : Instruction::And, VectorTy, 7769 CostKind, Op1VK, Op2VK, Op1VP, Op2VP, Operands, I); 7770 } 7771 7772 Type *CondTy = SI->getCondition()->getType(); 7773 if (!ScalarCond) 7774 CondTy = VectorType::get(CondTy, VF); 7775 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, 7776 CmpInst::BAD_ICMP_PREDICATE, CostKind, I); 7777 } 7778 case Instruction::ICmp: 7779 case Instruction::FCmp: { 7780 Type *ValTy = I->getOperand(0)->getType(); 7781 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 7782 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 7783 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); 7784 VectorTy = ToVectorTy(ValTy, VF); 7785 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, 7786 CmpInst::BAD_ICMP_PREDICATE, CostKind, I); 7787 } 7788 case Instruction::Store: 7789 case Instruction::Load: { 7790 ElementCount Width = VF; 7791 if (Width.isVector()) { 7792 InstWidening Decision = getWideningDecision(I, Width); 7793 assert(Decision != CM_Unknown && 7794 "CM decision should be taken at this point"); 7795 if (Decision == CM_Scalarize) 7796 Width = ElementCount::getFixed(1); 7797 } 7798 VectorTy = ToVectorTy(getLoadStoreType(I), Width); 7799 return getMemoryInstructionCost(I, VF); 7800 } 7801 case Instruction::BitCast: 7802 if (I->getType()->isPointerTy()) 7803 return 0; 7804 LLVM_FALLTHROUGH; 7805 case Instruction::ZExt: 7806 case Instruction::SExt: 7807 case Instruction::FPToUI: 7808 case Instruction::FPToSI: 7809 case Instruction::FPExt: 7810 case Instruction::PtrToInt: 7811 case Instruction::IntToPtr: 7812 case Instruction::SIToFP: 7813 case Instruction::UIToFP: 7814 case Instruction::Trunc: 7815 case Instruction::FPTrunc: { 7816 // Computes the CastContextHint from a Load/Store instruction. 7817 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint { 7818 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 7819 "Expected a load or a store!"); 7820 7821 if (VF.isScalar() || !TheLoop->contains(I)) 7822 return TTI::CastContextHint::Normal; 7823 7824 switch (getWideningDecision(I, VF)) { 7825 case LoopVectorizationCostModel::CM_GatherScatter: 7826 return TTI::CastContextHint::GatherScatter; 7827 case LoopVectorizationCostModel::CM_Interleave: 7828 return TTI::CastContextHint::Interleave; 7829 case LoopVectorizationCostModel::CM_Scalarize: 7830 case LoopVectorizationCostModel::CM_Widen: 7831 return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked 7832 : TTI::CastContextHint::Normal; 7833 case LoopVectorizationCostModel::CM_Widen_Reverse: 7834 return TTI::CastContextHint::Reversed; 7835 case LoopVectorizationCostModel::CM_Unknown: 7836 llvm_unreachable("Instr did not go through cost modelling?"); 7837 } 7838 7839 llvm_unreachable("Unhandled case!"); 7840 }; 7841 7842 unsigned Opcode = I->getOpcode(); 7843 TTI::CastContextHint CCH = TTI::CastContextHint::None; 7844 // For Trunc, the context is the only user, which must be a StoreInst. 7845 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) { 7846 if (I->hasOneUse()) 7847 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin())) 7848 CCH = ComputeCCH(Store); 7849 } 7850 // For Z/Sext, the context is the operand, which must be a LoadInst. 
7851 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || 7852 Opcode == Instruction::FPExt) { 7853 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0))) 7854 CCH = ComputeCCH(Load); 7855 } 7856 7857 // We optimize the truncation of induction variables having constant 7858 // integer steps. The cost of these truncations is the same as the scalar 7859 // operation. 7860 if (isOptimizableIVTruncate(I, VF)) { 7861 auto *Trunc = cast<TruncInst>(I); 7862 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 7863 Trunc->getSrcTy(), CCH, CostKind, Trunc); 7864 } 7865 7866 // Detect reduction patterns 7867 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7868 return *RedCost; 7869 7870 Type *SrcScalarTy = I->getOperand(0)->getType(); 7871 Type *SrcVecTy = 7872 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; 7873 if (canTruncateToMinimalBitwidth(I, VF)) { 7874 // This cast is going to be shrunk. This may remove the cast or it might 7875 // turn it into slightly different cast. For example, if MinBW == 16, 7876 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". 7877 // 7878 // Calculate the modified src and dest types. 7879 Type *MinVecTy = VectorTy; 7880 if (Opcode == Instruction::Trunc) { 7881 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); 7882 VectorTy = 7883 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7884 } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) { 7885 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); 7886 VectorTy = 7887 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7888 } 7889 } 7890 7891 return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); 7892 } 7893 case Instruction::Call: { 7894 bool NeedToScalarize; 7895 CallInst *CI = cast<CallInst>(I); 7896 InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize); 7897 if (getVectorIntrinsicIDForCall(CI, TLI)) { 7898 InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF); 7899 return std::min(CallCost, IntrinsicCost); 7900 } 7901 return CallCost; 7902 } 7903 case Instruction::ExtractValue: 7904 return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput); 7905 case Instruction::Alloca: 7906 // We cannot easily widen alloca to a scalable alloca, as 7907 // the result would need to be a vector of pointers. 7908 if (VF.isScalable()) 7909 return InstructionCost::getInvalid(); 7910 LLVM_FALLTHROUGH; 7911 default: 7912 // This opcode is unknown. Assume that it is the same as 'mul'. 7913 return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 7914 } // end of switch. 
7915 } 7916 7917 char LoopVectorize::ID = 0; 7918 7919 static const char lv_name[] = "Loop Vectorization"; 7920 7921 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 7922 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 7923 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) 7924 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 7925 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) 7926 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 7927 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 7928 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 7929 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 7930 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 7931 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) 7932 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) 7933 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 7934 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 7935 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy) 7936 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 7937 7938 namespace llvm { 7939 7940 Pass *createLoopVectorizePass() { return new LoopVectorize(); } 7941 7942 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced, 7943 bool VectorizeOnlyWhenForced) { 7944 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced); 7945 } 7946 7947 } // end namespace llvm 7948 7949 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { 7950 // Check if the pointer operand of a load or store instruction is 7951 // consecutive. 7952 if (auto *Ptr = getLoadStorePointerOperand(Inst)) 7953 return Legal->isConsecutivePtr(Ptr); 7954 return false; 7955 } 7956 7957 void LoopVectorizationCostModel::collectValuesToIgnore() { 7958 // Ignore ephemeral values. 7959 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); 7960 7961 // Ignore type-promoting instructions we identified during reduction 7962 // detection. 7963 for (auto &Reduction : Legal->getReductionVars()) { 7964 RecurrenceDescriptor &RedDes = Reduction.second; 7965 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); 7966 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7967 } 7968 // Ignore type-casting instructions we identified during induction 7969 // detection. 7970 for (auto &Induction : Legal->getInductionVars()) { 7971 InductionDescriptor &IndDes = Induction.second; 7972 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 7973 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7974 } 7975 } 7976 7977 void LoopVectorizationCostModel::collectInLoopReductions() { 7978 for (auto &Reduction : Legal->getReductionVars()) { 7979 PHINode *Phi = Reduction.first; 7980 RecurrenceDescriptor &RdxDesc = Reduction.second; 7981 7982 // We don't collect reductions that are type promoted (yet). 7983 if (RdxDesc.getRecurrenceType() != Phi->getType()) 7984 continue; 7985 7986 // If the target would prefer this reduction to happen "in-loop", then we 7987 // want to record it as such. 7988 unsigned Opcode = RdxDesc.getOpcode(); 7989 if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) && 7990 !TTI.preferInLoopReduction(Opcode, Phi->getType(), 7991 TargetTransformInfo::ReductionFlags())) 7992 continue; 7993 7994 // Check that we can correctly put the reductions into the loop, by 7995 // finding the chain of operations that leads from the phi to the loop 7996 // exit value. 
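    // For example (illustrative IR): for an integer add reduction
    //   %sum.phi  = phi i32 [ 0, %ph ], [ %sum.next, %latch ]
    //   %sum.next = add i32 %sum.phi, %val
    // the chain from the phi to the exit value is { %sum.next }.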
7997     SmallVector<Instruction *, 4> ReductionOperations =
7998         RdxDesc.getReductionOpChain(Phi, TheLoop);
7999     bool InLoop = !ReductionOperations.empty();
8000     if (InLoop) {
8001       InLoopReductionChains[Phi] = ReductionOperations;
8002       // Add the elements to InLoopReductionImmediateChains for cost modelling.
8003       Instruction *LastChain = Phi;
8004       for (auto *I : ReductionOperations) {
8005         InLoopReductionImmediateChains[I] = LastChain;
8006         LastChain = I;
8007       }
8008     }
8009     LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
8010                       << " reduction for phi: " << *Phi << "\n");
8011   }
8012 }
8013 
8014 // TODO: we could return a pair of values that specify the max VF and
8015 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
8016 // `buildVPlans(VF, VF)`. We cannot do it because VPlan at the moment
8017 // doesn't have a cost model that can choose which plan to execute if
8018 // more than one is generated.
8019 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
8020                                  LoopVectorizationCostModel &CM) {
8021   unsigned WidestType;
8022   std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
8023   return WidestVectorRegBits / WidestType;
8024 }
8025 
8026 VectorizationFactor
8027 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
8028   assert(!UserVF.isScalable() && "scalable vectors not yet supported");
8029   ElementCount VF = UserVF;
8030   // Outer loop handling: They may require CFG and instruction level
8031   // transformations before even evaluating whether vectorization is profitable.
8032   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
8033   // the vectorization pipeline.
8034   if (!OrigLoop->isInnermost()) {
8035     // If the user doesn't provide a vectorization factor, determine a
8036     // reasonable one.
8037     if (UserVF.isZero()) {
8038       VF = ElementCount::getFixed(determineVPlanVF(
8039           TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
8040               .getFixedSize(),
8041           CM));
8042       LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
8043 
8044       // Make sure we have a VF > 1 for stress testing.
8045       if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
8046         LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
8047                           << "overriding computed VF.\n");
8048         VF = ElementCount::getFixed(4);
8049       }
8050     }
8051     assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
8052     assert(isPowerOf2_32(VF.getKnownMinValue()) &&
8053            "VF needs to be a power of two");
8054     LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
8055                       << "VF " << VF << " to build VPlans.\n");
8056     buildVPlans(VF, VF);
8057 
8058     // For VPlan build stress testing, we bail out after VPlan construction.
8059     if (VPlanBuildStressTest)
8060       return VectorizationFactor::Disabled();
8061 
8062     return {VF, 0 /*Cost*/};
8063   }
8064 
8065   LLVM_DEBUG(
8066       dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
8067                 "VPlan-native path.\n");
8068   return VectorizationFactor::Disabled();
8069 }
8070 
8071 Optional<VectorizationFactor>
8072 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
8073   assert(OrigLoop->isInnermost() && "Inner loop expected.");
8074   FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
8075   if (!MaxFactors) // Cases that should not be vectorized nor interleaved.
8076     return None;
8077 
8078   // Invalidate interleave groups if all blocks of the loop will be predicated.
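  // (Illustrative) Under fold-tail-by-masking every member access of a group
  // would need a mask; without target support for masked interleaved accesses
  // the group cannot be lowered as a single wide access, so the grouping and
  // any decisions derived from it are dropped below.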
8079 if (CM.blockNeedsPredication(OrigLoop->getHeader()) && 8080 !useMaskedInterleavedAccesses(*TTI)) { 8081 LLVM_DEBUG( 8082 dbgs() 8083 << "LV: Invalidate all interleaved groups due to fold-tail by masking " 8084 "which requires masked-interleaved support.\n"); 8085 if (CM.InterleaveInfo.invalidateGroups()) 8086 // Invalidating interleave groups also requires invalidating all decisions 8087 // based on them, which includes widening decisions and uniform and scalar 8088 // values. 8089 CM.invalidateCostModelingDecisions(); 8090 } 8091 8092 ElementCount MaxUserVF = 8093 UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF; 8094 bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF); 8095 if (!UserVF.isZero() && UserVFIsLegal) { 8096 assert(isPowerOf2_32(UserVF.getKnownMinValue()) && 8097 "VF needs to be a power of two"); 8098 // Collect the instructions (and their associated costs) that will be more 8099 // profitable to scalarize. 8100 if (CM.selectUserVectorizationFactor(UserVF)) { 8101 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); 8102 CM.collectInLoopReductions(); 8103 buildVPlansWithVPRecipes(UserVF, UserVF); 8104 LLVM_DEBUG(printPlans(dbgs())); 8105 return {{UserVF, 0}}; 8106 } else 8107 reportVectorizationInfo("UserVF ignored because of invalid costs.", 8108 "InvalidCost", ORE, OrigLoop); 8109 } 8110 8111 // Populate the set of Vectorization Factor Candidates. 8112 ElementCountSet VFCandidates; 8113 for (auto VF = ElementCount::getFixed(1); 8114 ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2) 8115 VFCandidates.insert(VF); 8116 for (auto VF = ElementCount::getScalable(1); 8117 ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2) 8118 VFCandidates.insert(VF); 8119 8120 for (const auto &VF : VFCandidates) { 8121 // Collect Uniform and Scalar instructions after vectorization with VF. 8122 CM.collectUniformsAndScalars(VF); 8123 8124 // Collect the instructions (and their associated costs) that will be more 8125 // profitable to scalarize. 8126 if (VF.isVector()) 8127 CM.collectInstsToScalarize(VF); 8128 } 8129 8130 CM.collectInLoopReductions(); 8131 buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF); 8132 buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF); 8133 8134 LLVM_DEBUG(printPlans(dbgs())); 8135 if (!MaxFactors.hasVector()) 8136 return VectorizationFactor::Disabled(); 8137 8138 // Select the optimal vectorization factor. 8139 auto SelectedVF = CM.selectVectorizationFactor(VFCandidates); 8140 8141 // Check if it is profitable to vectorize with runtime checks. 
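  // (Illustrative) e.g. a loop whose legality requires a large number of
  // pointer-overlap checks: once the thresholds below are exceeded (and
  // reordering was not explicitly allowed), the expected overhead of the
  // runtime checks outweighs the benefit and vectorization is abandoned.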
8142 unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks(); 8143 if (SelectedVF.Width.getKnownMinValue() > 1 && NumRuntimePointerChecks) { 8144 bool PragmaThresholdReached = 8145 NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold; 8146 bool ThresholdReached = 8147 NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold; 8148 if ((ThresholdReached && !Hints.allowReordering()) || 8149 PragmaThresholdReached) { 8150 ORE->emit([&]() { 8151 return OptimizationRemarkAnalysisAliasing( 8152 DEBUG_TYPE, "CantReorderMemOps", OrigLoop->getStartLoc(), 8153 OrigLoop->getHeader()) 8154 << "loop not vectorized: cannot prove it is safe to reorder " 8155 "memory operations"; 8156 }); 8157 LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n"); 8158 Hints.emitRemarkWithHints(); 8159 return VectorizationFactor::Disabled(); 8160 } 8161 } 8162 return SelectedVF; 8163 } 8164 8165 void LoopVectorizationPlanner::setBestPlan(ElementCount VF, unsigned UF) { 8166 LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF 8167 << '\n'); 8168 BestVF = VF; 8169 BestUF = UF; 8170 8171 erase_if(VPlans, [VF](const VPlanPtr &Plan) { 8172 return !Plan->hasVF(VF); 8173 }); 8174 assert(VPlans.size() == 1 && "Best VF has not a single VPlan."); 8175 } 8176 8177 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV, 8178 DominatorTree *DT) { 8179 // Perform the actual loop transformation. 8180 8181 // 1. Create a new empty loop. Unlink the old loop and connect the new one. 8182 assert(BestVF.hasValue() && "Vectorization Factor is missing"); 8183 assert(VPlans.size() == 1 && "Not a single VPlan to execute."); 8184 8185 VPTransformState State{ 8186 *BestVF, BestUF, LI, DT, ILV.Builder, &ILV, VPlans.front().get()}; 8187 State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(); 8188 State.TripCount = ILV.getOrCreateTripCount(nullptr); 8189 State.CanonicalIV = ILV.Induction; 8190 8191 ILV.printDebugTracesAtStart(); 8192 8193 //===------------------------------------------------===// 8194 // 8195 // Notice: any optimization or new instruction that go 8196 // into the code below should also be implemented in 8197 // the cost-model. 8198 // 8199 //===------------------------------------------------===// 8200 8201 // 2. Copy and widen instructions from the old loop into the new loop. 8202 VPlans.front()->execute(&State); 8203 8204 // 3. Fix the vectorized code: take care of header phi's, live-outs, 8205 // predication, updating analyses. 
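  //    (For example: header phis for reductions and first-order recurrences
  //    get their incoming values from the vector loop wired up, and live-outs
  //    feeding the scalar remainder loop are patched. This is a rough sketch
  //    of what fixVectorizedLoop covers, not an exhaustive list.)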
8206 ILV.fixVectorizedLoop(State); 8207 8208 ILV.printDebugTracesAtEnd(); 8209 } 8210 8211 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 8212 void LoopVectorizationPlanner::printPlans(raw_ostream &O) { 8213 for (const auto &Plan : VPlans) 8214 if (PrintVPlansInDotFormat) 8215 Plan->printDOT(O); 8216 else 8217 Plan->print(O); 8218 } 8219 #endif 8220 8221 void LoopVectorizationPlanner::collectTriviallyDeadInstructions( 8222 SmallPtrSetImpl<Instruction *> &DeadInstructions) { 8223 8224 // We create new control-flow for the vectorized loop, so the original exit 8225 // conditions will be dead after vectorization if it's only used by the 8226 // terminator 8227 SmallVector<BasicBlock*> ExitingBlocks; 8228 OrigLoop->getExitingBlocks(ExitingBlocks); 8229 for (auto *BB : ExitingBlocks) { 8230 auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0)); 8231 if (!Cmp || !Cmp->hasOneUse()) 8232 continue; 8233 8234 // TODO: we should introduce a getUniqueExitingBlocks on Loop 8235 if (!DeadInstructions.insert(Cmp).second) 8236 continue; 8237 8238 // The operands of the icmp is often a dead trunc, used by IndUpdate. 8239 // TODO: can recurse through operands in general 8240 for (Value *Op : Cmp->operands()) { 8241 if (isa<TruncInst>(Op) && Op->hasOneUse()) 8242 DeadInstructions.insert(cast<Instruction>(Op)); 8243 } 8244 } 8245 8246 // We create new "steps" for induction variable updates to which the original 8247 // induction variables map. An original update instruction will be dead if 8248 // all its users except the induction variable are dead. 8249 auto *Latch = OrigLoop->getLoopLatch(); 8250 for (auto &Induction : Legal->getInductionVars()) { 8251 PHINode *Ind = Induction.first; 8252 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 8253 8254 // If the tail is to be folded by masking, the primary induction variable, 8255 // if exists, isn't dead: it will be used for masking. Don't kill it. 8256 if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction()) 8257 continue; 8258 8259 if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 8260 return U == Ind || DeadInstructions.count(cast<Instruction>(U)); 8261 })) 8262 DeadInstructions.insert(IndUpdate); 8263 8264 // We record as "Dead" also the type-casting instructions we had identified 8265 // during induction analysis. We don't need any handling for them in the 8266 // vectorized loop because we have proven that, under a proper runtime 8267 // test guarding the vectorized loop, the value of the phi, and the casted 8268 // value of the phi, are the same. The last instruction in this casting chain 8269 // will get its scalar/vector/widened def from the scalar/vector/widened def 8270 // of the respective phi node. Any other casts in the induction def-use chain 8271 // have no other uses outside the phi update chain, and will be ignored. 8272 InductionDescriptor &IndDes = Induction.second; 8273 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 8274 DeadInstructions.insert(Casts.begin(), Casts.end()); 8275 } 8276 } 8277 8278 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; } 8279 8280 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } 8281 8282 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step, 8283 Instruction::BinaryOps BinOp) { 8284 // When unrolling and the VF is 1, we only need to add a simple scalar. 
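  // For illustration: with StartIdx == 2 this returns Val + 2 * Step for
  // integer types, and the equivalent FMul/BinOp sequence below for floating
  // point.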
8285 Type *Ty = Val->getType(); 8286 assert(!Ty->isVectorTy() && "Val must be a scalar"); 8287 8288 if (Ty->isFloatingPointTy()) { 8289 Constant *C = ConstantFP::get(Ty, (double)StartIdx); 8290 8291 // Floating-point operations inherit FMF via the builder's flags. 8292 Value *MulOp = Builder.CreateFMul(C, Step); 8293 return Builder.CreateBinOp(BinOp, Val, MulOp); 8294 } 8295 Constant *C = ConstantInt::get(Ty, StartIdx); 8296 return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction"); 8297 } 8298 8299 static void AddRuntimeUnrollDisableMetaData(Loop *L) { 8300 SmallVector<Metadata *, 4> MDs; 8301 // Reserve first location for self reference to the LoopID metadata node. 8302 MDs.push_back(nullptr); 8303 bool IsUnrollMetadata = false; 8304 MDNode *LoopID = L->getLoopID(); 8305 if (LoopID) { 8306 // First find existing loop unrolling disable metadata. 8307 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { 8308 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); 8309 if (MD) { 8310 const auto *S = dyn_cast<MDString>(MD->getOperand(0)); 8311 IsUnrollMetadata = 8312 S && S->getString().startswith("llvm.loop.unroll.disable"); 8313 } 8314 MDs.push_back(LoopID->getOperand(i)); 8315 } 8316 } 8317 8318 if (!IsUnrollMetadata) { 8319 // Add runtime unroll disable metadata. 8320 LLVMContext &Context = L->getHeader()->getContext(); 8321 SmallVector<Metadata *, 1> DisableOperands; 8322 DisableOperands.push_back( 8323 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 8324 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 8325 MDs.push_back(DisableNode); 8326 MDNode *NewLoopID = MDNode::get(Context, MDs); 8327 // Set operand 0 to refer to the loop id itself. 8328 NewLoopID->replaceOperandWith(0, NewLoopID); 8329 L->setLoopID(NewLoopID); 8330 } 8331 } 8332 8333 //===--------------------------------------------------------------------===// 8334 // EpilogueVectorizerMainLoop 8335 //===--------------------------------------------------------------------===// 8336 8337 /// This function is partially responsible for generating the control flow 8338 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 8339 BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { 8340 MDNode *OrigLoopID = OrigLoop->getLoopID(); 8341 Loop *Lp = createVectorLoopSkeleton(""); 8342 8343 // Generate the code to check the minimum iteration count of the vector 8344 // epilogue (see below). 8345 EPI.EpilogueIterationCountCheck = 8346 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, true); 8347 EPI.EpilogueIterationCountCheck->setName("iter.check"); 8348 8349 // Generate the code to check any assumptions that we've made for SCEV 8350 // expressions. 8351 EPI.SCEVSafetyCheck = emitSCEVChecks(Lp, LoopScalarPreHeader); 8352 8353 // Generate the code that checks at runtime if arrays overlap. We put the 8354 // checks into a separate block to make the more common case of few elements 8355 // faster. 8356 EPI.MemSafetyCheck = emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 8357 8358 // Generate the iteration count check for the main loop, *after* the check 8359 // for the epilogue loop, so that the path-length is shorter for the case 8360 // that goes directly through the vector epilogue. The longer-path length for 8361 // the main loop is compensated for, by the gain from vectorizing the larger 8362 // trip count. Note: the branch will get updated later on when we vectorize 8363 // the epilogue. 
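  // For illustration, assuming a main loop with VF=8 and UF=2, the check
  // emitted here is roughly:
  //   %min.iters.check = icmp ult i64 %trip.count, 16
  //   br i1 %min.iters.check, label %scalar.ph, label %vector.ph
  // (icmp ule is used instead when a scalar epilogue is required; the branch
  // is rewired when the epilogue loop is vectorized, as noted above.)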
8364 EPI.MainLoopIterationCountCheck = 8365 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false); 8366 8367 // Generate the induction variable. 8368 OldInduction = Legal->getPrimaryInduction(); 8369 Type *IdxTy = Legal->getWidestInductionType(); 8370 Value *StartIdx = ConstantInt::get(IdxTy, 0); 8371 Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF); 8372 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 8373 EPI.VectorTripCount = CountRoundDown; 8374 Induction = 8375 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 8376 getDebugLocFromInstOrOperands(OldInduction)); 8377 8378 // Skip induction resume value creation here because they will be created in 8379 // the second pass. If we created them here, they wouldn't be used anyway, 8380 // because the vplan in the second pass still contains the inductions from the 8381 // original loop. 8382 8383 return completeLoopSkeleton(Lp, OrigLoopID); 8384 } 8385 8386 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { 8387 LLVM_DEBUG({ 8388 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n" 8389 << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue() 8390 << ", Main Loop UF:" << EPI.MainLoopUF 8391 << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue() 8392 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 8393 }); 8394 } 8395 8396 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() { 8397 DEBUG_WITH_TYPE(VerboseDebug, { 8398 dbgs() << "intermediate fn:\n" << *Induction->getFunction() << "\n"; 8399 }); 8400 } 8401 8402 BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck( 8403 Loop *L, BasicBlock *Bypass, bool ForEpilogue) { 8404 assert(L && "Expected valid Loop."); 8405 assert(Bypass && "Expected valid bypass basic block."); 8406 unsigned VFactor = 8407 ForEpilogue ? EPI.EpilogueVF.getKnownMinValue() : VF.getKnownMinValue(); 8408 unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF; 8409 Value *Count = getOrCreateTripCount(L); 8410 // Reuse existing vector loop preheader for TC checks. 8411 // Note that new preheader block is generated for vector loop. 8412 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 8413 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 8414 8415 // Generate code to check if the loop's trip count is less than VF * UF of the 8416 // main vector loop. 8417 auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF : VF) ? 8418 ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 8419 8420 Value *CheckMinIters = Builder.CreateICmp( 8421 P, Count, ConstantInt::get(Count->getType(), VFactor * UFactor), 8422 "min.iters.check"); 8423 8424 if (!ForEpilogue) 8425 TCCheckBlock->setName("vector.main.loop.iter.check"); 8426 8427 // Create new preheader for vector loop. 8428 LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), 8429 DT, LI, nullptr, "vector.ph"); 8430 8431 if (ForEpilogue) { 8432 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 8433 DT->getNode(Bypass)->getIDom()) && 8434 "TC check is expected to dominate Bypass"); 8435 8436 // Update dominator for Bypass & LoopExit. 8437 DT->changeImmediateDominator(Bypass, TCCheckBlock); 8438 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF)) 8439 // For loops with multiple exits, there's no edge from the middle block 8440 // to exit blocks (as the epilogue must run) and thus no need to update 8441 // the immediate dominator of the exit blocks. 
8442 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 8443 8444 LoopBypassBlocks.push_back(TCCheckBlock); 8445 8446 // Save the trip count so we don't have to regenerate it in the 8447 // vec.epilog.iter.check. This is safe to do because the trip count 8448 // generated here dominates the vector epilog iter check. 8449 EPI.TripCount = Count; 8450 } 8451 8452 ReplaceInstWithInst( 8453 TCCheckBlock->getTerminator(), 8454 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 8455 8456 return TCCheckBlock; 8457 } 8458 8459 //===--------------------------------------------------------------------===// 8460 // EpilogueVectorizerEpilogueLoop 8461 //===--------------------------------------------------------------------===// 8462 8463 /// This function is partially responsible for generating the control flow 8464 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 8465 BasicBlock * 8466 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { 8467 MDNode *OrigLoopID = OrigLoop->getLoopID(); 8468 Loop *Lp = createVectorLoopSkeleton("vec.epilog."); 8469 8470 // Now, compare the remaining count and if there aren't enough iterations to 8471 // execute the vectorized epilogue skip to the scalar part. 8472 BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader; 8473 VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check"); 8474 LoopVectorPreHeader = 8475 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 8476 LI, nullptr, "vec.epilog.ph"); 8477 emitMinimumVectorEpilogueIterCountCheck(Lp, LoopScalarPreHeader, 8478 VecEpilogueIterationCountCheck); 8479 8480 // Adjust the control flow taking the state info from the main loop 8481 // vectorization into account. 8482 assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck && 8483 "expected this to be saved from the previous pass."); 8484 EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith( 8485 VecEpilogueIterationCountCheck, LoopVectorPreHeader); 8486 8487 DT->changeImmediateDominator(LoopVectorPreHeader, 8488 EPI.MainLoopIterationCountCheck); 8489 8490 EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith( 8491 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8492 8493 if (EPI.SCEVSafetyCheck) 8494 EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith( 8495 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8496 if (EPI.MemSafetyCheck) 8497 EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith( 8498 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8499 8500 DT->changeImmediateDominator( 8501 VecEpilogueIterationCountCheck, 8502 VecEpilogueIterationCountCheck->getSinglePredecessor()); 8503 8504 DT->changeImmediateDominator(LoopScalarPreHeader, 8505 EPI.EpilogueIterationCountCheck); 8506 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF)) 8507 // If there is an epilogue which must run, there's no edge from the 8508 // middle block to exit blocks and thus no need to update the immediate 8509 // dominator of the exit blocks. 8510 DT->changeImmediateDominator(LoopExitBlock, 8511 EPI.EpilogueIterationCountCheck); 8512 8513 // Keep track of bypass blocks, as they feed start values to the induction 8514 // phis in the scalar loop preheader. 
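  // (E.g., if one of these checks fails at runtime, control reaches the scalar
  // preheader without executing any vector iterations, so the induction phis
  // there must also take their original start values from these blocks.)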
  if (EPI.SCEVSafetyCheck)
    LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck);
  if (EPI.MemSafetyCheck)
    LoopBypassBlocks.push_back(EPI.MemSafetyCheck);
  LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck);

  // Generate a resume induction for the vector epilogue and put it in the
  // vector epilogue preheader.
  Type *IdxTy = Legal->getWidestInductionType();
  PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val",
                                         LoopVectorPreHeader->getFirstNonPHI());
  EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
  EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
                           EPI.MainLoopIterationCountCheck);

  // Generate the induction variable.
  OldInduction = Legal->getPrimaryInduction();
  Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
  Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF);
  Value *StartIdx = EPResumeVal;
  Induction =
      createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
                              getDebugLocFromInstOrOperands(OldInduction));

  // Generate induction resume values. These variables save the new starting
  // indexes for the scalar loop. They are used to test if there are any tail
  // iterations left once the vector loop has completed.
  // Note that when the vectorized epilogue is skipped due to the iteration
  // count check, then the resume value for the induction variable comes from
  // the trip count of the main vector loop, hence passing the AdditionalBypass
  // argument.
  createInductionResumeValues(Lp, CountRoundDown,
                              {VecEpilogueIterationCountCheck,
                               EPI.VectorTripCount} /* AdditionalBypass */);

  AddRuntimeUnrollDisableMetaData(Lp);
  return completeLoopSkeleton(Lp, OrigLoopID);
}

BasicBlock *
EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
    Loop *L, BasicBlock *Bypass, BasicBlock *Insert) {

  assert(EPI.TripCount &&
         "Expected trip count to have been saved in the first pass.");
  assert(
      (!isa<Instruction>(EPI.TripCount) ||
       DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
      "saved trip count does not dominate insertion point.");
  Value *TC = EPI.TripCount;
  IRBuilder<> Builder(Insert->getTerminator());
  Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");

  // Generate code to check if the loop's trip count is less than VF * UF of the
  // vector epilogue loop.
  auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF) ?
8571 ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 8572 8573 Value *CheckMinIters = Builder.CreateICmp( 8574 P, Count, 8575 ConstantInt::get(Count->getType(), 8576 EPI.EpilogueVF.getKnownMinValue() * EPI.EpilogueUF), 8577 "min.epilog.iters.check"); 8578 8579 ReplaceInstWithInst( 8580 Insert->getTerminator(), 8581 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 8582 8583 LoopBypassBlocks.push_back(Insert); 8584 return Insert; 8585 } 8586 8587 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() { 8588 LLVM_DEBUG({ 8589 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n" 8590 << "Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue() 8591 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 8592 }); 8593 } 8594 8595 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() { 8596 DEBUG_WITH_TYPE(VerboseDebug, { 8597 dbgs() << "final fn:\n" << *Induction->getFunction() << "\n"; 8598 }); 8599 } 8600 8601 bool LoopVectorizationPlanner::getDecisionAndClampRange( 8602 const std::function<bool(ElementCount)> &Predicate, VFRange &Range) { 8603 assert(!Range.isEmpty() && "Trying to test an empty VF range."); 8604 bool PredicateAtRangeStart = Predicate(Range.Start); 8605 8606 for (ElementCount TmpVF = Range.Start * 2; 8607 ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2) 8608 if (Predicate(TmpVF) != PredicateAtRangeStart) { 8609 Range.End = TmpVF; 8610 break; 8611 } 8612 8613 return PredicateAtRangeStart; 8614 } 8615 8616 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, 8617 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range 8618 /// of VF's starting at a given VF and extending it as much as possible. Each 8619 /// vectorization decision can potentially shorten this sub-range during 8620 /// buildVPlan(). 8621 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF, 8622 ElementCount MaxVF) { 8623 auto MaxVFPlusOne = MaxVF.getWithIncrement(1); 8624 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { 8625 VFRange SubRange = {VF, MaxVFPlusOne}; 8626 VPlans.push_back(buildVPlan(SubRange)); 8627 VF = SubRange.End; 8628 } 8629 } 8630 8631 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, 8632 VPlanPtr &Plan) { 8633 assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); 8634 8635 // Look for cached value. 8636 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 8637 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); 8638 if (ECEntryIt != EdgeMaskCache.end()) 8639 return ECEntryIt->second; 8640 8641 VPValue *SrcMask = createBlockInMask(Src, Plan); 8642 8643 // The terminator has to be a branch inst! 8644 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 8645 assert(BI && "Unexpected terminator found"); 8646 8647 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) 8648 return EdgeMaskCache[Edge] = SrcMask; 8649 8650 // If source is an exiting block, we know the exit edge is dynamically dead 8651 // in the vector loop, and thus we don't need to restrict the mask. Avoid 8652 // adding uses of an otherwise potentially dead instruction. 
8653 if (OrigLoop->isLoopExiting(Src)) 8654 return EdgeMaskCache[Edge] = SrcMask; 8655 8656 VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition()); 8657 assert(EdgeMask && "No Edge Mask found for condition"); 8658 8659 if (BI->getSuccessor(0) != Dst) 8660 EdgeMask = Builder.createNot(EdgeMask); 8661 8662 if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND. 8663 // The condition is 'SrcMask && EdgeMask', which is equivalent to 8664 // 'select i1 SrcMask, i1 EdgeMask, i1 false'. 8665 // The select version does not introduce new UB if SrcMask is false and 8666 // EdgeMask is poison. Using 'and' here introduces undefined behavior. 8667 VPValue *False = Plan->getOrAddVPValue( 8668 ConstantInt::getFalse(BI->getCondition()->getType())); 8669 EdgeMask = Builder.createSelect(SrcMask, EdgeMask, False); 8670 } 8671 8672 return EdgeMaskCache[Edge] = EdgeMask; 8673 } 8674 8675 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { 8676 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 8677 8678 // Look for cached value. 8679 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); 8680 if (BCEntryIt != BlockMaskCache.end()) 8681 return BCEntryIt->second; 8682 8683 // All-one mask is modelled as no-mask following the convention for masked 8684 // load/store/gather/scatter. Initialize BlockMask to no-mask. 8685 VPValue *BlockMask = nullptr; 8686 8687 if (OrigLoop->getHeader() == BB) { 8688 if (!CM.blockNeedsPredication(BB)) 8689 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. 8690 8691 // Create the block in mask as the first non-phi instruction in the block. 8692 VPBuilder::InsertPointGuard Guard(Builder); 8693 auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi(); 8694 Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint); 8695 8696 // Introduce the early-exit compare IV <= BTC to form header block mask. 8697 // This is used instead of IV < TC because TC may wrap, unlike BTC. 8698 // Start by constructing the desired canonical IV. 8699 VPValue *IV = nullptr; 8700 if (Legal->getPrimaryInduction()) 8701 IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction()); 8702 else { 8703 auto IVRecipe = new VPWidenCanonicalIVRecipe(); 8704 Builder.getInsertBlock()->insert(IVRecipe, NewInsertionPoint); 8705 IV = IVRecipe->getVPSingleValue(); 8706 } 8707 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); 8708 bool TailFolded = !CM.isScalarEpilogueAllowed(); 8709 8710 if (TailFolded && CM.TTI.emitGetActiveLaneMask()) { 8711 // While ActiveLaneMask is a binary op that consumes the loop tripcount 8712 // as a second argument, we only pass the IV here and extract the 8713 // tripcount from the transform state where codegen of the VP instructions 8714 // happen. 8715 BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV}); 8716 } else { 8717 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); 8718 } 8719 return BlockMaskCache[BB] = BlockMask; 8720 } 8721 8722 // This is the block mask. We OR all incoming edges. 8723 for (auto *Predecessor : predecessors(BB)) { 8724 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); 8725 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too. 8726 return BlockMaskCache[BB] = EdgeMask; 8727 8728 if (!BlockMask) { // BlockMask has its initialized nullptr value. 
8729 BlockMask = EdgeMask; 8730 continue; 8731 } 8732 8733 BlockMask = Builder.createOr(BlockMask, EdgeMask); 8734 } 8735 8736 return BlockMaskCache[BB] = BlockMask; 8737 } 8738 8739 VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, 8740 ArrayRef<VPValue *> Operands, 8741 VFRange &Range, 8742 VPlanPtr &Plan) { 8743 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 8744 "Must be called with either a load or store"); 8745 8746 auto willWiden = [&](ElementCount VF) -> bool { 8747 if (VF.isScalar()) 8748 return false; 8749 LoopVectorizationCostModel::InstWidening Decision = 8750 CM.getWideningDecision(I, VF); 8751 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 8752 "CM decision should be taken at this point."); 8753 if (Decision == LoopVectorizationCostModel::CM_Interleave) 8754 return true; 8755 if (CM.isScalarAfterVectorization(I, VF) || 8756 CM.isProfitableToScalarize(I, VF)) 8757 return false; 8758 return Decision != LoopVectorizationCostModel::CM_Scalarize; 8759 }; 8760 8761 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8762 return nullptr; 8763 8764 VPValue *Mask = nullptr; 8765 if (Legal->isMaskRequired(I)) 8766 Mask = createBlockInMask(I->getParent(), Plan); 8767 8768 if (LoadInst *Load = dyn_cast<LoadInst>(I)) 8769 return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask); 8770 8771 StoreInst *Store = cast<StoreInst>(I); 8772 return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0], 8773 Mask); 8774 } 8775 8776 VPWidenIntOrFpInductionRecipe * 8777 VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi, 8778 ArrayRef<VPValue *> Operands) const { 8779 // Check if this is an integer or fp induction. If so, build the recipe that 8780 // produces its scalar and vector values. 8781 InductionDescriptor II = Legal->getInductionVars().lookup(Phi); 8782 if (II.getKind() == InductionDescriptor::IK_IntInduction || 8783 II.getKind() == InductionDescriptor::IK_FpInduction) { 8784 assert(II.getStartValue() == 8785 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); 8786 const SmallVectorImpl<Instruction *> &Casts = II.getCastInsts(); 8787 return new VPWidenIntOrFpInductionRecipe( 8788 Phi, Operands[0], Casts.empty() ? nullptr : Casts.front()); 8789 } 8790 8791 return nullptr; 8792 } 8793 8794 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate( 8795 TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range, 8796 VPlan &Plan) const { 8797 // Optimize the special case where the source is a constant integer 8798 // induction variable. Notice that we can only optimize the 'trunc' case 8799 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and 8800 // (c) other casts depend on pointer size. 8801 8802 // Determine whether \p K is a truncation based on an induction variable that 8803 // can be optimized. 
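  // (For instance, a `trunc i64 %iv to i32` of the widest induction can be
  // rewritten as a narrower induction that is generated directly in i32.)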
8804 auto isOptimizableIVTruncate = 8805 [&](Instruction *K) -> std::function<bool(ElementCount)> { 8806 return [=](ElementCount VF) -> bool { 8807 return CM.isOptimizableIVTruncate(K, VF); 8808 }; 8809 }; 8810 8811 if (LoopVectorizationPlanner::getDecisionAndClampRange( 8812 isOptimizableIVTruncate(I), Range)) { 8813 8814 InductionDescriptor II = 8815 Legal->getInductionVars().lookup(cast<PHINode>(I->getOperand(0))); 8816 VPValue *Start = Plan.getOrAddVPValue(II.getStartValue()); 8817 return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)), 8818 Start, nullptr, I); 8819 } 8820 return nullptr; 8821 } 8822 8823 VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi, 8824 ArrayRef<VPValue *> Operands, 8825 VPlanPtr &Plan) { 8826 // If all incoming values are equal, the incoming VPValue can be used directly 8827 // instead of creating a new VPBlendRecipe. 8828 VPValue *FirstIncoming = Operands[0]; 8829 if (all_of(Operands, [FirstIncoming](const VPValue *Inc) { 8830 return FirstIncoming == Inc; 8831 })) { 8832 return Operands[0]; 8833 } 8834 8835 // We know that all PHIs in non-header blocks are converted into selects, so 8836 // we don't have to worry about the insertion order and we can just use the 8837 // builder. At this point we generate the predication tree. There may be 8838 // duplications since this is a simple recursive scan, but future 8839 // optimizations will clean it up. 8840 SmallVector<VPValue *, 2> OperandsWithMask; 8841 unsigned NumIncoming = Phi->getNumIncomingValues(); 8842 8843 for (unsigned In = 0; In < NumIncoming; In++) { 8844 VPValue *EdgeMask = 8845 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan); 8846 assert((EdgeMask || NumIncoming == 1) && 8847 "Multiple predecessors with one having a full mask"); 8848 OperandsWithMask.push_back(Operands[In]); 8849 if (EdgeMask) 8850 OperandsWithMask.push_back(EdgeMask); 8851 } 8852 return toVPRecipeResult(new VPBlendRecipe(Phi, OperandsWithMask)); 8853 } 8854 8855 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, 8856 ArrayRef<VPValue *> Operands, 8857 VFRange &Range) const { 8858 8859 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8860 [this, CI](ElementCount VF) { return CM.isScalarWithPredication(CI); }, 8861 Range); 8862 8863 if (IsPredicated) 8864 return nullptr; 8865 8866 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8867 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || 8868 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect || 8869 ID == Intrinsic::pseudoprobe || 8870 ID == Intrinsic::experimental_noalias_scope_decl)) 8871 return nullptr; 8872 8873 auto willWiden = [&](ElementCount VF) -> bool { 8874 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8875 // The following case may be scalarized depending on the VF. 8876 // The flag shows whether we use Intrinsic or a usual Call for vectorized 8877 // version of the instruction. 8878 // Is it beneficial to perform intrinsic call compared to lib call? 8879 bool NeedToScalarize = false; 8880 InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); 8881 InstructionCost IntrinsicCost = ID ? 
CM.getVectorIntrinsicCost(CI, VF) : 0; 8882 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 8883 return UseVectorIntrinsic || !NeedToScalarize; 8884 }; 8885 8886 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8887 return nullptr; 8888 8889 ArrayRef<VPValue *> Ops = Operands.take_front(CI->getNumArgOperands()); 8890 return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end())); 8891 } 8892 8893 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { 8894 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) && 8895 !isa<StoreInst>(I) && "Instruction should have been handled earlier"); 8896 // Instruction should be widened, unless it is scalar after vectorization, 8897 // scalarization is profitable or it is predicated. 8898 auto WillScalarize = [this, I](ElementCount VF) -> bool { 8899 return CM.isScalarAfterVectorization(I, VF) || 8900 CM.isProfitableToScalarize(I, VF) || CM.isScalarWithPredication(I); 8901 }; 8902 return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize, 8903 Range); 8904 } 8905 8906 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, 8907 ArrayRef<VPValue *> Operands) const { 8908 auto IsVectorizableOpcode = [](unsigned Opcode) { 8909 switch (Opcode) { 8910 case Instruction::Add: 8911 case Instruction::And: 8912 case Instruction::AShr: 8913 case Instruction::BitCast: 8914 case Instruction::FAdd: 8915 case Instruction::FCmp: 8916 case Instruction::FDiv: 8917 case Instruction::FMul: 8918 case Instruction::FNeg: 8919 case Instruction::FPExt: 8920 case Instruction::FPToSI: 8921 case Instruction::FPToUI: 8922 case Instruction::FPTrunc: 8923 case Instruction::FRem: 8924 case Instruction::FSub: 8925 case Instruction::ICmp: 8926 case Instruction::IntToPtr: 8927 case Instruction::LShr: 8928 case Instruction::Mul: 8929 case Instruction::Or: 8930 case Instruction::PtrToInt: 8931 case Instruction::SDiv: 8932 case Instruction::Select: 8933 case Instruction::SExt: 8934 case Instruction::Shl: 8935 case Instruction::SIToFP: 8936 case Instruction::SRem: 8937 case Instruction::Sub: 8938 case Instruction::Trunc: 8939 case Instruction::UDiv: 8940 case Instruction::UIToFP: 8941 case Instruction::URem: 8942 case Instruction::Xor: 8943 case Instruction::ZExt: 8944 return true; 8945 } 8946 return false; 8947 }; 8948 8949 if (!IsVectorizableOpcode(I->getOpcode())) 8950 return nullptr; 8951 8952 // Success: widen this instruction. 
  return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end()));
}

void VPRecipeBuilder::fixHeaderPhis() {
  BasicBlock *OrigLatch = OrigLoop->getLoopLatch();
  for (VPWidenPHIRecipe *R : PhisToFix) {
    auto *PN = cast<PHINode>(R->getUnderlyingValue());
    VPRecipeBase *IncR =
        getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch)));
    R->addOperand(IncR->getVPSingleValue());
  }
}

VPBasicBlock *VPRecipeBuilder::handleReplication(
    Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
    VPlanPtr &Plan) {
  bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
      [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
      Range);

  bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
      [&](ElementCount VF) { return CM.isPredicatedInst(I); }, Range);

  // Even if the instruction is not marked as uniform, there are certain
  // intrinsic calls that can be effectively treated as such, so we check for
  // them here. Conservatively, we only do this for scalable vectors, since
  // for fixed-width VFs we can always fall back on full scalarization.
  if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) {
    switch (cast<IntrinsicInst>(I)->getIntrinsicID()) {
    case Intrinsic::assume:
    case Intrinsic::lifetime_start:
    case Intrinsic::lifetime_end:
      // For scalable vectors, if one of the operands is variant then we still
      // want to mark the call as uniform, which will generate one instruction
      // for just the first lane of the vector. We can't scalarize the call in
      // the same way as for fixed-width vectors because we don't know how many
      // lanes there are.
      //
      // The reasons for doing it this way for scalable vectors are:
      //    1. For the assume intrinsic generating the instruction for the
      //       first lane is still better than not generating any at all. For
      //       example, the input may be a splat across all lanes.
      //    2. For the lifetime start/end intrinsics the pointer operand only
      //       does anything useful when the input comes from a stack object,
      //       which suggests it should always be uniform. For non-stack
      //       objects the effect is to poison the object, which still allows
      //       us to remove the call.
      IsUniform = true;
      break;
    default:
      break;
    }
  }

  auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
                                       IsUniform, IsPredicated);
  setRecipe(I, Recipe);
  Plan->addVPValue(I, Recipe);

  // Find if I uses a predicated instruction. If so, it will use its scalar
  // value. Avoid hoisting the insert-element which packs the scalar value into
  // a vector value, as that happens iff all users use the vector value.
  for (VPValue *Op : Recipe->operands()) {
    auto *PredR = dyn_cast_or_null<VPPredInstPHIRecipe>(Op->getDef());
    if (!PredR)
      continue;
    auto *RepR =
        cast_or_null<VPReplicateRecipe>(PredR->getOperand(0)->getDef());
    assert(RepR->isPredicated() &&
           "expected Replicate recipe to be predicated");
    RepR->setAlsoPack(false);
  }

  // Finalize the recipe for Instr, first if it is not predicated.
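  // In the predicated case handled further below, the recipe is wrapped in a
  // triangle region built by createReplicateRegion, shaped roughly as:
  //   pred.<opcode>.entry --(BranchOnMask)--> pred.<opcode>.if
  //   pred.<opcode>.entry ------------------> pred.<opcode>.continue
  //   pred.<opcode>.if    ------------------> pred.<opcode>.continue
  // and recipes that follow are placed in a fresh successor VPBasicBlock.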
9027 if (!IsPredicated) { 9028 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n"); 9029 VPBB->appendRecipe(Recipe); 9030 return VPBB; 9031 } 9032 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); 9033 assert(VPBB->getSuccessors().empty() && 9034 "VPBB has successors when handling predicated replication."); 9035 // Record predicated instructions for above packing optimizations. 9036 VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan); 9037 VPBlockUtils::insertBlockAfter(Region, VPBB); 9038 auto *RegSucc = new VPBasicBlock(); 9039 VPBlockUtils::insertBlockAfter(RegSucc, Region); 9040 return RegSucc; 9041 } 9042 9043 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr, 9044 VPRecipeBase *PredRecipe, 9045 VPlanPtr &Plan) { 9046 // Instructions marked for predication are replicated and placed under an 9047 // if-then construct to prevent side-effects. 9048 9049 // Generate recipes to compute the block mask for this region. 9050 VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan); 9051 9052 // Build the triangular if-then region. 9053 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str(); 9054 assert(Instr->getParent() && "Predicated instruction not in any basic block"); 9055 auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask); 9056 auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); 9057 auto *PHIRecipe = Instr->getType()->isVoidTy() 9058 ? nullptr 9059 : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr)); 9060 if (PHIRecipe) { 9061 Plan->removeVPValueFor(Instr); 9062 Plan->addVPValue(Instr, PHIRecipe); 9063 } 9064 auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); 9065 auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); 9066 VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true); 9067 9068 // Note: first set Entry as region entry and then connect successors starting 9069 // from it in order, to propagate the "parent" of each VPBasicBlock. 9070 VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry); 9071 VPBlockUtils::connectBlocks(Pred, Exit); 9072 9073 return Region; 9074 } 9075 9076 VPRecipeOrVPValueTy 9077 VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, 9078 ArrayRef<VPValue *> Operands, 9079 VFRange &Range, VPlanPtr &Plan) { 9080 // First, check for specific widening recipes that deal with calls, memory 9081 // operations, inductions and Phi nodes. 
9082 if (auto *CI = dyn_cast<CallInst>(Instr)) 9083 return toVPRecipeResult(tryToWidenCall(CI, Operands, Range)); 9084 9085 if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr)) 9086 return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan)); 9087 9088 VPRecipeBase *Recipe; 9089 if (auto Phi = dyn_cast<PHINode>(Instr)) { 9090 if (Phi->getParent() != OrigLoop->getHeader()) 9091 return tryToBlend(Phi, Operands, Plan); 9092 if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands))) 9093 return toVPRecipeResult(Recipe); 9094 9095 VPWidenPHIRecipe *PhiRecipe = nullptr; 9096 if (Legal->isReductionVariable(Phi) || Legal->isFirstOrderRecurrence(Phi)) { 9097 VPValue *StartV = Operands[0]; 9098 if (Legal->isReductionVariable(Phi)) { 9099 RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi]; 9100 assert(RdxDesc.getRecurrenceStartValue() == 9101 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); 9102 PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV, 9103 CM.isInLoopReduction(Phi), 9104 CM.useOrderedReductions(RdxDesc)); 9105 } else { 9106 PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV); 9107 } 9108 9109 // Record the incoming value from the backedge, so we can add the incoming 9110 // value from the backedge after all recipes have been created. 9111 recordRecipeOf(cast<Instruction>( 9112 Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch()))); 9113 PhisToFix.push_back(PhiRecipe); 9114 } else { 9115 // TODO: record start and backedge value for remaining pointer induction 9116 // phis. 9117 assert(Phi->getType()->isPointerTy() && 9118 "only pointer phis should be handled here"); 9119 PhiRecipe = new VPWidenPHIRecipe(Phi); 9120 } 9121 9122 return toVPRecipeResult(PhiRecipe); 9123 } 9124 9125 if (isa<TruncInst>(Instr) && 9126 (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Operands, 9127 Range, *Plan))) 9128 return toVPRecipeResult(Recipe); 9129 9130 if (!shouldWiden(Instr, Range)) 9131 return nullptr; 9132 9133 if (auto GEP = dyn_cast<GetElementPtrInst>(Instr)) 9134 return toVPRecipeResult(new VPWidenGEPRecipe( 9135 GEP, make_range(Operands.begin(), Operands.end()), OrigLoop)); 9136 9137 if (auto *SI = dyn_cast<SelectInst>(Instr)) { 9138 bool InvariantCond = 9139 PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop); 9140 return toVPRecipeResult(new VPWidenSelectRecipe( 9141 *SI, make_range(Operands.begin(), Operands.end()), InvariantCond)); 9142 } 9143 9144 return toVPRecipeResult(tryToWiden(Instr, Operands)); 9145 } 9146 9147 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, 9148 ElementCount MaxVF) { 9149 assert(OrigLoop->isInnermost() && "Inner loop expected."); 9150 9151 // Collect instructions from the original loop that will become trivially dead 9152 // in the vectorized loop. We don't need to vectorize these instructions. For 9153 // example, original induction update instructions can become dead because we 9154 // separately emit induction "steps" when generating code for the new loop. 9155 // Similarly, we create a new latch condition when setting up the structure 9156 // of the new loop, so the old one can become dead. 9157 SmallPtrSet<Instruction *, 4> DeadInstructions; 9158 collectTriviallyDeadInstructions(DeadInstructions); 9159 9160 // Add assume instructions we need to drop to DeadInstructions, to prevent 9161 // them from being added to the VPlan. 9162 // TODO: We only need to drop assumes in blocks that get flattend. 
If the
  // control flow is preserved, we should keep them.
  auto &ConditionalAssumes = Legal->getConditionalAssumes();
  DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());

  MapVector<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
  // Dead instructions do not need sinking. Remove them from SinkAfter.
  for (Instruction *I : DeadInstructions)
    SinkAfter.erase(I);

  // Cannot sink instructions after dead instructions (there won't be any
  // recipes for them). Instead, find the first non-dead previous instruction.
  for (auto &P : Legal->getSinkAfter()) {
    Instruction *SinkTarget = P.second;
    Instruction *FirstInst = &*SinkTarget->getParent()->begin();
    (void)FirstInst;
    while (DeadInstructions.contains(SinkTarget)) {
      assert(
          SinkTarget != FirstInst &&
          "Must find a live instruction (at least the one feeding the "
          "first-order recurrence PHI) before reaching beginning of the block");
      SinkTarget = SinkTarget->getPrevNode();
      assert(SinkTarget != P.first &&
             "sink source equals target, no sinking required");
    }
    P.second = SinkTarget;
  }

  auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
  for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
    VFRange SubRange = {VF, MaxVFPlusOne};
    VPlans.push_back(
        buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter));
    VF = SubRange.End;
  }
}

VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
    VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions,
    const MapVector<Instruction *, Instruction *> &SinkAfter) {

  SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;

  VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);

  // ---------------------------------------------------------------------------
  // Pre-construction: record ingredients whose recipes we'll need to further
  // process after constructing the initial VPlan.
  // ---------------------------------------------------------------------------

  // Mark instructions we'll need to sink later and their targets as
  // ingredients whose recipe we'll need to record.
  for (auto &Entry : SinkAfter) {
    RecipeBuilder.recordRecipeOf(Entry.first);
    RecipeBuilder.recordRecipeOf(Entry.second);
  }
  for (auto &Reduction : CM.getInLoopReductionChains()) {
    PHINode *Phi = Reduction.first;
    RecurKind Kind = Legal->getReductionVars()[Phi].getRecurrenceKind();
    const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;

    RecipeBuilder.recordRecipeOf(Phi);
    for (auto &R : ReductionOperations) {
      RecipeBuilder.recordRecipeOf(R);
      // For min/max reductions, where we have a pair of icmp/select, we also
      // need to record the ICmp recipe, so it can be removed later.
      if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
        RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0)));
    }
  }

  // For each interleave group which is relevant for this (possibly trimmed)
  // Range, add it to the set of groups to be later applied to the VPlan and add
  // placeholders for its members' Recipes which we'll be replacing with a
  // single VPInterleaveRecipe.
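  // As an illustrative example, loads of A[2*i] and A[2*i+1] in the same
  // iteration form an interleave group with factor 2; instead of two strided
  // widened loads, the VPInterleaveRecipe emits a single wide load plus
  // shufflevectors to extract the members.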
9237 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) { 9238 auto applyIG = [IG, this](ElementCount VF) -> bool { 9239 return (VF.isVector() && // Query is illegal for VF == 1 9240 CM.getWideningDecision(IG->getInsertPos(), VF) == 9241 LoopVectorizationCostModel::CM_Interleave); 9242 }; 9243 if (!getDecisionAndClampRange(applyIG, Range)) 9244 continue; 9245 InterleaveGroups.insert(IG); 9246 for (unsigned i = 0; i < IG->getFactor(); i++) 9247 if (Instruction *Member = IG->getMember(i)) 9248 RecipeBuilder.recordRecipeOf(Member); 9249 }; 9250 9251 // --------------------------------------------------------------------------- 9252 // Build initial VPlan: Scan the body of the loop in a topological order to 9253 // visit each basic block after having visited its predecessor basic blocks. 9254 // --------------------------------------------------------------------------- 9255 9256 // Create a dummy pre-entry VPBasicBlock to start building the VPlan. 9257 auto Plan = std::make_unique<VPlan>(); 9258 VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry"); 9259 Plan->setEntry(VPBB); 9260 9261 // Scan the body of the loop in a topological order to visit each basic block 9262 // after having visited its predecessor basic blocks. 9263 LoopBlocksDFS DFS(OrigLoop); 9264 DFS.perform(LI); 9265 9266 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 9267 // Relevant instructions from basic block BB will be grouped into VPRecipe 9268 // ingredients and fill a new VPBasicBlock. 9269 unsigned VPBBsForBB = 0; 9270 auto *FirstVPBBForBB = new VPBasicBlock(BB->getName()); 9271 VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB); 9272 VPBB = FirstVPBBForBB; 9273 Builder.setInsertPoint(VPBB); 9274 9275 // Introduce each ingredient into VPlan. 9276 // TODO: Model and preserve debug instrinsics in VPlan. 9277 for (Instruction &I : BB->instructionsWithoutDebug()) { 9278 Instruction *Instr = &I; 9279 9280 // First filter out irrelevant instructions, to ensure no recipes are 9281 // built for them. 9282 if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr)) 9283 continue; 9284 9285 SmallVector<VPValue *, 4> Operands; 9286 auto *Phi = dyn_cast<PHINode>(Instr); 9287 if (Phi && Phi->getParent() == OrigLoop->getHeader()) { 9288 Operands.push_back(Plan->getOrAddVPValue( 9289 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()))); 9290 } else { 9291 auto OpRange = Plan->mapToVPValues(Instr->operands()); 9292 Operands = {OpRange.begin(), OpRange.end()}; 9293 } 9294 if (auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe( 9295 Instr, Operands, Range, Plan)) { 9296 // If Instr can be simplified to an existing VPValue, use it. 9297 if (RecipeOrValue.is<VPValue *>()) { 9298 auto *VPV = RecipeOrValue.get<VPValue *>(); 9299 Plan->addVPValue(Instr, VPV); 9300 // If the re-used value is a recipe, register the recipe for the 9301 // instruction, in case the recipe for Instr needs to be recorded. 9302 if (auto *R = dyn_cast_or_null<VPRecipeBase>(VPV->getDef())) 9303 RecipeBuilder.setRecipe(Instr, R); 9304 continue; 9305 } 9306 // Otherwise, add the new recipe. 9307 VPRecipeBase *Recipe = RecipeOrValue.get<VPRecipeBase *>(); 9308 for (auto *Def : Recipe->definedValues()) { 9309 auto *UV = Def->getUnderlyingValue(); 9310 Plan->addVPValue(UV, Def); 9311 } 9312 9313 RecipeBuilder.setRecipe(Instr, Recipe); 9314 VPBB->appendRecipe(Recipe); 9315 continue; 9316 } 9317 9318 // Otherwise, if all widening options failed, Instruction is to be 9319 // replicated. This may create a successor for VPBB. 
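      // (For example, a conditional udiv that must be predicated ends up in
      // its own replicate region, so subsequent recipes for this basic block
      // continue in a new successor VPBasicBlock returned by
      // handleReplication.)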
9320 VPBasicBlock *NextVPBB = 9321 RecipeBuilder.handleReplication(Instr, Range, VPBB, Plan); 9322 if (NextVPBB != VPBB) { 9323 VPBB = NextVPBB; 9324 VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++) 9325 : ""); 9326 } 9327 } 9328 } 9329 9330 RecipeBuilder.fixHeaderPhis(); 9331 9332 // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks 9333 // may also be empty, such as the last one VPBB, reflecting original 9334 // basic-blocks with no recipes. 9335 VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry()); 9336 assert(PreEntry->empty() && "Expecting empty pre-entry block."); 9337 VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor()); 9338 VPBlockUtils::disconnectBlocks(PreEntry, Entry); 9339 delete PreEntry; 9340 9341 // --------------------------------------------------------------------------- 9342 // Transform initial VPlan: Apply previously taken decisions, in order, to 9343 // bring the VPlan to its final state. 9344 // --------------------------------------------------------------------------- 9345 9346 // Apply Sink-After legal constraints. 9347 auto GetReplicateRegion = [](VPRecipeBase *R) -> VPRegionBlock * { 9348 auto *Region = dyn_cast_or_null<VPRegionBlock>(R->getParent()->getParent()); 9349 if (Region && Region->isReplicator()) { 9350 assert(Region->getNumSuccessors() == 1 && 9351 Region->getNumPredecessors() == 1 && "Expected SESE region!"); 9352 assert(R->getParent()->size() == 1 && 9353 "A recipe in an original replicator region must be the only " 9354 "recipe in its block"); 9355 return Region; 9356 } 9357 return nullptr; 9358 }; 9359 for (auto &Entry : SinkAfter) { 9360 VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first); 9361 VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second); 9362 9363 auto *TargetRegion = GetReplicateRegion(Target); 9364 auto *SinkRegion = GetReplicateRegion(Sink); 9365 if (!SinkRegion) { 9366 // If the sink source is not a replicate region, sink the recipe directly. 9367 if (TargetRegion) { 9368 // The target is in a replication region, make sure to move Sink to 9369 // the block after it, not into the replication region itself. 9370 VPBasicBlock *NextBlock = 9371 cast<VPBasicBlock>(TargetRegion->getSuccessors().front()); 9372 Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi()); 9373 } else 9374 Sink->moveAfter(Target); 9375 continue; 9376 } 9377 9378 // The sink source is in a replicate region. Unhook the region from the CFG. 9379 auto *SinkPred = SinkRegion->getSinglePredecessor(); 9380 auto *SinkSucc = SinkRegion->getSingleSuccessor(); 9381 VPBlockUtils::disconnectBlocks(SinkPred, SinkRegion); 9382 VPBlockUtils::disconnectBlocks(SinkRegion, SinkSucc); 9383 VPBlockUtils::connectBlocks(SinkPred, SinkSucc); 9384 9385 if (TargetRegion) { 9386 // The target recipe is also in a replicate region, move the sink region 9387 // after the target region. 9388 auto *TargetSucc = TargetRegion->getSingleSuccessor(); 9389 VPBlockUtils::disconnectBlocks(TargetRegion, TargetSucc); 9390 VPBlockUtils::connectBlocks(TargetRegion, SinkRegion); 9391 VPBlockUtils::connectBlocks(SinkRegion, TargetSucc); 9392 } else { 9393 // The sink source is in a replicate region, we need to move the whole 9394 // replicate region, which should only contain a single recipe in the 9395 // main block. 
9396 auto *SplitBlock = 9397 Target->getParent()->splitAt(std::next(Target->getIterator())); 9398 9399 auto *SplitPred = SplitBlock->getSinglePredecessor(); 9400 9401 VPBlockUtils::disconnectBlocks(SplitPred, SplitBlock); 9402 VPBlockUtils::connectBlocks(SplitPred, SinkRegion); 9403 VPBlockUtils::connectBlocks(SinkRegion, SplitBlock); 9404 if (VPBB == SplitPred) 9405 VPBB = SplitBlock; 9406 } 9407 } 9408 9409 // Introduce a recipe to combine the incoming and previous values of a 9410 // first-order recurrence. 9411 for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) { 9412 auto *RecurPhi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R); 9413 if (!RecurPhi) 9414 continue; 9415 9416 auto *RecurSplice = cast<VPInstruction>( 9417 Builder.createNaryOp(VPInstruction::FirstOrderRecurrenceSplice, 9418 {RecurPhi, RecurPhi->getBackedgeValue()})); 9419 9420 VPRecipeBase *PrevRecipe = RecurPhi->getBackedgeRecipe(); 9421 if (auto *Region = GetReplicateRegion(PrevRecipe)) { 9422 VPBasicBlock *Succ = cast<VPBasicBlock>(Region->getSingleSuccessor()); 9423 RecurSplice->moveBefore(*Succ, Succ->getFirstNonPhi()); 9424 } else 9425 RecurSplice->moveAfter(PrevRecipe); 9426 RecurPhi->replaceAllUsesWith(RecurSplice); 9427 // Set the first operand of RecurSplice to RecurPhi again, after replacing 9428 // all users. 9429 RecurSplice->setOperand(0, RecurPhi); 9430 } 9431 9432 // Interleave memory: for each Interleave Group we marked earlier as relevant 9433 // for this VPlan, replace the Recipes widening its memory instructions with a 9434 // single VPInterleaveRecipe at its insertion point. 9435 for (auto IG : InterleaveGroups) { 9436 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( 9437 RecipeBuilder.getRecipe(IG->getInsertPos())); 9438 SmallVector<VPValue *, 4> StoredValues; 9439 for (unsigned i = 0; i < IG->getFactor(); ++i) 9440 if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) { 9441 auto *StoreR = 9442 cast<VPWidenMemoryInstructionRecipe>(RecipeBuilder.getRecipe(SI)); 9443 StoredValues.push_back(StoreR->getStoredValue()); 9444 } 9445 9446 auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues, 9447 Recipe->getMask()); 9448 VPIG->insertBefore(Recipe); 9449 unsigned J = 0; 9450 for (unsigned i = 0; i < IG->getFactor(); ++i) 9451 if (Instruction *Member = IG->getMember(i)) { 9452 if (!Member->getType()->isVoidTy()) { 9453 VPValue *OriginalV = Plan->getVPValue(Member); 9454 Plan->removeVPValueFor(Member); 9455 Plan->addVPValue(Member, VPIG->getVPValue(J)); 9456 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J)); 9457 J++; 9458 } 9459 RecipeBuilder.getRecipe(Member)->eraseFromParent(); 9460 } 9461 } 9462 9463 // Adjust the recipes for any inloop reductions. 9464 adjustRecipesForReductions(VPBB, Plan, RecipeBuilder, Range.Start); 9465 9466 VPlanTransforms::sinkScalarOperands(*Plan); 9467 VPlanTransforms::mergeReplicateRegions(*Plan); 9468 9469 std::string PlanName; 9470 raw_string_ostream RSO(PlanName); 9471 ElementCount VF = Range.Start; 9472 Plan->addVF(VF); 9473 RSO << "Initial VPlan for VF={" << VF; 9474 for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) { 9475 Plan->addVF(VF); 9476 RSO << "," << VF; 9477 } 9478 RSO << "},UF>=1"; 9479 RSO.flush(); 9480 Plan->setName(PlanName); 9481 9482 return Plan; 9483 } 9484 9485 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { 9486 // Outer loop handling: They may require CFG and instruction level 9487 // transformations before even evaluating whether vectorization is profitable. 
  // Since we cannot modify the incoming IR, we need to build VPlan upfront in
  // the vectorization pipeline.
  assert(!OrigLoop->isInnermost());
  assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");

  // Create new empty VPlan.
  auto Plan = std::make_unique<VPlan>();

  // Build hierarchical CFG.
  VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
  HCFGBuilder.buildHierarchicalCFG();

  for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End);
       VF *= 2)
    Plan->addVF(VF);

  if (EnableVPlanPredication) {
    VPlanPredicator VPP(*Plan);
    VPP.predicate();

    // Avoid running transformation to recipes until masked code generation in
    // VPlan-native path is in place.
    return Plan;
  }

  SmallPtrSet<Instruction *, 1> DeadInstructions;
  VPlanTransforms::VPInstructionsToVPRecipes(OrigLoop, Plan,
                                             Legal->getInductionVars(),
                                             DeadInstructions, *PSE.getSE());
  return Plan;
}

// Adjust the recipes for reductions. For in-loop reductions the chain of
// instructions leading from the loop exit instr to the phi needs to be
// converted to reductions, with one operand being vector and the other being
// the scalar reduction chain. For other reductions, a select is introduced
// between the phi and live-out recipes when folding the tail.
void LoopVectorizationPlanner::adjustRecipesForReductions(
    VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder,
    ElementCount MinVF) {
  for (auto &Reduction : CM.getInLoopReductionChains()) {
    PHINode *Phi = Reduction.first;
    RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
    const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;

    if (MinVF.isScalar() && !CM.useOrderedReductions(RdxDesc))
      continue;

    // ReductionOperations are ordered top-down from the phi's use to the
    // LoopExitValue. We keep track of the previous item (the Chain) to tell
    // which of the two operands will remain scalar and which will be reduced.
    // For minmax the chain will be the select instructions.
    Instruction *Chain = Phi;
    for (Instruction *R : ReductionOperations) {
      VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
      RecurKind Kind = RdxDesc.getRecurrenceKind();

      VPValue *ChainOp = Plan->getVPValue(Chain);
      unsigned FirstOpId;
      if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
        assert(isa<VPWidenSelectRecipe>(WidenRecipe) &&
               "Expected to replace a VPWidenSelectSC");
        FirstOpId = 1;
      } else {
        assert((MinVF.isScalar() || isa<VPWidenRecipe>(WidenRecipe)) &&
               "Expected to replace a VPWidenSC");
        FirstOpId = 0;
      }
      unsigned VecOpId =
          R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
      VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));

      auto *CondOp = CM.foldTailByMasking()
                         ?
RecipeBuilder.createBlockInMask(R->getParent(), Plan) 9562 : nullptr; 9563 VPReductionRecipe *RedRecipe = new VPReductionRecipe( 9564 &RdxDesc, R, ChainOp, VecOp, CondOp, TTI); 9565 WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe); 9566 Plan->removeVPValueFor(R); 9567 Plan->addVPValue(R, RedRecipe); 9568 WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator()); 9569 WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe); 9570 WidenRecipe->eraseFromParent(); 9571 9572 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9573 VPRecipeBase *CompareRecipe = 9574 RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0))); 9575 assert(isa<VPWidenRecipe>(CompareRecipe) && 9576 "Expected to replace a VPWidenSC"); 9577 assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 && 9578 "Expected no remaining users"); 9579 CompareRecipe->eraseFromParent(); 9580 } 9581 Chain = R; 9582 } 9583 } 9584 9585 // If tail is folded by masking, introduce selects between the phi 9586 // and the live-out instruction of each reduction, at the end of the latch. 9587 if (CM.foldTailByMasking()) { 9588 for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) { 9589 VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R); 9590 if (!PhiR || PhiR->isInLoop()) 9591 continue; 9592 Builder.setInsertPoint(LatchVPBB); 9593 VPValue *Cond = 9594 RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan); 9595 VPValue *Red = PhiR->getBackedgeValue(); 9596 Builder.createNaryOp(Instruction::Select, {Cond, Red, PhiR}); 9597 } 9598 } 9599 } 9600 9601 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 9602 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, 9603 VPSlotTracker &SlotTracker) const { 9604 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; 9605 IG->getInsertPos()->printAsOperand(O, false); 9606 O << ", "; 9607 getAddr()->printAsOperand(O, SlotTracker); 9608 VPValue *Mask = getMask(); 9609 if (Mask) { 9610 O << ", "; 9611 Mask->printAsOperand(O, SlotTracker); 9612 } 9613 9614 unsigned OpIdx = 0; 9615 for (unsigned i = 0; i < IG->getFactor(); ++i) { 9616 if (!IG->getMember(i)) 9617 continue; 9618 if (getNumStoreOperands() > 0) { 9619 O << "\n" << Indent << " store "; 9620 getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker); 9621 O << " to index " << i; 9622 } else { 9623 O << "\n" << Indent << " "; 9624 getVPValue(OpIdx)->printAsOperand(O, SlotTracker); 9625 O << " = load from index " << i; 9626 } 9627 ++OpIdx; 9628 } 9629 } 9630 #endif 9631 9632 void VPWidenCallRecipe::execute(VPTransformState &State) { 9633 State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this, 9634 *this, State); 9635 } 9636 9637 void VPWidenSelectRecipe::execute(VPTransformState &State) { 9638 State.ILV->widenSelectInstruction(*cast<SelectInst>(getUnderlyingInstr()), 9639 this, *this, InvariantCond, State); 9640 } 9641 9642 void VPWidenRecipe::execute(VPTransformState &State) { 9643 State.ILV->widenInstruction(*getUnderlyingInstr(), this, *this, State); 9644 } 9645 9646 void VPWidenGEPRecipe::execute(VPTransformState &State) { 9647 State.ILV->widenGEP(cast<GetElementPtrInst>(getUnderlyingInstr()), this, 9648 *this, State.UF, State.VF, IsPtrLoopInvariant, 9649 IsIndexLoopInvariant, State); 9650 } 9651 9652 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { 9653 assert(!State.Instance && "Int or FP induction being replicated."); 9654 State.ILV->widenIntOrFpInduction(IV, 
getStartValue()->getLiveInIRValue(), 9655 getTruncInst(), getVPValue(0), 9656 getCastValue(), State); 9657 } 9658 9659 void VPWidenPHIRecipe::execute(VPTransformState &State) { 9660 State.ILV->widenPHIInstruction(cast<PHINode>(getUnderlyingValue()), this, 9661 State); 9662 } 9663 9664 void VPBlendRecipe::execute(VPTransformState &State) { 9665 State.ILV->setDebugLocFromInst(Phi, &State.Builder); 9666 // We know that all PHIs in non-header blocks are converted into 9667 // selects, so we don't have to worry about the insertion order and we 9668 // can just use the builder. 9669 // At this point we generate the predication tree. There may be 9670 // duplications since this is a simple recursive scan, but future 9671 // optimizations will clean it up. 9672 9673 unsigned NumIncoming = getNumIncomingValues(); 9674 9675 // Generate a sequence of selects of the form: 9676 // SELECT(Mask3, In3, 9677 // SELECT(Mask2, In2, 9678 // SELECT(Mask1, In1, 9679 // In0))) 9680 // Note that Mask0 is never used: lanes for which no path reaches this phi and 9681 // are essentially undef are taken from In0. 9682 InnerLoopVectorizer::VectorParts Entry(State.UF); 9683 for (unsigned In = 0; In < NumIncoming; ++In) { 9684 for (unsigned Part = 0; Part < State.UF; ++Part) { 9685 // We might have single edge PHIs (blocks) - use an identity 9686 // 'select' for the first PHI operand. 9687 Value *In0 = State.get(getIncomingValue(In), Part); 9688 if (In == 0) 9689 Entry[Part] = In0; // Initialize with the first incoming value. 9690 else { 9691 // Select between the current value and the previous incoming edge 9692 // based on the incoming mask. 9693 Value *Cond = State.get(getMask(In), Part); 9694 Entry[Part] = 9695 State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi"); 9696 } 9697 } 9698 } 9699 for (unsigned Part = 0; Part < State.UF; ++Part) 9700 State.set(this, Entry[Part], Part); 9701 } 9702 9703 void VPInterleaveRecipe::execute(VPTransformState &State) { 9704 assert(!State.Instance && "Interleave group being replicated."); 9705 State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(), 9706 getStoredValues(), getMask()); 9707 } 9708 9709 void VPReductionRecipe::execute(VPTransformState &State) { 9710 assert(!State.Instance && "Reduction being replicated."); 9711 Value *PrevInChain = State.get(getChainOp(), 0); 9712 for (unsigned Part = 0; Part < State.UF; ++Part) { 9713 RecurKind Kind = RdxDesc->getRecurrenceKind(); 9714 bool IsOrdered = State.ILV->useOrderedReductions(*RdxDesc); 9715 Value *NewVecOp = State.get(getVecOp(), Part); 9716 if (VPValue *Cond = getCondOp()) { 9717 Value *NewCond = State.get(Cond, Part); 9718 VectorType *VecTy = cast<VectorType>(NewVecOp->getType()); 9719 Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity( 9720 Kind, VecTy->getElementType(), RdxDesc->getFastMathFlags()); 9721 Constant *IdenVec = 9722 ConstantVector::getSplat(VecTy->getElementCount(), Iden); 9723 Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec); 9724 NewVecOp = Select; 9725 } 9726 Value *NewRed; 9727 Value *NextInChain; 9728 if (IsOrdered) { 9729 if (State.VF.isVector()) 9730 NewRed = createOrderedReduction(State.Builder, *RdxDesc, NewVecOp, 9731 PrevInChain); 9732 else 9733 NewRed = State.Builder.CreateBinOp( 9734 (Instruction::BinaryOps)getUnderlyingInstr()->getOpcode(), 9735 PrevInChain, NewVecOp); 9736 PrevInChain = NewRed; 9737 } else { 9738 PrevInChain = State.get(getChainOp(), Part); 9739 NewRed = createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp); 
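      // In the unordered case the whole vector operand is reduced to a scalar
      // on every iteration (e.g. via an llvm.vector.reduce.* style target
      // reduction) and then folded into the running scalar chain below.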
    }
    if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
      NextInChain =
          createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(),
                         NewRed, PrevInChain);
    } else if (IsOrdered)
      NextInChain = NewRed;
    else {
      NextInChain = State.Builder.CreateBinOp(
          (Instruction::BinaryOps)getUnderlyingInstr()->getOpcode(), NewRed,
          PrevInChain);
    }
    State.set(this, NextInChain, Part);
  }
}

void VPReplicateRecipe::execute(VPTransformState &State) {
  if (State.Instance) { // Generate a single instance.
    assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
    State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *this,
                                    *State.Instance, IsPredicated, State);
    // Insert scalar instance, packing it into a vector.
    if (AlsoPack && State.VF.isVector()) {
      // If we're constructing lane 0, initialize to start from poison.
      if (State.Instance->Lane.isFirstLane()) {
        assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
        Value *Poison = PoisonValue::get(
            VectorType::get(getUnderlyingValue()->getType(), State.VF));
        State.set(this, Poison, State.Instance->Part);
      }
      State.ILV->packScalarIntoVectorValue(this, *State.Instance, State);
    }
    return;
  }

  // Generate scalar instances for all VF lanes of all UF parts, unless the
  // instruction is uniform, in which case generate only the first lane for
  // each of the UF parts.
  unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();
  assert((!State.VF.isScalable() || IsUniform) &&
         "Can't scalarize a scalable vector");
  for (unsigned Part = 0; Part < State.UF; ++Part)
    for (unsigned Lane = 0; Lane < EndLane; ++Lane)
      State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *this,
                                      VPIteration(Part, Lane), IsPredicated,
                                      State);
}

void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Branch on Mask works only on single instance.");

  unsigned Part = State.Instance->Part;
  unsigned Lane = State.Instance->Lane.getKnownLane();

  Value *ConditionBit = nullptr;
  VPValue *BlockInMask = getMask();
  if (BlockInMask) {
    ConditionBit = State.get(BlockInMask, Part);
    if (ConditionBit->getType()->isVectorTy())
      ConditionBit = State.Builder.CreateExtractElement(
          ConditionBit, State.Builder.getInt32(Lane));
  } else // Block in mask is all-one.
    ConditionBit = State.Builder.getTrue();

  // Replace the temporary unreachable terminator with a new conditional
  // branch, whose two destinations will be set later when they are created.
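  // As an illustration only (names are made up, not what is necessarily
  // emitted), the branch created here typically ends up shaped like
  //
  //   %cond = extractelement <4 x i1> %block.mask, i32 2
  //   br i1 %cond, label %pred.if, label %pred.continue
  //
  // with both successors filled in once the corresponding basic blocks are
  // materialized.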
9806 auto *CurrentTerminator = State.CFG.PrevBB->getTerminator(); 9807 assert(isa<UnreachableInst>(CurrentTerminator) && 9808 "Expected to replace unreachable terminator with conditional branch."); 9809 auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit); 9810 CondBr->setSuccessor(0, nullptr); 9811 ReplaceInstWithInst(CurrentTerminator, CondBr); 9812 } 9813 9814 void VPPredInstPHIRecipe::execute(VPTransformState &State) { 9815 assert(State.Instance && "Predicated instruction PHI works per instance."); 9816 Instruction *ScalarPredInst = 9817 cast<Instruction>(State.get(getOperand(0), *State.Instance)); 9818 BasicBlock *PredicatedBB = ScalarPredInst->getParent(); 9819 BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor(); 9820 assert(PredicatingBB && "Predicated block has no single predecessor."); 9821 assert(isa<VPReplicateRecipe>(getOperand(0)) && 9822 "operand must be VPReplicateRecipe"); 9823 9824 // By current pack/unpack logic we need to generate only a single phi node: if 9825 // a vector value for the predicated instruction exists at this point it means 9826 // the instruction has vector users only, and a phi for the vector value is 9827 // needed. In this case the recipe of the predicated instruction is marked to 9828 // also do that packing, thereby "hoisting" the insert-element sequence. 9829 // Otherwise, a phi node for the scalar value is needed. 9830 unsigned Part = State.Instance->Part; 9831 if (State.hasVectorValue(getOperand(0), Part)) { 9832 Value *VectorValue = State.get(getOperand(0), Part); 9833 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); 9834 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); 9835 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. 9836 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. 9837 if (State.hasVectorValue(this, Part)) 9838 State.reset(this, VPhi, Part); 9839 else 9840 State.set(this, VPhi, Part); 9841 // NOTE: Currently we need to update the value of the operand, so the next 9842 // predicated iteration inserts its generated value in the correct vector. 9843 State.reset(getOperand(0), VPhi, Part); 9844 } else { 9845 Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType(); 9846 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); 9847 Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()), 9848 PredicatingBB); 9849 Phi->addIncoming(ScalarPredInst, PredicatedBB); 9850 if (State.hasScalarValue(this, *State.Instance)) 9851 State.reset(this, Phi, *State.Instance); 9852 else 9853 State.set(this, Phi, *State.Instance); 9854 // NOTE: Currently we need to update the value of the operand, so the next 9855 // predicated iteration inserts its generated value in the correct vector. 9856 State.reset(getOperand(0), Phi, *State.Instance); 9857 } 9858 } 9859 9860 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { 9861 VPValue *StoredValue = isStore() ? getStoredValue() : nullptr; 9862 State.ILV->vectorizeMemoryInstruction( 9863 &Ingredient, State, StoredValue ? nullptr : getVPSingleValue(), getAddr(), 9864 StoredValue, getMask()); 9865 } 9866 9867 // Determine how to lower the scalar epilogue, which depends on 1) optimising 9868 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing 9869 // predication, and 4) a TTI hook that analyses whether the loop is suitable 9870 // for predication. 
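// As a concrete example of step 3, a hint such as
//
//   #pragma clang loop vectorize_predicate(enable)
//
// (lowered by the front end to llvm.loop.vectorize.predicate.enable loop
// metadata) is the usual way Hints.getPredicate() comes back as FK_Enabled;
// the exact pragma spelling is a front-end detail shown here only for
// illustration.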
9871 static ScalarEpilogueLowering getScalarEpilogueLowering( 9872 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, 9873 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, 9874 AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, 9875 LoopVectorizationLegality &LVL) { 9876 // 1) OptSize takes precedence over all other options, i.e. if this is set, 9877 // don't look at hints or options, and don't request a scalar epilogue. 9878 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from 9879 // LoopAccessInfo (due to code dependency and not being able to reliably get 9880 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection 9881 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without 9882 // versioning when the vectorization is forced, unlike hasOptSize. So revert 9883 // back to the old way and vectorize with versioning when forced. See D81345.) 9884 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, 9885 PGSOQueryType::IRPass) && 9886 Hints.getForce() != LoopVectorizeHints::FK_Enabled)) 9887 return CM_ScalarEpilogueNotAllowedOptSize; 9888 9889 // 2) If set, obey the directives 9890 if (PreferPredicateOverEpilogue.getNumOccurrences()) { 9891 switch (PreferPredicateOverEpilogue) { 9892 case PreferPredicateTy::ScalarEpilogue: 9893 return CM_ScalarEpilogueAllowed; 9894 case PreferPredicateTy::PredicateElseScalarEpilogue: 9895 return CM_ScalarEpilogueNotNeededUsePredicate; 9896 case PreferPredicateTy::PredicateOrDontVectorize: 9897 return CM_ScalarEpilogueNotAllowedUsePredicate; 9898 }; 9899 } 9900 9901 // 3) If set, obey the hints 9902 switch (Hints.getPredicate()) { 9903 case LoopVectorizeHints::FK_Enabled: 9904 return CM_ScalarEpilogueNotNeededUsePredicate; 9905 case LoopVectorizeHints::FK_Disabled: 9906 return CM_ScalarEpilogueAllowed; 9907 }; 9908 9909 // 4) if the TTI hook indicates this is profitable, request predication. 9910 if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, 9911 LVL.getLAI())) 9912 return CM_ScalarEpilogueNotNeededUsePredicate; 9913 9914 return CM_ScalarEpilogueAllowed; 9915 } 9916 9917 Value *VPTransformState::get(VPValue *Def, unsigned Part) { 9918 // If Values have been set for this Def return the one relevant for \p Part. 9919 if (hasVectorValue(Def, Part)) 9920 return Data.PerPartOutput[Def][Part]; 9921 9922 if (!hasScalarValue(Def, {Part, 0})) { 9923 Value *IRV = Def->getLiveInIRValue(); 9924 Value *B = ILV->getBroadcastInstrs(IRV); 9925 set(Def, B, Part); 9926 return B; 9927 } 9928 9929 Value *ScalarValue = get(Def, {Part, 0}); 9930 // If we aren't vectorizing, we can just copy the scalar map values over 9931 // to the vector map. 9932 if (VF.isScalar()) { 9933 set(Def, ScalarValue, Part); 9934 return ScalarValue; 9935 } 9936 9937 auto *RepR = dyn_cast<VPReplicateRecipe>(Def); 9938 bool IsUniform = RepR && RepR->isUniform(); 9939 9940 unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1; 9941 // Check if there is a scalar value for the selected lane. 9942 if (!hasScalarValue(Def, {Part, LastLane})) { 9943 // At the moment, VPWidenIntOrFpInductionRecipes can also be uniform. 
    assert(isa<VPWidenIntOrFpInductionRecipe>(Def->getDef()) &&
           "unexpected recipe found to be invariant");
    IsUniform = true;
    LastLane = 0;
  }

  auto *LastInst = cast<Instruction>(get(Def, {Part, LastLane}));
  // Set the insert point after the last scalarized instruction or after the
  // last PHI, if LastInst is a PHI. This ensures the insertelement sequence
  // will directly follow the scalar definitions.
  auto OldIP = Builder.saveIP();
  auto NewIP =
      isa<PHINode>(LastInst)
          ? BasicBlock::iterator(LastInst->getParent()->getFirstNonPHI())
          : std::next(BasicBlock::iterator(LastInst));
  Builder.SetInsertPoint(&*NewIP);

  // However, if we are vectorizing, we need to construct the vector values.
  // If the value is known to be uniform after vectorization, we can just
  // broadcast the scalar value corresponding to lane zero for each unroll
  // iteration. Otherwise, we construct the vector values using
  // insertelement instructions. Since the resulting vectors are stored in
  // State, we will only generate the insertelements once.
  Value *VectorValue = nullptr;
  if (IsUniform) {
    VectorValue = ILV->getBroadcastInstrs(ScalarValue);
    set(Def, VectorValue, Part);
  } else {
    // Initialize packing with insertelements to start from poison.
    assert(!VF.isScalable() && "VF is assumed to be non scalable.");
    Value *Undef = PoisonValue::get(VectorType::get(LastInst->getType(), VF));
    set(Def, Undef, Part);
    for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
      ILV->packScalarIntoVectorValue(Def, {Part, Lane}, *this);
    VectorValue = get(Def, Part);
  }
  Builder.restoreIP(OldIP);
  return VectorValue;
}

// Process the loop in the VPlan-native vectorization path. This path builds
// VPlan upfront in the vectorization pipeline, which allows applying
// VPlan-to-VPlan transformations from the very beginning without modifying the
// input LLVM IR.
static bool processLoopInVPlanNativePath(
    Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
    LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
    TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
    OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
    ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints,
    LoopVectorizationRequirements &Requirements) {

  if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
    LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
    return false;
  }
  assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
  Function *F = L->getHeader()->getParent();
  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());

  ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
      F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);

  LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
                                &Hints, IAI);
  // Use the planner for outer loop vectorization.
  // TODO: CM is not used at this point inside the planner. Turn CM into an
  // optional argument if we don't need it in the future.
  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints,
                               Requirements, ORE);

  // Get user vectorization factor.
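  // The user factor typically comes from llvm.loop.vectorize.width metadata,
  // e.g. as set by "#pragma clang loop vectorize_width(4)"; when no width was
  // requested the hint is zero and the planner picks the factor itself. (The
  // pragma spelling above is illustrative.)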
10016 ElementCount UserVF = Hints.getWidth(); 10017 10018 CM.collectElementTypesForWidening(); 10019 10020 // Plan how to best vectorize, return the best VF and its cost. 10021 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF); 10022 10023 // If we are stress testing VPlan builds, do not attempt to generate vector 10024 // code. Masked vector code generation support will follow soon. 10025 // Also, do not attempt to vectorize if no vector code will be produced. 10026 if (VPlanBuildStressTest || EnableVPlanPredication || 10027 VectorizationFactor::Disabled() == VF) 10028 return false; 10029 10030 LVP.setBestPlan(VF.Width, 1); 10031 10032 { 10033 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, 10034 F->getParent()->getDataLayout()); 10035 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL, 10036 &CM, BFI, PSI, Checks); 10037 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" 10038 << L->getHeader()->getParent()->getName() << "\"\n"); 10039 LVP.executePlan(LB, DT); 10040 } 10041 10042 // Mark the loop as already vectorized to avoid vectorizing again. 10043 Hints.setAlreadyVectorized(); 10044 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 10045 return true; 10046 } 10047 10048 // Emit a remark if there are stores to floats that required a floating point 10049 // extension. If the vectorized loop was generated with floating point there 10050 // will be a performance penalty from the conversion overhead and the change in 10051 // the vector width. 10052 static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) { 10053 SmallVector<Instruction *, 4> Worklist; 10054 for (BasicBlock *BB : L->getBlocks()) { 10055 for (Instruction &Inst : *BB) { 10056 if (auto *S = dyn_cast<StoreInst>(&Inst)) { 10057 if (S->getValueOperand()->getType()->isFloatTy()) 10058 Worklist.push_back(S); 10059 } 10060 } 10061 } 10062 10063 // Traverse the floating point stores upwards searching, for floating point 10064 // conversions. 10065 SmallPtrSet<const Instruction *, 4> Visited; 10066 SmallPtrSet<const Instruction *, 4> EmittedRemark; 10067 while (!Worklist.empty()) { 10068 auto *I = Worklist.pop_back_val(); 10069 if (!L->contains(I)) 10070 continue; 10071 if (!Visited.insert(I).second) 10072 continue; 10073 10074 // Emit a remark if the floating point store required a floating 10075 // point conversion. 10076 // TODO: More work could be done to identify the root cause such as a 10077 // constant or a function return type and point the user to it. 10078 if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second) 10079 ORE->emit([&]() { 10080 return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision", 10081 I->getDebugLoc(), L->getHeader()) 10082 << "floating point conversion changes vector width. " 10083 << "Mixed floating point precision requires an up/down " 10084 << "cast that will negatively impact performance."; 10085 }); 10086 10087 for (Use &Op : I->operands()) 10088 if (auto *OpI = dyn_cast<Instruction>(Op)) 10089 Worklist.push_back(OpI); 10090 } 10091 } 10092 10093 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts) 10094 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced || 10095 !EnableLoopInterleaving), 10096 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced || 10097 !EnableLoopVectorization) {} 10098 10099 bool LoopVectorizePass::processLoop(Loop *L) { 10100 assert((EnableVPlanNativePath || L->isInnermost()) && 10101 "VPlan-native path is not enabled. 
Only process inner loops."); 10102 10103 #ifndef NDEBUG 10104 const std::string DebugLocStr = getDebugLocString(L); 10105 #endif /* NDEBUG */ 10106 10107 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \"" 10108 << L->getHeader()->getParent()->getName() << "\" from " 10109 << DebugLocStr << "\n"); 10110 10111 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE); 10112 10113 LLVM_DEBUG( 10114 dbgs() << "LV: Loop hints:" 10115 << " force=" 10116 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled 10117 ? "disabled" 10118 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled 10119 ? "enabled" 10120 : "?")) 10121 << " width=" << Hints.getWidth() 10122 << " interleave=" << Hints.getInterleave() << "\n"); 10123 10124 // Function containing loop 10125 Function *F = L->getHeader()->getParent(); 10126 10127 // Looking at the diagnostic output is the only way to determine if a loop 10128 // was vectorized (other than looking at the IR or machine code), so it 10129 // is important to generate an optimization remark for each loop. Most of 10130 // these messages are generated as OptimizationRemarkAnalysis. Remarks 10131 // generated as OptimizationRemark and OptimizationRemarkMissed are 10132 // less verbose reporting vectorized loops and unvectorized loops that may 10133 // benefit from vectorization, respectively. 10134 10135 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) { 10136 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); 10137 return false; 10138 } 10139 10140 PredicatedScalarEvolution PSE(*SE, *L); 10141 10142 // Check if it is legal to vectorize the loop. 10143 LoopVectorizationRequirements Requirements; 10144 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE, 10145 &Requirements, &Hints, DB, AC, BFI, PSI); 10146 if (!LVL.canVectorize(EnableVPlanNativePath)) { 10147 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); 10148 Hints.emitRemarkWithHints(); 10149 return false; 10150 } 10151 10152 // Check the function attributes and profiles to find out if this function 10153 // should be optimized for size. 10154 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 10155 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL); 10156 10157 // Entrance to the VPlan-native vectorization path. Outer loops are processed 10158 // here. They may require CFG and instruction level transformations before 10159 // even evaluating whether vectorization is profitable. Since we cannot modify 10160 // the incoming IR, we need to build VPlan upfront in the vectorization 10161 // pipeline. 10162 if (!L->isInnermost()) 10163 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, 10164 ORE, BFI, PSI, Hints, Requirements); 10165 10166 assert(L->isInnermost() && "Inner loop expected."); 10167 10168 // Check the loop for a trip count threshold: vectorize loops with a tiny trip 10169 // count by optimizing for size, to minimize overheads. 10170 auto ExpectedTC = getSmallBestKnownTC(*SE, L); 10171 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) { 10172 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. 
" 10173 << "This loop is worth vectorizing only if no scalar " 10174 << "iteration overheads are incurred."); 10175 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) 10176 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); 10177 else { 10178 LLVM_DEBUG(dbgs() << "\n"); 10179 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; 10180 } 10181 } 10182 10183 // Check the function attributes to see if implicit floats are allowed. 10184 // FIXME: This check doesn't seem possibly correct -- what if the loop is 10185 // an integer loop and the vector instructions selected are purely integer 10186 // vector instructions? 10187 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { 10188 reportVectorizationFailure( 10189 "Can't vectorize when the NoImplicitFloat attribute is used", 10190 "loop not vectorized due to NoImplicitFloat attribute", 10191 "NoImplicitFloat", ORE, L); 10192 Hints.emitRemarkWithHints(); 10193 return false; 10194 } 10195 10196 // Check if the target supports potentially unsafe FP vectorization. 10197 // FIXME: Add a check for the type of safety issue (denormal, signaling) 10198 // for the target we're vectorizing for, to make sure none of the 10199 // additional fp-math flags can help. 10200 if (Hints.isPotentiallyUnsafe() && 10201 TTI->isFPVectorizationPotentiallyUnsafe()) { 10202 reportVectorizationFailure( 10203 "Potentially unsafe FP op prevents vectorization", 10204 "loop not vectorized due to unsafe FP support.", 10205 "UnsafeFP", ORE, L); 10206 Hints.emitRemarkWithHints(); 10207 return false; 10208 } 10209 10210 if (!LVL.canVectorizeFPMath(ForceOrderedReductions)) { 10211 ORE->emit([&]() { 10212 auto *ExactFPMathInst = Requirements.getExactFPInst(); 10213 return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps", 10214 ExactFPMathInst->getDebugLoc(), 10215 ExactFPMathInst->getParent()) 10216 << "loop not vectorized: cannot prove it is safe to reorder " 10217 "floating-point operations"; 10218 }); 10219 LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to " 10220 "reorder floating-point operations\n"); 10221 Hints.emitRemarkWithHints(); 10222 return false; 10223 } 10224 10225 bool UseInterleaved = TTI->enableInterleavedAccessVectorization(); 10226 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI()); 10227 10228 // If an override option has been passed in for interleaved accesses, use it. 10229 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0) 10230 UseInterleaved = EnableInterleavedMemAccesses; 10231 10232 // Analyze interleaved memory accesses. 10233 if (UseInterleaved) { 10234 IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI)); 10235 } 10236 10237 // Use the cost model. 10238 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, 10239 F, &Hints, IAI); 10240 CM.collectValuesToIgnore(); 10241 CM.collectElementTypesForWidening(); 10242 10243 // Use the planner for vectorization. 10244 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints, 10245 Requirements, ORE); 10246 10247 // Get user vectorization factor and interleave count. 10248 ElementCount UserVF = Hints.getWidth(); 10249 unsigned UserIC = Hints.getInterleave(); 10250 10251 // Plan how to best vectorize, return the best VF and its cost. 10252 Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC); 10253 10254 VectorizationFactor VF = VectorizationFactor::Disabled(); 10255 unsigned IC = 1; 10256 10257 if (MaybeVF) { 10258 VF = *MaybeVF; 10259 // Select the interleave count. 
    IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue());
  }

  // Identify the diagnostic messages that should be produced.
  std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
  bool VectorizeLoop = true, InterleaveLoop = true;
  if (VF.Width.isScalar()) {
    LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
    VecDiagMsg = std::make_pair(
        "VectorizationNotBeneficial",
        "the cost-model indicates that vectorization is not beneficial");
    VectorizeLoop = false;
  }

  if (!MaybeVF && UserIC > 1) {
    // Tell the user interleaving was avoided up-front, despite being explicitly
    // requested.
    LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
                         "interleaving should be avoided up front\n");
    IntDiagMsg = std::make_pair(
        "InterleavingAvoided",
        "Ignoring UserIC, because interleaving was avoided up front");
    InterleaveLoop = false;
  } else if (IC == 1 && UserIC <= 1) {
    // Tell the user interleaving is not beneficial.
    LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
    IntDiagMsg = std::make_pair(
        "InterleavingNotBeneficial",
        "the cost-model indicates that interleaving is not beneficial");
    InterleaveLoop = false;
    if (UserIC == 1) {
      IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
      IntDiagMsg.second +=
          " and is explicitly disabled or interleave count is set to 1";
    }
  } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it is explicitly disabled.
    LLVM_DEBUG(
        dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
    IntDiagMsg = std::make_pair(
        "InterleavingBeneficialButDisabled",
        "the cost-model indicates that interleaving is beneficial "
        "but is explicitly disabled or interleave count is set to 1");
    InterleaveLoop = false;
  }

  // Override IC if user provided an interleave count.
  IC = UserIC > 0 ? UserIC : IC;

  // Emit diagnostic messages, if any.
  const char *VAPassName = Hints.vectorizeAnalysisPassName();
  if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
10313 ORE->emit([&]() { 10314 return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first, 10315 L->getStartLoc(), L->getHeader()) 10316 << VecDiagMsg.second; 10317 }); 10318 ORE->emit([&]() { 10319 return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first, 10320 L->getStartLoc(), L->getHeader()) 10321 << IntDiagMsg.second; 10322 }); 10323 return false; 10324 } else if (!VectorizeLoop && InterleaveLoop) { 10325 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 10326 ORE->emit([&]() { 10327 return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first, 10328 L->getStartLoc(), L->getHeader()) 10329 << VecDiagMsg.second; 10330 }); 10331 } else if (VectorizeLoop && !InterleaveLoop) { 10332 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width 10333 << ") in " << DebugLocStr << '\n'); 10334 ORE->emit([&]() { 10335 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first, 10336 L->getStartLoc(), L->getHeader()) 10337 << IntDiagMsg.second; 10338 }); 10339 } else if (VectorizeLoop && InterleaveLoop) { 10340 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width 10341 << ") in " << DebugLocStr << '\n'); 10342 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 10343 } 10344 10345 bool DisableRuntimeUnroll = false; 10346 MDNode *OrigLoopID = L->getLoopID(); 10347 { 10348 // Optimistically generate runtime checks. Drop them if they turn out to not 10349 // be profitable. Limit the scope of Checks, so the cleanup happens 10350 // immediately after vector codegeneration is done. 10351 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, 10352 F->getParent()->getDataLayout()); 10353 if (!VF.Width.isScalar() || IC > 1) 10354 Checks.Create(L, *LVL.getLAI(), PSE.getUnionPredicate()); 10355 LVP.setBestPlan(VF.Width, IC); 10356 10357 using namespace ore; 10358 if (!VectorizeLoop) { 10359 assert(IC > 1 && "interleave count should not be 1 or 0"); 10360 // If we decided that it is not legal to vectorize the loop, then 10361 // interleave it. 10362 InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, 10363 &CM, BFI, PSI, Checks); 10364 LVP.executePlan(Unroller, DT); 10365 10366 ORE->emit([&]() { 10367 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(), 10368 L->getHeader()) 10369 << "interleaved loop (interleaved count: " 10370 << NV("InterleaveCount", IC) << ")"; 10371 }); 10372 } else { 10373 // If we decided that it is *legal* to vectorize the loop, then do it. 10374 10375 // Consider vectorizing the epilogue too if it's profitable. 10376 VectorizationFactor EpilogueVF = 10377 CM.selectEpilogueVectorizationFactor(VF.Width, LVP); 10378 if (EpilogueVF.Width.isVector()) { 10379 10380 // The first pass vectorizes the main loop and creates a scalar epilogue 10381 // to be vectorized by executing the plan (potentially with a different 10382 // factor) again shortly afterwards. 10383 EpilogueLoopVectorizationInfo EPI(VF.Width.getKnownMinValue(), IC, 10384 EpilogueVF.Width.getKnownMinValue(), 10385 1); 10386 EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, 10387 EPI, &LVL, &CM, BFI, PSI, Checks); 10388 10389 LVP.setBestPlan(EPI.MainLoopVF, EPI.MainLoopUF); 10390 LVP.executePlan(MainILV, DT); 10391 ++LoopsVectorized; 10392 10393 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */); 10394 formLCSSARecursively(*L, *DT, LI, SE); 10395 10396 // Second pass vectorizes the epilogue and adjusts the control flow 10397 // edges from the first pass. 
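        // Rough shape of the control flow after both passes (block names are
        // only illustrative of what the transform usually emits):
        //
        //   iter.check
        //     -> [main vector loop, width EPI.MainLoopVF]
        //     -> vec.epilog.iter.check
        //          -> [epilogue vector loop, width EPI.EpilogueVF]
        //          -> [scalar remainder loop]
        //
        // The epilogue vector loop picks up iterations left over by the main
        // vector loop before the scalar remainder finishes the tail.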
10398 LVP.setBestPlan(EPI.EpilogueVF, EPI.EpilogueUF); 10399 EPI.MainLoopVF = EPI.EpilogueVF; 10400 EPI.MainLoopUF = EPI.EpilogueUF; 10401 EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC, 10402 ORE, EPI, &LVL, &CM, BFI, PSI, 10403 Checks); 10404 LVP.executePlan(EpilogILV, DT); 10405 ++LoopsEpilogueVectorized; 10406 10407 if (!MainILV.areSafetyChecksAdded()) 10408 DisableRuntimeUnroll = true; 10409 } else { 10410 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC, 10411 &LVL, &CM, BFI, PSI, Checks); 10412 LVP.executePlan(LB, DT); 10413 ++LoopsVectorized; 10414 10415 // Add metadata to disable runtime unrolling a scalar loop when there 10416 // are no runtime checks about strides and memory. A scalar loop that is 10417 // rarely used is not worth unrolling. 10418 if (!LB.areSafetyChecksAdded()) 10419 DisableRuntimeUnroll = true; 10420 } 10421 // Report the vectorization decision. 10422 ORE->emit([&]() { 10423 return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(), 10424 L->getHeader()) 10425 << "vectorized loop (vectorization width: " 10426 << NV("VectorizationFactor", VF.Width) 10427 << ", interleaved count: " << NV("InterleaveCount", IC) << ")"; 10428 }); 10429 } 10430 10431 if (ORE->allowExtraAnalysis(LV_NAME)) 10432 checkMixedPrecision(L, ORE); 10433 } 10434 10435 Optional<MDNode *> RemainderLoopID = 10436 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 10437 LLVMLoopVectorizeFollowupEpilogue}); 10438 if (RemainderLoopID.hasValue()) { 10439 L->setLoopID(RemainderLoopID.getValue()); 10440 } else { 10441 if (DisableRuntimeUnroll) 10442 AddRuntimeUnrollDisableMetaData(L); 10443 10444 // Mark the loop as already vectorized to avoid vectorizing again. 10445 Hints.setAlreadyVectorized(); 10446 } 10447 10448 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 10449 return true; 10450 } 10451 10452 LoopVectorizeResult LoopVectorizePass::runImpl( 10453 Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_, 10454 DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_, 10455 DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_, 10456 std::function<const LoopAccessInfo &(Loop &)> &GetLAA_, 10457 OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) { 10458 SE = &SE_; 10459 LI = &LI_; 10460 TTI = &TTI_; 10461 DT = &DT_; 10462 BFI = &BFI_; 10463 TLI = TLI_; 10464 AA = &AA_; 10465 AC = &AC_; 10466 GetLAA = &GetLAA_; 10467 DB = &DB_; 10468 ORE = &ORE_; 10469 PSI = PSI_; 10470 10471 // Don't attempt if 10472 // 1. the target claims to have no vector registers, and 10473 // 2. interleaving won't help ILP. 10474 // 10475 // The second condition is necessary because, even if the target has no 10476 // vector registers, loop vectorization may still enable scalar 10477 // interleaving. 10478 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) && 10479 TTI->getMaxInterleaveFactor(1) < 2) 10480 return LoopVectorizeResult(false, false); 10481 10482 bool Changed = false, CFGChanged = false; 10483 10484 // The vectorizer requires loops to be in simplified form. 10485 // Since simplification may add new inner loops, it has to run before the 10486 // legality and profitability checks. This means running the loop vectorizer 10487 // will simplify all loops, regardless of whether anything end up being 10488 // vectorized. 
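  // Simplified (loop-simplify) form gives every loop a preheader, a single
  // backedge and dedicated exit blocks, which the legality checks below
  // assume.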
10489 for (auto &L : *LI) 10490 Changed |= CFGChanged |= 10491 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */); 10492 10493 // Build up a worklist of inner-loops to vectorize. This is necessary as 10494 // the act of vectorizing or partially unrolling a loop creates new loops 10495 // and can invalidate iterators across the loops. 10496 SmallVector<Loop *, 8> Worklist; 10497 10498 for (Loop *L : *LI) 10499 collectSupportedLoops(*L, LI, ORE, Worklist); 10500 10501 LoopsAnalyzed += Worklist.size(); 10502 10503 // Now walk the identified inner loops. 10504 while (!Worklist.empty()) { 10505 Loop *L = Worklist.pop_back_val(); 10506 10507 // For the inner loops we actually process, form LCSSA to simplify the 10508 // transform. 10509 Changed |= formLCSSARecursively(*L, *DT, LI, SE); 10510 10511 Changed |= CFGChanged |= processLoop(L); 10512 } 10513 10514 // Process each loop nest in the function. 10515 return LoopVectorizeResult(Changed, CFGChanged); 10516 } 10517 10518 PreservedAnalyses LoopVectorizePass::run(Function &F, 10519 FunctionAnalysisManager &AM) { 10520 auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F); 10521 auto &LI = AM.getResult<LoopAnalysis>(F); 10522 auto &TTI = AM.getResult<TargetIRAnalysis>(F); 10523 auto &DT = AM.getResult<DominatorTreeAnalysis>(F); 10524 auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F); 10525 auto &TLI = AM.getResult<TargetLibraryAnalysis>(F); 10526 auto &AA = AM.getResult<AAManager>(F); 10527 auto &AC = AM.getResult<AssumptionAnalysis>(F); 10528 auto &DB = AM.getResult<DemandedBitsAnalysis>(F); 10529 auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F); 10530 MemorySSA *MSSA = EnableMSSALoopDependency 10531 ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA() 10532 : nullptr; 10533 10534 auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager(); 10535 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 10536 [&](Loop &L) -> const LoopAccessInfo & { 10537 LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, 10538 TLI, TTI, nullptr, MSSA}; 10539 return LAM.getResult<LoopAccessAnalysis>(L, AR); 10540 }; 10541 auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F); 10542 ProfileSummaryInfo *PSI = 10543 MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent()); 10544 LoopVectorizeResult Result = 10545 runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI); 10546 if (!Result.MadeAnyChange) 10547 return PreservedAnalyses::all(); 10548 PreservedAnalyses PA; 10549 10550 // We currently do not preserve loopinfo/dominator analyses with outer loop 10551 // vectorization. Until this is addressed, mark these analyses as preserved 10552 // only for non-VPlan-native path. 10553 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 10554 if (!EnableVPlanNativePath) { 10555 PA.preserve<LoopAnalysis>(); 10556 PA.preserve<DominatorTreeAnalysis>(); 10557 } 10558 if (!Result.MadeCFGChange) 10559 PA.preserveSet<CFGAnalyses>(); 10560 return PA; 10561 } 10562
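// A minimal sketch (not part of the pass itself; the exact setup may differ)
// of how this pass is typically scheduled from a standalone tool using the
// new pass manager:
//
//   llvm::PassBuilder PB;
//   llvm::LoopAnalysisManager LAM;
//   llvm::FunctionAnalysisManager FAM;
//   llvm::CGSCCAnalysisManager CGAM;
//   llvm::ModuleAnalysisManager MAM;
//   PB.registerModuleAnalyses(MAM);
//   PB.registerCGSCCAnalyses(CGAM);
//   PB.registerFunctionAnalyses(FAM);
//   PB.registerLoopAnalyses(LAM);
//   PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);
//   llvm::FunctionPassManager FPM;
//   FPM.addPass(llvm::LoopVectorizePass());
//   FPM.run(F, FAM); // F is some llvm::Function to optimize.
//
// From the command line the rough equivalent is
// "opt -passes=loop-vectorize input.ll".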