//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one. An illustrative sketch of this
// transformation appears at the end of this header comment.
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
//  Data for SIMD
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
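//
// As a purely illustrative sketch (the loop and names below are hypothetical,
// not taken from the references above), a scalar loop such as
//   for (i = 0; i < n; ++i)
//     A[i] = B[i] + C[i];
// is conceptually rewritten, assuming a vectorization factor (VF) of 4, into a
// wide loop that processes four elements per iteration, followed by a scalar
// epilogue for the remaining iterations:
//   for (i = 0; i + 4 <= n; i += 4)
//     A[i:i+3] = B[i:i+3] + C[i:i+3]; // one SIMD add per 4 elements
//   for (; i < n; ++i)                // scalar remainder loop
//     A[i] = B[i] + C[i];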
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "LoopVectorizationPlanner.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanHCFGBuilder.h"
#include "VPlanPredicator.h"
#include "VPlanTransforms.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/InstructionCost.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
"llvm/Transforms/Vectorize/LoopVectorizationLegality.h" 144 #include <algorithm> 145 #include <cassert> 146 #include <cstdint> 147 #include <cstdlib> 148 #include <functional> 149 #include <iterator> 150 #include <limits> 151 #include <memory> 152 #include <string> 153 #include <tuple> 154 #include <utility> 155 156 using namespace llvm; 157 158 #define LV_NAME "loop-vectorize" 159 #define DEBUG_TYPE LV_NAME 160 161 #ifndef NDEBUG 162 const char VerboseDebug[] = DEBUG_TYPE "-verbose"; 163 #endif 164 165 /// @{ 166 /// Metadata attribute names 167 const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all"; 168 const char LLVMLoopVectorizeFollowupVectorized[] = 169 "llvm.loop.vectorize.followup_vectorized"; 170 const char LLVMLoopVectorizeFollowupEpilogue[] = 171 "llvm.loop.vectorize.followup_epilogue"; 172 /// @} 173 174 STATISTIC(LoopsVectorized, "Number of loops vectorized"); 175 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization"); 176 STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized"); 177 178 static cl::opt<bool> EnableEpilogueVectorization( 179 "enable-epilogue-vectorization", cl::init(true), cl::Hidden, 180 cl::desc("Enable vectorization of epilogue loops.")); 181 182 static cl::opt<unsigned> EpilogueVectorizationForceVF( 183 "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden, 184 cl::desc("When epilogue vectorization is enabled, and a value greater than " 185 "1 is specified, forces the given VF for all applicable epilogue " 186 "loops.")); 187 188 static cl::opt<unsigned> EpilogueVectorizationMinVF( 189 "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden, 190 cl::desc("Only loops with vectorization factor equal to or larger than " 191 "the specified value are considered for epilogue vectorization.")); 192 193 /// Loops with a known constant trip count below this number are vectorized only 194 /// if no scalar iteration overheads are incurred. 195 static cl::opt<unsigned> TinyTripCountVectorThreshold( 196 "vectorizer-min-trip-count", cl::init(16), cl::Hidden, 197 cl::desc("Loops with a constant trip count that is smaller than this " 198 "value are vectorized only if no scalar iteration overheads " 199 "are incurred.")); 200 201 static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold( 202 "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden, 203 cl::desc("The maximum allowed number of runtime memory checks with a " 204 "vectorize(enable) pragma.")); 205 206 // Option prefer-predicate-over-epilogue indicates that an epilogue is undesired, 207 // that predication is preferred, and this lists all options. I.e., the 208 // vectorizer will try to fold the tail-loop (epilogue) into the vector body 209 // and predicate the instructions accordingly. 
// If tail-folding fails, there are
// different fallback strategies depending on these values:
namespace PreferPredicateTy {
  enum Option {
    ScalarEpilogue = 0,
    PredicateElseScalarEpilogue,
    PredicateOrDontVectorize
  };
} // namespace PreferPredicateTy

static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
    "prefer-predicate-over-epilogue",
    cl::init(PreferPredicateTy::ScalarEpilogue),
    cl::Hidden,
    cl::desc("Tail-folding and predication preferences over creating a scalar "
             "epilogue loop."),
    cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
                          "scalar-epilogue",
                          "Don't tail-predicate loops, create scalar epilogue"),
               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
                          "predicate-else-scalar-epilogue",
                          "prefer tail-folding, create scalar epilogue if tail "
                          "folding fails."),
               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
                          "predicate-dont-vectorize",
                          "prefer tail-folding, don't attempt vectorization if "
                          "tail-folding fails.")));

static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in the loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));

static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
             "below this number"));

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
Mostly " 279 "useful for getting consistent testing.")); 280 281 static cl::opt<bool> ForceTargetSupportsScalableVectors( 282 "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden, 283 cl::desc( 284 "Pretend that scalable vectors are supported, even if the target does " 285 "not support them. This flag should only be used for testing.")); 286 287 static cl::opt<unsigned> SmallLoopCost( 288 "small-loop-cost", cl::init(20), cl::Hidden, 289 cl::desc( 290 "The cost of a loop that is considered 'small' by the interleaver.")); 291 292 static cl::opt<bool> LoopVectorizeWithBlockFrequency( 293 "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden, 294 cl::desc("Enable the use of the block frequency analysis to access PGO " 295 "heuristics minimizing code growth in cold regions and being more " 296 "aggressive in hot regions.")); 297 298 // Runtime interleave loops for load/store throughput. 299 static cl::opt<bool> EnableLoadStoreRuntimeInterleave( 300 "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden, 301 cl::desc( 302 "Enable runtime interleaving until load/store ports are saturated")); 303 304 /// Interleave small loops with scalar reductions. 305 static cl::opt<bool> InterleaveSmallLoopScalarReduction( 306 "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden, 307 cl::desc("Enable interleaving for loops with small iteration counts that " 308 "contain scalar reductions to expose ILP.")); 309 310 /// The number of stores in a loop that are allowed to need predication. 311 static cl::opt<unsigned> NumberOfStoresToPredicate( 312 "vectorize-num-stores-pred", cl::init(1), cl::Hidden, 313 cl::desc("Max number of stores to be predicated behind an if.")); 314 315 static cl::opt<bool> EnableIndVarRegisterHeur( 316 "enable-ind-var-reg-heur", cl::init(true), cl::Hidden, 317 cl::desc("Count the induction variable only once when interleaving")); 318 319 static cl::opt<bool> EnableCondStoresVectorization( 320 "enable-cond-stores-vec", cl::init(true), cl::Hidden, 321 cl::desc("Enable if predication of stores during vectorization.")); 322 323 static cl::opt<unsigned> MaxNestedScalarReductionIC( 324 "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden, 325 cl::desc("The maximum interleave count to use when interleaving a scalar " 326 "reduction in a nested loop.")); 327 328 static cl::opt<bool> 329 PreferInLoopReductions("prefer-inloop-reductions", cl::init(false), 330 cl::Hidden, 331 cl::desc("Prefer in-loop vector reductions, " 332 "overriding the targets preference.")); 333 334 cl::opt<bool> ForceOrderedReductions( 335 "force-ordered-reductions", cl::init(false), cl::Hidden, 336 cl::desc("Enable the vectorisation of loops with in-order (strict) " 337 "FP reductions")); 338 339 static cl::opt<bool> PreferPredicatedReductionSelect( 340 "prefer-predicated-reduction-select", cl::init(false), cl::Hidden, 341 cl::desc( 342 "Prefer predicating a reduction operation over an after loop select.")); 343 344 cl::opt<bool> EnableVPlanNativePath( 345 "enable-vplan-native-path", cl::init(false), cl::Hidden, 346 cl::desc("Enable VPlan-native vectorization path with " 347 "support for outer loop vectorization.")); 348 349 // FIXME: Remove this switch once we have divergence analysis. Currently we 350 // assume divergent non-backedge branches when this switch is true. 
cl::opt<bool> EnableVPlanPredication(
    "enable-vplan-predication", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path predicator with "
             "support for outer loop vectorization."));

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

cl::opt<bool> PrintVPlansInDotFormat(
    "vplan-print-in-dot-format", cl::init(false), cl::Hidden,
    cl::desc("Use dot format instead of plain text when dumping VPlans"));

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type.
static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
  // Determine if an array of N elements of type Ty is "bitcast compatible"
  // with a <N x Ty> vector.
  // This is only true if there is no padding between the array elements.
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}

/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

/// Returns "best known" trip count for the specified loop \p L as defined by
/// the following procedure:
///   1) Returns exact trip count if it is known.
///   2) Returns expected trip count according to profile data if any.
///   3) Returns upper bound estimate if it is known.
///   4) Returns None if all of the above failed.
static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
  // Check if exact trip count is known.
  if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
    return ExpectedTC;

  // Check if there is an expected trip count available from profile data.
  if (LoopVectorizeWithBlockFrequency)
    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
      return EstimatedTC;

  // Check if upper bound estimate is known.
  if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
    return ExpectedTC;

  return None;
}

// Forward declare GeneratedRTChecks.
class GeneratedRTChecks;

namespace llvm {

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found to a given vectorization factor.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                      ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
        PSI(PSI), RTChecks(RTChecks) {
    // Query this against the original loop and save it here because the profile
    // of the original loop header may change as the transformation happens.
    OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
        OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
  }

  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop that will contain vectorized instructions later
  /// on, while the old loop will be used as the scalar remainder. Control flow
  /// is generated around the vectorized (and scalar epilogue) loops consisting
  /// of various checks and bypasses. Return the pre-header block of the new
  /// loop.
  /// In the case of epilogue vectorization, this function is overridden to
  /// handle the more complex control flow around the loops.
  virtual BasicBlock *createVectorizedLoopSkeleton();

  /// Widen a single instruction within the innermost loop.
  void widenInstruction(Instruction &I, VPValue *Def, VPUser &Operands,
                        VPTransformState &State);

  /// Widen a single call instruction within the innermost loop.
  void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
                            VPTransformState &State);

  /// Widen a single select instruction within the innermost loop.
  void widenSelectInstruction(SelectInst &I, VPValue *VPDef, VPUser &Operands,
                              bool InvariantCond, VPTransformState &State);

  /// Fix the vectorized code, taking care of header phis, live-outs, and more.
  void fixVectorizedLoop(VPTransformState &State);

  // Return true if any runtime check is added.
  bool areSafetyChecksAdded() { return AddedSafetyChecks; }

  /// A type for vectorized values in the new loop. Each value from the
  /// original loop, when vectorized, is represented by UF vector values in the
  /// new unrolled loop, where UF is the unroll factor.
  using VectorParts = SmallVector<Value *, 2>;

  /// Vectorize a single GetElementPtrInst based on information gathered and
  /// decisions taken during planning.
  void widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, VPUser &Indices,
                unsigned UF, ElementCount VF, bool IsPtrLoopInvariant,
                SmallBitVector &IsIndexLoopInvariant, VPTransformState &State);

  /// Vectorize a single first-order recurrence or pointer induction PHINode in
  /// a block. This method handles the induction variable canonicalization. It
  /// supports both VF = 1 for unrolled loops and arbitrary length vectors.
  void widenPHIInstruction(Instruction *PN, VPWidenPHIRecipe *PhiR,
                           VPTransformState &State);

  /// A helper function to scalarize a single Instruction in the innermost loop.
  /// Generates a sequence of scalar instances for each lane between \p MinLane
  /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
  /// inclusive. Uses the VPValue operands from \p Operands instead of \p
  /// Instr's operands.
  void scalarizeInstruction(Instruction *Instr, VPValue *Def, VPUser &Operands,
                            const VPIteration &Instance, bool IfPredicateInstr,
                            VPTransformState &State);

  /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
  /// is provided, the integer induction variable will first be truncated to
  /// the corresponding type.
  void widenIntOrFpInduction(PHINode *IV, Value *Start, TruncInst *Trunc,
                             VPValue *Def, VPValue *CastDef,
                             VPTransformState &State);

  /// Construct the vector value of a scalarized value \p V one lane at a time.
  void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance,
                                 VPTransformState &State);

  /// Try to vectorize interleaved access group \p Group with the base address
  /// given in \p Addr, optionally masking the vector operations if \p
  /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
  /// values in the vectorized loop.
  void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
                                ArrayRef<VPValue *> VPDefs,
                                VPTransformState &State, VPValue *Addr,
                                ArrayRef<VPValue *> StoredValues,
                                VPValue *BlockInMask = nullptr);

  /// Vectorize Load and Store instructions with the base address given in \p
  /// Addr, optionally masking the vector operations if \p BlockInMask is
  /// non-null. Use \p State to translate given VPValues to IR values in the
  /// vectorized loop.
  void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
                                  VPValue *Def, VPValue *Addr,
                                  VPValue *StoredValue, VPValue *BlockInMask);

  /// Set the debug location in the builder \p Ptr using the debug location in
  /// \p V. If \p Ptr is None then it uses the class member's Builder.
  void setDebugLocFromInst(const Value *V,
                           Optional<IRBuilder<> *> CustomBuilder = None);

  /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
  void fixNonInductionPHIs(VPTransformState &State);

  /// Returns true if the reordering of FP operations is not allowed, but we are
  /// able to vectorize with strict in-order reductions for the given RdxDesc.
  bool useOrderedReductions(RecurrenceDescriptor &RdxDesc);

  /// Create a broadcast instruction. This method generates a broadcast
  /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, ...
  /// this is needed because each iteration in the loop corresponds to a SIMD
  /// element.
  virtual Value *getBroadcastInstrs(Value *V);

protected:
  friend class LoopVectorizationPlanner;

  /// A small list of PHINodes.
  using PhiVector = SmallVector<PHINode *, 4>;

  /// A type for scalarized values in the new loop. Each value from the
  /// original loop, when scalarized, is represented by UF x VF scalar values
  /// in the new unrolled loop, where UF is the unroll factor and VF is the
  /// vectorization factor.
  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;

  /// Set up the values of the IVs correctly when exiting the vector loop.
  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                    Value *CountRoundDown, Value *EndValue,
                    BasicBlock *MiddleBlock);

  /// Create a new induction variable inside L.
  PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
                                   Value *Step, Instruction *DL);

  /// Handle all cross-iteration phis in the header.
  void fixCrossIterationPHIs(VPTransformState &State);

  /// Create the exit value of first order recurrences in the middle block and
  /// update their users.
  void fixFirstOrderRecurrence(VPWidenPHIRecipe *PhiR, VPTransformState &State);

  /// Create code for the loop exit value of the reduction.
  void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State);

  /// Clear NSW/NUW flags from reduction instructions if necessary.
  void clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc,
                               VPTransformState &State);

  /// Fix up the LCSSA phi nodes in the unique exit block. This simply
  /// means we need to add the appropriate incoming value from the middle
  /// block as exiting edges from the scalar epilogue loop (if present) are
  /// already in place, and we exit the vector loop exclusively to the middle
  /// block.
  void fixLCSSAPHIs(VPTransformState &State);

  /// Iteratively sink the scalarized operands of a predicated instruction into
  /// the block that was created for it.
  void sinkScalarOperands(Instruction *PredInst);

  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
  /// represented as.
  void truncateToMinimalBitwidths(VPTransformState &State);

  /// This function adds
  /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
  /// to each vector element of Val. The sequence starts at StartIdx.
  /// \p Opcode is relevant for FP induction variable.
  virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                               Instruction::BinaryOps Opcode =
                                   Instruction::BinaryOpsEnd);

  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
  /// variable on which to base the steps, \p Step is the size of the step, and
  /// \p EntryVal is the value from the original loop that maps to the steps.
  /// Note that \p EntryVal doesn't have to be an induction variable - it
  /// can also be a truncate instruction.
  void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
                        const InductionDescriptor &ID, VPValue *Def,
                        VPValue *CastDef, VPTransformState &State);

  /// Create a vector induction phi node based on an existing scalar one. \p
  /// EntryVal is the value from the original loop that maps to the vector phi
  /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
  /// truncate instruction, instead of widening the original IV, we widen a
  /// version of the IV truncated to \p EntryVal's type.
  void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
                                       Value *Step, Value *Start,
                                       Instruction *EntryVal, VPValue *Def,
                                       VPValue *CastDef,
                                       VPTransformState &State);

  /// Returns true if an instruction \p I should be scalarized instead of
  /// vectorized for the chosen vectorization factor.
  bool shouldScalarizeInstruction(Instruction *I) const;

  /// Returns true if we should generate a scalar version of \p IV.
  bool needsScalarInduction(Instruction *IV) const;

  /// If there is a cast involved in the induction variable \p ID, which should
  /// be ignored in the vectorized loop body, this function records the
  /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
  /// cast. We had already proved that the casted Phi is equal to the uncasted
  /// Phi in the vectorized loop (under a runtime guard), and therefore
  /// there is no need to vectorize the cast - the same value can be used in the
  /// vector loop for both the Phi and the cast.
  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
  /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
  ///
  /// \p EntryVal is the value from the original loop that maps to the vector
  /// phi node and is used to distinguish what is the IV currently being
  /// processed - original one (if \p EntryVal is a phi corresponding to the
  /// original IV) or the "newly-created" one based on the proof mentioned above
  /// (see also buildScalarSteps() and createVectorIntOrFpInductionPHI()). In the
  /// latter case \p EntryVal is a TruncInst and we must not record anything for
  /// that IV, but it's error-prone to expect callers of this routine to care
  /// about that, hence this explicit parameter.
  void recordVectorLoopValueForInductionCast(
      const InductionDescriptor &ID, const Instruction *EntryVal,
      Value *VectorLoopValue, VPValue *CastDef, VPTransformState &State,
      unsigned Part, unsigned Lane = UINT_MAX);

  /// Generate a shuffle sequence that will reverse the vector Vec.
  virtual Value *reverseVector(Value *Vec);

  /// Returns (and creates if needed) the original loop trip count.
  Value *getOrCreateTripCount(Loop *NewLoop);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(Loop *NewLoop);

  /// Returns a bitcasted value to the requested vector type.
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                const DataLayout &DL);

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct.
  /// Returns the block containing the checks or
  /// nullptr if no checks have been added.
  BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  /// Returns the block containing the checks or nullptr if no checks have been
  /// added.
  BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);

  /// Compute the transformed value of Index at offset StartValue using step
  /// StepValue.
  /// For integer induction, returns StartValue + Index * StepValue.
  /// For pointer induction, returns StartValue[Index * StepValue].
  /// FIXME: The newly created binary instructions should contain nsw/nuw
  /// flags, which can be found from the original scalar operations.
  Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
                              const DataLayout &DL,
                              const InductionDescriptor &ID) const;

  /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
  /// vector loop preheader, middle block and scalar preheader. Also
  /// allocate a loop object for the new vector loop and return it.
  Loop *createVectorLoopSkeleton(StringRef Prefix);

  /// Create new phi nodes for the induction variables to resume iteration count
  /// in the scalar epilogue, from where the vectorized loop left off (given by
  /// \p VectorTripCount).
  /// In cases where the loop skeleton is more complicated (e.g. epilogue
  /// vectorization) and the resume values can come from an additional bypass
  /// block, the \p AdditionalBypass pair provides information about the bypass
  /// block and the end value on the edge from bypass to this loop.
  void createInductionResumeValues(
      Loop *L, Value *VectorTripCount,
      std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});

  /// Complete the loop skeleton by adding debug MDs, creating appropriate
  /// conditional branches in the middle block, preparing the builder and
  /// running the verifier. Take in the vector loop \p L as argument, and return
  /// the preheader of the completed vector loop.
  BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);

  /// Add additional metadata to \p To that was not present on \p Orig.
  ///
  /// Currently this is used to add the noalias annotations based on the
  /// inserted memchecks. Use this for instructions that are *cloned* into the
  /// vector loop.
  void addNewMetadata(Instruction *To, const Instruction *Orig);

  /// Add metadata from one instruction to another.
  ///
  /// This includes both the original MDs from \p From and additional ones (\see
  /// addNewMetadata). Use this for *newly created* instructions in the vector
  /// loop.
  void addMetadata(Instruction *To, Instruction *From);

  /// Similar to the previous function but it adds the metadata to a
  /// vector of instructions.
  void addMetadata(ArrayRef<Value *> To, Instruction *From);

  /// Allow subclasses to override and print debug traces before/after vplan
  /// execution, when trace information is requested.
  virtual void printDebugTracesAtStart(){};
  virtual void printDebugTracesAtEnd(){};

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Alias Analysis.
  AAResults *AA;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// LoopVersioning. It's only set up (non-null) if memchecks were
  /// used.
  ///
  /// This is currently only used to add no-alias metadata based on the
  /// memchecks. The actual versioning is performed manually.
  std::unique_ptr<LoopVersioning> LVer;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  ElementCount VF;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use.
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;

  /// The unique ExitBlock of the scalar loop if one exists. Note that
  /// there can be multiple exiting edges reaching this block.
  BasicBlock *LoopExitBlock;

  /// The vector loop body.
  BasicBlock *LoopVectorBody;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// The new Induction variable which was added to the new block.
  PHINode *Induction = nullptr;

  /// The induction variable of the old basic block.
  PHINode *OldInduction = nullptr;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF)).
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
  // fixed up at the end of vector code generation.
  SmallVector<PHINode *, 8> OrigPHIsToFix;

  /// BFI and PSI are used to check for profile guided size optimizations.
  BlockFrequencyInfo *BFI;
  ProfileSummaryInfo *PSI;

  // Whether this loop should be optimized for size based on profile guided size
  // optimizations.
  bool OptForSizeBasedOnProfile;

  /// Structure to hold information about generated runtime checks, responsible
  /// for cleaning the checks, if vectorization turns out unprofitable.
  GeneratedRTChecks &RTChecks;
};

class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                    ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            ElementCount::getFixed(1), UnrollFactor, LVL, CM,
                            BFI, PSI, Check) {}

private:
  Value *getBroadcastInstrs(Value *V) override;
  Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                       Instruction::BinaryOps Opcode =
                           Instruction::BinaryOpsEnd) override;
  Value *reverseVector(Value *Vec) override;
};

/// Encapsulate information regarding vectorization of a loop and its epilogue.
/// This information is meant to be updated and used across two stages of
/// epilogue vectorization.
struct EpilogueLoopVectorizationInfo {
  ElementCount MainLoopVF = ElementCount::getFixed(0);
  unsigned MainLoopUF = 0;
  ElementCount EpilogueVF = ElementCount::getFixed(0);
  unsigned EpilogueUF = 0;
  BasicBlock *MainLoopIterationCountCheck = nullptr;
  BasicBlock *EpilogueIterationCountCheck = nullptr;
  BasicBlock *SCEVSafetyCheck = nullptr;
  BasicBlock *MemSafetyCheck = nullptr;
  Value *TripCount = nullptr;
  Value *VectorTripCount = nullptr;

  EpilogueLoopVectorizationInfo(unsigned MVF, unsigned MUF, unsigned EVF,
                                unsigned EUF)
      : MainLoopVF(ElementCount::getFixed(MVF)), MainLoopUF(MUF),
        EpilogueVF(ElementCount::getFixed(EVF)), EpilogueUF(EUF) {
    assert(EUF == 1 &&
           "A high UF for the epilogue loop is likely not beneficial.");
  }
};

/// An extension of the inner loop vectorizer that creates a skeleton for a
/// vectorized loop that has its epilogue (residual) also vectorized.
/// The idea is to run the vplan on a given loop twice, first to set up the
/// skeleton and vectorize the main loop, and second to complete the skeleton
/// from the first step and vectorize the epilogue. This is achieved by
/// deriving two concrete strategy classes from this base class and invoking
/// them in succession from the loop vectorizer planner.
class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
public:
  InnerLoopAndEpilogueVectorizer(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI,
                            Checks),
        EPI(EPI) {}

  // Override this function to handle the more complex control flow around the
  // three loops.
  BasicBlock *createVectorizedLoopSkeleton() final override {
    return createEpilogueVectorizedLoopSkeleton();
  }

  /// The interface for creating a vectorized skeleton using one of two
  /// different strategies, each corresponding to one execution of the vplan
  /// as described above.
  virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0;

  /// Holds and updates state information required to vectorize the main loop
  /// and its epilogue in two separate passes. This setup helps us avoid
  /// regenerating and recomputing runtime safety checks. It also helps us to
  /// shorten the iteration-count-check path length for the cases where the
  /// iteration count of the loop is so small that the main vector loop is
  /// completely skipped.
  EpilogueLoopVectorizationInfo &EPI;
};

/// A specialized derived class of inner loop vectorizer that performs
/// vectorization of *main* loops in the process of vectorizing loops and their
/// epilogues.
class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerMainLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Check)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Check) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *main loop* strategy (i.e., the first pass of vplan execution).
  BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check once for the main loop (when \p
  /// ForEpilogue is false) and once for the epilogue loop (when \p
  /// ForEpilogue is true).
  BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass,
                                             bool ForEpilogue);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};

// A specialized derived class of inner loop vectorizer that performs
// vectorization of *epilogue* loops in the process of vectorizing loops and
// their epilogues.
class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerEpilogueLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Checks) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *epilogue loop* strategy (i.e., the second pass of vplan execution).
  BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check after the main vector loop has
  /// finished to see if there are any iterations left to execute by either
  /// the vector epilogue or the scalar epilogue.
  BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L,
                                                      BasicBlock *Bypass,
                                                      BasicBlock *Insert);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};
} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its
/// operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return I;

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I;

  for (Use &Op : I->operands()) {
    if (Instruction *OpInst = dyn_cast<Instruction>(Op))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }

  return I;
}

void InnerLoopVectorizer::setDebugLocFromInst(
    const Value *V, Optional<IRBuilder<> *> CustomBuilder) {
  IRBuilder<> *B = (CustomBuilder == None) ? &Builder : *CustomBuilder;
  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(V)) {
    const DILocation *DIL = Inst->getDebugLoc();

    // When an FSDiscriminator is enabled, we don't need to add the multiply
    // factors to the discriminators.
    if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
        !isa<DbgInfoIntrinsic>(Inst) && !EnableFSDiscriminator) {
      // FIXME: For scalable vectors, assume vscale=1.
      auto NewDIL =
          DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
      if (NewDIL)
        B->SetCurrentDebugLocation(NewDIL.getValue());
      else
        LLVM_DEBUG(dbgs()
                   << "Failed to create new discriminator: "
                   << DIL->getFilename() << " Line: " << DIL->getLine());
    } else
      B->SetCurrentDebugLocation(DIL);
  } else
    B->SetCurrentDebugLocation(DebugLoc());
}

/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
/// is passed, the message relates to that particular instruction.
#ifndef NDEBUG
static void debugVectorizationMessage(const StringRef Prefix,
                                      const StringRef DebugMsg,
                                      Instruction *I) {
  dbgs() << "LV: " << Prefix << DebugMsg;
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}
#endif

/// Create an analysis remark that explains why vectorization failed
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
/// RemarkName is the identifier for the remark. If \p I is passed it is an
/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
/// the location of the remark. \return the remark object that can be
/// streamed to.
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
                                                   StringRef RemarkName,
                                                   Loop *TheLoop,
                                                   Instruction *I) {
  Value *CodeRegion = TheLoop->getHeader();
  DebugLoc DL = TheLoop->getStartLoc();

  if (I) {
    CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, fall back to
    // using the loop's.
    if (I->getDebugLoc())
      DL = I->getDebugLoc();
  }

  return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
}

/// Return a value for Step multiplied by VF.
static Value *createStepForVF(IRBuilder<> &B, Constant *Step, ElementCount VF) {
  assert(isa<ConstantInt>(Step) && "Expected an integer step");
  Constant *StepVal = ConstantInt::get(
      Step->getType(),
      cast<ConstantInt>(Step)->getSExtValue() * VF.getKnownMinValue());
  return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
}

namespace llvm {

/// Return the runtime value for VF.
Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF) {
  Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue());
  return VF.isScalable() ? B.CreateVScale(EC) : EC;
}

void reportVectorizationFailure(const StringRef DebugMsg,
                                const StringRef OREMsg, const StringRef ORETag,
                                OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                                Instruction *I) {
  LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(
      createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
      << "loop not vectorized: " << OREMsg);
}

void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
                             OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                             Instruction *I) {
  LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(
      createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
      << Msg);
}

} // end namespace llvm

#ifndef NDEBUG
/// \return string containing a file name and a line # for the given loop.
static std::string getDebugLocString(const Loop *L) {
  std::string Result;
  if (L) {
    raw_string_ostream OS(Result);
    if (const DebugLoc LoopDbgLoc = L->getStartLoc())
      LoopDbgLoc.print(OS);
    else
      // Just print the module name.
      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
    OS.flush();
  }
  return Result;
}
#endif

void InnerLoopVectorizer::addNewMetadata(Instruction *To,
                                         const Instruction *Orig) {
  // If the loop was versioned with memchecks, add the corresponding no-alias
  // metadata.
  if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
    LVer->annotateInstWithNoAlias(To, Orig);
}

void InnerLoopVectorizer::addMetadata(Instruction *To,
                                      Instruction *From) {
  propagateMetadata(To, From);
  addNewMetadata(To, From);
}

void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
                                      Instruction *From) {
  for (Value *V : To) {
    if (Instruction *I = dyn_cast<Instruction>(V))
      addMetadata(I, From);
  }
}

namespace llvm {

// Loop vectorization cost-model hints how the scalar epilogue loop should be
// lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorization with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate,

  // Directive indicating we must either tail fold or not vectorize
  CM_ScalarEpilogueNotAllowedUsePredicate
};

/// ElementCountComparator creates a total ordering for ElementCount
/// for the purposes of using it in a set structure.
struct ElementCountComparator {
  bool operator()(const ElementCount &LHS, const ElementCount &RHS) const {
    return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) <
           std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue());
  }
};
using ElementCountSet = SmallSet<ElementCount, 16, ElementCountComparator>;

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen for a
/// number of reasons. In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
class LoopVectorizationCostModel {
public:
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {}

  /// \return An upper bound for the vectorization factors (both fixed and
  /// scalable). If the factors are 0, vectorization and interleaving should be
  /// avoided up front.
  FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
  bool runtimeChecksRequired();

  /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every VF in \p CandidateVFs. If UserVF is not ZERO
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
  VectorizationFactor
  selectVectorizationFactor(const ElementCountSet &CandidateVFs);

  VectorizationFactor
  selectEpilogueVectorizationFactor(const ElementCount MaxVF,
                                    const LoopVectorizationPlanner &LVP);

  /// Set up cost-based decisions for user vectorization factor.
  /// \return true if the UserVF is a feasible VF to be chosen.
  bool selectUserVectorizationFactor(ElementCount UserVF) {
    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);
    return expectedCost(UserVF).first.isValid();
  }

  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// \return The desired interleave count.
  /// If interleave count has been specified by metadata it will be returned.
  /// Otherwise, the interleave count is computed and returned.
VF and LoopCost 1277 /// are the selected vectorization factor and the cost of the selected VF. 1278 unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost); 1279 1280 /// Memory access instruction may be vectorized in more than one way. 1281 /// Form of instruction after vectorization depends on cost. 1282 /// This function takes cost-based decisions for Load/Store instructions 1283 /// and collects them in a map. This decisions map is used for building 1284 /// the lists of loop-uniform and loop-scalar instructions. 1285 /// The calculated cost is saved with widening decision in order to 1286 /// avoid redundant calculations. 1287 void setCostBasedWideningDecision(ElementCount VF); 1288 1289 /// A struct that represents some properties of the register usage 1290 /// of a loop. 1291 struct RegisterUsage { 1292 /// Holds the number of loop invariant values that are used in the loop. 1293 /// The key is ClassID of target-provided register class. 1294 SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs; 1295 /// Holds the maximum number of concurrent live intervals in the loop. 1296 /// The key is ClassID of target-provided register class. 1297 SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers; 1298 }; 1299 1300 /// \return Returns information about the register usages of the loop for the 1301 /// given vectorization factors. 1302 SmallVector<RegisterUsage, 8> 1303 calculateRegisterUsage(ArrayRef<ElementCount> VFs); 1304 1305 /// Collect values we want to ignore in the cost model. 1306 void collectValuesToIgnore(); 1307 1308 /// Collect all element types in the loop for which widening is needed. 1309 void collectElementTypesForWidening(); 1310 1311 /// Split reductions into those that happen in the loop, and those that happen 1312 /// outside. In loop reductions are collected into InLoopReductionChains. 1313 void collectInLoopReductions(); 1314 1315 /// Returns true if we should use strict in-order reductions for the given 1316 /// RdxDesc. This is true if the -enable-strict-reductions flag is passed, 1317 /// the IsOrdered flag of RdxDesc is set and we do not allow reordering 1318 /// of FP operations. 1319 bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) { 1320 return ForceOrderedReductions && !Hints->allowReordering() && 1321 RdxDesc.isOrdered(); 1322 } 1323 1324 /// \returns The smallest bitwidth each instruction can be represented with. 1325 /// The vector equivalents of these instructions should be truncated to this 1326 /// type. 1327 const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const { 1328 return MinBWs; 1329 } 1330 1331 /// \returns True if it is more profitable to scalarize instruction \p I for 1332 /// vectorization factor \p VF. 1333 bool isProfitableToScalarize(Instruction *I, ElementCount VF) const { 1334 assert(VF.isVector() && 1335 "Profitable to scalarize relevant only for VF > 1."); 1336 1337 // Cost model is not run in the VPlan-native path - return conservative 1338 // result until this changes. 1339 if (EnableVPlanNativePath) 1340 return false; 1341 1342 auto Scalars = InstsToScalarize.find(VF); 1343 assert(Scalars != InstsToScalarize.end() && 1344 "VF not yet analyzed for scalarization profitability"); 1345 return Scalars->second.find(I) != Scalars->second.end(); 1346 } 1347 1348 /// Returns true if \p I is known to be uniform after vectorization. 
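  /// For example, the pointer operand of a consecutive load is typically
  /// uniform: the widened load needs just one scalar address per vector
  /// iteration, so the address computation is kept as a single scalar value
  /// instead of VF copies (see collectLoopUniforms).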
  bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
    if (VF.isScalar())
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto UniformsPerVF = Uniforms.find(VF);
    assert(UniformsPerVF != Uniforms.end() &&
           "VF not yet analyzed for uniformity");
    return UniformsPerVF->second.count(I);
  }

  /// Returns true if \p I is known to be scalar after vectorization.
  bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
    if (VF.isScalar())
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto ScalarsPerVF = Scalars.find(VF);
    assert(ScalarsPerVF != Scalars.end() &&
           "Scalar values are not calculated for VF");
    return ScalarsPerVF->second.count(I);
  }

  /// \returns True if instruction \p I can be truncated to a smaller bitwidth
  /// for vectorization factor \p VF.
  bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
    return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
           !isProfitableToScalarize(I, VF) &&
           !isScalarAfterVectorization(I, VF);
  }

  /// Decision that was taken during cost calculation for memory instruction.
  enum InstWidening {
    CM_Unknown,
    CM_Widen,         // For consecutive accesses with stride +1.
    CM_Widen_Reverse, // For consecutive accesses with stride -1.
    CM_Interleave,
    CM_GatherScatter,
    CM_Scalarize
  };

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// instruction \p I and vector width \p VF.
  void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
                           InstructionCost Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
  }

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// interleaving group \p Grp and vector width \p VF.
  void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
                           ElementCount VF, InstWidening W,
                           InstructionCost Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    // Broadcast this decision to all instructions inside the group.
    // But the cost will be assigned to one instruction only.
    for (unsigned i = 0; i < Grp->getFactor(); ++i) {
      if (auto *I = Grp->getMember(i)) {
        if (Grp->getInsertPos() == I)
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
        else
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
      }
    }
  }

  /// Return the cost model decision for the given instruction \p I and vector
  /// width \p VF. Return CM_Unknown if this instruction did not pass
  /// through the cost modeling.
  InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
    assert(VF.isVector() && "Expected VF to be a vector VF");
    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
1431 if (EnableVPlanNativePath) 1432 return CM_GatherScatter; 1433 1434 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF); 1435 auto Itr = WideningDecisions.find(InstOnVF); 1436 if (Itr == WideningDecisions.end()) 1437 return CM_Unknown; 1438 return Itr->second.first; 1439 } 1440 1441 /// Return the vectorization cost for the given instruction \p I and vector 1442 /// width \p VF. 1443 InstructionCost getWideningCost(Instruction *I, ElementCount VF) { 1444 assert(VF.isVector() && "Expected VF >=2"); 1445 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF); 1446 assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() && 1447 "The cost is not calculated"); 1448 return WideningDecisions[InstOnVF].second; 1449 } 1450 1451 /// Return True if instruction \p I is an optimizable truncate whose operand 1452 /// is an induction variable. Such a truncate will be removed by adding a new 1453 /// induction variable with the destination type. 1454 bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) { 1455 // If the instruction is not a truncate, return false. 1456 auto *Trunc = dyn_cast<TruncInst>(I); 1457 if (!Trunc) 1458 return false; 1459 1460 // Get the source and destination types of the truncate. 1461 Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF); 1462 Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF); 1463 1464 // If the truncate is free for the given types, return false. Replacing a 1465 // free truncate with an induction variable would add an induction variable 1466 // update instruction to each iteration of the loop. We exclude from this 1467 // check the primary induction variable since it will need an update 1468 // instruction regardless. 1469 Value *Op = Trunc->getOperand(0); 1470 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy)) 1471 return false; 1472 1473 // If the truncated value is not an induction variable, return false. 1474 return Legal->isInductionPhi(Op); 1475 } 1476 1477 /// Collects the instructions to scalarize for each predicated instruction in 1478 /// the loop. 1479 void collectInstsToScalarize(ElementCount VF); 1480 1481 /// Collect Uniform and Scalar values for the given \p VF. 1482 /// The sets depend on CM decision for Load/Store instructions 1483 /// that may be vectorized as interleave, gather-scatter or scalarized. 1484 void collectUniformsAndScalars(ElementCount VF) { 1485 // Do the analysis once. 1486 if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end()) 1487 return; 1488 setCostBasedWideningDecision(VF); 1489 collectLoopUniforms(VF); 1490 collectLoopScalars(VF); 1491 } 1492 1493 /// Returns true if the target machine supports masked store operation 1494 /// for the given \p DataType and kind of access to \p Ptr. 1495 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const { 1496 return Legal->isConsecutivePtr(Ptr) && 1497 TTI.isLegalMaskedStore(DataType, Alignment); 1498 } 1499 1500 /// Returns true if the target machine supports masked load operation 1501 /// for the given \p DataType and kind of access to \p Ptr. 1502 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const { 1503 return Legal->isConsecutivePtr(Ptr) && 1504 TTI.isLegalMaskedLoad(DataType, Alignment); 1505 } 1506 1507 /// Returns true if the target machine can represent \p V as a masked gather 1508 /// or scatter operation. 
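  /// Unlike the masked load/store queries above, a gather/scatter takes a
  /// vector of pointers, so the access does not need to be consecutive.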
  bool isLegalGatherOrScatter(Value *V) {
    bool LI = isa<LoadInst>(V);
    bool SI = isa<StoreInst>(V);
    if (!LI && !SI)
      return false;
    auto *Ty = getLoadStoreType(V);
    Align Align = getLoadStoreAlignment(V);
    return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
           (SI && TTI.isLegalMaskedScatter(Ty, Align));
  }

  /// Returns true if the target machine supports all of the reduction
  /// variables found for the given VF.
  bool canVectorizeReductions(ElementCount VF) const {
    return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
      const RecurrenceDescriptor &RdxDesc = Reduction.second;
      return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
    }));
  }

  /// Returns true if \p I is an instruction that will be scalarized with
  /// predication. Such instructions include conditional stores and
  /// instructions that may divide by zero.
  /// If a non-zero VF has been calculated, we check if I will be scalarized
  /// with predication for that VF.
  bool isScalarWithPredication(Instruction *I) const;

  // Returns true if \p I is an instruction that will be predicated either
  // through scalar predication or masked load/store or masked gather/scatter.
  // Superset of instructions that return true for isScalarWithPredication.
  bool isPredicatedInst(Instruction *I) {
    if (!blockNeedsPredication(I->getParent()))
      return false;
    // Loads and stores that need some form of masked operation are predicated
    // instructions.
    if (isa<LoadInst>(I) || isa<StoreInst>(I))
      return Legal->isMaskRequired(I);
    return isScalarWithPredication(I);
  }

  /// Returns true if \p I is a memory instruction with consecutive memory
  /// access that can be widened.
  bool
  memoryInstructionCanBeWidened(Instruction *I,
                                ElementCount VF = ElementCount::getFixed(1));

  /// Returns true if \p I is a memory instruction in an interleaved-group
  /// of memory accesses that can be vectorized with wide vector loads/stores
  /// and shuffles.
  bool
  interleavedAccessCanBeWidened(Instruction *I,
                                ElementCount VF = ElementCount::getFixed(1));

  /// Check if \p Instr belongs to any interleaved access group.
  bool isAccessInterleaved(Instruction *Instr) {
    return InterleaveInfo.isInterleaved(Instr);
  }

  /// Get the interleaved access group that \p Instr belongs to.
  const InterleaveGroup<Instruction> *
  getInterleavedAccessGroup(Instruction *Instr) {
    return InterleaveInfo.getInterleaveGroup(Instr);
  }

  /// Returns true if we're required to use a scalar epilogue for at least
  /// the final iteration of the original loop.
  bool requiresScalarEpilogue(ElementCount VF) const {
    if (!isScalarEpilogueAllowed())
      return false;
    // If we might exit from anywhere but the latch, must run the exiting
    // iteration in scalar form.
    if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
      return true;
    return VF.isVector() && InterleaveInfo.requiresScalarEpilogue();
  }

  /// Returns true if a scalar epilogue is not allowed due to optsize or a
  /// loop hint annotation.
  bool isScalarEpilogueAllowed() const {
    return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
  }

  /// Returns true if all loop blocks should be masked to fold tail loop.
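  /// When the tail is folded, the vector loop also executes the remainder
  /// iterations, with the out-of-range lanes disabled by a mask, instead of
  /// running a separate scalar epilogue loop.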
  bool foldTailByMasking() const { return FoldTailByMasking; }

  bool blockNeedsPredication(BasicBlock *BB) const {
    return foldTailByMasking() || Legal->blockNeedsPredication(BB);
  }

  /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
  /// nodes to the chain of instructions representing the reductions. Uses a
  /// MapVector to ensure deterministic iteration order.
  using ReductionChainMap =
      SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;

  /// Return the chain of instructions representing an inloop reduction.
  const ReductionChainMap &getInLoopReductionChains() const {
    return InLoopReductionChains;
  }

  /// Returns true if the Phi is part of an inloop reduction.
  bool isInLoopReduction(PHINode *Phi) const {
    return InLoopReductionChains.count(Phi);
  }

  /// Estimate cost of an intrinsic call instruction CI if it were vectorized
  /// with factor VF. Return the cost of the instruction, including
  /// scalarization overhead if it's needed.
  InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;

  /// Estimate cost of a call instruction CI if it were vectorized with factor
  /// VF. Return the cost of the instruction, including scalarization overhead
  /// if it's needed. The flag NeedToScalarize shows if the call needs to be
  /// scalarized, i.e. either a vector version isn't available or it is too
  /// expensive.
  InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF,
                                    bool &NeedToScalarize) const;

  /// Returns true if the per-lane cost of VectorizationFactor A is lower than
  /// that of B.
  bool isMoreProfitable(const VectorizationFactor &A,
                        const VectorizationFactor &B) const;

  /// Invalidates decisions already taken by the cost model.
  void invalidateCostModelingDecisions() {
    WideningDecisions.clear();
    Uniforms.clear();
    Scalars.clear();
  }

private:
  unsigned NumPredStores = 0;

  /// \return An upper bound for the vectorization factors for both
  /// fixed and scalable vectorization, where the minimum-known number of
  /// elements is a power-of-2 larger than zero. If scalable vectorization is
  /// disabled or unsupported, then the scalable part will be equal to
  /// ElementCount::getScalable(0).
  FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount,
                                           ElementCount UserVF);

  /// \return the maximized element count based on the target's vector
  /// registers and the loop trip-count, but limited to a maximum safe VF.
  /// This is a helper function of computeFeasibleMaxVF.
  /// FIXME: MaxSafeVF is currently passed by reference to avoid some obscure
  /// issue that occurred on one of the buildbots which cannot be reproduced
  /// without having access to the proprietary compiler (see comments on
  /// D98509). The issue is currently under investigation and this workaround
  /// will be removed as soon as possible.
  ElementCount getMaximizedVFForTarget(unsigned ConstTripCount,
                                       unsigned SmallestType,
                                       unsigned WidestType,
                                       const ElementCount &MaxSafeVF);

  /// \return the maximum legal scalable VF, based on the safe max number
  /// of elements.
  ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);

  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
  using VectorizationCostTy = std::pair<InstructionCost, bool>;

  /// Returns the expected execution cost. The unit of the cost does
  /// not matter because we use the 'cost' units to compare different
  /// vector widths. The cost that is returned is *not* normalized by
  /// the factor width. If \p Invalid is not nullptr, this function
  /// will add a pair(Instruction*, ElementCount) to \p Invalid for
  /// each instruction that has an Invalid cost for the given VF.
  using InstructionVFPair = std::pair<Instruction *, ElementCount>;
  VectorizationCostTy
  expectedCost(ElementCount VF,
               SmallVectorImpl<InstructionVFPair> *Invalid = nullptr);

  /// Returns the execution time cost of an instruction for a given vector
  /// width. Vector width of one means scalar.
  VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);

  /// The cost-computation logic from getInstructionCost which provides
  /// the vector type as an output parameter.
  InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
                                     Type *&VectorTy);

  /// Return the cost of instructions in an inloop reduction pattern, if I is
  /// part of that pattern.
  Optional<InstructionCost>
  getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
                          TTI::TargetCostKind CostKind);

  /// Calculate vectorization cost of memory instruction \p I.
  InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);

  /// The cost computation for scalarized memory instruction.
  InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);

  /// The cost computation for interleaving group of memory instructions.
  InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);

  /// The cost computation for Gather/Scatter instruction.
  InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);

  /// The cost computation for widening instruction \p I with consecutive
  /// memory access.
  InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);

  /// The cost calculation for Load/Store instruction \p I with uniform pointer:
  /// Load: scalar load + broadcast.
  /// Store: scalar store + (loop invariant value stored? 0 : extract of last
  /// element).
  InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);

  /// Estimate the overhead of scalarizing an instruction. This is a
  /// convenience wrapper for the type-based getScalarizationOverhead API.
  InstructionCost getScalarizationOverhead(Instruction *I,
                                           ElementCount VF) const;

  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
  bool isConsecutiveLoadOrStore(Instruction *I);

  /// Returns true if an artificially high cost for emulated masked memrefs
  /// should be used.
  bool useEmulatedMaskMemRefHack(Instruction *I);

  /// Map of scalar integer values to the smallest bitwidth they can be legally
  /// represented as. The vector equivalents of these values should be truncated
  /// to this type.
  MapVector<Instruction *, uint64_t> MinBWs;

  /// A type representing the costs for instructions if they were to be
  /// scalarized rather than vectorized. The entries are Instruction-Cost
  /// pairs.
  using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;

  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as a predicated block.
  SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;

  /// Records whether it is allowed to have the original scalar loop execute at
  /// least once. This may be needed as a fallback loop in case runtime
  /// aliasing/dependence checks fail, or to handle the tail/remainder
  /// iterations when the trip count is unknown or doesn't divide by the VF,
  /// or as a peel-loop to handle gaps in interleave-groups.
  /// Under optsize and when the trip count is very small we don't allow any
  /// iterations to execute in the scalar loop.
  ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;

  /// All blocks of loop are to be masked to fold tail of scalar iterations.
  bool FoldTailByMasking = false;

  /// A map holding scalar costs for different vectorization factors. The
  /// presence of a cost for an instruction in the mapping indicates that the
  /// instruction will be scalarized when vectorizing with the associated
  /// vectorization factor. The entries are VF-ScalarCostTy pairs.
  DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;

  /// Holds the instructions known to be uniform after vectorization.
  /// The data is collected per VF.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;

  /// Holds the instructions known to be scalar after vectorization.
  /// The data is collected per VF.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;

  /// Holds the instructions (address computations) that are forced to be
  /// scalarized.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;

  /// PHINodes of the reductions that should be expanded in-loop along with
  /// their associated chains of reduction operations, in program order from top
  /// (PHI) to bottom.
  ReductionChainMap InLoopReductionChains;

  /// A Map of inloop reduction operations and their immediate chain operand.
  /// FIXME: This can be removed once reductions can be costed correctly in
  /// vplan. This was added to allow quick lookup to the inloop operations,
  /// without having to loop through InLoopReductionChains.
  DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;

  /// Returns the expected difference in cost from scalarizing the expression
  /// feeding a predicated instruction \p PredInst. The instructions to
  /// scalarize and their scalar costs are collected in \p ScalarCosts. A
  /// non-negative return value implies the expression will be scalarized.
  /// Currently, only single-use chains are considered for scalarization.
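  /// For example (roughly): if a udiv is scalarized and predicated, the add
  /// feeding it may also be cheaper as VF scalar adds emitted inside the
  /// predicated blocks than as one vector add followed by per-lane extracts;
  /// the returned discount captures that difference.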
1795 int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts, 1796 ElementCount VF); 1797 1798 /// Collect the instructions that are uniform after vectorization. An 1799 /// instruction is uniform if we represent it with a single scalar value in 1800 /// the vectorized loop corresponding to each vector iteration. Examples of 1801 /// uniform instructions include pointer operands of consecutive or 1802 /// interleaved memory accesses. Note that although uniformity implies an 1803 /// instruction will be scalar, the reverse is not true. In general, a 1804 /// scalarized instruction will be represented by VF scalar values in the 1805 /// vectorized loop, each corresponding to an iteration of the original 1806 /// scalar loop. 1807 void collectLoopUniforms(ElementCount VF); 1808 1809 /// Collect the instructions that are scalar after vectorization. An 1810 /// instruction is scalar if it is known to be uniform or will be scalarized 1811 /// during vectorization. Non-uniform scalarized instructions will be 1812 /// represented by VF values in the vectorized loop, each corresponding to an 1813 /// iteration of the original scalar loop. 1814 void collectLoopScalars(ElementCount VF); 1815 1816 /// Keeps cost model vectorization decision and cost for instructions. 1817 /// Right now it is used for memory instructions only. 1818 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>, 1819 std::pair<InstWidening, InstructionCost>>; 1820 1821 DecisionList WideningDecisions; 1822 1823 /// Returns true if \p V is expected to be vectorized and it needs to be 1824 /// extracted. 1825 bool needsExtract(Value *V, ElementCount VF) const { 1826 Instruction *I = dyn_cast<Instruction>(V); 1827 if (VF.isScalar() || !I || !TheLoop->contains(I) || 1828 TheLoop->isLoopInvariant(I)) 1829 return false; 1830 1831 // Assume we can vectorize V (and hence we need extraction) if the 1832 // scalars are not computed yet. This can happen, because it is called 1833 // via getScalarizationOverhead from setCostBasedWideningDecision, before 1834 // the scalars are collected. That should be a safe assumption in most 1835 // cases, because we check if the operands have vectorizable types 1836 // beforehand in LoopVectorizationLegality. 1837 return Scalars.find(VF) == Scalars.end() || 1838 !isScalarAfterVectorization(I, VF); 1839 }; 1840 1841 /// Returns a range containing only operands needing to be extracted. 1842 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, 1843 ElementCount VF) const { 1844 return SmallVector<Value *, 4>(make_filter_range( 1845 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); 1846 } 1847 1848 /// Determines if we have the infrastructure to vectorize loop \p L and its 1849 /// epilogue, assuming the main loop is vectorized by \p VF. 1850 bool isCandidateForEpilogueVectorization(const Loop &L, 1851 const ElementCount VF) const; 1852 1853 /// Returns true if epilogue vectorization is considered profitable, and 1854 /// false otherwise. 1855 /// \p VF is the vectorization factor chosen for the original loop. 1856 bool isEpilogueVectorizationProfitable(const ElementCount VF) const; 1857 1858 public: 1859 /// The loop that we evaluate. 1860 Loop *TheLoop; 1861 1862 /// Predicated scalar evolution analysis. 1863 PredicatedScalarEvolution &PSE; 1864 1865 /// Loop Info analysis. 1866 LoopInfo *LI; 1867 1868 /// Vectorization legality. 1869 LoopVectorizationLegality *Legal; 1870 1871 /// Vector target information. 
  const TargetTransformInfo &TTI;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Demanded bits analysis.
  DemandedBits *DB;

  /// Assumption cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  const Function *TheFunction;

  /// Loop Vectorize Hint.
  const LoopVectorizeHints *Hints;

  /// The interleave access information contains groups of interleaved accesses
  /// with the same stride and close to each other.
  InterleavedAccessInfo &InterleaveInfo;

  /// Values to ignore in the cost model.
  SmallPtrSet<const Value *, 16> ValuesToIgnore;

  /// Values to ignore in the cost model when VF > 1.
  SmallPtrSet<const Value *, 16> VecValuesToIgnore;

  /// All element types found in the loop.
  SmallPtrSet<Type *, 16> ElementTypesInLoop;

  /// Profitable vector factors.
  SmallVector<VectorizationFactor, 8> ProfitableVFs;
};
} // end namespace llvm

/// Helper struct to manage generating runtime checks for vectorization.
///
/// The runtime checks are created up-front in temporary blocks, un-linked from
/// the existing IR, to allow a more accurate cost estimate. After deciding to
/// vectorize, the checks are moved back. If deciding not to vectorize, the
/// temporary blocks are completely removed.
class GeneratedRTChecks {
  /// Basic block which contains the generated SCEV checks, if any.
  BasicBlock *SCEVCheckBlock = nullptr;

  /// The value representing the result of the generated SCEV checks. If it is
  /// nullptr, either no SCEV checks have been generated or they have been used.
  Value *SCEVCheckCond = nullptr;

  /// Basic block which contains the generated memory runtime checks, if any.
  BasicBlock *MemCheckBlock = nullptr;

  /// The value representing the result of the generated memory runtime checks.
  /// If it is nullptr, either no memory runtime checks have been generated or
  /// they have been used.
  Instruction *MemRuntimeCheckCond = nullptr;

  DominatorTree *DT;
  LoopInfo *LI;

  SCEVExpander SCEVExp;
  SCEVExpander MemCheckExp;

public:
  GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
                    const DataLayout &DL)
      : DT(DT), LI(LI), SCEVExp(SE, DL, "scev.check"),
        MemCheckExp(SE, DL, "scev.check") {}

  /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
  /// accurately estimate the cost of the runtime checks. The blocks are
  /// un-linked from the IR and are added back during vector code generation.
  /// If there is no vector code generation, the check blocks are removed
  /// completely.
  void Create(Loop *L, const LoopAccessInfo &LAI,
              const SCEVUnionPredicate &UnionPred) {

    BasicBlock *LoopHeader = L->getHeader();
    BasicBlock *Preheader = L->getLoopPreheader();

    // Use SplitBlock to create blocks for SCEV & memory runtime checks to
    // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
    // may be used by SCEVExpander. The blocks will be un-linked from their
    // predecessors and removed from LI & DT at the end of the function.
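    // The blocks are created below in the order (roughly)
    //   Preheader -> vector.scevcheck -> vector.memcheck -> loop header,
    // and are unhooked from that chain again at the end of this function once
    // the check code has been generated.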
1958 if (!UnionPred.isAlwaysTrue()) { 1959 SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI, 1960 nullptr, "vector.scevcheck"); 1961 1962 SCEVCheckCond = SCEVExp.expandCodeForPredicate( 1963 &UnionPred, SCEVCheckBlock->getTerminator()); 1964 } 1965 1966 const auto &RtPtrChecking = *LAI.getRuntimePointerChecking(); 1967 if (RtPtrChecking.Need) { 1968 auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader; 1969 MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr, 1970 "vector.memcheck"); 1971 1972 std::tie(std::ignore, MemRuntimeCheckCond) = 1973 addRuntimeChecks(MemCheckBlock->getTerminator(), L, 1974 RtPtrChecking.getChecks(), MemCheckExp); 1975 assert(MemRuntimeCheckCond && 1976 "no RT checks generated although RtPtrChecking " 1977 "claimed checks are required"); 1978 } 1979 1980 if (!MemCheckBlock && !SCEVCheckBlock) 1981 return; 1982 1983 // Unhook the temporary block with the checks, update various places 1984 // accordingly. 1985 if (SCEVCheckBlock) 1986 SCEVCheckBlock->replaceAllUsesWith(Preheader); 1987 if (MemCheckBlock) 1988 MemCheckBlock->replaceAllUsesWith(Preheader); 1989 1990 if (SCEVCheckBlock) { 1991 SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 1992 new UnreachableInst(Preheader->getContext(), SCEVCheckBlock); 1993 Preheader->getTerminator()->eraseFromParent(); 1994 } 1995 if (MemCheckBlock) { 1996 MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 1997 new UnreachableInst(Preheader->getContext(), MemCheckBlock); 1998 Preheader->getTerminator()->eraseFromParent(); 1999 } 2000 2001 DT->changeImmediateDominator(LoopHeader, Preheader); 2002 if (MemCheckBlock) { 2003 DT->eraseNode(MemCheckBlock); 2004 LI->removeBlock(MemCheckBlock); 2005 } 2006 if (SCEVCheckBlock) { 2007 DT->eraseNode(SCEVCheckBlock); 2008 LI->removeBlock(SCEVCheckBlock); 2009 } 2010 } 2011 2012 /// Remove the created SCEV & memory runtime check blocks & instructions, if 2013 /// unused. 2014 ~GeneratedRTChecks() { 2015 SCEVExpanderCleaner SCEVCleaner(SCEVExp, *DT); 2016 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp, *DT); 2017 if (!SCEVCheckCond) 2018 SCEVCleaner.markResultUsed(); 2019 2020 if (!MemRuntimeCheckCond) 2021 MemCheckCleaner.markResultUsed(); 2022 2023 if (MemRuntimeCheckCond) { 2024 auto &SE = *MemCheckExp.getSE(); 2025 // Memory runtime check generation creates compares that use expanded 2026 // values. Remove them before running the SCEVExpanderCleaners. 2027 for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) { 2028 if (MemCheckExp.isInsertedInstruction(&I)) 2029 continue; 2030 SE.forgetValue(&I); 2031 SE.eraseValueFromMap(&I); 2032 I.eraseFromParent(); 2033 } 2034 } 2035 MemCheckCleaner.cleanup(); 2036 SCEVCleaner.cleanup(); 2037 2038 if (SCEVCheckCond) 2039 SCEVCheckBlock->eraseFromParent(); 2040 if (MemRuntimeCheckCond) 2041 MemCheckBlock->eraseFromParent(); 2042 } 2043 2044 /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and 2045 /// adjusts the branches to branch to the vector preheader or \p Bypass, 2046 /// depending on the generated condition. 
2047 BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass, 2048 BasicBlock *LoopVectorPreHeader, 2049 BasicBlock *LoopExitBlock) { 2050 if (!SCEVCheckCond) 2051 return nullptr; 2052 if (auto *C = dyn_cast<ConstantInt>(SCEVCheckCond)) 2053 if (C->isZero()) 2054 return nullptr; 2055 2056 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2057 2058 BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock); 2059 // Create new preheader for vector loop. 2060 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 2061 PL->addBasicBlockToLoop(SCEVCheckBlock, *LI); 2062 2063 SCEVCheckBlock->getTerminator()->eraseFromParent(); 2064 SCEVCheckBlock->moveBefore(LoopVectorPreHeader); 2065 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2066 SCEVCheckBlock); 2067 2068 DT->addNewBlock(SCEVCheckBlock, Pred); 2069 DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock); 2070 2071 ReplaceInstWithInst( 2072 SCEVCheckBlock->getTerminator(), 2073 BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheckCond)); 2074 // Mark the check as used, to prevent it from being removed during cleanup. 2075 SCEVCheckCond = nullptr; 2076 return SCEVCheckBlock; 2077 } 2078 2079 /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts 2080 /// the branches to branch to the vector preheader or \p Bypass, depending on 2081 /// the generated condition. 2082 BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass, 2083 BasicBlock *LoopVectorPreHeader) { 2084 // Check if we generated code that checks in runtime if arrays overlap. 2085 if (!MemRuntimeCheckCond) 2086 return nullptr; 2087 2088 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2089 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2090 MemCheckBlock); 2091 2092 DT->addNewBlock(MemCheckBlock, Pred); 2093 DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock); 2094 MemCheckBlock->moveBefore(LoopVectorPreHeader); 2095 2096 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 2097 PL->addBasicBlockToLoop(MemCheckBlock, *LI); 2098 2099 ReplaceInstWithInst( 2100 MemCheckBlock->getTerminator(), 2101 BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond)); 2102 MemCheckBlock->getTerminator()->setDebugLoc( 2103 Pred->getTerminator()->getDebugLoc()); 2104 2105 // Mark the check as used, to prevent it from being removed during cleanup. 2106 MemRuntimeCheckCond = nullptr; 2107 return MemCheckBlock; 2108 } 2109 }; 2110 2111 // Return true if \p OuterLp is an outer loop annotated with hints for explicit 2112 // vectorization. The loop needs to be annotated with #pragma omp simd 2113 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the 2114 // vector length information is not provided, vectorization is not considered 2115 // explicit. Interleave hints are not allowed either. These limitations will be 2116 // relaxed in the future. 2117 // Please, note that we are currently forced to abuse the pragma 'clang 2118 // vectorize' semantics. This pragma provides *auto-vectorization hints* 2119 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' 2120 // provides *explicit vectorization hints* (LV can bypass legal checks and 2121 // assume that vectorization is legal). However, both hints are implemented 2122 // using the same metadata (llvm.loop.vectorize, processed by 2123 // LoopVectorizeHints). This will be fixed in the future when the native IR 2124 // representation for pragma 'omp simd' is introduced. 
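// For example (illustrative), an outer loop annotated with
//   #pragma clang loop vectorize(enable) vectorize_width(4)
// or
//   #pragma omp simd simdlen(4)
// is treated as explicitly vectorized with a vector length of 4 on this path.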
2125 static bool isExplicitVecOuterLoop(Loop *OuterLp, 2126 OptimizationRemarkEmitter *ORE) { 2127 assert(!OuterLp->isInnermost() && "This is not an outer loop"); 2128 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); 2129 2130 // Only outer loops with an explicit vectorization hint are supported. 2131 // Unannotated outer loops are ignored. 2132 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) 2133 return false; 2134 2135 Function *Fn = OuterLp->getHeader()->getParent(); 2136 if (!Hints.allowVectorization(Fn, OuterLp, 2137 true /*VectorizeOnlyWhenForced*/)) { 2138 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); 2139 return false; 2140 } 2141 2142 if (Hints.getInterleave() > 1) { 2143 // TODO: Interleave support is future work. 2144 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " 2145 "outer loops.\n"); 2146 Hints.emitRemarkWithHints(); 2147 return false; 2148 } 2149 2150 return true; 2151 } 2152 2153 static void collectSupportedLoops(Loop &L, LoopInfo *LI, 2154 OptimizationRemarkEmitter *ORE, 2155 SmallVectorImpl<Loop *> &V) { 2156 // Collect inner loops and outer loops without irreducible control flow. For 2157 // now, only collect outer loops that have explicit vectorization hints. If we 2158 // are stress testing the VPlan H-CFG construction, we collect the outermost 2159 // loop of every loop nest. 2160 if (L.isInnermost() || VPlanBuildStressTest || 2161 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { 2162 LoopBlocksRPO RPOT(&L); 2163 RPOT.perform(LI); 2164 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { 2165 V.push_back(&L); 2166 // TODO: Collect inner loops inside marked outer loops in case 2167 // vectorization fails for the outer loop. Do not invoke 2168 // 'containsIrreducibleCFG' again for inner loops when the outer loop is 2169 // already known to be reducible. We can use an inherited attribute for 2170 // that. 2171 return; 2172 } 2173 } 2174 for (Loop *InnerL : L) 2175 collectSupportedLoops(*InnerL, LI, ORE, V); 2176 } 2177 2178 namespace { 2179 2180 /// The LoopVectorize Pass. 2181 struct LoopVectorize : public FunctionPass { 2182 /// Pass identification, replacement for typeid 2183 static char ID; 2184 2185 LoopVectorizePass Impl; 2186 2187 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false, 2188 bool VectorizeOnlyWhenForced = false) 2189 : FunctionPass(ID), 2190 Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) { 2191 initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); 2192 } 2193 2194 bool runOnFunction(Function &F) override { 2195 if (skipFunction(F)) 2196 return false; 2197 2198 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 2199 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 2200 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 2201 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 2202 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); 2203 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); 2204 auto *TLI = TLIP ? 
&TLIP->getTLI(F) : nullptr; 2205 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); 2206 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 2207 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); 2208 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); 2209 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 2210 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 2211 2212 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 2213 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; 2214 2215 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC, 2216 GetLAA, *ORE, PSI).MadeAnyChange; 2217 } 2218 2219 void getAnalysisUsage(AnalysisUsage &AU) const override { 2220 AU.addRequired<AssumptionCacheTracker>(); 2221 AU.addRequired<BlockFrequencyInfoWrapperPass>(); 2222 AU.addRequired<DominatorTreeWrapperPass>(); 2223 AU.addRequired<LoopInfoWrapperPass>(); 2224 AU.addRequired<ScalarEvolutionWrapperPass>(); 2225 AU.addRequired<TargetTransformInfoWrapperPass>(); 2226 AU.addRequired<AAResultsWrapperPass>(); 2227 AU.addRequired<LoopAccessLegacyAnalysis>(); 2228 AU.addRequired<DemandedBitsWrapperPass>(); 2229 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 2230 AU.addRequired<InjectTLIMappingsLegacy>(); 2231 2232 // We currently do not preserve loopinfo/dominator analyses with outer loop 2233 // vectorization. Until this is addressed, mark these analyses as preserved 2234 // only for non-VPlan-native path. 2235 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 2236 if (!EnableVPlanNativePath) { 2237 AU.addPreserved<LoopInfoWrapperPass>(); 2238 AU.addPreserved<DominatorTreeWrapperPass>(); 2239 } 2240 2241 AU.addPreserved<BasicAAWrapperPass>(); 2242 AU.addPreserved<GlobalsAAWrapperPass>(); 2243 AU.addRequired<ProfileSummaryInfoWrapperPass>(); 2244 } 2245 }; 2246 2247 } // end anonymous namespace 2248 2249 //===----------------------------------------------------------------------===// 2250 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 2251 // LoopVectorizationCostModel and LoopVectorizationPlanner. 2252 //===----------------------------------------------------------------------===// 2253 2254 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { 2255 // We need to place the broadcast of invariant variables outside the loop, 2256 // but only if it's proven safe to do so. Else, broadcast will be inside 2257 // vector loop body. 2258 Instruction *Instr = dyn_cast<Instruction>(V); 2259 bool SafeToHoist = OrigLoop->isLoopInvariant(V) && 2260 (!Instr || 2261 DT->dominates(Instr->getParent(), LoopVectorPreHeader)); 2262 // Place the code for broadcasting invariant variables in the new preheader. 2263 IRBuilder<>::InsertPointGuard Guard(Builder); 2264 if (SafeToHoist) 2265 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2266 2267 // Broadcast the scalar into all locations in the vector. 
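  // For a fixed VF this typically expands to IR roughly like (illustrative):
  //   %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %v, i64 0
  //   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert,
  //                                    <4 x i32> poison, <4 x i32> zeroinitializer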
2268 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); 2269 2270 return Shuf; 2271 } 2272 2273 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( 2274 const InductionDescriptor &II, Value *Step, Value *Start, 2275 Instruction *EntryVal, VPValue *Def, VPValue *CastDef, 2276 VPTransformState &State) { 2277 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 2278 "Expected either an induction phi-node or a truncate of it!"); 2279 2280 // Construct the initial value of the vector IV in the vector loop preheader 2281 auto CurrIP = Builder.saveIP(); 2282 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2283 if (isa<TruncInst>(EntryVal)) { 2284 assert(Start->getType()->isIntegerTy() && 2285 "Truncation requires an integer type"); 2286 auto *TruncType = cast<IntegerType>(EntryVal->getType()); 2287 Step = Builder.CreateTrunc(Step, TruncType); 2288 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); 2289 } 2290 Value *SplatStart = Builder.CreateVectorSplat(VF, Start); 2291 Value *SteppedStart = 2292 getStepVector(SplatStart, 0, Step, II.getInductionOpcode()); 2293 2294 // We create vector phi nodes for both integer and floating-point induction 2295 // variables. Here, we determine the kind of arithmetic we will perform. 2296 Instruction::BinaryOps AddOp; 2297 Instruction::BinaryOps MulOp; 2298 if (Step->getType()->isIntegerTy()) { 2299 AddOp = Instruction::Add; 2300 MulOp = Instruction::Mul; 2301 } else { 2302 AddOp = II.getInductionOpcode(); 2303 MulOp = Instruction::FMul; 2304 } 2305 2306 // Multiply the vectorization factor by the step using integer or 2307 // floating-point arithmetic as appropriate. 2308 Type *StepType = Step->getType(); 2309 if (Step->getType()->isFloatingPointTy()) 2310 StepType = IntegerType::get(StepType->getContext(), 2311 StepType->getScalarSizeInBits()); 2312 Value *RuntimeVF = getRuntimeVF(Builder, StepType, VF); 2313 if (Step->getType()->isFloatingPointTy()) 2314 RuntimeVF = Builder.CreateSIToFP(RuntimeVF, Step->getType()); 2315 Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF); 2316 2317 // Create a vector splat to use in the induction update. 2318 // 2319 // FIXME: If the step is non-constant, we create the vector splat with 2320 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't 2321 // handle a constant vector splat. 2322 Value *SplatVF = isa<Constant>(Mul) 2323 ? ConstantVector::getSplat(VF, cast<Constant>(Mul)) 2324 : Builder.CreateVectorSplat(VF, Mul); 2325 Builder.restoreIP(CurrIP); 2326 2327 // We may need to add the step a number of times, depending on the unroll 2328 // factor. The last of those goes into the PHI. 2329 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", 2330 &*LoopVectorBody->getFirstInsertionPt()); 2331 VecInd->setDebugLoc(EntryVal->getDebugLoc()); 2332 Instruction *LastInduction = VecInd; 2333 for (unsigned Part = 0; Part < UF; ++Part) { 2334 State.set(Def, LastInduction, Part); 2335 2336 if (isa<TruncInst>(EntryVal)) 2337 addMetadata(LastInduction, EntryVal); 2338 recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, CastDef, 2339 State, Part); 2340 2341 LastInduction = cast<Instruction>( 2342 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")); 2343 LastInduction->setDebugLoc(EntryVal->getDebugLoc()); 2344 } 2345 2346 // Move the last step to the end of the latch block. This ensures consistent 2347 // placement of all induction updates. 
2348 auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 2349 auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator()); 2350 auto *ICmp = cast<Instruction>(Br->getCondition()); 2351 LastInduction->moveBefore(ICmp); 2352 LastInduction->setName("vec.ind.next"); 2353 2354 VecInd->addIncoming(SteppedStart, LoopVectorPreHeader); 2355 VecInd->addIncoming(LastInduction, LoopVectorLatch); 2356 } 2357 2358 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const { 2359 return Cost->isScalarAfterVectorization(I, VF) || 2360 Cost->isProfitableToScalarize(I, VF); 2361 } 2362 2363 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const { 2364 if (shouldScalarizeInstruction(IV)) 2365 return true; 2366 auto isScalarInst = [&](User *U) -> bool { 2367 auto *I = cast<Instruction>(U); 2368 return (OrigLoop->contains(I) && shouldScalarizeInstruction(I)); 2369 }; 2370 return llvm::any_of(IV->users(), isScalarInst); 2371 } 2372 2373 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast( 2374 const InductionDescriptor &ID, const Instruction *EntryVal, 2375 Value *VectorLoopVal, VPValue *CastDef, VPTransformState &State, 2376 unsigned Part, unsigned Lane) { 2377 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 2378 "Expected either an induction phi-node or a truncate of it!"); 2379 2380 // This induction variable is not the phi from the original loop but the 2381 // newly-created IV based on the proof that casted Phi is equal to the 2382 // uncasted Phi in the vectorized loop (under a runtime guard possibly). It 2383 // re-uses the same InductionDescriptor that original IV uses but we don't 2384 // have to do any recording in this case - that is done when original IV is 2385 // processed. 2386 if (isa<TruncInst>(EntryVal)) 2387 return; 2388 2389 const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts(); 2390 if (Casts.empty()) 2391 return; 2392 // Only the first Cast instruction in the Casts vector is of interest. 2393 // The rest of the Casts (if exist) have no uses outside the 2394 // induction update chain itself. 2395 if (Lane < UINT_MAX) 2396 State.set(CastDef, VectorLoopVal, VPIteration(Part, Lane)); 2397 else 2398 State.set(CastDef, VectorLoopVal, Part); 2399 } 2400 2401 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start, 2402 TruncInst *Trunc, VPValue *Def, 2403 VPValue *CastDef, 2404 VPTransformState &State) { 2405 assert((IV->getType()->isIntegerTy() || IV != OldInduction) && 2406 "Primary induction variable must have an integer type"); 2407 2408 auto II = Legal->getInductionVars().find(IV); 2409 assert(II != Legal->getInductionVars().end() && "IV is not an induction"); 2410 2411 auto ID = II->second; 2412 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); 2413 2414 // The value from the original loop to which we are mapping the new induction 2415 // variable. 2416 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; 2417 2418 auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 2419 2420 // Generate code for the induction step. 
Note that induction steps are 2421 // required to be loop-invariant 2422 auto CreateStepValue = [&](const SCEV *Step) -> Value * { 2423 assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) && 2424 "Induction step should be loop invariant"); 2425 if (PSE.getSE()->isSCEVable(IV->getType())) { 2426 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 2427 return Exp.expandCodeFor(Step, Step->getType(), 2428 LoopVectorPreHeader->getTerminator()); 2429 } 2430 return cast<SCEVUnknown>(Step)->getValue(); 2431 }; 2432 2433 // The scalar value to broadcast. This is derived from the canonical 2434 // induction variable. If a truncation type is given, truncate the canonical 2435 // induction variable and step. Otherwise, derive these values from the 2436 // induction descriptor. 2437 auto CreateScalarIV = [&](Value *&Step) -> Value * { 2438 Value *ScalarIV = Induction; 2439 if (IV != OldInduction) { 2440 ScalarIV = IV->getType()->isIntegerTy() 2441 ? Builder.CreateSExtOrTrunc(Induction, IV->getType()) 2442 : Builder.CreateCast(Instruction::SIToFP, Induction, 2443 IV->getType()); 2444 ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID); 2445 ScalarIV->setName("offset.idx"); 2446 } 2447 if (Trunc) { 2448 auto *TruncType = cast<IntegerType>(Trunc->getType()); 2449 assert(Step->getType()->isIntegerTy() && 2450 "Truncation requires an integer step"); 2451 ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType); 2452 Step = Builder.CreateTrunc(Step, TruncType); 2453 } 2454 return ScalarIV; 2455 }; 2456 2457 // Create the vector values from the scalar IV, in the absence of creating a 2458 // vector IV. 2459 auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) { 2460 Value *Broadcasted = getBroadcastInstrs(ScalarIV); 2461 for (unsigned Part = 0; Part < UF; ++Part) { 2462 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2463 Value *EntryPart = 2464 getStepVector(Broadcasted, VF.getKnownMinValue() * Part, Step, 2465 ID.getInductionOpcode()); 2466 State.set(Def, EntryPart, Part); 2467 if (Trunc) 2468 addMetadata(EntryPart, Trunc); 2469 recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, CastDef, 2470 State, Part); 2471 } 2472 }; 2473 2474 // Fast-math-flags propagate from the original induction instruction. 2475 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 2476 if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp())) 2477 Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags()); 2478 2479 // Now do the actual transformations, and start with creating the step value. 2480 Value *Step = CreateStepValue(ID.getStep()); 2481 if (VF.isZero() || VF.isScalar()) { 2482 Value *ScalarIV = CreateScalarIV(Step); 2483 CreateSplatIV(ScalarIV, Step); 2484 return; 2485 } 2486 2487 // Determine if we want a scalar version of the induction variable. This is 2488 // true if the induction variable itself is not widened, or if it has at 2489 // least one user in the loop that is not widened. 2490 auto NeedsScalarIV = needsScalarInduction(EntryVal); 2491 if (!NeedsScalarIV) { 2492 createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef, 2493 State); 2494 return; 2495 } 2496 2497 // Try to create a new independent vector induction variable. If we can't 2498 // create the phi node, we will splat the scalar induction variable in each 2499 // loop iteration. 
2500 if (!shouldScalarizeInstruction(EntryVal)) { 2501 createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef, 2502 State); 2503 Value *ScalarIV = CreateScalarIV(Step); 2504 // Create scalar steps that can be used by instructions we will later 2505 // scalarize. Note that the addition of the scalar steps will not increase 2506 // the number of instructions in the loop in the common case prior to 2507 // InstCombine. We will be trading one vector extract for each scalar step. 2508 buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State); 2509 return; 2510 } 2511 2512 // All IV users are scalar instructions, so only emit a scalar IV, not a 2513 // vectorised IV. Except when we tail-fold, then the splat IV feeds the 2514 // predicate used by the masked loads/stores. 2515 Value *ScalarIV = CreateScalarIV(Step); 2516 if (!Cost->isScalarEpilogueAllowed()) 2517 CreateSplatIV(ScalarIV, Step); 2518 buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State); 2519 } 2520 2521 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step, 2522 Instruction::BinaryOps BinOp) { 2523 // Create and check the types. 2524 auto *ValVTy = cast<VectorType>(Val->getType()); 2525 ElementCount VLen = ValVTy->getElementCount(); 2526 2527 Type *STy = Val->getType()->getScalarType(); 2528 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && 2529 "Induction Step must be an integer or FP"); 2530 assert(Step->getType() == STy && "Step has wrong type"); 2531 2532 SmallVector<Constant *, 8> Indices; 2533 2534 // Create a vector of consecutive numbers from zero to VF. 2535 VectorType *InitVecValVTy = ValVTy; 2536 Type *InitVecValSTy = STy; 2537 if (STy->isFloatingPointTy()) { 2538 InitVecValSTy = 2539 IntegerType::get(STy->getContext(), STy->getScalarSizeInBits()); 2540 InitVecValVTy = VectorType::get(InitVecValSTy, VLen); 2541 } 2542 Value *InitVec = Builder.CreateStepVector(InitVecValVTy); 2543 2544 // Add on StartIdx 2545 Value *StartIdxSplat = Builder.CreateVectorSplat( 2546 VLen, ConstantInt::get(InitVecValSTy, StartIdx)); 2547 InitVec = Builder.CreateAdd(InitVec, StartIdxSplat); 2548 2549 if (STy->isIntegerTy()) { 2550 Step = Builder.CreateVectorSplat(VLen, Step); 2551 assert(Step->getType() == Val->getType() && "Invalid step vec"); 2552 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 2553 // which can be found from the original scalar operations. 2554 Step = Builder.CreateMul(InitVec, Step); 2555 return Builder.CreateAdd(Val, Step, "induction"); 2556 } 2557 2558 // Floating point induction. 2559 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && 2560 "Binary Opcode should be specified for FP induction"); 2561 InitVec = Builder.CreateUIToFP(InitVec, ValVTy); 2562 Step = Builder.CreateVectorSplat(VLen, Step); 2563 Value *MulOp = Builder.CreateFMul(InitVec, Step); 2564 return Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); 2565 } 2566 2567 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, 2568 Instruction *EntryVal, 2569 const InductionDescriptor &ID, 2570 VPValue *Def, VPValue *CastDef, 2571 VPTransformState &State) { 2572 // We shouldn't have to build scalar steps if we aren't vectorizing. 2573 assert(VF.isVector() && "VF should be greater than one"); 2574 // Get the value type and ensure it and the step have the same integer type. 
2575 Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); 2576 assert(ScalarIVTy == Step->getType() && 2577 "Val and Step should have the same type"); 2578 2579 // We build scalar steps for both integer and floating-point induction 2580 // variables. Here, we determine the kind of arithmetic we will perform. 2581 Instruction::BinaryOps AddOp; 2582 Instruction::BinaryOps MulOp; 2583 if (ScalarIVTy->isIntegerTy()) { 2584 AddOp = Instruction::Add; 2585 MulOp = Instruction::Mul; 2586 } else { 2587 AddOp = ID.getInductionOpcode(); 2588 MulOp = Instruction::FMul; 2589 } 2590 2591 // Determine the number of scalars we need to generate for each unroll 2592 // iteration. If EntryVal is uniform, we only need to generate the first 2593 // lane. Otherwise, we generate all VF values. 2594 bool IsUniform = 2595 Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF); 2596 unsigned Lanes = IsUniform ? 1 : VF.getKnownMinValue(); 2597 // Compute the scalar steps and save the results in State. 2598 Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(), 2599 ScalarIVTy->getScalarSizeInBits()); 2600 Type *VecIVTy = nullptr; 2601 Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr; 2602 if (!IsUniform && VF.isScalable()) { 2603 VecIVTy = VectorType::get(ScalarIVTy, VF); 2604 UnitStepVec = Builder.CreateStepVector(VectorType::get(IntStepTy, VF)); 2605 SplatStep = Builder.CreateVectorSplat(VF, Step); 2606 SplatIV = Builder.CreateVectorSplat(VF, ScalarIV); 2607 } 2608 2609 for (unsigned Part = 0; Part < UF; ++Part) { 2610 Value *StartIdx0 = 2611 createStepForVF(Builder, ConstantInt::get(IntStepTy, Part), VF); 2612 2613 if (!IsUniform && VF.isScalable()) { 2614 auto *SplatStartIdx = Builder.CreateVectorSplat(VF, StartIdx0); 2615 auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec); 2616 if (ScalarIVTy->isFloatingPointTy()) 2617 InitVec = Builder.CreateSIToFP(InitVec, VecIVTy); 2618 auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep); 2619 auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul); 2620 State.set(Def, Add, Part); 2621 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State, 2622 Part); 2623 // It's useful to record the lane values too for the known minimum number 2624 // of elements so we do those below. This improves the code quality when 2625 // trying to extract the first element, for example. 2626 } 2627 2628 if (ScalarIVTy->isFloatingPointTy()) 2629 StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy); 2630 2631 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 2632 Value *StartIdx = Builder.CreateBinOp( 2633 AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane)); 2634 // The step returned by `createStepForVF` is a runtime-evaluated value 2635 // when VF is scalable. Otherwise, it should be folded into a Constant. 
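      // For example, for unroll part 1 createStepForVF produces the constant 4
      // with a fixed VF of 4, but the runtime value (vscale * 4) with
      // VF = (vscale x 4).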
2636 assert((VF.isScalable() || isa<Constant>(StartIdx)) && 2637 "Expected StartIdx to be folded to a constant when VF is not " 2638 "scalable"); 2639 auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step); 2640 auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul); 2641 State.set(Def, Add, VPIteration(Part, Lane)); 2642 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State, 2643 Part, Lane); 2644 } 2645 } 2646 } 2647 2648 void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def, 2649 const VPIteration &Instance, 2650 VPTransformState &State) { 2651 Value *ScalarInst = State.get(Def, Instance); 2652 Value *VectorValue = State.get(Def, Instance.Part); 2653 VectorValue = Builder.CreateInsertElement( 2654 VectorValue, ScalarInst, 2655 Instance.Lane.getAsRuntimeExpr(State.Builder, VF)); 2656 State.set(Def, VectorValue, Instance.Part); 2657 } 2658 2659 Value *InnerLoopVectorizer::reverseVector(Value *Vec) { 2660 assert(Vec->getType()->isVectorTy() && "Invalid type"); 2661 return Builder.CreateVectorReverse(Vec, "reverse"); 2662 } 2663 2664 // Return whether we allow using masked interleave-groups (for dealing with 2665 // strided loads/stores that reside in predicated blocks, or for dealing 2666 // with gaps). 2667 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { 2668 // If an override option has been passed in for interleaved accesses, use it. 2669 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) 2670 return EnableMaskedInterleavedMemAccesses; 2671 2672 return TTI.enableMaskedInterleavedAccessVectorization(); 2673 } 2674 2675 // Try to vectorize the interleave group that \p Instr belongs to. 2676 // 2677 // E.g. Translate following interleaved load group (factor = 3): 2678 // for (i = 0; i < N; i+=3) { 2679 // R = Pic[i]; // Member of index 0 2680 // G = Pic[i+1]; // Member of index 1 2681 // B = Pic[i+2]; // Member of index 2 2682 // ... // do something to R, G, B 2683 // } 2684 // To: 2685 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2686 // %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements 2687 // %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements 2688 // %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements 2689 // 2690 // Or translate following interleaved store group (factor = 3): 2691 // for (i = 0; i < N; i+=3) { 2692 // ... do something to R, G, B 2693 // Pic[i] = R; // Member of index 0 2694 // Pic[i+1] = G; // Member of index 1 2695 // Pic[i+2] = B; // Member of index 2 2696 // } 2697 // To: 2698 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> 2699 // %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u> 2700 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, 2701 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements 2702 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B 2703 void InnerLoopVectorizer::vectorizeInterleaveGroup( 2704 const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs, 2705 VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues, 2706 VPValue *BlockInMask) { 2707 Instruction *Instr = Group->getInsertPos(); 2708 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2709 2710 // Prepare for the vector type of the interleaved load/store. 
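  // For illustration (assuming a fixed VF of 4, an interleave factor of 3 and
  // i32 members, as in the load/store examples above), the whole group is
  // accessed through a single wide <12 x i32> vector, i.e.
  // VecTy = <VF * InterleaveFactor x ScalarTy>.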
2711 Type *ScalarTy = getLoadStoreType(Instr); 2712 unsigned InterleaveFactor = Group->getFactor(); 2713 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2714 auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor); 2715 2716 // Prepare for the new pointers. 2717 SmallVector<Value *, 2> AddrParts; 2718 unsigned Index = Group->getIndex(Instr); 2719 2720 // TODO: extend the masked interleaved-group support to reversed access. 2721 assert((!BlockInMask || !Group->isReverse()) && 2722 "Reversed masked interleave-group not supported."); 2723 2724 // If the group is reversed, adjust the index to refer to the last vector lane 2725 // instead of the first. We adjust the index from the first vector lane, 2726 // rather than directly getting the pointer for lane VF - 1, because the 2727 // pointer operand of the interleaved access is supposed to be uniform. For 2728 // uniform instructions, we're only required to generate a value for the 2729 // first vector lane in each unroll iteration. 2730 if (Group->isReverse()) 2731 Index += (VF.getKnownMinValue() - 1) * Group->getFactor(); 2732 2733 for (unsigned Part = 0; Part < UF; Part++) { 2734 Value *AddrPart = State.get(Addr, VPIteration(Part, 0)); 2735 setDebugLocFromInst(AddrPart); 2736 2737 // Note that the current instruction could be at any member index. We need to 2738 // adjust the address to the member of index 0. 2739 // 2740 // E.g. a = A[i+1]; // Member of index 1 (Current instruction) 2741 // b = A[i]; // Member of index 0 2742 // The current pointer points to A[i+1]; adjust it to A[i]. 2743 // 2744 // E.g. A[i+1] = a; // Member of index 1 2745 // A[i] = b; // Member of index 0 2746 // A[i+2] = c; // Member of index 2 (Current instruction) 2747 // The current pointer points to A[i+2]; adjust it to A[i]. 2748 2749 bool InBounds = false; 2750 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts())) 2751 InBounds = gep->isInBounds(); 2752 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index)); 2753 cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds); 2754 2755 // Cast to the vector pointer type. 2756 unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace(); 2757 Type *PtrTy = VecTy->getPointerTo(AddressSpace); 2758 AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy)); 2759 } 2760 2761 setDebugLocFromInst(Instr); 2762 Value *PoisonVec = PoisonValue::get(VecTy); 2763 2764 Value *MaskForGaps = nullptr; 2765 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) { 2766 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); 2767 assert(MaskForGaps && "Mask for Gaps is required but it is null"); 2768 } 2769 2770 // Vectorize the interleaved load group. 2771 if (isa<LoadInst>(Instr)) { 2772 // For each unroll part, create a wide load for the group. 2773 SmallVector<Value *, 2> NewLoads; 2774 for (unsigned Part = 0; Part < UF; Part++) { 2775 Instruction *NewLoad; 2776 if (BlockInMask || MaskForGaps) { 2777 assert(useMaskedInterleavedAccesses(*TTI) && 2778 "masked interleaved groups are not allowed."); 2779 Value *GroupMask = MaskForGaps; 2780 if (BlockInMask) { 2781 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2782 Value *ShuffledMask = Builder.CreateShuffleVector( 2783 BlockInMaskPart, 2784 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2785 "interleaved.mask"); 2786 GroupMask = MaskForGaps 2787 ?
Builder.CreateBinOp(Instruction::And, ShuffledMask, 2788 MaskForGaps) 2789 : ShuffledMask; 2790 } 2791 NewLoad = 2792 Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(), 2793 GroupMask, PoisonVec, "wide.masked.vec"); 2794 } 2795 else 2796 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], 2797 Group->getAlign(), "wide.vec"); 2798 Group->addMetadata(NewLoad); 2799 NewLoads.push_back(NewLoad); 2800 } 2801 2802 // For each member in the group, shuffle out the appropriate data from the 2803 // wide loads. 2804 unsigned J = 0; 2805 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2806 Instruction *Member = Group->getMember(I); 2807 2808 // Skip the gaps in the group. 2809 if (!Member) 2810 continue; 2811 2812 auto StrideMask = 2813 createStrideMask(I, InterleaveFactor, VF.getKnownMinValue()); 2814 for (unsigned Part = 0; Part < UF; Part++) { 2815 Value *StridedVec = Builder.CreateShuffleVector( 2816 NewLoads[Part], StrideMask, "strided.vec"); 2817 2818 // If this member has different type, cast the result type. 2819 if (Member->getType() != ScalarTy) { 2820 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2821 VectorType *OtherVTy = VectorType::get(Member->getType(), VF); 2822 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); 2823 } 2824 2825 if (Group->isReverse()) 2826 StridedVec = reverseVector(StridedVec); 2827 2828 State.set(VPDefs[J], StridedVec, Part); 2829 } 2830 ++J; 2831 } 2832 return; 2833 } 2834 2835 // The sub vector type for current instruction. 2836 auto *SubVT = VectorType::get(ScalarTy, VF); 2837 2838 // Vectorize the interleaved store group. 2839 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); 2840 assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) && 2841 "masked interleaved groups are not allowed."); 2842 assert((!MaskForGaps || !VF.isScalable()) && 2843 "masking gaps for scalable vectors is not yet supported."); 2844 for (unsigned Part = 0; Part < UF; Part++) { 2845 // Collect the stored vector from each member. 2846 SmallVector<Value *, 4> StoredVecs; 2847 for (unsigned i = 0; i < InterleaveFactor; i++) { 2848 assert((Group->getMember(i) || MaskForGaps) && 2849 "Fail to get a member from an interleaved store group"); 2850 Instruction *Member = Group->getMember(i); 2851 2852 // Skip the gaps in the group. 2853 if (!Member) { 2854 Value *Undef = PoisonValue::get(SubVT); 2855 StoredVecs.push_back(Undef); 2856 continue; 2857 } 2858 2859 Value *StoredVec = State.get(StoredValues[i], Part); 2860 2861 if (Group->isReverse()) 2862 StoredVec = reverseVector(StoredVec); 2863 2864 // If this member has different type, cast it to a unified type. 2865 2866 if (StoredVec->getType() != SubVT) 2867 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL); 2868 2869 StoredVecs.push_back(StoredVec); 2870 } 2871 2872 // Concatenate all vectors into a wide vector. 2873 Value *WideVec = concatenateVectors(Builder, StoredVecs); 2874 2875 // Interleave the elements in the wide vector. 
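    // For illustration (assuming VF = 4 and a factor of 3, as in the store
    // example above), createInterleaveMask(4, 3) yields
    // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>, so the shuffle below picks
    // R0,G0,B0, R1,G1,B1, ... from the concatenated R/G/B sub-vectors.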
2876 Value *IVec = Builder.CreateShuffleVector( 2877 WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor), 2878 "interleaved.vec"); 2879 2880 Instruction *NewStoreInstr; 2881 if (BlockInMask || MaskForGaps) { 2882 Value *GroupMask = MaskForGaps; 2883 if (BlockInMask) { 2884 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2885 Value *ShuffledMask = Builder.CreateShuffleVector( 2886 BlockInMaskPart, 2887 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2888 "interleaved.mask"); 2889 GroupMask = MaskForGaps ? Builder.CreateBinOp(Instruction::And, 2890 ShuffledMask, MaskForGaps) 2891 : ShuffledMask; 2892 } 2893 NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part], 2894 Group->getAlign(), GroupMask); 2895 } else 2896 NewStoreInstr = 2897 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign()); 2898 2899 Group->addMetadata(NewStoreInstr); 2900 } 2901 } 2902 2903 void InnerLoopVectorizer::vectorizeMemoryInstruction( 2904 Instruction *Instr, VPTransformState &State, VPValue *Def, VPValue *Addr, 2905 VPValue *StoredValue, VPValue *BlockInMask) { 2906 // Attempt to issue a wide load. 2907 LoadInst *LI = dyn_cast<LoadInst>(Instr); 2908 StoreInst *SI = dyn_cast<StoreInst>(Instr); 2909 2910 assert((LI || SI) && "Invalid Load/Store instruction"); 2911 assert((!SI || StoredValue) && "No stored value provided for widened store"); 2912 assert((!LI || !StoredValue) && "Stored value provided for widened load"); 2913 2914 LoopVectorizationCostModel::InstWidening Decision = 2915 Cost->getWideningDecision(Instr, VF); 2916 assert((Decision == LoopVectorizationCostModel::CM_Widen || 2917 Decision == LoopVectorizationCostModel::CM_Widen_Reverse || 2918 Decision == LoopVectorizationCostModel::CM_GatherScatter) && 2919 "CM decision is not to widen the memory instruction"); 2920 2921 Type *ScalarDataTy = getLoadStoreType(Instr); 2922 2923 auto *DataTy = VectorType::get(ScalarDataTy, VF); 2924 const Align Alignment = getLoadStoreAlignment(Instr); 2925 2926 // Determine if the pointer operand of the access is either consecutive or 2927 // reverse consecutive. 2928 bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse); 2929 bool ConsecutiveStride = 2930 Reverse || (Decision == LoopVectorizationCostModel::CM_Widen); 2931 bool CreateGatherScatter = 2932 (Decision == LoopVectorizationCostModel::CM_GatherScatter); 2933 2934 // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector 2935 // gather/scatter. Otherwise Decision should have been to Scalarize. 2936 assert((ConsecutiveStride || CreateGatherScatter) && 2937 "The instruction should be scalarized"); 2938 (void)ConsecutiveStride; 2939 2940 VectorParts BlockInMaskParts(UF); 2941 bool isMaskRequired = BlockInMask; 2942 if (isMaskRequired) 2943 for (unsigned Part = 0; Part < UF; ++Part) 2944 BlockInMaskParts[Part] = State.get(BlockInMask, Part); 2945 2946 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { 2947 // Calculate the pointer for the specific unroll-part. 2948 GetElementPtrInst *PartPtr = nullptr; 2949 2950 bool InBounds = false; 2951 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) 2952 InBounds = gep->isInBounds(); 2953 if (Reverse) { 2954 // If the address is consecutive but reversed, then the 2955 // wide store needs to start at the last vector element. 
2956 // RunTimeVF = VScale * VF.getKnownMinValue() 2957 // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue() 2958 Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), VF); 2959 // NumElt = -Part * RunTimeVF 2960 Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF); 2961 // LastLane = 1 - RunTimeVF 2962 Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF); 2963 PartPtr = 2964 cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt)); 2965 PartPtr->setIsInBounds(InBounds); 2966 PartPtr = cast<GetElementPtrInst>( 2967 Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane)); 2968 PartPtr->setIsInBounds(InBounds); 2969 if (isMaskRequired) // Reverse of a null all-one mask is a null mask. 2970 BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]); 2971 } else { 2972 Value *Increment = createStepForVF(Builder, Builder.getInt32(Part), VF); 2973 PartPtr = cast<GetElementPtrInst>( 2974 Builder.CreateGEP(ScalarDataTy, Ptr, Increment)); 2975 PartPtr->setIsInBounds(InBounds); 2976 } 2977 2978 unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); 2979 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); 2980 }; 2981 2982 // Handle Stores: 2983 if (SI) { 2984 setDebugLocFromInst(SI); 2985 2986 for (unsigned Part = 0; Part < UF; ++Part) { 2987 Instruction *NewSI = nullptr; 2988 Value *StoredVal = State.get(StoredValue, Part); 2989 if (CreateGatherScatter) { 2990 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 2991 Value *VectorGep = State.get(Addr, Part); 2992 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, 2993 MaskPart); 2994 } else { 2995 if (Reverse) { 2996 // If we store to reverse consecutive memory locations, then we need 2997 // to reverse the order of elements in the stored value. 2998 StoredVal = reverseVector(StoredVal); 2999 // We don't want to update the value in the map as it might be used in 3000 // another expression. So don't call resetVectorValue(StoredVal). 3001 } 3002 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0))); 3003 if (isMaskRequired) 3004 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, 3005 BlockInMaskParts[Part]); 3006 else 3007 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); 3008 } 3009 addMetadata(NewSI, SI); 3010 } 3011 return; 3012 } 3013 3014 // Handle loads. 3015 assert(LI && "Must have a load instruction"); 3016 setDebugLocFromInst(LI); 3017 for (unsigned Part = 0; Part < UF; ++Part) { 3018 Value *NewLI; 3019 if (CreateGatherScatter) { 3020 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 3021 Value *VectorGep = State.get(Addr, Part); 3022 NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart, 3023 nullptr, "wide.masked.gather"); 3024 addMetadata(NewLI, LI); 3025 } else { 3026 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0))); 3027 if (isMaskRequired) 3028 NewLI = Builder.CreateMaskedLoad( 3029 DataTy, VecPtr, Alignment, BlockInMaskParts[Part], 3030 PoisonValue::get(DataTy), "wide.masked.load"); 3031 else 3032 NewLI = 3033 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); 3034 3035 // Add metadata to the load, but setVectorValue to the reverse shuffle. 
3036 addMetadata(NewLI, LI); 3037 if (Reverse) 3038 NewLI = reverseVector(NewLI); 3039 } 3040 3041 State.set(Def, NewLI, Part); 3042 } 3043 } 3044 3045 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPValue *Def, 3046 VPUser &User, 3047 const VPIteration &Instance, 3048 bool IfPredicateInstr, 3049 VPTransformState &State) { 3050 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 3051 3052 // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for 3053 // the first lane and part. 3054 if (isa<NoAliasScopeDeclInst>(Instr)) 3055 if (!Instance.isFirstIteration()) 3056 return; 3057 3058 setDebugLocFromInst(Instr); 3059 3060 // Does this instruction return a value ? 3061 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 3062 3063 Instruction *Cloned = Instr->clone(); 3064 if (!IsVoidRetTy) 3065 Cloned->setName(Instr->getName() + ".cloned"); 3066 3067 State.Builder.SetInsertPoint(Builder.GetInsertBlock(), 3068 Builder.GetInsertPoint()); 3069 // Replace the operands of the cloned instructions with their scalar 3070 // equivalents in the new loop. 3071 for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) { 3072 auto *Operand = dyn_cast<Instruction>(Instr->getOperand(op)); 3073 auto InputInstance = Instance; 3074 if (!Operand || !OrigLoop->contains(Operand) || 3075 (Cost->isUniformAfterVectorization(Operand, State.VF))) 3076 InputInstance.Lane = VPLane::getFirstLane(); 3077 auto *NewOp = State.get(User.getOperand(op), InputInstance); 3078 Cloned->setOperand(op, NewOp); 3079 } 3080 addNewMetadata(Cloned, Instr); 3081 3082 // Place the cloned scalar in the new loop. 3083 Builder.Insert(Cloned); 3084 3085 State.set(Def, Cloned, Instance); 3086 3087 // If we just cloned a new assumption, add it the assumption cache. 3088 if (auto *II = dyn_cast<AssumeInst>(Cloned)) 3089 AC->registerAssumption(II); 3090 3091 // End if-block. 3092 if (IfPredicateInstr) 3093 PredicatedInstructions.push_back(Cloned); 3094 } 3095 3096 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start, 3097 Value *End, Value *Step, 3098 Instruction *DL) { 3099 BasicBlock *Header = L->getHeader(); 3100 BasicBlock *Latch = L->getLoopLatch(); 3101 // As we're just creating this loop, it's possible no latch exists 3102 // yet. If so, use the header as this will be a single block loop. 3103 if (!Latch) 3104 Latch = Header; 3105 3106 IRBuilder<> B(&*Header->getFirstInsertionPt()); 3107 Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction); 3108 setDebugLocFromInst(OldInst, &B); 3109 auto *Induction = B.CreatePHI(Start->getType(), 2, "index"); 3110 3111 B.SetInsertPoint(Latch->getTerminator()); 3112 setDebugLocFromInst(OldInst, &B); 3113 3114 // Create i+1 and fill the PHINode. 3115 // 3116 // If the tail is not folded, we know that End - Start >= Step (either 3117 // statically or through the minimum iteration checks). We also know that both 3118 // Start % Step == 0 and End % Step == 0. We exit the vector loop if %IV + 3119 // %Step == %End. Hence we must exit the loop before %IV + %Step unsigned 3120 // overflows and we can mark the induction increment as NUW. 3121 Value *Next = B.CreateAdd(Induction, Step, "index.next", 3122 /*NUW=*/!Cost->foldTailByMasking(), /*NSW=*/false); 3123 Induction->addIncoming(Start, L->getLoopPreheader()); 3124 Induction->addIncoming(Next, Latch); 3125 // Create the compare. 
3126 Value *ICmp = B.CreateICmpEQ(Next, End); 3127 B.CreateCondBr(ICmp, L->getUniqueExitBlock(), Header); 3128 3129 // Now we have two terminators. Remove the old one from the block. 3130 Latch->getTerminator()->eraseFromParent(); 3131 3132 return Induction; 3133 } 3134 3135 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { 3136 if (TripCount) 3137 return TripCount; 3138 3139 assert(L && "Create Trip Count for null loop."); 3140 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 3141 // Find the loop boundaries. 3142 ScalarEvolution *SE = PSE.getSE(); 3143 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 3144 assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && 3145 "Invalid loop count"); 3146 3147 Type *IdxTy = Legal->getWidestInductionType(); 3148 assert(IdxTy && "No type for induction"); 3149 3150 // The exit count might have the type of i64 while the phi is i32. This can 3151 // happen if we have an induction variable that is sign extended before the 3152 // compare. The only way that we get a backedge taken count is that the 3153 // induction variable was signed and as such will not overflow. In such a case 3154 // truncation is legal. 3155 if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) > 3156 IdxTy->getPrimitiveSizeInBits()) 3157 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy); 3158 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); 3159 3160 // Get the total trip count from the count by adding 1. 3161 const SCEV *ExitCount = SE->getAddExpr( 3162 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 3163 3164 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); 3165 3166 // Expand the trip count and place the new instructions in the preheader. 3167 // Notice that the pre-header does not change, only the loop body. 3168 SCEVExpander Exp(*SE, DL, "induction"); 3169 3170 // Count holds the overall loop count (N). 3171 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), 3172 L->getLoopPreheader()->getTerminator()); 3173 3174 if (TripCount->getType()->isPointerTy()) 3175 TripCount = 3176 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", 3177 L->getLoopPreheader()->getTerminator()); 3178 3179 return TripCount; 3180 } 3181 3182 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { 3183 if (VectorTripCount) 3184 return VectorTripCount; 3185 3186 Value *TC = getOrCreateTripCount(L); 3187 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 3188 3189 Type *Ty = TC->getType(); 3190 // This is where we can make the step a runtime constant. 3191 Value *Step = createStepForVF(Builder, ConstantInt::get(Ty, UF), VF); 3192 3193 // If the tail is to be folded by masking, round the number of iterations N 3194 // up to a multiple of Step instead of rounding down. This is done by first 3195 // adding Step-1 and then rounding down. Note that it's ok if this addition 3196 // overflows: the vector induction variable will eventually wrap to zero given 3197 // that it starts at zero and its Step is a power of two; the loop will then 3198 // exit, with the last early-exit vector comparison also producing all-true. 
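  // For illustration (assuming a trip count of 10 and VF * UF = 4): TC is
  // rounded up to 13 below, the URem then leaves a remainder of 1, and the
  // vector trip count becomes 12, i.e. three masked vector iterations that
  // together cover all 10 scalar iterations.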
3199 if (Cost->foldTailByMasking()) { 3200 assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && 3201 "VF*UF must be a power of 2 when folding tail by masking"); 3202 assert(!VF.isScalable() && 3203 "Tail folding not yet supported for scalable vectors"); 3204 TC = Builder.CreateAdd( 3205 TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up"); 3206 } 3207 3208 // Now we need to generate the expression for the part of the loop that the 3209 // vectorized body will execute. This is equal to N - (N % Step) if scalar 3210 // iterations are not required for correctness, or N - Step, otherwise. Step 3211 // is equal to the vectorization factor (number of SIMD elements) times the 3212 // unroll factor (number of SIMD instructions). 3213 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); 3214 3215 // There are cases where we *must* run at least one iteration in the remainder 3216 // loop. See the cost model for when this can happen. If the step evenly 3217 // divides the trip count, we set the remainder to be equal to the step. If 3218 // the step does not evenly divide the trip count, no adjustment is necessary 3219 // since there will already be scalar iterations. Note that the minimum 3220 // iterations check ensures that N >= Step. 3221 if (Cost->requiresScalarEpilogue(VF)) { 3222 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); 3223 R = Builder.CreateSelect(IsZero, Step, R); 3224 } 3225 3226 VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); 3227 3228 return VectorTripCount; 3229 } 3230 3231 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, 3232 const DataLayout &DL) { 3233 // Verify that V is a vector type with same number of elements as DstVTy. 3234 auto *DstFVTy = cast<FixedVectorType>(DstVTy); 3235 unsigned VF = DstFVTy->getNumElements(); 3236 auto *SrcVecTy = cast<FixedVectorType>(V->getType()); 3237 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); 3238 Type *SrcElemTy = SrcVecTy->getElementType(); 3239 Type *DstElemTy = DstFVTy->getElementType(); 3240 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 3241 "Vector elements must have same size"); 3242 3243 // Do a direct cast if element types are castable. 3244 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 3245 return Builder.CreateBitOrPointerCast(V, DstFVTy); 3246 } 3247 // V cannot be directly casted to desired vector type. 3248 // May happen when V is a floating point vector but DstVTy is a vector of 3249 // pointers or vice-versa. Handle this using a two-step bitcast using an 3250 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 3251 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 3252 "Only one type should be a pointer type"); 3253 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 3254 "Only one type should be a floating point type"); 3255 Type *IntTy = 3256 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 3257 auto *VecIntTy = FixedVectorType::get(IntTy, VF); 3258 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 3259 return Builder.CreateBitOrPointerCast(CastVal, DstFVTy); 3260 } 3261 3262 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, 3263 BasicBlock *Bypass) { 3264 Value *Count = getOrCreateTripCount(L); 3265 // Reuse existing vector loop preheader for TC checks. 3266 // Note that new preheader block is generated for vector loop. 
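  // For illustration (a sketch, assuming a fixed VF of 4, UF = 2, an i64 trip
  // count and no tail folding), the check emitted below looks like:
  //   %min.iters.check = icmp ult i64 %count, 8
  //   br i1 %min.iters.check, label %scalar.ph, label %vector.ph
  // (ule instead of ult when a scalar epilogue is required).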
3267 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 3268 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 3269 3270 // Generate code to check if the loop's trip count is less than VF * UF, or 3271 // equal to it in case a scalar epilogue is required; this implies that the 3272 // vector trip count is zero. This check also covers the case where adding one 3273 // to the backedge-taken count overflowed leading to an incorrect trip count 3274 // of zero. In this case we will also jump to the scalar loop. 3275 auto P = Cost->requiresScalarEpilogue(VF) ? ICmpInst::ICMP_ULE 3276 : ICmpInst::ICMP_ULT; 3277 3278 // If tail is to be folded, vector loop takes care of all iterations. 3279 Value *CheckMinIters = Builder.getFalse(); 3280 if (!Cost->foldTailByMasking()) { 3281 Value *Step = 3282 createStepForVF(Builder, ConstantInt::get(Count->getType(), UF), VF); 3283 CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check"); 3284 } 3285 // Create new preheader for vector loop. 3286 LoopVectorPreHeader = 3287 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, 3288 "vector.ph"); 3289 3290 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 3291 DT->getNode(Bypass)->getIDom()) && 3292 "TC check is expected to dominate Bypass"); 3293 3294 // Update dominator for Bypass & LoopExit (if needed). 3295 DT->changeImmediateDominator(Bypass, TCCheckBlock); 3296 if (!Cost->requiresScalarEpilogue(VF)) 3297 // If there is an epilogue which must run, there's no edge from the 3298 // middle block to exit blocks and thus no need to update the immediate 3299 // dominator of the exit blocks. 3300 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 3301 3302 ReplaceInstWithInst( 3303 TCCheckBlock->getTerminator(), 3304 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 3305 LoopBypassBlocks.push_back(TCCheckBlock); 3306 } 3307 3308 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { 3309 3310 BasicBlock *const SCEVCheckBlock = 3311 RTChecks.emitSCEVChecks(L, Bypass, LoopVectorPreHeader, LoopExitBlock); 3312 if (!SCEVCheckBlock) 3313 return nullptr; 3314 3315 assert(!(SCEVCheckBlock->getParent()->hasOptSize() || 3316 (OptForSizeBasedOnProfile && 3317 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) && 3318 "Cannot SCEV check stride or overflow when optimizing for size"); 3319 3320 3321 // Update dominator only if this is first RT check. 3322 if (LoopBypassBlocks.empty()) { 3323 DT->changeImmediateDominator(Bypass, SCEVCheckBlock); 3324 if (!Cost->requiresScalarEpilogue(VF)) 3325 // If there is an epilogue which must run, there's no edge from the 3326 // middle block to exit blocks and thus no need to update the immediate 3327 // dominator of the exit blocks. 3328 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock); 3329 } 3330 3331 LoopBypassBlocks.push_back(SCEVCheckBlock); 3332 AddedSafetyChecks = true; 3333 return SCEVCheckBlock; 3334 } 3335 3336 BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, 3337 BasicBlock *Bypass) { 3338 // VPlan-native path does not do any analysis for runtime checks currently. 3339 if (EnableVPlanNativePath) 3340 return nullptr; 3341 3342 BasicBlock *const MemCheckBlock = 3343 RTChecks.emitMemRuntimeChecks(L, Bypass, LoopVectorPreHeader); 3344 3345 // Check if we generated code that checks in runtime if arrays overlap. We put 3346 // the checks into a separate block to make the more common case of few 3347 // elements faster. 
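  // The emitted checks compare the accessed address ranges of each pair of
  // possibly-aliasing pointer groups; roughly (a sketch):
  //   %bound0 = icmp ult %A.start, %B.end
  //   %bound1 = icmp ult %B.start, %A.end
  //   %found.conflict = and i1 %bound0, %bound1
  // and bypass the vector loop when any conflict is found.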
3348 if (!MemCheckBlock) 3349 return nullptr; 3350 3351 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) { 3352 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && 3353 "Cannot emit memory checks when optimizing for size, unless forced " 3354 "to vectorize."); 3355 ORE->emit([&]() { 3356 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize", 3357 L->getStartLoc(), L->getHeader()) 3358 << "Code-size may be reduced by not forcing " 3359 "vectorization, or by source-code modifications " 3360 "eliminating the need for runtime checks " 3361 "(e.g., adding 'restrict')."; 3362 }); 3363 } 3364 3365 LoopBypassBlocks.push_back(MemCheckBlock); 3366 3367 AddedSafetyChecks = true; 3368 3369 // We currently don't use LoopVersioning for the actual loop cloning but we 3370 // still use it to add the noalias metadata. 3371 LVer = std::make_unique<LoopVersioning>( 3372 *Legal->getLAI(), 3373 Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, 3374 DT, PSE.getSE()); 3375 LVer->prepareNoAliasMetadata(); 3376 return MemCheckBlock; 3377 } 3378 3379 Value *InnerLoopVectorizer::emitTransformedIndex( 3380 IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL, 3381 const InductionDescriptor &ID) const { 3382 3383 SCEVExpander Exp(*SE, DL, "induction"); 3384 auto Step = ID.getStep(); 3385 auto StartValue = ID.getStartValue(); 3386 assert(Index->getType()->getScalarType() == Step->getType() && 3387 "Index scalar type does not match StepValue type"); 3388 3389 // Note: the IR at this point is broken. We cannot use SE to create any new 3390 // SCEV and then expand it, hoping that SCEV's simplification will give us 3391 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may 3392 // lead to various SCEV crashes. So all we can do is to use builder and rely 3393 // on InstCombine for future simplifications. Here we handle some trivial 3394 // cases only. 3395 auto CreateAdd = [&B](Value *X, Value *Y) { 3396 assert(X->getType() == Y->getType() && "Types don't match!"); 3397 if (auto *CX = dyn_cast<ConstantInt>(X)) 3398 if (CX->isZero()) 3399 return Y; 3400 if (auto *CY = dyn_cast<ConstantInt>(Y)) 3401 if (CY->isZero()) 3402 return X; 3403 return B.CreateAdd(X, Y); 3404 }; 3405 3406 // We allow X to be a vector type, in which case Y will potentially be 3407 // splatted into a vector with the same element count. 3408 auto CreateMul = [&B](Value *X, Value *Y) { 3409 assert(X->getType()->getScalarType() == Y->getType() && 3410 "Types don't match!"); 3411 if (auto *CX = dyn_cast<ConstantInt>(X)) 3412 if (CX->isOne()) 3413 return Y; 3414 if (auto *CY = dyn_cast<ConstantInt>(Y)) 3415 if (CY->isOne()) 3416 return X; 3417 VectorType *XVTy = dyn_cast<VectorType>(X->getType()); 3418 if (XVTy && !isa<VectorType>(Y->getType())) 3419 Y = B.CreateVectorSplat(XVTy->getElementCount(), Y); 3420 return B.CreateMul(X, Y); 3421 }; 3422 3423 // Get a suitable insert point for SCEV expansion. For blocks in the vector 3424 // loop, choose the end of the vector loop header (=LoopVectorBody), because 3425 // the DomTree is not kept up-to-date for additional blocks generated in the 3426 // vector loop. By using the header as insertion point, we guarantee that the 3427 // expanded instructions dominate all their uses. 
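  // For illustration: for a hypothetical integer induction {5,+,3} (start 5,
  // step 3) and Index = 8, the code below computes 5 + 8 * 3 = 29; a pointer
  // induction applies the same scaled offset through a GEP from the start
  // value, and an FP induction uses fmul and fadd (or fsub) instead.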
3428 auto GetInsertPoint = [this, &B]() { 3429 BasicBlock *InsertBB = B.GetInsertPoint()->getParent(); 3430 if (InsertBB != LoopVectorBody && 3431 LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB)) 3432 return LoopVectorBody->getTerminator(); 3433 return &*B.GetInsertPoint(); 3434 }; 3435 3436 switch (ID.getKind()) { 3437 case InductionDescriptor::IK_IntInduction: { 3438 assert(!isa<VectorType>(Index->getType()) && 3439 "Vector indices not supported for integer inductions yet"); 3440 assert(Index->getType() == StartValue->getType() && 3441 "Index type does not match StartValue type"); 3442 if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne()) 3443 return B.CreateSub(StartValue, Index); 3444 auto *Offset = CreateMul( 3445 Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint())); 3446 return CreateAdd(StartValue, Offset); 3447 } 3448 case InductionDescriptor::IK_PtrInduction: { 3449 assert(isa<SCEVConstant>(Step) && 3450 "Expected constant step for pointer induction"); 3451 return B.CreateGEP( 3452 StartValue->getType()->getPointerElementType(), StartValue, 3453 CreateMul(Index, 3454 Exp.expandCodeFor(Step, Index->getType()->getScalarType(), 3455 GetInsertPoint()))); 3456 } 3457 case InductionDescriptor::IK_FpInduction: { 3458 assert(!isa<VectorType>(Index->getType()) && 3459 "Vector indices not supported for FP inductions yet"); 3460 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); 3461 auto InductionBinOp = ID.getInductionBinOp(); 3462 assert(InductionBinOp && 3463 (InductionBinOp->getOpcode() == Instruction::FAdd || 3464 InductionBinOp->getOpcode() == Instruction::FSub) && 3465 "Original bin op should be defined for FP induction"); 3466 3467 Value *StepValue = cast<SCEVUnknown>(Step)->getValue(); 3468 Value *MulExp = B.CreateFMul(StepValue, Index); 3469 return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, 3470 "induction"); 3471 } 3472 case InductionDescriptor::IK_NoInduction: 3473 return nullptr; 3474 } 3475 llvm_unreachable("invalid enum"); 3476 } 3477 3478 Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { 3479 LoopScalarBody = OrigLoop->getHeader(); 3480 LoopVectorPreHeader = OrigLoop->getLoopPreheader(); 3481 assert(LoopVectorPreHeader && "Invalid loop structure"); 3482 LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr 3483 assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF)) && 3484 "multiple exit loop without required epilogue?"); 3485 3486 LoopMiddleBlock = 3487 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 3488 LI, nullptr, Twine(Prefix) + "middle.block"); 3489 LoopScalarPreHeader = 3490 SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI, 3491 nullptr, Twine(Prefix) + "scalar.ph"); 3492 3493 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3494 3495 // Set up the middle block terminator. Two cases: 3496 // 1) If we know that we must execute the scalar epilogue, emit an 3497 // unconditional branch. 3498 // 2) Otherwise, we must have a single unique exit block (due to how we 3499 // implement the multiple exit case). In this case, set up a conditional 3500 // branch from the middle block to the loop scalar preheader, and the 3501 // exit block. completeLoopSkeleton will update the condition to use an 3502 // iteration check, if required to decide whether to execute the remainder. 3503 BranchInst *BrInst = Cost->requiresScalarEpilogue(VF) ?
BranchInst::Create(LoopScalarPreHeader) : 3505 BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, 3506 Builder.getTrue()); 3507 BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3508 ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst); 3509 3510 // We intentionally don't let SplitBlock update LoopInfo since 3511 // LoopVectorBody should belong to another loop than LoopVectorPreHeader. 3512 // LoopVectorBody is explicitly added to the correct place a few lines later. 3513 LoopVectorBody = 3514 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 3515 nullptr, nullptr, Twine(Prefix) + "vector.body"); 3516 3517 // Update dominator for loop exit. 3518 if (!Cost->requiresScalarEpilogue(VF)) 3519 // If there is an epilogue which must run, there's no edge from the 3520 // middle block to exit blocks and thus no need to update the immediate 3521 // dominator of the exit blocks. 3522 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); 3523 3524 // Create and register the new vector loop. 3525 Loop *Lp = LI->AllocateLoop(); 3526 Loop *ParentLoop = OrigLoop->getParentLoop(); 3527 3528 // Insert the new loop into the loop nest and register the new basic blocks 3529 // before calling any utilities such as SCEV that require valid LoopInfo. 3530 if (ParentLoop) { 3531 ParentLoop->addChildLoop(Lp); 3532 } else { 3533 LI->addTopLevelLoop(Lp); 3534 } 3535 Lp->addBasicBlockToLoop(LoopVectorBody, *LI); 3536 return Lp; 3537 } 3538 3539 void InnerLoopVectorizer::createInductionResumeValues( 3540 Loop *L, Value *VectorTripCount, 3541 std::pair<BasicBlock *, Value *> AdditionalBypass) { 3542 assert(VectorTripCount && L && "Expected valid arguments"); 3543 assert(((AdditionalBypass.first && AdditionalBypass.second) || 3544 (!AdditionalBypass.first && !AdditionalBypass.second)) && 3545 "Inconsistent information about additional bypass."); 3546 // We are going to resume the execution of the scalar loop. 3547 // Go over all of the induction variables that we found and fix the 3548 // PHIs that are left in the scalar version of the loop. 3549 // The starting values of PHI nodes depend on the counter of the last 3550 // iteration in the vectorized loop. 3551 // If we come from a bypass edge then we need to start from the original 3552 // start value. 3553 for (auto &InductionEntry : Legal->getInductionVars()) { 3554 PHINode *OrigPhi = InductionEntry.first; 3555 InductionDescriptor II = InductionEntry.second; 3556 3557 // Create phi nodes to merge from the backedge-taken check block. 3558 PHINode *BCResumeVal = 3559 PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", 3560 LoopScalarPreHeader->getTerminator()); 3561 // Copy original phi DL over to the new one. 3562 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); 3563 Value *&EndValue = IVEndValues[OrigPhi]; 3564 Value *EndValueFromAdditionalBypass = AdditionalBypass.second; 3565 if (OrigPhi == OldInduction) { 3566 // We know what the end value is. 3567 EndValue = VectorTripCount; 3568 } else { 3569 IRBuilder<> B(L->getLoopPreheader()->getTerminator()); 3570 3571 // Fast-math-flags propagate from the original induction instruction.
3572 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) 3573 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); 3574 3575 Type *StepType = II.getStep()->getType(); 3576 Instruction::CastOps CastOp = 3577 CastInst::getCastOpcode(VectorTripCount, true, StepType, true); 3578 Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd"); 3579 const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout(); 3580 EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3581 EndValue->setName("ind.end"); 3582 3583 // Compute the end value for the additional bypass (if applicable). 3584 if (AdditionalBypass.first) { 3585 B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt())); 3586 CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true, 3587 StepType, true); 3588 CRD = 3589 B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd"); 3590 EndValueFromAdditionalBypass = 3591 emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3592 EndValueFromAdditionalBypass->setName("ind.end"); 3593 } 3594 } 3595 // The new PHI merges the original incoming value, in case of a bypass, 3596 // or the value at the end of the vectorized loop. 3597 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); 3598 3599 // Fix the scalar body counter (PHI node). 3600 // The old induction's phi node in the scalar body needs the truncated 3601 // value. 3602 for (BasicBlock *BB : LoopBypassBlocks) 3603 BCResumeVal->addIncoming(II.getStartValue(), BB); 3604 3605 if (AdditionalBypass.first) 3606 BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first, 3607 EndValueFromAdditionalBypass); 3608 3609 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); 3610 } 3611 } 3612 3613 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L, 3614 MDNode *OrigLoopID) { 3615 assert(L && "Expected valid loop."); 3616 3617 // The trip counts should be cached by now. 3618 Value *Count = getOrCreateTripCount(L); 3619 Value *VectorTripCount = getOrCreateVectorTripCount(L); 3620 3621 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3622 3623 // Add a check in the middle block to see if we have completed 3624 // all of the iterations in the first vector loop. Three cases: 3625 // 1) If we require a scalar epilogue, there is no conditional branch as 3626 // we unconditionally branch to the scalar preheader. Do nothing. 3627 // 2) If (N - N%VF) == N, then we *don't* need to run the remainder. 3628 // Thus if tail is to be folded, we know we don't need to run the 3629 // remainder and we can use the previous value for the condition (true). 3630 // 3) Otherwise, construct a runtime check. 3631 if (!Cost->requiresScalarEpilogue(VF) && !Cost->foldTailByMasking()) { 3632 Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, 3633 Count, VectorTripCount, "cmp.n", 3634 LoopMiddleBlock->getTerminator()); 3635 3636 // Here we use the same DebugLoc as the scalar loop latch terminator instead 3637 // of the corresponding compare because they may have ended up with 3638 // different line numbers and we want to avoid awkward line stepping while 3639 // debugging. Eg. if the compare has got a line number inside the loop. 3640 CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3641 cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN); 3642 } 3643 3644 // Get ready to start creating new instructions into the vectorized body. 
3645 assert(LoopVectorPreHeader == L->getLoopPreheader() && 3646 "Inconsistent vector loop preheader"); 3647 Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); 3648 3649 Optional<MDNode *> VectorizedLoopID = 3650 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 3651 LLVMLoopVectorizeFollowupVectorized}); 3652 if (VectorizedLoopID.hasValue()) { 3653 L->setLoopID(VectorizedLoopID.getValue()); 3654 3655 // Do not setAlreadyVectorized if loop attributes have been defined 3656 // explicitly. 3657 return LoopVectorPreHeader; 3658 } 3659 3660 // Keep all loop hints from the original loop on the vector loop (we'll 3661 // replace the vectorizer-specific hints below). 3662 if (MDNode *LID = OrigLoop->getLoopID()) 3663 L->setLoopID(LID); 3664 3665 LoopVectorizeHints Hints(L, true, *ORE); 3666 Hints.setAlreadyVectorized(); 3667 3668 #ifdef EXPENSIVE_CHECKS 3669 assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 3670 LI->verify(*DT); 3671 #endif 3672 3673 return LoopVectorPreHeader; 3674 } 3675 3676 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { 3677 /* 3678 In this function we generate a new loop. The new loop will contain 3679 the vectorized instructions while the old loop will continue to run the 3680 scalar remainder. 3681 3682 [ ] <-- loop iteration number check. 3683 / | 3684 / v 3685 | [ ] <-- vector loop bypass (may consist of multiple blocks). 3686 | / | 3687 | / v 3688 || [ ] <-- vector pre header. 3689 |/ | 3690 | v 3691 | [ ] \ 3692 | [ ]_| <-- vector loop. 3693 | | 3694 | v 3695 \ -[ ] <--- middle-block. 3696 \/ | 3697 /\ v 3698 | ->[ ] <--- new preheader. 3699 | | 3700 (opt) v <-- edge from middle to exit iff epilogue is not required. 3701 | [ ] \ 3702 | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue). 3703 \ | 3704 \ v 3705 >[ ] <-- exit block(s). 3706 ... 3707 */ 3708 3709 // Get the metadata of the original loop before it gets modified. 3710 MDNode *OrigLoopID = OrigLoop->getLoopID(); 3711 3712 // Workaround! Compute the trip count of the original loop and cache it 3713 // before we start modifying the CFG. This code has a systemic problem 3714 // wherein it tries to run analysis over partially constructed IR; this is 3715 // wrong, and not simply for SCEV. The trip count of the original loop 3716 // simply happens to be prone to hitting this in practice. In theory, we 3717 // can hit the same issue for any SCEV, or ValueTracking query done during 3718 // mutation. See PR49900. 3719 getOrCreateTripCount(OrigLoop); 3720 3721 // Create an empty vector loop, and prepare basic blocks for the runtime 3722 // checks. 3723 Loop *Lp = createVectorLoopSkeleton(""); 3724 3725 // Now, compare the new count to zero. If it is zero skip the vector loop and 3726 // jump to the scalar loop. This check also covers the case where the 3727 // backedge-taken count is uint##_max: adding one to it will overflow leading 3728 // to an incorrect trip count of zero. In this (rare) case we will also jump 3729 // to the scalar loop. 3730 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader); 3731 3732 // Generate the code to check any assumptions that we've made for SCEV 3733 // expressions. 3734 emitSCEVChecks(Lp, LoopScalarPreHeader); 3735 3736 // Generate the code that checks in runtime if arrays overlap. We put the 3737 // checks into a separate block to make the more common case of few elements 3738 // faster. 
3739 emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 3740 3741 // Some loops have a single integer induction variable, while other loops 3742 // don't. One example is c++ iterators that often have multiple pointer 3743 // induction variables. In the code below we also support a case where we 3744 // don't have a single induction variable. 3745 // 3746 // We try to obtain an induction variable from the original loop as hard 3747 // as possible. However if we don't find one that: 3748 // - is an integer 3749 // - counts from zero, stepping by one 3750 // - is the size of the widest induction variable type 3751 // then we create a new one. 3752 OldInduction = Legal->getPrimaryInduction(); 3753 Type *IdxTy = Legal->getWidestInductionType(); 3754 Value *StartIdx = ConstantInt::get(IdxTy, 0); 3755 // The loop step is equal to the vectorization factor (num of SIMD elements) 3756 // times the unroll factor (num of SIMD instructions). 3757 Builder.SetInsertPoint(&*Lp->getHeader()->getFirstInsertionPt()); 3758 Value *Step = createStepForVF(Builder, ConstantInt::get(IdxTy, UF), VF); 3759 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 3760 Induction = 3761 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 3762 getDebugLocFromInstOrOperands(OldInduction)); 3763 3764 // Emit phis for the new starting index of the scalar loop. 3765 createInductionResumeValues(Lp, CountRoundDown); 3766 3767 return completeLoopSkeleton(Lp, OrigLoopID); 3768 } 3769 3770 // Fix up external users of the induction variable. At this point, we are 3771 // in LCSSA form, with all external PHIs that use the IV having one input value, 3772 // coming from the remainder loop. We need those PHIs to also have a correct 3773 // value for the IV when arriving directly from the middle block. 3774 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, 3775 const InductionDescriptor &II, 3776 Value *CountRoundDown, Value *EndValue, 3777 BasicBlock *MiddleBlock) { 3778 // There are two kinds of external IV usages - those that use the value 3779 // computed in the last iteration (the PHI) and those that use the penultimate 3780 // value (the value that feeds into the phi from the loop latch). 3781 // We allow both, but they, obviously, have different values. 3782 3783 assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block"); 3784 3785 DenseMap<Value *, Value *> MissingVals; 3786 3787 // An external user of the last iteration's value should see the value that 3788 // the remainder loop uses to initialize its own IV. 3789 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch()); 3790 for (User *U : PostInc->users()) { 3791 Instruction *UI = cast<Instruction>(U); 3792 if (!OrigLoop->contains(UI)) { 3793 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3794 MissingVals[UI] = EndValue; 3795 } 3796 } 3797 3798 // An external user of the penultimate value need to see EndValue - Step. 3799 // The simplest way to get this is to recompute it from the constituent SCEVs, 3800 // that is Start + (Step * (CRD - 1)). 3801 for (User *U : OrigPhi->users()) { 3802 auto *UI = cast<Instruction>(U); 3803 if (!OrigLoop->contains(UI)) { 3804 const DataLayout &DL = 3805 OrigLoop->getHeader()->getModule()->getDataLayout(); 3806 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3807 3808 IRBuilder<> B(MiddleBlock->getTerminator()); 3809 3810 // Fast-math-flags propagate from the original induction instruction. 
3811 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) 3812 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); 3813 3814 Value *CountMinusOne = B.CreateSub( 3815 CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1)); 3816 Value *CMO = 3817 !II.getStep()->getType()->isIntegerTy() 3818 ? B.CreateCast(Instruction::SIToFP, CountMinusOne, 3819 II.getStep()->getType()) 3820 : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType()); 3821 CMO->setName("cast.cmo"); 3822 Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II); 3823 Escape->setName("ind.escape"); 3824 MissingVals[UI] = Escape; 3825 } 3826 } 3827 3828 for (auto &I : MissingVals) { 3829 PHINode *PHI = cast<PHINode>(I.first); 3830 // One corner case we have to handle is two IVs "chasing" each-other, 3831 // that is %IV2 = phi [...], [ %IV1, %latch ] 3832 // In this case, if IV1 has an external use, we need to avoid adding both 3833 // "last value of IV1" and "penultimate value of IV2". So, verify that we 3834 // don't already have an incoming value for the middle block. 3835 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) 3836 PHI->addIncoming(I.second, MiddleBlock); 3837 } 3838 } 3839 3840 namespace { 3841 3842 struct CSEDenseMapInfo { 3843 static bool canHandle(const Instruction *I) { 3844 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || 3845 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); 3846 } 3847 3848 static inline Instruction *getEmptyKey() { 3849 return DenseMapInfo<Instruction *>::getEmptyKey(); 3850 } 3851 3852 static inline Instruction *getTombstoneKey() { 3853 return DenseMapInfo<Instruction *>::getTombstoneKey(); 3854 } 3855 3856 static unsigned getHashValue(const Instruction *I) { 3857 assert(canHandle(I) && "Unknown instruction!"); 3858 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), 3859 I->value_op_end())); 3860 } 3861 3862 static bool isEqual(const Instruction *LHS, const Instruction *RHS) { 3863 if (LHS == getEmptyKey() || RHS == getEmptyKey() || 3864 LHS == getTombstoneKey() || RHS == getTombstoneKey()) 3865 return LHS == RHS; 3866 return LHS->isIdenticalTo(RHS); 3867 } 3868 }; 3869 3870 } // end anonymous namespace 3871 3872 ///Perform cse of induction variable instructions. 3873 static void cse(BasicBlock *BB) { 3874 // Perform simple cse. 3875 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 3876 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) { 3877 Instruction *In = &*I++; 3878 3879 if (!CSEDenseMapInfo::canHandle(In)) 3880 continue; 3881 3882 // Check if we can replace this instruction with any of the 3883 // visited instructions. 3884 if (Instruction *V = CSEMap.lookup(In)) { 3885 In->replaceAllUsesWith(V); 3886 In->eraseFromParent(); 3887 continue; 3888 } 3889 3890 CSEMap[In] = In; 3891 } 3892 } 3893 3894 InstructionCost 3895 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF, 3896 bool &NeedToScalarize) const { 3897 Function *F = CI->getCalledFunction(); 3898 Type *ScalarRetTy = CI->getType(); 3899 SmallVector<Type *, 4> Tys, ScalarTys; 3900 for (auto &ArgOp : CI->arg_operands()) 3901 ScalarTys.push_back(ArgOp->getType()); 3902 3903 // Estimate cost of scalarized vector call. The source operands are assumed 3904 // to be vectors, so we need to extract individual elements from there, 3905 // execute VF scalar calls, and then gather the result into the vector return 3906 // value. 
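  // For example (a sketch with made-up costs): with VF = 4, a scalar call cost
  // of 10 and a scalarization overhead of 6, the scalarized estimate below is
  // 4 * 10 + 6 = 46; if the VFDatabase provides a vector variant whose call
  // cost is lower, NeedToScalarize is cleared and that cost is returned
  // instead.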
3907 InstructionCost ScalarCallCost = 3908 TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput); 3909 if (VF.isScalar()) 3910 return ScalarCallCost; 3911 3912 // Compute corresponding vector type for return value and arguments. 3913 Type *RetTy = ToVectorTy(ScalarRetTy, VF); 3914 for (Type *ScalarTy : ScalarTys) 3915 Tys.push_back(ToVectorTy(ScalarTy, VF)); 3916 3917 // Compute costs of unpacking argument values for the scalar calls and 3918 // packing the return values to a vector. 3919 InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF); 3920 3921 InstructionCost Cost = 3922 ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost; 3923 3924 // If we can't emit a vector call for this function, then the currently found 3925 // cost is the cost we need to return. 3926 NeedToScalarize = true; 3927 VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 3928 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); 3929 3930 if (!TLI || CI->isNoBuiltin() || !VecFunc) 3931 return Cost; 3932 3933 // If the corresponding vector cost is cheaper, return its cost. 3934 InstructionCost VectorCallCost = 3935 TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput); 3936 if (VectorCallCost < Cost) { 3937 NeedToScalarize = false; 3938 Cost = VectorCallCost; 3939 } 3940 return Cost; 3941 } 3942 3943 static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) { 3944 if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy())) 3945 return Elt; 3946 return VectorType::get(Elt, VF); 3947 } 3948 3949 InstructionCost 3950 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, 3951 ElementCount VF) const { 3952 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 3953 assert(ID && "Expected intrinsic call!"); 3954 Type *RetTy = MaybeVectorizeType(CI->getType(), VF); 3955 FastMathFlags FMF; 3956 if (auto *FPMO = dyn_cast<FPMathOperator>(CI)) 3957 FMF = FPMO->getFastMathFlags(); 3958 3959 SmallVector<const Value *> Arguments(CI->arg_begin(), CI->arg_end()); 3960 FunctionType *FTy = CI->getCalledFunction()->getFunctionType(); 3961 SmallVector<Type *> ParamTys; 3962 std::transform(FTy->param_begin(), FTy->param_end(), 3963 std::back_inserter(ParamTys), 3964 [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); }); 3965 3966 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF, 3967 dyn_cast<IntrinsicInst>(CI)); 3968 return TTI.getIntrinsicInstrCost(CostAttrs, 3969 TargetTransformInfo::TCK_RecipThroughput); 3970 } 3971 3972 static Type *smallestIntegerVectorType(Type *T1, Type *T2) { 3973 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3974 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3975 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; 3976 } 3977 3978 static Type *largestIntegerVectorType(Type *T1, Type *T2) { 3979 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3980 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3981 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; 3982 } 3983 3984 void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) { 3985 // For every instruction `I` in MinBWs, truncate the operands, create a 3986 // truncated version of `I` and reextend its result. InstCombine runs 3987 // later and will remove any ext/trunc pairs. 
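  // For illustration (a sketch, assuming VF = 4 and that MinBWs records that
  // an i32 add only needs 8 bits), the rewrite below produces:
  //   %a.tr = trunc <4 x i32> %a to <4 x i8>
  //   %b.tr = trunc <4 x i32> %b to <4 x i8>
  //   %add  = add <4 x i8> %a.tr, %b.tr
  //   %res  = zext <4 x i8> %add to <4 x i32>
  // Wrap flags are intentionally dropped on the narrowed operation.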
3988 SmallPtrSet<Value *, 4> Erased; 3989 for (const auto &KV : Cost->getMinimalBitwidths()) { 3990 // If the value wasn't vectorized, we must maintain the original scalar 3991 // type. The absence of the value from State indicates that it 3992 // wasn't vectorized. 3993 VPValue *Def = State.Plan->getVPValue(KV.first); 3994 if (!State.hasAnyVectorValue(Def)) 3995 continue; 3996 for (unsigned Part = 0; Part < UF; ++Part) { 3997 Value *I = State.get(Def, Part); 3998 if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I)) 3999 continue; 4000 Type *OriginalTy = I->getType(); 4001 Type *ScalarTruncatedTy = 4002 IntegerType::get(OriginalTy->getContext(), KV.second); 4003 auto *TruncatedTy = VectorType::get( 4004 ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getElementCount()); 4005 if (TruncatedTy == OriginalTy) 4006 continue; 4007 4008 IRBuilder<> B(cast<Instruction>(I)); 4009 auto ShrinkOperand = [&](Value *V) -> Value * { 4010 if (auto *ZI = dyn_cast<ZExtInst>(V)) 4011 if (ZI->getSrcTy() == TruncatedTy) 4012 return ZI->getOperand(0); 4013 return B.CreateZExtOrTrunc(V, TruncatedTy); 4014 }; 4015 4016 // The actual instruction modification depends on the instruction type, 4017 // unfortunately. 4018 Value *NewI = nullptr; 4019 if (auto *BO = dyn_cast<BinaryOperator>(I)) { 4020 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)), 4021 ShrinkOperand(BO->getOperand(1))); 4022 4023 // Any wrapping introduced by shrinking this operation shouldn't be 4024 // considered undefined behavior. So, we can't unconditionally copy 4025 // arithmetic wrapping flags to NewI. 4026 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false); 4027 } else if (auto *CI = dyn_cast<ICmpInst>(I)) { 4028 NewI = 4029 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)), 4030 ShrinkOperand(CI->getOperand(1))); 4031 } else if (auto *SI = dyn_cast<SelectInst>(I)) { 4032 NewI = B.CreateSelect(SI->getCondition(), 4033 ShrinkOperand(SI->getTrueValue()), 4034 ShrinkOperand(SI->getFalseValue())); 4035 } else if (auto *CI = dyn_cast<CastInst>(I)) { 4036 switch (CI->getOpcode()) { 4037 default: 4038 llvm_unreachable("Unhandled cast!"); 4039 case Instruction::Trunc: 4040 NewI = ShrinkOperand(CI->getOperand(0)); 4041 break; 4042 case Instruction::SExt: 4043 NewI = B.CreateSExtOrTrunc( 4044 CI->getOperand(0), 4045 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 4046 break; 4047 case Instruction::ZExt: 4048 NewI = B.CreateZExtOrTrunc( 4049 CI->getOperand(0), 4050 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 4051 break; 4052 } 4053 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { 4054 auto Elements0 = 4055 cast<VectorType>(SI->getOperand(0)->getType())->getElementCount(); 4056 auto *O0 = B.CreateZExtOrTrunc( 4057 SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0)); 4058 auto Elements1 = 4059 cast<VectorType>(SI->getOperand(1)->getType())->getElementCount(); 4060 auto *O1 = B.CreateZExtOrTrunc( 4061 SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1)); 4062 4063 NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask()); 4064 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) { 4065 // Don't do anything with the operands, just extend the result. 
4066 continue; 4067 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { 4068 auto Elements = 4069 cast<VectorType>(IE->getOperand(0)->getType())->getElementCount(); 4070 auto *O0 = B.CreateZExtOrTrunc( 4071 IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 4072 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); 4073 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); 4074 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { 4075 auto Elements = 4076 cast<VectorType>(EE->getOperand(0)->getType())->getElementCount(); 4077 auto *O0 = B.CreateZExtOrTrunc( 4078 EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 4079 NewI = B.CreateExtractElement(O0, EE->getOperand(2)); 4080 } else { 4081 // If we don't know what to do, be conservative and don't do anything. 4082 continue; 4083 } 4084 4085 // Lastly, extend the result. 4086 NewI->takeName(cast<Instruction>(I)); 4087 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); 4088 I->replaceAllUsesWith(Res); 4089 cast<Instruction>(I)->eraseFromParent(); 4090 Erased.insert(I); 4091 State.reset(Def, Res, Part); 4092 } 4093 } 4094 4095 // We'll have created a bunch of ZExts that are now parentless. Clean up. 4096 for (const auto &KV : Cost->getMinimalBitwidths()) { 4097 // If the value wasn't vectorized, we must maintain the original scalar 4098 // type. The absence of the value from State indicates that it 4099 // wasn't vectorized. 4100 VPValue *Def = State.Plan->getVPValue(KV.first); 4101 if (!State.hasAnyVectorValue(Def)) 4102 continue; 4103 for (unsigned Part = 0; Part < UF; ++Part) { 4104 Value *I = State.get(Def, Part); 4105 ZExtInst *Inst = dyn_cast<ZExtInst>(I); 4106 if (Inst && Inst->use_empty()) { 4107 Value *NewI = Inst->getOperand(0); 4108 Inst->eraseFromParent(); 4109 State.reset(Def, NewI, Part); 4110 } 4111 } 4112 } 4113 } 4114 4115 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) { 4116 // Insert truncates and extends for any truncated instructions as hints to 4117 // InstCombine. 4118 if (VF.isVector()) 4119 truncateToMinimalBitwidths(State); 4120 4121 // Fix widened non-induction PHIs by setting up the PHI operands. 4122 if (OrigPHIsToFix.size()) { 4123 assert(EnableVPlanNativePath && 4124 "Unexpected non-induction PHIs for fixup in non VPlan-native path"); 4125 fixNonInductionPHIs(State); 4126 } 4127 4128 // At this point every instruction in the original loop is widened to a 4129 // vector form. Now we need to fix the recurrences in the loop. These PHI 4130 // nodes are currently empty because we did not want to introduce cycles. 4131 // This is the second stage of vectorizing recurrences. 4132 fixCrossIterationPHIs(State); 4133 4134 // Forget the original basic block. 4135 PSE.getSE()->forgetLoop(OrigLoop); 4136 4137 // If we inserted an edge from the middle block to the unique exit block, 4138 // update uses outside the loop (phis) to account for the newly inserted 4139 // edge. 4140 if (!Cost->requiresScalarEpilogue(VF)) { 4141 // Fix-up external users of the induction variables. 4142 for (auto &Entry : Legal->getInductionVars()) 4143 fixupIVUsers(Entry.first, Entry.second, 4144 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)), 4145 IVEndValues[Entry.first], LoopMiddleBlock); 4146 4147 fixLCSSAPHIs(State); 4148 } 4149 4150 for (Instruction *PI : PredicatedInstructions) 4151 sinkScalarOperands(&*PI); 4152 4153 // Remove redundant induction instructions. 
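// (Widening and unrolling can leave behind identical insertelement,
// extractelement, shufflevector and getelementptr instructions; the simple
// CSE below folds such duplicates.)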
4154 cse(LoopVectorBody);
4155
4156 // Set/update profile weights for the vector and remainder loops as original
4157 // loop iterations are now distributed among them. Note that original loop
4158 // represented by LoopScalarBody becomes remainder loop after vectorization.
4159 //
4160 // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
4161 // end up getting a slightly roughened result but that should be OK since
4162 // profile is not inherently precise anyway. Note also that a possible bypass of
4163 // vector code caused by legality checks is ignored, assigning all the weight
4164 // to the vector loop, optimistically.
4165 //
4166 // For scalable vectorization we can't know at compile time how many iterations
4167 // of the loop are handled in one vector iteration, so instead assume a pessimistic
4168 // vscale of '1'.
4169 setProfileInfoAfterUnrolling(
4170 LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody),
4171 LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF);
4172 }
4173
4174 void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) {
4175 // In order to support recurrences we need to be able to vectorize Phi nodes.
4176 // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4177 // stage #2: We now need to fix the recurrences by adding incoming edges to
4178 // the currently empty PHI nodes. At this point every instruction in the
4179 // original loop is widened to a vector form so we can use them to construct
4180 // the incoming edges.
4181 VPBasicBlock *Header = State.Plan->getEntry()->getEntryBasicBlock();
4182 for (VPRecipeBase &R : Header->phis()) {
4183 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R))
4184 fixReduction(ReductionPhi, State);
4185 else if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
4186 fixFirstOrderRecurrence(FOR, State);
4187 }
4188 }
4189
4190 void InnerLoopVectorizer::fixFirstOrderRecurrence(VPWidenPHIRecipe *PhiR,
4191 VPTransformState &State) {
4192 // This is the second phase of vectorizing first-order recurrences. An
4193 // overview of the transformation is described below. Suppose we have the
4194 // following loop.
4195 //
4196 // for (int i = 0; i < n; ++i)
4197 // b[i] = a[i] - a[i - 1];
4198 //
4199 // There is a first-order recurrence on "a". For this loop, the shorthand
4200 // scalar IR looks like:
4201 //
4202 // scalar.ph:
4203 // s_init = a[-1]
4204 // br scalar.body
4205 //
4206 // scalar.body:
4207 // i = phi [0, scalar.ph], [i+1, scalar.body]
4208 // s1 = phi [s_init, scalar.ph], [s2, scalar.body]
4209 // s2 = a[i]
4210 // b[i] = s2 - s1
4211 // br cond, scalar.body, ...
4212 //
4213 // In this example, s1 is a recurrence because its value depends on the
4214 // previous iteration. In the first phase of vectorization, we created a
4215 // vector phi v1 for s1. We now complete the vectorization and produce the
4216 // shorthand vector IR shown below (for VF = 4, UF = 1).
4217 // 4218 // vector.ph: 4219 // v_init = vector(..., ..., ..., a[-1]) 4220 // br vector.body 4221 // 4222 // vector.body 4223 // i = phi [0, vector.ph], [i+4, vector.body] 4224 // v1 = phi [v_init, vector.ph], [v2, vector.body] 4225 // v2 = a[i, i+1, i+2, i+3]; 4226 // v3 = vector(v1(3), v2(0, 1, 2)) 4227 // b[i, i+1, i+2, i+3] = v2 - v3 4228 // br cond, vector.body, middle.block 4229 // 4230 // middle.block: 4231 // x = v2(3) 4232 // br scalar.ph 4233 // 4234 // scalar.ph: 4235 // s_init = phi [x, middle.block], [a[-1], otherwise] 4236 // br scalar.body 4237 // 4238 // After execution completes the vector loop, we extract the next value of 4239 // the recurrence (x) to use as the initial value in the scalar loop. 4240 4241 // Extract the last vector element in the middle block. This will be the 4242 // initial value for the recurrence when jumping to the scalar loop. 4243 VPValue *PreviousDef = PhiR->getBackedgeValue(); 4244 Value *Incoming = State.get(PreviousDef, UF - 1); 4245 auto *ExtractForScalar = Incoming; 4246 auto *IdxTy = Builder.getInt32Ty(); 4247 if (VF.isVector()) { 4248 auto *One = ConstantInt::get(IdxTy, 1); 4249 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4250 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF); 4251 auto *LastIdx = Builder.CreateSub(RuntimeVF, One); 4252 ExtractForScalar = Builder.CreateExtractElement(ExtractForScalar, LastIdx, 4253 "vector.recur.extract"); 4254 } 4255 // Extract the second last element in the middle block if the 4256 // Phi is used outside the loop. We need to extract the phi itself 4257 // and not the last element (the phi update in the current iteration). This 4258 // will be the value when jumping to the exit block from the LoopMiddleBlock, 4259 // when the scalar loop is not run at all. 4260 Value *ExtractForPhiUsedOutsideLoop = nullptr; 4261 if (VF.isVector()) { 4262 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF); 4263 auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2)); 4264 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement( 4265 Incoming, Idx, "vector.recur.extract.for.phi"); 4266 } else if (UF > 1) 4267 // When loop is unrolled without vectorizing, initialize 4268 // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value 4269 // of `Incoming`. This is analogous to the vectorized case above: extracting 4270 // the second last element when VF > 1. 4271 ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2); 4272 4273 // Fix the initial value of the original recurrence in the scalar loop. 4274 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin()); 4275 PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue()); 4276 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init"); 4277 auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue(); 4278 for (auto *BB : predecessors(LoopScalarPreHeader)) { 4279 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit; 4280 Start->addIncoming(Incoming, BB); 4281 } 4282 4283 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start); 4284 Phi->setName("scalar.recur"); 4285 4286 // Finally, fix users of the recurrence outside the loop. The users will need 4287 // either the last value of the scalar recurrence or the last value of the 4288 // vector recurrence we extracted in the middle block. Since the loop is in 4289 // LCSSA form, we just need to find all the phi nodes for the original scalar 4290 // recurrence in the exit block, and then add an edge for the middle block. 
4291 // Note that LCSSA does not imply single entry when the original scalar loop
4292 // had multiple exiting edges (as we always run the last iteration in the
4293 // scalar epilogue); in that case, there is no edge from middle to exit
4294 // and thus no phis which need to be updated.
4295 if (!Cost->requiresScalarEpilogue(VF))
4296 for (PHINode &LCSSAPhi : LoopExitBlock->phis())
4297 if (any_of(LCSSAPhi.incoming_values(),
4298 [Phi](Value *V) { return V == Phi; }))
4299 LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
4300 }
4301
4302 void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
4303 VPTransformState &State) {
4304 PHINode *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
4305 // Get its reduction variable descriptor.
4306 assert(Legal->isReductionVariable(OrigPhi) &&
4307 "Unable to find the reduction variable");
4308 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
4309
4310 RecurKind RK = RdxDesc.getRecurrenceKind();
4311 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
4312 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
4313 setDebugLocFromInst(ReductionStartValue);
4314
4315 VPValue *LoopExitInstDef = PhiR->getBackedgeValue();
4316 // This is the vector-clone of the value that leaves the loop.
4317 Type *VecTy = State.get(LoopExitInstDef, 0)->getType();
4318
4319 // Wrap flags are in general invalid after vectorization, clear them.
4320 clearReductionWrapFlags(RdxDesc, State);
4321
4322 // Before each round, move the insertion point right between
4323 // the PHIs and the values we are going to write.
4324 // This allows us to write both PHINodes and the extractelement
4325 // instructions.
4326 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
4327
4328 setDebugLocFromInst(LoopExitInst);
4329
4330 Type *PhiTy = OrigPhi->getType();
4331 // If tail is folded by masking, the vector value to leave the loop should be
4332 // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
4333 // instead of the former. For an inloop reduction the reduction will already
4334 // be predicated, and does not need to be handled here.
4335 if (Cost->foldTailByMasking() && !PhiR->isInLoop()) {
4336 for (unsigned Part = 0; Part < UF; ++Part) {
4337 Value *VecLoopExitInst = State.get(LoopExitInstDef, Part);
4338 Value *Sel = nullptr;
4339 for (User *U : VecLoopExitInst->users()) {
4340 if (isa<SelectInst>(U)) {
4341 assert(!Sel && "Reduction exit feeding two selects");
4342 Sel = U;
4343 } else
4344 assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
4345 }
4346 assert(Sel && "Reduction exit feeds no select");
4347 State.reset(LoopExitInstDef, Sel, Part);
4348
4349 // If the target can create a predicated operator for the reduction at no
4350 // extra cost in the loop (for example a predicated vadd), it can be
4351 // cheaper for the select to remain in the loop than be sunk out of it,
4352 // and so use the select value for the phi instead of the old
4353 // LoopExitValue.
4354 if (PreferPredicatedReductionSelect || 4355 TTI->preferPredicatedReductionSelect( 4356 RdxDesc.getOpcode(), PhiTy, 4357 TargetTransformInfo::ReductionFlags())) { 4358 auto *VecRdxPhi = 4359 cast<PHINode>(State.get(PhiR->getVPSingleValue(), Part)); 4360 VecRdxPhi->setIncomingValueForBlock( 4361 LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel); 4362 } 4363 } 4364 } 4365 4366 // If the vector reduction can be performed in a smaller type, we truncate 4367 // then extend the loop exit value to enable InstCombine to evaluate the 4368 // entire expression in the smaller type. 4369 if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) { 4370 assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!"); 4371 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); 4372 Builder.SetInsertPoint( 4373 LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator()); 4374 VectorParts RdxParts(UF); 4375 for (unsigned Part = 0; Part < UF; ++Part) { 4376 RdxParts[Part] = State.get(LoopExitInstDef, Part); 4377 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4378 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy) 4379 : Builder.CreateZExt(Trunc, VecTy); 4380 for (Value::user_iterator UI = RdxParts[Part]->user_begin(); 4381 UI != RdxParts[Part]->user_end();) 4382 if (*UI != Trunc) { 4383 (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd); 4384 RdxParts[Part] = Extnd; 4385 } else { 4386 ++UI; 4387 } 4388 } 4389 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 4390 for (unsigned Part = 0; Part < UF; ++Part) { 4391 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4392 State.reset(LoopExitInstDef, RdxParts[Part], Part); 4393 } 4394 } 4395 4396 // Reduce all of the unrolled parts into a single vector. 4397 Value *ReducedPartRdx = State.get(LoopExitInstDef, 0); 4398 unsigned Op = RecurrenceDescriptor::getOpcode(RK); 4399 4400 // The middle block terminator has already been assigned a DebugLoc here (the 4401 // OrigLoop's single latch terminator). We want the whole middle block to 4402 // appear to execute on this line because: (a) it is all compiler generated, 4403 // (b) these instructions are always executed after evaluating the latch 4404 // conditional branch, and (c) other passes may add new predecessors which 4405 // terminate on this line. This is the easiest way to ensure we don't 4406 // accidentally cause an extra step back into the loop while debugging. 4407 setDebugLocFromInst(LoopMiddleBlock->getTerminator()); 4408 if (PhiR->isOrdered()) 4409 ReducedPartRdx = State.get(LoopExitInstDef, UF - 1); 4410 else { 4411 // Floating-point operations should have some FMF to enable the reduction. 4412 IRBuilderBase::FastMathFlagGuard FMFG(Builder); 4413 Builder.setFastMathFlags(RdxDesc.getFastMathFlags()); 4414 for (unsigned Part = 1; Part < UF; ++Part) { 4415 Value *RdxPart = State.get(LoopExitInstDef, Part); 4416 if (Op != Instruction::ICmp && Op != Instruction::FCmp) { 4417 ReducedPartRdx = Builder.CreateBinOp( 4418 (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx"); 4419 } else { 4420 ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart); 4421 } 4422 } 4423 } 4424 4425 // Create the reduction after the loop. Note that inloop reductions create the 4426 // target reduction in the loop using a Reduction recipe. 
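// For example, for an integer add reduction with UF = 2, the two unrolled
// parts were just combined with a 'bin.rdx' add above; the call below then
// emits a horizontal reduction (e.g. llvm.vector.reduce.add) of the remaining
// vector to obtain the final scalar value.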
4427 if (VF.isVector() && !PhiR->isInLoop()) {
4428 ReducedPartRdx =
4429 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx);
4430 // If the reduction can be performed in a smaller type, we need to extend
4431 // the reduction to the wider type before we branch to the original loop.
4432 if (PhiTy != RdxDesc.getRecurrenceType())
4433 ReducedPartRdx = RdxDesc.isSigned()
4434 ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
4435 : Builder.CreateZExt(ReducedPartRdx, PhiTy);
4436 }
4437
4438 // Create a phi node that merges control-flow from the backedge-taken check
4439 // block and the middle block.
4440 PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx",
4441 LoopScalarPreHeader->getTerminator());
4442 for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
4443 BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
4444 BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
4445
4446 // Now, we need to fix the users of the reduction variable
4447 // inside and outside of the scalar remainder loop.
4448
4449 // We know that the loop is in LCSSA form. We need to update the PHI nodes
4450 // in the exit blocks. See comment on analogous loop in
4451 // fixFirstOrderRecurrence for a more complete explanation of the logic.
4452 if (!Cost->requiresScalarEpilogue(VF))
4453 for (PHINode &LCSSAPhi : LoopExitBlock->phis())
4454 if (any_of(LCSSAPhi.incoming_values(),
4455 [LoopExitInst](Value *V) { return V == LoopExitInst; }))
4456 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
4457
4458 // Fix the scalar loop reduction variable with the incoming reduction sum
4459 // from the vector body and from the backedge value.
4460 int IncomingEdgeBlockIdx =
4461 OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch());
4462 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
4463 // Pick the other block.
4464 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
4465 OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
4466 OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
4467 }
4468
4469 void InnerLoopVectorizer::clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc,
4470 VPTransformState &State) {
4471 RecurKind RK = RdxDesc.getRecurrenceKind();
4472 if (RK != RecurKind::Add && RK != RecurKind::Mul)
4473 return;
4474
4475 Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr();
4476 assert(LoopExitInstr && "null loop exit instruction");
4477 SmallVector<Instruction *, 8> Worklist;
4478 SmallPtrSet<Instruction *, 8> Visited;
4479 Worklist.push_back(LoopExitInstr);
4480 Visited.insert(LoopExitInstr);
4481
4482 while (!Worklist.empty()) {
4483 Instruction *Cur = Worklist.pop_back_val();
4484 if (isa<OverflowingBinaryOperator>(Cur))
4485 for (unsigned Part = 0; Part < UF; ++Part) {
4486 Value *V = State.get(State.Plan->getVPValue(Cur), Part);
4487 cast<Instruction>(V)->dropPoisonGeneratingFlags();
4488 }
4489
4490 for (User *U : Cur->users()) {
4491 Instruction *UI = cast<Instruction>(U);
4492 if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) &&
4493 Visited.insert(UI).second)
4494 Worklist.push_back(UI);
4495 }
4496 }
4497 }
4498
4499 void InnerLoopVectorizer::fixLCSSAPHIs(VPTransformState &State) {
4500 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
4501 if (LCSSAPhi.getBasicBlockIndex(LoopMiddleBlock) != -1)
4502 // Some phis were already hand updated by the reduction and recurrence
4503 // code above, leave them alone.
4504 continue; 4505 4506 auto *IncomingValue = LCSSAPhi.getIncomingValue(0); 4507 // Non-instruction incoming values will have only one value. 4508 4509 VPLane Lane = VPLane::getFirstLane(); 4510 if (isa<Instruction>(IncomingValue) && 4511 !Cost->isUniformAfterVectorization(cast<Instruction>(IncomingValue), 4512 VF)) 4513 Lane = VPLane::getLastLaneForVF(VF); 4514 4515 // Can be a loop invariant incoming value or the last scalar value to be 4516 // extracted from the vectorized loop. 4517 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4518 Value *lastIncomingValue = 4519 OrigLoop->isLoopInvariant(IncomingValue) 4520 ? IncomingValue 4521 : State.get(State.Plan->getVPValue(IncomingValue), 4522 VPIteration(UF - 1, Lane)); 4523 LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock); 4524 } 4525 } 4526 4527 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { 4528 // The basic block and loop containing the predicated instruction. 4529 auto *PredBB = PredInst->getParent(); 4530 auto *VectorLoop = LI->getLoopFor(PredBB); 4531 4532 // Initialize a worklist with the operands of the predicated instruction. 4533 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end()); 4534 4535 // Holds instructions that we need to analyze again. An instruction may be 4536 // reanalyzed if we don't yet know if we can sink it or not. 4537 SmallVector<Instruction *, 8> InstsToReanalyze; 4538 4539 // Returns true if a given use occurs in the predicated block. Phi nodes use 4540 // their operands in their corresponding predecessor blocks. 4541 auto isBlockOfUsePredicated = [&](Use &U) -> bool { 4542 auto *I = cast<Instruction>(U.getUser()); 4543 BasicBlock *BB = I->getParent(); 4544 if (auto *Phi = dyn_cast<PHINode>(I)) 4545 BB = Phi->getIncomingBlock( 4546 PHINode::getIncomingValueNumForOperand(U.getOperandNo())); 4547 return BB == PredBB; 4548 }; 4549 4550 // Iteratively sink the scalarized operands of the predicated instruction 4551 // into the block we created for it. When an instruction is sunk, it's 4552 // operands are then added to the worklist. The algorithm ends after one pass 4553 // through the worklist doesn't sink a single instruction. 4554 bool Changed; 4555 do { 4556 // Add the instructions that need to be reanalyzed to the worklist, and 4557 // reset the changed indicator. 4558 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end()); 4559 InstsToReanalyze.clear(); 4560 Changed = false; 4561 4562 while (!Worklist.empty()) { 4563 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val()); 4564 4565 // We can't sink an instruction if it is a phi node, is not in the loop, 4566 // or may have side effects. 4567 if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) || 4568 I->mayHaveSideEffects()) 4569 continue; 4570 4571 // If the instruction is already in PredBB, check if we can sink its 4572 // operands. In that case, VPlan's sinkScalarOperands() succeeded in 4573 // sinking the scalar instruction I, hence it appears in PredBB; but it 4574 // may have failed to sink I's operands (recursively), which we try 4575 // (again) here. 4576 if (I->getParent() == PredBB) { 4577 Worklist.insert(I->op_begin(), I->op_end()); 4578 continue; 4579 } 4580 4581 // It's legal to sink the instruction if all its uses occur in the 4582 // predicated block. Otherwise, there's nothing to do yet, and we may 4583 // need to reanalyze the instruction. 
4584 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) { 4585 InstsToReanalyze.push_back(I); 4586 continue; 4587 } 4588 4589 // Move the instruction to the beginning of the predicated block, and add 4590 // it's operands to the worklist. 4591 I->moveBefore(&*PredBB->getFirstInsertionPt()); 4592 Worklist.insert(I->op_begin(), I->op_end()); 4593 4594 // The sinking may have enabled other instructions to be sunk, so we will 4595 // need to iterate. 4596 Changed = true; 4597 } 4598 } while (Changed); 4599 } 4600 4601 void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) { 4602 for (PHINode *OrigPhi : OrigPHIsToFix) { 4603 VPWidenPHIRecipe *VPPhi = 4604 cast<VPWidenPHIRecipe>(State.Plan->getVPValue(OrigPhi)); 4605 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0)); 4606 // Make sure the builder has a valid insert point. 4607 Builder.SetInsertPoint(NewPhi); 4608 for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) { 4609 VPValue *Inc = VPPhi->getIncomingValue(i); 4610 VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i); 4611 NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]); 4612 } 4613 } 4614 } 4615 4616 bool InnerLoopVectorizer::useOrderedReductions(RecurrenceDescriptor &RdxDesc) { 4617 return Cost->useOrderedReductions(RdxDesc); 4618 } 4619 4620 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, 4621 VPUser &Operands, unsigned UF, 4622 ElementCount VF, bool IsPtrLoopInvariant, 4623 SmallBitVector &IsIndexLoopInvariant, 4624 VPTransformState &State) { 4625 // Construct a vector GEP by widening the operands of the scalar GEP as 4626 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP 4627 // results in a vector of pointers when at least one operand of the GEP 4628 // is vector-typed. Thus, to keep the representation compact, we only use 4629 // vector-typed operands for loop-varying values. 4630 4631 if (VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { 4632 // If we are vectorizing, but the GEP has only loop-invariant operands, 4633 // the GEP we build (by only using vector-typed operands for 4634 // loop-varying values) would be a scalar pointer. Thus, to ensure we 4635 // produce a vector of pointers, we need to either arbitrarily pick an 4636 // operand to broadcast, or broadcast a clone of the original GEP. 4637 // Here, we broadcast a clone of the original. 4638 // 4639 // TODO: If at some point we decide to scalarize instructions having 4640 // loop-invariant operands, this special case will no longer be 4641 // required. We would add the scalarization decision to 4642 // collectLoopScalars() and teach getVectorValue() to broadcast 4643 // the lane-zero scalar value. 4644 auto *Clone = Builder.Insert(GEP->clone()); 4645 for (unsigned Part = 0; Part < UF; ++Part) { 4646 Value *EntryPart = Builder.CreateVectorSplat(VF, Clone); 4647 State.set(VPDef, EntryPart, Part); 4648 addMetadata(EntryPart, GEP); 4649 } 4650 } else { 4651 // If the GEP has at least one loop-varying operand, we are sure to 4652 // produce a vector of pointers. But if we are only unrolling, we want 4653 // to produce a scalar GEP for each unroll part. Thus, the GEP we 4654 // produce with the code below will be scalar (if VF == 1) or vector 4655 // (otherwise). Note that for the unroll-only case, we still maintain 4656 // values in the vector mapping with initVector, as we do for other 4657 // instructions. 4658 for (unsigned Part = 0; Part < UF; ++Part) { 4659 // The pointer operand of the new GEP. 
If it's loop-invariant, we 4660 // won't broadcast it. 4661 auto *Ptr = IsPtrLoopInvariant 4662 ? State.get(Operands.getOperand(0), VPIteration(0, 0)) 4663 : State.get(Operands.getOperand(0), Part); 4664 4665 // Collect all the indices for the new GEP. If any index is 4666 // loop-invariant, we won't broadcast it. 4667 SmallVector<Value *, 4> Indices; 4668 for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) { 4669 VPValue *Operand = Operands.getOperand(I); 4670 if (IsIndexLoopInvariant[I - 1]) 4671 Indices.push_back(State.get(Operand, VPIteration(0, 0))); 4672 else 4673 Indices.push_back(State.get(Operand, Part)); 4674 } 4675 4676 // Create the new GEP. Note that this GEP may be a scalar if VF == 1, 4677 // but it should be a vector, otherwise. 4678 auto *NewGEP = 4679 GEP->isInBounds() 4680 ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr, 4681 Indices) 4682 : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices); 4683 assert((VF.isScalar() || NewGEP->getType()->isVectorTy()) && 4684 "NewGEP is not a pointer vector"); 4685 State.set(VPDef, NewGEP, Part); 4686 addMetadata(NewGEP, GEP); 4687 } 4688 } 4689 } 4690 4691 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, 4692 VPWidenPHIRecipe *PhiR, 4693 VPTransformState &State) { 4694 PHINode *P = cast<PHINode>(PN); 4695 if (EnableVPlanNativePath) { 4696 // Currently we enter here in the VPlan-native path for non-induction 4697 // PHIs where all control flow is uniform. We simply widen these PHIs. 4698 // Create a vector phi with no operands - the vector phi operands will be 4699 // set at the end of vector code generation. 4700 Type *VecTy = (State.VF.isScalar()) 4701 ? PN->getType() 4702 : VectorType::get(PN->getType(), State.VF); 4703 Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi"); 4704 State.set(PhiR, VecPhi, 0); 4705 OrigPHIsToFix.push_back(P); 4706 4707 return; 4708 } 4709 4710 assert(PN->getParent() == OrigLoop->getHeader() && 4711 "Non-header phis should have been handled elsewhere"); 4712 4713 // In order to support recurrences we need to be able to vectorize Phi nodes. 4714 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 4715 // stage #1: We create a new vector PHI node with no incoming edges. We'll use 4716 // this value when we vectorize all of the instructions that use the PHI. 4717 4718 assert(!Legal->isReductionVariable(P) && 4719 "reductions should be handled elsewhere"); 4720 4721 setDebugLocFromInst(P); 4722 4723 // This PHINode must be an induction variable. 4724 // Make sure that we know about it. 4725 assert(Legal->getInductionVars().count(P) && "Not an induction variable"); 4726 4727 InductionDescriptor II = Legal->getInductionVars().lookup(P); 4728 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 4729 4730 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 4731 // which can be found from the original scalar operations. 4732 switch (II.getKind()) { 4733 case InductionDescriptor::IK_NoInduction: 4734 llvm_unreachable("Unknown induction"); 4735 case InductionDescriptor::IK_IntInduction: 4736 case InductionDescriptor::IK_FpInduction: 4737 llvm_unreachable("Integer/fp induction is handled elsewhere."); 4738 case InductionDescriptor::IK_PtrInduction: { 4739 // Handle the pointer induction variable case. 
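// Two strategies are used below: if the pointer IV is scalar after
// vectorization, per-lane scalar GEPs ("next.gep") are emitted from the
// canonical induction; otherwise a pointer phi ("pointer.phi") is created and
// advanced by step * VF * UF each iteration, with per-part vector GEPs used
// as the widened values.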
4740 assert(P->getType()->isPointerTy() && "Unexpected type."); 4741 4742 if (Cost->isScalarAfterVectorization(P, State.VF)) { 4743 // This is the normalized GEP that starts counting at zero. 4744 Value *PtrInd = 4745 Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType()); 4746 // Determine the number of scalars we need to generate for each unroll 4747 // iteration. If the instruction is uniform, we only need to generate the 4748 // first lane. Otherwise, we generate all VF values. 4749 bool IsUniform = Cost->isUniformAfterVectorization(P, State.VF); 4750 unsigned Lanes = IsUniform ? 1 : State.VF.getKnownMinValue(); 4751 4752 bool NeedsVectorIndex = !IsUniform && VF.isScalable(); 4753 Value *UnitStepVec = nullptr, *PtrIndSplat = nullptr; 4754 if (NeedsVectorIndex) { 4755 Type *VecIVTy = VectorType::get(PtrInd->getType(), VF); 4756 UnitStepVec = Builder.CreateStepVector(VecIVTy); 4757 PtrIndSplat = Builder.CreateVectorSplat(VF, PtrInd); 4758 } 4759 4760 for (unsigned Part = 0; Part < UF; ++Part) { 4761 Value *PartStart = createStepForVF( 4762 Builder, ConstantInt::get(PtrInd->getType(), Part), VF); 4763 4764 if (NeedsVectorIndex) { 4765 Value *PartStartSplat = Builder.CreateVectorSplat(VF, PartStart); 4766 Value *Indices = Builder.CreateAdd(PartStartSplat, UnitStepVec); 4767 Value *GlobalIndices = Builder.CreateAdd(PtrIndSplat, Indices); 4768 Value *SclrGep = 4769 emitTransformedIndex(Builder, GlobalIndices, PSE.getSE(), DL, II); 4770 SclrGep->setName("next.gep"); 4771 State.set(PhiR, SclrGep, Part); 4772 // We've cached the whole vector, which means we can support the 4773 // extraction of any lane. 4774 continue; 4775 } 4776 4777 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 4778 Value *Idx = Builder.CreateAdd( 4779 PartStart, ConstantInt::get(PtrInd->getType(), Lane)); 4780 Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); 4781 Value *SclrGep = 4782 emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II); 4783 SclrGep->setName("next.gep"); 4784 State.set(PhiR, SclrGep, VPIteration(Part, Lane)); 4785 } 4786 } 4787 return; 4788 } 4789 assert(isa<SCEVConstant>(II.getStep()) && 4790 "Induction step not a SCEV constant!"); 4791 Type *PhiType = II.getStep()->getType(); 4792 4793 // Build a pointer phi 4794 Value *ScalarStartValue = II.getStartValue(); 4795 Type *ScStValueType = ScalarStartValue->getType(); 4796 PHINode *NewPointerPhi = 4797 PHINode::Create(ScStValueType, 2, "pointer.phi", Induction); 4798 NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader); 4799 4800 // A pointer induction, performed by using a gep 4801 BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 4802 Instruction *InductionLoc = LoopLatch->getTerminator(); 4803 const SCEV *ScalarStep = II.getStep(); 4804 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 4805 Value *ScalarStepValue = 4806 Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc); 4807 Value *RuntimeVF = getRuntimeVF(Builder, PhiType, VF); 4808 Value *NumUnrolledElems = 4809 Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF)); 4810 Value *InductionGEP = GetElementPtrInst::Create( 4811 ScStValueType->getPointerElementType(), NewPointerPhi, 4812 Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind", 4813 InductionLoc); 4814 NewPointerPhi->addIncoming(InductionGEP, LoopLatch); 4815 4816 // Create UF many actual address geps that use the pointer 4817 // phi as base and a vectorized version of the step value 4818 // (<step*0, ..., step*N>) as offset. 
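// E.g. for VF = 4, UF = 2 and a pointer step S, part 0 uses the offsets
// <0*S, 1*S, 2*S, 3*S> and part 1 uses <4*S, 5*S, 6*S, 7*S>, both relative to
// the pointer phi.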
4819 for (unsigned Part = 0; Part < State.UF; ++Part) { 4820 Type *VecPhiType = VectorType::get(PhiType, State.VF); 4821 Value *StartOffsetScalar = 4822 Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part)); 4823 Value *StartOffset = 4824 Builder.CreateVectorSplat(State.VF, StartOffsetScalar); 4825 // Create a vector of consecutive numbers from zero to VF. 4826 StartOffset = 4827 Builder.CreateAdd(StartOffset, Builder.CreateStepVector(VecPhiType)); 4828 4829 Value *GEP = Builder.CreateGEP( 4830 ScStValueType->getPointerElementType(), NewPointerPhi, 4831 Builder.CreateMul( 4832 StartOffset, Builder.CreateVectorSplat(State.VF, ScalarStepValue), 4833 "vector.gep")); 4834 State.set(PhiR, GEP, Part); 4835 } 4836 } 4837 } 4838 } 4839 4840 /// A helper function for checking whether an integer division-related 4841 /// instruction may divide by zero (in which case it must be predicated if 4842 /// executed conditionally in the scalar code). 4843 /// TODO: It may be worthwhile to generalize and check isKnownNonZero(). 4844 /// Non-zero divisors that are non compile-time constants will not be 4845 /// converted into multiplication, so we will still end up scalarizing 4846 /// the division, but can do so w/o predication. 4847 static bool mayDivideByZero(Instruction &I) { 4848 assert((I.getOpcode() == Instruction::UDiv || 4849 I.getOpcode() == Instruction::SDiv || 4850 I.getOpcode() == Instruction::URem || 4851 I.getOpcode() == Instruction::SRem) && 4852 "Unexpected instruction"); 4853 Value *Divisor = I.getOperand(1); 4854 auto *CInt = dyn_cast<ConstantInt>(Divisor); 4855 return !CInt || CInt->isZero(); 4856 } 4857 4858 void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def, 4859 VPUser &User, 4860 VPTransformState &State) { 4861 switch (I.getOpcode()) { 4862 case Instruction::Call: 4863 case Instruction::Br: 4864 case Instruction::PHI: 4865 case Instruction::GetElementPtr: 4866 case Instruction::Select: 4867 llvm_unreachable("This instruction is handled by a different recipe."); 4868 case Instruction::UDiv: 4869 case Instruction::SDiv: 4870 case Instruction::SRem: 4871 case Instruction::URem: 4872 case Instruction::Add: 4873 case Instruction::FAdd: 4874 case Instruction::Sub: 4875 case Instruction::FSub: 4876 case Instruction::FNeg: 4877 case Instruction::Mul: 4878 case Instruction::FMul: 4879 case Instruction::FDiv: 4880 case Instruction::FRem: 4881 case Instruction::Shl: 4882 case Instruction::LShr: 4883 case Instruction::AShr: 4884 case Instruction::And: 4885 case Instruction::Or: 4886 case Instruction::Xor: { 4887 // Just widen unops and binops. 4888 setDebugLocFromInst(&I); 4889 4890 for (unsigned Part = 0; Part < UF; ++Part) { 4891 SmallVector<Value *, 2> Ops; 4892 for (VPValue *VPOp : User.operands()) 4893 Ops.push_back(State.get(VPOp, Part)); 4894 4895 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); 4896 4897 if (auto *VecOp = dyn_cast<Instruction>(V)) 4898 VecOp->copyIRFlags(&I); 4899 4900 // Use this vector value for all users of the original instruction. 4901 State.set(Def, V, Part); 4902 addMetadata(V, &I); 4903 } 4904 4905 break; 4906 } 4907 case Instruction::ICmp: 4908 case Instruction::FCmp: { 4909 // Widen compares. Generate vector compares. 
4910 bool FCmp = (I.getOpcode() == Instruction::FCmp); 4911 auto *Cmp = cast<CmpInst>(&I); 4912 setDebugLocFromInst(Cmp); 4913 for (unsigned Part = 0; Part < UF; ++Part) { 4914 Value *A = State.get(User.getOperand(0), Part); 4915 Value *B = State.get(User.getOperand(1), Part); 4916 Value *C = nullptr; 4917 if (FCmp) { 4918 // Propagate fast math flags. 4919 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 4920 Builder.setFastMathFlags(Cmp->getFastMathFlags()); 4921 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); 4922 } else { 4923 C = Builder.CreateICmp(Cmp->getPredicate(), A, B); 4924 } 4925 State.set(Def, C, Part); 4926 addMetadata(C, &I); 4927 } 4928 4929 break; 4930 } 4931 4932 case Instruction::ZExt: 4933 case Instruction::SExt: 4934 case Instruction::FPToUI: 4935 case Instruction::FPToSI: 4936 case Instruction::FPExt: 4937 case Instruction::PtrToInt: 4938 case Instruction::IntToPtr: 4939 case Instruction::SIToFP: 4940 case Instruction::UIToFP: 4941 case Instruction::Trunc: 4942 case Instruction::FPTrunc: 4943 case Instruction::BitCast: { 4944 auto *CI = cast<CastInst>(&I); 4945 setDebugLocFromInst(CI); 4946 4947 /// Vectorize casts. 4948 Type *DestTy = 4949 (VF.isScalar()) ? CI->getType() : VectorType::get(CI->getType(), VF); 4950 4951 for (unsigned Part = 0; Part < UF; ++Part) { 4952 Value *A = State.get(User.getOperand(0), Part); 4953 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); 4954 State.set(Def, Cast, Part); 4955 addMetadata(Cast, &I); 4956 } 4957 break; 4958 } 4959 default: 4960 // This instruction is not vectorized by simple widening. 4961 LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); 4962 llvm_unreachable("Unhandled instruction!"); 4963 } // end of switch. 4964 } 4965 4966 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, 4967 VPUser &ArgOperands, 4968 VPTransformState &State) { 4969 assert(!isa<DbgInfoIntrinsic>(I) && 4970 "DbgInfoIntrinsic should have been dropped during VPlan construction"); 4971 setDebugLocFromInst(&I); 4972 4973 Module *M = I.getParent()->getParent()->getParent(); 4974 auto *CI = cast<CallInst>(&I); 4975 4976 SmallVector<Type *, 4> Tys; 4977 for (Value *ArgOperand : CI->arg_operands()) 4978 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue())); 4979 4980 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 4981 4982 // The flag shows whether we use Intrinsic or a usual Call for vectorized 4983 // version of the instruction. 4984 // Is it beneficial to perform intrinsic call compared to lib call? 4985 bool NeedToScalarize = false; 4986 InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); 4987 InstructionCost IntrinsicCost = ID ? Cost->getVectorIntrinsicCost(CI, VF) : 0; 4988 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 4989 assert((UseVectorIntrinsic || !NeedToScalarize) && 4990 "Instruction should be scalarized elsewhere."); 4991 assert((IntrinsicCost.isValid() || CallCost.isValid()) && 4992 "Either the intrinsic cost or vector call cost must be valid"); 4993 4994 for (unsigned Part = 0; Part < UF; ++Part) { 4995 SmallVector<Type *, 2> TysForDecl = {CI->getType()}; 4996 SmallVector<Value *, 4> Args; 4997 for (auto &I : enumerate(ArgOperands.operands())) { 4998 // Some intrinsics have a scalar argument - don't replace it with a 4999 // vector. 
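// (For instance, the exponent operand of llvm.powi is expected to stay
// scalar even in the vector call; hasVectorInstrinsicScalarOpd reports such
// operands.)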
5000 Value *Arg; 5001 if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index())) 5002 Arg = State.get(I.value(), Part); 5003 else { 5004 Arg = State.get(I.value(), VPIteration(0, 0)); 5005 if (hasVectorInstrinsicOverloadedScalarOpd(ID, I.index())) 5006 TysForDecl.push_back(Arg->getType()); 5007 } 5008 Args.push_back(Arg); 5009 } 5010 5011 Function *VectorF; 5012 if (UseVectorIntrinsic) { 5013 // Use vector version of the intrinsic. 5014 if (VF.isVector()) 5015 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); 5016 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); 5017 assert(VectorF && "Can't retrieve vector intrinsic."); 5018 } else { 5019 // Use vector version of the function call. 5020 const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 5021 #ifndef NDEBUG 5022 assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr && 5023 "Can't create vector function."); 5024 #endif 5025 VectorF = VFDatabase(*CI).getVectorizedFunction(Shape); 5026 } 5027 SmallVector<OperandBundleDef, 1> OpBundles; 5028 CI->getOperandBundlesAsDefs(OpBundles); 5029 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles); 5030 5031 if (isa<FPMathOperator>(V)) 5032 V->copyFastMathFlags(CI); 5033 5034 State.set(Def, V, Part); 5035 addMetadata(V, &I); 5036 } 5037 } 5038 5039 void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, VPValue *VPDef, 5040 VPUser &Operands, 5041 bool InvariantCond, 5042 VPTransformState &State) { 5043 setDebugLocFromInst(&I); 5044 5045 // The condition can be loop invariant but still defined inside the 5046 // loop. This means that we can't just use the original 'cond' value. 5047 // We have to take the 'vectorized' value and pick the first lane. 5048 // Instcombine will make this a no-op. 5049 auto *InvarCond = InvariantCond 5050 ? State.get(Operands.getOperand(0), VPIteration(0, 0)) 5051 : nullptr; 5052 5053 for (unsigned Part = 0; Part < UF; ++Part) { 5054 Value *Cond = 5055 InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part); 5056 Value *Op0 = State.get(Operands.getOperand(1), Part); 5057 Value *Op1 = State.get(Operands.getOperand(2), Part); 5058 Value *Sel = Builder.CreateSelect(Cond, Op0, Op1); 5059 State.set(VPDef, Sel, Part); 5060 addMetadata(Sel, &I); 5061 } 5062 } 5063 5064 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { 5065 // We should not collect Scalars more than once per VF. Right now, this 5066 // function is called from collectUniformsAndScalars(), which already does 5067 // this check. Collecting Scalars for VF=1 does not make any sense. 5068 assert(VF.isVector() && Scalars.find(VF) == Scalars.end() && 5069 "This function should not be visited twice for the same VF"); 5070 5071 SmallSetVector<Instruction *, 8> Worklist; 5072 5073 // These sets are used to seed the analysis with pointers used by memory 5074 // accesses that will remain scalar. 5075 SmallSetVector<Instruction *, 8> ScalarPtrs; 5076 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; 5077 auto *Latch = TheLoop->getLoopLatch(); 5078 5079 // A helper that returns true if the use of Ptr by MemAccess will be scalar. 5080 // The pointer operands of loads and stores will be scalar as long as the 5081 // memory access is not a gather or scatter operation. The value operand of a 5082 // store will remain scalar if the store is scalarized. 
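// For instance, a consecutive load that is widened to a single wide load only
// needs lane 0 of its pointer, so that pointer use is scalar; a gather or
// scatter keeps a full vector of pointers, so it is not.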
5083 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
5084 InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
5085 assert(WideningDecision != CM_Unknown &&
5086 "Widening decision should be ready at this moment");
5087 if (auto *Store = dyn_cast<StoreInst>(MemAccess))
5088 if (Ptr == Store->getValueOperand())
5089 return WideningDecision == CM_Scalarize;
5090 assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
5091 "Ptr is neither a value nor a pointer operand");
5092 return WideningDecision != CM_GatherScatter;
5093 };
5094
5095 // A helper that returns true if the given value is a bitcast or
5096 // getelementptr instruction contained in the loop.
5097 auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
5098 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
5099 isa<GetElementPtrInst>(V)) &&
5100 !TheLoop->isLoopInvariant(V);
5101 };
5102
5103 auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) {
5104 if (!isa<PHINode>(Ptr) ||
5105 !Legal->getInductionVars().count(cast<PHINode>(Ptr)))
5106 return false;
5107 auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)];
5108 if (Induction.getKind() != InductionDescriptor::IK_PtrInduction)
5109 return false;
5110 return isScalarUse(MemAccess, Ptr);
5111 };
5112
5113 // A helper that evaluates a memory access's use of a pointer. If the
5114 // pointer is actually the pointer induction of the loop, it is inserted
5115 // into Worklist. If the use will be a scalar use, and the
5116 // pointer is only used by memory accesses, we place the pointer in
5117 // ScalarPtrs. Otherwise, the pointer is placed in PossibleNonScalarPtrs.
5118 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
5119 if (isScalarPtrInduction(MemAccess, Ptr)) {
5120 Worklist.insert(cast<Instruction>(Ptr));
5121 LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr
5122 << "\n");
5123
5124 Instruction *Update = cast<Instruction>(
5125 cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch));
5126 ScalarPtrs.insert(Update);
5127 return;
5128 }
5129 // We only care about bitcast and getelementptr instructions contained in
5130 // the loop.
5131 if (!isLoopVaryingBitCastOrGEP(Ptr))
5132 return;
5133
5134 // If the pointer has already been identified as scalar (e.g., if it was
5135 // also identified as uniform), there's nothing to do.
5136 auto *I = cast<Instruction>(Ptr);
5137 if (Worklist.count(I))
5138 return;
5139
5140 // If the use of the pointer will be a scalar use, and all users of the
5141 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
5142 // place the pointer in PossibleNonScalarPtrs.
5143 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
5144 return isa<LoadInst>(U) || isa<StoreInst>(U);
5145 }))
5146 ScalarPtrs.insert(I);
5147 else
5148 PossibleNonScalarPtrs.insert(I);
5149 };
5150
5151 // We seed the scalars analysis with two classes of instructions: (1)
5152 // instructions marked uniform-after-vectorization and (2) bitcast,
5153 // getelementptr and (pointer) phi instructions used by memory accesses
5154 // requiring a scalar use.
5155 //
5156 // (1) Add to the worklist all instructions that have been identified as
5157 // uniform-after-vectorization.
5158 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
5159
5160 // (2) Add to the worklist all bitcast and getelementptr instructions used by
5161 // memory accesses requiring a scalar use.
The pointer operands of loads and 5162 // stores will be scalar as long as the memory accesses is not a gather or 5163 // scatter operation. The value operand of a store will remain scalar if the 5164 // store is scalarized. 5165 for (auto *BB : TheLoop->blocks()) 5166 for (auto &I : *BB) { 5167 if (auto *Load = dyn_cast<LoadInst>(&I)) { 5168 evaluatePtrUse(Load, Load->getPointerOperand()); 5169 } else if (auto *Store = dyn_cast<StoreInst>(&I)) { 5170 evaluatePtrUse(Store, Store->getPointerOperand()); 5171 evaluatePtrUse(Store, Store->getValueOperand()); 5172 } 5173 } 5174 for (auto *I : ScalarPtrs) 5175 if (!PossibleNonScalarPtrs.count(I)) { 5176 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n"); 5177 Worklist.insert(I); 5178 } 5179 5180 // Insert the forced scalars. 5181 // FIXME: Currently widenPHIInstruction() often creates a dead vector 5182 // induction variable when the PHI user is scalarized. 5183 auto ForcedScalar = ForcedScalars.find(VF); 5184 if (ForcedScalar != ForcedScalars.end()) 5185 for (auto *I : ForcedScalar->second) 5186 Worklist.insert(I); 5187 5188 // Expand the worklist by looking through any bitcasts and getelementptr 5189 // instructions we've already identified as scalar. This is similar to the 5190 // expansion step in collectLoopUniforms(); however, here we're only 5191 // expanding to include additional bitcasts and getelementptr instructions. 5192 unsigned Idx = 0; 5193 while (Idx != Worklist.size()) { 5194 Instruction *Dst = Worklist[Idx++]; 5195 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0))) 5196 continue; 5197 auto *Src = cast<Instruction>(Dst->getOperand(0)); 5198 if (llvm::all_of(Src->users(), [&](User *U) -> bool { 5199 auto *J = cast<Instruction>(U); 5200 return !TheLoop->contains(J) || Worklist.count(J) || 5201 ((isa<LoadInst>(J) || isa<StoreInst>(J)) && 5202 isScalarUse(J, Src)); 5203 })) { 5204 Worklist.insert(Src); 5205 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); 5206 } 5207 } 5208 5209 // An induction variable will remain scalar if all users of the induction 5210 // variable and induction variable update remain scalar. 5211 for (auto &Induction : Legal->getInductionVars()) { 5212 auto *Ind = Induction.first; 5213 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 5214 5215 // If tail-folding is applied, the primary induction variable will be used 5216 // to feed a vector compare. 5217 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking()) 5218 continue; 5219 5220 // Determine if all users of the induction variable are scalar after 5221 // vectorization. 5222 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 5223 auto *I = cast<Instruction>(U); 5224 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I); 5225 }); 5226 if (!ScalarInd) 5227 continue; 5228 5229 // Determine if all users of the induction variable update instruction are 5230 // scalar after vectorization. 5231 auto ScalarIndUpdate = 5232 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 5233 auto *I = cast<Instruction>(U); 5234 return I == Ind || !TheLoop->contains(I) || Worklist.count(I); 5235 }); 5236 if (!ScalarIndUpdate) 5237 continue; 5238 5239 // The induction variable and its update instruction will remain scalar. 
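// E.g. an IV whose users are only its own update, instructions already known
// to be scalar, or values outside the loop remains scalar; otherwise it is
// left out of the worklist (and will typically be widened instead).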
5240 Worklist.insert(Ind); 5241 Worklist.insert(IndUpdate); 5242 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 5243 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 5244 << "\n"); 5245 } 5246 5247 Scalars[VF].insert(Worklist.begin(), Worklist.end()); 5248 } 5249 5250 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I) const { 5251 if (!blockNeedsPredication(I->getParent())) 5252 return false; 5253 switch(I->getOpcode()) { 5254 default: 5255 break; 5256 case Instruction::Load: 5257 case Instruction::Store: { 5258 if (!Legal->isMaskRequired(I)) 5259 return false; 5260 auto *Ptr = getLoadStorePointerOperand(I); 5261 auto *Ty = getLoadStoreType(I); 5262 const Align Alignment = getLoadStoreAlignment(I); 5263 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) || 5264 TTI.isLegalMaskedGather(Ty, Alignment)) 5265 : !(isLegalMaskedStore(Ty, Ptr, Alignment) || 5266 TTI.isLegalMaskedScatter(Ty, Alignment)); 5267 } 5268 case Instruction::UDiv: 5269 case Instruction::SDiv: 5270 case Instruction::SRem: 5271 case Instruction::URem: 5272 return mayDivideByZero(*I); 5273 } 5274 return false; 5275 } 5276 5277 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( 5278 Instruction *I, ElementCount VF) { 5279 assert(isAccessInterleaved(I) && "Expecting interleaved access."); 5280 assert(getWideningDecision(I, VF) == CM_Unknown && 5281 "Decision should not be set yet."); 5282 auto *Group = getInterleavedAccessGroup(I); 5283 assert(Group && "Must have a group."); 5284 5285 // If the instruction's allocated size doesn't equal it's type size, it 5286 // requires padding and will be scalarized. 5287 auto &DL = I->getModule()->getDataLayout(); 5288 auto *ScalarTy = getLoadStoreType(I); 5289 if (hasIrregularType(ScalarTy, DL)) 5290 return false; 5291 5292 // Check if masking is required. 5293 // A Group may need masking for one of two reasons: it resides in a block that 5294 // needs predication, or it was decided to use masking to deal with gaps 5295 // (either a gap at the end of a load-access that may result in a speculative 5296 // load, or any gaps in a store-access). 5297 bool PredicatedAccessRequiresMasking = 5298 Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I); 5299 bool LoadAccessWithGapsRequiresEpilogMasking = 5300 isa<LoadInst>(I) && Group->requiresScalarEpilogue() && 5301 !isScalarEpilogueAllowed(); 5302 bool StoreAccessWithGapsRequiresMasking = 5303 isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()); 5304 if (!PredicatedAccessRequiresMasking && 5305 !LoadAccessWithGapsRequiresEpilogMasking && 5306 !StoreAccessWithGapsRequiresMasking) 5307 return true; 5308 5309 // If masked interleaving is required, we expect that the user/target had 5310 // enabled it, because otherwise it either wouldn't have been created or 5311 // it should have been invalidated by the CostModel. 5312 assert(useMaskedInterleavedAccesses(TTI) && 5313 "Masked interleave-groups for predicated accesses are not enabled."); 5314 5315 auto *Ty = getLoadStoreType(I); 5316 const Align Alignment = getLoadStoreAlignment(I); 5317 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment) 5318 : TTI.isLegalMaskedStore(Ty, Alignment); 5319 } 5320 5321 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( 5322 Instruction *I, ElementCount VF) { 5323 // Get and ensure we have a valid memory instruction. 
5324 LoadInst *LI = dyn_cast<LoadInst>(I); 5325 StoreInst *SI = dyn_cast<StoreInst>(I); 5326 assert((LI || SI) && "Invalid memory instruction"); 5327 5328 auto *Ptr = getLoadStorePointerOperand(I); 5329 5330 // In order to be widened, the pointer should be consecutive, first of all. 5331 if (!Legal->isConsecutivePtr(Ptr)) 5332 return false; 5333 5334 // If the instruction is a store located in a predicated block, it will be 5335 // scalarized. 5336 if (isScalarWithPredication(I)) 5337 return false; 5338 5339 // If the instruction's allocated size doesn't equal it's type size, it 5340 // requires padding and will be scalarized. 5341 auto &DL = I->getModule()->getDataLayout(); 5342 auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType(); 5343 if (hasIrregularType(ScalarTy, DL)) 5344 return false; 5345 5346 return true; 5347 } 5348 5349 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { 5350 // We should not collect Uniforms more than once per VF. Right now, 5351 // this function is called from collectUniformsAndScalars(), which 5352 // already does this check. Collecting Uniforms for VF=1 does not make any 5353 // sense. 5354 5355 assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() && 5356 "This function should not be visited twice for the same VF"); 5357 5358 // Visit the list of Uniforms. If we'll not find any uniform value, we'll 5359 // not analyze again. Uniforms.count(VF) will return 1. 5360 Uniforms[VF].clear(); 5361 5362 // We now know that the loop is vectorizable! 5363 // Collect instructions inside the loop that will remain uniform after 5364 // vectorization. 5365 5366 // Global values, params and instructions outside of current loop are out of 5367 // scope. 5368 auto isOutOfScope = [&](Value *V) -> bool { 5369 Instruction *I = dyn_cast<Instruction>(V); 5370 return (!I || !TheLoop->contains(I)); 5371 }; 5372 5373 SetVector<Instruction *> Worklist; 5374 BasicBlock *Latch = TheLoop->getLoopLatch(); 5375 5376 // Instructions that are scalar with predication must not be considered 5377 // uniform after vectorization, because that would create an erroneous 5378 // replicating region where only a single instance out of VF should be formed. 5379 // TODO: optimize such seldom cases if found important, see PR40816. 5380 auto addToWorklistIfAllowed = [&](Instruction *I) -> void { 5381 if (isOutOfScope(I)) { 5382 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: " 5383 << *I << "\n"); 5384 return; 5385 } 5386 if (isScalarWithPredication(I)) { 5387 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " 5388 << *I << "\n"); 5389 return; 5390 } 5391 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n"); 5392 Worklist.insert(I); 5393 }; 5394 5395 // Start with the conditional branch. If the branch condition is an 5396 // instruction contained in the loop that is only used by the branch, it is 5397 // uniform. 5398 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 5399 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) 5400 addToWorklistIfAllowed(Cmp); 5401 5402 auto isUniformDecision = [&](Instruction *I, ElementCount VF) { 5403 InstWidening WideningDecision = getWideningDecision(I, VF); 5404 assert(WideningDecision != CM_Unknown && 5405 "Widening decision should be ready at this moment"); 5406 5407 // A uniform memory op is itself uniform. We exclude uniform stores 5408 // here as they demand the last lane, not the first one. 
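    // For instance (illustrative only): with a loop-invariant address %p, a
    // load 'x = *%p' reads the same location on every iteration and only
    // lane 0 is required, so it counts as uniform here; a store '*%p = y'
    // must produce the value of the final iteration (the last lane) and is
    // therefore excluded.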
5409 if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) { 5410 assert(WideningDecision == CM_Scalarize); 5411 return true; 5412 } 5413 5414 return (WideningDecision == CM_Widen || 5415 WideningDecision == CM_Widen_Reverse || 5416 WideningDecision == CM_Interleave); 5417 }; 5418 5419 5420 // Returns true if Ptr is the pointer operand of a memory access instruction 5421 // I, and I is known to not require scalarization. 5422 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { 5423 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); 5424 }; 5425 5426 // Holds a list of values which are known to have at least one uniform use. 5427 // Note that there may be other uses which aren't uniform. A "uniform use" 5428 // here is something which only demands lane 0 of the unrolled iterations; 5429 // it does not imply that all lanes produce the same value (e.g. this is not 5430 // the usual meaning of uniform) 5431 SetVector<Value *> HasUniformUse; 5432 5433 // Scan the loop for instructions which are either a) known to have only 5434 // lane 0 demanded or b) are uses which demand only lane 0 of their operand. 5435 for (auto *BB : TheLoop->blocks()) 5436 for (auto &I : *BB) { 5437 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) { 5438 switch (II->getIntrinsicID()) { 5439 case Intrinsic::sideeffect: 5440 case Intrinsic::experimental_noalias_scope_decl: 5441 case Intrinsic::assume: 5442 case Intrinsic::lifetime_start: 5443 case Intrinsic::lifetime_end: 5444 if (TheLoop->hasLoopInvariantOperands(&I)) 5445 addToWorklistIfAllowed(&I); 5446 break; 5447 default: 5448 break; 5449 } 5450 } 5451 5452 // ExtractValue instructions must be uniform, because the operands are 5453 // known to be loop-invariant. 5454 if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) { 5455 assert(isOutOfScope(EVI->getAggregateOperand()) && 5456 "Expected aggregate value to be loop invariant"); 5457 addToWorklistIfAllowed(EVI); 5458 continue; 5459 } 5460 5461 // If there's no pointer operand, there's nothing to do. 5462 auto *Ptr = getLoadStorePointerOperand(&I); 5463 if (!Ptr) 5464 continue; 5465 5466 // A uniform memory op is itself uniform. We exclude uniform stores 5467 // here as they demand the last lane, not the first one. 5468 if (isa<LoadInst>(I) && Legal->isUniformMemOp(I)) 5469 addToWorklistIfAllowed(&I); 5470 5471 if (isUniformDecision(&I, VF)) { 5472 assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check"); 5473 HasUniformUse.insert(Ptr); 5474 } 5475 } 5476 5477 // Add to the worklist any operands which have *only* uniform (e.g. lane 0 5478 // demanding) users. Since loops are assumed to be in LCSSA form, this 5479 // disallows uses outside the loop as well. 5480 for (auto *V : HasUniformUse) { 5481 if (isOutOfScope(V)) 5482 continue; 5483 auto *I = cast<Instruction>(V); 5484 auto UsersAreMemAccesses = 5485 llvm::all_of(I->users(), [&](User *U) -> bool { 5486 return isVectorizedMemAccessUse(cast<Instruction>(U), V); 5487 }); 5488 if (UsersAreMemAccesses) 5489 addToWorklistIfAllowed(I); 5490 } 5491 5492 // Expand Worklist in topological order: whenever a new instruction 5493 // is added , its users should be already inside Worklist. It ensures 5494 // a uniform instruction will only be used by uniform instructions. 5495 unsigned idx = 0; 5496 while (idx != Worklist.size()) { 5497 Instruction *I = Worklist[idx++]; 5498 5499 for (auto OV : I->operand_values()) { 5500 // isOutOfScope operands cannot be uniform instructions. 
5501 if (isOutOfScope(OV)) 5502 continue; 5503 // First order recurrence Phi's should typically be considered 5504 // non-uniform. 5505 auto *OP = dyn_cast<PHINode>(OV); 5506 if (OP && Legal->isFirstOrderRecurrence(OP)) 5507 continue; 5508 // If all the users of the operand are uniform, then add the 5509 // operand into the uniform worklist. 5510 auto *OI = cast<Instruction>(OV); 5511 if (llvm::all_of(OI->users(), [&](User *U) -> bool { 5512 auto *J = cast<Instruction>(U); 5513 return Worklist.count(J) || isVectorizedMemAccessUse(J, OI); 5514 })) 5515 addToWorklistIfAllowed(OI); 5516 } 5517 } 5518 5519 // For an instruction to be added into Worklist above, all its users inside 5520 // the loop should also be in Worklist. However, this condition cannot be 5521 // true for phi nodes that form a cyclic dependence. We must process phi 5522 // nodes separately. An induction variable will remain uniform if all users 5523 // of the induction variable and induction variable update remain uniform. 5524 // The code below handles both pointer and non-pointer induction variables. 5525 for (auto &Induction : Legal->getInductionVars()) { 5526 auto *Ind = Induction.first; 5527 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 5528 5529 // Determine if all users of the induction variable are uniform after 5530 // vectorization. 5531 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 5532 auto *I = cast<Instruction>(U); 5533 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 5534 isVectorizedMemAccessUse(I, Ind); 5535 }); 5536 if (!UniformInd) 5537 continue; 5538 5539 // Determine if all users of the induction variable update instruction are 5540 // uniform after vectorization. 5541 auto UniformIndUpdate = 5542 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 5543 auto *I = cast<Instruction>(U); 5544 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 5545 isVectorizedMemAccessUse(I, IndUpdate); 5546 }); 5547 if (!UniformIndUpdate) 5548 continue; 5549 5550 // The induction variable and its update instruction will remain uniform. 5551 addToWorklistIfAllowed(Ind); 5552 addToWorklistIfAllowed(IndUpdate); 5553 } 5554 5555 Uniforms[VF].insert(Worklist.begin(), Worklist.end()); 5556 } 5557 5558 bool LoopVectorizationCostModel::runtimeChecksRequired() { 5559 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); 5560 5561 if (Legal->getRuntimePointerChecking()->Need) { 5562 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz", 5563 "runtime pointer checks needed. Enable vectorization of this " 5564 "loop with '#pragma clang loop vectorize(enable)' when " 5565 "compiling with -Os/-Oz", 5566 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5567 return true; 5568 } 5569 5570 if (!PSE.getUnionPredicate().getPredicates().empty()) { 5571 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz", 5572 "runtime SCEV checks needed. Enable vectorization of this " 5573 "loop with '#pragma clang loop vectorize(enable)' when " 5574 "compiling with -Os/-Oz", 5575 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5576 return true; 5577 } 5578 5579 // FIXME: Avoid specializing for stride==1 instead of bailing out. 5580 if (!Legal->getLAI()->getSymbolicStrides().empty()) { 5581 reportVectorizationFailure("Runtime stride check for small trip count", 5582 "runtime stride == 1 checks needed. 
Enable vectorization of " 5583 "this loop without such check by compiling with -Os/-Oz", 5584 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5585 return true; 5586 } 5587 5588 return false; 5589 } 5590 5591 ElementCount 5592 LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) { 5593 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) 5594 return ElementCount::getScalable(0); 5595 5596 if (Hints->isScalableVectorizationDisabled()) { 5597 reportVectorizationInfo("Scalable vectorization is explicitly disabled", 5598 "ScalableVectorizationDisabled", ORE, TheLoop); 5599 return ElementCount::getScalable(0); 5600 } 5601 5602 LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n"); 5603 5604 auto MaxScalableVF = ElementCount::getScalable( 5605 std::numeric_limits<ElementCount::ScalarTy>::max()); 5606 5607 // Test that the loop-vectorizer can legalize all operations for this MaxVF. 5608 // FIXME: While for scalable vectors this is currently sufficient, this should 5609 // be replaced by a more detailed mechanism that filters out specific VFs, 5610 // instead of invalidating vectorization for a whole set of VFs based on the 5611 // MaxVF. 5612 5613 // Disable scalable vectorization if the loop contains unsupported reductions. 5614 if (!canVectorizeReductions(MaxScalableVF)) { 5615 reportVectorizationInfo( 5616 "Scalable vectorization not supported for the reduction " 5617 "operations found in this loop.", 5618 "ScalableVFUnfeasible", ORE, TheLoop); 5619 return ElementCount::getScalable(0); 5620 } 5621 5622 // Disable scalable vectorization if the loop contains any instructions 5623 // with element types not supported for scalable vectors. 5624 if (any_of(ElementTypesInLoop, [&](Type *Ty) { 5625 return !Ty->isVoidTy() && 5626 !this->TTI.isElementTypeLegalForScalableVector(Ty); 5627 })) { 5628 reportVectorizationInfo("Scalable vectorization is not supported " 5629 "for all element types found in this loop.", 5630 "ScalableVFUnfeasible", ORE, TheLoop); 5631 return ElementCount::getScalable(0); 5632 } 5633 5634 if (Legal->isSafeForAnyVectorWidth()) 5635 return MaxScalableVF; 5636 5637 // Limit MaxScalableVF by the maximum safe dependence distance. 5638 Optional<unsigned> MaxVScale = TTI.getMaxVScale(); 5639 MaxScalableVF = ElementCount::getScalable( 5640 MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0); 5641 if (!MaxScalableVF) 5642 reportVectorizationInfo( 5643 "Max legal vector width too small, scalable vectorization " 5644 "unfeasible.", 5645 "ScalableVFUnfeasible", ORE, TheLoop); 5646 5647 return MaxScalableVF; 5648 } 5649 5650 FixedScalableVFPair 5651 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount, 5652 ElementCount UserVF) { 5653 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); 5654 unsigned SmallestType, WidestType; 5655 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); 5656 5657 // Get the maximum safe dependence distance in bits computed by LAA. 5658 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from 5659 // the memory accesses that is most restrictive (involved in the smallest 5660 // dependence distance). 
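  // Worked example with illustrative numbers (added commentary): if LAA
  // reports a maximum safe vector width of 256 bits and the widest type is
  // 32 bits, then MaxSafeElements = PowerOf2Floor(256 / 32) = 8, i.e. at
  // most 8 lanes can be combined without violating the dependence.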
5661 unsigned MaxSafeElements = 5662 PowerOf2Floor(Legal->getMaxSafeVectorWidthInBits() / WidestType); 5663 5664 auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements); 5665 auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements); 5666 5667 LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF 5668 << ".\n"); 5669 LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF 5670 << ".\n"); 5671 5672 // First analyze the UserVF, fall back if the UserVF should be ignored. 5673 if (UserVF) { 5674 auto MaxSafeUserVF = 5675 UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF; 5676 5677 if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) { 5678 // If `VF=vscale x N` is safe, then so is `VF=N` 5679 if (UserVF.isScalable()) 5680 return FixedScalableVFPair( 5681 ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF); 5682 else 5683 return UserVF; 5684 } 5685 5686 assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF)); 5687 5688 // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it 5689 // is better to ignore the hint and let the compiler choose a suitable VF. 5690 if (!UserVF.isScalable()) { 5691 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 5692 << " is unsafe, clamping to max safe VF=" 5693 << MaxSafeFixedVF << ".\n"); 5694 ORE->emit([&]() { 5695 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 5696 TheLoop->getStartLoc(), 5697 TheLoop->getHeader()) 5698 << "User-specified vectorization factor " 5699 << ore::NV("UserVectorizationFactor", UserVF) 5700 << " is unsafe, clamping to maximum safe vectorization factor " 5701 << ore::NV("VectorizationFactor", MaxSafeFixedVF); 5702 }); 5703 return MaxSafeFixedVF; 5704 } 5705 5706 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) { 5707 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 5708 << " is ignored because scalable vectors are not " 5709 "available.\n"); 5710 ORE->emit([&]() { 5711 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 5712 TheLoop->getStartLoc(), 5713 TheLoop->getHeader()) 5714 << "User-specified vectorization factor " 5715 << ore::NV("UserVectorizationFactor", UserVF) 5716 << " is ignored because the target does not support scalable " 5717 "vectors. The compiler will pick a more suitable value."; 5718 }); 5719 } else { 5720 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 5721 << " is unsafe. Ignoring scalable UserVF.\n"); 5722 ORE->emit([&]() { 5723 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 5724 TheLoop->getStartLoc(), 5725 TheLoop->getHeader()) 5726 << "User-specified vectorization factor " 5727 << ore::NV("UserVectorizationFactor", UserVF) 5728 << " is unsafe. 
Ignoring the hint to let the compiler pick a " 5729 "more suitable value."; 5730 }); 5731 } 5732 } 5733 5734 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType 5735 << " / " << WidestType << " bits.\n"); 5736 5737 FixedScalableVFPair Result(ElementCount::getFixed(1), 5738 ElementCount::getScalable(0)); 5739 if (auto MaxVF = getMaximizedVFForTarget(ConstTripCount, SmallestType, 5740 WidestType, MaxSafeFixedVF)) 5741 Result.FixedVF = MaxVF; 5742 5743 if (auto MaxVF = getMaximizedVFForTarget(ConstTripCount, SmallestType, 5744 WidestType, MaxSafeScalableVF)) 5745 if (MaxVF.isScalable()) { 5746 Result.ScalableVF = MaxVF; 5747 LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF 5748 << "\n"); 5749 } 5750 5751 return Result; 5752 } 5753 5754 FixedScalableVFPair 5755 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { 5756 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { 5757 // TODO: It may by useful to do since it's still likely to be dynamically 5758 // uniform if the target can skip. 5759 reportVectorizationFailure( 5760 "Not inserting runtime ptr check for divergent target", 5761 "runtime pointer checks needed. Not enabled for divergent target", 5762 "CantVersionLoopWithDivergentTarget", ORE, TheLoop); 5763 return FixedScalableVFPair::getNone(); 5764 } 5765 5766 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); 5767 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); 5768 if (TC == 1) { 5769 reportVectorizationFailure("Single iteration (non) loop", 5770 "loop trip count is one, irrelevant for vectorization", 5771 "SingleIterationLoop", ORE, TheLoop); 5772 return FixedScalableVFPair::getNone(); 5773 } 5774 5775 switch (ScalarEpilogueStatus) { 5776 case CM_ScalarEpilogueAllowed: 5777 return computeFeasibleMaxVF(TC, UserVF); 5778 case CM_ScalarEpilogueNotAllowedUsePredicate: 5779 LLVM_FALLTHROUGH; 5780 case CM_ScalarEpilogueNotNeededUsePredicate: 5781 LLVM_DEBUG( 5782 dbgs() << "LV: vector predicate hint/switch found.\n" 5783 << "LV: Not allowing scalar epilogue, creating predicated " 5784 << "vector loop.\n"); 5785 break; 5786 case CM_ScalarEpilogueNotAllowedLowTripLoop: 5787 // fallthrough as a special case of OptForSize 5788 case CM_ScalarEpilogueNotAllowedOptSize: 5789 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize) 5790 LLVM_DEBUG( 5791 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n"); 5792 else 5793 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip " 5794 << "count.\n"); 5795 5796 // Bail if runtime checks are required, which are not good when optimising 5797 // for size. 5798 if (runtimeChecksRequired()) 5799 return FixedScalableVFPair::getNone(); 5800 5801 break; 5802 } 5803 5804 // The only loops we can vectorize without a scalar epilogue, are loops with 5805 // a bottom-test and a single exiting block. We'd have to handle the fact 5806 // that not every instruction executes on the last iteration. This will 5807 // require a lane mask which varies through the vector loop body. (TODO) 5808 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) { 5809 // If there was a tail-folding hint/switch, but we can't fold the tail by 5810 // masking, fallback to a vectorization with a scalar epilogue. 
    if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
      LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
                           "scalar epilogue instead.\n");
      ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
      return computeFeasibleMaxVF(TC, UserVF);
    }
    return FixedScalableVFPair::getNone();
  }

  // Now try tail folding.

  // Invalidate interleave groups that require an epilogue if we can't mask
  // the interleave-group.
  if (!useMaskedInterleavedAccesses(TTI)) {
    assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
           "No decisions should have been taken at this point");
    // Note: There is no need to invalidate any cost modeling decisions here,
    // as none were taken so far.
    InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
  }

  FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF);
  // Avoid tail folding if the trip count is known to be a multiple of any VF
  // we choose.
  // FIXME: The condition below pessimises the case for fixed-width vectors,
  // when scalable VFs are also candidates for vectorization.
  if (MaxFactors.FixedVF.isVector() && !MaxFactors.ScalableVF) {
    ElementCount MaxFixedVF = MaxFactors.FixedVF;
    assert((UserVF.isNonZero() || isPowerOf2_32(MaxFixedVF.getFixedValue())) &&
           "MaxFixedVF must be a power of 2");
    unsigned MaxVFtimesIC = UserIC ? MaxFixedVF.getFixedValue() * UserIC
                                   : MaxFixedVF.getFixedValue();
    ScalarEvolution *SE = PSE.getSE();
    const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
    const SCEV *ExitCount = SE->getAddExpr(
        BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
    const SCEV *Rem = SE->getURemExpr(
        SE->applyLoopGuards(ExitCount, TheLoop),
        SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
    if (Rem->isZero()) {
      // Accept MaxFixedVF if we do not have a tail.
      LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
      return MaxFactors;
    }
  }

  // For scalable vectors, don't use tail folding as this is not yet
  // supported. The code is likely to have ended up here if the trip count is
  // low, in which case it makes sense not to use scalable vectors.
  if (MaxFactors.ScalableVF.isVector())
    MaxFactors.ScalableVF = ElementCount::getScalable(0);

  // If we don't know the precise trip count, or if the trip count that we
  // found modulo the vectorization factor is not zero, try to fold the tail
  // by masking.
  // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
  if (Legal->prepareToFoldTailByMasking()) {
    FoldTailByMasking = true;
    return MaxFactors;
  }

  // If there was a tail-folding hint/switch, but we can't fold the tail by
  // masking, fall back to vectorization with a scalar epilogue.
5874 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { 5875 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " 5876 "scalar epilogue instead.\n"); 5877 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 5878 return MaxFactors; 5879 } 5880 5881 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) { 5882 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n"); 5883 return FixedScalableVFPair::getNone(); 5884 } 5885 5886 if (TC == 0) { 5887 reportVectorizationFailure( 5888 "Unable to calculate the loop count due to complex control flow", 5889 "unable to calculate the loop count due to complex control flow", 5890 "UnknownLoopCountComplexCFG", ORE, TheLoop); 5891 return FixedScalableVFPair::getNone(); 5892 } 5893 5894 reportVectorizationFailure( 5895 "Cannot optimize for size and vectorize at the same time.", 5896 "cannot optimize for size and vectorize at the same time. " 5897 "Enable vectorization of this loop with '#pragma clang loop " 5898 "vectorize(enable)' when compiling with -Os/-Oz", 5899 "NoTailLoopWithOptForSize", ORE, TheLoop); 5900 return FixedScalableVFPair::getNone(); 5901 } 5902 5903 ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( 5904 unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType, 5905 const ElementCount &MaxSafeVF) { 5906 bool ComputeScalableMaxVF = MaxSafeVF.isScalable(); 5907 TypeSize WidestRegister = TTI.getRegisterBitWidth( 5908 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector 5909 : TargetTransformInfo::RGK_FixedWidthVector); 5910 5911 // Convenience function to return the minimum of two ElementCounts. 5912 auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) { 5913 assert((LHS.isScalable() == RHS.isScalable()) && 5914 "Scalable flags must match"); 5915 return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS; 5916 }; 5917 5918 // Ensure MaxVF is a power of 2; the dependence distance bound may not be. 5919 // Note that both WidestRegister and WidestType may not be a powers of 2. 5920 auto MaxVectorElementCount = ElementCount::get( 5921 PowerOf2Floor(WidestRegister.getKnownMinSize() / WidestType), 5922 ComputeScalableMaxVF); 5923 MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF); 5924 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " 5925 << (MaxVectorElementCount * WidestType) << " bits.\n"); 5926 5927 if (!MaxVectorElementCount) { 5928 LLVM_DEBUG(dbgs() << "LV: The target has no " 5929 << (ComputeScalableMaxVF ? "scalable" : "fixed") 5930 << " vector registers.\n"); 5931 return ElementCount::getFixed(1); 5932 } 5933 5934 const auto TripCountEC = ElementCount::getFixed(ConstTripCount); 5935 if (ConstTripCount && 5936 ElementCount::isKnownLE(TripCountEC, MaxVectorElementCount) && 5937 isPowerOf2_32(ConstTripCount)) { 5938 // We need to clamp the VF to be the ConstTripCount. There is no point in 5939 // choosing a higher viable VF as done in the loop below. If 5940 // MaxVectorElementCount is scalable, we only fall back on a fixed VF when 5941 // the TC is less than or equal to the known number of lanes. 
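    // Worked example (illustrative): with ConstTripCount == 8 and a fixed
    // MaxVectorElementCount of 16, the whole loop fits into one vector
    // iteration, so the VF is clamped to 8 below rather than picking a
    // larger factor.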
5942 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: " 5943 << ConstTripCount << "\n"); 5944 return TripCountEC; 5945 } 5946 5947 ElementCount MaxVF = MaxVectorElementCount; 5948 if (TTI.shouldMaximizeVectorBandwidth() || 5949 (MaximizeBandwidth && isScalarEpilogueAllowed())) { 5950 auto MaxVectorElementCountMaxBW = ElementCount::get( 5951 PowerOf2Floor(WidestRegister.getKnownMinSize() / SmallestType), 5952 ComputeScalableMaxVF); 5953 MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF); 5954 5955 // Collect all viable vectorization factors larger than the default MaxVF 5956 // (i.e. MaxVectorElementCount). 5957 SmallVector<ElementCount, 8> VFs; 5958 for (ElementCount VS = MaxVectorElementCount * 2; 5959 ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2) 5960 VFs.push_back(VS); 5961 5962 // For each VF calculate its register usage. 5963 auto RUs = calculateRegisterUsage(VFs); 5964 5965 // Select the largest VF which doesn't require more registers than existing 5966 // ones. 5967 for (int i = RUs.size() - 1; i >= 0; --i) { 5968 bool Selected = true; 5969 for (auto &pair : RUs[i].MaxLocalUsers) { 5970 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5971 if (pair.second > TargetNumRegisters) 5972 Selected = false; 5973 } 5974 if (Selected) { 5975 MaxVF = VFs[i]; 5976 break; 5977 } 5978 } 5979 if (ElementCount MinVF = 5980 TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) { 5981 if (ElementCount::isKnownLT(MaxVF, MinVF)) { 5982 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF 5983 << ") with target's minimum: " << MinVF << '\n'); 5984 MaxVF = MinVF; 5985 } 5986 } 5987 } 5988 return MaxVF; 5989 } 5990 5991 bool LoopVectorizationCostModel::isMoreProfitable( 5992 const VectorizationFactor &A, const VectorizationFactor &B) const { 5993 InstructionCost CostA = A.Cost; 5994 InstructionCost CostB = B.Cost; 5995 5996 unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop); 5997 5998 if (!A.Width.isScalable() && !B.Width.isScalable() && FoldTailByMasking && 5999 MaxTripCount) { 6000 // If we are folding the tail and the trip count is a known (possibly small) 6001 // constant, the trip count will be rounded up to an integer number of 6002 // iterations. The total cost will be PerIterationCost*ceil(TripCount/VF), 6003 // which we compare directly. When not folding the tail, the total cost will 6004 // be PerIterationCost*floor(TC/VF) + Scalar remainder cost, and so is 6005 // approximated with the per-lane cost below instead of using the tripcount 6006 // as here. 6007 auto RTCostA = CostA * divideCeil(MaxTripCount, A.Width.getFixedValue()); 6008 auto RTCostB = CostB * divideCeil(MaxTripCount, B.Width.getFixedValue()); 6009 return RTCostA < RTCostB; 6010 } 6011 6012 // When set to preferred, for now assume vscale may be larger than 1, so 6013 // that scalable vectorization is slightly favorable over fixed-width 6014 // vectorization. 
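  // Illustrative tie-break (added commentary): if A is VF=vscale x 4 with
  // cost 10 and B is fixed VF=4 with cost 10, the cross-multiplied per-lane
  // costs are equal (10 * 4 each); the '<=' below then prefers the scalable
  // factor, matching the assumption that vscale may be larger than 1.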
6015 if (Hints->isScalableVectorizationPreferred()) 6016 if (A.Width.isScalable() && !B.Width.isScalable()) 6017 return (CostA * B.Width.getKnownMinValue()) <= 6018 (CostB * A.Width.getKnownMinValue()); 6019 6020 // To avoid the need for FP division: 6021 // (CostA / A.Width) < (CostB / B.Width) 6022 // <=> (CostA * B.Width) < (CostB * A.Width) 6023 return (CostA * B.Width.getKnownMinValue()) < 6024 (CostB * A.Width.getKnownMinValue()); 6025 } 6026 6027 VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor( 6028 const ElementCountSet &VFCandidates) { 6029 InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first; 6030 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n"); 6031 assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop"); 6032 assert(VFCandidates.count(ElementCount::getFixed(1)) && 6033 "Expected Scalar VF to be a candidate"); 6034 6035 const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost); 6036 VectorizationFactor ChosenFactor = ScalarCost; 6037 6038 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; 6039 if (ForceVectorization && VFCandidates.size() > 1) { 6040 // Ignore scalar width, because the user explicitly wants vectorization. 6041 // Initialize cost to max so that VF = 2 is, at least, chosen during cost 6042 // evaluation. 6043 ChosenFactor.Cost = InstructionCost::getMax(); 6044 } 6045 6046 SmallVector<InstructionVFPair> InvalidCosts; 6047 for (const auto &i : VFCandidates) { 6048 // The cost for scalar VF=1 is already calculated, so ignore it. 6049 if (i.isScalar()) 6050 continue; 6051 6052 VectorizationCostTy C = expectedCost(i, &InvalidCosts); 6053 VectorizationFactor Candidate(i, C.first); 6054 LLVM_DEBUG( 6055 dbgs() << "LV: Vector loop of width " << i << " costs: " 6056 << (Candidate.Cost / Candidate.Width.getKnownMinValue()) 6057 << (i.isScalable() ? " (assuming a minimum vscale of 1)" : "") 6058 << ".\n"); 6059 6060 if (!C.second && !ForceVectorization) { 6061 LLVM_DEBUG( 6062 dbgs() << "LV: Not considering vector loop of width " << i 6063 << " because it will not generate any vector instructions.\n"); 6064 continue; 6065 } 6066 6067 // If profitable add it to ProfitableVF list. 6068 if (isMoreProfitable(Candidate, ScalarCost)) 6069 ProfitableVFs.push_back(Candidate); 6070 6071 if (isMoreProfitable(Candidate, ChosenFactor)) 6072 ChosenFactor = Candidate; 6073 } 6074 6075 // Emit a report of VFs with invalid costs in the loop. 6076 if (!InvalidCosts.empty()) { 6077 // Group the remarks per instruction, keeping the instruction order from 6078 // InvalidCosts. 6079 std::map<Instruction *, unsigned> Numbering; 6080 unsigned I = 0; 6081 for (auto &Pair : InvalidCosts) 6082 if (!Numbering.count(Pair.first)) 6083 Numbering[Pair.first] = I++; 6084 6085 // Sort the list, first on instruction(number) then on VF. 
6086 llvm::sort(InvalidCosts, 6087 [&Numbering](InstructionVFPair &A, InstructionVFPair &B) { 6088 if (Numbering[A.first] != Numbering[B.first]) 6089 return Numbering[A.first] < Numbering[B.first]; 6090 ElementCountComparator ECC; 6091 return ECC(A.second, B.second); 6092 }); 6093 6094 // For a list of ordered instruction-vf pairs: 6095 // [(load, vf1), (load, vf2), (store, vf1)] 6096 // Group the instructions together to emit separate remarks for: 6097 // load (vf1, vf2) 6098 // store (vf1) 6099 auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts); 6100 auto Subset = ArrayRef<InstructionVFPair>(); 6101 do { 6102 if (Subset.empty()) 6103 Subset = Tail.take_front(1); 6104 6105 Instruction *I = Subset.front().first; 6106 6107 // If the next instruction is different, or if there are no other pairs, 6108 // emit a remark for the collated subset. e.g. 6109 // [(load, vf1), (load, vf2))] 6110 // to emit: 6111 // remark: invalid costs for 'load' at VF=(vf, vf2) 6112 if (Subset == Tail || Tail[Subset.size()].first != I) { 6113 std::string OutString; 6114 raw_string_ostream OS(OutString); 6115 assert(!Subset.empty() && "Unexpected empty range"); 6116 OS << "Instruction with invalid costs prevented vectorization at VF=("; 6117 for (auto &Pair : Subset) 6118 OS << (Pair.second == Subset.front().second ? "" : ", ") 6119 << Pair.second; 6120 OS << "):"; 6121 if (auto *CI = dyn_cast<CallInst>(I)) 6122 OS << " call to " << CI->getCalledFunction()->getName(); 6123 else 6124 OS << " " << I->getOpcodeName(); 6125 OS.flush(); 6126 reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I); 6127 Tail = Tail.drop_front(Subset.size()); 6128 Subset = {}; 6129 } else 6130 // Grow the subset by one element 6131 Subset = Tail.take_front(Subset.size() + 1); 6132 } while (!Tail.empty()); 6133 } 6134 6135 if (!EnableCondStoresVectorization && NumPredStores) { 6136 reportVectorizationFailure("There are conditional stores.", 6137 "store that is conditionally executed prevents vectorization", 6138 "ConditionalStore", ORE, TheLoop); 6139 ChosenFactor = ScalarCost; 6140 } 6141 6142 LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() && 6143 ChosenFactor.Cost >= ScalarCost.Cost) dbgs() 6144 << "LV: Vectorization seems to be not beneficial, " 6145 << "but was forced by a user.\n"); 6146 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n"); 6147 return ChosenFactor; 6148 } 6149 6150 bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization( 6151 const Loop &L, ElementCount VF) const { 6152 // Cross iteration phis such as reductions need special handling and are 6153 // currently unsupported. 6154 if (any_of(L.getHeader()->phis(), [&](PHINode &Phi) { 6155 return Legal->isFirstOrderRecurrence(&Phi) || 6156 Legal->isReductionVariable(&Phi); 6157 })) 6158 return false; 6159 6160 // Phis with uses outside of the loop require special handling and are 6161 // currently unsupported. 6162 for (auto &Entry : Legal->getInductionVars()) { 6163 // Look for uses of the value of the induction at the last iteration. 6164 Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch()); 6165 for (User *U : PostInc->users()) 6166 if (!L.contains(cast<Instruction>(U))) 6167 return false; 6168 // Look for uses of penultimate value of the induction. 6169 for (User *U : Entry.first->users()) 6170 if (!L.contains(cast<Instruction>(U))) 6171 return false; 6172 } 6173 6174 // Induction variables that are widened require special handling that is 6175 // currently not supported. 
  if (any_of(Legal->getInductionVars(), [&](auto &Entry) {
        return !(this->isScalarAfterVectorization(Entry.first, VF) ||
                 this->isProfitableToScalarize(Entry.first, VF));
      }))
    return false;

  // Epilogue vectorization code has not been audited to ensure it handles
  // non-latch exits properly. It may be fine, but it needs to be audited and
  // tested.
  if (L.getExitingBlock() != L.getLoopLatch())
    return false;

  return true;
}

bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
    const ElementCount VF) const {
  // FIXME: We need a much better cost-model to take different parameters such
  // as register pressure, code size increase and cost of extra branches into
  // account. For now we apply a very crude heuristic and only consider loops
  // with vectorization factors larger than a certain value.
  // We also consider epilogue vectorization unprofitable for targets that
  // don't consider interleaving beneficial (e.g. MVE).
  if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1)
    return false;
  if (VF.getFixedValue() >= EpilogueVectorizationMinVF)
    return true;
  return false;
}

VectorizationFactor
LoopVectorizationCostModel::selectEpilogueVectorizationFactor(
    const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) {
  VectorizationFactor Result = VectorizationFactor::Disabled();
  if (!EnableEpilogueVectorization) {
    LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";);
    return Result;
  }

  if (!isScalarEpilogueAllowed()) {
    LLVM_DEBUG(
        dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is "
                  "allowed.\n";);
    return Result;
  }

  // FIXME: This can be fixed for scalable vectors later, because at this stage
  // the LoopVectorizer will only consider vectorizing a loop with scalable
  // vectors when the loop has a hint to enable vectorization for a given VF.
  if (MainLoopVF.isScalable()) {
    LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization for scalable vectors not "
                         "yet supported.\n");
    return Result;
  }

  // Not really a cost consideration, but check for unsupported cases here to
  // simplify the logic.
6233 if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) { 6234 LLVM_DEBUG( 6235 dbgs() << "LEV: Unable to vectorize epilogue because the loop is " 6236 "not a supported candidate.\n";); 6237 return Result; 6238 } 6239 6240 if (EpilogueVectorizationForceVF > 1) { 6241 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";); 6242 if (LVP.hasPlanWithVFs( 6243 {MainLoopVF, ElementCount::getFixed(EpilogueVectorizationForceVF)})) 6244 return {ElementCount::getFixed(EpilogueVectorizationForceVF), 0}; 6245 else { 6246 LLVM_DEBUG( 6247 dbgs() 6248 << "LEV: Epilogue vectorization forced factor is not viable.\n";); 6249 return Result; 6250 } 6251 } 6252 6253 if (TheLoop->getHeader()->getParent()->hasOptSize() || 6254 TheLoop->getHeader()->getParent()->hasMinSize()) { 6255 LLVM_DEBUG( 6256 dbgs() 6257 << "LEV: Epilogue vectorization skipped due to opt for size.\n";); 6258 return Result; 6259 } 6260 6261 if (!isEpilogueVectorizationProfitable(MainLoopVF)) 6262 return Result; 6263 6264 for (auto &NextVF : ProfitableVFs) 6265 if (ElementCount::isKnownLT(NextVF.Width, MainLoopVF) && 6266 (Result.Width.getFixedValue() == 1 || 6267 isMoreProfitable(NextVF, Result)) && 6268 LVP.hasPlanWithVFs({MainLoopVF, NextVF.Width})) 6269 Result = NextVF; 6270 6271 if (Result != VectorizationFactor::Disabled()) 6272 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = " 6273 << Result.Width.getFixedValue() << "\n";); 6274 return Result; 6275 } 6276 6277 std::pair<unsigned, unsigned> 6278 LoopVectorizationCostModel::getSmallestAndWidestTypes() { 6279 unsigned MinWidth = -1U; 6280 unsigned MaxWidth = 8; 6281 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 6282 for (Type *T : ElementTypesInLoop) { 6283 MinWidth = std::min<unsigned>( 6284 MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize()); 6285 MaxWidth = std::max<unsigned>( 6286 MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize()); 6287 } 6288 return {MinWidth, MaxWidth}; 6289 } 6290 6291 void LoopVectorizationCostModel::collectElementTypesForWidening() { 6292 ElementTypesInLoop.clear(); 6293 // For each block. 6294 for (BasicBlock *BB : TheLoop->blocks()) { 6295 // For each instruction in the loop. 6296 for (Instruction &I : BB->instructionsWithoutDebug()) { 6297 Type *T = I.getType(); 6298 6299 // Skip ignored values. 6300 if (ValuesToIgnore.count(&I)) 6301 continue; 6302 6303 // Only examine Loads, Stores and PHINodes. 6304 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I)) 6305 continue; 6306 6307 // Examine PHI nodes that are reduction variables. Update the type to 6308 // account for the recurrence type. 6309 if (auto *PN = dyn_cast<PHINode>(&I)) { 6310 if (!Legal->isReductionVariable(PN)) 6311 continue; 6312 const RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[PN]; 6313 if (PreferInLoopReductions || useOrderedReductions(RdxDesc) || 6314 TTI.preferInLoopReduction(RdxDesc.getOpcode(), 6315 RdxDesc.getRecurrenceType(), 6316 TargetTransformInfo::ReductionFlags())) 6317 continue; 6318 T = RdxDesc.getRecurrenceType(); 6319 } 6320 6321 // Examine the stored values. 6322 if (auto *ST = dyn_cast<StoreInst>(&I)) 6323 T = ST->getValueOperand()->getType(); 6324 6325 // Ignore loaded pointer types and stored pointer types that are not 6326 // vectorizable. 6327 // 6328 // FIXME: The check here attempts to predict whether a load or store will 6329 // be vectorized. We only know this for certain after a VF has 6330 // been selected. 
Here, we assume that if an access can be
      // vectorized, it will be. We should also look at extending this
      // optimization to non-pointer types.
      //
      if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
          !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
        continue;

      ElementTypesInLoop.insert(T);
    }
  }
}

unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
                                                           unsigned LoopCost) {
  // -- The interleave heuristics --
  // We interleave the loop in order to expose ILP and reduce the loop
  // overhead. There are many micro-architectural considerations that we
  // can't predict at this level. For example, frontend pressure (on decode or
  // fetch) due to code size, or the number and capabilities of the execution
  // ports.
  //
  // We use the following heuristics to select the interleave count:
  // 1. If the code has reductions, then we interleave to break the
  //    cross-iteration dependency.
  // 2. If the loop is really small, then we interleave to reduce the loop
  //    overhead.
  // 3. We don't interleave if we think that we will spill registers to memory
  //    due to the increased register pressure.

  if (!isScalarEpilogueAllowed())
    return 1;

  // The maximum safe dependence distance was already used to limit the
  // vectorization factor, so do not interleave on top of that.
  if (Legal->getMaxSafeDepDistBytes() != -1U)
    return 1;

  auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
  const bool HasReductions = !Legal->getReductionVars().empty();
  // Do not interleave loops with a relatively small known or estimated trip
  // count. But we will interleave when InterleaveSmallLoopScalarReduction is
  // enabled, and the code has scalar reductions (HasReductions && VF == 1),
  // because with the above conditions interleaving can expose ILP and break
  // cross-iteration dependences for reductions.
  if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) &&
      !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar()))
    return 1;

  RegisterUsage R = calculateRegisterUsage({VF})[0];
  // We divide by these constants so assume that we have at least one
  // instruction that uses at least one register.
  for (auto &pair : R.MaxLocalUsers) {
    pair.second = std::max(pair.second, 1U);
  }

  // We calculate the interleave count using the following formula:
  // subtract the number of loop invariants from the number of available
  // registers. These registers are used by all of the interleaved instances.
  // Next, divide the remaining registers by the number of registers that is
  // required by the loop, in order to estimate how many parallel instances
  // fit without causing spills. All of this is rounded down if necessary to
  // be a power of two. We want a power-of-two interleave count to simplify
  // any addressing operations or alignment considerations.
  // We also want power-of-two interleave counts to ensure that the induction
  // variable of the vector loop wraps to zero when the tail is folded by
  // masking; this currently happens when OptForSize, in which case IC is set
  // to 1 above.
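  // Worked example with illustrative register counts (added commentary):
  // with 32 available registers in a class, 2 of them holding loop-invariant
  // values and at most 6 live in-loop values of that class, the formula
  // below gives PowerOf2Floor((32 - 2) / 6) = PowerOf2Floor(5) = 4
  // interleaved instances for that class, before clamping against the
  // target's maximum interleave factor and the trip count further down.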
6395 unsigned IC = UINT_MAX; 6396 6397 for (auto& pair : R.MaxLocalUsers) { 6398 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 6399 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters 6400 << " registers of " 6401 << TTI.getRegisterClassName(pair.first) << " register class\n"); 6402 if (VF.isScalar()) { 6403 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) 6404 TargetNumRegisters = ForceTargetNumScalarRegs; 6405 } else { 6406 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) 6407 TargetNumRegisters = ForceTargetNumVectorRegs; 6408 } 6409 unsigned MaxLocalUsers = pair.second; 6410 unsigned LoopInvariantRegs = 0; 6411 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) 6412 LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; 6413 6414 unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers); 6415 // Don't count the induction variable as interleaved. 6416 if (EnableIndVarRegisterHeur) { 6417 TmpIC = 6418 PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) / 6419 std::max(1U, (MaxLocalUsers - 1))); 6420 } 6421 6422 IC = std::min(IC, TmpIC); 6423 } 6424 6425 // Clamp the interleave ranges to reasonable counts. 6426 unsigned MaxInterleaveCount = 6427 TTI.getMaxInterleaveFactor(VF.getKnownMinValue()); 6428 6429 // Check if the user has overridden the max. 6430 if (VF.isScalar()) { 6431 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) 6432 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; 6433 } else { 6434 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) 6435 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; 6436 } 6437 6438 // If trip count is known or estimated compile time constant, limit the 6439 // interleave count to be less than the trip count divided by VF, provided it 6440 // is at least 1. 6441 // 6442 // For scalable vectors we can't know if interleaving is beneficial. It may 6443 // not be beneficial for small loops if none of the lanes in the second vector 6444 // iterations is enabled. However, for larger loops, there is likely to be a 6445 // similar benefit as for fixed-width vectors. For now, we choose to leave 6446 // the InterleaveCount as if vscale is '1', although if some information about 6447 // the vector is known (e.g. min vector size), we can make a better decision. 6448 if (BestKnownTC) { 6449 MaxInterleaveCount = 6450 std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount); 6451 // Make sure MaxInterleaveCount is greater than 0. 6452 MaxInterleaveCount = std::max(1u, MaxInterleaveCount); 6453 } 6454 6455 assert(MaxInterleaveCount > 0 && 6456 "Maximum interleave count must be greater than 0"); 6457 6458 // Clamp the calculated IC to be between the 1 and the max interleave count 6459 // that the target and trip count allows. 6460 if (IC > MaxInterleaveCount) 6461 IC = MaxInterleaveCount; 6462 else 6463 // Make sure IC is greater than 0. 6464 IC = std::max(1u, IC); 6465 6466 assert(IC > 0 && "Interleave count must be greater than 0."); 6467 6468 // If we did not calculate the cost for VF (because the user selected the VF) 6469 // then we calculate the cost of VF here. 
6470 if (LoopCost == 0) { 6471 InstructionCost C = expectedCost(VF).first; 6472 assert(C.isValid() && "Expected to have chosen a VF with valid cost"); 6473 LoopCost = *C.getValue(); 6474 } 6475 6476 assert(LoopCost && "Non-zero loop cost expected"); 6477 6478 // Interleave if we vectorized this loop and there is a reduction that could 6479 // benefit from interleaving. 6480 if (VF.isVector() && HasReductions) { 6481 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); 6482 return IC; 6483 } 6484 6485 // Note that if we've already vectorized the loop we will have done the 6486 // runtime check and so interleaving won't require further checks. 6487 bool InterleavingRequiresRuntimePointerCheck = 6488 (VF.isScalar() && Legal->getRuntimePointerChecking()->Need); 6489 6490 // We want to interleave small loops in order to reduce the loop overhead and 6491 // potentially expose ILP opportunities. 6492 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n' 6493 << "LV: IC is " << IC << '\n' 6494 << "LV: VF is " << VF << '\n'); 6495 const bool AggressivelyInterleaveReductions = 6496 TTI.enableAggressiveInterleaving(HasReductions); 6497 if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) { 6498 // We assume that the cost overhead is 1 and we use the cost model 6499 // to estimate the cost of the loop and interleave until the cost of the 6500 // loop overhead is about 5% of the cost of the loop. 6501 unsigned SmallIC = 6502 std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost)); 6503 6504 // Interleave until store/load ports (estimated by max interleave count) are 6505 // saturated. 6506 unsigned NumStores = Legal->getNumStores(); 6507 unsigned NumLoads = Legal->getNumLoads(); 6508 unsigned StoresIC = IC / (NumStores ? NumStores : 1); 6509 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1); 6510 6511 // If we have a scalar reduction (vector reductions are already dealt with 6512 // by this point), we can increase the critical path length if the loop 6513 // we're interleaving is inside another loop. For tree-wise reductions 6514 // set the limit to 2, and for ordered reductions it's best to disable 6515 // interleaving entirely. 6516 if (HasReductions && TheLoop->getLoopDepth() > 1) { 6517 bool HasOrderedReductions = 6518 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 6519 const RecurrenceDescriptor &RdxDesc = Reduction.second; 6520 return RdxDesc.isOrdered(); 6521 }); 6522 if (HasOrderedReductions) { 6523 LLVM_DEBUG( 6524 dbgs() << "LV: Not interleaving scalar ordered reductions.\n"); 6525 return 1; 6526 } 6527 6528 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC); 6529 SmallIC = std::min(SmallIC, F); 6530 StoresIC = std::min(StoresIC, F); 6531 LoadsIC = std::min(LoadsIC, F); 6532 } 6533 6534 if (EnableLoadStoreRuntimeInterleave && 6535 std::max(StoresIC, LoadsIC) > SmallIC) { 6536 LLVM_DEBUG( 6537 dbgs() << "LV: Interleaving to saturate store or load ports.\n"); 6538 return std::max(StoresIC, LoadsIC); 6539 } 6540 6541 // If there are scalar reductions and TTI has enabled aggressive 6542 // interleaving for reductions, we will interleave to expose ILP. 6543 if (InterleaveSmallLoopScalarReduction && VF.isScalar() && 6544 AggressivelyInterleaveReductions) { 6545 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 6546 // Interleave no less than SmallIC but not as aggressive as the normal IC 6547 // to satisfy the rare situation when resources are too limited. 
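      // E.g. (illustrative): with IC == 8 and SmallIC == 2 this path returns
      // std::max(8 / 2, 2) == 4, i.e. more than the small-loop heuristic
      // alone but less than the full register-pressure-based count.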
      return std::max(IC / 2, SmallIC);
    } else {
      LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
      return SmallIC;
    }
  }

  // Interleave if this is a large loop (small loops are already dealt with by
  // this point) that could benefit from interleaving.
  if (AggressivelyInterleaveReductions) {
    LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
    return IC;
  }

  LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
  return 1;
}

SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
  // This function calculates the register usage by measuring the highest
  // number of values that are alive at a single location. Obviously, this is
  // a very rough estimation. We scan the loop in topological order and assign
  // a number to each instruction. We use RPO to ensure that defs are met
  // before their users. We assume that each instruction that has in-loop
  // users starts an interval. We record every time that an in-loop value is
  // used, so we have a list of the first and last occurrences of each
  // instruction. Next, we transpose this data structure into a multi-map that
  // holds the list of intervals that *end* at a specific location. This
  // multi-map allows us to perform a linear search. We scan the instructions
  // linearly and record each time that a new interval starts, by placing it
  // in a set. If we find this value in the multi-map then we remove it from
  // the set. The max register usage is the maximum size of the set.
  // We also search for instructions that are defined outside the loop, but
  // are used inside the loop. We need this number separately from the
  // max-interval usage number because, when we unroll, loop-invariant values
  // do not take up more registers.
  LoopBlocksDFS DFS(TheLoop);
  DFS.perform(LI);

  RegisterUsage RU;

  // Each 'key' in the map opens a new interval. The values
  // of the map are the index of the 'last seen' usage of the
  // instruction that is the key.
  using IntervalMap = DenseMap<Instruction *, unsigned>;

  // Maps instruction to its index.
  SmallVector<Instruction *, 64> IdxToInstr;
  // Marks the end of each interval.
  IntervalMap EndPoint;
  // Saves the list of instruction indices that are used in the loop.
  SmallPtrSet<Instruction *, 8> Ends;
  // Saves the list of values that are used in the loop but are
  // defined outside the loop, such as arguments and constants.
  SmallPtrSet<Value *, 8> LoopInvariants;

  for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
    for (Instruction &I : BB->instructionsWithoutDebug()) {
      IdxToInstr.push_back(&I);

      // Save the end location of each USE.
      for (Value *U : I.operands()) {
        auto *Instr = dyn_cast<Instruction>(U);

        // Ignore non-instruction values such as arguments, constants, etc.
        if (!Instr)
          continue;

        // If this instruction is outside the loop then record it and continue.
        if (!TheLoop->contains(Instr)) {
          LoopInvariants.insert(Instr);
          continue;
        }

        // Overwrite previous end points.
        EndPoint[Instr] = IdxToInstr.size();
        Ends.insert(Instr);
      }
    }
  }

  // Saves the list of intervals that end with the index in 'key'.
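  // Small illustrative example (added commentary): if the last use of both
  // %a and %b is the instruction at index 2, then TransposeEnds[3] contains
  // {%a, %b}; when the linear scan below reaches index 3 it closes both
  // intervals before opening the one for the instruction defined there, so
  // the live-value estimate drops by two at that point.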
6631 using InstrList = SmallVector<Instruction *, 2>; 6632 DenseMap<unsigned, InstrList> TransposeEnds; 6633 6634 // Transpose the EndPoints to a list of values that end at each index. 6635 for (auto &Interval : EndPoint) 6636 TransposeEnds[Interval.second].push_back(Interval.first); 6637 6638 SmallPtrSet<Instruction *, 8> OpenIntervals; 6639 SmallVector<RegisterUsage, 8> RUs(VFs.size()); 6640 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size()); 6641 6642 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); 6643 6644 // A lambda that gets the register usage for the given type and VF. 6645 const auto &TTICapture = TTI; 6646 auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned { 6647 if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty)) 6648 return 0; 6649 InstructionCost::CostType RegUsage = 6650 *TTICapture.getRegUsageForType(VectorType::get(Ty, VF)).getValue(); 6651 assert(RegUsage >= 0 && RegUsage <= std::numeric_limits<unsigned>::max() && 6652 "Nonsensical values for register usage."); 6653 return RegUsage; 6654 }; 6655 6656 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { 6657 Instruction *I = IdxToInstr[i]; 6658 6659 // Remove all of the instructions that end at this location. 6660 InstrList &List = TransposeEnds[i]; 6661 for (Instruction *ToRemove : List) 6662 OpenIntervals.erase(ToRemove); 6663 6664 // Ignore instructions that are never used within the loop. 6665 if (!Ends.count(I)) 6666 continue; 6667 6668 // Skip ignored values. 6669 if (ValuesToIgnore.count(I)) 6670 continue; 6671 6672 // For each VF find the maximum usage of registers. 6673 for (unsigned j = 0, e = VFs.size(); j < e; ++j) { 6674 // Count the number of live intervals. 6675 SmallMapVector<unsigned, unsigned, 4> RegUsage; 6676 6677 if (VFs[j].isScalar()) { 6678 for (auto Inst : OpenIntervals) { 6679 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6680 if (RegUsage.find(ClassID) == RegUsage.end()) 6681 RegUsage[ClassID] = 1; 6682 else 6683 RegUsage[ClassID] += 1; 6684 } 6685 } else { 6686 collectUniformsAndScalars(VFs[j]); 6687 for (auto Inst : OpenIntervals) { 6688 // Skip ignored values for VF > 1. 6689 if (VecValuesToIgnore.count(Inst)) 6690 continue; 6691 if (isScalarAfterVectorization(Inst, VFs[j])) { 6692 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6693 if (RegUsage.find(ClassID) == RegUsage.end()) 6694 RegUsage[ClassID] = 1; 6695 else 6696 RegUsage[ClassID] += 1; 6697 } else { 6698 unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType()); 6699 if (RegUsage.find(ClassID) == RegUsage.end()) 6700 RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]); 6701 else 6702 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); 6703 } 6704 } 6705 } 6706 6707 for (auto& pair : RegUsage) { 6708 if (MaxUsages[j].find(pair.first) != MaxUsages[j].end()) 6709 MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second); 6710 else 6711 MaxUsages[j][pair.first] = pair.second; 6712 } 6713 } 6714 6715 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " 6716 << OpenIntervals.size() << '\n'); 6717 6718 // Add the current instruction to the list of open intervals. 6719 OpenIntervals.insert(I); 6720 } 6721 6722 for (unsigned i = 0, e = VFs.size(); i < e; ++i) { 6723 SmallMapVector<unsigned, unsigned, 4> Invariant; 6724 6725 for (auto Inst : LoopInvariants) { 6726 unsigned Usage = 6727 VFs[i].isScalar() ? 
1 : GetRegUsage(Inst->getType(), VFs[i]); 6728 unsigned ClassID = 6729 TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType()); 6730 if (Invariant.find(ClassID) == Invariant.end()) 6731 Invariant[ClassID] = Usage; 6732 else 6733 Invariant[ClassID] += Usage; 6734 } 6735 6736 LLVM_DEBUG({ 6737 dbgs() << "LV(REG): VF = " << VFs[i] << '\n'; 6738 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size() 6739 << " item\n"; 6740 for (const auto &pair : MaxUsages[i]) { 6741 dbgs() << "LV(REG): RegisterClass: " 6742 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6743 << " registers\n"; 6744 } 6745 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size() 6746 << " item\n"; 6747 for (const auto &pair : Invariant) { 6748 dbgs() << "LV(REG): RegisterClass: " 6749 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6750 << " registers\n"; 6751 } 6752 }); 6753 6754 RU.LoopInvariantRegs = Invariant; 6755 RU.MaxLocalUsers = MaxUsages[i]; 6756 RUs[i] = RU; 6757 } 6758 6759 return RUs; 6760 } 6761 6762 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){ 6763 // TODO: Cost model for emulated masked load/store is completely 6764 // broken. This hack guides the cost model to use an artificially 6765 // high enough value to practically disable vectorization with such 6766 // operations, except where previously deployed legality hack allowed 6767 // using very low cost values. This is to avoid regressions coming simply 6768 // from moving "masked load/store" check from legality to cost model. 6769 // Masked Load/Gather emulation was previously never allowed. 6770 // Limited number of Masked Store/Scatter emulation was allowed. 6771 assert(isPredicatedInst(I) && 6772 "Expecting a scalar emulated instruction"); 6773 return isa<LoadInst>(I) || 6774 (isa<StoreInst>(I) && 6775 NumPredStores > NumberOfStoresToPredicate); 6776 } 6777 6778 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) { 6779 // If we aren't vectorizing the loop, or if we've already collected the 6780 // instructions to scalarize, there's nothing to do. Collection may already 6781 // have occurred if we have a user-selected VF and are now computing the 6782 // expected cost for interleaving. 6783 if (VF.isScalar() || VF.isZero() || 6784 InstsToScalarize.find(VF) != InstsToScalarize.end()) 6785 return; 6786 6787 // Initialize a mapping for VF in InstsToScalalarize. If we find that it's 6788 // not profitable to scalarize any instructions, the presence of VF in the 6789 // map will indicate that we've analyzed it already. 6790 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF]; 6791 6792 // Find all the instructions that are scalar with predication in the loop and 6793 // determine if it would be better to not if-convert the blocks they are in. 6794 // If so, we also record the instructions to scalarize. 6795 for (BasicBlock *BB : TheLoop->blocks()) { 6796 if (!blockNeedsPredication(BB)) 6797 continue; 6798 for (Instruction &I : *BB) 6799 if (isScalarWithPredication(&I)) { 6800 ScalarCostsTy ScalarCosts; 6801 // Do not apply discount if scalable, because that would lead to 6802 // invalid scalarization costs. 6803 // Do not apply discount logic if hacked cost is needed 6804 // for emulated masked memrefs. 6805 if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I) && 6806 computePredInstDiscount(&I, ScalarCosts, VF) >= 0) 6807 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end()); 6808 // Remember that BB will remain after vectorization. 
6809 PredicatedBBsAfterVectorization.insert(BB); 6810 } 6811 } 6812 } 6813 6814 int LoopVectorizationCostModel::computePredInstDiscount( 6815 Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) { 6816 assert(!isUniformAfterVectorization(PredInst, VF) && 6817 "Instruction marked uniform-after-vectorization will be predicated"); 6818 6819 // Initialize the discount to zero, meaning that the scalar version and the 6820 // vector version cost the same. 6821 InstructionCost Discount = 0; 6822 6823 // Holds instructions to analyze. The instructions we visit are mapped in 6824 // ScalarCosts. Those instructions are the ones that would be scalarized if 6825 // we find that the scalar version costs less. 6826 SmallVector<Instruction *, 8> Worklist; 6827 6828 // Returns true if the given instruction can be scalarized. 6829 auto canBeScalarized = [&](Instruction *I) -> bool { 6830 // We only attempt to scalarize instructions forming a single-use chain 6831 // from the original predicated block that would otherwise be vectorized. 6832 // Although not strictly necessary, we give up on instructions we know will 6833 // already be scalar to avoid traversing chains that are unlikely to be 6834 // beneficial. 6835 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() || 6836 isScalarAfterVectorization(I, VF)) 6837 return false; 6838 6839 // If the instruction is scalar with predication, it will be analyzed 6840 // separately. We ignore it within the context of PredInst. 6841 if (isScalarWithPredication(I)) 6842 return false; 6843 6844 // If any of the instruction's operands are uniform after vectorization, 6845 // the instruction cannot be scalarized. This prevents, for example, a 6846 // masked load from being scalarized. 6847 // 6848 // We assume we will only emit a value for lane zero of an instruction 6849 // marked uniform after vectorization, rather than VF identical values. 6850 // Thus, if we scalarize an instruction that uses a uniform, we would 6851 // create uses of values corresponding to the lanes we aren't emitting code 6852 // for. This behavior can be changed by allowing getScalarValue to clone 6853 // the lane zero values for uniforms rather than asserting. 6854 for (Use &U : I->operands()) 6855 if (auto *J = dyn_cast<Instruction>(U.get())) 6856 if (isUniformAfterVectorization(J, VF)) 6857 return false; 6858 6859 // Otherwise, we can scalarize the instruction. 6860 return true; 6861 }; 6862 6863 // Compute the expected cost discount from scalarizing the entire expression 6864 // feeding the predicated instruction. We currently only consider expressions 6865 // that are single-use instruction chains. 6866 Worklist.push_back(PredInst); 6867 while (!Worklist.empty()) { 6868 Instruction *I = Worklist.pop_back_val(); 6869 6870 // If we've already analyzed the instruction, there's nothing to do. 6871 if (ScalarCosts.find(I) != ScalarCosts.end()) 6872 continue; 6873 6874 // Compute the cost of the vector instruction. Note that this cost already 6875 // includes the scalarization overhead of the predicated instruction. 6876 InstructionCost VectorCost = getInstructionCost(I, VF).first; 6877 6878 // Compute the cost of the scalarized instruction. This cost is the cost of 6879 // the instruction as if it wasn't if-converted and instead remained in the 6880 // predicated block. We will scale this cost by block probability after 6881 // computing the scalarization overhead. 
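    // Purely illustrative numbers: with VF = 4, a vector cost of 12 and a
    // per-lane scalar cost of 2, the scalar estimate starts at 4 * 2 = 8;
    // the insert/extract/phi overhead below is then added and the sum is
    // divided by the predicated-block probability factor before being
    // compared against the vector cost.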
6882 InstructionCost ScalarCost = 6883 VF.getFixedValue() * 6884 getInstructionCost(I, ElementCount::getFixed(1)).first; 6885 6886 // Compute the scalarization overhead of needed insertelement instructions 6887 // and phi nodes. 6888 if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) { 6889 ScalarCost += TTI.getScalarizationOverhead( 6890 cast<VectorType>(ToVectorTy(I->getType(), VF)), 6891 APInt::getAllOnesValue(VF.getFixedValue()), true, false); 6892 ScalarCost += 6893 VF.getFixedValue() * 6894 TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput); 6895 } 6896 6897 // Compute the scalarization overhead of needed extractelement 6898 // instructions. For each of the instruction's operands, if the operand can 6899 // be scalarized, add it to the worklist; otherwise, account for the 6900 // overhead. 6901 for (Use &U : I->operands()) 6902 if (auto *J = dyn_cast<Instruction>(U.get())) { 6903 assert(VectorType::isValidElementType(J->getType()) && 6904 "Instruction has non-scalar type"); 6905 if (canBeScalarized(J)) 6906 Worklist.push_back(J); 6907 else if (needsExtract(J, VF)) { 6908 ScalarCost += TTI.getScalarizationOverhead( 6909 cast<VectorType>(ToVectorTy(J->getType(), VF)), 6910 APInt::getAllOnesValue(VF.getFixedValue()), false, true); 6911 } 6912 } 6913 6914 // Scale the total scalar cost by block probability. 6915 ScalarCost /= getReciprocalPredBlockProb(); 6916 6917 // Compute the discount. A non-negative discount means the vector version 6918 // of the instruction costs more, and scalarizing would be beneficial. 6919 Discount += VectorCost - ScalarCost; 6920 ScalarCosts[I] = ScalarCost; 6921 } 6922 6923 return *Discount.getValue(); 6924 } 6925 6926 LoopVectorizationCostModel::VectorizationCostTy 6927 LoopVectorizationCostModel::expectedCost( 6928 ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) { 6929 VectorizationCostTy Cost; 6930 6931 // For each block. 6932 for (BasicBlock *BB : TheLoop->blocks()) { 6933 VectorizationCostTy BlockCost; 6934 6935 // For each instruction in the old loop. 6936 for (Instruction &I : BB->instructionsWithoutDebug()) { 6937 // Skip ignored values. 6938 if (ValuesToIgnore.count(&I) || 6939 (VF.isVector() && VecValuesToIgnore.count(&I))) 6940 continue; 6941 6942 VectorizationCostTy C = getInstructionCost(&I, VF); 6943 6944 // Check if we should override the cost. 6945 if (C.first.isValid() && 6946 ForceTargetInstructionCost.getNumOccurrences() > 0) 6947 C.first = InstructionCost(ForceTargetInstructionCost); 6948 6949 // Keep a list of instructions with invalid costs. 6950 if (Invalid && !C.first.isValid()) 6951 Invalid->emplace_back(&I, VF); 6952 6953 BlockCost.first += C.first; 6954 BlockCost.second |= C.second; 6955 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first 6956 << " for VF " << VF << " For instruction: " << I 6957 << '\n'); 6958 } 6959 6960 // If we are vectorizing a predicated block, it will have been 6961 // if-converted. This means that the block's instructions (aside from 6962 // stores and instructions that may divide by zero) will now be 6963 // unconditionally executed. For the scalar case, we may not always execute 6964 // the predicated block, if it is an if-else block. Thus, scale the block's 6965 // cost by the probability of executing it. blockNeedsPredication from 6966 // Legal is used so as to not include all blocks in tail folded loops. 
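    // For example, assuming the usual reciprocal block probability of 2
    // (i.e. a predicated block is expected to execute on about half of the
    // iterations), a predicated block whose instructions sum to 10 only
    // contributes 5 to the scalar loop cost.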
6967 if (VF.isScalar() && Legal->blockNeedsPredication(BB)) 6968 BlockCost.first /= getReciprocalPredBlockProb(); 6969 6970 Cost.first += BlockCost.first; 6971 Cost.second |= BlockCost.second; 6972 } 6973 6974 return Cost; 6975 } 6976 6977 /// Gets Address Access SCEV after verifying that the access pattern 6978 /// is loop invariant except the induction variable dependence. 6979 /// 6980 /// This SCEV can be sent to the Target in order to estimate the address 6981 /// calculation cost. 6982 static const SCEV *getAddressAccessSCEV( 6983 Value *Ptr, 6984 LoopVectorizationLegality *Legal, 6985 PredicatedScalarEvolution &PSE, 6986 const Loop *TheLoop) { 6987 6988 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); 6989 if (!Gep) 6990 return nullptr; 6991 6992 // We are looking for a gep with all loop invariant indices except for one 6993 // which should be an induction variable. 6994 auto SE = PSE.getSE(); 6995 unsigned NumOperands = Gep->getNumOperands(); 6996 for (unsigned i = 1; i < NumOperands; ++i) { 6997 Value *Opd = Gep->getOperand(i); 6998 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 6999 !Legal->isInductionVariable(Opd)) 7000 return nullptr; 7001 } 7002 7003 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. 7004 return PSE.getSCEV(Ptr); 7005 } 7006 7007 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { 7008 return Legal->hasStride(I->getOperand(0)) || 7009 Legal->hasStride(I->getOperand(1)); 7010 } 7011 7012 InstructionCost 7013 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, 7014 ElementCount VF) { 7015 assert(VF.isVector() && 7016 "Scalarization cost of instruction implies vectorization."); 7017 if (VF.isScalable()) 7018 return InstructionCost::getInvalid(); 7019 7020 Type *ValTy = getLoadStoreType(I); 7021 auto SE = PSE.getSE(); 7022 7023 unsigned AS = getLoadStoreAddressSpace(I); 7024 Value *Ptr = getLoadStorePointerOperand(I); 7025 Type *PtrTy = ToVectorTy(Ptr->getType(), VF); 7026 7027 // Figure out whether the access is strided and get the stride value 7028 // if it's known in compile time 7029 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); 7030 7031 // Get the cost of the scalar memory instruction and address computation. 7032 InstructionCost Cost = 7033 VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); 7034 7035 // Don't pass *I here, since it is scalar but will actually be part of a 7036 // vectorized loop where the user of it is a vectorized instruction. 7037 const Align Alignment = getLoadStoreAlignment(I); 7038 Cost += VF.getKnownMinValue() * 7039 TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, 7040 AS, TTI::TCK_RecipThroughput); 7041 7042 // Get the overhead of the extractelement and insertelement instructions 7043 // we might create due to scalarization. 7044 Cost += getScalarizationOverhead(I, VF); 7045 7046 // If we have a predicated load/store, it will need extra i1 extracts and 7047 // conditional branches, but may not be executed for each vector lane. Scale 7048 // the cost by the probability of executing the predicated block. 
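  // E.g. for a conditional store scalarized at VF = 4, the per-lane address
  // and memory-op costs accumulated above are first scaled down by the
  // block-execution probability, and the cost of extracting the four i1 mask
  // bits plus an extra branch is then added on top (unless the emulated-mask
  // hack below pins the cost to a prohibitively large constant).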
7049 if (isPredicatedInst(I)) { 7050 Cost /= getReciprocalPredBlockProb(); 7051 7052 // Add the cost of an i1 extract and a branch 7053 auto *Vec_i1Ty = 7054 VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF); 7055 Cost += TTI.getScalarizationOverhead( 7056 Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()), 7057 /*Insert=*/false, /*Extract=*/true); 7058 Cost += TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput); 7059 7060 if (useEmulatedMaskMemRefHack(I)) 7061 // Artificially setting to a high enough value to practically disable 7062 // vectorization with such operations. 7063 Cost = 3000000; 7064 } 7065 7066 return Cost; 7067 } 7068 7069 InstructionCost 7070 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, 7071 ElementCount VF) { 7072 Type *ValTy = getLoadStoreType(I); 7073 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 7074 Value *Ptr = getLoadStorePointerOperand(I); 7075 unsigned AS = getLoadStoreAddressSpace(I); 7076 int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); 7077 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 7078 7079 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 7080 "Stride should be 1 or -1 for consecutive memory access"); 7081 const Align Alignment = getLoadStoreAlignment(I); 7082 InstructionCost Cost = 0; 7083 if (Legal->isMaskRequired(I)) 7084 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 7085 CostKind); 7086 else 7087 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 7088 CostKind, I); 7089 7090 bool Reverse = ConsecutiveStride < 0; 7091 if (Reverse) 7092 Cost += 7093 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0); 7094 return Cost; 7095 } 7096 7097 InstructionCost 7098 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, 7099 ElementCount VF) { 7100 assert(Legal->isUniformMemOp(*I)); 7101 7102 Type *ValTy = getLoadStoreType(I); 7103 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 7104 const Align Alignment = getLoadStoreAlignment(I); 7105 unsigned AS = getLoadStoreAddressSpace(I); 7106 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 7107 if (isa<LoadInst>(I)) { 7108 return TTI.getAddressComputationCost(ValTy) + 7109 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS, 7110 CostKind) + 7111 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); 7112 } 7113 StoreInst *SI = cast<StoreInst>(I); 7114 7115 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); 7116 return TTI.getAddressComputationCost(ValTy) + 7117 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, 7118 CostKind) + 7119 (isLoopInvariantStoreValue 7120 ? 
0
                 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
                                          VF.getKnownMinValue() - 1));
}

InstructionCost
LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
                                                 ElementCount VF) {
  Type *ValTy = getLoadStoreType(I);
  auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
  const Align Alignment = getLoadStoreAlignment(I);
  const Value *Ptr = getLoadStorePointerOperand(I);

  return TTI.getAddressComputationCost(VectorTy) +
         TTI.getGatherScatterOpCost(
             I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
             TargetTransformInfo::TCK_RecipThroughput, I);
}

InstructionCost
LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
                                                   ElementCount VF) {
  // TODO: Once we have support for interleaving with scalable vectors
  // we can calculate the cost properly here.
  if (VF.isScalable())
    return InstructionCost::getInvalid();

  Type *ValTy = getLoadStoreType(I);
  auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
  unsigned AS = getLoadStoreAddressSpace(I);

  auto Group = getInterleavedAccessGroup(I);
  assert(Group && "Fail to get an interleaved access group.");

  unsigned InterleaveFactor = Group->getFactor();
  auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);

  // Holds the indices of existing members in the interleaved group.
  SmallVector<unsigned, 4> Indices;
  for (unsigned IF = 0; IF < InterleaveFactor; IF++)
    if (Group->getMember(IF))
      Indices.push_back(IF);

  // Calculate the cost of the whole interleaved group.
  bool UseMaskForGaps =
      (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
      (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()));
  InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
      I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
      AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps);

  if (Group->isReverse()) {
    // TODO: Add support for reversed masked interleaved access.
    assert(!Legal->isMaskRequired(I) &&
           "Reverse masked interleaved access not supported.");
    Cost +=
        Group->getNumMembers() *
        TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0);
  }
  return Cost;
}

Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost(
    Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) {
  using namespace llvm::PatternMatch;
  // Early exit if there are no in-loop reductions.
  if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty))
    return None;
  auto *VectorTy = cast<VectorType>(Ty);

  // We are looking for a pattern of, and finding the minimal acceptable cost:
  //  reduce(mul(ext(A), ext(B))) or
  //  reduce(mul(A, B)) or
  //  reduce(ext(A)) or
  //  reduce(A).
  // The basic idea is that we walk down the tree, finding the root reduction
  // instruction in InLoopReductionImmediateChains. From there we find the
  // pattern of mul/ext and test the cost of the entire pattern against the
  // cost of its components. If the reduction cost is lower, then we return it
  // for the reduction instruction and 0 for the other instructions in the
  // pattern. If it is not, we return an invalid cost specifying that the
  // original cost method should be used.
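  // As a concrete (hypothetical) example, a dot-product style loop such as
  //   sum += (int)a[i] * (int)b[i];   // with i8 elements in a[] and b[]
  // appears here as add(mul(sext(A), sext(B))); targets with an extending
  // multiply-accumulate reduction can report a single cheaper cost for the
  // whole pattern via getExtendedAddReductionCost.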
7202 Instruction *RetI = I; 7203 if (match(RetI, m_ZExtOrSExt(m_Value()))) { 7204 if (!RetI->hasOneUser()) 7205 return None; 7206 RetI = RetI->user_back(); 7207 } 7208 if (match(RetI, m_Mul(m_Value(), m_Value())) && 7209 RetI->user_back()->getOpcode() == Instruction::Add) { 7210 if (!RetI->hasOneUser()) 7211 return None; 7212 RetI = RetI->user_back(); 7213 } 7214 7215 // Test if the found instruction is a reduction, and if not return an invalid 7216 // cost specifying the parent to use the original cost modelling. 7217 if (!InLoopReductionImmediateChains.count(RetI)) 7218 return None; 7219 7220 // Find the reduction this chain is a part of and calculate the basic cost of 7221 // the reduction on its own. 7222 Instruction *LastChain = InLoopReductionImmediateChains[RetI]; 7223 Instruction *ReductionPhi = LastChain; 7224 while (!isa<PHINode>(ReductionPhi)) 7225 ReductionPhi = InLoopReductionImmediateChains[ReductionPhi]; 7226 7227 const RecurrenceDescriptor &RdxDesc = 7228 Legal->getReductionVars()[cast<PHINode>(ReductionPhi)]; 7229 7230 InstructionCost BaseCost = TTI.getArithmeticReductionCost( 7231 RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind); 7232 7233 // If we're using ordered reductions then we can just return the base cost 7234 // here, since getArithmeticReductionCost calculates the full ordered 7235 // reduction cost when FP reassociation is not allowed. 7236 if (useOrderedReductions(RdxDesc)) 7237 return BaseCost; 7238 7239 // Get the operand that was not the reduction chain and match it to one of the 7240 // patterns, returning the better cost if it is found. 7241 Instruction *RedOp = RetI->getOperand(1) == LastChain 7242 ? dyn_cast<Instruction>(RetI->getOperand(0)) 7243 : dyn_cast<Instruction>(RetI->getOperand(1)); 7244 7245 VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy); 7246 7247 Instruction *Op0, *Op1; 7248 if (RedOp && 7249 match(RedOp, 7250 m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) && 7251 match(Op0, m_ZExtOrSExt(m_Value())) && 7252 Op0->getOpcode() == Op1->getOpcode() && 7253 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() && 7254 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) && 7255 (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) { 7256 7257 // Matched reduce(ext(mul(ext(A), ext(B))) 7258 // Note that the extend opcodes need to all match, or if A==B they will have 7259 // been converted to zext(mul(sext(A), sext(A))) as it is known positive, 7260 // which is equally fine. 7261 bool IsUnsigned = isa<ZExtInst>(Op0); 7262 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy); 7263 auto *MulType = VectorType::get(Op0->getType(), VectorTy); 7264 7265 InstructionCost ExtCost = 7266 TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType, 7267 TTI::CastContextHint::None, CostKind, Op0); 7268 InstructionCost MulCost = 7269 TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind); 7270 InstructionCost Ext2Cost = 7271 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType, 7272 TTI::CastContextHint::None, CostKind, RedOp); 7273 7274 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 7275 /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 7276 CostKind); 7277 7278 if (RedCost.isValid() && 7279 RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost) 7280 return I == RetI ? 
RedCost : 0; 7281 } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) && 7282 !TheLoop->isLoopInvariant(RedOp)) { 7283 // Matched reduce(ext(A)) 7284 bool IsUnsigned = isa<ZExtInst>(RedOp); 7285 auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy); 7286 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 7287 /*IsMLA=*/false, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 7288 CostKind); 7289 7290 InstructionCost ExtCost = 7291 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType, 7292 TTI::CastContextHint::None, CostKind, RedOp); 7293 if (RedCost.isValid() && RedCost < BaseCost + ExtCost) 7294 return I == RetI ? RedCost : 0; 7295 } else if (RedOp && 7296 match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) { 7297 if (match(Op0, m_ZExtOrSExt(m_Value())) && 7298 Op0->getOpcode() == Op1->getOpcode() && 7299 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() && 7300 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) { 7301 bool IsUnsigned = isa<ZExtInst>(Op0); 7302 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy); 7303 // Matched reduce(mul(ext, ext)) 7304 InstructionCost ExtCost = 7305 TTI.getCastInstrCost(Op0->getOpcode(), VectorTy, ExtType, 7306 TTI::CastContextHint::None, CostKind, Op0); 7307 InstructionCost MulCost = 7308 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 7309 7310 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 7311 /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 7312 CostKind); 7313 7314 if (RedCost.isValid() && RedCost < ExtCost * 2 + MulCost + BaseCost) 7315 return I == RetI ? RedCost : 0; 7316 } else if (!match(I, m_ZExtOrSExt(m_Value()))) { 7317 // Matched reduce(mul()) 7318 InstructionCost MulCost = 7319 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 7320 7321 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 7322 /*IsMLA=*/true, true, RdxDesc.getRecurrenceType(), VectorTy, 7323 CostKind); 7324 7325 if (RedCost.isValid() && RedCost < MulCost + BaseCost) 7326 return I == RetI ? RedCost : 0; 7327 } 7328 } 7329 7330 return I == RetI ? Optional<InstructionCost>(BaseCost) : None; 7331 } 7332 7333 InstructionCost 7334 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, 7335 ElementCount VF) { 7336 // Calculate scalar cost only. Vectorization cost should be ready at this 7337 // moment. 7338 if (VF.isScalar()) { 7339 Type *ValTy = getLoadStoreType(I); 7340 const Align Alignment = getLoadStoreAlignment(I); 7341 unsigned AS = getLoadStoreAddressSpace(I); 7342 7343 return TTI.getAddressComputationCost(ValTy) + 7344 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, 7345 TTI::TCK_RecipThroughput, I); 7346 } 7347 return getWideningCost(I, VF); 7348 } 7349 7350 LoopVectorizationCostModel::VectorizationCostTy 7351 LoopVectorizationCostModel::getInstructionCost(Instruction *I, 7352 ElementCount VF) { 7353 // If we know that this instruction will remain uniform, check the cost of 7354 // the scalar version. 7355 if (isUniformAfterVectorization(I, VF)) 7356 VF = ElementCount::getFixed(1); 7357 7358 if (VF.isVector() && isProfitableToScalarize(I, VF)) 7359 return VectorizationCostTy(InstsToScalarize[VF][I], false); 7360 7361 // Forced scalars do not have any scalarization overhead. 
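  // Their cost is simply the scalar instruction cost multiplied by the number
  // of lanes, e.g. a forced-scalar instruction with scalar cost 1 contributes
  // 4 at VF = 4, with no insert/extract overhead added.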
7362 auto ForcedScalar = ForcedScalars.find(VF); 7363 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) { 7364 auto InstSet = ForcedScalar->second; 7365 if (InstSet.count(I)) 7366 return VectorizationCostTy( 7367 (getInstructionCost(I, ElementCount::getFixed(1)).first * 7368 VF.getKnownMinValue()), 7369 false); 7370 } 7371 7372 Type *VectorTy; 7373 InstructionCost C = getInstructionCost(I, VF, VectorTy); 7374 7375 bool TypeNotScalarized = 7376 VF.isVector() && VectorTy->isVectorTy() && 7377 TTI.getNumberOfParts(VectorTy) < VF.getKnownMinValue(); 7378 return VectorizationCostTy(C, TypeNotScalarized); 7379 } 7380 7381 InstructionCost 7382 LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, 7383 ElementCount VF) const { 7384 7385 // There is no mechanism yet to create a scalable scalarization loop, 7386 // so this is currently Invalid. 7387 if (VF.isScalable()) 7388 return InstructionCost::getInvalid(); 7389 7390 if (VF.isScalar()) 7391 return 0; 7392 7393 InstructionCost Cost = 0; 7394 Type *RetTy = ToVectorTy(I->getType(), VF); 7395 if (!RetTy->isVoidTy() && 7396 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) 7397 Cost += TTI.getScalarizationOverhead( 7398 cast<VectorType>(RetTy), APInt::getAllOnesValue(VF.getKnownMinValue()), 7399 true, false); 7400 7401 // Some targets keep addresses scalar. 7402 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) 7403 return Cost; 7404 7405 // Some targets support efficient element stores. 7406 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore()) 7407 return Cost; 7408 7409 // Collect operands to consider. 7410 CallInst *CI = dyn_cast<CallInst>(I); 7411 Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands(); 7412 7413 // Skip operands that do not require extraction/scalarization and do not incur 7414 // any overhead. 7415 SmallVector<Type *> Tys; 7416 for (auto *V : filterExtractingOperands(Ops, VF)) 7417 Tys.push_back(MaybeVectorizeType(V->getType(), VF)); 7418 return Cost + TTI.getOperandsScalarizationOverhead( 7419 filterExtractingOperands(Ops, VF), Tys); 7420 } 7421 7422 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { 7423 if (VF.isScalar()) 7424 return; 7425 NumPredStores = 0; 7426 for (BasicBlock *BB : TheLoop->blocks()) { 7427 // For each instruction in the old loop. 7428 for (Instruction &I : *BB) { 7429 Value *Ptr = getLoadStorePointerOperand(&I); 7430 if (!Ptr) 7431 continue; 7432 7433 // TODO: We should generate better code and update the cost model for 7434 // predicated uniform stores. Today they are treated as any other 7435 // predicated store (see added test cases in 7436 // invariant-store-vectorization.ll). 7437 if (isa<StoreInst>(&I) && isScalarWithPredication(&I)) 7438 NumPredStores++; 7439 7440 if (Legal->isUniformMemOp(I)) { 7441 // TODO: Avoid replicating loads and stores instead of 7442 // relying on instcombine to remove them. 7443 // Load: Scalar load + broadcast 7444 // Store: Scalar store + isLoopInvariantStoreValue ? 
0 : extract 7445 InstructionCost Cost; 7446 if (isa<StoreInst>(&I) && VF.isScalable() && 7447 isLegalGatherOrScatter(&I)) { 7448 Cost = getGatherScatterCost(&I, VF); 7449 setWideningDecision(&I, VF, CM_GatherScatter, Cost); 7450 } else { 7451 assert((isa<LoadInst>(&I) || !VF.isScalable()) && 7452 "Cannot yet scalarize uniform stores"); 7453 Cost = getUniformMemOpCost(&I, VF); 7454 setWideningDecision(&I, VF, CM_Scalarize, Cost); 7455 } 7456 continue; 7457 } 7458 7459 // We assume that widening is the best solution when possible. 7460 if (memoryInstructionCanBeWidened(&I, VF)) { 7461 InstructionCost Cost = getConsecutiveMemOpCost(&I, VF); 7462 int ConsecutiveStride = 7463 Legal->isConsecutivePtr(getLoadStorePointerOperand(&I)); 7464 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 7465 "Expected consecutive stride."); 7466 InstWidening Decision = 7467 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; 7468 setWideningDecision(&I, VF, Decision, Cost); 7469 continue; 7470 } 7471 7472 // Choose between Interleaving, Gather/Scatter or Scalarization. 7473 InstructionCost InterleaveCost = InstructionCost::getInvalid(); 7474 unsigned NumAccesses = 1; 7475 if (isAccessInterleaved(&I)) { 7476 auto Group = getInterleavedAccessGroup(&I); 7477 assert(Group && "Fail to get an interleaved access group."); 7478 7479 // Make one decision for the whole group. 7480 if (getWideningDecision(&I, VF) != CM_Unknown) 7481 continue; 7482 7483 NumAccesses = Group->getNumMembers(); 7484 if (interleavedAccessCanBeWidened(&I, VF)) 7485 InterleaveCost = getInterleaveGroupCost(&I, VF); 7486 } 7487 7488 InstructionCost GatherScatterCost = 7489 isLegalGatherOrScatter(&I) 7490 ? getGatherScatterCost(&I, VF) * NumAccesses 7491 : InstructionCost::getInvalid(); 7492 7493 InstructionCost ScalarizationCost = 7494 getMemInstScalarizationCost(&I, VF) * NumAccesses; 7495 7496 // Choose better solution for the current VF, 7497 // write down this decision and use it during vectorization. 7498 InstructionCost Cost; 7499 InstWidening Decision; 7500 if (InterleaveCost <= GatherScatterCost && 7501 InterleaveCost < ScalarizationCost) { 7502 Decision = CM_Interleave; 7503 Cost = InterleaveCost; 7504 } else if (GatherScatterCost < ScalarizationCost) { 7505 Decision = CM_GatherScatter; 7506 Cost = GatherScatterCost; 7507 } else { 7508 Decision = CM_Scalarize; 7509 Cost = ScalarizationCost; 7510 } 7511 // If the instructions belongs to an interleave group, the whole group 7512 // receives the same decision. The whole group receives the cost, but 7513 // the cost will actually be assigned to one instruction. 7514 if (auto Group = getInterleavedAccessGroup(&I)) 7515 setWideningDecision(Group, VF, Decision, Cost); 7516 else 7517 setWideningDecision(&I, VF, Decision, Cost); 7518 } 7519 } 7520 7521 // Make sure that any load of address and any other address computation 7522 // remains scalar unless there is gather/scatter support. This avoids 7523 // inevitable extracts into address registers, and also has the benefit of 7524 // activating LSR more, since that pass can't optimize vectorized 7525 // addresses. 7526 if (TTI.prefersVectorizedAddressing()) 7527 return; 7528 7529 // Start with all scalar pointer uses. 
7530 SmallPtrSet<Instruction *, 8> AddrDefs; 7531 for (BasicBlock *BB : TheLoop->blocks()) 7532 for (Instruction &I : *BB) { 7533 Instruction *PtrDef = 7534 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 7535 if (PtrDef && TheLoop->contains(PtrDef) && 7536 getWideningDecision(&I, VF) != CM_GatherScatter) 7537 AddrDefs.insert(PtrDef); 7538 } 7539 7540 // Add all instructions used to generate the addresses. 7541 SmallVector<Instruction *, 4> Worklist; 7542 append_range(Worklist, AddrDefs); 7543 while (!Worklist.empty()) { 7544 Instruction *I = Worklist.pop_back_val(); 7545 for (auto &Op : I->operands()) 7546 if (auto *InstOp = dyn_cast<Instruction>(Op)) 7547 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) && 7548 AddrDefs.insert(InstOp).second) 7549 Worklist.push_back(InstOp); 7550 } 7551 7552 for (auto *I : AddrDefs) { 7553 if (isa<LoadInst>(I)) { 7554 // Setting the desired widening decision should ideally be handled in 7555 // by cost functions, but since this involves the task of finding out 7556 // if the loaded register is involved in an address computation, it is 7557 // instead changed here when we know this is the case. 7558 InstWidening Decision = getWideningDecision(I, VF); 7559 if (Decision == CM_Widen || Decision == CM_Widen_Reverse) 7560 // Scalarize a widened load of address. 7561 setWideningDecision( 7562 I, VF, CM_Scalarize, 7563 (VF.getKnownMinValue() * 7564 getMemoryInstructionCost(I, ElementCount::getFixed(1)))); 7565 else if (auto Group = getInterleavedAccessGroup(I)) { 7566 // Scalarize an interleave group of address loads. 7567 for (unsigned I = 0; I < Group->getFactor(); ++I) { 7568 if (Instruction *Member = Group->getMember(I)) 7569 setWideningDecision( 7570 Member, VF, CM_Scalarize, 7571 (VF.getKnownMinValue() * 7572 getMemoryInstructionCost(Member, ElementCount::getFixed(1)))); 7573 } 7574 } 7575 } else 7576 // Make sure I gets scalarized and a cost estimate without 7577 // scalarization overhead. 7578 ForcedScalars[VF].insert(I); 7579 } 7580 } 7581 7582 InstructionCost 7583 LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, 7584 Type *&VectorTy) { 7585 Type *RetTy = I->getType(); 7586 if (canTruncateToMinimalBitwidth(I, VF)) 7587 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); 7588 auto SE = PSE.getSE(); 7589 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 7590 7591 auto hasSingleCopyAfterVectorization = [this](Instruction *I, 7592 ElementCount VF) -> bool { 7593 if (VF.isScalar()) 7594 return true; 7595 7596 auto Scalarized = InstsToScalarize.find(VF); 7597 assert(Scalarized != InstsToScalarize.end() && 7598 "VF not yet analyzed for scalarization profitability"); 7599 return !Scalarized->second.count(I) && 7600 llvm::all_of(I->users(), [&](User *U) { 7601 auto *UI = cast<Instruction>(U); 7602 return !Scalarized->second.count(UI); 7603 }); 7604 }; 7605 (void) hasSingleCopyAfterVectorization; 7606 7607 if (isScalarAfterVectorization(I, VF)) { 7608 // With the exception of GEPs and PHIs, after scalarization there should 7609 // only be one copy of the instruction generated in the loop. This is 7610 // because the VF is either 1, or any instructions that need scalarizing 7611 // have already been dealt with by the the time we get here. As a result, 7612 // it means we don't have to multiply the instruction cost by VF. 
7613 assert(I->getOpcode() == Instruction::GetElementPtr || 7614 I->getOpcode() == Instruction::PHI || 7615 (I->getOpcode() == Instruction::BitCast && 7616 I->getType()->isPointerTy()) || 7617 hasSingleCopyAfterVectorization(I, VF)); 7618 VectorTy = RetTy; 7619 } else 7620 VectorTy = ToVectorTy(RetTy, VF); 7621 7622 // TODO: We need to estimate the cost of intrinsic calls. 7623 switch (I->getOpcode()) { 7624 case Instruction::GetElementPtr: 7625 // We mark this instruction as zero-cost because the cost of GEPs in 7626 // vectorized code depends on whether the corresponding memory instruction 7627 // is scalarized or not. Therefore, we handle GEPs with the memory 7628 // instruction cost. 7629 return 0; 7630 case Instruction::Br: { 7631 // In cases of scalarized and predicated instructions, there will be VF 7632 // predicated blocks in the vectorized loop. Each branch around these 7633 // blocks requires also an extract of its vector compare i1 element. 7634 bool ScalarPredicatedBB = false; 7635 BranchInst *BI = cast<BranchInst>(I); 7636 if (VF.isVector() && BI->isConditional() && 7637 (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) || 7638 PredicatedBBsAfterVectorization.count(BI->getSuccessor(1)))) 7639 ScalarPredicatedBB = true; 7640 7641 if (ScalarPredicatedBB) { 7642 // Not possible to scalarize scalable vector with predicated instructions. 7643 if (VF.isScalable()) 7644 return InstructionCost::getInvalid(); 7645 // Return cost for branches around scalarized and predicated blocks. 7646 auto *Vec_i1Ty = 7647 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); 7648 return ( 7649 TTI.getScalarizationOverhead( 7650 Vec_i1Ty, APInt::getAllOnesValue(VF.getFixedValue()), false, 7651 true) + 7652 (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue())); 7653 } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar()) 7654 // The back-edge branch will remain, as will all scalar branches. 7655 return TTI.getCFInstrCost(Instruction::Br, CostKind); 7656 else 7657 // This branch will be eliminated by if-conversion. 7658 return 0; 7659 // Note: We currently assume zero cost for an unconditional branch inside 7660 // a predicated block since it will become a fall-through, although we 7661 // may decide in the future to call TTI for all branches. 7662 } 7663 case Instruction::PHI: { 7664 auto *Phi = cast<PHINode>(I); 7665 7666 // First-order recurrences are replaced by vector shuffles inside the loop. 7667 // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. 7668 if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi)) 7669 return TTI.getShuffleCost( 7670 TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy), 7671 None, VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1)); 7672 7673 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 7674 // converted into select instructions. We require N - 1 selects per phi 7675 // node, where N is the number of incoming values. 
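    // For instance, a phi merging values from three predecessors is lowered
    // to two vector selects, so its cost here is twice the select cost at
    // this VF.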
    if (VF.isVector() && Phi->getParent() != TheLoop->getHeader())
      return (Phi->getNumIncomingValues() - 1) *
             TTI.getCmpSelInstrCost(
                 Instruction::Select, ToVectorTy(Phi->getType(), VF),
                 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
                 CmpInst::BAD_ICMP_PREDICATE, CostKind);

    return TTI.getCFInstrCost(Instruction::PHI, CostKind);
  }
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::URem:
  case Instruction::SRem:
    // If we have a predicated instruction, it may not be executed for each
    // vector lane. Get the scalarization cost and scale this amount by the
    // probability of executing the predicated block. If the instruction is not
    // predicated, we fall through to the next case.
    if (VF.isVector() && isScalarWithPredication(I)) {
      InstructionCost Cost = 0;

      // These instructions have a non-void type, so account for the phi nodes
      // that we will create. This cost is likely to be zero. The phi node
      // cost, if any, should be scaled by the block probability because it
      // models a copy at the end of each predicated block.
      Cost += VF.getKnownMinValue() *
              TTI.getCFInstrCost(Instruction::PHI, CostKind);

      // The cost of the non-predicated instruction.
      Cost += VF.getKnownMinValue() *
              TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind);

      // The cost of insertelement and extractelement instructions needed for
      // scalarization.
      Cost += getScalarizationOverhead(I, VF);

      // Scale the cost by the probability of executing the predicated blocks.
      // This assumes the predicated block for each vector lane is equally
      // likely.
      return Cost / getReciprocalPredBlockProb();
    }
    LLVM_FALLTHROUGH;
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    // Since we will replace the stride by 1, the multiplication should go
    // away.
    if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
      return 0;

    // Detect reduction patterns.
    if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
      return *RedCost;

    // Certain instructions can be cheaper to vectorize if they have a constant
    // second vector operand. One example of this is shifts on x86.
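    // E.g. "shl <4 x i32> %x, <i32 3, i32 3, i32 3, i32 3>" can map to an
    // immediate-form shift, whereas a variable per-lane shift amount may need
    // a more expensive sequence on some targets; getOperandInfo lets the
    // target make that distinction.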
7741 Value *Op2 = I->getOperand(1); 7742 TargetTransformInfo::OperandValueProperties Op2VP; 7743 TargetTransformInfo::OperandValueKind Op2VK = 7744 TTI.getOperandInfo(Op2, Op2VP); 7745 if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) 7746 Op2VK = TargetTransformInfo::OK_UniformValue; 7747 7748 SmallVector<const Value *, 4> Operands(I->operand_values()); 7749 return TTI.getArithmeticInstrCost( 7750 I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, 7751 Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); 7752 } 7753 case Instruction::FNeg: { 7754 return TTI.getArithmeticInstrCost( 7755 I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, 7756 TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None, 7757 TargetTransformInfo::OP_None, I->getOperand(0), I); 7758 } 7759 case Instruction::Select: { 7760 SelectInst *SI = cast<SelectInst>(I); 7761 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 7762 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 7763 7764 const Value *Op0, *Op1; 7765 using namespace llvm::PatternMatch; 7766 if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) || 7767 match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) { 7768 // select x, y, false --> x & y 7769 // select x, true, y --> x | y 7770 TTI::OperandValueProperties Op1VP = TTI::OP_None; 7771 TTI::OperandValueProperties Op2VP = TTI::OP_None; 7772 TTI::OperandValueKind Op1VK = TTI::getOperandInfo(Op0, Op1VP); 7773 TTI::OperandValueKind Op2VK = TTI::getOperandInfo(Op1, Op2VP); 7774 assert(Op0->getType()->getScalarSizeInBits() == 1 && 7775 Op1->getType()->getScalarSizeInBits() == 1); 7776 7777 SmallVector<const Value *, 2> Operands{Op0, Op1}; 7778 return TTI.getArithmeticInstrCost( 7779 match(I, m_LogicalOr()) ? 
Instruction::Or : Instruction::And, VectorTy, 7780 CostKind, Op1VK, Op2VK, Op1VP, Op2VP, Operands, I); 7781 } 7782 7783 Type *CondTy = SI->getCondition()->getType(); 7784 if (!ScalarCond) 7785 CondTy = VectorType::get(CondTy, VF); 7786 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, 7787 CmpInst::BAD_ICMP_PREDICATE, CostKind, I); 7788 } 7789 case Instruction::ICmp: 7790 case Instruction::FCmp: { 7791 Type *ValTy = I->getOperand(0)->getType(); 7792 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 7793 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 7794 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); 7795 VectorTy = ToVectorTy(ValTy, VF); 7796 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, 7797 CmpInst::BAD_ICMP_PREDICATE, CostKind, I); 7798 } 7799 case Instruction::Store: 7800 case Instruction::Load: { 7801 ElementCount Width = VF; 7802 if (Width.isVector()) { 7803 InstWidening Decision = getWideningDecision(I, Width); 7804 assert(Decision != CM_Unknown && 7805 "CM decision should be taken at this point"); 7806 if (Decision == CM_Scalarize) 7807 Width = ElementCount::getFixed(1); 7808 } 7809 VectorTy = ToVectorTy(getLoadStoreType(I), Width); 7810 return getMemoryInstructionCost(I, VF); 7811 } 7812 case Instruction::BitCast: 7813 if (I->getType()->isPointerTy()) 7814 return 0; 7815 LLVM_FALLTHROUGH; 7816 case Instruction::ZExt: 7817 case Instruction::SExt: 7818 case Instruction::FPToUI: 7819 case Instruction::FPToSI: 7820 case Instruction::FPExt: 7821 case Instruction::PtrToInt: 7822 case Instruction::IntToPtr: 7823 case Instruction::SIToFP: 7824 case Instruction::UIToFP: 7825 case Instruction::Trunc: 7826 case Instruction::FPTrunc: { 7827 // Computes the CastContextHint from a Load/Store instruction. 7828 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint { 7829 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 7830 "Expected a load or a store!"); 7831 7832 if (VF.isScalar() || !TheLoop->contains(I)) 7833 return TTI::CastContextHint::Normal; 7834 7835 switch (getWideningDecision(I, VF)) { 7836 case LoopVectorizationCostModel::CM_GatherScatter: 7837 return TTI::CastContextHint::GatherScatter; 7838 case LoopVectorizationCostModel::CM_Interleave: 7839 return TTI::CastContextHint::Interleave; 7840 case LoopVectorizationCostModel::CM_Scalarize: 7841 case LoopVectorizationCostModel::CM_Widen: 7842 return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked 7843 : TTI::CastContextHint::Normal; 7844 case LoopVectorizationCostModel::CM_Widen_Reverse: 7845 return TTI::CastContextHint::Reversed; 7846 case LoopVectorizationCostModel::CM_Unknown: 7847 llvm_unreachable("Instr did not go through cost modelling?"); 7848 } 7849 7850 llvm_unreachable("Unhandled case!"); 7851 }; 7852 7853 unsigned Opcode = I->getOpcode(); 7854 TTI::CastContextHint CCH = TTI::CastContextHint::None; 7855 // For Trunc, the context is the only user, which must be a StoreInst. 7856 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) { 7857 if (I->hasOneUse()) 7858 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin())) 7859 CCH = ComputeCCH(Store); 7860 } 7861 // For Z/Sext, the context is the operand, which must be a LoadInst. 
7862 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || 7863 Opcode == Instruction::FPExt) { 7864 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0))) 7865 CCH = ComputeCCH(Load); 7866 } 7867 7868 // We optimize the truncation of induction variables having constant 7869 // integer steps. The cost of these truncations is the same as the scalar 7870 // operation. 7871 if (isOptimizableIVTruncate(I, VF)) { 7872 auto *Trunc = cast<TruncInst>(I); 7873 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 7874 Trunc->getSrcTy(), CCH, CostKind, Trunc); 7875 } 7876 7877 // Detect reduction patterns 7878 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7879 return *RedCost; 7880 7881 Type *SrcScalarTy = I->getOperand(0)->getType(); 7882 Type *SrcVecTy = 7883 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; 7884 if (canTruncateToMinimalBitwidth(I, VF)) { 7885 // This cast is going to be shrunk. This may remove the cast or it might 7886 // turn it into slightly different cast. For example, if MinBW == 16, 7887 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". 7888 // 7889 // Calculate the modified src and dest types. 7890 Type *MinVecTy = VectorTy; 7891 if (Opcode == Instruction::Trunc) { 7892 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); 7893 VectorTy = 7894 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7895 } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) { 7896 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); 7897 VectorTy = 7898 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7899 } 7900 } 7901 7902 return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); 7903 } 7904 case Instruction::Call: { 7905 bool NeedToScalarize; 7906 CallInst *CI = cast<CallInst>(I); 7907 InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize); 7908 if (getVectorIntrinsicIDForCall(CI, TLI)) { 7909 InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF); 7910 return std::min(CallCost, IntrinsicCost); 7911 } 7912 return CallCost; 7913 } 7914 case Instruction::ExtractValue: 7915 return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput); 7916 case Instruction::Alloca: 7917 // We cannot easily widen alloca to a scalable alloca, as 7918 // the result would need to be a vector of pointers. 7919 if (VF.isScalable()) 7920 return InstructionCost::getInvalid(); 7921 LLVM_FALLTHROUGH; 7922 default: 7923 // This opcode is unknown. Assume that it is the same as 'mul'. 7924 return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 7925 } // end of switch. 
7926 } 7927 7928 char LoopVectorize::ID = 0; 7929 7930 static const char lv_name[] = "Loop Vectorization"; 7931 7932 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 7933 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 7934 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) 7935 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 7936 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) 7937 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 7938 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 7939 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 7940 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 7941 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 7942 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) 7943 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) 7944 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 7945 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 7946 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy) 7947 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 7948 7949 namespace llvm { 7950 7951 Pass *createLoopVectorizePass() { return new LoopVectorize(); } 7952 7953 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced, 7954 bool VectorizeOnlyWhenForced) { 7955 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced); 7956 } 7957 7958 } // end namespace llvm 7959 7960 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { 7961 // Check if the pointer operand of a load or store instruction is 7962 // consecutive. 7963 if (auto *Ptr = getLoadStorePointerOperand(Inst)) 7964 return Legal->isConsecutivePtr(Ptr); 7965 return false; 7966 } 7967 7968 void LoopVectorizationCostModel::collectValuesToIgnore() { 7969 // Ignore ephemeral values. 7970 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); 7971 7972 // Ignore type-promoting instructions we identified during reduction 7973 // detection. 7974 for (auto &Reduction : Legal->getReductionVars()) { 7975 RecurrenceDescriptor &RedDes = Reduction.second; 7976 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); 7977 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7978 } 7979 // Ignore type-casting instructions we identified during induction 7980 // detection. 7981 for (auto &Induction : Legal->getInductionVars()) { 7982 InductionDescriptor &IndDes = Induction.second; 7983 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 7984 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7985 } 7986 } 7987 7988 void LoopVectorizationCostModel::collectInLoopReductions() { 7989 for (auto &Reduction : Legal->getReductionVars()) { 7990 PHINode *Phi = Reduction.first; 7991 RecurrenceDescriptor &RdxDesc = Reduction.second; 7992 7993 // We don't collect reductions that are type promoted (yet). 7994 if (RdxDesc.getRecurrenceType() != Phi->getType()) 7995 continue; 7996 7997 // If the target would prefer this reduction to happen "in-loop", then we 7998 // want to record it as such. 7999 unsigned Opcode = RdxDesc.getOpcode(); 8000 if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) && 8001 !TTI.preferInLoopReduction(Opcode, Phi->getType(), 8002 TargetTransformInfo::ReductionFlags())) 8003 continue; 8004 8005 // Check that we can correctly put the reductions into the loop, by 8006 // finding the chain of operations that leads from the phi to the loop 8007 // exit value. 
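    // For a simple integer sum reduction such as
    //   %sum = phi i32 [ 0, %preheader ], [ %add, %latch ]
    //   %add = add i32 %sum, %val
    // the chain is just { %add }; each link is recorded below in
    // InLoopReductionImmediateChains for use by the cost model.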
8008 SmallVector<Instruction *, 4> ReductionOperations = 8009 RdxDesc.getReductionOpChain(Phi, TheLoop); 8010 bool InLoop = !ReductionOperations.empty(); 8011 if (InLoop) { 8012 InLoopReductionChains[Phi] = ReductionOperations; 8013 // Add the elements to InLoopReductionImmediateChains for cost modelling. 8014 Instruction *LastChain = Phi; 8015 for (auto *I : ReductionOperations) { 8016 InLoopReductionImmediateChains[I] = LastChain; 8017 LastChain = I; 8018 } 8019 } 8020 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop") 8021 << " reduction for phi: " << *Phi << "\n"); 8022 } 8023 } 8024 8025 // TODO: we could return a pair of values that specify the max VF and 8026 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of 8027 // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment 8028 // doesn't have a cost model that can choose which plan to execute if 8029 // more than one is generated. 8030 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits, 8031 LoopVectorizationCostModel &CM) { 8032 unsigned WidestType; 8033 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes(); 8034 return WidestVectorRegBits / WidestType; 8035 } 8036 8037 VectorizationFactor 8038 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { 8039 assert(!UserVF.isScalable() && "scalable vectors not yet supported"); 8040 ElementCount VF = UserVF; 8041 // Outer loop handling: They may require CFG and instruction level 8042 // transformations before even evaluating whether vectorization is profitable. 8043 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 8044 // the vectorization pipeline. 8045 if (!OrigLoop->isInnermost()) { 8046 // If the user doesn't provide a vectorization factor, determine a 8047 // reasonable one. 8048 if (UserVF.isZero()) { 8049 VF = ElementCount::getFixed(determineVPlanVF( 8050 TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector) 8051 .getFixedSize(), 8052 CM)); 8053 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n"); 8054 8055 // Make sure we have a VF > 1 for stress testing. 8056 if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) { 8057 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: " 8058 << "overriding computed VF.\n"); 8059 VF = ElementCount::getFixed(4); 8060 } 8061 } 8062 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 8063 assert(isPowerOf2_32(VF.getKnownMinValue()) && 8064 "VF needs to be a power of two"); 8065 LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "") 8066 << "VF " << VF << " to build VPlans.\n"); 8067 buildVPlans(VF, VF); 8068 8069 // For VPlan build stress testing, we bail out after VPlan construction. 8070 if (VPlanBuildStressTest) 8071 return VectorizationFactor::Disabled(); 8072 8073 return {VF, 0 /*Cost*/}; 8074 } 8075 8076 LLVM_DEBUG( 8077 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the " 8078 "VPlan-native path.\n"); 8079 return VectorizationFactor::Disabled(); 8080 } 8081 8082 Optional<VectorizationFactor> 8083 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { 8084 assert(OrigLoop->isInnermost() && "Inner loop expected."); 8085 FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC); 8086 if (!MaxFactors) // Cases that should not to be vectorized nor interleaved. 8087 return None; 8088 8089 // Invalidate interleave groups if all blocks of loop will be predicated. 
8090 if (CM.blockNeedsPredication(OrigLoop->getHeader()) && 8091 !useMaskedInterleavedAccesses(*TTI)) { 8092 LLVM_DEBUG( 8093 dbgs() 8094 << "LV: Invalidate all interleaved groups due to fold-tail by masking " 8095 "which requires masked-interleaved support.\n"); 8096 if (CM.InterleaveInfo.invalidateGroups()) 8097 // Invalidating interleave groups also requires invalidating all decisions 8098 // based on them, which includes widening decisions and uniform and scalar 8099 // values. 8100 CM.invalidateCostModelingDecisions(); 8101 } 8102 8103 ElementCount MaxUserVF = 8104 UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF; 8105 bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF); 8106 if (!UserVF.isZero() && UserVFIsLegal) { 8107 assert(isPowerOf2_32(UserVF.getKnownMinValue()) && 8108 "VF needs to be a power of two"); 8109 // Collect the instructions (and their associated costs) that will be more 8110 // profitable to scalarize. 8111 if (CM.selectUserVectorizationFactor(UserVF)) { 8112 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); 8113 CM.collectInLoopReductions(); 8114 buildVPlansWithVPRecipes(UserVF, UserVF); 8115 LLVM_DEBUG(printPlans(dbgs())); 8116 return {{UserVF, 0}}; 8117 } else 8118 reportVectorizationInfo("UserVF ignored because of invalid costs.", 8119 "InvalidCost", ORE, OrigLoop); 8120 } 8121 8122 // Populate the set of Vectorization Factor Candidates. 8123 ElementCountSet VFCandidates; 8124 for (auto VF = ElementCount::getFixed(1); 8125 ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2) 8126 VFCandidates.insert(VF); 8127 for (auto VF = ElementCount::getScalable(1); 8128 ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2) 8129 VFCandidates.insert(VF); 8130 8131 for (const auto &VF : VFCandidates) { 8132 // Collect Uniform and Scalar instructions after vectorization with VF. 8133 CM.collectUniformsAndScalars(VF); 8134 8135 // Collect the instructions (and their associated costs) that will be more 8136 // profitable to scalarize. 8137 if (VF.isVector()) 8138 CM.collectInstsToScalarize(VF); 8139 } 8140 8141 CM.collectInLoopReductions(); 8142 buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF); 8143 buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF); 8144 8145 LLVM_DEBUG(printPlans(dbgs())); 8146 if (!MaxFactors.hasVector()) 8147 return VectorizationFactor::Disabled(); 8148 8149 // Select the optimal vectorization factor. 8150 auto SelectedVF = CM.selectVectorizationFactor(VFCandidates); 8151 8152 // Check if it is profitable to vectorize with runtime checks. 
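  // The heuristic below gives up when the number of runtime pointer checks
  // exceeds a threshold (a pragma-specific limit, or the LoopAccessAnalysis
  // default unless reordering was explicitly allowed), since a long sequence
  // of checks can outweigh the benefit of the vector loop, particularly for
  // short trip counts.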
8153 unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks(); 8154 if (SelectedVF.Width.getKnownMinValue() > 1 && NumRuntimePointerChecks) { 8155 bool PragmaThresholdReached = 8156 NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold; 8157 bool ThresholdReached = 8158 NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold; 8159 if ((ThresholdReached && !Hints.allowReordering()) || 8160 PragmaThresholdReached) { 8161 ORE->emit([&]() { 8162 return OptimizationRemarkAnalysisAliasing( 8163 DEBUG_TYPE, "CantReorderMemOps", OrigLoop->getStartLoc(), 8164 OrigLoop->getHeader()) 8165 << "loop not vectorized: cannot prove it is safe to reorder " 8166 "memory operations"; 8167 }); 8168 LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n"); 8169 Hints.emitRemarkWithHints(); 8170 return VectorizationFactor::Disabled(); 8171 } 8172 } 8173 return SelectedVF; 8174 } 8175 8176 void LoopVectorizationPlanner::setBestPlan(ElementCount VF, unsigned UF) { 8177 LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF 8178 << '\n'); 8179 BestVF = VF; 8180 BestUF = UF; 8181 8182 erase_if(VPlans, [VF](const VPlanPtr &Plan) { 8183 return !Plan->hasVF(VF); 8184 }); 8185 assert(VPlans.size() == 1 && "Best VF has not a single VPlan."); 8186 } 8187 8188 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV, 8189 DominatorTree *DT) { 8190 // Perform the actual loop transformation. 8191 8192 // 1. Create a new empty loop. Unlink the old loop and connect the new one. 8193 assert(BestVF.hasValue() && "Vectorization Factor is missing"); 8194 assert(VPlans.size() == 1 && "Not a single VPlan to execute."); 8195 8196 VPTransformState State{ 8197 *BestVF, BestUF, LI, DT, ILV.Builder, &ILV, VPlans.front().get()}; 8198 State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(); 8199 State.TripCount = ILV.getOrCreateTripCount(nullptr); 8200 State.CanonicalIV = ILV.Induction; 8201 8202 ILV.printDebugTracesAtStart(); 8203 8204 //===------------------------------------------------===// 8205 // 8206 // Notice: any optimization or new instruction that go 8207 // into the code below should also be implemented in 8208 // the cost-model. 8209 // 8210 //===------------------------------------------------===// 8211 8212 // 2. Copy and widen instructions from the old loop into the new loop. 8213 VPlans.front()->execute(&State); 8214 8215 // 3. Fix the vectorized code: take care of header phi's, live-outs, 8216 // predication, updating analyses. 
8217 ILV.fixVectorizedLoop(State); 8218 8219 ILV.printDebugTracesAtEnd(); 8220 } 8221 8222 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 8223 void LoopVectorizationPlanner::printPlans(raw_ostream &O) { 8224 for (const auto &Plan : VPlans) 8225 if (PrintVPlansInDotFormat) 8226 Plan->printDOT(O); 8227 else 8228 Plan->print(O); 8229 } 8230 #endif 8231 8232 void LoopVectorizationPlanner::collectTriviallyDeadInstructions( 8233 SmallPtrSetImpl<Instruction *> &DeadInstructions) { 8234 8235 // We create new control-flow for the vectorized loop, so the original exit 8236 // conditions will be dead after vectorization if it's only used by the 8237 // terminator 8238 SmallVector<BasicBlock*> ExitingBlocks; 8239 OrigLoop->getExitingBlocks(ExitingBlocks); 8240 for (auto *BB : ExitingBlocks) { 8241 auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0)); 8242 if (!Cmp || !Cmp->hasOneUse()) 8243 continue; 8244 8245 // TODO: we should introduce a getUniqueExitingBlocks on Loop 8246 if (!DeadInstructions.insert(Cmp).second) 8247 continue; 8248 8249 // The operands of the icmp is often a dead trunc, used by IndUpdate. 8250 // TODO: can recurse through operands in general 8251 for (Value *Op : Cmp->operands()) { 8252 if (isa<TruncInst>(Op) && Op->hasOneUse()) 8253 DeadInstructions.insert(cast<Instruction>(Op)); 8254 } 8255 } 8256 8257 // We create new "steps" for induction variable updates to which the original 8258 // induction variables map. An original update instruction will be dead if 8259 // all its users except the induction variable are dead. 8260 auto *Latch = OrigLoop->getLoopLatch(); 8261 for (auto &Induction : Legal->getInductionVars()) { 8262 PHINode *Ind = Induction.first; 8263 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 8264 8265 // If the tail is to be folded by masking, the primary induction variable, 8266 // if exists, isn't dead: it will be used for masking. Don't kill it. 8267 if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction()) 8268 continue; 8269 8270 if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 8271 return U == Ind || DeadInstructions.count(cast<Instruction>(U)); 8272 })) 8273 DeadInstructions.insert(IndUpdate); 8274 8275 // We record as "Dead" also the type-casting instructions we had identified 8276 // during induction analysis. We don't need any handling for them in the 8277 // vectorized loop because we have proven that, under a proper runtime 8278 // test guarding the vectorized loop, the value of the phi, and the casted 8279 // value of the phi, are the same. The last instruction in this casting chain 8280 // will get its scalar/vector/widened def from the scalar/vector/widened def 8281 // of the respective phi node. Any other casts in the induction def-use chain 8282 // have no other uses outside the phi update chain, and will be ignored. 8283 InductionDescriptor &IndDes = Induction.second; 8284 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 8285 DeadInstructions.insert(Casts.begin(), Casts.end()); 8286 } 8287 } 8288 8289 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; } 8290 8291 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } 8292 8293 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step, 8294 Instruction::BinaryOps BinOp) { 8295 // When unrolling and the VF is 1, we only need to add a simple scalar. 
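  // For illustration (not part of the original comment): with VF = 1, UF = 4
  // and an induction step S, the value built here for StartIdx = 2 is simply
  //   Val + 2 * S
  // (or the equivalent floating-point expression using the supplied BinOp),
  // rather than a step *vector* as in the widened VF > 1 case.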
8296 Type *Ty = Val->getType(); 8297 assert(!Ty->isVectorTy() && "Val must be a scalar"); 8298 8299 if (Ty->isFloatingPointTy()) { 8300 Constant *C = ConstantFP::get(Ty, (double)StartIdx); 8301 8302 // Floating-point operations inherit FMF via the builder's flags. 8303 Value *MulOp = Builder.CreateFMul(C, Step); 8304 return Builder.CreateBinOp(BinOp, Val, MulOp); 8305 } 8306 Constant *C = ConstantInt::get(Ty, StartIdx); 8307 return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction"); 8308 } 8309 8310 static void AddRuntimeUnrollDisableMetaData(Loop *L) { 8311 SmallVector<Metadata *, 4> MDs; 8312 // Reserve first location for self reference to the LoopID metadata node. 8313 MDs.push_back(nullptr); 8314 bool IsUnrollMetadata = false; 8315 MDNode *LoopID = L->getLoopID(); 8316 if (LoopID) { 8317 // First find existing loop unrolling disable metadata. 8318 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { 8319 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); 8320 if (MD) { 8321 const auto *S = dyn_cast<MDString>(MD->getOperand(0)); 8322 IsUnrollMetadata = 8323 S && S->getString().startswith("llvm.loop.unroll.disable"); 8324 } 8325 MDs.push_back(LoopID->getOperand(i)); 8326 } 8327 } 8328 8329 if (!IsUnrollMetadata) { 8330 // Add runtime unroll disable metadata. 8331 LLVMContext &Context = L->getHeader()->getContext(); 8332 SmallVector<Metadata *, 1> DisableOperands; 8333 DisableOperands.push_back( 8334 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 8335 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 8336 MDs.push_back(DisableNode); 8337 MDNode *NewLoopID = MDNode::get(Context, MDs); 8338 // Set operand 0 to refer to the loop id itself. 8339 NewLoopID->replaceOperandWith(0, NewLoopID); 8340 L->setLoopID(NewLoopID); 8341 } 8342 } 8343 8344 //===--------------------------------------------------------------------===// 8345 // EpilogueVectorizerMainLoop 8346 //===--------------------------------------------------------------------===// 8347 8348 /// This function is partially responsible for generating the control flow 8349 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 8350 BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { 8351 MDNode *OrigLoopID = OrigLoop->getLoopID(); 8352 Loop *Lp = createVectorLoopSkeleton(""); 8353 8354 // Generate the code to check the minimum iteration count of the vector 8355 // epilogue (see below). 8356 EPI.EpilogueIterationCountCheck = 8357 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, true); 8358 EPI.EpilogueIterationCountCheck->setName("iter.check"); 8359 8360 // Generate the code to check any assumptions that we've made for SCEV 8361 // expressions. 8362 EPI.SCEVSafetyCheck = emitSCEVChecks(Lp, LoopScalarPreHeader); 8363 8364 // Generate the code that checks at runtime if arrays overlap. We put the 8365 // checks into a separate block to make the more common case of few elements 8366 // faster. 8367 EPI.MemSafetyCheck = emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 8368 8369 // Generate the iteration count check for the main loop, *after* the check 8370 // for the epilogue loop, so that the path-length is shorter for the case 8371 // that goes directly through the vector epilogue. The longer-path length for 8372 // the main loop is compensated for, by the gain from vectorizing the larger 8373 // trip count. Note: the branch will get updated later on when we vectorize 8374 // the epilogue. 
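  // Illustrative numbers only (not from the source): with a main loop of
  // VF * UF = 16 and an epilogue of VF * UF = 4, a trip count of 7 is not too
  // small for the epilogue (7 >= 4), so the "iter.check" above falls through,
  // but it is too small for the main vector loop (7 < 16), so the check
  // emitted below branches away from it; once the second pass retargets that
  // branch, such trip counts reach the vector epilogue directly after only
  // two compares.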
8375 EPI.MainLoopIterationCountCheck = 8376 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false); 8377 8378 // Generate the induction variable. 8379 OldInduction = Legal->getPrimaryInduction(); 8380 Type *IdxTy = Legal->getWidestInductionType(); 8381 Value *StartIdx = ConstantInt::get(IdxTy, 0); 8382 Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF); 8383 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 8384 EPI.VectorTripCount = CountRoundDown; 8385 Induction = 8386 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 8387 getDebugLocFromInstOrOperands(OldInduction)); 8388 8389 // Skip induction resume value creation here because they will be created in 8390 // the second pass. If we created them here, they wouldn't be used anyway, 8391 // because the vplan in the second pass still contains the inductions from the 8392 // original loop. 8393 8394 return completeLoopSkeleton(Lp, OrigLoopID); 8395 } 8396 8397 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { 8398 LLVM_DEBUG({ 8399 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n" 8400 << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue() 8401 << ", Main Loop UF:" << EPI.MainLoopUF 8402 << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue() 8403 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 8404 }); 8405 } 8406 8407 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() { 8408 DEBUG_WITH_TYPE(VerboseDebug, { 8409 dbgs() << "intermediate fn:\n" << *Induction->getFunction() << "\n"; 8410 }); 8411 } 8412 8413 BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck( 8414 Loop *L, BasicBlock *Bypass, bool ForEpilogue) { 8415 assert(L && "Expected valid Loop."); 8416 assert(Bypass && "Expected valid bypass basic block."); 8417 unsigned VFactor = 8418 ForEpilogue ? EPI.EpilogueVF.getKnownMinValue() : VF.getKnownMinValue(); 8419 unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF; 8420 Value *Count = getOrCreateTripCount(L); 8421 // Reuse existing vector loop preheader for TC checks. 8422 // Note that new preheader block is generated for vector loop. 8423 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 8424 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 8425 8426 // Generate code to check if the loop's trip count is less than VF * UF of the 8427 // main vector loop. 8428 auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF : VF) ? 8429 ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 8430 8431 Value *CheckMinIters = Builder.CreateICmp( 8432 P, Count, ConstantInt::get(Count->getType(), VFactor * UFactor), 8433 "min.iters.check"); 8434 8435 if (!ForEpilogue) 8436 TCCheckBlock->setName("vector.main.loop.iter.check"); 8437 8438 // Create new preheader for vector loop. 8439 LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), 8440 DT, LI, nullptr, "vector.ph"); 8441 8442 if (ForEpilogue) { 8443 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 8444 DT->getNode(Bypass)->getIDom()) && 8445 "TC check is expected to dominate Bypass"); 8446 8447 // Update dominator for Bypass & LoopExit. 8448 DT->changeImmediateDominator(Bypass, TCCheckBlock); 8449 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF)) 8450 // For loops with multiple exits, there's no edge from the middle block 8451 // to exit blocks (as the epilogue must run) and thus no need to update 8452 // the immediate dominator of the exit blocks. 
8453 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 8454 8455 LoopBypassBlocks.push_back(TCCheckBlock); 8456 8457 // Save the trip count so we don't have to regenerate it in the 8458 // vec.epilog.iter.check. This is safe to do because the trip count 8459 // generated here dominates the vector epilog iter check. 8460 EPI.TripCount = Count; 8461 } 8462 8463 ReplaceInstWithInst( 8464 TCCheckBlock->getTerminator(), 8465 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 8466 8467 return TCCheckBlock; 8468 } 8469 8470 //===--------------------------------------------------------------------===// 8471 // EpilogueVectorizerEpilogueLoop 8472 //===--------------------------------------------------------------------===// 8473 8474 /// This function is partially responsible for generating the control flow 8475 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 8476 BasicBlock * 8477 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { 8478 MDNode *OrigLoopID = OrigLoop->getLoopID(); 8479 Loop *Lp = createVectorLoopSkeleton("vec.epilog."); 8480 8481 // Now, compare the remaining count and if there aren't enough iterations to 8482 // execute the vectorized epilogue skip to the scalar part. 8483 BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader; 8484 VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check"); 8485 LoopVectorPreHeader = 8486 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 8487 LI, nullptr, "vec.epilog.ph"); 8488 emitMinimumVectorEpilogueIterCountCheck(Lp, LoopScalarPreHeader, 8489 VecEpilogueIterationCountCheck); 8490 8491 // Adjust the control flow taking the state info from the main loop 8492 // vectorization into account. 8493 assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck && 8494 "expected this to be saved from the previous pass."); 8495 EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith( 8496 VecEpilogueIterationCountCheck, LoopVectorPreHeader); 8497 8498 DT->changeImmediateDominator(LoopVectorPreHeader, 8499 EPI.MainLoopIterationCountCheck); 8500 8501 EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith( 8502 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8503 8504 if (EPI.SCEVSafetyCheck) 8505 EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith( 8506 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8507 if (EPI.MemSafetyCheck) 8508 EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith( 8509 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8510 8511 DT->changeImmediateDominator( 8512 VecEpilogueIterationCountCheck, 8513 VecEpilogueIterationCountCheck->getSinglePredecessor()); 8514 8515 DT->changeImmediateDominator(LoopScalarPreHeader, 8516 EPI.EpilogueIterationCountCheck); 8517 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF)) 8518 // If there is an epilogue which must run, there's no edge from the 8519 // middle block to exit blocks and thus no need to update the immediate 8520 // dominator of the exit blocks. 8521 DT->changeImmediateDominator(LoopExitBlock, 8522 EPI.EpilogueIterationCountCheck); 8523 8524 // Keep track of bypass blocks, as they feed start values to the induction 8525 // phis in the scalar loop preheader. 
8526   if (EPI.SCEVSafetyCheck)
8527     LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck);
8528   if (EPI.MemSafetyCheck)
8529     LoopBypassBlocks.push_back(EPI.MemSafetyCheck);
8530   LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck);
8531
8532   // Generate a resume induction for the vector epilogue and put it in the
8533   // vector epilogue preheader.
8534   Type *IdxTy = Legal->getWidestInductionType();
8535   PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val",
8536                                          LoopVectorPreHeader->getFirstNonPHI());
8537   EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
8538   EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
8539                            EPI.MainLoopIterationCountCheck);
8540
8541   // Generate the induction variable.
8542   OldInduction = Legal->getPrimaryInduction();
8543   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
8544   Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF);
8545   Value *StartIdx = EPResumeVal;
8546   Induction =
8547       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
8548                               getDebugLocFromInstOrOperands(OldInduction));
8549
8550   // Generate induction resume values. These variables save the new starting
8551   // indexes for the scalar loop. They are used to test if there are any tail
8552   // iterations left once the vector loop has completed.
8553   // Note that when the vectorized epilogue is skipped due to the iteration
8554   // count check, the resume value for the induction variable comes from
8555   // the trip count of the main vector loop, hence passing the AdditionalBypass
8556   // argument.
8557   createInductionResumeValues(Lp, CountRoundDown,
8558                               {VecEpilogueIterationCountCheck,
8559                                EPI.VectorTripCount} /* AdditionalBypass */);
8560
8561   AddRuntimeUnrollDisableMetaData(Lp);
8562   return completeLoopSkeleton(Lp, OrigLoopID);
8563 }
8564
8565 BasicBlock *
8566 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
8567     Loop *L, BasicBlock *Bypass, BasicBlock *Insert) {
8568
8569   assert(EPI.TripCount &&
8570          "Expected trip count to have been saved in the first pass.");
8571   assert(
8572       (!isa<Instruction>(EPI.TripCount) ||
8573        DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
8574       "saved trip count does not dominate insertion point.");
8575   Value *TC = EPI.TripCount;
8576   IRBuilder<> Builder(Insert->getTerminator());
8577   Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");
8578
8579   // Generate code to check if the remaining iteration count is less than
8580   // VF * UF of the vector epilogue loop.
8581   auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF) ?
8582 ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 8583 8584 Value *CheckMinIters = Builder.CreateICmp( 8585 P, Count, 8586 ConstantInt::get(Count->getType(), 8587 EPI.EpilogueVF.getKnownMinValue() * EPI.EpilogueUF), 8588 "min.epilog.iters.check"); 8589 8590 ReplaceInstWithInst( 8591 Insert->getTerminator(), 8592 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 8593 8594 LoopBypassBlocks.push_back(Insert); 8595 return Insert; 8596 } 8597 8598 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() { 8599 LLVM_DEBUG({ 8600 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n" 8601 << "Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue() 8602 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 8603 }); 8604 } 8605 8606 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() { 8607 DEBUG_WITH_TYPE(VerboseDebug, { 8608 dbgs() << "final fn:\n" << *Induction->getFunction() << "\n"; 8609 }); 8610 } 8611 8612 bool LoopVectorizationPlanner::getDecisionAndClampRange( 8613 const std::function<bool(ElementCount)> &Predicate, VFRange &Range) { 8614 assert(!Range.isEmpty() && "Trying to test an empty VF range."); 8615 bool PredicateAtRangeStart = Predicate(Range.Start); 8616 8617 for (ElementCount TmpVF = Range.Start * 2; 8618 ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2) 8619 if (Predicate(TmpVF) != PredicateAtRangeStart) { 8620 Range.End = TmpVF; 8621 break; 8622 } 8623 8624 return PredicateAtRangeStart; 8625 } 8626 8627 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, 8628 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range 8629 /// of VF's starting at a given VF and extending it as much as possible. Each 8630 /// vectorization decision can potentially shorten this sub-range during 8631 /// buildVPlan(). 8632 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF, 8633 ElementCount MaxVF) { 8634 auto MaxVFPlusOne = MaxVF.getWithIncrement(1); 8635 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { 8636 VFRange SubRange = {VF, MaxVFPlusOne}; 8637 VPlans.push_back(buildVPlan(SubRange)); 8638 VF = SubRange.End; 8639 } 8640 } 8641 8642 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, 8643 VPlanPtr &Plan) { 8644 assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); 8645 8646 // Look for cached value. 8647 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 8648 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); 8649 if (ECEntryIt != EdgeMaskCache.end()) 8650 return ECEntryIt->second; 8651 8652 VPValue *SrcMask = createBlockInMask(Src, Plan); 8653 8654 // The terminator has to be a branch inst! 8655 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 8656 assert(BI && "Unexpected terminator found"); 8657 8658 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) 8659 return EdgeMaskCache[Edge] = SrcMask; 8660 8661 // If source is an exiting block, we know the exit edge is dynamically dead 8662 // in the vector loop, and thus we don't need to restrict the mask. Avoid 8663 // adding uses of an otherwise potentially dead instruction. 
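  // For example (illustrative IR, names invented): if Src ends in
  //   br i1 %exit.cond, label %loop.exit, label %Dst
  // the in-loop edge to %Dst is the only one ever taken inside the vector
  // loop, so reusing SrcMask below avoids creating a recipe use of
  // %exit.cond.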
8664 if (OrigLoop->isLoopExiting(Src)) 8665 return EdgeMaskCache[Edge] = SrcMask; 8666 8667 VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition()); 8668 assert(EdgeMask && "No Edge Mask found for condition"); 8669 8670 if (BI->getSuccessor(0) != Dst) 8671 EdgeMask = Builder.createNot(EdgeMask); 8672 8673 if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND. 8674 // The condition is 'SrcMask && EdgeMask', which is equivalent to 8675 // 'select i1 SrcMask, i1 EdgeMask, i1 false'. 8676 // The select version does not introduce new UB if SrcMask is false and 8677 // EdgeMask is poison. Using 'and' here introduces undefined behavior. 8678 VPValue *False = Plan->getOrAddVPValue( 8679 ConstantInt::getFalse(BI->getCondition()->getType())); 8680 EdgeMask = Builder.createSelect(SrcMask, EdgeMask, False); 8681 } 8682 8683 return EdgeMaskCache[Edge] = EdgeMask; 8684 } 8685 8686 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { 8687 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 8688 8689 // Look for cached value. 8690 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); 8691 if (BCEntryIt != BlockMaskCache.end()) 8692 return BCEntryIt->second; 8693 8694 // All-one mask is modelled as no-mask following the convention for masked 8695 // load/store/gather/scatter. Initialize BlockMask to no-mask. 8696 VPValue *BlockMask = nullptr; 8697 8698 if (OrigLoop->getHeader() == BB) { 8699 if (!CM.blockNeedsPredication(BB)) 8700 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. 8701 8702 // Create the block in mask as the first non-phi instruction in the block. 8703 VPBuilder::InsertPointGuard Guard(Builder); 8704 auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi(); 8705 Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint); 8706 8707 // Introduce the early-exit compare IV <= BTC to form header block mask. 8708 // This is used instead of IV < TC because TC may wrap, unlike BTC. 8709 // Start by constructing the desired canonical IV. 8710 VPValue *IV = nullptr; 8711 if (Legal->getPrimaryInduction()) 8712 IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction()); 8713 else { 8714 auto IVRecipe = new VPWidenCanonicalIVRecipe(); 8715 Builder.getInsertBlock()->insert(IVRecipe, NewInsertionPoint); 8716 IV = IVRecipe->getVPSingleValue(); 8717 } 8718 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); 8719 bool TailFolded = !CM.isScalarEpilogueAllowed(); 8720 8721 if (TailFolded && CM.TTI.emitGetActiveLaneMask()) { 8722 // While ActiveLaneMask is a binary op that consumes the loop tripcount 8723 // as a second argument, we only pass the IV here and extract the 8724 // tripcount from the transform state where codegen of the VP instructions 8725 // happen. 8726 BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV}); 8727 } else { 8728 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); 8729 } 8730 return BlockMaskCache[BB] = BlockMask; 8731 } 8732 8733 // This is the block mask. We OR all incoming edges. 8734 for (auto *Predecessor : predecessors(BB)) { 8735 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); 8736 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too. 8737 return BlockMaskCache[BB] = EdgeMask; 8738 8739 if (!BlockMask) { // BlockMask has its initialized nullptr value. 
8740 BlockMask = EdgeMask; 8741 continue; 8742 } 8743 8744 BlockMask = Builder.createOr(BlockMask, EdgeMask); 8745 } 8746 8747 return BlockMaskCache[BB] = BlockMask; 8748 } 8749 8750 VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, 8751 ArrayRef<VPValue *> Operands, 8752 VFRange &Range, 8753 VPlanPtr &Plan) { 8754 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 8755 "Must be called with either a load or store"); 8756 8757 auto willWiden = [&](ElementCount VF) -> bool { 8758 if (VF.isScalar()) 8759 return false; 8760 LoopVectorizationCostModel::InstWidening Decision = 8761 CM.getWideningDecision(I, VF); 8762 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 8763 "CM decision should be taken at this point."); 8764 if (Decision == LoopVectorizationCostModel::CM_Interleave) 8765 return true; 8766 if (CM.isScalarAfterVectorization(I, VF) || 8767 CM.isProfitableToScalarize(I, VF)) 8768 return false; 8769 return Decision != LoopVectorizationCostModel::CM_Scalarize; 8770 }; 8771 8772 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8773 return nullptr; 8774 8775 VPValue *Mask = nullptr; 8776 if (Legal->isMaskRequired(I)) 8777 Mask = createBlockInMask(I->getParent(), Plan); 8778 8779 if (LoadInst *Load = dyn_cast<LoadInst>(I)) 8780 return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask); 8781 8782 StoreInst *Store = cast<StoreInst>(I); 8783 return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0], 8784 Mask); 8785 } 8786 8787 VPWidenIntOrFpInductionRecipe * 8788 VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi, 8789 ArrayRef<VPValue *> Operands) const { 8790 // Check if this is an integer or fp induction. If so, build the recipe that 8791 // produces its scalar and vector values. 8792 InductionDescriptor II = Legal->getInductionVars().lookup(Phi); 8793 if (II.getKind() == InductionDescriptor::IK_IntInduction || 8794 II.getKind() == InductionDescriptor::IK_FpInduction) { 8795 assert(II.getStartValue() == 8796 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); 8797 const SmallVectorImpl<Instruction *> &Casts = II.getCastInsts(); 8798 return new VPWidenIntOrFpInductionRecipe( 8799 Phi, Operands[0], Casts.empty() ? nullptr : Casts.front()); 8800 } 8801 8802 return nullptr; 8803 } 8804 8805 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate( 8806 TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range, 8807 VPlan &Plan) const { 8808 // Optimize the special case where the source is a constant integer 8809 // induction variable. Notice that we can only optimize the 'trunc' case 8810 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and 8811 // (c) other casts depend on pointer size. 8812 8813 // Determine whether \p K is a truncation based on an induction variable that 8814 // can be optimized. 
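  // Illustrative example (names invented): for
  //   %iv    = phi i64 [ 0, %preheader ], [ %iv.next, %latch ]
  //   %trunc = trunc i64 %iv to i32
  // the truncation can itself be modelled as an i32 induction with the same
  // start and step, which is what the VPWidenIntOrFpInductionRecipe built
  // below does, instead of widening %iv to i64 vectors and truncating every
  // element afterwards.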
8815 auto isOptimizableIVTruncate = 8816 [&](Instruction *K) -> std::function<bool(ElementCount)> { 8817 return [=](ElementCount VF) -> bool { 8818 return CM.isOptimizableIVTruncate(K, VF); 8819 }; 8820 }; 8821 8822 if (LoopVectorizationPlanner::getDecisionAndClampRange( 8823 isOptimizableIVTruncate(I), Range)) { 8824 8825 InductionDescriptor II = 8826 Legal->getInductionVars().lookup(cast<PHINode>(I->getOperand(0))); 8827 VPValue *Start = Plan.getOrAddVPValue(II.getStartValue()); 8828 return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)), 8829 Start, nullptr, I); 8830 } 8831 return nullptr; 8832 } 8833 8834 VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi, 8835 ArrayRef<VPValue *> Operands, 8836 VPlanPtr &Plan) { 8837 // If all incoming values are equal, the incoming VPValue can be used directly 8838 // instead of creating a new VPBlendRecipe. 8839 VPValue *FirstIncoming = Operands[0]; 8840 if (all_of(Operands, [FirstIncoming](const VPValue *Inc) { 8841 return FirstIncoming == Inc; 8842 })) { 8843 return Operands[0]; 8844 } 8845 8846 // We know that all PHIs in non-header blocks are converted into selects, so 8847 // we don't have to worry about the insertion order and we can just use the 8848 // builder. At this point we generate the predication tree. There may be 8849 // duplications since this is a simple recursive scan, but future 8850 // optimizations will clean it up. 8851 SmallVector<VPValue *, 2> OperandsWithMask; 8852 unsigned NumIncoming = Phi->getNumIncomingValues(); 8853 8854 for (unsigned In = 0; In < NumIncoming; In++) { 8855 VPValue *EdgeMask = 8856 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan); 8857 assert((EdgeMask || NumIncoming == 1) && 8858 "Multiple predecessors with one having a full mask"); 8859 OperandsWithMask.push_back(Operands[In]); 8860 if (EdgeMask) 8861 OperandsWithMask.push_back(EdgeMask); 8862 } 8863 return toVPRecipeResult(new VPBlendRecipe(Phi, OperandsWithMask)); 8864 } 8865 8866 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, 8867 ArrayRef<VPValue *> Operands, 8868 VFRange &Range) const { 8869 8870 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8871 [this, CI](ElementCount VF) { return CM.isScalarWithPredication(CI); }, 8872 Range); 8873 8874 if (IsPredicated) 8875 return nullptr; 8876 8877 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8878 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || 8879 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect || 8880 ID == Intrinsic::pseudoprobe || 8881 ID == Intrinsic::experimental_noalias_scope_decl)) 8882 return nullptr; 8883 8884 auto willWiden = [&](ElementCount VF) -> bool { 8885 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8886 // The following case may be scalarized depending on the VF. 8887 // The flag shows whether we use Intrinsic or a usual Call for vectorized 8888 // version of the instruction. 8889 // Is it beneficial to perform intrinsic call compared to lib call? 8890 bool NeedToScalarize = false; 8891 InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); 8892 InstructionCost IntrinsicCost = ID ? 
CM.getVectorIntrinsicCost(CI, VF) : 0; 8893 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 8894 return UseVectorIntrinsic || !NeedToScalarize; 8895 }; 8896 8897 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8898 return nullptr; 8899 8900 ArrayRef<VPValue *> Ops = Operands.take_front(CI->getNumArgOperands()); 8901 return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end())); 8902 } 8903 8904 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { 8905 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) && 8906 !isa<StoreInst>(I) && "Instruction should have been handled earlier"); 8907 // Instruction should be widened, unless it is scalar after vectorization, 8908 // scalarization is profitable or it is predicated. 8909 auto WillScalarize = [this, I](ElementCount VF) -> bool { 8910 return CM.isScalarAfterVectorization(I, VF) || 8911 CM.isProfitableToScalarize(I, VF) || CM.isScalarWithPredication(I); 8912 }; 8913 return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize, 8914 Range); 8915 } 8916 8917 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, 8918 ArrayRef<VPValue *> Operands) const { 8919 auto IsVectorizableOpcode = [](unsigned Opcode) { 8920 switch (Opcode) { 8921 case Instruction::Add: 8922 case Instruction::And: 8923 case Instruction::AShr: 8924 case Instruction::BitCast: 8925 case Instruction::FAdd: 8926 case Instruction::FCmp: 8927 case Instruction::FDiv: 8928 case Instruction::FMul: 8929 case Instruction::FNeg: 8930 case Instruction::FPExt: 8931 case Instruction::FPToSI: 8932 case Instruction::FPToUI: 8933 case Instruction::FPTrunc: 8934 case Instruction::FRem: 8935 case Instruction::FSub: 8936 case Instruction::ICmp: 8937 case Instruction::IntToPtr: 8938 case Instruction::LShr: 8939 case Instruction::Mul: 8940 case Instruction::Or: 8941 case Instruction::PtrToInt: 8942 case Instruction::SDiv: 8943 case Instruction::Select: 8944 case Instruction::SExt: 8945 case Instruction::Shl: 8946 case Instruction::SIToFP: 8947 case Instruction::SRem: 8948 case Instruction::Sub: 8949 case Instruction::Trunc: 8950 case Instruction::UDiv: 8951 case Instruction::UIToFP: 8952 case Instruction::URem: 8953 case Instruction::Xor: 8954 case Instruction::ZExt: 8955 return true; 8956 } 8957 return false; 8958 }; 8959 8960 if (!IsVectorizableOpcode(I->getOpcode())) 8961 return nullptr; 8962 8963 // Success: widen this instruction. 
8964 return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end())); 8965 } 8966 8967 void VPRecipeBuilder::fixHeaderPhis() { 8968 BasicBlock *OrigLatch = OrigLoop->getLoopLatch(); 8969 for (VPWidenPHIRecipe *R : PhisToFix) { 8970 auto *PN = cast<PHINode>(R->getUnderlyingValue()); 8971 VPRecipeBase *IncR = 8972 getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch))); 8973 R->addOperand(IncR->getVPSingleValue()); 8974 } 8975 } 8976 8977 VPBasicBlock *VPRecipeBuilder::handleReplication( 8978 Instruction *I, VFRange &Range, VPBasicBlock *VPBB, 8979 VPlanPtr &Plan) { 8980 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( 8981 [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); }, 8982 Range); 8983 8984 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8985 [&](ElementCount VF) { return CM.isPredicatedInst(I); }, Range); 8986 8987 // Even if the instruction is not marked as uniform, there are certain 8988 // intrinsic calls that can be effectively treated as such, so we check for 8989 // them here. Conservatively, we only do this for scalable vectors, since 8990 // for fixed-width VFs we can always fall back on full scalarization. 8991 if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) { 8992 switch (cast<IntrinsicInst>(I)->getIntrinsicID()) { 8993 case Intrinsic::assume: 8994 case Intrinsic::lifetime_start: 8995 case Intrinsic::lifetime_end: 8996 // For scalable vectors if one of the operands is variant then we still 8997 // want to mark as uniform, which will generate one instruction for just 8998 // the first lane of the vector. We can't scalarize the call in the same 8999 // way as for fixed-width vectors because we don't know how many lanes 9000 // there are. 9001 // 9002 // The reasons for doing it this way for scalable vectors are: 9003 // 1. For the assume intrinsic generating the instruction for the first 9004 // lane is still be better than not generating any at all. For 9005 // example, the input may be a splat across all lanes. 9006 // 2. For the lifetime start/end intrinsics the pointer operand only 9007 // does anything useful when the input comes from a stack object, 9008 // which suggests it should always be uniform. For non-stack objects 9009 // the effect is to poison the object, which still allows us to 9010 // remove the call. 9011 IsUniform = true; 9012 break; 9013 default: 9014 break; 9015 } 9016 } 9017 9018 auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()), 9019 IsUniform, IsPredicated); 9020 setRecipe(I, Recipe); 9021 Plan->addVPValue(I, Recipe); 9022 9023 // Find if I uses a predicated instruction. If so, it will use its scalar 9024 // value. Avoid hoisting the insert-element which packs the scalar value into 9025 // a vector value, as that happens iff all users use the vector value. 9026 for (VPValue *Op : Recipe->operands()) { 9027 auto *PredR = dyn_cast_or_null<VPPredInstPHIRecipe>(Op->getDef()); 9028 if (!PredR) 9029 continue; 9030 auto *RepR = 9031 cast_or_null<VPReplicateRecipe>(PredR->getOperand(0)->getDef()); 9032 assert(RepR->isPredicated() && 9033 "expected Replicate recipe to be predicated"); 9034 RepR->setAlsoPack(false); 9035 } 9036 9037 // Finalize the recipe for Instr, first if it is not predicated. 
9038 if (!IsPredicated) { 9039 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n"); 9040 VPBB->appendRecipe(Recipe); 9041 return VPBB; 9042 } 9043 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); 9044 assert(VPBB->getSuccessors().empty() && 9045 "VPBB has successors when handling predicated replication."); 9046 // Record predicated instructions for above packing optimizations. 9047 VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan); 9048 VPBlockUtils::insertBlockAfter(Region, VPBB); 9049 auto *RegSucc = new VPBasicBlock(); 9050 VPBlockUtils::insertBlockAfter(RegSucc, Region); 9051 return RegSucc; 9052 } 9053 9054 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr, 9055 VPRecipeBase *PredRecipe, 9056 VPlanPtr &Plan) { 9057 // Instructions marked for predication are replicated and placed under an 9058 // if-then construct to prevent side-effects. 9059 9060 // Generate recipes to compute the block mask for this region. 9061 VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan); 9062 9063 // Build the triangular if-then region. 9064 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str(); 9065 assert(Instr->getParent() && "Predicated instruction not in any basic block"); 9066 auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask); 9067 auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); 9068 auto *PHIRecipe = Instr->getType()->isVoidTy() 9069 ? nullptr 9070 : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr)); 9071 if (PHIRecipe) { 9072 Plan->removeVPValueFor(Instr); 9073 Plan->addVPValue(Instr, PHIRecipe); 9074 } 9075 auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); 9076 auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); 9077 VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true); 9078 9079 // Note: first set Entry as region entry and then connect successors starting 9080 // from it in order, to propagate the "parent" of each VPBasicBlock. 9081 VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry); 9082 VPBlockUtils::connectBlocks(Pred, Exit); 9083 9084 return Region; 9085 } 9086 9087 VPRecipeOrVPValueTy 9088 VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, 9089 ArrayRef<VPValue *> Operands, 9090 VFRange &Range, VPlanPtr &Plan) { 9091 // First, check for specific widening recipes that deal with calls, memory 9092 // operations, inductions and Phi nodes. 
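  // Rough dispatch sketch (illustrative, the code below is authoritative): a
  // call such as llvm.fabs goes through tryToWidenCall, loads and stores
  // through tryToWidenMemory, header phis become induction, reduction or
  // first-order-recurrence phi recipes, and only the remaining "plain"
  // instructions fall through to shouldWiden/tryToWiden at the end.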
9093 if (auto *CI = dyn_cast<CallInst>(Instr)) 9094 return toVPRecipeResult(tryToWidenCall(CI, Operands, Range)); 9095 9096 if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr)) 9097 return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan)); 9098 9099 VPRecipeBase *Recipe; 9100 if (auto Phi = dyn_cast<PHINode>(Instr)) { 9101 if (Phi->getParent() != OrigLoop->getHeader()) 9102 return tryToBlend(Phi, Operands, Plan); 9103 if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands))) 9104 return toVPRecipeResult(Recipe); 9105 9106 VPWidenPHIRecipe *PhiRecipe = nullptr; 9107 if (Legal->isReductionVariable(Phi) || Legal->isFirstOrderRecurrence(Phi)) { 9108 VPValue *StartV = Operands[0]; 9109 if (Legal->isReductionVariable(Phi)) { 9110 RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi]; 9111 assert(RdxDesc.getRecurrenceStartValue() == 9112 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); 9113 PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV, 9114 CM.isInLoopReduction(Phi), 9115 CM.useOrderedReductions(RdxDesc)); 9116 } else { 9117 PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV); 9118 } 9119 9120 // Record the incoming value from the backedge, so we can add the incoming 9121 // value from the backedge after all recipes have been created. 9122 recordRecipeOf(cast<Instruction>( 9123 Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch()))); 9124 PhisToFix.push_back(PhiRecipe); 9125 } else { 9126 // TODO: record start and backedge value for remaining pointer induction 9127 // phis. 9128 assert(Phi->getType()->isPointerTy() && 9129 "only pointer phis should be handled here"); 9130 PhiRecipe = new VPWidenPHIRecipe(Phi); 9131 } 9132 9133 return toVPRecipeResult(PhiRecipe); 9134 } 9135 9136 if (isa<TruncInst>(Instr) && 9137 (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Operands, 9138 Range, *Plan))) 9139 return toVPRecipeResult(Recipe); 9140 9141 if (!shouldWiden(Instr, Range)) 9142 return nullptr; 9143 9144 if (auto GEP = dyn_cast<GetElementPtrInst>(Instr)) 9145 return toVPRecipeResult(new VPWidenGEPRecipe( 9146 GEP, make_range(Operands.begin(), Operands.end()), OrigLoop)); 9147 9148 if (auto *SI = dyn_cast<SelectInst>(Instr)) { 9149 bool InvariantCond = 9150 PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop); 9151 return toVPRecipeResult(new VPWidenSelectRecipe( 9152 *SI, make_range(Operands.begin(), Operands.end()), InvariantCond)); 9153 } 9154 9155 return toVPRecipeResult(tryToWiden(Instr, Operands)); 9156 } 9157 9158 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, 9159 ElementCount MaxVF) { 9160 assert(OrigLoop->isInnermost() && "Inner loop expected."); 9161 9162 // Collect instructions from the original loop that will become trivially dead 9163 // in the vectorized loop. We don't need to vectorize these instructions. For 9164 // example, original induction update instructions can become dead because we 9165 // separately emit induction "steps" when generating code for the new loop. 9166 // Similarly, we create a new latch condition when setting up the structure 9167 // of the new loop, so the old one can become dead. 9168 SmallPtrSet<Instruction *, 4> DeadInstructions; 9169 collectTriviallyDeadInstructions(DeadInstructions); 9170 9171 // Add assume instructions we need to drop to DeadInstructions, to prevent 9172 // them from being added to the VPlan. 9173 // TODO: We only need to drop assumes in blocks that get flattend. 
If the
9174   // control flow is preserved, we should keep them.
9175   auto &ConditionalAssumes = Legal->getConditionalAssumes();
9176   DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
9177
9178   MapVector<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
9179   // Dead instructions do not need sinking. Remove them from SinkAfter.
9180   for (Instruction *I : DeadInstructions)
9181     SinkAfter.erase(I);
9182
9183   // Cannot sink instructions after dead instructions (there won't be any
9184   // recipes for them). Instead, find the first non-dead previous instruction.
9185   for (auto &P : Legal->getSinkAfter()) {
9186     Instruction *SinkTarget = P.second;
9187     Instruction *FirstInst = &*SinkTarget->getParent()->begin();
9188     (void)FirstInst;
9189     while (DeadInstructions.contains(SinkTarget)) {
9190       assert(
9191           SinkTarget != FirstInst &&
9192           "Must find a live instruction (at least the one feeding the "
9193           "first-order recurrence PHI) before reaching beginning of the block");
9194       SinkTarget = SinkTarget->getPrevNode();
9195       assert(SinkTarget != P.first &&
9196              "sink source equals target, no sinking required");
9197     }
9198     P.second = SinkTarget;
9199   }
9200
9201   auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
9202   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
9203     VFRange SubRange = {VF, MaxVFPlusOne};
9204     VPlans.push_back(
9205         buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter));
9206     VF = SubRange.End;
9207   }
9208 }
9209
9210 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
9211     VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions,
9212     const MapVector<Instruction *, Instruction *> &SinkAfter) {
9213
9214   SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
9215
9216   VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);
9217
9218   // ---------------------------------------------------------------------------
9219   // Pre-construction: record ingredients whose recipes we'll need to further
9220   // process after constructing the initial VPlan.
9221   // ---------------------------------------------------------------------------
9222
9223   // Mark instructions we'll need to sink later and their targets as
9224   // ingredients whose recipe we'll need to record.
9225   for (auto &Entry : SinkAfter) {
9226     RecipeBuilder.recordRecipeOf(Entry.first);
9227     RecipeBuilder.recordRecipeOf(Entry.second);
9228   }
9229   for (auto &Reduction : CM.getInLoopReductionChains()) {
9230     PHINode *Phi = Reduction.first;
9231     RecurKind Kind = Legal->getReductionVars()[Phi].getRecurrenceKind();
9232     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
9233
9234     RecipeBuilder.recordRecipeOf(Phi);
9235     for (auto &R : ReductionOperations) {
9236       RecipeBuilder.recordRecipeOf(R);
9237       // For min/max reductions, where we have a pair of icmp/select, we also
9238       // need to record the ICmp recipe, so it can be removed later.
9239       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
9240         RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0)));
9241     }
9242   }
9243
9244   // For each interleave group which is relevant for this (possibly trimmed)
9245   // Range, add it to the set of groups to be later applied to the VPlan and add
9246   // placeholders for its members' Recipes which we'll be replacing with a
9247   // single VPInterleaveRecipe.
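  // Illustrative example (not from the original comment): for
  //   for (i = 0; i < n; i += 2) {
  //     ... = A[i];     // even elements
  //     ... = A[i + 1]; // odd elements
  //   }
  // the two loads form a single interleave group with factor 2; both members
  // are recorded below so that their individual widening recipes can later be
  // replaced by one VPInterleaveRecipe performing a wide load plus shuffles.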
9248 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) { 9249 auto applyIG = [IG, this](ElementCount VF) -> bool { 9250 return (VF.isVector() && // Query is illegal for VF == 1 9251 CM.getWideningDecision(IG->getInsertPos(), VF) == 9252 LoopVectorizationCostModel::CM_Interleave); 9253 }; 9254 if (!getDecisionAndClampRange(applyIG, Range)) 9255 continue; 9256 InterleaveGroups.insert(IG); 9257 for (unsigned i = 0; i < IG->getFactor(); i++) 9258 if (Instruction *Member = IG->getMember(i)) 9259 RecipeBuilder.recordRecipeOf(Member); 9260 }; 9261 9262 // --------------------------------------------------------------------------- 9263 // Build initial VPlan: Scan the body of the loop in a topological order to 9264 // visit each basic block after having visited its predecessor basic blocks. 9265 // --------------------------------------------------------------------------- 9266 9267 // Create a dummy pre-entry VPBasicBlock to start building the VPlan. 9268 auto Plan = std::make_unique<VPlan>(); 9269 VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry"); 9270 Plan->setEntry(VPBB); 9271 9272 // Scan the body of the loop in a topological order to visit each basic block 9273 // after having visited its predecessor basic blocks. 9274 LoopBlocksDFS DFS(OrigLoop); 9275 DFS.perform(LI); 9276 9277 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 9278 // Relevant instructions from basic block BB will be grouped into VPRecipe 9279 // ingredients and fill a new VPBasicBlock. 9280 unsigned VPBBsForBB = 0; 9281 auto *FirstVPBBForBB = new VPBasicBlock(BB->getName()); 9282 VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB); 9283 VPBB = FirstVPBBForBB; 9284 Builder.setInsertPoint(VPBB); 9285 9286 // Introduce each ingredient into VPlan. 9287 // TODO: Model and preserve debug instrinsics in VPlan. 9288 for (Instruction &I : BB->instructionsWithoutDebug()) { 9289 Instruction *Instr = &I; 9290 9291 // First filter out irrelevant instructions, to ensure no recipes are 9292 // built for them. 9293 if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr)) 9294 continue; 9295 9296 SmallVector<VPValue *, 4> Operands; 9297 auto *Phi = dyn_cast<PHINode>(Instr); 9298 if (Phi && Phi->getParent() == OrigLoop->getHeader()) { 9299 Operands.push_back(Plan->getOrAddVPValue( 9300 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()))); 9301 } else { 9302 auto OpRange = Plan->mapToVPValues(Instr->operands()); 9303 Operands = {OpRange.begin(), OpRange.end()}; 9304 } 9305 if (auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe( 9306 Instr, Operands, Range, Plan)) { 9307 // If Instr can be simplified to an existing VPValue, use it. 9308 if (RecipeOrValue.is<VPValue *>()) { 9309 auto *VPV = RecipeOrValue.get<VPValue *>(); 9310 Plan->addVPValue(Instr, VPV); 9311 // If the re-used value is a recipe, register the recipe for the 9312 // instruction, in case the recipe for Instr needs to be recorded. 9313 if (auto *R = dyn_cast_or_null<VPRecipeBase>(VPV->getDef())) 9314 RecipeBuilder.setRecipe(Instr, R); 9315 continue; 9316 } 9317 // Otherwise, add the new recipe. 9318 VPRecipeBase *Recipe = RecipeOrValue.get<VPRecipeBase *>(); 9319 for (auto *Def : Recipe->definedValues()) { 9320 auto *UV = Def->getUnderlyingValue(); 9321 Plan->addVPValue(UV, Def); 9322 } 9323 9324 RecipeBuilder.setRecipe(Instr, Recipe); 9325 VPBB->appendRecipe(Recipe); 9326 continue; 9327 } 9328 9329 // Otherwise, if all widening options failed, Instruction is to be 9330 // replicated. This may create a successor for VPBB. 
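      // For instance (illustrative): a udiv that is only reachable under a
      // condition is predicated, so handleReplication wraps its replicate
      // recipe in a "pred.udiv" region (.entry/.if/.continue blocks) and
      // returns a fresh VPBasicBlock as the new insertion point; unpredicated
      // scalarized instructions are simply appended to the current VPBB.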
9331 VPBasicBlock *NextVPBB = 9332 RecipeBuilder.handleReplication(Instr, Range, VPBB, Plan); 9333 if (NextVPBB != VPBB) { 9334 VPBB = NextVPBB; 9335 VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++) 9336 : ""); 9337 } 9338 } 9339 } 9340 9341 RecipeBuilder.fixHeaderPhis(); 9342 9343 // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks 9344 // may also be empty, such as the last one VPBB, reflecting original 9345 // basic-blocks with no recipes. 9346 VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry()); 9347 assert(PreEntry->empty() && "Expecting empty pre-entry block."); 9348 VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor()); 9349 VPBlockUtils::disconnectBlocks(PreEntry, Entry); 9350 delete PreEntry; 9351 9352 // --------------------------------------------------------------------------- 9353 // Transform initial VPlan: Apply previously taken decisions, in order, to 9354 // bring the VPlan to its final state. 9355 // --------------------------------------------------------------------------- 9356 9357 // Apply Sink-After legal constraints. 9358 auto GetReplicateRegion = [](VPRecipeBase *R) -> VPRegionBlock * { 9359 auto *Region = dyn_cast_or_null<VPRegionBlock>(R->getParent()->getParent()); 9360 if (Region && Region->isReplicator()) { 9361 assert(Region->getNumSuccessors() == 1 && 9362 Region->getNumPredecessors() == 1 && "Expected SESE region!"); 9363 assert(R->getParent()->size() == 1 && 9364 "A recipe in an original replicator region must be the only " 9365 "recipe in its block"); 9366 return Region; 9367 } 9368 return nullptr; 9369 }; 9370 for (auto &Entry : SinkAfter) { 9371 VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first); 9372 VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second); 9373 9374 auto *TargetRegion = GetReplicateRegion(Target); 9375 auto *SinkRegion = GetReplicateRegion(Sink); 9376 if (!SinkRegion) { 9377 // If the sink source is not a replicate region, sink the recipe directly. 9378 if (TargetRegion) { 9379 // The target is in a replication region, make sure to move Sink to 9380 // the block after it, not into the replication region itself. 9381 VPBasicBlock *NextBlock = 9382 cast<VPBasicBlock>(TargetRegion->getSuccessors().front()); 9383 Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi()); 9384 } else 9385 Sink->moveAfter(Target); 9386 continue; 9387 } 9388 9389 // The sink source is in a replicate region. Unhook the region from the CFG. 9390 auto *SinkPred = SinkRegion->getSinglePredecessor(); 9391 auto *SinkSucc = SinkRegion->getSingleSuccessor(); 9392 VPBlockUtils::disconnectBlocks(SinkPred, SinkRegion); 9393 VPBlockUtils::disconnectBlocks(SinkRegion, SinkSucc); 9394 VPBlockUtils::connectBlocks(SinkPred, SinkSucc); 9395 9396 if (TargetRegion) { 9397 // The target recipe is also in a replicate region, move the sink region 9398 // after the target region. 9399 auto *TargetSucc = TargetRegion->getSingleSuccessor(); 9400 VPBlockUtils::disconnectBlocks(TargetRegion, TargetSucc); 9401 VPBlockUtils::connectBlocks(TargetRegion, SinkRegion); 9402 VPBlockUtils::connectBlocks(SinkRegion, TargetSucc); 9403 } else { 9404 // The sink source is in a replicate region, we need to move the whole 9405 // replicate region, which should only contain a single recipe in the 9406 // main block. 
9407 auto *SplitBlock = 9408 Target->getParent()->splitAt(std::next(Target->getIterator())); 9409 9410 auto *SplitPred = SplitBlock->getSinglePredecessor(); 9411 9412 VPBlockUtils::disconnectBlocks(SplitPred, SplitBlock); 9413 VPBlockUtils::connectBlocks(SplitPred, SinkRegion); 9414 VPBlockUtils::connectBlocks(SinkRegion, SplitBlock); 9415 if (VPBB == SplitPred) 9416 VPBB = SplitBlock; 9417 } 9418 } 9419 9420 // Introduce a recipe to combine the incoming and previous values of a 9421 // first-order recurrence. 9422 for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) { 9423 auto *RecurPhi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R); 9424 if (!RecurPhi) 9425 continue; 9426 9427 auto *RecurSplice = cast<VPInstruction>( 9428 Builder.createNaryOp(VPInstruction::FirstOrderRecurrenceSplice, 9429 {RecurPhi, RecurPhi->getBackedgeValue()})); 9430 9431 VPRecipeBase *PrevRecipe = RecurPhi->getBackedgeRecipe(); 9432 if (auto *Region = GetReplicateRegion(PrevRecipe)) { 9433 VPBasicBlock *Succ = cast<VPBasicBlock>(Region->getSingleSuccessor()); 9434 RecurSplice->moveBefore(*Succ, Succ->getFirstNonPhi()); 9435 } else 9436 RecurSplice->moveAfter(PrevRecipe); 9437 RecurPhi->replaceAllUsesWith(RecurSplice); 9438 // Set the first operand of RecurSplice to RecurPhi again, after replacing 9439 // all users. 9440 RecurSplice->setOperand(0, RecurPhi); 9441 } 9442 9443 // Interleave memory: for each Interleave Group we marked earlier as relevant 9444 // for this VPlan, replace the Recipes widening its memory instructions with a 9445 // single VPInterleaveRecipe at its insertion point. 9446 for (auto IG : InterleaveGroups) { 9447 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( 9448 RecipeBuilder.getRecipe(IG->getInsertPos())); 9449 SmallVector<VPValue *, 4> StoredValues; 9450 for (unsigned i = 0; i < IG->getFactor(); ++i) 9451 if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) { 9452 auto *StoreR = 9453 cast<VPWidenMemoryInstructionRecipe>(RecipeBuilder.getRecipe(SI)); 9454 StoredValues.push_back(StoreR->getStoredValue()); 9455 } 9456 9457 auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues, 9458 Recipe->getMask()); 9459 VPIG->insertBefore(Recipe); 9460 unsigned J = 0; 9461 for (unsigned i = 0; i < IG->getFactor(); ++i) 9462 if (Instruction *Member = IG->getMember(i)) { 9463 if (!Member->getType()->isVoidTy()) { 9464 VPValue *OriginalV = Plan->getVPValue(Member); 9465 Plan->removeVPValueFor(Member); 9466 Plan->addVPValue(Member, VPIG->getVPValue(J)); 9467 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J)); 9468 J++; 9469 } 9470 RecipeBuilder.getRecipe(Member)->eraseFromParent(); 9471 } 9472 } 9473 9474 // Adjust the recipes for any inloop reductions. 9475 adjustRecipesForReductions(VPBB, Plan, RecipeBuilder, Range.Start); 9476 9477 VPlanTransforms::sinkScalarOperands(*Plan); 9478 VPlanTransforms::mergeReplicateRegions(*Plan); 9479 9480 std::string PlanName; 9481 raw_string_ostream RSO(PlanName); 9482 ElementCount VF = Range.Start; 9483 Plan->addVF(VF); 9484 RSO << "Initial VPlan for VF={" << VF; 9485 for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) { 9486 Plan->addVF(VF); 9487 RSO << "," << VF; 9488 } 9489 RSO << "},UF>=1"; 9490 RSO.flush(); 9491 Plan->setName(PlanName); 9492 9493 return Plan; 9494 } 9495 9496 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { 9497 // Outer loop handling: They may require CFG and instruction level 9498 // transformations before even evaluating whether vectorization is profitable. 
9499 // Since we cannot modify the incoming IR, we need to build VPlan upfront in
9500 // the vectorization pipeline.
9501 assert(!OrigLoop->isInnermost());
9502 assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
9503
9504 // Create new empty VPlan
9505 auto Plan = std::make_unique<VPlan>();
9506
9507 // Build hierarchical CFG
9508 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
9509 HCFGBuilder.buildHierarchicalCFG();
9510
9511 for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End);
9512 VF *= 2)
9513 Plan->addVF(VF);
9514
9515 if (EnableVPlanPredication) {
9516 VPlanPredicator VPP(*Plan);
9517 VPP.predicate();
9518
9519 // Avoid running transformation to recipes until masked code generation in
9520 // VPlan-native path is in place.
9521 return Plan;
9522 }
9523
9524 SmallPtrSet<Instruction *, 1> DeadInstructions;
9525 VPlanTransforms::VPInstructionsToVPRecipes(OrigLoop, Plan,
9526 Legal->getInductionVars(),
9527 DeadInstructions, *PSE.getSE());
9528 return Plan;
9529 }
9530
9531 // Adjust the recipes for reductions. For in-loop reductions the chain of
9532 // instructions leading from the loop exit instr to the phi needs to be converted
9533 // to reductions, with one operand being vector and the other being the scalar
9534 // reduction chain. For other reductions, a select is introduced between the phi
9535 // and live-out recipes when folding the tail.
9536 void LoopVectorizationPlanner::adjustRecipesForReductions(
9537 VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder,
9538 ElementCount MinVF) {
9539 for (auto &Reduction : CM.getInLoopReductionChains()) {
9540 PHINode *Phi = Reduction.first;
9541 RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
9542 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
9543
9544 if (MinVF.isScalar() && !CM.useOrderedReductions(RdxDesc))
9545 continue;
9546
9547 // ReductionOperations are ordered top-down from the phi's use to the
9548 // LoopExitValue. We keep track of the previous item (the Chain) to tell
9549 // which of the two operands will remain scalar and which will be reduced.
9550 // For minmax the chain will be the select instructions.
9551 Instruction *Chain = Phi;
9552 for (Instruction *R : ReductionOperations) {
9553 VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
9554 RecurKind Kind = RdxDesc.getRecurrenceKind();
9555
9556 VPValue *ChainOp = Plan->getVPValue(Chain);
9557 unsigned FirstOpId;
9558 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9559 assert(isa<VPWidenSelectRecipe>(WidenRecipe) &&
9560 "Expected to replace a VPWidenSelectSC");
9561 FirstOpId = 1;
9562 } else {
9563 assert((MinVF.isScalar() || isa<VPWidenRecipe>(WidenRecipe)) &&
9564 "Expected to replace a VPWidenSC");
9565 FirstOpId = 0;
9566 }
9567 unsigned VecOpId =
9568 R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
9569 VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));
9570
9571 auto *CondOp = CM.foldTailByMasking()
9572 ?
RecipeBuilder.createBlockInMask(R->getParent(), Plan) 9573 : nullptr; 9574 VPReductionRecipe *RedRecipe = new VPReductionRecipe( 9575 &RdxDesc, R, ChainOp, VecOp, CondOp, TTI); 9576 WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe); 9577 Plan->removeVPValueFor(R); 9578 Plan->addVPValue(R, RedRecipe); 9579 WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator()); 9580 WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe); 9581 WidenRecipe->eraseFromParent(); 9582 9583 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9584 VPRecipeBase *CompareRecipe = 9585 RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0))); 9586 assert(isa<VPWidenRecipe>(CompareRecipe) && 9587 "Expected to replace a VPWidenSC"); 9588 assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 && 9589 "Expected no remaining users"); 9590 CompareRecipe->eraseFromParent(); 9591 } 9592 Chain = R; 9593 } 9594 } 9595 9596 // If tail is folded by masking, introduce selects between the phi 9597 // and the live-out instruction of each reduction, at the end of the latch. 9598 if (CM.foldTailByMasking()) { 9599 for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) { 9600 VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R); 9601 if (!PhiR || PhiR->isInLoop()) 9602 continue; 9603 Builder.setInsertPoint(LatchVPBB); 9604 VPValue *Cond = 9605 RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan); 9606 VPValue *Red = PhiR->getBackedgeValue(); 9607 Builder.createNaryOp(Instruction::Select, {Cond, Red, PhiR}); 9608 } 9609 } 9610 } 9611 9612 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 9613 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, 9614 VPSlotTracker &SlotTracker) const { 9615 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; 9616 IG->getInsertPos()->printAsOperand(O, false); 9617 O << ", "; 9618 getAddr()->printAsOperand(O, SlotTracker); 9619 VPValue *Mask = getMask(); 9620 if (Mask) { 9621 O << ", "; 9622 Mask->printAsOperand(O, SlotTracker); 9623 } 9624 9625 unsigned OpIdx = 0; 9626 for (unsigned i = 0; i < IG->getFactor(); ++i) { 9627 if (!IG->getMember(i)) 9628 continue; 9629 if (getNumStoreOperands() > 0) { 9630 O << "\n" << Indent << " store "; 9631 getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker); 9632 O << " to index " << i; 9633 } else { 9634 O << "\n" << Indent << " "; 9635 getVPValue(OpIdx)->printAsOperand(O, SlotTracker); 9636 O << " = load from index " << i; 9637 } 9638 ++OpIdx; 9639 } 9640 } 9641 #endif 9642 9643 void VPWidenCallRecipe::execute(VPTransformState &State) { 9644 State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this, 9645 *this, State); 9646 } 9647 9648 void VPWidenSelectRecipe::execute(VPTransformState &State) { 9649 State.ILV->widenSelectInstruction(*cast<SelectInst>(getUnderlyingInstr()), 9650 this, *this, InvariantCond, State); 9651 } 9652 9653 void VPWidenRecipe::execute(VPTransformState &State) { 9654 State.ILV->widenInstruction(*getUnderlyingInstr(), this, *this, State); 9655 } 9656 9657 void VPWidenGEPRecipe::execute(VPTransformState &State) { 9658 State.ILV->widenGEP(cast<GetElementPtrInst>(getUnderlyingInstr()), this, 9659 *this, State.UF, State.VF, IsPtrLoopInvariant, 9660 IsIndexLoopInvariant, State); 9661 } 9662 9663 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { 9664 assert(!State.Instance && "Int or FP induction being replicated."); 9665 State.ILV->widenIntOrFpInduction(IV, 
getStartValue()->getLiveInIRValue(), 9666 getTruncInst(), getVPValue(0), 9667 getCastValue(), State); 9668 } 9669 9670 void VPWidenPHIRecipe::execute(VPTransformState &State) { 9671 State.ILV->widenPHIInstruction(cast<PHINode>(getUnderlyingValue()), this, 9672 State); 9673 } 9674 9675 void VPBlendRecipe::execute(VPTransformState &State) { 9676 State.ILV->setDebugLocFromInst(Phi, &State.Builder); 9677 // We know that all PHIs in non-header blocks are converted into 9678 // selects, so we don't have to worry about the insertion order and we 9679 // can just use the builder. 9680 // At this point we generate the predication tree. There may be 9681 // duplications since this is a simple recursive scan, but future 9682 // optimizations will clean it up. 9683 9684 unsigned NumIncoming = getNumIncomingValues(); 9685 9686 // Generate a sequence of selects of the form: 9687 // SELECT(Mask3, In3, 9688 // SELECT(Mask2, In2, 9689 // SELECT(Mask1, In1, 9690 // In0))) 9691 // Note that Mask0 is never used: lanes for which no path reaches this phi and 9692 // are essentially undef are taken from In0. 9693 InnerLoopVectorizer::VectorParts Entry(State.UF); 9694 for (unsigned In = 0; In < NumIncoming; ++In) { 9695 for (unsigned Part = 0; Part < State.UF; ++Part) { 9696 // We might have single edge PHIs (blocks) - use an identity 9697 // 'select' for the first PHI operand. 9698 Value *In0 = State.get(getIncomingValue(In), Part); 9699 if (In == 0) 9700 Entry[Part] = In0; // Initialize with the first incoming value. 9701 else { 9702 // Select between the current value and the previous incoming edge 9703 // based on the incoming mask. 9704 Value *Cond = State.get(getMask(In), Part); 9705 Entry[Part] = 9706 State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi"); 9707 } 9708 } 9709 } 9710 for (unsigned Part = 0; Part < State.UF; ++Part) 9711 State.set(this, Entry[Part], Part); 9712 } 9713 9714 void VPInterleaveRecipe::execute(VPTransformState &State) { 9715 assert(!State.Instance && "Interleave group being replicated."); 9716 State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(), 9717 getStoredValues(), getMask()); 9718 } 9719 9720 void VPReductionRecipe::execute(VPTransformState &State) { 9721 assert(!State.Instance && "Reduction being replicated."); 9722 Value *PrevInChain = State.get(getChainOp(), 0); 9723 for (unsigned Part = 0; Part < State.UF; ++Part) { 9724 RecurKind Kind = RdxDesc->getRecurrenceKind(); 9725 bool IsOrdered = State.ILV->useOrderedReductions(*RdxDesc); 9726 Value *NewVecOp = State.get(getVecOp(), Part); 9727 if (VPValue *Cond = getCondOp()) { 9728 Value *NewCond = State.get(Cond, Part); 9729 VectorType *VecTy = cast<VectorType>(NewVecOp->getType()); 9730 Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity( 9731 Kind, VecTy->getElementType(), RdxDesc->getFastMathFlags()); 9732 Constant *IdenVec = 9733 ConstantVector::getSplat(VecTy->getElementCount(), Iden); 9734 Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec); 9735 NewVecOp = Select; 9736 } 9737 Value *NewRed; 9738 Value *NextInChain; 9739 if (IsOrdered) { 9740 if (State.VF.isVector()) 9741 NewRed = createOrderedReduction(State.Builder, *RdxDesc, NewVecOp, 9742 PrevInChain); 9743 else 9744 NewRed = State.Builder.CreateBinOp( 9745 (Instruction::BinaryOps)getUnderlyingInstr()->getOpcode(), 9746 PrevInChain, NewVecOp); 9747 PrevInChain = NewRed; 9748 } else { 9749 PrevInChain = State.get(getChainOp(), Part); 9750 NewRed = createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp); 
9751 }
9752 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9753 NextInChain =
9754 createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(),
9755 NewRed, PrevInChain);
9756 } else if (IsOrdered)
9757 NextInChain = NewRed;
9758 else {
9759 NextInChain = State.Builder.CreateBinOp(
9760 (Instruction::BinaryOps)getUnderlyingInstr()->getOpcode(), NewRed,
9761 PrevInChain);
9762 }
9763 State.set(this, NextInChain, Part);
9764 }
9765 }
9766
9767 void VPReplicateRecipe::execute(VPTransformState &State) {
9768 if (State.Instance) { // Generate a single instance.
9769 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9770 State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *this,
9771 *State.Instance, IsPredicated, State);
9772 // Insert scalar instance, packing it into a vector.
9773 if (AlsoPack && State.VF.isVector()) {
9774 // If we're constructing lane 0, initialize to start from poison.
9775 if (State.Instance->Lane.isFirstLane()) {
9776 assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
9777 Value *Poison = PoisonValue::get(
9778 VectorType::get(getUnderlyingValue()->getType(), State.VF));
9779 State.set(this, Poison, State.Instance->Part);
9780 }
9781 State.ILV->packScalarIntoVectorValue(this, *State.Instance, State);
9782 }
9783 return;
9784 }
9785
9786 // Generate scalar instances for all VF lanes of all UF parts, unless the
9787 // instruction is uniform, in which case generate only the first lane for each
9788 // of the UF parts.
9789 unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();
9790 assert((!State.VF.isScalable() || IsUniform) &&
9791 "Can't scalarize a scalable vector");
9792 for (unsigned Part = 0; Part < State.UF; ++Part)
9793 for (unsigned Lane = 0; Lane < EndLane; ++Lane)
9794 State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *this,
9795 VPIteration(Part, Lane), IsPredicated,
9796 State);
9797 }
9798
9799 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
9800 assert(State.Instance && "Branch on Mask works only on single instance.");
9801
9802 unsigned Part = State.Instance->Part;
9803 unsigned Lane = State.Instance->Lane.getKnownLane();
9804
9805 Value *ConditionBit = nullptr;
9806 VPValue *BlockInMask = getMask();
9807 if (BlockInMask) {
9808 ConditionBit = State.get(BlockInMask, Part);
9809 if (ConditionBit->getType()->isVectorTy())
9810 ConditionBit = State.Builder.CreateExtractElement(
9811 ConditionBit, State.Builder.getInt32(Lane));
9812 } else // Block in mask is all-one.
9813 ConditionBit = State.Builder.getTrue();
9814
9815 // Replace the temporary unreachable terminator with a new conditional branch,
9816 // whose two destinations will be set later when they are created.
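// For example, a predicated block that currently ends in an 'unreachable'
// placeholder roughly becomes
//   br i1 %cond, label %pred.execute, label %pred.continue
// with both labels only filled in once the corresponding basic blocks have
// been emitted (the label names here are illustrative).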
9817 auto *CurrentTerminator = State.CFG.PrevBB->getTerminator(); 9818 assert(isa<UnreachableInst>(CurrentTerminator) && 9819 "Expected to replace unreachable terminator with conditional branch."); 9820 auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit); 9821 CondBr->setSuccessor(0, nullptr); 9822 ReplaceInstWithInst(CurrentTerminator, CondBr); 9823 } 9824 9825 void VPPredInstPHIRecipe::execute(VPTransformState &State) { 9826 assert(State.Instance && "Predicated instruction PHI works per instance."); 9827 Instruction *ScalarPredInst = 9828 cast<Instruction>(State.get(getOperand(0), *State.Instance)); 9829 BasicBlock *PredicatedBB = ScalarPredInst->getParent(); 9830 BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor(); 9831 assert(PredicatingBB && "Predicated block has no single predecessor."); 9832 assert(isa<VPReplicateRecipe>(getOperand(0)) && 9833 "operand must be VPReplicateRecipe"); 9834 9835 // By current pack/unpack logic we need to generate only a single phi node: if 9836 // a vector value for the predicated instruction exists at this point it means 9837 // the instruction has vector users only, and a phi for the vector value is 9838 // needed. In this case the recipe of the predicated instruction is marked to 9839 // also do that packing, thereby "hoisting" the insert-element sequence. 9840 // Otherwise, a phi node for the scalar value is needed. 9841 unsigned Part = State.Instance->Part; 9842 if (State.hasVectorValue(getOperand(0), Part)) { 9843 Value *VectorValue = State.get(getOperand(0), Part); 9844 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); 9845 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); 9846 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. 9847 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. 9848 if (State.hasVectorValue(this, Part)) 9849 State.reset(this, VPhi, Part); 9850 else 9851 State.set(this, VPhi, Part); 9852 // NOTE: Currently we need to update the value of the operand, so the next 9853 // predicated iteration inserts its generated value in the correct vector. 9854 State.reset(getOperand(0), VPhi, Part); 9855 } else { 9856 Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType(); 9857 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); 9858 Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()), 9859 PredicatingBB); 9860 Phi->addIncoming(ScalarPredInst, PredicatedBB); 9861 if (State.hasScalarValue(this, *State.Instance)) 9862 State.reset(this, Phi, *State.Instance); 9863 else 9864 State.set(this, Phi, *State.Instance); 9865 // NOTE: Currently we need to update the value of the operand, so the next 9866 // predicated iteration inserts its generated value in the correct vector. 9867 State.reset(getOperand(0), Phi, *State.Instance); 9868 } 9869 } 9870 9871 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { 9872 VPValue *StoredValue = isStore() ? getStoredValue() : nullptr; 9873 State.ILV->vectorizeMemoryInstruction( 9874 &Ingredient, State, StoredValue ? nullptr : getVPSingleValue(), getAddr(), 9875 StoredValue, getMask()); 9876 } 9877 9878 // Determine how to lower the scalar epilogue, which depends on 1) optimising 9879 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing 9880 // predication, and 4) a TTI hook that analyses whether the loop is suitable 9881 // for predication. 
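// For example (illustrative): with -Os, or a loop considered cold under PGSO,
// the result is CM_ScalarEpilogueNotAllowedOptSize regardless of hints, whereas
// without size constraints a loop annotated e.g. with
// '#pragma clang loop vectorize_predicate(enable)' maps to
// CM_ScalarEpilogueNotNeededUsePredicate via rule 3) below.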
9882 static ScalarEpilogueLowering getScalarEpilogueLowering( 9883 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, 9884 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, 9885 AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, 9886 LoopVectorizationLegality &LVL) { 9887 // 1) OptSize takes precedence over all other options, i.e. if this is set, 9888 // don't look at hints or options, and don't request a scalar epilogue. 9889 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from 9890 // LoopAccessInfo (due to code dependency and not being able to reliably get 9891 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection 9892 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without 9893 // versioning when the vectorization is forced, unlike hasOptSize. So revert 9894 // back to the old way and vectorize with versioning when forced. See D81345.) 9895 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, 9896 PGSOQueryType::IRPass) && 9897 Hints.getForce() != LoopVectorizeHints::FK_Enabled)) 9898 return CM_ScalarEpilogueNotAllowedOptSize; 9899 9900 // 2) If set, obey the directives 9901 if (PreferPredicateOverEpilogue.getNumOccurrences()) { 9902 switch (PreferPredicateOverEpilogue) { 9903 case PreferPredicateTy::ScalarEpilogue: 9904 return CM_ScalarEpilogueAllowed; 9905 case PreferPredicateTy::PredicateElseScalarEpilogue: 9906 return CM_ScalarEpilogueNotNeededUsePredicate; 9907 case PreferPredicateTy::PredicateOrDontVectorize: 9908 return CM_ScalarEpilogueNotAllowedUsePredicate; 9909 }; 9910 } 9911 9912 // 3) If set, obey the hints 9913 switch (Hints.getPredicate()) { 9914 case LoopVectorizeHints::FK_Enabled: 9915 return CM_ScalarEpilogueNotNeededUsePredicate; 9916 case LoopVectorizeHints::FK_Disabled: 9917 return CM_ScalarEpilogueAllowed; 9918 }; 9919 9920 // 4) if the TTI hook indicates this is profitable, request predication. 9921 if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, 9922 LVL.getLAI())) 9923 return CM_ScalarEpilogueNotNeededUsePredicate; 9924 9925 return CM_ScalarEpilogueAllowed; 9926 } 9927 9928 Value *VPTransformState::get(VPValue *Def, unsigned Part) { 9929 // If Values have been set for this Def return the one relevant for \p Part. 9930 if (hasVectorValue(Def, Part)) 9931 return Data.PerPartOutput[Def][Part]; 9932 9933 if (!hasScalarValue(Def, {Part, 0})) { 9934 Value *IRV = Def->getLiveInIRValue(); 9935 Value *B = ILV->getBroadcastInstrs(IRV); 9936 set(Def, B, Part); 9937 return B; 9938 } 9939 9940 Value *ScalarValue = get(Def, {Part, 0}); 9941 // If we aren't vectorizing, we can just copy the scalar map values over 9942 // to the vector map. 9943 if (VF.isScalar()) { 9944 set(Def, ScalarValue, Part); 9945 return ScalarValue; 9946 } 9947 9948 auto *RepR = dyn_cast<VPReplicateRecipe>(Def); 9949 bool IsUniform = RepR && RepR->isUniform(); 9950 9951 unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1; 9952 // Check if there is a scalar value for the selected lane. 9953 if (!hasScalarValue(Def, {Part, LastLane})) { 9954 // At the moment, VPWidenIntOrFpInductionRecipes can also be uniform. 
9955 assert(isa<VPWidenIntOrFpInductionRecipe>(Def->getDef()) && 9956 "unexpected recipe found to be invariant"); 9957 IsUniform = true; 9958 LastLane = 0; 9959 } 9960 9961 auto *LastInst = cast<Instruction>(get(Def, {Part, LastLane})); 9962 // Set the insert point after the last scalarized instruction or after the 9963 // last PHI, if LastInst is a PHI. This ensures the insertelement sequence 9964 // will directly follow the scalar definitions. 9965 auto OldIP = Builder.saveIP(); 9966 auto NewIP = 9967 isa<PHINode>(LastInst) 9968 ? BasicBlock::iterator(LastInst->getParent()->getFirstNonPHI()) 9969 : std::next(BasicBlock::iterator(LastInst)); 9970 Builder.SetInsertPoint(&*NewIP); 9971 9972 // However, if we are vectorizing, we need to construct the vector values. 9973 // If the value is known to be uniform after vectorization, we can just 9974 // broadcast the scalar value corresponding to lane zero for each unroll 9975 // iteration. Otherwise, we construct the vector values using 9976 // insertelement instructions. Since the resulting vectors are stored in 9977 // State, we will only generate the insertelements once. 9978 Value *VectorValue = nullptr; 9979 if (IsUniform) { 9980 VectorValue = ILV->getBroadcastInstrs(ScalarValue); 9981 set(Def, VectorValue, Part); 9982 } else { 9983 // Initialize packing with insertelements to start from undef. 9984 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 9985 Value *Undef = PoisonValue::get(VectorType::get(LastInst->getType(), VF)); 9986 set(Def, Undef, Part); 9987 for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane) 9988 ILV->packScalarIntoVectorValue(Def, {Part, Lane}, *this); 9989 VectorValue = get(Def, Part); 9990 } 9991 Builder.restoreIP(OldIP); 9992 return VectorValue; 9993 } 9994 9995 // Process the loop in the VPlan-native vectorization path. This path builds 9996 // VPlan upfront in the vectorization pipeline, which allows to apply 9997 // VPlan-to-VPlan transformations from the very beginning without modifying the 9998 // input LLVM IR. 9999 static bool processLoopInVPlanNativePath( 10000 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, 10001 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, 10002 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, 10003 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, 10004 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints, 10005 LoopVectorizationRequirements &Requirements) { 10006 10007 if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) { 10008 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n"); 10009 return false; 10010 } 10011 assert(EnableVPlanNativePath && "VPlan-native path is disabled."); 10012 Function *F = L->getHeader()->getParent(); 10013 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); 10014 10015 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 10016 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL); 10017 10018 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, 10019 &Hints, IAI); 10020 // Use the planner for outer loop vectorization. 10021 // TODO: CM is not used at this point inside the planner. Turn CM into an 10022 // optional argument if we don't need it in the future. 10023 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints, 10024 Requirements, ORE); 10025 10026 // Get user vectorization factor. 
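// The user VF typically originates from '#pragma clang loop vectorize_width(N)',
// i.e. the 'llvm.loop.vectorize.width' metadata read by LoopVectorizeHints; a
// width of zero means no explicit request and leaves the choice to the planner.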
10027 ElementCount UserVF = Hints.getWidth(); 10028 10029 CM.collectElementTypesForWidening(); 10030 10031 // Plan how to best vectorize, return the best VF and its cost. 10032 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF); 10033 10034 // If we are stress testing VPlan builds, do not attempt to generate vector 10035 // code. Masked vector code generation support will follow soon. 10036 // Also, do not attempt to vectorize if no vector code will be produced. 10037 if (VPlanBuildStressTest || EnableVPlanPredication || 10038 VectorizationFactor::Disabled() == VF) 10039 return false; 10040 10041 LVP.setBestPlan(VF.Width, 1); 10042 10043 { 10044 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, 10045 F->getParent()->getDataLayout()); 10046 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL, 10047 &CM, BFI, PSI, Checks); 10048 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" 10049 << L->getHeader()->getParent()->getName() << "\"\n"); 10050 LVP.executePlan(LB, DT); 10051 } 10052 10053 // Mark the loop as already vectorized to avoid vectorizing again. 10054 Hints.setAlreadyVectorized(); 10055 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 10056 return true; 10057 } 10058 10059 // Emit a remark if there are stores to floats that required a floating point 10060 // extension. If the vectorized loop was generated with floating point there 10061 // will be a performance penalty from the conversion overhead and the change in 10062 // the vector width. 10063 static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) { 10064 SmallVector<Instruction *, 4> Worklist; 10065 for (BasicBlock *BB : L->getBlocks()) { 10066 for (Instruction &Inst : *BB) { 10067 if (auto *S = dyn_cast<StoreInst>(&Inst)) { 10068 if (S->getValueOperand()->getType()->isFloatTy()) 10069 Worklist.push_back(S); 10070 } 10071 } 10072 } 10073 10074 // Traverse the floating point stores upwards searching, for floating point 10075 // conversions. 10076 SmallPtrSet<const Instruction *, 4> Visited; 10077 SmallPtrSet<const Instruction *, 4> EmittedRemark; 10078 while (!Worklist.empty()) { 10079 auto *I = Worklist.pop_back_val(); 10080 if (!L->contains(I)) 10081 continue; 10082 if (!Visited.insert(I).second) 10083 continue; 10084 10085 // Emit a remark if the floating point store required a floating 10086 // point conversion. 10087 // TODO: More work could be done to identify the root cause such as a 10088 // constant or a function return type and point the user to it. 10089 if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second) 10090 ORE->emit([&]() { 10091 return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision", 10092 I->getDebugLoc(), L->getHeader()) 10093 << "floating point conversion changes vector width. " 10094 << "Mixed floating point precision requires an up/down " 10095 << "cast that will negatively impact performance."; 10096 }); 10097 10098 for (Use &Op : I->operands()) 10099 if (auto *OpI = dyn_cast<Instruction>(Op)) 10100 Worklist.push_back(OpI); 10101 } 10102 } 10103 10104 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts) 10105 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced || 10106 !EnableLoopInterleaving), 10107 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced || 10108 !EnableLoopVectorization) {} 10109 10110 bool LoopVectorizePass::processLoop(Loop *L) { 10111 assert((EnableVPlanNativePath || L->isInnermost()) && 10112 "VPlan-native path is not enabled. 
Only process inner loops."); 10113 10114 #ifndef NDEBUG 10115 const std::string DebugLocStr = getDebugLocString(L); 10116 #endif /* NDEBUG */ 10117 10118 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \"" 10119 << L->getHeader()->getParent()->getName() << "\" from " 10120 << DebugLocStr << "\n"); 10121 10122 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE); 10123 10124 LLVM_DEBUG( 10125 dbgs() << "LV: Loop hints:" 10126 << " force=" 10127 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled 10128 ? "disabled" 10129 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled 10130 ? "enabled" 10131 : "?")) 10132 << " width=" << Hints.getWidth() 10133 << " interleave=" << Hints.getInterleave() << "\n"); 10134 10135 // Function containing loop 10136 Function *F = L->getHeader()->getParent(); 10137 10138 // Looking at the diagnostic output is the only way to determine if a loop 10139 // was vectorized (other than looking at the IR or machine code), so it 10140 // is important to generate an optimization remark for each loop. Most of 10141 // these messages are generated as OptimizationRemarkAnalysis. Remarks 10142 // generated as OptimizationRemark and OptimizationRemarkMissed are 10143 // less verbose reporting vectorized loops and unvectorized loops that may 10144 // benefit from vectorization, respectively. 10145 10146 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) { 10147 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); 10148 return false; 10149 } 10150 10151 PredicatedScalarEvolution PSE(*SE, *L); 10152 10153 // Check if it is legal to vectorize the loop. 10154 LoopVectorizationRequirements Requirements; 10155 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE, 10156 &Requirements, &Hints, DB, AC, BFI, PSI); 10157 if (!LVL.canVectorize(EnableVPlanNativePath)) { 10158 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); 10159 Hints.emitRemarkWithHints(); 10160 return false; 10161 } 10162 10163 // Check the function attributes and profiles to find out if this function 10164 // should be optimized for size. 10165 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 10166 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL); 10167 10168 // Entrance to the VPlan-native vectorization path. Outer loops are processed 10169 // here. They may require CFG and instruction level transformations before 10170 // even evaluating whether vectorization is profitable. Since we cannot modify 10171 // the incoming IR, we need to build VPlan upfront in the vectorization 10172 // pipeline. 10173 if (!L->isInnermost()) 10174 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, 10175 ORE, BFI, PSI, Hints, Requirements); 10176 10177 assert(L->isInnermost() && "Inner loop expected."); 10178 10179 // Check the loop for a trip count threshold: vectorize loops with a tiny trip 10180 // count by optimizing for size, to minimize overheads. 10181 auto ExpectedTC = getSmallBestKnownTC(*SE, L); 10182 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) { 10183 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. 
" 10184 << "This loop is worth vectorizing only if no scalar " 10185 << "iteration overheads are incurred."); 10186 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) 10187 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); 10188 else { 10189 LLVM_DEBUG(dbgs() << "\n"); 10190 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; 10191 } 10192 } 10193 10194 // Check the function attributes to see if implicit floats are allowed. 10195 // FIXME: This check doesn't seem possibly correct -- what if the loop is 10196 // an integer loop and the vector instructions selected are purely integer 10197 // vector instructions? 10198 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { 10199 reportVectorizationFailure( 10200 "Can't vectorize when the NoImplicitFloat attribute is used", 10201 "loop not vectorized due to NoImplicitFloat attribute", 10202 "NoImplicitFloat", ORE, L); 10203 Hints.emitRemarkWithHints(); 10204 return false; 10205 } 10206 10207 // Check if the target supports potentially unsafe FP vectorization. 10208 // FIXME: Add a check for the type of safety issue (denormal, signaling) 10209 // for the target we're vectorizing for, to make sure none of the 10210 // additional fp-math flags can help. 10211 if (Hints.isPotentiallyUnsafe() && 10212 TTI->isFPVectorizationPotentiallyUnsafe()) { 10213 reportVectorizationFailure( 10214 "Potentially unsafe FP op prevents vectorization", 10215 "loop not vectorized due to unsafe FP support.", 10216 "UnsafeFP", ORE, L); 10217 Hints.emitRemarkWithHints(); 10218 return false; 10219 } 10220 10221 if (!LVL.canVectorizeFPMath(ForceOrderedReductions)) { 10222 ORE->emit([&]() { 10223 auto *ExactFPMathInst = Requirements.getExactFPInst(); 10224 return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps", 10225 ExactFPMathInst->getDebugLoc(), 10226 ExactFPMathInst->getParent()) 10227 << "loop not vectorized: cannot prove it is safe to reorder " 10228 "floating-point operations"; 10229 }); 10230 LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to " 10231 "reorder floating-point operations\n"); 10232 Hints.emitRemarkWithHints(); 10233 return false; 10234 } 10235 10236 bool UseInterleaved = TTI->enableInterleavedAccessVectorization(); 10237 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI()); 10238 10239 // If an override option has been passed in for interleaved accesses, use it. 10240 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0) 10241 UseInterleaved = EnableInterleavedMemAccesses; 10242 10243 // Analyze interleaved memory accesses. 10244 if (UseInterleaved) { 10245 IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI)); 10246 } 10247 10248 // Use the cost model. 10249 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, 10250 F, &Hints, IAI); 10251 CM.collectValuesToIgnore(); 10252 CM.collectElementTypesForWidening(); 10253 10254 // Use the planner for vectorization. 10255 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints, 10256 Requirements, ORE); 10257 10258 // Get user vectorization factor and interleave count. 10259 ElementCount UserVF = Hints.getWidth(); 10260 unsigned UserIC = Hints.getInterleave(); 10261 10262 // Plan how to best vectorize, return the best VF and its cost. 10263 Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC); 10264 10265 VectorizationFactor VF = VectorizationFactor::Disabled(); 10266 unsigned IC = 1; 10267 10268 if (MaybeVF) { 10269 VF = *MaybeVF; 10270 // Select the interleave count. 
10271 IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue());
10272 }
10273
10274 // Identify the diagnostic messages that should be produced.
10275 std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
10276 bool VectorizeLoop = true, InterleaveLoop = true;
10277 if (VF.Width.isScalar()) {
10278 LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
10279 VecDiagMsg = std::make_pair(
10280 "VectorizationNotBeneficial",
10281 "the cost-model indicates that vectorization is not beneficial");
10282 VectorizeLoop = false;
10283 }
10284
10285 if (!MaybeVF && UserIC > 1) {
10286 // Tell the user interleaving was avoided up-front, despite being explicitly
10287 // requested.
10288 LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
10289 "interleaving should be avoided up front\n");
10290 IntDiagMsg = std::make_pair(
10291 "InterleavingAvoided",
10292 "Ignoring UserIC, because interleaving was avoided up front");
10293 InterleaveLoop = false;
10294 } else if (IC == 1 && UserIC <= 1) {
10295 // Tell the user interleaving is not beneficial.
10296 LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
10297 IntDiagMsg = std::make_pair(
10298 "InterleavingNotBeneficial",
10299 "the cost-model indicates that interleaving is not beneficial");
10300 InterleaveLoop = false;
10301 if (UserIC == 1) {
10302 IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
10303 IntDiagMsg.second +=
10304 " and is explicitly disabled or interleave count is set to 1";
10305 }
10306 } else if (IC > 1 && UserIC == 1) {
10307 // Tell the user interleaving is beneficial, but it is explicitly disabled.
10308 LLVM_DEBUG(
10309 dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
10310 IntDiagMsg = std::make_pair(
10311 "InterleavingBeneficialButDisabled",
10312 "the cost-model indicates that interleaving is beneficial "
10313 "but is explicitly disabled or interleave count is set to 1");
10314 InterleaveLoop = false;
10315 }
10316
10317 // Override IC if user provided an interleave count.
10318 IC = UserIC > 0 ? UserIC : IC;
10319
10320 // Emit diagnostic messages, if any.
10321 const char *VAPassName = Hints.vectorizeAnalysisPassName();
10322 if (!VectorizeLoop && !InterleaveLoop) {
10323 // Do not vectorize or interleave the loop.
10324 ORE->emit([&]() { 10325 return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first, 10326 L->getStartLoc(), L->getHeader()) 10327 << VecDiagMsg.second; 10328 }); 10329 ORE->emit([&]() { 10330 return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first, 10331 L->getStartLoc(), L->getHeader()) 10332 << IntDiagMsg.second; 10333 }); 10334 return false; 10335 } else if (!VectorizeLoop && InterleaveLoop) { 10336 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 10337 ORE->emit([&]() { 10338 return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first, 10339 L->getStartLoc(), L->getHeader()) 10340 << VecDiagMsg.second; 10341 }); 10342 } else if (VectorizeLoop && !InterleaveLoop) { 10343 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width 10344 << ") in " << DebugLocStr << '\n'); 10345 ORE->emit([&]() { 10346 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first, 10347 L->getStartLoc(), L->getHeader()) 10348 << IntDiagMsg.second; 10349 }); 10350 } else if (VectorizeLoop && InterleaveLoop) { 10351 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width 10352 << ") in " << DebugLocStr << '\n'); 10353 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 10354 } 10355 10356 bool DisableRuntimeUnroll = false; 10357 MDNode *OrigLoopID = L->getLoopID(); 10358 { 10359 // Optimistically generate runtime checks. Drop them if they turn out to not 10360 // be profitable. Limit the scope of Checks, so the cleanup happens 10361 // immediately after vector codegeneration is done. 10362 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, 10363 F->getParent()->getDataLayout()); 10364 if (!VF.Width.isScalar() || IC > 1) 10365 Checks.Create(L, *LVL.getLAI(), PSE.getUnionPredicate()); 10366 LVP.setBestPlan(VF.Width, IC); 10367 10368 using namespace ore; 10369 if (!VectorizeLoop) { 10370 assert(IC > 1 && "interleave count should not be 1 or 0"); 10371 // If we decided that it is not legal to vectorize the loop, then 10372 // interleave it. 10373 InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, 10374 &CM, BFI, PSI, Checks); 10375 LVP.executePlan(Unroller, DT); 10376 10377 ORE->emit([&]() { 10378 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(), 10379 L->getHeader()) 10380 << "interleaved loop (interleaved count: " 10381 << NV("InterleaveCount", IC) << ")"; 10382 }); 10383 } else { 10384 // If we decided that it is *legal* to vectorize the loop, then do it. 10385 10386 // Consider vectorizing the epilogue too if it's profitable. 10387 VectorizationFactor EpilogueVF = 10388 CM.selectEpilogueVectorizationFactor(VF.Width, LVP); 10389 if (EpilogueVF.Width.isVector()) { 10390 10391 // The first pass vectorizes the main loop and creates a scalar epilogue 10392 // to be vectorized by executing the plan (potentially with a different 10393 // factor) again shortly afterwards. 10394 EpilogueLoopVectorizationInfo EPI(VF.Width.getKnownMinValue(), IC, 10395 EpilogueVF.Width.getKnownMinValue(), 10396 1); 10397 EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, 10398 EPI, &LVL, &CM, BFI, PSI, Checks); 10399 10400 LVP.setBestPlan(EPI.MainLoopVF, EPI.MainLoopUF); 10401 LVP.executePlan(MainILV, DT); 10402 ++LoopsVectorized; 10403 10404 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */); 10405 formLCSSARecursively(*L, *DT, LI, SE); 10406 10407 // Second pass vectorizes the epilogue and adjusts the control flow 10408 // edges from the first pass. 
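// Roughly, the final structure after both passes is:
//   main vector loop (EPI.MainLoopVF x EPI.MainLoopUF)
//     -> vectorized epilogue loop (EPI.EpilogueVF x EPI.EpilogueUF)
//       -> scalar remainder loop
// where the epilogue loop handles the iterations the main vector loop left
// over.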
10409 LVP.setBestPlan(EPI.EpilogueVF, EPI.EpilogueUF); 10410 EPI.MainLoopVF = EPI.EpilogueVF; 10411 EPI.MainLoopUF = EPI.EpilogueUF; 10412 EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC, 10413 ORE, EPI, &LVL, &CM, BFI, PSI, 10414 Checks); 10415 LVP.executePlan(EpilogILV, DT); 10416 ++LoopsEpilogueVectorized; 10417 10418 if (!MainILV.areSafetyChecksAdded()) 10419 DisableRuntimeUnroll = true; 10420 } else { 10421 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC, 10422 &LVL, &CM, BFI, PSI, Checks); 10423 LVP.executePlan(LB, DT); 10424 ++LoopsVectorized; 10425 10426 // Add metadata to disable runtime unrolling a scalar loop when there 10427 // are no runtime checks about strides and memory. A scalar loop that is 10428 // rarely used is not worth unrolling. 10429 if (!LB.areSafetyChecksAdded()) 10430 DisableRuntimeUnroll = true; 10431 } 10432 // Report the vectorization decision. 10433 ORE->emit([&]() { 10434 return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(), 10435 L->getHeader()) 10436 << "vectorized loop (vectorization width: " 10437 << NV("VectorizationFactor", VF.Width) 10438 << ", interleaved count: " << NV("InterleaveCount", IC) << ")"; 10439 }); 10440 } 10441 10442 if (ORE->allowExtraAnalysis(LV_NAME)) 10443 checkMixedPrecision(L, ORE); 10444 } 10445 10446 Optional<MDNode *> RemainderLoopID = 10447 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 10448 LLVMLoopVectorizeFollowupEpilogue}); 10449 if (RemainderLoopID.hasValue()) { 10450 L->setLoopID(RemainderLoopID.getValue()); 10451 } else { 10452 if (DisableRuntimeUnroll) 10453 AddRuntimeUnrollDisableMetaData(L); 10454 10455 // Mark the loop as already vectorized to avoid vectorizing again. 10456 Hints.setAlreadyVectorized(); 10457 } 10458 10459 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 10460 return true; 10461 } 10462 10463 LoopVectorizeResult LoopVectorizePass::runImpl( 10464 Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_, 10465 DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_, 10466 DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_, 10467 std::function<const LoopAccessInfo &(Loop &)> &GetLAA_, 10468 OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) { 10469 SE = &SE_; 10470 LI = &LI_; 10471 TTI = &TTI_; 10472 DT = &DT_; 10473 BFI = &BFI_; 10474 TLI = TLI_; 10475 AA = &AA_; 10476 AC = &AC_; 10477 GetLAA = &GetLAA_; 10478 DB = &DB_; 10479 ORE = &ORE_; 10480 PSI = PSI_; 10481 10482 // Don't attempt if 10483 // 1. the target claims to have no vector registers, and 10484 // 2. interleaving won't help ILP. 10485 // 10486 // The second condition is necessary because, even if the target has no 10487 // vector registers, loop vectorization may still enable scalar 10488 // interleaving. 10489 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) && 10490 TTI->getMaxInterleaveFactor(1) < 2) 10491 return LoopVectorizeResult(false, false); 10492 10493 bool Changed = false, CFGChanged = false; 10494 10495 // The vectorizer requires loops to be in simplified form. 10496 // Since simplification may add new inner loops, it has to run before the 10497 // legality and profitability checks. This means running the loop vectorizer 10498 // will simplify all loops, regardless of whether anything end up being 10499 // vectorized. 
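// Simplified (loop-simplify) form guarantees each loop has a preheader, a
// single backedge and dedicated exit blocks, which the legality checks and
// the vector skeleton construction rely on.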
10500 for (auto &L : *LI) 10501 Changed |= CFGChanged |= 10502 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */); 10503 10504 // Build up a worklist of inner-loops to vectorize. This is necessary as 10505 // the act of vectorizing or partially unrolling a loop creates new loops 10506 // and can invalidate iterators across the loops. 10507 SmallVector<Loop *, 8> Worklist; 10508 10509 for (Loop *L : *LI) 10510 collectSupportedLoops(*L, LI, ORE, Worklist); 10511 10512 LoopsAnalyzed += Worklist.size(); 10513 10514 // Now walk the identified inner loops. 10515 while (!Worklist.empty()) { 10516 Loop *L = Worklist.pop_back_val(); 10517 10518 // For the inner loops we actually process, form LCSSA to simplify the 10519 // transform. 10520 Changed |= formLCSSARecursively(*L, *DT, LI, SE); 10521 10522 Changed |= CFGChanged |= processLoop(L); 10523 } 10524 10525 // Process each loop nest in the function. 10526 return LoopVectorizeResult(Changed, CFGChanged); 10527 } 10528 10529 PreservedAnalyses LoopVectorizePass::run(Function &F, 10530 FunctionAnalysisManager &AM) { 10531 auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F); 10532 auto &LI = AM.getResult<LoopAnalysis>(F); 10533 auto &TTI = AM.getResult<TargetIRAnalysis>(F); 10534 auto &DT = AM.getResult<DominatorTreeAnalysis>(F); 10535 auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F); 10536 auto &TLI = AM.getResult<TargetLibraryAnalysis>(F); 10537 auto &AA = AM.getResult<AAManager>(F); 10538 auto &AC = AM.getResult<AssumptionAnalysis>(F); 10539 auto &DB = AM.getResult<DemandedBitsAnalysis>(F); 10540 auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F); 10541 10542 auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager(); 10543 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 10544 [&](Loop &L) -> const LoopAccessInfo & { 10545 LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, 10546 TLI, TTI, nullptr, nullptr}; 10547 return LAM.getResult<LoopAccessAnalysis>(L, AR); 10548 }; 10549 auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F); 10550 ProfileSummaryInfo *PSI = 10551 MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent()); 10552 LoopVectorizeResult Result = 10553 runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI); 10554 if (!Result.MadeAnyChange) 10555 return PreservedAnalyses::all(); 10556 PreservedAnalyses PA; 10557 10558 // We currently do not preserve loopinfo/dominator analyses with outer loop 10559 // vectorization. Until this is addressed, mark these analyses as preserved 10560 // only for non-VPlan-native path. 10561 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 10562 if (!EnableVPlanNativePath) { 10563 PA.preserve<LoopAnalysis>(); 10564 PA.preserve<DominatorTreeAnalysis>(); 10565 } 10566 if (!Result.MadeCFGChange) 10567 PA.preserveSet<CFGAnalyses>(); 10568 return PA; 10569 } 10570