//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one. A schematic example of this
// transformation is given at the end of this header comment.
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
// D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
// Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
// Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
// Data for SIMD
//
// Other ideas/concepts are from:
// A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
// S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
// Vectorizing Compilers.
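//
// As a schematic illustration only (not output of this pass), consider the
// scalar loop
//
//   for (int i = 0; i < n; ++i)
//     a[i] = b[i] + c[i];
//
// For a vectorization factor (VF) of 4, the vector body conceptually becomes
//
//   int i = 0;
//   for (; i + 4 <= n; i += 4) {
//     // The four lanes below are emitted as a single wide load/add/store.
//     a[i + 0] = b[i + 0] + c[i + 0];
//     a[i + 1] = b[i + 1] + c[i + 1];
//     a[i + 2] = b[i + 2] + c[i + 2];
//     a[i + 3] = b[i + 3] + c[i + 3];
//   }
//   for (; i < n; ++i) // scalar epilogue for the remaining iterations
//     a[i] = b[i] + c[i];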
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "LoopVectorizationPlanner.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanHCFGBuilder.h"
#include "VPlanPredicator.h"
#include "VPlanTransforms.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/InstructionCost.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <functional>
#include <iterator>
#include <limits>
#include <memory>
#include <string>
#include <tuple>
#include <utility>

using namespace llvm;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

#ifndef NDEBUG
const char VerboseDebug[] = DEBUG_TYPE "-verbose";
#endif

/// @{
/// Metadata attribute names
const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
const char LLVMLoopVectorizeFollowupVectorized[] =
    "llvm.loop.vectorize.followup_vectorized";
const char LLVMLoopVectorizeFollowupEpilogue[] =
    "llvm.loop.vectorize.followup_epilogue";
/// @}

STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");

static cl::opt<bool> EnableEpilogueVectorization(
    "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
    cl::desc("Enable vectorization of epilogue loops."));

static cl::opt<unsigned> EpilogueVectorizationForceVF(
    "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
    cl::desc("When epilogue vectorization is enabled, and a value greater than "
             "1 is specified, forces the given VF for all applicable epilogue "
             "loops."));

static cl::opt<unsigned> EpilogueVectorizationMinVF(
    "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
    cl::desc("Only loops with vectorization factor equal to or larger than "
             "the specified value are considered for epilogue vectorization."));

/// Loops with a known constant trip count below this number are vectorized
/// only if no scalar iteration overheads are incurred.
static cl::opt<unsigned> TinyTripCountVectorThreshold(
    "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
    cl::desc("Loops with a constant trip count that is smaller than this "
             "value are vectorized only if no scalar iteration overheads "
             "are incurred."));

static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold(
    "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
    cl::desc("The maximum allowed number of runtime memory checks with a "
             "vectorize(enable) pragma."));

// Option prefer-predicate-over-epilogue indicates that an epilogue is
// undesired, that predication is preferred, and lists the available options.
// I.e., the vectorizer will try to fold the tail-loop (epilogue) into the
// vector body and predicate the instructions accordingly. If tail-folding
// fails, there are
// different fallback strategies depending on these values:
namespace PreferPredicateTy {
enum Option {
  ScalarEpilogue = 0,
  PredicateElseScalarEpilogue,
  PredicateOrDontVectorize
};
} // namespace PreferPredicateTy

static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
    "prefer-predicate-over-epilogue",
    cl::init(PreferPredicateTy::ScalarEpilogue),
    cl::Hidden,
    cl::desc("Tail-folding and predication preferences over creating a scalar "
             "epilogue loop."),
    cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
                          "scalar-epilogue",
                          "Don't tail-predicate loops, create scalar epilogue"),
               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
                          "predicate-else-scalar-epilogue",
                          "prefer tail-folding, create scalar epilogue if tail "
                          "folding fails."),
               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
                          "predicate-dont-vectorize",
                          "prefer tail-folding, don't attempt vectorization if "
                          "tail-folding fails.")));

static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in the loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));

static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
             "below this number"));

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

static cl::opt<bool> ForceTargetSupportsScalableVectors(
    "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
    cl::desc(
        "Pretend that scalable vectors are supported, even if the target does "
        "not support them. This flag should only be used for testing."));

static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// Interleave small loops with scalar reductions.
static cl::opt<bool> InterleaveSmallLoopScalarReduction(
    "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
    cl::desc("Enable interleaving for loops with small iteration counts that "
             "contain scalar reductions to expose ILP."));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if-predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

static cl::opt<bool>
    PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
                           cl::Hidden,
                           cl::desc("Prefer in-loop vector reductions, "
                                    "overriding the target's preference."));

static cl::opt<bool> ForceOrderedReductions(
    "force-ordered-reductions", cl::init(false), cl::Hidden,
    cl::desc("Enable the vectorization of loops with in-order (strict) "
             "FP reductions"));

static cl::opt<bool> PreferPredicatedReductionSelect(
    "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
    cl::desc(
        "Prefer predicating a reduction operation over an after-loop select."));

cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));

// FIXME: Remove this switch once we have divergence analysis. Currently we
// assume divergent non-backedge branches when this switch is true.
cl::opt<bool> EnableVPlanPredication(
    "enable-vplan-predication", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path predicator with "
             "support for outer loop vectorization."));

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

cl::opt<bool> PrintVPlansInDotFormat(
    "vplan-print-in-dot-format", cl::init(false), cl::Hidden,
    cl::desc("Use dot format instead of plain text when dumping VPlans"));

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type.
static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
  // Determine if an array of N elements of type Ty is "bitcast compatible"
  // with a <N x Ty> vector.
  // This is only true if there is no padding between the array elements.
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}

/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

/// Returns the "best known" trip count for the specified loop \p L, as defined
/// by the following procedure:
///   1) Returns exact trip count if it is known.
///   2) Returns expected trip count according to profile data if any.
///   3) Returns upper bound estimate if it is known.
///   4) Returns None if all of the above failed.
static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
  // Check if exact trip count is known.
  if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
    return ExpectedTC;

  // Check if there is an expected trip count available from profile data.
  if (LoopVectorizeWithBlockFrequency)
    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
      return EstimatedTC;

  // Check if upper bound estimate is known.
  if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
    return ExpectedTC;

  return None;
}

// Forward declare GeneratedRTChecks.
class GeneratedRTChecks;

namespace llvm {

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found for a given vectorization factor.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                      ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
        PSI(PSI), RTChecks(RTChecks) {
    // Query this against the original loop and save it here because the
    // profile of the original loop header may change as the transformation
    // happens.
    OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
        OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
  }

  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop that will contain vectorized instructions later
  /// on, while the old loop will be used as the scalar remainder. Control flow
  /// is generated around the vectorized (and scalar epilogue) loops consisting
  /// of various checks and bypasses. Return the pre-header block of the new
  /// loop.
  /// In the case of epilogue vectorization, this function is overridden to
  /// handle the more complex control flow around the loops.
  virtual BasicBlock *createVectorizedLoopSkeleton();

  /// Widen a single instruction within the innermost loop.
  void widenInstruction(Instruction &I, VPValue *Def, VPUser &Operands,
                        VPTransformState &State);

  /// Widen a single call instruction within the innermost loop.
  void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
                            VPTransformState &State);

  /// Widen a single select instruction within the innermost loop.
  void widenSelectInstruction(SelectInst &I, VPValue *VPDef, VPUser &Operands,
                              bool InvariantCond, VPTransformState &State);

  /// Fix the vectorized code, taking care of header phi's, live-outs, and
  /// more.
  void fixVectorizedLoop(VPTransformState &State);

  // Return true if any runtime check is added.
  bool areSafetyChecksAdded() { return AddedSafetyChecks; }

  /// A type for vectorized values in the new loop. Each value from the
  /// original loop, when vectorized, is represented by UF vector values in the
  /// new unrolled loop, where UF is the unroll factor.
  using VectorParts = SmallVector<Value *, 2>;

  /// Vectorize a single GetElementPtrInst based on information gathered and
  /// decisions taken during planning.
  void widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, VPUser &Indices,
                unsigned UF, ElementCount VF, bool IsPtrLoopInvariant,
                SmallBitVector &IsIndexLoopInvariant, VPTransformState &State);

  /// Vectorize a single first-order recurrence or pointer induction PHINode in
  /// a block. This method handles the induction variable canonicalization. It
  /// supports both VF = 1 for unrolled loops and arbitrary length vectors.
  void widenPHIInstruction(Instruction *PN, VPWidenPHIRecipe *PhiR,
                           VPTransformState &State);

  /// A helper function to scalarize a single Instruction in the innermost
  /// loop. Generates a sequence of scalar instances for each lane between \p
  /// MinLane and \p MaxLane, times each part between \p MinPart and \p
  /// MaxPart, inclusive. Uses the VPValue operands from \p Operands instead of
  /// \p Instr's operands.
  void scalarizeInstruction(Instruction *Instr, VPValue *Def, VPUser &Operands,
                            const VPIteration &Instance, bool IfPredicateInstr,
                            VPTransformState &State);

  /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
  /// is provided, the integer induction variable will first be truncated to
  /// the corresponding type.
  void widenIntOrFpInduction(PHINode *IV, Value *Start, TruncInst *Trunc,
                             VPValue *Def, VPValue *CastDef,
                             VPTransformState &State);

  /// Construct the vector value of a scalarized value \p V one lane at a time.
  void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance,
                                 VPTransformState &State);

  /// Try to vectorize interleaved access group \p Group with the base address
  /// given in \p Addr, optionally masking the vector operations if \p
  /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
  /// values in the vectorized loop.
  void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
                                ArrayRef<VPValue *> VPDefs,
                                VPTransformState &State, VPValue *Addr,
                                ArrayRef<VPValue *> StoredValues,
                                VPValue *BlockInMask = nullptr);

  /// Vectorize Load and Store instructions with the base address given in \p
  /// Addr, optionally masking the vector operations if \p BlockInMask is
  /// non-null. Use \p State to translate given VPValues to IR values in the
  /// vectorized loop.
  void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
                                  VPValue *Def, VPValue *Addr,
                                  VPValue *StoredValue, VPValue *BlockInMask,
                                  bool ConsecutiveStride, bool Reverse);

  /// Set the debug location in the builder \p Ptr using the debug location in
  /// \p V. If \p Ptr is None then it uses the class member's Builder.
  void setDebugLocFromInst(const Value *V,
                           Optional<IRBuilder<> *> CustomBuilder = None);

  /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
  void fixNonInductionPHIs(VPTransformState &State);

  /// Returns true if the reordering of FP operations is not allowed, but we
  /// are able to vectorize with strict in-order reductions for the given
  /// RdxDesc.
  bool useOrderedReductions(RecurrenceDescriptor &RdxDesc);

  /// Create a broadcast instruction. This method generates a broadcast
  /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, ...
  /// this is needed because each iteration in the loop corresponds to a SIMD
  /// element.
  virtual Value *getBroadcastInstrs(Value *V);

protected:
  friend class LoopVectorizationPlanner;

  /// A small list of PHINodes.
  using PhiVector = SmallVector<PHINode *, 4>;

  /// A type for scalarized values in the new loop. Each value from the
  /// original loop, when scalarized, is represented by UF x VF scalar values
  /// in the new unrolled loop, where UF is the unroll factor and VF is the
  /// vectorization factor.
  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;

  /// Set up the values of the IVs correctly when exiting the vector loop.
  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                    Value *CountRoundDown, Value *EndValue,
                    BasicBlock *MiddleBlock);

  /// Create a new induction variable inside L.
  PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
                                   Value *Step, Instruction *DL);

  /// Handle all cross-iteration phis in the header.
  void fixCrossIterationPHIs(VPTransformState &State);

  /// Create the exit value of first order recurrences in the middle block and
  /// update their users.
  void fixFirstOrderRecurrence(VPWidenPHIRecipe *PhiR, VPTransformState &State);

  /// Create code for the loop exit value of the reduction.
  void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State);

  /// Clear NSW/NUW flags from reduction instructions if necessary.
  void clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc,
                               VPTransformState &State);

  /// Fixup the LCSSA phi nodes in the unique exit block. This simply
  /// means we need to add the appropriate incoming value from the middle
  /// block as exiting edges from the scalar epilogue loop (if present) are
  /// already in place, and we exit the vector loop exclusively to the middle
  /// block.
  void fixLCSSAPHIs(VPTransformState &State);

  /// Iteratively sink the scalarized operands of a predicated instruction into
  /// the block that was created for it.
  void sinkScalarOperands(Instruction *PredInst);

  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
  /// represented as.
  void truncateToMinimalBitwidths(VPTransformState &State);

  /// This function adds
  ///   (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
  /// to each vector element of Val. The sequence starts at StartIndex.
  /// \p Opcode is relevant for FP induction variable.
  virtual Value *
  getStepVector(Value *Val, Value *StartIdx, Value *Step,
                Instruction::BinaryOps Opcode = Instruction::BinaryOpsEnd);

  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
  /// variable on which to base the steps, \p Step is the size of the step, and
  /// \p EntryVal is the value from the original loop that maps to the steps.
  /// Note that \p EntryVal doesn't have to be an induction variable - it
  /// can also be a truncate instruction.
  void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
                        const InductionDescriptor &ID, VPValue *Def,
                        VPValue *CastDef, VPTransformState &State);

  /// Create a vector induction phi node based on an existing scalar one. \p
  /// EntryVal is the value from the original loop that maps to the vector phi
  /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
  /// truncate instruction, instead of widening the original IV, we widen a
  /// version of the IV truncated to \p EntryVal's type.
  void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
                                       Value *Step, Value *Start,
                                       Instruction *EntryVal, VPValue *Def,
                                       VPValue *CastDef,
                                       VPTransformState &State);

  /// Returns true if an instruction \p I should be scalarized instead of
  /// vectorized for the chosen vectorization factor.
  bool shouldScalarizeInstruction(Instruction *I) const;

  /// Returns true if we should generate a scalar version of \p IV.
  bool needsScalarInduction(Instruction *IV) const;

  /// If there is a cast involved in the induction variable \p ID, which should
  /// be ignored in the vectorized loop body, this function records the
  /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
  /// cast. We had already proved that the casted Phi is equal to the uncasted
  /// Phi in the vectorized loop (under a runtime guard), and therefore
  /// there is no need to vectorize the cast - the same value can be used in
  /// the vector loop for both the Phi and the cast.
  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
  /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
  ///
  /// \p EntryVal is the value from the original loop that maps to the vector
  /// phi node and is used to distinguish what is the IV currently being
  /// processed - original one (if \p EntryVal is a phi corresponding to the
  /// original IV) or the "newly-created" one based on the proof mentioned
  /// above (see also buildScalarSteps() and createVectorIntOrFpInductionPHI()).
  /// In the latter case \p EntryVal is a TruncInst and we must not record
  /// anything for that IV, but it's error-prone to expect callers of this
  /// routine to care about that, hence this explicit parameter.
  void recordVectorLoopValueForInductionCast(
      const InductionDescriptor &ID, const Instruction *EntryVal,
      Value *VectorLoopValue, VPValue *CastDef, VPTransformState &State,
      unsigned Part, unsigned Lane = UINT_MAX);

  /// Generate a shuffle sequence that will reverse the vector Vec.
  virtual Value *reverseVector(Value *Vec);

  /// Returns (and creates if needed) the original loop trip count.
  Value *getOrCreateTripCount(Loop *NewLoop);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(Loop *NewLoop);

  /// Returns a bitcasted value to the requested vector type.
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                const DataLayout &DL);

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct. Returns the block containing the checks or
  /// nullptr if no checks have been added.
  BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  /// Returns the block containing the checks or nullptr if no checks have been
  /// added.
  BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);

  /// Compute the transformed value of Index at offset StartValue using step
  /// StepValue.
  /// For integer induction, returns StartValue + Index * StepValue.
  /// For pointer induction, returns StartValue[Index * StepValue].
  /// FIXME: The newly created binary instructions should contain nsw/nuw
  /// flags, which can be found from the original scalar operations.
  Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
                              const DataLayout &DL,
                              const InductionDescriptor &ID) const;

  /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
  /// vector loop preheader, middle block and scalar preheader. Also
  /// allocate a loop object for the new vector loop and return it.
  Loop *createVectorLoopSkeleton(StringRef Prefix);

  /// Create new phi nodes for the induction variables to resume iteration
  /// count in the scalar epilogue, from where the vectorized loop left off
  /// (given by \p VectorTripCount).
  /// In cases where the loop skeleton is more complicated (e.g. epilogue
  /// vectorization) and the resume values can come from an additional bypass
  /// block, the \p AdditionalBypass pair provides information about the bypass
  /// block and the end value on the edge from bypass to this loop.
  void createInductionResumeValues(
      Loop *L, Value *VectorTripCount,
      std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});

  /// Complete the loop skeleton by adding debug MDs, creating appropriate
  /// conditional branches in the middle block, preparing the builder and
  /// running the verifier. Take in the vector loop \p L as argument, and
  /// return the preheader of the completed vector loop.
  BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);

  /// Add additional metadata to \p To that was not present on \p Orig.
  ///
  /// Currently this is used to add the noalias annotations based on the
  /// inserted memchecks. Use this for instructions that are *cloned* into the
  /// vector loop.
  void addNewMetadata(Instruction *To, const Instruction *Orig);

  /// Add metadata from one instruction to another.
  ///
  /// This includes both the original MDs from \p From and additional ones
  /// (\see addNewMetadata). Use this for *newly created* instructions in the
  /// vector loop.
  void addMetadata(Instruction *To, Instruction *From);

  /// Similar to the previous function but it adds the metadata to a
  /// vector of instructions.
  void addMetadata(ArrayRef<Value *> To, Instruction *From);

  /// Allow subclasses to override and print debug traces before/after vplan
  /// execution, when trace information is requested.
  virtual void printDebugTracesAtStart() {}
  virtual void printDebugTracesAtEnd() {}

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Alias Analysis.
  AAResults *AA;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// LoopVersioning. It's only set up (non-null) if memchecks were
  /// used.
  ///
  /// This is currently only used to add no-alias metadata based on the
  /// memchecks. The actual versioning is performed manually.
  std::unique_ptr<LoopVersioning> LVer;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  ElementCount VF;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use.
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;

  /// The unique ExitBlock of the scalar loop if one exists. Note that
  /// there can be multiple exiting edges reaching this block.
  BasicBlock *LoopExitBlock;

  /// The vector loop body.
  BasicBlock *LoopVectorBody;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// The new Induction variable which was added to the new block.
  PHINode *Induction = nullptr;

  /// The induction variable of the old basic block.
  PHINode *OldInduction = nullptr;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF)).
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
  // fixed up at the end of vector code generation.
  SmallVector<PHINode *, 8> OrigPHIsToFix;

  /// BFI and PSI are used to check for profile-guided size optimizations.
  BlockFrequencyInfo *BFI;
  ProfileSummaryInfo *PSI;

  // Whether this loop should be optimized for size based on profile-guided
  // size optimizations.
  bool OptForSizeBasedOnProfile;

  /// Structure to hold information about generated runtime checks, responsible
  /// for cleaning the checks, if vectorization turns out unprofitable.
  GeneratedRTChecks &RTChecks;
};

class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                    ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            ElementCount::getFixed(1), UnrollFactor, LVL, CM,
                            BFI, PSI, Check) {}

private:
  Value *getBroadcastInstrs(Value *V) override;
  Value *getStepVector(
      Value *Val, Value *StartIdx, Value *Step,
      Instruction::BinaryOps Opcode = Instruction::BinaryOpsEnd) override;
  Value *reverseVector(Value *Vec) override;
};

/// Encapsulate information regarding vectorization of a loop and its epilogue.
/// This information is meant to be updated and used across two stages of
/// epilogue vectorization.
struct EpilogueLoopVectorizationInfo {
  ElementCount MainLoopVF = ElementCount::getFixed(0);
  unsigned MainLoopUF = 0;
  ElementCount EpilogueVF = ElementCount::getFixed(0);
  unsigned EpilogueUF = 0;
  BasicBlock *MainLoopIterationCountCheck = nullptr;
  BasicBlock *EpilogueIterationCountCheck = nullptr;
  BasicBlock *SCEVSafetyCheck = nullptr;
  BasicBlock *MemSafetyCheck = nullptr;
  Value *TripCount = nullptr;
  Value *VectorTripCount = nullptr;

  EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF,
                                ElementCount EVF, unsigned EUF)
      : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) {
    assert(EUF == 1 &&
           "A high UF for the epilogue loop is likely not beneficial.");
  }
};

/// An extension of the inner loop vectorizer that creates a skeleton for a
/// vectorized loop that has its epilogue (residual) also vectorized.
/// The idea is to run the vplan on a given loop twice, firstly to set up the
/// skeleton and vectorize the main loop, and secondly to complete the skeleton
/// from the first step and vectorize the epilogue. This is achieved by
/// deriving two concrete strategy classes from this base class and invoking
/// them in succession from the loop vectorizer planner.
class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
public:
  InnerLoopAndEpilogueVectorizer(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI,
                            Checks),
        EPI(EPI) {}

  // Override this function to handle the more complex control flow around the
  // three loops.
  BasicBlock *createVectorizedLoopSkeleton() final override {
    return createEpilogueVectorizedLoopSkeleton();
  }

  /// The interface for creating a vectorized skeleton using one of two
  /// different strategies, each corresponding to one execution of the vplan
  /// as described above.
  virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0;

  /// Holds and updates state information required to vectorize the main loop
  /// and its epilogue in two separate passes. This setup helps us avoid
  /// regenerating and recomputing runtime safety checks. It also helps us to
  /// shorten the iteration-count-check path length for the cases where the
  /// iteration count of the loop is so small that the main vector loop is
  /// completely skipped.
  EpilogueLoopVectorizationInfo &EPI;
};

/// A specialized derived class of inner loop vectorizer that performs
/// vectorization of *main* loops in the process of vectorizing loops and their
/// epilogues.
class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerMainLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Check)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Check) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *main loop* strategy (i.e. the first pass of vplan execution).
  BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check once for the main loop (when \p
  /// ForEpilogue is false) and once for the epilogue loop (when \p
  /// ForEpilogue is true).
  BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass,
                                             bool ForEpilogue);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};

// A specialized derived class of inner loop vectorizer that performs
// vectorization of *epilogue* loops in the process of vectorizing loops and
// their epilogues.
class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerEpilogueLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Checks) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *epilogue loop* strategy (i.e. the second pass of vplan execution).
  BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check after the main vector loop has
  /// finished to see if there are any iterations left to execute by either
  /// the vector epilogue or the scalar epilogue.
  BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L,
                                                      BasicBlock *Bypass,
                                                      BasicBlock *Insert);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};
} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its
/// operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return I;

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I;

  for (Use &Op : I->operands()) {
    if (Instruction *OpInst = dyn_cast<Instruction>(Op))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }

  return I;
}

void InnerLoopVectorizer::setDebugLocFromInst(
    const Value *V, Optional<IRBuilder<> *> CustomBuilder) {
  IRBuilder<> *B = (CustomBuilder == None) ? &Builder : *CustomBuilder;
  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(V)) {
    const DILocation *DIL = Inst->getDebugLoc();

    // When an FSDiscriminator is enabled, we don't need to add the multiply
    // factors to the discriminators.
    if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
        !isa<DbgInfoIntrinsic>(Inst) && !EnableFSDiscriminator) {
      // FIXME: For scalable vectors, assume vscale=1.
      auto NewDIL =
          DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
      if (NewDIL)
        B->SetCurrentDebugLocation(NewDIL.getValue());
      else
        LLVM_DEBUG(dbgs()
                   << "Failed to create new discriminator: "
                   << DIL->getFilename() << " Line: " << DIL->getLine());
    } else
      B->SetCurrentDebugLocation(DIL);
  } else
    B->SetCurrentDebugLocation(DebugLoc());
}

/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
/// is passed, the message relates to that particular instruction.
#ifndef NDEBUG
static void debugVectorizationMessage(const StringRef Prefix,
                                      const StringRef DebugMsg,
                                      Instruction *I) {
  dbgs() << "LV: " << Prefix << DebugMsg;
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}
#endif

/// Create an analysis remark that explains why vectorization failed
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
/// RemarkName is the identifier for the remark. If \p I is passed it is an
/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
/// the location of the remark. \return the remark object that can be
/// streamed to.
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
                                                   StringRef RemarkName,
                                                   Loop *TheLoop,
                                                   Instruction *I) {
  Value *CodeRegion = TheLoop->getHeader();
  DebugLoc DL = TheLoop->getStartLoc();

  if (I) {
    CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, fall back to
    // using the loop's.
    if (I->getDebugLoc())
      DL = I->getDebugLoc();
  }

  return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
}

/// Return a value for Step multiplied by VF.
static Value *createStepForVF(IRBuilder<> &B, Constant *Step, ElementCount VF) {
  assert(isa<ConstantInt>(Step) && "Expected an integer step");
  Constant *StepVal = ConstantInt::get(
      Step->getType(),
      cast<ConstantInt>(Step)->getSExtValue() * VF.getKnownMinValue());
  return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
}

namespace llvm {

/// Return the runtime value for VF.
Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF) {
  Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue());
  return VF.isScalable() ? B.CreateVScale(EC) : EC;
}

static Value *getRuntimeVFAsFloat(IRBuilder<> &B, Type *FTy, ElementCount VF) {
  assert(FTy->isFloatingPointTy() && "Expected floating point type!");
  Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits());
  Value *RuntimeVF = getRuntimeVF(B, IntTy, VF);
  return B.CreateUIToFP(RuntimeVF, FTy);
}

void reportVectorizationFailure(const StringRef DebugMsg,
                                const StringRef OREMsg, const StringRef ORETag,
                                OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                                Instruction *I) {
  LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(
      createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
      << "loop not vectorized: " << OREMsg);
}

void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
                             OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                             Instruction *I) {
  LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(
      createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
      << Msg);
}

} // end namespace llvm

#ifndef NDEBUG
/// \return string containing a file name and a line # for the given loop.
static std::string getDebugLocString(const Loop *L) {
  std::string Result;
  if (L) {
    raw_string_ostream OS(Result);
    if (const DebugLoc LoopDbgLoc = L->getStartLoc())
      LoopDbgLoc.print(OS);
    else
      // Just print the module name.
      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
    OS.flush();
  }
  return Result;
}
#endif

void InnerLoopVectorizer::addNewMetadata(Instruction *To,
                                         const Instruction *Orig) {
  // If the loop was versioned with memchecks, add the corresponding no-alias
  // metadata.
  if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
    LVer->annotateInstWithNoAlias(To, Orig);
}

void InnerLoopVectorizer::addMetadata(Instruction *To,
                                      Instruction *From) {
  propagateMetadata(To, From);
  addNewMetadata(To, From);
}

void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
                                      Instruction *From) {
  for (Value *V : To) {
    if (Instruction *I = dyn_cast<Instruction>(V))
      addMetadata(I, From);
  }
}

namespace llvm {

// Loop vectorization cost-model hints how the scalar epilogue loop should be
// lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorization with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate,

  // Directive indicating we must either tail fold or not vectorize.
  CM_ScalarEpilogueNotAllowedUsePredicate
};

/// ElementCountComparator creates a total ordering for ElementCount
/// for the purposes of using it in a set structure.
struct ElementCountComparator {
  bool operator()(const ElementCount &LHS, const ElementCount &RHS) const {
    return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) <
           std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue());
  }
};
using ElementCountSet = SmallSet<ElementCount, 16, ElementCountComparator>;

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen for a
/// number of reasons. In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
class LoopVectorizationCostModel {
public:
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {}

  /// \return An upper bound for the vectorization factors (both fixed and
  /// scalable). If the factors are 0, vectorization and interleaving should be
  /// avoided up front.
  FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
  bool runtimeChecksRequired();

  /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every VF in \p CandidateVFs. If UserVF is not ZERO
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
  VectorizationFactor
  selectVectorizationFactor(const ElementCountSet &CandidateVFs);

  VectorizationFactor
  selectEpilogueVectorizationFactor(const ElementCount MaxVF,
                                    const LoopVectorizationPlanner &LVP);

  /// Setup cost-based decisions for user vectorization factor.
  /// \return true if the UserVF is a feasible VF to be chosen.
  bool selectUserVectorizationFactor(ElementCount UserVF) {
    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);
    return expectedCost(UserVF).first.isValid();
  }

  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// \return The desired interleave count.
  /// If interleave count has been specified by metadata it will be returned.
  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
  /// are the selected vectorization factor and the cost of the selected VF.
  unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);

  /// Memory access instruction may be vectorized in more than one way.
  /// Form of instruction after vectorization depends on cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decisions map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with widening decision in order to
  /// avoid redundant calculations.
  void setCostBasedWideningDecision(ElementCount VF);

  /// A struct that represents some properties of the register usage
  /// of a loop.
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
    /// Holds the maximum number of concurrent live intervals in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
  };

  /// \return Returns information about the register usages of the loop for the
  /// given vectorization factors.
  SmallVector<RegisterUsage, 8>
  calculateRegisterUsage(ArrayRef<ElementCount> VFs);

  /// Collect values we want to ignore in the cost model.
  void collectValuesToIgnore();

  /// Collect all element types in the loop for which widening is needed.
  void collectElementTypesForWidening();

  /// Split reductions into those that happen in the loop, and those that
  /// happen outside. In-loop reductions are collected into
  /// InLoopReductionChains.
  void collectInLoopReductions();

  /// Returns true if we should use strict in-order reductions for the given
  /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
  /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
  /// of FP operations.
  bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) {
    return !Hints->allowReordering() && RdxDesc.isOrdered();
  }

  /// \returns The smallest bitwidth each instruction can be represented with.
  /// The vector equivalents of these instructions should be truncated to this
  /// type.
  const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
    return MinBWs;
  }

  /// \returns True if it is more profitable to scalarize instruction \p I for
  /// vectorization factor \p VF.
1339 bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
1340 assert(VF.isVector() &&
1341 "Profitable to scalarize relevant only for VF > 1.");
1342
1343 // Cost model is not run in the VPlan-native path - return conservative
1344 // result until this changes.
1345 if (EnableVPlanNativePath)
1346 return false;
1347
1348 auto Scalars = InstsToScalarize.find(VF);
1349 assert(Scalars != InstsToScalarize.end() &&
1350 "VF not yet analyzed for scalarization profitability");
1351 return Scalars->second.find(I) != Scalars->second.end();
1352 }
1353
1354 /// Returns true if \p I is known to be uniform after vectorization.
1355 bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
1356 if (VF.isScalar())
1357 return true;
1358
1359 // Cost model is not run in the VPlan-native path - return conservative
1360 // result until this changes.
1361 if (EnableVPlanNativePath)
1362 return false;
1363
1364 auto UniformsPerVF = Uniforms.find(VF);
1365 assert(UniformsPerVF != Uniforms.end() &&
1366 "VF not yet analyzed for uniformity");
1367 return UniformsPerVF->second.count(I);
1368 }
1369
1370 /// Returns true if \p I is known to be scalar after vectorization.
1371 bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
1372 if (VF.isScalar())
1373 return true;
1374
1375 // Cost model is not run in the VPlan-native path - return conservative
1376 // result until this changes.
1377 if (EnableVPlanNativePath)
1378 return false;
1379
1380 auto ScalarsPerVF = Scalars.find(VF);
1381 assert(ScalarsPerVF != Scalars.end() &&
1382 "Scalar values are not calculated for VF");
1383 return ScalarsPerVF->second.count(I);
1384 }
1385
1386 /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1387 /// for vectorization factor \p VF.
1388 bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
1389 return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
1390 !isProfitableToScalarize(I, VF) &&
1391 !isScalarAfterVectorization(I, VF);
1392 }
1393
1394 /// Decision that was taken during cost calculation for a memory instruction.
1395 enum InstWidening {
1396 CM_Unknown,
1397 CM_Widen, // For consecutive accesses with stride +1.
1398 CM_Widen_Reverse, // For consecutive accesses with stride -1.
1399 CM_Interleave,
1400 CM_GatherScatter,
1401 CM_Scalarize
1402 };
1403
1404 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1405 /// instruction \p I and vector width \p VF.
1406 void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
1407 InstructionCost Cost) {
1408 assert(VF.isVector() && "Expected VF >=2");
1409 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1410 }
1411
1412 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1413 /// interleaving group \p Grp and vector width \p VF.
1414 void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1415 ElementCount VF, InstWidening W,
1416 InstructionCost Cost) {
1417 assert(VF.isVector() && "Expected VF >=2");
1418 /// Broadcast this decision to all instructions inside the group.
1419 /// But the cost will be assigned to one instruction only.
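/// The insert-position member receives the full group cost; every other
/// member is recorded with a cost of zero so the group is not counted twice.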
1420 for (unsigned i = 0; i < Grp->getFactor(); ++i) { 1421 if (auto *I = Grp->getMember(i)) { 1422 if (Grp->getInsertPos() == I) 1423 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost); 1424 else 1425 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0); 1426 } 1427 } 1428 } 1429 1430 /// Return the cost model decision for the given instruction \p I and vector 1431 /// width \p VF. Return CM_Unknown if this instruction did not pass 1432 /// through the cost modeling. 1433 InstWidening getWideningDecision(Instruction *I, ElementCount VF) const { 1434 assert(VF.isVector() && "Expected VF to be a vector VF"); 1435 // Cost model is not run in the VPlan-native path - return conservative 1436 // result until this changes. 1437 if (EnableVPlanNativePath) 1438 return CM_GatherScatter; 1439 1440 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF); 1441 auto Itr = WideningDecisions.find(InstOnVF); 1442 if (Itr == WideningDecisions.end()) 1443 return CM_Unknown; 1444 return Itr->second.first; 1445 } 1446 1447 /// Return the vectorization cost for the given instruction \p I and vector 1448 /// width \p VF. 1449 InstructionCost getWideningCost(Instruction *I, ElementCount VF) { 1450 assert(VF.isVector() && "Expected VF >=2"); 1451 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF); 1452 assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() && 1453 "The cost is not calculated"); 1454 return WideningDecisions[InstOnVF].second; 1455 } 1456 1457 /// Return True if instruction \p I is an optimizable truncate whose operand 1458 /// is an induction variable. Such a truncate will be removed by adding a new 1459 /// induction variable with the destination type. 1460 bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) { 1461 // If the instruction is not a truncate, return false. 1462 auto *Trunc = dyn_cast<TruncInst>(I); 1463 if (!Trunc) 1464 return false; 1465 1466 // Get the source and destination types of the truncate. 1467 Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF); 1468 Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF); 1469 1470 // If the truncate is free for the given types, return false. Replacing a 1471 // free truncate with an induction variable would add an induction variable 1472 // update instruction to each iteration of the loop. We exclude from this 1473 // check the primary induction variable since it will need an update 1474 // instruction regardless. 1475 Value *Op = Trunc->getOperand(0); 1476 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy)) 1477 return false; 1478 1479 // If the truncated value is not an induction variable, return false. 1480 return Legal->isInductionPhi(Op); 1481 } 1482 1483 /// Collects the instructions to scalarize for each predicated instruction in 1484 /// the loop. 1485 void collectInstsToScalarize(ElementCount VF); 1486 1487 /// Collect Uniform and Scalar values for the given \p VF. 1488 /// The sets depend on CM decision for Load/Store instructions 1489 /// that may be vectorized as interleave, gather-scatter or scalarized. 1490 void collectUniformsAndScalars(ElementCount VF) { 1491 // Do the analysis once. 
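// A scalar VF needs no analysis, and each vector VF is analyzed at most
// once; an existing entry in Uniforms marks the VF as already processed.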
1492 if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end()) 1493 return; 1494 setCostBasedWideningDecision(VF); 1495 collectLoopUniforms(VF); 1496 collectLoopScalars(VF); 1497 } 1498 1499 /// Returns true if the target machine supports masked store operation 1500 /// for the given \p DataType and kind of access to \p Ptr. 1501 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const { 1502 return Legal->isConsecutivePtr(DataType, Ptr) && 1503 TTI.isLegalMaskedStore(DataType, Alignment); 1504 } 1505 1506 /// Returns true if the target machine supports masked load operation 1507 /// for the given \p DataType and kind of access to \p Ptr. 1508 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const { 1509 return Legal->isConsecutivePtr(DataType, Ptr) && 1510 TTI.isLegalMaskedLoad(DataType, Alignment); 1511 } 1512 1513 /// Returns true if the target machine can represent \p V as a masked gather 1514 /// or scatter operation. 1515 bool isLegalGatherOrScatter(Value *V) { 1516 bool LI = isa<LoadInst>(V); 1517 bool SI = isa<StoreInst>(V); 1518 if (!LI && !SI) 1519 return false; 1520 auto *Ty = getLoadStoreType(V); 1521 Align Align = getLoadStoreAlignment(V); 1522 return (LI && TTI.isLegalMaskedGather(Ty, Align)) || 1523 (SI && TTI.isLegalMaskedScatter(Ty, Align)); 1524 } 1525 1526 /// Returns true if the target machine supports all of the reduction 1527 /// variables found for the given VF. 1528 bool canVectorizeReductions(ElementCount VF) const { 1529 return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 1530 const RecurrenceDescriptor &RdxDesc = Reduction.second; 1531 return TTI.isLegalToVectorizeReduction(RdxDesc, VF); 1532 })); 1533 } 1534 1535 /// Returns true if \p I is an instruction that will be scalarized with 1536 /// predication. Such instructions include conditional stores and 1537 /// instructions that may divide by zero. 1538 /// If a non-zero VF has been calculated, we check if I will be scalarized 1539 /// predication for that VF. 1540 bool isScalarWithPredication(Instruction *I) const; 1541 1542 // Returns true if \p I is an instruction that will be predicated either 1543 // through scalar predication or masked load/store or masked gather/scatter. 1544 // Superset of instructions that return true for isScalarWithPredication. 1545 bool isPredicatedInst(Instruction *I) { 1546 if (!blockNeedsPredication(I->getParent())) 1547 return false; 1548 // Loads and stores that need some form of masked operation are predicated 1549 // instructions. 1550 if (isa<LoadInst>(I) || isa<StoreInst>(I)) 1551 return Legal->isMaskRequired(I); 1552 return isScalarWithPredication(I); 1553 } 1554 1555 /// Returns true if \p I is a memory instruction with consecutive memory 1556 /// access that can be widened. 1557 bool 1558 memoryInstructionCanBeWidened(Instruction *I, 1559 ElementCount VF = ElementCount::getFixed(1)); 1560 1561 /// Returns true if \p I is a memory instruction in an interleaved-group 1562 /// of memory accesses that can be vectorized with wide vector loads/stores 1563 /// and shuffles. 1564 bool 1565 interleavedAccessCanBeWidened(Instruction *I, 1566 ElementCount VF = ElementCount::getFixed(1)); 1567 1568 /// Check if \p Instr belongs to any interleaved access group. 1569 bool isAccessInterleaved(Instruction *Instr) { 1570 return InterleaveInfo.isInterleaved(Instr); 1571 } 1572 1573 /// Get the interleaved access group that \p Instr belongs to. 
1574 const InterleaveGroup<Instruction> *
1575 getInterleavedAccessGroup(Instruction *Instr) {
1576 return InterleaveInfo.getInterleaveGroup(Instr);
1577 }
1578
1579 /// Returns true if we're required to use a scalar epilogue for at least
1580 /// the final iteration of the original loop.
1581 bool requiresScalarEpilogue(ElementCount VF) const {
1582 if (!isScalarEpilogueAllowed())
1583 return false;
1584 // If we might exit from anywhere but the latch, must run the exiting
1585 // iteration in scalar form.
1586 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
1587 return true;
1588 return VF.isVector() && InterleaveInfo.requiresScalarEpilogue();
1589 }
1590
1591 /// Returns true if a scalar epilogue is allowed, i.e. it has not been
1592 /// disallowed due to optsize or a loop hint annotation.
1593 bool isScalarEpilogueAllowed() const {
1594 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1595 }
1596
1597 /// Returns true if all loop blocks should be masked to fold the tail loop.
1598 bool foldTailByMasking() const { return FoldTailByMasking; }
1599
1600 bool blockNeedsPredication(BasicBlock *BB) const {
1601 return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1602 }
1603
1604 /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
1605 /// nodes to the chain of instructions representing the reductions. Uses a
1606 /// MapVector to ensure deterministic iteration order.
1607 using ReductionChainMap =
1608 SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;
1609
1610 /// Return the map of in-loop reduction chains.
1611 const ReductionChainMap &getInLoopReductionChains() const {
1612 return InLoopReductionChains;
1613 }
1614
1615 /// Returns true if the Phi is part of an inloop reduction.
1616 bool isInLoopReduction(PHINode *Phi) const {
1617 return InLoopReductionChains.count(Phi);
1618 }
1619
1620 /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1621 /// with factor VF. Return the cost of the instruction, including
1622 /// scalarization overhead if it's needed.
1623 InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;
1624
1625 /// Estimate cost of a call instruction CI if it were vectorized with factor
1626 /// VF. Return the cost of the instruction, including scalarization overhead
1627 /// if it's needed. The flag NeedToScalarize shows if the call needs to be
1628 /// scalarized -
1629 /// i.e. either a vector version isn't available, or it is too expensive.
1630 InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF,
1631 bool &NeedToScalarize) const;
1632
1633 /// Returns true if the per-lane cost of VectorizationFactor A is lower than
1634 /// that of B.
1635 bool isMoreProfitable(const VectorizationFactor &A,
1636 const VectorizationFactor &B) const;
1637
1638 /// Invalidates decisions already taken by the cost model.
1639 void invalidateCostModelingDecisions() {
1640 WideningDecisions.clear();
1641 Uniforms.clear();
1642 Scalars.clear();
1643 }
1644
1645 private:
1646 unsigned NumPredStores = 0;
1647
1648 /// \return An upper bound for the vectorization factors for both
1649 /// fixed and scalable vectorization, where the minimum-known number of
1650 /// elements is a power-of-2 larger than zero. If scalable vectorization is
1651 /// disabled or unsupported, then the scalable part will be equal to
1652 /// ElementCount::getScalable(0).
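/// For example, a result with a fixed part of 8 and a scalable part of 0
/// means that only fixed-width vectorization, with VFs of at most 8, should
/// be considered.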
1653 FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount,
1654 ElementCount UserVF);
1655
1656 /// \return the maximized element count based on the target's vector
1657 /// registers and the loop trip-count, but limited to a maximum safe VF.
1658 /// This is a helper function of computeFeasibleMaxVF.
1659 /// FIXME: MaxSafeVF is currently passed by reference to avoid some obscure
1660 /// issue that occurred on one of the buildbots which cannot be reproduced
1661 /// without having access to the proprietary compiler (see comments on
1662 /// D98509). The issue is currently under investigation and this workaround
1663 /// will be removed as soon as possible.
1664 ElementCount getMaximizedVFForTarget(unsigned ConstTripCount,
1665 unsigned SmallestType,
1666 unsigned WidestType,
1667 const ElementCount &MaxSafeVF);
1668
1669 /// \return the maximum legal scalable VF, based on the safe max number
1670 /// of elements.
1671 ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
1672
1673 /// The vectorization cost is a combination of the cost itself and a boolean
1674 /// indicating whether any of the contributing operations will actually
1675 /// operate on vector values after type legalization in the backend. If this
1676 /// latter value is false, then all operations will be scalarized (i.e. no
1677 /// vectorization has actually taken place).
1678 using VectorizationCostTy = std::pair<InstructionCost, bool>;
1679
1680 /// Returns the expected execution cost. The unit of the cost does
1681 /// not matter because we use the 'cost' units to compare different
1682 /// vector widths. The cost that is returned is *not* normalized by
1683 /// the factor width. If \p Invalid is not nullptr, this function
1684 /// will add a pair(Instruction*, ElementCount) to \p Invalid for
1685 /// each instruction that has an Invalid cost for the given VF.
1686 using InstructionVFPair = std::pair<Instruction *, ElementCount>;
1687 VectorizationCostTy
1688 expectedCost(ElementCount VF,
1689 SmallVectorImpl<InstructionVFPair> *Invalid = nullptr);
1690
1691 /// Returns the execution time cost of an instruction for a given vector
1692 /// width. A vector width of one means scalar.
1693 VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
1694
1695 /// The cost-computation logic from getInstructionCost which provides
1696 /// the vector type as an output parameter.
1697 InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
1698 Type *&VectorTy);
1699
1700 /// Return the cost of instructions in an inloop reduction pattern, if I is
1701 /// part of that pattern.
1702 Optional<InstructionCost>
1703 getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
1704 TTI::TargetCostKind CostKind);
1705
1706 /// Calculate vectorization cost of memory instruction \p I.
1707 InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1708
1709 /// The cost computation for a scalarized memory instruction.
1710 InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1711
1712 /// The cost computation for an interleaving group of memory instructions.
1713 InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1714
1715 /// The cost computation for a Gather/Scatter instruction.
1716 InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1717
1718 /// The cost computation for widening instruction \p I with consecutive
1719 /// memory access.
1720 InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1721
1722 /// The cost calculation for Load/Store instruction \p I with a uniform pointer -
1723 /// Load: scalar load + broadcast.
1724 /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1725 /// element)
1726 InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
1727
1728 /// Estimate the overhead of scalarizing an instruction. This is a
1729 /// convenience wrapper for the type-based getScalarizationOverhead API.
1730 InstructionCost getScalarizationOverhead(Instruction *I,
1731 ElementCount VF) const;
1732
1733 /// Returns whether the instruction is a load or store and will be emitted
1734 /// as a vector operation.
1735 bool isConsecutiveLoadOrStore(Instruction *I);
1736
1737 /// Returns true if an artificially high cost for emulated masked memrefs
1738 /// should be used.
1739 bool useEmulatedMaskMemRefHack(Instruction *I);
1740
1741 /// Map of scalar integer values to the smallest bitwidth they can be legally
1742 /// represented as. The vector equivalents of these values should be truncated
1743 /// to this type.
1744 MapVector<Instruction *, uint64_t> MinBWs;
1745
1746 /// A type representing the costs for instructions if they were to be
1747 /// scalarized rather than vectorized. The entries are Instruction-Cost
1748 /// pairs.
1749 using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
1750
1751 /// A set containing all BasicBlocks that are known to be present after
1752 /// vectorization as predicated blocks.
1753 SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1754
1755 /// Records whether it is allowed to have the original scalar loop execute at
1756 /// least once. This may be needed as a fallback loop in case runtime
1757 /// aliasing/dependence checks fail, or to handle the tail/remainder
1758 /// iterations when the trip count is unknown or isn't a multiple of the VF,
1759 /// or as a peel-loop to handle gaps in interleave-groups.
1760 /// Under optsize and when the trip count is very small we don't allow any
1761 /// iterations to execute in the scalar loop.
1762 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1763
1764 /// All blocks of the loop are to be masked to fold the tail of the scalar iterations.
1765 bool FoldTailByMasking = false;
1766
1767 /// A map holding scalar costs for different vectorization factors. The
1768 /// presence of a cost for an instruction in the mapping indicates that the
1769 /// instruction will be scalarized when vectorizing with the associated
1770 /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1771 DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
1772
1773 /// Holds the instructions known to be uniform after vectorization.
1774 /// The data is collected per VF.
1775 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1776
1777 /// Holds the instructions known to be scalar after vectorization.
1778 /// The data is collected per VF.
1779 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
1780
1781 /// Holds the instructions (address computations) that are forced to be
1782 /// scalarized.
1783 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars; 1784 1785 /// PHINodes of the reductions that should be expanded in-loop along with 1786 /// their associated chains of reduction operations, in program order from top 1787 /// (PHI) to bottom 1788 ReductionChainMap InLoopReductionChains; 1789 1790 /// A Map of inloop reduction operations and their immediate chain operand. 1791 /// FIXME: This can be removed once reductions can be costed correctly in 1792 /// vplan. This was added to allow quick lookup to the inloop operations, 1793 /// without having to loop through InLoopReductionChains. 1794 DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains; 1795 1796 /// Returns the expected difference in cost from scalarizing the expression 1797 /// feeding a predicated instruction \p PredInst. The instructions to 1798 /// scalarize and their scalar costs are collected in \p ScalarCosts. A 1799 /// non-negative return value implies the expression will be scalarized. 1800 /// Currently, only single-use chains are considered for scalarization. 1801 int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts, 1802 ElementCount VF); 1803 1804 /// Collect the instructions that are uniform after vectorization. An 1805 /// instruction is uniform if we represent it with a single scalar value in 1806 /// the vectorized loop corresponding to each vector iteration. Examples of 1807 /// uniform instructions include pointer operands of consecutive or 1808 /// interleaved memory accesses. Note that although uniformity implies an 1809 /// instruction will be scalar, the reverse is not true. In general, a 1810 /// scalarized instruction will be represented by VF scalar values in the 1811 /// vectorized loop, each corresponding to an iteration of the original 1812 /// scalar loop. 1813 void collectLoopUniforms(ElementCount VF); 1814 1815 /// Collect the instructions that are scalar after vectorization. An 1816 /// instruction is scalar if it is known to be uniform or will be scalarized 1817 /// during vectorization. Non-uniform scalarized instructions will be 1818 /// represented by VF values in the vectorized loop, each corresponding to an 1819 /// iteration of the original scalar loop. 1820 void collectLoopScalars(ElementCount VF); 1821 1822 /// Keeps cost model vectorization decision and cost for instructions. 1823 /// Right now it is used for memory instructions only. 1824 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>, 1825 std::pair<InstWidening, InstructionCost>>; 1826 1827 DecisionList WideningDecisions; 1828 1829 /// Returns true if \p V is expected to be vectorized and it needs to be 1830 /// extracted. 1831 bool needsExtract(Value *V, ElementCount VF) const { 1832 Instruction *I = dyn_cast<Instruction>(V); 1833 if (VF.isScalar() || !I || !TheLoop->contains(I) || 1834 TheLoop->isLoopInvariant(I)) 1835 return false; 1836 1837 // Assume we can vectorize V (and hence we need extraction) if the 1838 // scalars are not computed yet. This can happen, because it is called 1839 // via getScalarizationOverhead from setCostBasedWideningDecision, before 1840 // the scalars are collected. That should be a safe assumption in most 1841 // cases, because we check if the operands have vectorizable types 1842 // beforehand in LoopVectorizationLegality. 1843 return Scalars.find(VF) == Scalars.end() || 1844 !isScalarAfterVectorization(I, VF); 1845 }; 1846 1847 /// Returns a range containing only operands needing to be extracted. 
1848 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, 1849 ElementCount VF) const { 1850 return SmallVector<Value *, 4>(make_filter_range( 1851 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); 1852 } 1853 1854 /// Determines if we have the infrastructure to vectorize loop \p L and its 1855 /// epilogue, assuming the main loop is vectorized by \p VF. 1856 bool isCandidateForEpilogueVectorization(const Loop &L, 1857 const ElementCount VF) const; 1858 1859 /// Returns true if epilogue vectorization is considered profitable, and 1860 /// false otherwise. 1861 /// \p VF is the vectorization factor chosen for the original loop. 1862 bool isEpilogueVectorizationProfitable(const ElementCount VF) const; 1863 1864 public: 1865 /// The loop that we evaluate. 1866 Loop *TheLoop; 1867 1868 /// Predicated scalar evolution analysis. 1869 PredicatedScalarEvolution &PSE; 1870 1871 /// Loop Info analysis. 1872 LoopInfo *LI; 1873 1874 /// Vectorization legality. 1875 LoopVectorizationLegality *Legal; 1876 1877 /// Vector target information. 1878 const TargetTransformInfo &TTI; 1879 1880 /// Target Library Info. 1881 const TargetLibraryInfo *TLI; 1882 1883 /// Demanded bits analysis. 1884 DemandedBits *DB; 1885 1886 /// Assumption cache. 1887 AssumptionCache *AC; 1888 1889 /// Interface to emit optimization remarks. 1890 OptimizationRemarkEmitter *ORE; 1891 1892 const Function *TheFunction; 1893 1894 /// Loop Vectorize Hint. 1895 const LoopVectorizeHints *Hints; 1896 1897 /// The interleave access information contains groups of interleaved accesses 1898 /// with the same stride and close to each other. 1899 InterleavedAccessInfo &InterleaveInfo; 1900 1901 /// Values to ignore in the cost model. 1902 SmallPtrSet<const Value *, 16> ValuesToIgnore; 1903 1904 /// Values to ignore in the cost model when VF > 1. 1905 SmallPtrSet<const Value *, 16> VecValuesToIgnore; 1906 1907 /// All element types found in the loop. 1908 SmallPtrSet<Type *, 16> ElementTypesInLoop; 1909 1910 /// Profitable vector factors. 1911 SmallVector<VectorizationFactor, 8> ProfitableVFs; 1912 }; 1913 } // end namespace llvm 1914 1915 /// Helper struct to manage generating runtime checks for vectorization. 1916 /// 1917 /// The runtime checks are created up-front in temporary blocks to allow better 1918 /// estimating the cost and un-linked from the existing IR. After deciding to 1919 /// vectorize, the checks are moved back. If deciding not to vectorize, the 1920 /// temporary blocks are completely removed. 1921 class GeneratedRTChecks { 1922 /// Basic block which contains the generated SCEV checks, if any. 1923 BasicBlock *SCEVCheckBlock = nullptr; 1924 1925 /// The value representing the result of the generated SCEV checks. If it is 1926 /// nullptr, either no SCEV checks have been generated or they have been used. 1927 Value *SCEVCheckCond = nullptr; 1928 1929 /// Basic block which contains the generated memory runtime checks, if any. 1930 BasicBlock *MemCheckBlock = nullptr; 1931 1932 /// The value representing the result of the generated memory runtime checks. 1933 /// If it is nullptr, either no memory runtime checks have been generated or 1934 /// they have been used. 
1935 Value *MemRuntimeCheckCond = nullptr; 1936 1937 DominatorTree *DT; 1938 LoopInfo *LI; 1939 1940 SCEVExpander SCEVExp; 1941 SCEVExpander MemCheckExp; 1942 1943 public: 1944 GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI, 1945 const DataLayout &DL) 1946 : DT(DT), LI(LI), SCEVExp(SE, DL, "scev.check"), 1947 MemCheckExp(SE, DL, "scev.check") {} 1948 1949 /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can 1950 /// accurately estimate the cost of the runtime checks. The blocks are 1951 /// un-linked from the IR and is added back during vector code generation. If 1952 /// there is no vector code generation, the check blocks are removed 1953 /// completely. 1954 void Create(Loop *L, const LoopAccessInfo &LAI, 1955 const SCEVUnionPredicate &UnionPred) { 1956 1957 BasicBlock *LoopHeader = L->getHeader(); 1958 BasicBlock *Preheader = L->getLoopPreheader(); 1959 1960 // Use SplitBlock to create blocks for SCEV & memory runtime checks to 1961 // ensure the blocks are properly added to LoopInfo & DominatorTree. Those 1962 // may be used by SCEVExpander. The blocks will be un-linked from their 1963 // predecessors and removed from LI & DT at the end of the function. 1964 if (!UnionPred.isAlwaysTrue()) { 1965 SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI, 1966 nullptr, "vector.scevcheck"); 1967 1968 SCEVCheckCond = SCEVExp.expandCodeForPredicate( 1969 &UnionPred, SCEVCheckBlock->getTerminator()); 1970 } 1971 1972 const auto &RtPtrChecking = *LAI.getRuntimePointerChecking(); 1973 if (RtPtrChecking.Need) { 1974 auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader; 1975 MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr, 1976 "vector.memcheck"); 1977 1978 MemRuntimeCheckCond = 1979 addRuntimeChecks(MemCheckBlock->getTerminator(), L, 1980 RtPtrChecking.getChecks(), MemCheckExp); 1981 assert(MemRuntimeCheckCond && 1982 "no RT checks generated although RtPtrChecking " 1983 "claimed checks are required"); 1984 } 1985 1986 if (!MemCheckBlock && !SCEVCheckBlock) 1987 return; 1988 1989 // Unhook the temporary block with the checks, update various places 1990 // accordingly. 1991 if (SCEVCheckBlock) 1992 SCEVCheckBlock->replaceAllUsesWith(Preheader); 1993 if (MemCheckBlock) 1994 MemCheckBlock->replaceAllUsesWith(Preheader); 1995 1996 if (SCEVCheckBlock) { 1997 SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 1998 new UnreachableInst(Preheader->getContext(), SCEVCheckBlock); 1999 Preheader->getTerminator()->eraseFromParent(); 2000 } 2001 if (MemCheckBlock) { 2002 MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 2003 new UnreachableInst(Preheader->getContext(), MemCheckBlock); 2004 Preheader->getTerminator()->eraseFromParent(); 2005 } 2006 2007 DT->changeImmediateDominator(LoopHeader, Preheader); 2008 if (MemCheckBlock) { 2009 DT->eraseNode(MemCheckBlock); 2010 LI->removeBlock(MemCheckBlock); 2011 } 2012 if (SCEVCheckBlock) { 2013 DT->eraseNode(SCEVCheckBlock); 2014 LI->removeBlock(SCEVCheckBlock); 2015 } 2016 } 2017 2018 /// Remove the created SCEV & memory runtime check blocks & instructions, if 2019 /// unused. 
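/// A check condition that is still non-null at this point was never hooked
/// back into the IR (see emitSCEVChecks/emitMemRuntimeChecks), so its block
/// and the instructions expanded for it are removed here.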
2020 ~GeneratedRTChecks() { 2021 SCEVExpanderCleaner SCEVCleaner(SCEVExp, *DT); 2022 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp, *DT); 2023 if (!SCEVCheckCond) 2024 SCEVCleaner.markResultUsed(); 2025 2026 if (!MemRuntimeCheckCond) 2027 MemCheckCleaner.markResultUsed(); 2028 2029 if (MemRuntimeCheckCond) { 2030 auto &SE = *MemCheckExp.getSE(); 2031 // Memory runtime check generation creates compares that use expanded 2032 // values. Remove them before running the SCEVExpanderCleaners. 2033 for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) { 2034 if (MemCheckExp.isInsertedInstruction(&I)) 2035 continue; 2036 SE.forgetValue(&I); 2037 I.eraseFromParent(); 2038 } 2039 } 2040 MemCheckCleaner.cleanup(); 2041 SCEVCleaner.cleanup(); 2042 2043 if (SCEVCheckCond) 2044 SCEVCheckBlock->eraseFromParent(); 2045 if (MemRuntimeCheckCond) 2046 MemCheckBlock->eraseFromParent(); 2047 } 2048 2049 /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and 2050 /// adjusts the branches to branch to the vector preheader or \p Bypass, 2051 /// depending on the generated condition. 2052 BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass, 2053 BasicBlock *LoopVectorPreHeader, 2054 BasicBlock *LoopExitBlock) { 2055 if (!SCEVCheckCond) 2056 return nullptr; 2057 if (auto *C = dyn_cast<ConstantInt>(SCEVCheckCond)) 2058 if (C->isZero()) 2059 return nullptr; 2060 2061 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2062 2063 BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock); 2064 // Create new preheader for vector loop. 2065 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 2066 PL->addBasicBlockToLoop(SCEVCheckBlock, *LI); 2067 2068 SCEVCheckBlock->getTerminator()->eraseFromParent(); 2069 SCEVCheckBlock->moveBefore(LoopVectorPreHeader); 2070 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2071 SCEVCheckBlock); 2072 2073 DT->addNewBlock(SCEVCheckBlock, Pred); 2074 DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock); 2075 2076 ReplaceInstWithInst( 2077 SCEVCheckBlock->getTerminator(), 2078 BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheckCond)); 2079 // Mark the check as used, to prevent it from being removed during cleanup. 2080 SCEVCheckCond = nullptr; 2081 return SCEVCheckBlock; 2082 } 2083 2084 /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts 2085 /// the branches to branch to the vector preheader or \p Bypass, depending on 2086 /// the generated condition. 2087 BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass, 2088 BasicBlock *LoopVectorPreHeader) { 2089 // Check if we generated code that checks in runtime if arrays overlap. 2090 if (!MemRuntimeCheckCond) 2091 return nullptr; 2092 2093 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2094 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2095 MemCheckBlock); 2096 2097 DT->addNewBlock(MemCheckBlock, Pred); 2098 DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock); 2099 MemCheckBlock->moveBefore(LoopVectorPreHeader); 2100 2101 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 2102 PL->addBasicBlockToLoop(MemCheckBlock, *LI); 2103 2104 ReplaceInstWithInst( 2105 MemCheckBlock->getTerminator(), 2106 BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond)); 2107 MemCheckBlock->getTerminator()->setDebugLoc( 2108 Pred->getTerminator()->getDebugLoc()); 2109 2110 // Mark the check as used, to prevent it from being removed during cleanup. 
2111 MemRuntimeCheckCond = nullptr; 2112 return MemCheckBlock; 2113 } 2114 }; 2115 2116 // Return true if \p OuterLp is an outer loop annotated with hints for explicit 2117 // vectorization. The loop needs to be annotated with #pragma omp simd 2118 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the 2119 // vector length information is not provided, vectorization is not considered 2120 // explicit. Interleave hints are not allowed either. These limitations will be 2121 // relaxed in the future. 2122 // Please, note that we are currently forced to abuse the pragma 'clang 2123 // vectorize' semantics. This pragma provides *auto-vectorization hints* 2124 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' 2125 // provides *explicit vectorization hints* (LV can bypass legal checks and 2126 // assume that vectorization is legal). However, both hints are implemented 2127 // using the same metadata (llvm.loop.vectorize, processed by 2128 // LoopVectorizeHints). This will be fixed in the future when the native IR 2129 // representation for pragma 'omp simd' is introduced. 2130 static bool isExplicitVecOuterLoop(Loop *OuterLp, 2131 OptimizationRemarkEmitter *ORE) { 2132 assert(!OuterLp->isInnermost() && "This is not an outer loop"); 2133 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); 2134 2135 // Only outer loops with an explicit vectorization hint are supported. 2136 // Unannotated outer loops are ignored. 2137 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) 2138 return false; 2139 2140 Function *Fn = OuterLp->getHeader()->getParent(); 2141 if (!Hints.allowVectorization(Fn, OuterLp, 2142 true /*VectorizeOnlyWhenForced*/)) { 2143 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); 2144 return false; 2145 } 2146 2147 if (Hints.getInterleave() > 1) { 2148 // TODO: Interleave support is future work. 2149 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " 2150 "outer loops.\n"); 2151 Hints.emitRemarkWithHints(); 2152 return false; 2153 } 2154 2155 return true; 2156 } 2157 2158 static void collectSupportedLoops(Loop &L, LoopInfo *LI, 2159 OptimizationRemarkEmitter *ORE, 2160 SmallVectorImpl<Loop *> &V) { 2161 // Collect inner loops and outer loops without irreducible control flow. For 2162 // now, only collect outer loops that have explicit vectorization hints. If we 2163 // are stress testing the VPlan H-CFG construction, we collect the outermost 2164 // loop of every loop nest. 2165 if (L.isInnermost() || VPlanBuildStressTest || 2166 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { 2167 LoopBlocksRPO RPOT(&L); 2168 RPOT.perform(LI); 2169 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { 2170 V.push_back(&L); 2171 // TODO: Collect inner loops inside marked outer loops in case 2172 // vectorization fails for the outer loop. Do not invoke 2173 // 'containsIrreducibleCFG' again for inner loops when the outer loop is 2174 // already known to be reducible. We can use an inherited attribute for 2175 // that. 2176 return; 2177 } 2178 } 2179 for (Loop *InnerL : L) 2180 collectSupportedLoops(*InnerL, LI, ORE, V); 2181 } 2182 2183 namespace { 2184 2185 /// The LoopVectorize Pass. 
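/// This is the legacy pass-manager wrapper: it gathers the required analyses
/// and forwards the actual work to LoopVectorizePass (Impl).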
2186 struct LoopVectorize : public FunctionPass { 2187 /// Pass identification, replacement for typeid 2188 static char ID; 2189 2190 LoopVectorizePass Impl; 2191 2192 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false, 2193 bool VectorizeOnlyWhenForced = false) 2194 : FunctionPass(ID), 2195 Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) { 2196 initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); 2197 } 2198 2199 bool runOnFunction(Function &F) override { 2200 if (skipFunction(F)) 2201 return false; 2202 2203 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 2204 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 2205 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 2206 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 2207 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); 2208 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); 2209 auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr; 2210 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); 2211 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 2212 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); 2213 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); 2214 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 2215 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 2216 2217 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 2218 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; 2219 2220 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC, 2221 GetLAA, *ORE, PSI).MadeAnyChange; 2222 } 2223 2224 void getAnalysisUsage(AnalysisUsage &AU) const override { 2225 AU.addRequired<AssumptionCacheTracker>(); 2226 AU.addRequired<BlockFrequencyInfoWrapperPass>(); 2227 AU.addRequired<DominatorTreeWrapperPass>(); 2228 AU.addRequired<LoopInfoWrapperPass>(); 2229 AU.addRequired<ScalarEvolutionWrapperPass>(); 2230 AU.addRequired<TargetTransformInfoWrapperPass>(); 2231 AU.addRequired<AAResultsWrapperPass>(); 2232 AU.addRequired<LoopAccessLegacyAnalysis>(); 2233 AU.addRequired<DemandedBitsWrapperPass>(); 2234 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 2235 AU.addRequired<InjectTLIMappingsLegacy>(); 2236 2237 // We currently do not preserve loopinfo/dominator analyses with outer loop 2238 // vectorization. Until this is addressed, mark these analyses as preserved 2239 // only for non-VPlan-native path. 2240 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 2241 if (!EnableVPlanNativePath) { 2242 AU.addPreserved<LoopInfoWrapperPass>(); 2243 AU.addPreserved<DominatorTreeWrapperPass>(); 2244 } 2245 2246 AU.addPreserved<BasicAAWrapperPass>(); 2247 AU.addPreserved<GlobalsAAWrapperPass>(); 2248 AU.addRequired<ProfileSummaryInfoWrapperPass>(); 2249 } 2250 }; 2251 2252 } // end anonymous namespace 2253 2254 //===----------------------------------------------------------------------===// 2255 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 2256 // LoopVectorizationCostModel and LoopVectorizationPlanner. 2257 //===----------------------------------------------------------------------===// 2258 2259 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { 2260 // We need to place the broadcast of invariant variables outside the loop, 2261 // but only if it's proven safe to do so. Else, broadcast will be inside 2262 // vector loop body. 
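// E.g. for an i32 value %x and VF = 4, the splat created below expands to
// roughly:
//   %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %x, i64 0
//   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert,
//                                    <4 x i32> poison, <4 x i32> zeroinitializer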
2263 Instruction *Instr = dyn_cast<Instruction>(V); 2264 bool SafeToHoist = OrigLoop->isLoopInvariant(V) && 2265 (!Instr || 2266 DT->dominates(Instr->getParent(), LoopVectorPreHeader)); 2267 // Place the code for broadcasting invariant variables in the new preheader. 2268 IRBuilder<>::InsertPointGuard Guard(Builder); 2269 if (SafeToHoist) 2270 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2271 2272 // Broadcast the scalar into all locations in the vector. 2273 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); 2274 2275 return Shuf; 2276 } 2277 2278 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( 2279 const InductionDescriptor &II, Value *Step, Value *Start, 2280 Instruction *EntryVal, VPValue *Def, VPValue *CastDef, 2281 VPTransformState &State) { 2282 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 2283 "Expected either an induction phi-node or a truncate of it!"); 2284 2285 // Construct the initial value of the vector IV in the vector loop preheader 2286 auto CurrIP = Builder.saveIP(); 2287 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2288 if (isa<TruncInst>(EntryVal)) { 2289 assert(Start->getType()->isIntegerTy() && 2290 "Truncation requires an integer type"); 2291 auto *TruncType = cast<IntegerType>(EntryVal->getType()); 2292 Step = Builder.CreateTrunc(Step, TruncType); 2293 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); 2294 } 2295 2296 Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0); 2297 Value *SplatStart = Builder.CreateVectorSplat(VF, Start); 2298 Value *SteppedStart = 2299 getStepVector(SplatStart, Zero, Step, II.getInductionOpcode()); 2300 2301 // We create vector phi nodes for both integer and floating-point induction 2302 // variables. Here, we determine the kind of arithmetic we will perform. 2303 Instruction::BinaryOps AddOp; 2304 Instruction::BinaryOps MulOp; 2305 if (Step->getType()->isIntegerTy()) { 2306 AddOp = Instruction::Add; 2307 MulOp = Instruction::Mul; 2308 } else { 2309 AddOp = II.getInductionOpcode(); 2310 MulOp = Instruction::FMul; 2311 } 2312 2313 // Multiply the vectorization factor by the step using integer or 2314 // floating-point arithmetic as appropriate. 2315 Type *StepType = Step->getType(); 2316 Value *RuntimeVF; 2317 if (Step->getType()->isFloatingPointTy()) 2318 RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, VF); 2319 else 2320 RuntimeVF = getRuntimeVF(Builder, StepType, VF); 2321 Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF); 2322 2323 // Create a vector splat to use in the induction update. 2324 // 2325 // FIXME: If the step is non-constant, we create the vector splat with 2326 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't 2327 // handle a constant vector splat. 2328 Value *SplatVF = isa<Constant>(Mul) 2329 ? ConstantVector::getSplat(VF, cast<Constant>(Mul)) 2330 : Builder.CreateVectorSplat(VF, Mul); 2331 Builder.restoreIP(CurrIP); 2332 2333 // We may need to add the step a number of times, depending on the unroll 2334 // factor. The last of those goes into the PHI. 
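// E.g. with UF = 2, part 0 uses the phi %vec.ind itself, part 1 uses the
// first %step.add, and a second %step.add (renamed %vec.ind.next below) is
// fed back into the phi.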
2335 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", 2336 &*LoopVectorBody->getFirstInsertionPt()); 2337 VecInd->setDebugLoc(EntryVal->getDebugLoc()); 2338 Instruction *LastInduction = VecInd; 2339 for (unsigned Part = 0; Part < UF; ++Part) { 2340 State.set(Def, LastInduction, Part); 2341 2342 if (isa<TruncInst>(EntryVal)) 2343 addMetadata(LastInduction, EntryVal); 2344 recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, CastDef, 2345 State, Part); 2346 2347 LastInduction = cast<Instruction>( 2348 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")); 2349 LastInduction->setDebugLoc(EntryVal->getDebugLoc()); 2350 } 2351 2352 // Move the last step to the end of the latch block. This ensures consistent 2353 // placement of all induction updates. 2354 auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 2355 auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator()); 2356 auto *ICmp = cast<Instruction>(Br->getCondition()); 2357 LastInduction->moveBefore(ICmp); 2358 LastInduction->setName("vec.ind.next"); 2359 2360 VecInd->addIncoming(SteppedStart, LoopVectorPreHeader); 2361 VecInd->addIncoming(LastInduction, LoopVectorLatch); 2362 } 2363 2364 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const { 2365 return Cost->isScalarAfterVectorization(I, VF) || 2366 Cost->isProfitableToScalarize(I, VF); 2367 } 2368 2369 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const { 2370 if (shouldScalarizeInstruction(IV)) 2371 return true; 2372 auto isScalarInst = [&](User *U) -> bool { 2373 auto *I = cast<Instruction>(U); 2374 return (OrigLoop->contains(I) && shouldScalarizeInstruction(I)); 2375 }; 2376 return llvm::any_of(IV->users(), isScalarInst); 2377 } 2378 2379 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast( 2380 const InductionDescriptor &ID, const Instruction *EntryVal, 2381 Value *VectorLoopVal, VPValue *CastDef, VPTransformState &State, 2382 unsigned Part, unsigned Lane) { 2383 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 2384 "Expected either an induction phi-node or a truncate of it!"); 2385 2386 // This induction variable is not the phi from the original loop but the 2387 // newly-created IV based on the proof that casted Phi is equal to the 2388 // uncasted Phi in the vectorized loop (under a runtime guard possibly). It 2389 // re-uses the same InductionDescriptor that original IV uses but we don't 2390 // have to do any recording in this case - that is done when original IV is 2391 // processed. 2392 if (isa<TruncInst>(EntryVal)) 2393 return; 2394 2395 const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts(); 2396 if (Casts.empty()) 2397 return; 2398 // Only the first Cast instruction in the Casts vector is of interest. 2399 // The rest of the Casts (if exist) have no uses outside the 2400 // induction update chain itself. 
2401 if (Lane < UINT_MAX) 2402 State.set(CastDef, VectorLoopVal, VPIteration(Part, Lane)); 2403 else 2404 State.set(CastDef, VectorLoopVal, Part); 2405 } 2406 2407 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start, 2408 TruncInst *Trunc, VPValue *Def, 2409 VPValue *CastDef, 2410 VPTransformState &State) { 2411 assert((IV->getType()->isIntegerTy() || IV != OldInduction) && 2412 "Primary induction variable must have an integer type"); 2413 2414 auto II = Legal->getInductionVars().find(IV); 2415 assert(II != Legal->getInductionVars().end() && "IV is not an induction"); 2416 2417 auto ID = II->second; 2418 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); 2419 2420 // The value from the original loop to which we are mapping the new induction 2421 // variable. 2422 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; 2423 2424 auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 2425 2426 // Generate code for the induction step. Note that induction steps are 2427 // required to be loop-invariant 2428 auto CreateStepValue = [&](const SCEV *Step) -> Value * { 2429 assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) && 2430 "Induction step should be loop invariant"); 2431 if (PSE.getSE()->isSCEVable(IV->getType())) { 2432 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 2433 return Exp.expandCodeFor(Step, Step->getType(), 2434 LoopVectorPreHeader->getTerminator()); 2435 } 2436 return cast<SCEVUnknown>(Step)->getValue(); 2437 }; 2438 2439 // The scalar value to broadcast. This is derived from the canonical 2440 // induction variable. If a truncation type is given, truncate the canonical 2441 // induction variable and step. Otherwise, derive these values from the 2442 // induction descriptor. 2443 auto CreateScalarIV = [&](Value *&Step) -> Value * { 2444 Value *ScalarIV = Induction; 2445 if (IV != OldInduction) { 2446 ScalarIV = IV->getType()->isIntegerTy() 2447 ? Builder.CreateSExtOrTrunc(Induction, IV->getType()) 2448 : Builder.CreateCast(Instruction::SIToFP, Induction, 2449 IV->getType()); 2450 ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID); 2451 ScalarIV->setName("offset.idx"); 2452 } 2453 if (Trunc) { 2454 auto *TruncType = cast<IntegerType>(Trunc->getType()); 2455 assert(Step->getType()->isIntegerTy() && 2456 "Truncation requires an integer step"); 2457 ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType); 2458 Step = Builder.CreateTrunc(Step, TruncType); 2459 } 2460 return ScalarIV; 2461 }; 2462 2463 // Create the vector values from the scalar IV, in the absence of creating a 2464 // vector IV. 2465 auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) { 2466 Value *Broadcasted = getBroadcastInstrs(ScalarIV); 2467 for (unsigned Part = 0; Part < UF; ++Part) { 2468 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2469 Value *StartIdx; 2470 if (Step->getType()->isFloatingPointTy()) 2471 StartIdx = getRuntimeVFAsFloat(Builder, Step->getType(), VF * Part); 2472 else 2473 StartIdx = getRuntimeVF(Builder, Step->getType(), VF * Part); 2474 2475 Value *EntryPart = 2476 getStepVector(Broadcasted, StartIdx, Step, ID.getInductionOpcode()); 2477 State.set(Def, EntryPart, Part); 2478 if (Trunc) 2479 addMetadata(EntryPart, Trunc); 2480 recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, CastDef, 2481 State, Part); 2482 } 2483 }; 2484 2485 // Fast-math-flags propagate from the original induction instruction. 
2486 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 2487 if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp())) 2488 Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags()); 2489 2490 // Now do the actual transformations, and start with creating the step value. 2491 Value *Step = CreateStepValue(ID.getStep()); 2492 if (VF.isZero() || VF.isScalar()) { 2493 Value *ScalarIV = CreateScalarIV(Step); 2494 CreateSplatIV(ScalarIV, Step); 2495 return; 2496 } 2497 2498 // Determine if we want a scalar version of the induction variable. This is 2499 // true if the induction variable itself is not widened, or if it has at 2500 // least one user in the loop that is not widened. 2501 auto NeedsScalarIV = needsScalarInduction(EntryVal); 2502 if (!NeedsScalarIV) { 2503 createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef, 2504 State); 2505 return; 2506 } 2507 2508 // Try to create a new independent vector induction variable. If we can't 2509 // create the phi node, we will splat the scalar induction variable in each 2510 // loop iteration. 2511 if (!shouldScalarizeInstruction(EntryVal)) { 2512 createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef, 2513 State); 2514 Value *ScalarIV = CreateScalarIV(Step); 2515 // Create scalar steps that can be used by instructions we will later 2516 // scalarize. Note that the addition of the scalar steps will not increase 2517 // the number of instructions in the loop in the common case prior to 2518 // InstCombine. We will be trading one vector extract for each scalar step. 2519 buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State); 2520 return; 2521 } 2522 2523 // All IV users are scalar instructions, so only emit a scalar IV, not a 2524 // vectorised IV. Except when we tail-fold, then the splat IV feeds the 2525 // predicate used by the masked loads/stores. 2526 Value *ScalarIV = CreateScalarIV(Step); 2527 if (!Cost->isScalarEpilogueAllowed()) 2528 CreateSplatIV(ScalarIV, Step); 2529 buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State); 2530 } 2531 2532 Value *InnerLoopVectorizer::getStepVector(Value *Val, Value *StartIdx, 2533 Value *Step, 2534 Instruction::BinaryOps BinOp) { 2535 // Create and check the types. 2536 auto *ValVTy = cast<VectorType>(Val->getType()); 2537 ElementCount VLen = ValVTy->getElementCount(); 2538 2539 Type *STy = Val->getType()->getScalarType(); 2540 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && 2541 "Induction Step must be an integer or FP"); 2542 assert(Step->getType() == STy && "Step has wrong type"); 2543 2544 SmallVector<Constant *, 8> Indices; 2545 2546 // Create a vector of consecutive numbers from zero to VF. 2547 VectorType *InitVecValVTy = ValVTy; 2548 Type *InitVecValSTy = STy; 2549 if (STy->isFloatingPointTy()) { 2550 InitVecValSTy = 2551 IntegerType::get(STy->getContext(), STy->getScalarSizeInBits()); 2552 InitVecValVTy = VectorType::get(InitVecValSTy, VLen); 2553 } 2554 Value *InitVec = Builder.CreateStepVector(InitVecValVTy); 2555 2556 // Splat the StartIdx 2557 Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx); 2558 2559 if (STy->isIntegerTy()) { 2560 InitVec = Builder.CreateAdd(InitVec, StartIdxSplat); 2561 Step = Builder.CreateVectorSplat(VLen, Step); 2562 assert(Step->getType() == Val->getType() && "Invalid step vec"); 2563 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 2564 // which can be found from the original scalar operations. 
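// Overall, the integer case below computes:
//   induction = Val + (StartIdx + <0, 1, ..., VF-1>) * Step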
2565 Step = Builder.CreateMul(InitVec, Step); 2566 return Builder.CreateAdd(Val, Step, "induction"); 2567 } 2568 2569 // Floating point induction. 2570 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && 2571 "Binary Opcode should be specified for FP induction"); 2572 InitVec = Builder.CreateUIToFP(InitVec, ValVTy); 2573 InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat); 2574 2575 Step = Builder.CreateVectorSplat(VLen, Step); 2576 Value *MulOp = Builder.CreateFMul(InitVec, Step); 2577 return Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); 2578 } 2579 2580 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, 2581 Instruction *EntryVal, 2582 const InductionDescriptor &ID, 2583 VPValue *Def, VPValue *CastDef, 2584 VPTransformState &State) { 2585 // We shouldn't have to build scalar steps if we aren't vectorizing. 2586 assert(VF.isVector() && "VF should be greater than one"); 2587 // Get the value type and ensure it and the step have the same integer type. 2588 Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); 2589 assert(ScalarIVTy == Step->getType() && 2590 "Val and Step should have the same type"); 2591 2592 // We build scalar steps for both integer and floating-point induction 2593 // variables. Here, we determine the kind of arithmetic we will perform. 2594 Instruction::BinaryOps AddOp; 2595 Instruction::BinaryOps MulOp; 2596 if (ScalarIVTy->isIntegerTy()) { 2597 AddOp = Instruction::Add; 2598 MulOp = Instruction::Mul; 2599 } else { 2600 AddOp = ID.getInductionOpcode(); 2601 MulOp = Instruction::FMul; 2602 } 2603 2604 // Determine the number of scalars we need to generate for each unroll 2605 // iteration. If EntryVal is uniform, we only need to generate the first 2606 // lane. Otherwise, we generate all VF values. 2607 bool IsUniform = 2608 Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF); 2609 unsigned Lanes = IsUniform ? 1 : VF.getKnownMinValue(); 2610 // Compute the scalar steps and save the results in State. 2611 Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(), 2612 ScalarIVTy->getScalarSizeInBits()); 2613 Type *VecIVTy = nullptr; 2614 Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr; 2615 if (!IsUniform && VF.isScalable()) { 2616 VecIVTy = VectorType::get(ScalarIVTy, VF); 2617 UnitStepVec = Builder.CreateStepVector(VectorType::get(IntStepTy, VF)); 2618 SplatStep = Builder.CreateVectorSplat(VF, Step); 2619 SplatIV = Builder.CreateVectorSplat(VF, ScalarIV); 2620 } 2621 2622 for (unsigned Part = 0; Part < UF; ++Part) { 2623 Value *StartIdx0 = 2624 createStepForVF(Builder, ConstantInt::get(IntStepTy, Part), VF); 2625 2626 if (!IsUniform && VF.isScalable()) { 2627 auto *SplatStartIdx = Builder.CreateVectorSplat(VF, StartIdx0); 2628 auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec); 2629 if (ScalarIVTy->isFloatingPointTy()) 2630 InitVec = Builder.CreateSIToFP(InitVec, VecIVTy); 2631 auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep); 2632 auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul); 2633 State.set(Def, Add, Part); 2634 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State, 2635 Part); 2636 // It's useful to record the lane values too for the known minimum number 2637 // of elements so we do those below. This improves the code quality when 2638 // trying to extract the first element, for example. 
2639 } 2640 2641 if (ScalarIVTy->isFloatingPointTy()) 2642 StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy); 2643 2644 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 2645 Value *StartIdx = Builder.CreateBinOp( 2646 AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane)); 2647 // The step returned by `createStepForVF` is a runtime-evaluated value 2648 // when VF is scalable. Otherwise, it should be folded into a Constant. 2649 assert((VF.isScalable() || isa<Constant>(StartIdx)) && 2650 "Expected StartIdx to be folded to a constant when VF is not " 2651 "scalable"); 2652 auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step); 2653 auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul); 2654 State.set(Def, Add, VPIteration(Part, Lane)); 2655 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State, 2656 Part, Lane); 2657 } 2658 } 2659 } 2660 2661 void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def, 2662 const VPIteration &Instance, 2663 VPTransformState &State) { 2664 Value *ScalarInst = State.get(Def, Instance); 2665 Value *VectorValue = State.get(Def, Instance.Part); 2666 VectorValue = Builder.CreateInsertElement( 2667 VectorValue, ScalarInst, 2668 Instance.Lane.getAsRuntimeExpr(State.Builder, VF)); 2669 State.set(Def, VectorValue, Instance.Part); 2670 } 2671 2672 Value *InnerLoopVectorizer::reverseVector(Value *Vec) { 2673 assert(Vec->getType()->isVectorTy() && "Invalid type"); 2674 return Builder.CreateVectorReverse(Vec, "reverse"); 2675 } 2676 2677 // Return whether we allow using masked interleave-groups (for dealing with 2678 // strided loads/stores that reside in predicated blocks, or for dealing 2679 // with gaps). 2680 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { 2681 // If an override option has been passed in for interleaved accesses, use it. 2682 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) 2683 return EnableMaskedInterleavedMemAccesses; 2684 2685 return TTI.enableMaskedInterleavedAccessVectorization(); 2686 } 2687 2688 // Try to vectorize the interleave group that \p Instr belongs to. 2689 // 2690 // E.g. Translate following interleaved load group (factor = 3): 2691 // for (i = 0; i < N; i+=3) { 2692 // R = Pic[i]; // Member of index 0 2693 // G = Pic[i+1]; // Member of index 1 2694 // B = Pic[i+2]; // Member of index 2 2695 // ... // do something to R, G, B 2696 // } 2697 // To: 2698 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2699 // %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements 2700 // %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements 2701 // %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements 2702 // 2703 // Or translate following interleaved store group (factor = 3): 2704 // for (i = 0; i < N; i+=3) { 2705 // ... 
do something to R, G, B 2706 // Pic[i] = R; // Member of index 0 2707 // Pic[i+1] = G; // Member of index 1 2708 // Pic[i+2] = B; // Member of index 2 2709 // } 2710 // To: 2711 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> 2712 // %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u> 2713 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, 2714 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements 2715 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B 2716 void InnerLoopVectorizer::vectorizeInterleaveGroup( 2717 const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs, 2718 VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues, 2719 VPValue *BlockInMask) { 2720 Instruction *Instr = Group->getInsertPos(); 2721 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2722 2723 // Prepare for the vector type of the interleaved load/store. 2724 Type *ScalarTy = getLoadStoreType(Instr); 2725 unsigned InterleaveFactor = Group->getFactor(); 2726 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2727 auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor); 2728 2729 // Prepare for the new pointers. 2730 SmallVector<Value *, 2> AddrParts; 2731 unsigned Index = Group->getIndex(Instr); 2732 2733 // TODO: extend the masked interleaved-group support to reversed access. 2734 assert((!BlockInMask || !Group->isReverse()) && 2735 "Reversed masked interleave-group not supported."); 2736 2737 // If the group is reverse, adjust the index to refer to the last vector lane 2738 // instead of the first. We adjust the index from the first vector lane, 2739 // rather than directly getting the pointer for lane VF - 1, because the 2740 // pointer operand of the interleaved access is supposed to be uniform. For 2741 // uniform instructions, we're only required to generate a value for the 2742 // first vector lane in each unroll iteration. 2743 if (Group->isReverse()) 2744 Index += (VF.getKnownMinValue() - 1) * Group->getFactor(); 2745 2746 for (unsigned Part = 0; Part < UF; Part++) { 2747 Value *AddrPart = State.get(Addr, VPIteration(Part, 0)); 2748 setDebugLocFromInst(AddrPart); 2749 2750 // Notice current instruction could be any index. Need to adjust the address 2751 // to the member of index 0. 2752 // 2753 // E.g. a = A[i+1]; // Member of index 1 (Current instruction) 2754 // b = A[i]; // Member of index 0 2755 // Current pointer is pointed to A[i+1], adjust it to A[i]. 2756 // 2757 // E.g. A[i+1] = a; // Member of index 1 2758 // A[i] = b; // Member of index 0 2759 // A[i+2] = c; // Member of index 2 (Current instruction) 2760 // Current pointer is pointed to A[i+2], adjust it to A[i]. 2761 2762 bool InBounds = false; 2763 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts())) 2764 InBounds = gep->isInBounds(); 2765 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index)); 2766 cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds); 2767 2768 // Cast to the vector pointer type. 
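    // Illustrative sketch (not emitted verbatim): for the factor-3 i32 group
    // in the example above with VF = 4, the adjusted AddrPart is an i32* that
    // is bitcast below to <12 x i32>*, so a single wide access covers all
    // VF * InterleaveFactor elements. The exact types depend on the element
    // type, VF and address space.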
2769 unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace(); 2770 Type *PtrTy = VecTy->getPointerTo(AddressSpace); 2771 AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy)); 2772 } 2773 2774 setDebugLocFromInst(Instr); 2775 Value *PoisonVec = PoisonValue::get(VecTy); 2776 2777 Value *MaskForGaps = nullptr; 2778 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) { 2779 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); 2780 assert(MaskForGaps && "Mask for Gaps is required but it is null"); 2781 } 2782 2783 // Vectorize the interleaved load group. 2784 if (isa<LoadInst>(Instr)) { 2785 // For each unroll part, create a wide load for the group. 2786 SmallVector<Value *, 2> NewLoads; 2787 for (unsigned Part = 0; Part < UF; Part++) { 2788 Instruction *NewLoad; 2789 if (BlockInMask || MaskForGaps) { 2790 assert(useMaskedInterleavedAccesses(*TTI) && 2791 "masked interleaved groups are not allowed."); 2792 Value *GroupMask = MaskForGaps; 2793 if (BlockInMask) { 2794 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2795 Value *ShuffledMask = Builder.CreateShuffleVector( 2796 BlockInMaskPart, 2797 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2798 "interleaved.mask"); 2799 GroupMask = MaskForGaps 2800 ? Builder.CreateBinOp(Instruction::And, ShuffledMask, 2801 MaskForGaps) 2802 : ShuffledMask; 2803 } 2804 NewLoad = 2805 Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(), 2806 GroupMask, PoisonVec, "wide.masked.vec"); 2807 } 2808 else 2809 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], 2810 Group->getAlign(), "wide.vec"); 2811 Group->addMetadata(NewLoad); 2812 NewLoads.push_back(NewLoad); 2813 } 2814 2815 // For each member in the group, shuffle out the appropriate data from the 2816 // wide loads. 2817 unsigned J = 0; 2818 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2819 Instruction *Member = Group->getMember(I); 2820 2821 // Skip the gaps in the group. 2822 if (!Member) 2823 continue; 2824 2825 auto StrideMask = 2826 createStrideMask(I, InterleaveFactor, VF.getKnownMinValue()); 2827 for (unsigned Part = 0; Part < UF; Part++) { 2828 Value *StridedVec = Builder.CreateShuffleVector( 2829 NewLoads[Part], StrideMask, "strided.vec"); 2830 2831 // If this member has different type, cast the result type. 2832 if (Member->getType() != ScalarTy) { 2833 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2834 VectorType *OtherVTy = VectorType::get(Member->getType(), VF); 2835 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); 2836 } 2837 2838 if (Group->isReverse()) 2839 StridedVec = reverseVector(StridedVec); 2840 2841 State.set(VPDefs[J], StridedVec, Part); 2842 } 2843 ++J; 2844 } 2845 return; 2846 } 2847 2848 // The sub vector type for current instruction. 2849 auto *SubVT = VectorType::get(ScalarTy, VF); 2850 2851 // Vectorize the interleaved store group. 2852 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); 2853 assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) && 2854 "masked interleaved groups are not allowed."); 2855 assert((!MaskForGaps || !VF.isScalable()) && 2856 "masking gaps for scalable vectors is not yet supported."); 2857 for (unsigned Part = 0; Part < UF; Part++) { 2858 // Collect the stored vector from each member. 
2859 SmallVector<Value *, 4> StoredVecs; 2860 for (unsigned i = 0; i < InterleaveFactor; i++) { 2861 assert((Group->getMember(i) || MaskForGaps) && 2862 "Fail to get a member from an interleaved store group"); 2863 Instruction *Member = Group->getMember(i); 2864 2865 // Skip the gaps in the group. 2866 if (!Member) { 2867 Value *Undef = PoisonValue::get(SubVT); 2868 StoredVecs.push_back(Undef); 2869 continue; 2870 } 2871 2872 Value *StoredVec = State.get(StoredValues[i], Part); 2873 2874 if (Group->isReverse()) 2875 StoredVec = reverseVector(StoredVec); 2876 2877 // If this member has different type, cast it to a unified type. 2878 2879 if (StoredVec->getType() != SubVT) 2880 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL); 2881 2882 StoredVecs.push_back(StoredVec); 2883 } 2884 2885 // Concatenate all vectors into a wide vector. 2886 Value *WideVec = concatenateVectors(Builder, StoredVecs); 2887 2888 // Interleave the elements in the wide vector. 2889 Value *IVec = Builder.CreateShuffleVector( 2890 WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor), 2891 "interleaved.vec"); 2892 2893 Instruction *NewStoreInstr; 2894 if (BlockInMask || MaskForGaps) { 2895 Value *GroupMask = MaskForGaps; 2896 if (BlockInMask) { 2897 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2898 Value *ShuffledMask = Builder.CreateShuffleVector( 2899 BlockInMaskPart, 2900 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2901 "interleaved.mask"); 2902 GroupMask = MaskForGaps ? Builder.CreateBinOp(Instruction::And, 2903 ShuffledMask, MaskForGaps) 2904 : ShuffledMask; 2905 } 2906 NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part], 2907 Group->getAlign(), GroupMask); 2908 } else 2909 NewStoreInstr = 2910 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign()); 2911 2912 Group->addMetadata(NewStoreInstr); 2913 } 2914 } 2915 2916 void InnerLoopVectorizer::vectorizeMemoryInstruction( 2917 Instruction *Instr, VPTransformState &State, VPValue *Def, VPValue *Addr, 2918 VPValue *StoredValue, VPValue *BlockInMask, bool ConsecutiveStride, 2919 bool Reverse) { 2920 // Attempt to issue a wide load. 2921 LoadInst *LI = dyn_cast<LoadInst>(Instr); 2922 StoreInst *SI = dyn_cast<StoreInst>(Instr); 2923 2924 assert((LI || SI) && "Invalid Load/Store instruction"); 2925 assert((!SI || StoredValue) && "No stored value provided for widened store"); 2926 assert((!LI || !StoredValue) && "Stored value provided for widened load"); 2927 2928 Type *ScalarDataTy = getLoadStoreType(Instr); 2929 2930 auto *DataTy = VectorType::get(ScalarDataTy, VF); 2931 const Align Alignment = getLoadStoreAlignment(Instr); 2932 bool CreateGatherScatter = !ConsecutiveStride; 2933 2934 VectorParts BlockInMaskParts(UF); 2935 bool isMaskRequired = BlockInMask; 2936 if (isMaskRequired) 2937 for (unsigned Part = 0; Part < UF; ++Part) 2938 BlockInMaskParts[Part] = State.get(BlockInMask, Part); 2939 2940 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { 2941 // Calculate the pointer for the specific unroll-part. 2942 GetElementPtrInst *PartPtr = nullptr; 2943 2944 bool InBounds = false; 2945 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) 2946 InBounds = gep->isInBounds(); 2947 if (Reverse) { 2948 // If the address is consecutive but reversed, then the 2949 // wide store needs to start at the last vector element. 
2950 // RunTimeVF = VScale * VF.getKnownMinValue() 2951 // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue() 2952 Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), VF); 2953 // NumElt = -Part * RunTimeVF 2954 Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF); 2955 // LastLane = 1 - RunTimeVF 2956 Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF); 2957 PartPtr = 2958 cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt)); 2959 PartPtr->setIsInBounds(InBounds); 2960 PartPtr = cast<GetElementPtrInst>( 2961 Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane)); 2962 PartPtr->setIsInBounds(InBounds); 2963 if (isMaskRequired) // Reverse of a null all-one mask is a null mask. 2964 BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]); 2965 } else { 2966 Value *Increment = createStepForVF(Builder, Builder.getInt32(Part), VF); 2967 PartPtr = cast<GetElementPtrInst>( 2968 Builder.CreateGEP(ScalarDataTy, Ptr, Increment)); 2969 PartPtr->setIsInBounds(InBounds); 2970 } 2971 2972 unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); 2973 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); 2974 }; 2975 2976 // Handle Stores: 2977 if (SI) { 2978 setDebugLocFromInst(SI); 2979 2980 for (unsigned Part = 0; Part < UF; ++Part) { 2981 Instruction *NewSI = nullptr; 2982 Value *StoredVal = State.get(StoredValue, Part); 2983 if (CreateGatherScatter) { 2984 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 2985 Value *VectorGep = State.get(Addr, Part); 2986 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, 2987 MaskPart); 2988 } else { 2989 if (Reverse) { 2990 // If we store to reverse consecutive memory locations, then we need 2991 // to reverse the order of elements in the stored value. 2992 StoredVal = reverseVector(StoredVal); 2993 // We don't want to update the value in the map as it might be used in 2994 // another expression. So don't call resetVectorValue(StoredVal). 2995 } 2996 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0))); 2997 if (isMaskRequired) 2998 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, 2999 BlockInMaskParts[Part]); 3000 else 3001 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); 3002 } 3003 addMetadata(NewSI, SI); 3004 } 3005 return; 3006 } 3007 3008 // Handle loads. 3009 assert(LI && "Must have a load instruction"); 3010 setDebugLocFromInst(LI); 3011 for (unsigned Part = 0; Part < UF; ++Part) { 3012 Value *NewLI; 3013 if (CreateGatherScatter) { 3014 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 3015 Value *VectorGep = State.get(Addr, Part); 3016 NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart, 3017 nullptr, "wide.masked.gather"); 3018 addMetadata(NewLI, LI); 3019 } else { 3020 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0))); 3021 if (isMaskRequired) 3022 NewLI = Builder.CreateMaskedLoad( 3023 DataTy, VecPtr, Alignment, BlockInMaskParts[Part], 3024 PoisonValue::get(DataTy), "wide.masked.load"); 3025 else 3026 NewLI = 3027 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); 3028 3029 // Add metadata to the load, but setVectorValue to the reverse shuffle. 
3030 addMetadata(NewLI, LI); 3031 if (Reverse) 3032 NewLI = reverseVector(NewLI); 3033 } 3034 3035 State.set(Def, NewLI, Part); 3036 } 3037 } 3038 3039 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPValue *Def, 3040 VPUser &User, 3041 const VPIteration &Instance, 3042 bool IfPredicateInstr, 3043 VPTransformState &State) { 3044 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 3045 3046 // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for 3047 // the first lane and part. 3048 if (isa<NoAliasScopeDeclInst>(Instr)) 3049 if (!Instance.isFirstIteration()) 3050 return; 3051 3052 setDebugLocFromInst(Instr); 3053 3054 // Does this instruction return a value ? 3055 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 3056 3057 Instruction *Cloned = Instr->clone(); 3058 if (!IsVoidRetTy) 3059 Cloned->setName(Instr->getName() + ".cloned"); 3060 3061 State.Builder.SetInsertPoint(Builder.GetInsertBlock(), 3062 Builder.GetInsertPoint()); 3063 // Replace the operands of the cloned instructions with their scalar 3064 // equivalents in the new loop. 3065 for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) { 3066 auto *Operand = dyn_cast<Instruction>(Instr->getOperand(op)); 3067 auto InputInstance = Instance; 3068 if (!Operand || !OrigLoop->contains(Operand) || 3069 (Cost->isUniformAfterVectorization(Operand, State.VF))) 3070 InputInstance.Lane = VPLane::getFirstLane(); 3071 auto *NewOp = State.get(User.getOperand(op), InputInstance); 3072 Cloned->setOperand(op, NewOp); 3073 } 3074 addNewMetadata(Cloned, Instr); 3075 3076 // Place the cloned scalar in the new loop. 3077 Builder.Insert(Cloned); 3078 3079 State.set(Def, Cloned, Instance); 3080 3081 // If we just cloned a new assumption, add it the assumption cache. 3082 if (auto *II = dyn_cast<AssumeInst>(Cloned)) 3083 AC->registerAssumption(II); 3084 3085 // End if-block. 3086 if (IfPredicateInstr) 3087 PredicatedInstructions.push_back(Cloned); 3088 } 3089 3090 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start, 3091 Value *End, Value *Step, 3092 Instruction *DL) { 3093 BasicBlock *Header = L->getHeader(); 3094 BasicBlock *Latch = L->getLoopLatch(); 3095 // As we're just creating this loop, it's possible no latch exists 3096 // yet. If so, use the header as this will be a single block loop. 3097 if (!Latch) 3098 Latch = Header; 3099 3100 IRBuilder<> B(&*Header->getFirstInsertionPt()); 3101 Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction); 3102 setDebugLocFromInst(OldInst, &B); 3103 auto *Induction = B.CreatePHI(Start->getType(), 2, "index"); 3104 3105 B.SetInsertPoint(Latch->getTerminator()); 3106 setDebugLocFromInst(OldInst, &B); 3107 3108 // Create i+1 and fill the PHINode. 3109 // 3110 // If the tail is not folded, we know that End - Start >= Step (either 3111 // statically or through the minimum iteration checks). We also know that both 3112 // Start % Step == 0 and End % Step == 0. We exit the vector loop if %IV + 3113 // %Step == %End. Hence we must exit the loop before %IV + %Step unsigned 3114 // overflows and we can mark the induction increment as NUW. 3115 Value *Next = B.CreateAdd(Induction, Step, "index.next", 3116 /*NUW=*/!Cost->foldTailByMasking(), /*NSW=*/false); 3117 Induction->addIncoming(Start, L->getLoopPreheader()); 3118 Induction->addIncoming(Next, Latch); 3119 // Create the compare. 
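  // Illustrative shape of the skeleton built here (value and block names are
  // assumptions; the integer width matches Start's type):
  //
  //   vector.body:
  //     %index = phi i64 [ %start, %preheader ], [ %index.next, %vector.body ]
  //     %index.next = add nuw i64 %index, %step  ; nuw unless tail is folded
  //     %cmp = icmp eq i64 %index.next, %end
  //     br i1 %cmp, label %loop.exit, label %vector.body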
3120 Value *ICmp = B.CreateICmpEQ(Next, End); 3121 B.CreateCondBr(ICmp, L->getUniqueExitBlock(), Header); 3122 3123 // Now we have two terminators. Remove the old one from the block. 3124 Latch->getTerminator()->eraseFromParent(); 3125 3126 return Induction; 3127 } 3128 3129 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { 3130 if (TripCount) 3131 return TripCount; 3132 3133 assert(L && "Create Trip Count for null loop."); 3134 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 3135 // Find the loop boundaries. 3136 ScalarEvolution *SE = PSE.getSE(); 3137 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 3138 assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && 3139 "Invalid loop count"); 3140 3141 Type *IdxTy = Legal->getWidestInductionType(); 3142 assert(IdxTy && "No type for induction"); 3143 3144 // The exit count might have the type of i64 while the phi is i32. This can 3145 // happen if we have an induction variable that is sign extended before the 3146 // compare. The only way that we get a backedge taken count is that the 3147 // induction variable was signed and as such will not overflow. In such a case 3148 // truncation is legal. 3149 if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) > 3150 IdxTy->getPrimitiveSizeInBits()) 3151 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy); 3152 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); 3153 3154 // Get the total trip count from the count by adding 1. 3155 const SCEV *ExitCount = SE->getAddExpr( 3156 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 3157 3158 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); 3159 3160 // Expand the trip count and place the new instructions in the preheader. 3161 // Notice that the pre-header does not change, only the loop body. 3162 SCEVExpander Exp(*SE, DL, "induction"); 3163 3164 // Count holds the overall loop count (N). 3165 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), 3166 L->getLoopPreheader()->getTerminator()); 3167 3168 if (TripCount->getType()->isPointerTy()) 3169 TripCount = 3170 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", 3171 L->getLoopPreheader()->getTerminator()); 3172 3173 return TripCount; 3174 } 3175 3176 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { 3177 if (VectorTripCount) 3178 return VectorTripCount; 3179 3180 Value *TC = getOrCreateTripCount(L); 3181 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 3182 3183 Type *Ty = TC->getType(); 3184 // This is where we can make the step a runtime constant. 3185 Value *Step = createStepForVF(Builder, ConstantInt::get(Ty, UF), VF); 3186 3187 // If the tail is to be folded by masking, round the number of iterations N 3188 // up to a multiple of Step instead of rounding down. This is done by first 3189 // adding Step-1 and then rounding down. Note that it's ok if this addition 3190 // overflows: the vector induction variable will eventually wrap to zero given 3191 // that it starts at zero and its Step is a power of two; the loop will then 3192 // exit, with the last early-exit vector comparison also producing all-true. 
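  // Worked example (illustrative numbers): with VF * UF == 4 and N == 10,
  // folding the tail computes TC = 10 + 3 = 13, so N % Step == 1 and the
  // vector trip count below becomes 12; the loop runs 3 vector iterations and
  // the mask disables the 2 lanes past the original trip count.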
3193 if (Cost->foldTailByMasking()) { 3194 assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && 3195 "VF*UF must be a power of 2 when folding tail by masking"); 3196 assert(!VF.isScalable() && 3197 "Tail folding not yet supported for scalable vectors"); 3198 TC = Builder.CreateAdd( 3199 TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up"); 3200 } 3201 3202 // Now we need to generate the expression for the part of the loop that the 3203 // vectorized body will execute. This is equal to N - (N % Step) if scalar 3204 // iterations are not required for correctness, or N - Step, otherwise. Step 3205 // is equal to the vectorization factor (number of SIMD elements) times the 3206 // unroll factor (number of SIMD instructions). 3207 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); 3208 3209 // There are cases where we *must* run at least one iteration in the remainder 3210 // loop. See the cost model for when this can happen. If the step evenly 3211 // divides the trip count, we set the remainder to be equal to the step. If 3212 // the step does not evenly divide the trip count, no adjustment is necessary 3213 // since there will already be scalar iterations. Note that the minimum 3214 // iterations check ensures that N >= Step. 3215 if (Cost->requiresScalarEpilogue(VF)) { 3216 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); 3217 R = Builder.CreateSelect(IsZero, Step, R); 3218 } 3219 3220 VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); 3221 3222 return VectorTripCount; 3223 } 3224 3225 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, 3226 const DataLayout &DL) { 3227 // Verify that V is a vector type with same number of elements as DstVTy. 3228 auto *DstFVTy = cast<FixedVectorType>(DstVTy); 3229 unsigned VF = DstFVTy->getNumElements(); 3230 auto *SrcVecTy = cast<FixedVectorType>(V->getType()); 3231 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); 3232 Type *SrcElemTy = SrcVecTy->getElementType(); 3233 Type *DstElemTy = DstFVTy->getElementType(); 3234 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 3235 "Vector elements must have same size"); 3236 3237 // Do a direct cast if element types are castable. 3238 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 3239 return Builder.CreateBitOrPointerCast(V, DstFVTy); 3240 } 3241 // V cannot be directly casted to desired vector type. 3242 // May happen when V is a floating point vector but DstVTy is a vector of 3243 // pointers or vice-versa. Handle this using a two-step bitcast using an 3244 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 3245 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 3246 "Only one type should be a pointer type"); 3247 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 3248 "Only one type should be a floating point type"); 3249 Type *IntTy = 3250 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 3251 auto *VecIntTy = FixedVectorType::get(IntTy, VF); 3252 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 3253 return Builder.CreateBitOrPointerCast(CastVal, DstFVTy); 3254 } 3255 3256 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, 3257 BasicBlock *Bypass) { 3258 Value *Count = getOrCreateTripCount(L); 3259 // Reuse existing vector loop preheader for TC checks. 3260 // Note that new preheader block is generated for vector loop. 
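  // The check built below is a single compare-and-branch, e.g. (illustrative;
  // names are assumptions, and the predicate is ULE when a scalar epilogue is
  // required):
  //   %min.iters.check = icmp ult i64 %trip.count, %step   ; %step == VF * UF
  //   br i1 %min.iters.check, label %scalar.ph, label %vector.ph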
3261 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 3262 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 3263 3264 // Generate code to check if the loop's trip count is less than VF * UF, or 3265 // equal to it in case a scalar epilogue is required; this implies that the 3266 // vector trip count is zero. This check also covers the case where adding one 3267 // to the backedge-taken count overflowed leading to an incorrect trip count 3268 // of zero. In this case we will also jump to the scalar loop. 3269 auto P = Cost->requiresScalarEpilogue(VF) ? ICmpInst::ICMP_ULE 3270 : ICmpInst::ICMP_ULT; 3271 3272 // If tail is to be folded, vector loop takes care of all iterations. 3273 Value *CheckMinIters = Builder.getFalse(); 3274 if (!Cost->foldTailByMasking()) { 3275 Value *Step = 3276 createStepForVF(Builder, ConstantInt::get(Count->getType(), UF), VF); 3277 CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check"); 3278 } 3279 // Create new preheader for vector loop. 3280 LoopVectorPreHeader = 3281 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, 3282 "vector.ph"); 3283 3284 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 3285 DT->getNode(Bypass)->getIDom()) && 3286 "TC check is expected to dominate Bypass"); 3287 3288 // Update dominator for Bypass & LoopExit (if needed). 3289 DT->changeImmediateDominator(Bypass, TCCheckBlock); 3290 if (!Cost->requiresScalarEpilogue(VF)) 3291 // If there is an epilogue which must run, there's no edge from the 3292 // middle block to exit blocks and thus no need to update the immediate 3293 // dominator of the exit blocks. 3294 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 3295 3296 ReplaceInstWithInst( 3297 TCCheckBlock->getTerminator(), 3298 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 3299 LoopBypassBlocks.push_back(TCCheckBlock); 3300 } 3301 3302 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { 3303 3304 BasicBlock *const SCEVCheckBlock = 3305 RTChecks.emitSCEVChecks(L, Bypass, LoopVectorPreHeader, LoopExitBlock); 3306 if (!SCEVCheckBlock) 3307 return nullptr; 3308 3309 assert(!(SCEVCheckBlock->getParent()->hasOptSize() || 3310 (OptForSizeBasedOnProfile && 3311 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) && 3312 "Cannot SCEV check stride or overflow when optimizing for size"); 3313 3314 3315 // Update dominator only if this is first RT check. 3316 if (LoopBypassBlocks.empty()) { 3317 DT->changeImmediateDominator(Bypass, SCEVCheckBlock); 3318 if (!Cost->requiresScalarEpilogue(VF)) 3319 // If there is an epilogue which must run, there's no edge from the 3320 // middle block to exit blocks and thus no need to update the immediate 3321 // dominator of the exit blocks. 3322 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock); 3323 } 3324 3325 LoopBypassBlocks.push_back(SCEVCheckBlock); 3326 AddedSafetyChecks = true; 3327 return SCEVCheckBlock; 3328 } 3329 3330 BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, 3331 BasicBlock *Bypass) { 3332 // VPlan-native path does not do any analysis for runtime checks currently. 3333 if (EnableVPlanNativePath) 3334 return nullptr; 3335 3336 BasicBlock *const MemCheckBlock = 3337 RTChecks.emitMemRuntimeChecks(L, Bypass, LoopVectorPreHeader); 3338 3339 // Check if we generated code that checks in runtime if arrays overlap. We put 3340 // the checks into a separate block to make the more common case of few 3341 // elements faster. 
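  // Roughly (an illustrative sketch; block names are assumptions), the bypass
  // blocks end up chained as
  //   min.iters.check -> [SCEV checks] -> [memory checks] -> vector.ph
  // with every check block also branching to the scalar preheader on failure.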
3342 if (!MemCheckBlock) 3343 return nullptr; 3344 3345 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) { 3346 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && 3347 "Cannot emit memory checks when optimizing for size, unless forced " 3348 "to vectorize."); 3349 ORE->emit([&]() { 3350 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize", 3351 L->getStartLoc(), L->getHeader()) 3352 << "Code-size may be reduced by not forcing " 3353 "vectorization, or by source-code modifications " 3354 "eliminating the need for runtime checks " 3355 "(e.g., adding 'restrict')."; 3356 }); 3357 } 3358 3359 LoopBypassBlocks.push_back(MemCheckBlock); 3360 3361 AddedSafetyChecks = true; 3362 3363 // We currently don't use LoopVersioning for the actual loop cloning but we 3364 // still use it to add the noalias metadata. 3365 LVer = std::make_unique<LoopVersioning>( 3366 *Legal->getLAI(), 3367 Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, 3368 DT, PSE.getSE()); 3369 LVer->prepareNoAliasMetadata(); 3370 return MemCheckBlock; 3371 } 3372 3373 Value *InnerLoopVectorizer::emitTransformedIndex( 3374 IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL, 3375 const InductionDescriptor &ID) const { 3376 3377 SCEVExpander Exp(*SE, DL, "induction"); 3378 auto Step = ID.getStep(); 3379 auto StartValue = ID.getStartValue(); 3380 assert(Index->getType()->getScalarType() == Step->getType() && 3381 "Index scalar type does not match StepValue type"); 3382 3383 // Note: the IR at this point is broken. We cannot use SE to create any new 3384 // SCEV and then expand it, hoping that SCEV's simplification will give us 3385 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may 3386 // lead to various SCEV crashes. So all we can do is to use builder and rely 3387 // on InstCombine for future simplifications. Here we handle some trivial 3388 // cases only. 3389 auto CreateAdd = [&B](Value *X, Value *Y) { 3390 assert(X->getType() == Y->getType() && "Types don't match!"); 3391 if (auto *CX = dyn_cast<ConstantInt>(X)) 3392 if (CX->isZero()) 3393 return Y; 3394 if (auto *CY = dyn_cast<ConstantInt>(Y)) 3395 if (CY->isZero()) 3396 return X; 3397 return B.CreateAdd(X, Y); 3398 }; 3399 3400 // We allow X to be a vector type, in which case Y will potentially be 3401 // splatted into a vector with the same element count. 3402 auto CreateMul = [&B](Value *X, Value *Y) { 3403 assert(X->getType()->getScalarType() == Y->getType() && 3404 "Types don't match!"); 3405 if (auto *CX = dyn_cast<ConstantInt>(X)) 3406 if (CX->isOne()) 3407 return Y; 3408 if (auto *CY = dyn_cast<ConstantInt>(Y)) 3409 if (CY->isOne()) 3410 return X; 3411 VectorType *XVTy = dyn_cast<VectorType>(X->getType()); 3412 if (XVTy && !isa<VectorType>(Y->getType())) 3413 Y = B.CreateVectorSplat(XVTy->getElementCount(), Y); 3414 return B.CreateMul(X, Y); 3415 }; 3416 3417 // Get a suitable insert point for SCEV expansion. For blocks in the vector 3418 // loop, choose the end of the vector loop header (=LoopVectorBody), because 3419 // the DomTree is not kept up-to-date for additional blocks generated in the 3420 // vector loop. By using the header as insertion point, we guarantee that the 3421 // expanded instructions dominate all their uses. 
3422 auto GetInsertPoint = [this, &B]() { 3423 BasicBlock *InsertBB = B.GetInsertPoint()->getParent(); 3424 if (InsertBB != LoopVectorBody && 3425 LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB)) 3426 return LoopVectorBody->getTerminator(); 3427 return &*B.GetInsertPoint(); 3428 }; 3429 3430 switch (ID.getKind()) { 3431 case InductionDescriptor::IK_IntInduction: { 3432 assert(!isa<VectorType>(Index->getType()) && 3433 "Vector indices not supported for integer inductions yet"); 3434 assert(Index->getType() == StartValue->getType() && 3435 "Index type does not match StartValue type"); 3436 if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne()) 3437 return B.CreateSub(StartValue, Index); 3438 auto *Offset = CreateMul( 3439 Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint())); 3440 return CreateAdd(StartValue, Offset); 3441 } 3442 case InductionDescriptor::IK_PtrInduction: { 3443 assert(isa<SCEVConstant>(Step) && 3444 "Expected constant step for pointer induction"); 3445 return B.CreateGEP( 3446 ID.getElementType(), StartValue, 3447 CreateMul(Index, 3448 Exp.expandCodeFor(Step, Index->getType()->getScalarType(), 3449 GetInsertPoint()))); 3450 } 3451 case InductionDescriptor::IK_FpInduction: { 3452 assert(!isa<VectorType>(Index->getType()) && 3453 "Vector indices not supported for FP inductions yet"); 3454 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); 3455 auto InductionBinOp = ID.getInductionBinOp(); 3456 assert(InductionBinOp && 3457 (InductionBinOp->getOpcode() == Instruction::FAdd || 3458 InductionBinOp->getOpcode() == Instruction::FSub) && 3459 "Original bin op should be defined for FP induction"); 3460 3461 Value *StepValue = cast<SCEVUnknown>(Step)->getValue(); 3462 Value *MulExp = B.CreateFMul(StepValue, Index); 3463 return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, 3464 "induction"); 3465 } 3466 case InductionDescriptor::IK_NoInduction: 3467 return nullptr; 3468 } 3469 llvm_unreachable("invalid enum"); 3470 } 3471 3472 Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { 3473 LoopScalarBody = OrigLoop->getHeader(); 3474 LoopVectorPreHeader = OrigLoop->getLoopPreheader(); 3475 assert(LoopVectorPreHeader && "Invalid loop structure"); 3476 LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr 3477 assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF)) && 3478 "multiple exit loop without required epilogue?"); 3479 3480 LoopMiddleBlock = 3481 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 3482 LI, nullptr, Twine(Prefix) + "middle.block"); 3483 LoopScalarPreHeader = 3484 SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI, 3485 nullptr, Twine(Prefix) + "scalar.ph"); 3486 3487 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3488 3489 // Set up the middle block terminator. Two cases: 3490 // 1) If we know that we must execute the scalar epilogue, emit an 3491 // unconditional branch. 3492 // 2) Otherwise, we must have a single unique exit block (due to how we 3493 // implement the multiple exit case). In this case, set up a conditonal 3494 // branch from the middle block to the loop scalar preheader, and the 3495 // exit block. completeLoopSkeleton will update the condition to use an 3496 // iteration check, if required to decide whether to execute the remainder. 3497 BranchInst *BrInst = Cost->requiresScalarEpilogue(VF) ? 
3498 BranchInst::Create(LoopScalarPreHeader) : 3499 BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, 3500 Builder.getTrue()); 3501 BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3502 ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst); 3503 3504 // We intentionally don't let SplitBlock to update LoopInfo since 3505 // LoopVectorBody should belong to another loop than LoopVectorPreHeader. 3506 // LoopVectorBody is explicitly added to the correct place few lines later. 3507 LoopVectorBody = 3508 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 3509 nullptr, nullptr, Twine(Prefix) + "vector.body"); 3510 3511 // Update dominator for loop exit. 3512 if (!Cost->requiresScalarEpilogue(VF)) 3513 // If there is an epilogue which must run, there's no edge from the 3514 // middle block to exit blocks and thus no need to update the immediate 3515 // dominator of the exit blocks. 3516 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); 3517 3518 // Create and register the new vector loop. 3519 Loop *Lp = LI->AllocateLoop(); 3520 Loop *ParentLoop = OrigLoop->getParentLoop(); 3521 3522 // Insert the new loop into the loop nest and register the new basic blocks 3523 // before calling any utilities such as SCEV that require valid LoopInfo. 3524 if (ParentLoop) { 3525 ParentLoop->addChildLoop(Lp); 3526 } else { 3527 LI->addTopLevelLoop(Lp); 3528 } 3529 Lp->addBasicBlockToLoop(LoopVectorBody, *LI); 3530 return Lp; 3531 } 3532 3533 void InnerLoopVectorizer::createInductionResumeValues( 3534 Loop *L, Value *VectorTripCount, 3535 std::pair<BasicBlock *, Value *> AdditionalBypass) { 3536 assert(VectorTripCount && L && "Expected valid arguments"); 3537 assert(((AdditionalBypass.first && AdditionalBypass.second) || 3538 (!AdditionalBypass.first && !AdditionalBypass.second)) && 3539 "Inconsistent information about additional bypass."); 3540 // We are going to resume the execution of the scalar loop. 3541 // Go over all of the induction variables that we found and fix the 3542 // PHIs that are left in the scalar version of the loop. 3543 // The starting values of PHI nodes depend on the counter of the last 3544 // iteration in the vectorized loop. 3545 // If we come from a bypass edge then we need to start from the original 3546 // start value. 3547 for (auto &InductionEntry : Legal->getInductionVars()) { 3548 PHINode *OrigPhi = InductionEntry.first; 3549 InductionDescriptor II = InductionEntry.second; 3550 3551 // Create phi nodes to merge from the backedge-taken check block. 3552 PHINode *BCResumeVal = 3553 PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", 3554 LoopScalarPreHeader->getTerminator()); 3555 // Copy original phi DL over to the new one. 3556 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); 3557 Value *&EndValue = IVEndValues[OrigPhi]; 3558 Value *EndValueFromAdditionalBypass = AdditionalBypass.second; 3559 if (OrigPhi == OldInduction) { 3560 // We know what the end value is. 3561 EndValue = VectorTripCount; 3562 } else { 3563 IRBuilder<> B(L->getLoopPreheader()->getTerminator()); 3564 3565 // Fast-math-flags propagate from the original induction instruction. 
3566 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) 3567 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); 3568 3569 Type *StepType = II.getStep()->getType(); 3570 Instruction::CastOps CastOp = 3571 CastInst::getCastOpcode(VectorTripCount, true, StepType, true); 3572 Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd"); 3573 const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout(); 3574 EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3575 EndValue->setName("ind.end"); 3576 3577 // Compute the end value for the additional bypass (if applicable). 3578 if (AdditionalBypass.first) { 3579 B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt())); 3580 CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true, 3581 StepType, true); 3582 CRD = 3583 B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd"); 3584 EndValueFromAdditionalBypass = 3585 emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3586 EndValueFromAdditionalBypass->setName("ind.end"); 3587 } 3588 } 3589 // The new PHI merges the original incoming value, in case of a bypass, 3590 // or the value at the end of the vectorized loop. 3591 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); 3592 3593 // Fix the scalar body counter (PHI node). 3594 // The old induction's phi node in the scalar body needs the truncated 3595 // value. 3596 for (BasicBlock *BB : LoopBypassBlocks) 3597 BCResumeVal->addIncoming(II.getStartValue(), BB); 3598 3599 if (AdditionalBypass.first) 3600 BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first, 3601 EndValueFromAdditionalBypass); 3602 3603 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); 3604 } 3605 } 3606 3607 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L, 3608 MDNode *OrigLoopID) { 3609 assert(L && "Expected valid loop."); 3610 3611 // The trip counts should be cached by now. 3612 Value *Count = getOrCreateTripCount(L); 3613 Value *VectorTripCount = getOrCreateVectorTripCount(L); 3614 3615 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3616 3617 // Add a check in the middle block to see if we have completed 3618 // all of the iterations in the first vector loop. Three cases: 3619 // 1) If we require a scalar epilogue, there is no conditional branch as 3620 // we unconditionally branch to the scalar preheader. Do nothing. 3621 // 2) If (N - N%VF) == N, then we *don't* need to run the remainder. 3622 // Thus if tail is to be folded, we know we don't need to run the 3623 // remainder and we can use the previous value for the condition (true). 3624 // 3) Otherwise, construct a runtime check. 3625 if (!Cost->requiresScalarEpilogue(VF) && !Cost->foldTailByMasking()) { 3626 Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, 3627 Count, VectorTripCount, "cmp.n", 3628 LoopMiddleBlock->getTerminator()); 3629 3630 // Here we use the same DebugLoc as the scalar loop latch terminator instead 3631 // of the corresponding compare because they may have ended up with 3632 // different line numbers and we want to avoid awkward line stepping while 3633 // debugging. Eg. if the compare has got a line number inside the loop. 3634 CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3635 cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN); 3636 } 3637 3638 // Get ready to start creating new instructions into the vectorized body. 
3639 assert(LoopVectorPreHeader == L->getLoopPreheader() && 3640 "Inconsistent vector loop preheader"); 3641 Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); 3642 3643 Optional<MDNode *> VectorizedLoopID = 3644 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 3645 LLVMLoopVectorizeFollowupVectorized}); 3646 if (VectorizedLoopID.hasValue()) { 3647 L->setLoopID(VectorizedLoopID.getValue()); 3648 3649 // Do not setAlreadyVectorized if loop attributes have been defined 3650 // explicitly. 3651 return LoopVectorPreHeader; 3652 } 3653 3654 // Keep all loop hints from the original loop on the vector loop (we'll 3655 // replace the vectorizer-specific hints below). 3656 if (MDNode *LID = OrigLoop->getLoopID()) 3657 L->setLoopID(LID); 3658 3659 LoopVectorizeHints Hints(L, true, *ORE); 3660 Hints.setAlreadyVectorized(); 3661 3662 #ifdef EXPENSIVE_CHECKS 3663 assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 3664 LI->verify(*DT); 3665 #endif 3666 3667 return LoopVectorPreHeader; 3668 } 3669 3670 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { 3671 /* 3672 In this function we generate a new loop. The new loop will contain 3673 the vectorized instructions while the old loop will continue to run the 3674 scalar remainder. 3675 3676 [ ] <-- loop iteration number check. 3677 / | 3678 / v 3679 | [ ] <-- vector loop bypass (may consist of multiple blocks). 3680 | / | 3681 | / v 3682 || [ ] <-- vector pre header. 3683 |/ | 3684 | v 3685 | [ ] \ 3686 | [ ]_| <-- vector loop. 3687 | | 3688 | v 3689 \ -[ ] <--- middle-block. 3690 \/ | 3691 /\ v 3692 | ->[ ] <--- new preheader. 3693 | | 3694 (opt) v <-- edge from middle to exit iff epilogue is not required. 3695 | [ ] \ 3696 | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue). 3697 \ | 3698 \ v 3699 >[ ] <-- exit block(s). 3700 ... 3701 */ 3702 3703 // Get the metadata of the original loop before it gets modified. 3704 MDNode *OrigLoopID = OrigLoop->getLoopID(); 3705 3706 // Workaround! Compute the trip count of the original loop and cache it 3707 // before we start modifying the CFG. This code has a systemic problem 3708 // wherein it tries to run analysis over partially constructed IR; this is 3709 // wrong, and not simply for SCEV. The trip count of the original loop 3710 // simply happens to be prone to hitting this in practice. In theory, we 3711 // can hit the same issue for any SCEV, or ValueTracking query done during 3712 // mutation. See PR49900. 3713 getOrCreateTripCount(OrigLoop); 3714 3715 // Create an empty vector loop, and prepare basic blocks for the runtime 3716 // checks. 3717 Loop *Lp = createVectorLoopSkeleton(""); 3718 3719 // Now, compare the new count to zero. If it is zero skip the vector loop and 3720 // jump to the scalar loop. This check also covers the case where the 3721 // backedge-taken count is uint##_max: adding one to it will overflow leading 3722 // to an incorrect trip count of zero. In this (rare) case we will also jump 3723 // to the scalar loop. 3724 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader); 3725 3726 // Generate the code to check any assumptions that we've made for SCEV 3727 // expressions. 3728 emitSCEVChecks(Lp, LoopScalarPreHeader); 3729 3730 // Generate the code that checks in runtime if arrays overlap. We put the 3731 // checks into a separate block to make the more common case of few elements 3732 // faster. 
  emitMemRuntimeChecks(Lp, LoopScalarPreHeader);

  // Some loops have a single integer induction variable, while other loops
  // don't. One example is C++ iterators that often have multiple pointer
  // induction variables. In the code below we also support a case where we
  // don't have a single induction variable.
  //
  // We try to obtain an induction variable from the original loop as hard
  // as possible. However, if we don't find one that:
  //   - is an integer
  //   - counts from zero, stepping by one
  //   - is the size of the widest induction variable type
  // then we create a new one.
  OldInduction = Legal->getPrimaryInduction();
  Type *IdxTy = Legal->getWidestInductionType();
  Value *StartIdx = ConstantInt::get(IdxTy, 0);
  // The loop step is equal to the vectorization factor (num of SIMD elements)
  // times the unroll factor (num of SIMD instructions).
  Builder.SetInsertPoint(&*Lp->getHeader()->getFirstInsertionPt());
  Value *Step = createStepForVF(Builder, ConstantInt::get(IdxTy, UF), VF);
  Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
  Induction =
      createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
                              getDebugLocFromInstOrOperands(OldInduction));

  // Emit phis for the new starting index of the scalar loop.
  createInductionResumeValues(Lp, CountRoundDown);

  return completeLoopSkeleton(Lp, OrigLoopID);
}

// Fix up external users of the induction variable. At this point, we are
// in LCSSA form, with all external PHIs that use the IV having one input
// value, coming from the remainder loop. We need those PHIs to also have a
// correct value for the IV when arriving directly from the middle block.
void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
                                       const InductionDescriptor &II,
                                       Value *CountRoundDown, Value *EndValue,
                                       BasicBlock *MiddleBlock) {
  // There are two kinds of external IV usages - those that use the value
  // computed in the last iteration (the PHI) and those that use the
  // penultimate value (the value that feeds into the phi from the loop
  // latch). We allow both, but they, obviously, have different values.

  assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");

  DenseMap<Value *, Value *> MissingVals;

  // An external user of the last iteration's value should see the value that
  // the remainder loop uses to initialize its own IV.
  Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
  for (User *U : PostInc->users()) {
    Instruction *UI = cast<Instruction>(U);
    if (!OrigLoop->contains(UI)) {
      assert(isa<PHINode>(UI) && "Expected LCSSA form");
      MissingVals[UI] = EndValue;
    }
  }

  // An external user of the penultimate value needs to see EndValue - Step.
  // The simplest way to get this is to recompute it from the constituent
  // SCEVs, that is Start + (Step * (CRD - 1)).
  for (User *U : OrigPhi->users()) {
    auto *UI = cast<Instruction>(U);
    if (!OrigLoop->contains(UI)) {
      const DataLayout &DL =
          OrigLoop->getHeader()->getModule()->getDataLayout();
      assert(isa<PHINode>(UI) && "Expected LCSSA form");

      IRBuilder<> B(MiddleBlock->getTerminator());

      // Fast-math-flags propagate from the original induction instruction.
3805 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) 3806 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); 3807 3808 Value *CountMinusOne = B.CreateSub( 3809 CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1)); 3810 Value *CMO = 3811 !II.getStep()->getType()->isIntegerTy() 3812 ? B.CreateCast(Instruction::SIToFP, CountMinusOne, 3813 II.getStep()->getType()) 3814 : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType()); 3815 CMO->setName("cast.cmo"); 3816 Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II); 3817 Escape->setName("ind.escape"); 3818 MissingVals[UI] = Escape; 3819 } 3820 } 3821 3822 for (auto &I : MissingVals) { 3823 PHINode *PHI = cast<PHINode>(I.first); 3824 // One corner case we have to handle is two IVs "chasing" each-other, 3825 // that is %IV2 = phi [...], [ %IV1, %latch ] 3826 // In this case, if IV1 has an external use, we need to avoid adding both 3827 // "last value of IV1" and "penultimate value of IV2". So, verify that we 3828 // don't already have an incoming value for the middle block. 3829 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) 3830 PHI->addIncoming(I.second, MiddleBlock); 3831 } 3832 } 3833 3834 namespace { 3835 3836 struct CSEDenseMapInfo { 3837 static bool canHandle(const Instruction *I) { 3838 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || 3839 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); 3840 } 3841 3842 static inline Instruction *getEmptyKey() { 3843 return DenseMapInfo<Instruction *>::getEmptyKey(); 3844 } 3845 3846 static inline Instruction *getTombstoneKey() { 3847 return DenseMapInfo<Instruction *>::getTombstoneKey(); 3848 } 3849 3850 static unsigned getHashValue(const Instruction *I) { 3851 assert(canHandle(I) && "Unknown instruction!"); 3852 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), 3853 I->value_op_end())); 3854 } 3855 3856 static bool isEqual(const Instruction *LHS, const Instruction *RHS) { 3857 if (LHS == getEmptyKey() || RHS == getEmptyKey() || 3858 LHS == getTombstoneKey() || RHS == getTombstoneKey()) 3859 return LHS == RHS; 3860 return LHS->isIdenticalTo(RHS); 3861 } 3862 }; 3863 3864 } // end anonymous namespace 3865 3866 ///Perform cse of induction variable instructions. 3867 static void cse(BasicBlock *BB) { 3868 // Perform simple cse. 3869 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 3870 for (Instruction &In : llvm::make_early_inc_range(*BB)) { 3871 if (!CSEDenseMapInfo::canHandle(&In)) 3872 continue; 3873 3874 // Check if we can replace this instruction with any of the 3875 // visited instructions. 3876 if (Instruction *V = CSEMap.lookup(&In)) { 3877 In.replaceAllUsesWith(V); 3878 In.eraseFromParent(); 3879 continue; 3880 } 3881 3882 CSEMap[&In] = &In; 3883 } 3884 } 3885 3886 InstructionCost 3887 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF, 3888 bool &NeedToScalarize) const { 3889 Function *F = CI->getCalledFunction(); 3890 Type *ScalarRetTy = CI->getType(); 3891 SmallVector<Type *, 4> Tys, ScalarTys; 3892 for (auto &ArgOp : CI->args()) 3893 ScalarTys.push_back(ArgOp->getType()); 3894 3895 // Estimate cost of scalarized vector call. The source operands are assumed 3896 // to be vectors, so we need to extract individual elements from there, 3897 // execute VF scalar calls, and then gather the result into the vector return 3898 // value. 
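  // In other words, the estimate computed below is roughly
  //   Cost = ScalarCallCost * VF + ScalarizationOverhead(extracts, inserts)
  // and it is then compared against the cost of an actual vector library
  // call, if the VFDatabase provides one, keeping whichever is cheaper.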
3899 InstructionCost ScalarCallCost = 3900 TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput); 3901 if (VF.isScalar()) 3902 return ScalarCallCost; 3903 3904 // Compute corresponding vector type for return value and arguments. 3905 Type *RetTy = ToVectorTy(ScalarRetTy, VF); 3906 for (Type *ScalarTy : ScalarTys) 3907 Tys.push_back(ToVectorTy(ScalarTy, VF)); 3908 3909 // Compute costs of unpacking argument values for the scalar calls and 3910 // packing the return values to a vector. 3911 InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF); 3912 3913 InstructionCost Cost = 3914 ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost; 3915 3916 // If we can't emit a vector call for this function, then the currently found 3917 // cost is the cost we need to return. 3918 NeedToScalarize = true; 3919 VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 3920 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); 3921 3922 if (!TLI || CI->isNoBuiltin() || !VecFunc) 3923 return Cost; 3924 3925 // If the corresponding vector cost is cheaper, return its cost. 3926 InstructionCost VectorCallCost = 3927 TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput); 3928 if (VectorCallCost < Cost) { 3929 NeedToScalarize = false; 3930 Cost = VectorCallCost; 3931 } 3932 return Cost; 3933 } 3934 3935 static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) { 3936 if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy())) 3937 return Elt; 3938 return VectorType::get(Elt, VF); 3939 } 3940 3941 InstructionCost 3942 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, 3943 ElementCount VF) const { 3944 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 3945 assert(ID && "Expected intrinsic call!"); 3946 Type *RetTy = MaybeVectorizeType(CI->getType(), VF); 3947 FastMathFlags FMF; 3948 if (auto *FPMO = dyn_cast<FPMathOperator>(CI)) 3949 FMF = FPMO->getFastMathFlags(); 3950 3951 SmallVector<const Value *> Arguments(CI->args()); 3952 FunctionType *FTy = CI->getCalledFunction()->getFunctionType(); 3953 SmallVector<Type *> ParamTys; 3954 std::transform(FTy->param_begin(), FTy->param_end(), 3955 std::back_inserter(ParamTys), 3956 [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); }); 3957 3958 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF, 3959 dyn_cast<IntrinsicInst>(CI)); 3960 return TTI.getIntrinsicInstrCost(CostAttrs, 3961 TargetTransformInfo::TCK_RecipThroughput); 3962 } 3963 3964 static Type *smallestIntegerVectorType(Type *T1, Type *T2) { 3965 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3966 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3967 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; 3968 } 3969 3970 static Type *largestIntegerVectorType(Type *T1, Type *T2) { 3971 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3972 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3973 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; 3974 } 3975 3976 void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) { 3977 // For every instruction `I` in MinBWs, truncate the operands, create a 3978 // truncated version of `I` and reextend its result. InstCombine runs 3979 // later and will remove any ext/trunc pairs. 
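  //
  // For example (an illustrative sketch; actual value names differ), if
  // MinBWs records that an i32 add only needs 8 bits, then with VF = 4
  //   %a = add <4 x i32> %x, %y
  // is rewritten as
  //   %x.tr = trunc <4 x i32> %x to <4 x i8>
  //   %y.tr = trunc <4 x i32> %y to <4 x i8>
  //   %a.tr = add <4 x i8> %x.tr, %y.tr
  //   %a.ext = zext <4 x i8> %a.tr to <4 x i32>
  // and all uses of %a are replaced with %a.ext.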
3980 SmallPtrSet<Value *, 4> Erased; 3981 for (const auto &KV : Cost->getMinimalBitwidths()) { 3982 // If the value wasn't vectorized, we must maintain the original scalar 3983 // type. The absence of the value from State indicates that it 3984 // wasn't vectorized. 3985 // FIXME: Should not rely on getVPValue at this point. 3986 VPValue *Def = State.Plan->getVPValue(KV.first, true); 3987 if (!State.hasAnyVectorValue(Def)) 3988 continue; 3989 for (unsigned Part = 0; Part < UF; ++Part) { 3990 Value *I = State.get(Def, Part); 3991 if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I)) 3992 continue; 3993 Type *OriginalTy = I->getType(); 3994 Type *ScalarTruncatedTy = 3995 IntegerType::get(OriginalTy->getContext(), KV.second); 3996 auto *TruncatedTy = VectorType::get( 3997 ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getElementCount()); 3998 if (TruncatedTy == OriginalTy) 3999 continue; 4000 4001 IRBuilder<> B(cast<Instruction>(I)); 4002 auto ShrinkOperand = [&](Value *V) -> Value * { 4003 if (auto *ZI = dyn_cast<ZExtInst>(V)) 4004 if (ZI->getSrcTy() == TruncatedTy) 4005 return ZI->getOperand(0); 4006 return B.CreateZExtOrTrunc(V, TruncatedTy); 4007 }; 4008 4009 // The actual instruction modification depends on the instruction type, 4010 // unfortunately. 4011 Value *NewI = nullptr; 4012 if (auto *BO = dyn_cast<BinaryOperator>(I)) { 4013 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)), 4014 ShrinkOperand(BO->getOperand(1))); 4015 4016 // Any wrapping introduced by shrinking this operation shouldn't be 4017 // considered undefined behavior. So, we can't unconditionally copy 4018 // arithmetic wrapping flags to NewI. 4019 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false); 4020 } else if (auto *CI = dyn_cast<ICmpInst>(I)) { 4021 NewI = 4022 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)), 4023 ShrinkOperand(CI->getOperand(1))); 4024 } else if (auto *SI = dyn_cast<SelectInst>(I)) { 4025 NewI = B.CreateSelect(SI->getCondition(), 4026 ShrinkOperand(SI->getTrueValue()), 4027 ShrinkOperand(SI->getFalseValue())); 4028 } else if (auto *CI = dyn_cast<CastInst>(I)) { 4029 switch (CI->getOpcode()) { 4030 default: 4031 llvm_unreachable("Unhandled cast!"); 4032 case Instruction::Trunc: 4033 NewI = ShrinkOperand(CI->getOperand(0)); 4034 break; 4035 case Instruction::SExt: 4036 NewI = B.CreateSExtOrTrunc( 4037 CI->getOperand(0), 4038 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 4039 break; 4040 case Instruction::ZExt: 4041 NewI = B.CreateZExtOrTrunc( 4042 CI->getOperand(0), 4043 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 4044 break; 4045 } 4046 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { 4047 auto Elements0 = 4048 cast<VectorType>(SI->getOperand(0)->getType())->getElementCount(); 4049 auto *O0 = B.CreateZExtOrTrunc( 4050 SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0)); 4051 auto Elements1 = 4052 cast<VectorType>(SI->getOperand(1)->getType())->getElementCount(); 4053 auto *O1 = B.CreateZExtOrTrunc( 4054 SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1)); 4055 4056 NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask()); 4057 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) { 4058 // Don't do anything with the operands, just extend the result. 
4059 continue; 4060 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { 4061 auto Elements = 4062 cast<VectorType>(IE->getOperand(0)->getType())->getElementCount(); 4063 auto *O0 = B.CreateZExtOrTrunc( 4064 IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 4065 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); 4066 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); 4067 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { 4068 auto Elements = 4069 cast<VectorType>(EE->getOperand(0)->getType())->getElementCount(); 4070 auto *O0 = B.CreateZExtOrTrunc( 4071 EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 4072 NewI = B.CreateExtractElement(O0, EE->getOperand(2)); 4073 } else { 4074 // If we don't know what to do, be conservative and don't do anything. 4075 continue; 4076 } 4077 4078 // Lastly, extend the result. 4079 NewI->takeName(cast<Instruction>(I)); 4080 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); 4081 I->replaceAllUsesWith(Res); 4082 cast<Instruction>(I)->eraseFromParent(); 4083 Erased.insert(I); 4084 State.reset(Def, Res, Part); 4085 } 4086 } 4087 4088 // We'll have created a bunch of ZExts that are now parentless. Clean up. 4089 for (const auto &KV : Cost->getMinimalBitwidths()) { 4090 // If the value wasn't vectorized, we must maintain the original scalar 4091 // type. The absence of the value from State indicates that it 4092 // wasn't vectorized. 4093 // FIXME: Should not rely on getVPValue at this point. 4094 VPValue *Def = State.Plan->getVPValue(KV.first, true); 4095 if (!State.hasAnyVectorValue(Def)) 4096 continue; 4097 for (unsigned Part = 0; Part < UF; ++Part) { 4098 Value *I = State.get(Def, Part); 4099 ZExtInst *Inst = dyn_cast<ZExtInst>(I); 4100 if (Inst && Inst->use_empty()) { 4101 Value *NewI = Inst->getOperand(0); 4102 Inst->eraseFromParent(); 4103 State.reset(Def, NewI, Part); 4104 } 4105 } 4106 } 4107 } 4108 4109 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) { 4110 // Insert truncates and extends for any truncated instructions as hints to 4111 // InstCombine. 4112 if (VF.isVector()) 4113 truncateToMinimalBitwidths(State); 4114 4115 // Fix widened non-induction PHIs by setting up the PHI operands. 4116 if (OrigPHIsToFix.size()) { 4117 assert(EnableVPlanNativePath && 4118 "Unexpected non-induction PHIs for fixup in non VPlan-native path"); 4119 fixNonInductionPHIs(State); 4120 } 4121 4122 // At this point every instruction in the original loop is widened to a 4123 // vector form. Now we need to fix the recurrences in the loop. These PHI 4124 // nodes are currently empty because we did not want to introduce cycles. 4125 // This is the second stage of vectorizing recurrences. 4126 fixCrossIterationPHIs(State); 4127 4128 // Forget the original basic block. 4129 PSE.getSE()->forgetLoop(OrigLoop); 4130 4131 // If we inserted an edge from the middle block to the unique exit block, 4132 // update uses outside the loop (phis) to account for the newly inserted 4133 // edge. 4134 if (!Cost->requiresScalarEpilogue(VF)) { 4135 // Fix-up external users of the induction variables. 4136 for (auto &Entry : Legal->getInductionVars()) 4137 fixupIVUsers(Entry.first, Entry.second, 4138 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)), 4139 IVEndValues[Entry.first], LoopMiddleBlock); 4140 4141 fixLCSSAPHIs(State); 4142 } 4143 4144 for (Instruction *PI : PredicatedInstructions) 4145 sinkScalarOperands(&*PI); 4146 4147 // Remove redundant induction instructions. 
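// (Widening and unrolling can leave behind several identical scalar
// computations, e.g. repeated induction steps; the cse() helper invoked below
// performs a simple common-subexpression elimination over the vector loop
// body to remove such duplicates.)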
4148 cse(LoopVectorBody); 4149 4150 // Set/update profile weights for the vector and remainder loops as original 4151 // loop iterations are now distributed among them. Note that the original loop, 4152 // represented by LoopScalarBody, becomes the remainder loop after vectorization. 4153 // 4154 // For cases like foldTailByMasking() and requiresScalarEpilogue() we may 4155 // end up with a slightly less accurate result, but that should be OK since the 4156 // profile is not inherently precise anyway. Note also that a possible bypass of 4157 // the vector code caused by legality checks is ignored, optimistically assigning 4158 // all the weight to the vector loop. 4159 // 4160 // For scalable vectorization we can't know at compile time how many iterations 4161 // of the loop are handled in one vector iteration, so we instead assume a 4162 // pessimistic vscale of '1'. 4163 setProfileInfoAfterUnrolling( 4164 LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody), 4165 LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF); 4166 } 4167 4168 void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) { 4169 // In order to support recurrences we need to be able to vectorize Phi nodes. 4170 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 4171 // stage #2: We now need to fix the recurrences by adding incoming edges to 4172 // the currently empty PHI nodes. At this point every instruction in the 4173 // original loop is widened to a vector form, so we can use them to construct 4174 // the incoming edges. 4175 VPBasicBlock *Header = State.Plan->getEntry()->getEntryBasicBlock(); 4176 for (VPRecipeBase &R : Header->phis()) { 4177 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) 4178 fixReduction(ReductionPhi, State); 4179 else if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R)) 4180 fixFirstOrderRecurrence(FOR, State); 4181 } 4182 } 4183 4184 void InnerLoopVectorizer::fixFirstOrderRecurrence(VPWidenPHIRecipe *PhiR, 4185 VPTransformState &State) { 4186 // This is the second phase of vectorizing first-order recurrences. An 4187 // overview of the transformation is described below. Suppose we have the 4188 // following loop. 4189 // 4190 // for (int i = 0; i < n; ++i) 4191 // b[i] = a[i] - a[i - 1]; 4192 // 4193 // There is a first-order recurrence on "a". For this loop, the shorthand 4194 // scalar IR looks like: 4195 // 4196 // scalar.ph: 4197 // s_init = a[-1] 4198 // br scalar.body 4199 // 4200 // scalar.body: 4201 // i = phi [0, scalar.ph], [i+1, scalar.body] 4202 // s1 = phi [s_init, scalar.ph], [s2, scalar.body] 4203 // s2 = a[i] 4204 // b[i] = s2 - s1 4205 // br cond, scalar.body, ... 4206 // 4207 // In this example, s1 is a recurrence because its value depends on the 4208 // previous iteration. In the first phase of vectorization, we created a 4209 // vector phi v1 for s1. We now complete the vectorization and produce the 4210 // shorthand vector IR shown below (for VF = 4, UF = 1).
4211 // 4212 // vector.ph: 4213 // v_init = vector(..., ..., ..., a[-1]) 4214 // br vector.body 4215 // 4216 // vector.body 4217 // i = phi [0, vector.ph], [i+4, vector.body] 4218 // v1 = phi [v_init, vector.ph], [v2, vector.body] 4219 // v2 = a[i, i+1, i+2, i+3]; 4220 // v3 = vector(v1(3), v2(0, 1, 2)) 4221 // b[i, i+1, i+2, i+3] = v2 - v3 4222 // br cond, vector.body, middle.block 4223 // 4224 // middle.block: 4225 // x = v2(3) 4226 // br scalar.ph 4227 // 4228 // scalar.ph: 4229 // s_init = phi [x, middle.block], [a[-1], otherwise] 4230 // br scalar.body 4231 // 4232 // After execution completes the vector loop, we extract the next value of 4233 // the recurrence (x) to use as the initial value in the scalar loop. 4234 4235 // Extract the last vector element in the middle block. This will be the 4236 // initial value for the recurrence when jumping to the scalar loop. 4237 VPValue *PreviousDef = PhiR->getBackedgeValue(); 4238 Value *Incoming = State.get(PreviousDef, UF - 1); 4239 auto *ExtractForScalar = Incoming; 4240 auto *IdxTy = Builder.getInt32Ty(); 4241 if (VF.isVector()) { 4242 auto *One = ConstantInt::get(IdxTy, 1); 4243 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4244 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF); 4245 auto *LastIdx = Builder.CreateSub(RuntimeVF, One); 4246 ExtractForScalar = Builder.CreateExtractElement(ExtractForScalar, LastIdx, 4247 "vector.recur.extract"); 4248 } 4249 // Extract the second last element in the middle block if the 4250 // Phi is used outside the loop. We need to extract the phi itself 4251 // and not the last element (the phi update in the current iteration). This 4252 // will be the value when jumping to the exit block from the LoopMiddleBlock, 4253 // when the scalar loop is not run at all. 4254 Value *ExtractForPhiUsedOutsideLoop = nullptr; 4255 if (VF.isVector()) { 4256 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF); 4257 auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2)); 4258 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement( 4259 Incoming, Idx, "vector.recur.extract.for.phi"); 4260 } else if (UF > 1) 4261 // When loop is unrolled without vectorizing, initialize 4262 // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value 4263 // of `Incoming`. This is analogous to the vectorized case above: extracting 4264 // the second last element when VF > 1. 4265 ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2); 4266 4267 // Fix the initial value of the original recurrence in the scalar loop. 4268 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin()); 4269 PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue()); 4270 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init"); 4271 auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue(); 4272 for (auto *BB : predecessors(LoopScalarPreHeader)) { 4273 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit; 4274 Start->addIncoming(Incoming, BB); 4275 } 4276 4277 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start); 4278 Phi->setName("scalar.recur"); 4279 4280 // Finally, fix users of the recurrence outside the loop. The users will need 4281 // either the last value of the scalar recurrence or the last value of the 4282 // vector recurrence we extracted in the middle block. Since the loop is in 4283 // LCSSA form, we just need to find all the phi nodes for the original scalar 4284 // recurrence in the exit block, and then add an edge for the middle block. 
4285 // Note that LCSSA does not imply single entry when the original scalar loop 4286 // had multiple exiting edges (as we always run the last iteration in the 4287 // scalar epilogue); in that case, there is no edge from the middle block to 4288 // the exit block, and thus no phis that need to be updated. 4289 if (!Cost->requiresScalarEpilogue(VF)) 4290 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) 4291 if (llvm::is_contained(LCSSAPhi.incoming_values(), Phi)) 4292 LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock); 4293 } 4294 4295 void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR, 4296 VPTransformState &State) { 4297 PHINode *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue()); 4298 // Get its reduction variable descriptor. 4299 assert(Legal->isReductionVariable(OrigPhi) && 4300 "Unable to find the reduction variable"); 4301 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); 4302 4303 RecurKind RK = RdxDesc.getRecurrenceKind(); 4304 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue(); 4305 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr(); 4306 setDebugLocFromInst(ReductionStartValue); 4307 4308 VPValue *LoopExitInstDef = PhiR->getBackedgeValue(); 4309 // This is the vector-clone of the value that leaves the loop. 4310 Type *VecTy = State.get(LoopExitInstDef, 0)->getType(); 4311 4312 // Wrap flags are in general invalid after vectorization; clear them. 4313 clearReductionWrapFlags(RdxDesc, State); 4314 4315 // Before each round, move the insertion point right between 4316 // the PHIs and the values we are going to write. 4317 // This allows us to write both PHINodes and the extractelement 4318 // instructions. 4319 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 4320 4321 setDebugLocFromInst(LoopExitInst); 4322 4323 Type *PhiTy = OrigPhi->getType(); 4324 // If the tail is folded by masking, the vector value that leaves the loop 4325 // should be a select choosing between the vectorized LoopExitInst and the 4326 // vectorized Phi, instead of the former. For an inloop reduction the 4327 // reduction will already be predicated, and does not need to be handled here. 4328 if (Cost->foldTailByMasking() && !PhiR->isInLoop()) { 4329 for (unsigned Part = 0; Part < UF; ++Part) { 4330 Value *VecLoopExitInst = State.get(LoopExitInstDef, Part); 4331 Value *Sel = nullptr; 4332 for (User *U : VecLoopExitInst->users()) { 4333 if (isa<SelectInst>(U)) { 4334 assert(!Sel && "Reduction exit feeding two selects"); 4335 Sel = U; 4336 } else 4337 assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select"); 4338 } 4339 assert(Sel && "Reduction exit feeds no select"); 4340 State.reset(LoopExitInstDef, Sel, Part); 4341 4342 // If the target can create a predicated operator for the reduction at no 4343 // extra cost in the loop (for example a predicated vadd), it can be 4344 // cheaper for the select to remain in the loop than to be sunk out of it, 4345 // so we use the select value for the phi instead of the old 4346 // LoopExitValue.
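// As a hedged illustration (the exact IR depends on the target): with tail
// folding the loop body typically contains shorthand IR such as
//   %add = add <VF x i32> %phi, %x              ; vectorized LoopExitInst
//   %sel = select <VF x i1> %mask, %add, %phi
// and keeping %sel as the phi's incoming value allows a target with
// predicated vector adds to fold the pair into a single masked add.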
4347 if (PreferPredicatedReductionSelect || 4348 TTI->preferPredicatedReductionSelect( 4349 RdxDesc.getOpcode(), PhiTy, 4350 TargetTransformInfo::ReductionFlags())) { 4351 auto *VecRdxPhi = 4352 cast<PHINode>(State.get(PhiR, Part)); 4353 VecRdxPhi->setIncomingValueForBlock( 4354 LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel); 4355 } 4356 } 4357 } 4358 4359 // If the vector reduction can be performed in a smaller type, we truncate 4360 // then extend the loop exit value to enable InstCombine to evaluate the 4361 // entire expression in the smaller type. 4362 if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) { 4363 assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!"); 4364 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); 4365 Builder.SetInsertPoint( 4366 LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator()); 4367 VectorParts RdxParts(UF); 4368 for (unsigned Part = 0; Part < UF; ++Part) { 4369 RdxParts[Part] = State.get(LoopExitInstDef, Part); 4370 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4371 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy) 4372 : Builder.CreateZExt(Trunc, VecTy); 4373 for (Value::user_iterator UI = RdxParts[Part]->user_begin(); 4374 UI != RdxParts[Part]->user_end();) 4375 if (*UI != Trunc) { 4376 (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd); 4377 RdxParts[Part] = Extnd; 4378 } else { 4379 ++UI; 4380 } 4381 } 4382 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 4383 for (unsigned Part = 0; Part < UF; ++Part) { 4384 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4385 State.reset(LoopExitInstDef, RdxParts[Part], Part); 4386 } 4387 } 4388 4389 // Reduce all of the unrolled parts into a single vector. 4390 Value *ReducedPartRdx = State.get(LoopExitInstDef, 0); 4391 unsigned Op = RecurrenceDescriptor::getOpcode(RK); 4392 4393 // The middle block terminator has already been assigned a DebugLoc here (the 4394 // OrigLoop's single latch terminator). We want the whole middle block to 4395 // appear to execute on this line because: (a) it is all compiler generated, 4396 // (b) these instructions are always executed after evaluating the latch 4397 // conditional branch, and (c) other passes may add new predecessors which 4398 // terminate on this line. This is the easiest way to ensure we don't 4399 // accidentally cause an extra step back into the loop while debugging. 4400 setDebugLocFromInst(LoopMiddleBlock->getTerminator()); 4401 if (PhiR->isOrdered()) 4402 ReducedPartRdx = State.get(LoopExitInstDef, UF - 1); 4403 else { 4404 // Floating-point operations should have some FMF to enable the reduction. 4405 IRBuilderBase::FastMathFlagGuard FMFG(Builder); 4406 Builder.setFastMathFlags(RdxDesc.getFastMathFlags()); 4407 for (unsigned Part = 1; Part < UF; ++Part) { 4408 Value *RdxPart = State.get(LoopExitInstDef, Part); 4409 if (Op != Instruction::ICmp && Op != Instruction::FCmp) { 4410 ReducedPartRdx = Builder.CreateBinOp( 4411 (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx"); 4412 } else if (RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK)) 4413 ReducedPartRdx = createSelectCmpOp(Builder, ReductionStartValue, RK, 4414 ReducedPartRdx, RdxPart); 4415 else 4416 ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart); 4417 } 4418 } 4419 4420 // Create the reduction after the loop. Note that inloop reductions create the 4421 // target reduction in the loop using a Reduction recipe. 
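// For example, an integer add reduction at VF=4 is expected to become a
// single llvm.vector.reduce.add.v4i32 call on the combined vector in the
// middle block.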
4422 if (VF.isVector() && !PhiR->isInLoop()) { 4423 ReducedPartRdx = 4424 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, OrigPhi); 4425 // If the reduction can be performed in a smaller type, we need to extend 4426 // the reduction to the wider type before we branch to the original loop. 4427 if (PhiTy != RdxDesc.getRecurrenceType()) 4428 ReducedPartRdx = RdxDesc.isSigned() 4429 ? Builder.CreateSExt(ReducedPartRdx, PhiTy) 4430 : Builder.CreateZExt(ReducedPartRdx, PhiTy); 4431 } 4432 4433 // Create a phi node that merges control-flow from the backedge-taken check 4434 // block and the middle block. 4435 PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx", 4436 LoopScalarPreHeader->getTerminator()); 4437 for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) 4438 BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]); 4439 BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); 4440 4441 // Now, we need to fix the users of the reduction variable 4442 // inside and outside of the scalar remainder loop. 4443 4444 // We know that the loop is in LCSSA form. We need to update the PHI nodes 4445 // in the exit blocks. See comment on analogous loop in 4446 // fixFirstOrderRecurrence for a more complete explanation of the logic. 4447 if (!Cost->requiresScalarEpilogue(VF)) 4448 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) 4449 if (llvm::is_contained(LCSSAPhi.incoming_values(), LoopExitInst)) 4450 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock); 4451 4452 // Fix the scalar loop reduction variable with the incoming reduction sum 4453 // from the vector body and from the backedge value. 4454 int IncomingEdgeBlockIdx = 4455 OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch()); 4456 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); 4457 // Pick the other block. 4458 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); 4459 OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); 4460 OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); 4461 } 4462 4463 void InnerLoopVectorizer::clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc, 4464 VPTransformState &State) { 4465 RecurKind RK = RdxDesc.getRecurrenceKind(); 4466 if (RK != RecurKind::Add && RK != RecurKind::Mul) 4467 return; 4468 4469 Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr(); 4470 assert(LoopExitInstr && "null loop exit instruction"); 4471 SmallVector<Instruction *, 8> Worklist; 4472 SmallPtrSet<Instruction *, 8> Visited; 4473 Worklist.push_back(LoopExitInstr); 4474 Visited.insert(LoopExitInstr); 4475 4476 while (!Worklist.empty()) { 4477 Instruction *Cur = Worklist.pop_back_val(); 4478 if (isa<OverflowingBinaryOperator>(Cur)) 4479 for (unsigned Part = 0; Part < UF; ++Part) { 4480 // FIXME: Should not rely on getVPValue at this point. 4481 Value *V = State.get(State.Plan->getVPValue(Cur, true), Part); 4482 cast<Instruction>(V)->dropPoisonGeneratingFlags(); 4483 } 4484 4485 for (User *U : Cur->users()) { 4486 Instruction *UI = cast<Instruction>(U); 4487 if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) && 4488 Visited.insert(UI).second) 4489 Worklist.push_back(UI); 4490 } 4491 } 4492 } 4493 4494 void InnerLoopVectorizer::fixLCSSAPHIs(VPTransformState &State) { 4495 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 4496 if (LCSSAPhi.getBasicBlockIndex(LoopMiddleBlock) != -1) 4497 // Some phis were already hand updated by the reduction and recurrence 4498 // code above, leave them alone.
4499 continue; 4500 4501 auto *IncomingValue = LCSSAPhi.getIncomingValue(0); 4502 // Non-instruction incoming values will have only one value. 4503 4504 VPLane Lane = VPLane::getFirstLane(); 4505 if (isa<Instruction>(IncomingValue) && 4506 !Cost->isUniformAfterVectorization(cast<Instruction>(IncomingValue), 4507 VF)) 4508 Lane = VPLane::getLastLaneForVF(VF); 4509 4510 // Can be a loop invariant incoming value or the last scalar value to be 4511 // extracted from the vectorized loop. 4512 // FIXME: Should not rely on getVPValue at this point. 4513 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4514 Value *lastIncomingValue = 4515 OrigLoop->isLoopInvariant(IncomingValue) 4516 ? IncomingValue 4517 : State.get(State.Plan->getVPValue(IncomingValue, true), 4518 VPIteration(UF - 1, Lane)); 4519 LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock); 4520 } 4521 } 4522 4523 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { 4524 // The basic block and loop containing the predicated instruction. 4525 auto *PredBB = PredInst->getParent(); 4526 auto *VectorLoop = LI->getLoopFor(PredBB); 4527 4528 // Initialize a worklist with the operands of the predicated instruction. 4529 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end()); 4530 4531 // Holds instructions that we need to analyze again. An instruction may be 4532 // reanalyzed if we don't yet know if we can sink it or not. 4533 SmallVector<Instruction *, 8> InstsToReanalyze; 4534 4535 // Returns true if a given use occurs in the predicated block. Phi nodes use 4536 // their operands in their corresponding predecessor blocks. 4537 auto isBlockOfUsePredicated = [&](Use &U) -> bool { 4538 auto *I = cast<Instruction>(U.getUser()); 4539 BasicBlock *BB = I->getParent(); 4540 if (auto *Phi = dyn_cast<PHINode>(I)) 4541 BB = Phi->getIncomingBlock( 4542 PHINode::getIncomingValueNumForOperand(U.getOperandNo())); 4543 return BB == PredBB; 4544 }; 4545 4546 // Iteratively sink the scalarized operands of the predicated instruction 4547 // into the block we created for it. When an instruction is sunk, it's 4548 // operands are then added to the worklist. The algorithm ends after one pass 4549 // through the worklist doesn't sink a single instruction. 4550 bool Changed; 4551 do { 4552 // Add the instructions that need to be reanalyzed to the worklist, and 4553 // reset the changed indicator. 4554 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end()); 4555 InstsToReanalyze.clear(); 4556 Changed = false; 4557 4558 while (!Worklist.empty()) { 4559 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val()); 4560 4561 // We can't sink an instruction if it is a phi node, is not in the loop, 4562 // or may have side effects. 4563 if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) || 4564 I->mayHaveSideEffects()) 4565 continue; 4566 4567 // If the instruction is already in PredBB, check if we can sink its 4568 // operands. In that case, VPlan's sinkScalarOperands() succeeded in 4569 // sinking the scalar instruction I, hence it appears in PredBB; but it 4570 // may have failed to sink I's operands (recursively), which we try 4571 // (again) here. 4572 if (I->getParent() == PredBB) { 4573 Worklist.insert(I->op_begin(), I->op_end()); 4574 continue; 4575 } 4576 4577 // It's legal to sink the instruction if all its uses occur in the 4578 // predicated block. Otherwise, there's nothing to do yet, and we may 4579 // need to reanalyze the instruction. 
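// (A typical candidate is a scalarized address computation that feeds only a
// predicated load or store: once all of its uses live in PredBB, it can be
// moved into PredBB as well.)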
4580 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) { 4581 InstsToReanalyze.push_back(I); 4582 continue; 4583 } 4584 4585 // Move the instruction to the beginning of the predicated block, and add 4586 // it's operands to the worklist. 4587 I->moveBefore(&*PredBB->getFirstInsertionPt()); 4588 Worklist.insert(I->op_begin(), I->op_end()); 4589 4590 // The sinking may have enabled other instructions to be sunk, so we will 4591 // need to iterate. 4592 Changed = true; 4593 } 4594 } while (Changed); 4595 } 4596 4597 void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) { 4598 for (PHINode *OrigPhi : OrigPHIsToFix) { 4599 VPWidenPHIRecipe *VPPhi = 4600 cast<VPWidenPHIRecipe>(State.Plan->getVPValue(OrigPhi)); 4601 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0)); 4602 // Make sure the builder has a valid insert point. 4603 Builder.SetInsertPoint(NewPhi); 4604 for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) { 4605 VPValue *Inc = VPPhi->getIncomingValue(i); 4606 VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i); 4607 NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]); 4608 } 4609 } 4610 } 4611 4612 bool InnerLoopVectorizer::useOrderedReductions(RecurrenceDescriptor &RdxDesc) { 4613 return Cost->useOrderedReductions(RdxDesc); 4614 } 4615 4616 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, 4617 VPUser &Operands, unsigned UF, 4618 ElementCount VF, bool IsPtrLoopInvariant, 4619 SmallBitVector &IsIndexLoopInvariant, 4620 VPTransformState &State) { 4621 // Construct a vector GEP by widening the operands of the scalar GEP as 4622 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP 4623 // results in a vector of pointers when at least one operand of the GEP 4624 // is vector-typed. Thus, to keep the representation compact, we only use 4625 // vector-typed operands for loop-varying values. 4626 4627 if (VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { 4628 // If we are vectorizing, but the GEP has only loop-invariant operands, 4629 // the GEP we build (by only using vector-typed operands for 4630 // loop-varying values) would be a scalar pointer. Thus, to ensure we 4631 // produce a vector of pointers, we need to either arbitrarily pick an 4632 // operand to broadcast, or broadcast a clone of the original GEP. 4633 // Here, we broadcast a clone of the original. 4634 // 4635 // TODO: If at some point we decide to scalarize instructions having 4636 // loop-invariant operands, this special case will no longer be 4637 // required. We would add the scalarization decision to 4638 // collectLoopScalars() and teach getVectorValue() to broadcast 4639 // the lane-zero scalar value. 4640 auto *Clone = Builder.Insert(GEP->clone()); 4641 for (unsigned Part = 0; Part < UF; ++Part) { 4642 Value *EntryPart = Builder.CreateVectorSplat(VF, Clone); 4643 State.set(VPDef, EntryPart, Part); 4644 addMetadata(EntryPart, GEP); 4645 } 4646 } else { 4647 // If the GEP has at least one loop-varying operand, we are sure to 4648 // produce a vector of pointers. But if we are only unrolling, we want 4649 // to produce a scalar GEP for each unroll part. Thus, the GEP we 4650 // produce with the code below will be scalar (if VF == 1) or vector 4651 // (otherwise). Note that for the unroll-only case, we still maintain 4652 // values in the vector mapping with initVector, as we do for other 4653 // instructions. 4654 for (unsigned Part = 0; Part < UF; ++Part) { 4655 // The pointer operand of the new GEP. 
If it's loop-invariant, we 4656 // won't broadcast it. 4657 auto *Ptr = IsPtrLoopInvariant 4658 ? State.get(Operands.getOperand(0), VPIteration(0, 0)) 4659 : State.get(Operands.getOperand(0), Part); 4660 4661 // Collect all the indices for the new GEP. If any index is 4662 // loop-invariant, we won't broadcast it. 4663 SmallVector<Value *, 4> Indices; 4664 for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) { 4665 VPValue *Operand = Operands.getOperand(I); 4666 if (IsIndexLoopInvariant[I - 1]) 4667 Indices.push_back(State.get(Operand, VPIteration(0, 0))); 4668 else 4669 Indices.push_back(State.get(Operand, Part)); 4670 } 4671 4672 // Create the new GEP. Note that this GEP may be a scalar if VF == 1, 4673 // but it should be a vector, otherwise. 4674 auto *NewGEP = 4675 GEP->isInBounds() 4676 ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr, 4677 Indices) 4678 : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices); 4679 assert((VF.isScalar() || NewGEP->getType()->isVectorTy()) && 4680 "NewGEP is not a pointer vector"); 4681 State.set(VPDef, NewGEP, Part); 4682 addMetadata(NewGEP, GEP); 4683 } 4684 } 4685 } 4686 4687 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, 4688 VPWidenPHIRecipe *PhiR, 4689 VPTransformState &State) { 4690 PHINode *P = cast<PHINode>(PN); 4691 if (EnableVPlanNativePath) { 4692 // Currently we enter here in the VPlan-native path for non-induction 4693 // PHIs where all control flow is uniform. We simply widen these PHIs. 4694 // Create a vector phi with no operands - the vector phi operands will be 4695 // set at the end of vector code generation. 4696 Type *VecTy = (State.VF.isScalar()) 4697 ? PN->getType() 4698 : VectorType::get(PN->getType(), State.VF); 4699 Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi"); 4700 State.set(PhiR, VecPhi, 0); 4701 OrigPHIsToFix.push_back(P); 4702 4703 return; 4704 } 4705 4706 assert(PN->getParent() == OrigLoop->getHeader() && 4707 "Non-header phis should have been handled elsewhere"); 4708 4709 // In order to support recurrences we need to be able to vectorize Phi nodes. 4710 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 4711 // stage #1: We create a new vector PHI node with no incoming edges. We'll use 4712 // this value when we vectorize all of the instructions that use the PHI. 4713 4714 assert(!Legal->isReductionVariable(P) && 4715 "reductions should be handled elsewhere"); 4716 4717 setDebugLocFromInst(P); 4718 4719 // This PHINode must be an induction variable. 4720 // Make sure that we know about it. 4721 assert(Legal->getInductionVars().count(P) && "Not an induction variable"); 4722 4723 InductionDescriptor II = Legal->getInductionVars().lookup(P); 4724 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 4725 4726 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 4727 // which can be found from the original scalar operations. 4728 switch (II.getKind()) { 4729 case InductionDescriptor::IK_NoInduction: 4730 llvm_unreachable("Unknown induction"); 4731 case InductionDescriptor::IK_IntInduction: 4732 case InductionDescriptor::IK_FpInduction: 4733 llvm_unreachable("Integer/fp induction is handled elsewhere."); 4734 case InductionDescriptor::IK_PtrInduction: { 4735 // Handle the pointer induction variable case. 
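// Two code paths follow. If the pointer IV is scalar after vectorization, we
// derive one scalar GEP per (part, lane) from the canonical induction via
// emitTransformedIndex. Otherwise we build a pointer PHI that is advanced by
// Step * VF * UF every vector iteration and compute per-part vector GEPs from
// it using a step-vector offset.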
4736 assert(P->getType()->isPointerTy() && "Unexpected type."); 4737 4738 if (Cost->isScalarAfterVectorization(P, State.VF)) { 4739 // This is the normalized GEP that starts counting at zero. 4740 Value *PtrInd = 4741 Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType()); 4742 // Determine the number of scalars we need to generate for each unroll 4743 // iteration. If the instruction is uniform, we only need to generate the 4744 // first lane. Otherwise, we generate all VF values. 4745 bool IsUniform = Cost->isUniformAfterVectorization(P, State.VF); 4746 unsigned Lanes = IsUniform ? 1 : State.VF.getKnownMinValue(); 4747 4748 bool NeedsVectorIndex = !IsUniform && VF.isScalable(); 4749 Value *UnitStepVec = nullptr, *PtrIndSplat = nullptr; 4750 if (NeedsVectorIndex) { 4751 Type *VecIVTy = VectorType::get(PtrInd->getType(), VF); 4752 UnitStepVec = Builder.CreateStepVector(VecIVTy); 4753 PtrIndSplat = Builder.CreateVectorSplat(VF, PtrInd); 4754 } 4755 4756 for (unsigned Part = 0; Part < UF; ++Part) { 4757 Value *PartStart = createStepForVF( 4758 Builder, ConstantInt::get(PtrInd->getType(), Part), VF); 4759 4760 if (NeedsVectorIndex) { 4761 // Here we cache the whole vector, which means we can support the 4762 // extraction of any lane. However, in some cases the extractelement 4763 // instruction that is generated for scalar uses of this vector (e.g. 4764 // a load instruction) is not folded away. Therefore we still 4765 // calculate values for the first n lanes to avoid redundant moves 4766 // (when extracting the 0th element) and to produce scalar code (i.e. 4767 // additional add/gep instructions instead of expensive extractelement 4768 // instructions) when extracting higher-order elements. 4769 Value *PartStartSplat = Builder.CreateVectorSplat(VF, PartStart); 4770 Value *Indices = Builder.CreateAdd(PartStartSplat, UnitStepVec); 4771 Value *GlobalIndices = Builder.CreateAdd(PtrIndSplat, Indices); 4772 Value *SclrGep = 4773 emitTransformedIndex(Builder, GlobalIndices, PSE.getSE(), DL, II); 4774 SclrGep->setName("next.gep"); 4775 State.set(PhiR, SclrGep, Part); 4776 } 4777 4778 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 4779 Value *Idx = Builder.CreateAdd( 4780 PartStart, ConstantInt::get(PtrInd->getType(), Lane)); 4781 Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); 4782 Value *SclrGep = 4783 emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II); 4784 SclrGep->setName("next.gep"); 4785 State.set(PhiR, SclrGep, VPIteration(Part, Lane)); 4786 } 4787 } 4788 return; 4789 } 4790 assert(isa<SCEVConstant>(II.getStep()) && 4791 "Induction step not a SCEV constant!"); 4792 Type *PhiType = II.getStep()->getType(); 4793 4794 // Build a pointer phi 4795 Value *ScalarStartValue = II.getStartValue(); 4796 Type *ScStValueType = ScalarStartValue->getType(); 4797 PHINode *NewPointerPhi = 4798 PHINode::Create(ScStValueType, 2, "pointer.phi", Induction); 4799 NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader); 4800 4801 // A pointer induction, performed by using a gep 4802 BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 4803 Instruction *InductionLoc = LoopLatch->getTerminator(); 4804 const SCEV *ScalarStep = II.getStep(); 4805 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 4806 Value *ScalarStepValue = 4807 Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc); 4808 Value *RuntimeVF = getRuntimeVF(Builder, PhiType, VF); 4809 Value *NumUnrolledElems = 4810 Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF)); 4811 Value *InductionGEP 
= GetElementPtrInst::Create( 4812 II.getElementType(), NewPointerPhi, 4813 Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind", 4814 InductionLoc); 4815 NewPointerPhi->addIncoming(InductionGEP, LoopLatch); 4816 4817 // Create UF many actual address geps that use the pointer 4818 // phi as base and a vectorized version of the step value 4819 // (<step*0, ..., step*N>) as offset. 4820 for (unsigned Part = 0; Part < State.UF; ++Part) { 4821 Type *VecPhiType = VectorType::get(PhiType, State.VF); 4822 Value *StartOffsetScalar = 4823 Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part)); 4824 Value *StartOffset = 4825 Builder.CreateVectorSplat(State.VF, StartOffsetScalar); 4826 // Create a vector of consecutive numbers from zero to VF. 4827 StartOffset = 4828 Builder.CreateAdd(StartOffset, Builder.CreateStepVector(VecPhiType)); 4829 4830 Value *GEP = Builder.CreateGEP( 4831 II.getElementType(), NewPointerPhi, 4832 Builder.CreateMul( 4833 StartOffset, Builder.CreateVectorSplat(State.VF, ScalarStepValue), 4834 "vector.gep")); 4835 State.set(PhiR, GEP, Part); 4836 } 4837 } 4838 } 4839 } 4840 4841 /// A helper function for checking whether an integer division-related 4842 /// instruction may divide by zero (in which case it must be predicated if 4843 /// executed conditionally in the scalar code). 4844 /// TODO: It may be worthwhile to generalize and check isKnownNonZero(). 4845 /// Non-zero divisors that are non compile-time constants will not be 4846 /// converted into multiplication, so we will still end up scalarizing 4847 /// the division, but can do so w/o predication. 4848 static bool mayDivideByZero(Instruction &I) { 4849 assert((I.getOpcode() == Instruction::UDiv || 4850 I.getOpcode() == Instruction::SDiv || 4851 I.getOpcode() == Instruction::URem || 4852 I.getOpcode() == Instruction::SRem) && 4853 "Unexpected instruction"); 4854 Value *Divisor = I.getOperand(1); 4855 auto *CInt = dyn_cast<ConstantInt>(Divisor); 4856 return !CInt || CInt->isZero(); 4857 } 4858 4859 void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def, 4860 VPUser &User, 4861 VPTransformState &State) { 4862 switch (I.getOpcode()) { 4863 case Instruction::Call: 4864 case Instruction::Br: 4865 case Instruction::PHI: 4866 case Instruction::GetElementPtr: 4867 case Instruction::Select: 4868 llvm_unreachable("This instruction is handled by a different recipe."); 4869 case Instruction::UDiv: 4870 case Instruction::SDiv: 4871 case Instruction::SRem: 4872 case Instruction::URem: 4873 case Instruction::Add: 4874 case Instruction::FAdd: 4875 case Instruction::Sub: 4876 case Instruction::FSub: 4877 case Instruction::FNeg: 4878 case Instruction::Mul: 4879 case Instruction::FMul: 4880 case Instruction::FDiv: 4881 case Instruction::FRem: 4882 case Instruction::Shl: 4883 case Instruction::LShr: 4884 case Instruction::AShr: 4885 case Instruction::And: 4886 case Instruction::Or: 4887 case Instruction::Xor: { 4888 // Just widen unops and binops. 4889 setDebugLocFromInst(&I); 4890 4891 for (unsigned Part = 0; Part < UF; ++Part) { 4892 SmallVector<Value *, 2> Ops; 4893 for (VPValue *VPOp : User.operands()) 4894 Ops.push_back(State.get(VPOp, Part)); 4895 4896 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); 4897 4898 if (auto *VecOp = dyn_cast<Instruction>(V)) 4899 VecOp->copyIRFlags(&I); 4900 4901 // Use this vector value for all users of the original instruction. 
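// (With UF > 1, one widened value is recorded per unroll part.)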
4902 State.set(Def, V, Part); 4903 addMetadata(V, &I); 4904 } 4905 4906 break; 4907 } 4908 case Instruction::ICmp: 4909 case Instruction::FCmp: { 4910 // Widen compares. Generate vector compares. 4911 bool FCmp = (I.getOpcode() == Instruction::FCmp); 4912 auto *Cmp = cast<CmpInst>(&I); 4913 setDebugLocFromInst(Cmp); 4914 for (unsigned Part = 0; Part < UF; ++Part) { 4915 Value *A = State.get(User.getOperand(0), Part); 4916 Value *B = State.get(User.getOperand(1), Part); 4917 Value *C = nullptr; 4918 if (FCmp) { 4919 // Propagate fast math flags. 4920 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 4921 Builder.setFastMathFlags(Cmp->getFastMathFlags()); 4922 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); 4923 } else { 4924 C = Builder.CreateICmp(Cmp->getPredicate(), A, B); 4925 } 4926 State.set(Def, C, Part); 4927 addMetadata(C, &I); 4928 } 4929 4930 break; 4931 } 4932 4933 case Instruction::ZExt: 4934 case Instruction::SExt: 4935 case Instruction::FPToUI: 4936 case Instruction::FPToSI: 4937 case Instruction::FPExt: 4938 case Instruction::PtrToInt: 4939 case Instruction::IntToPtr: 4940 case Instruction::SIToFP: 4941 case Instruction::UIToFP: 4942 case Instruction::Trunc: 4943 case Instruction::FPTrunc: 4944 case Instruction::BitCast: { 4945 auto *CI = cast<CastInst>(&I); 4946 setDebugLocFromInst(CI); 4947 4948 /// Vectorize casts. 4949 Type *DestTy = 4950 (VF.isScalar()) ? CI->getType() : VectorType::get(CI->getType(), VF); 4951 4952 for (unsigned Part = 0; Part < UF; ++Part) { 4953 Value *A = State.get(User.getOperand(0), Part); 4954 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); 4955 State.set(Def, Cast, Part); 4956 addMetadata(Cast, &I); 4957 } 4958 break; 4959 } 4960 default: 4961 // This instruction is not vectorized by simple widening. 4962 LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); 4963 llvm_unreachable("Unhandled instruction!"); 4964 } // end of switch. 4965 } 4966 4967 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, 4968 VPUser &ArgOperands, 4969 VPTransformState &State) { 4970 assert(!isa<DbgInfoIntrinsic>(I) && 4971 "DbgInfoIntrinsic should have been dropped during VPlan construction"); 4972 setDebugLocFromInst(&I); 4973 4974 Module *M = I.getParent()->getParent()->getParent(); 4975 auto *CI = cast<CallInst>(&I); 4976 4977 SmallVector<Type *, 4> Tys; 4978 for (Value *ArgOperand : CI->args()) 4979 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue())); 4980 4981 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 4982 4983 // The flag shows whether we use Intrinsic or a usual Call for vectorized 4984 // version of the instruction. 4985 // Is it beneficial to perform intrinsic call compared to lib call? 4986 bool NeedToScalarize = false; 4987 InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); 4988 InstructionCost IntrinsicCost = ID ? Cost->getVectorIntrinsicCost(CI, VF) : 0; 4989 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 4990 assert((UseVectorIntrinsic || !NeedToScalarize) && 4991 "Instruction should be scalarized elsewhere."); 4992 assert((IntrinsicCost.isValid() || CallCost.isValid()) && 4993 "Either the intrinsic cost or vector call cost must be valid"); 4994 4995 for (unsigned Part = 0; Part < UF; ++Part) { 4996 SmallVector<Type *, 2> TysForDecl = {CI->getType()}; 4997 SmallVector<Value *, 4> Args; 4998 for (auto &I : enumerate(ArgOperands.operands())) { 4999 // Some intrinsics have a scalar argument - don't replace it with a 5000 // vector. 
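// (For example, assuming the usual intrinsic definitions, the integer
// exponent operand of llvm.powi must remain scalar.)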
5001 Value *Arg; 5002 if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index())) 5003 Arg = State.get(I.value(), Part); 5004 else { 5005 Arg = State.get(I.value(), VPIteration(0, 0)); 5006 if (hasVectorInstrinsicOverloadedScalarOpd(ID, I.index())) 5007 TysForDecl.push_back(Arg->getType()); 5008 } 5009 Args.push_back(Arg); 5010 } 5011 5012 Function *VectorF; 5013 if (UseVectorIntrinsic) { 5014 // Use vector version of the intrinsic. 5015 if (VF.isVector()) 5016 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); 5017 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); 5018 assert(VectorF && "Can't retrieve vector intrinsic."); 5019 } else { 5020 // Use vector version of the function call. 5021 const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 5022 #ifndef NDEBUG 5023 assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr && 5024 "Can't create vector function."); 5025 #endif 5026 VectorF = VFDatabase(*CI).getVectorizedFunction(Shape); 5027 } 5028 SmallVector<OperandBundleDef, 1> OpBundles; 5029 CI->getOperandBundlesAsDefs(OpBundles); 5030 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles); 5031 5032 if (isa<FPMathOperator>(V)) 5033 V->copyFastMathFlags(CI); 5034 5035 State.set(Def, V, Part); 5036 addMetadata(V, &I); 5037 } 5038 } 5039 5040 void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, VPValue *VPDef, 5041 VPUser &Operands, 5042 bool InvariantCond, 5043 VPTransformState &State) { 5044 setDebugLocFromInst(&I); 5045 5046 // The condition can be loop invariant but still defined inside the 5047 // loop. This means that we can't just use the original 'cond' value. 5048 // We have to take the 'vectorized' value and pick the first lane. 5049 // Instcombine will make this a no-op. 5050 auto *InvarCond = InvariantCond 5051 ? State.get(Operands.getOperand(0), VPIteration(0, 0)) 5052 : nullptr; 5053 5054 for (unsigned Part = 0; Part < UF; ++Part) { 5055 Value *Cond = 5056 InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part); 5057 Value *Op0 = State.get(Operands.getOperand(1), Part); 5058 Value *Op1 = State.get(Operands.getOperand(2), Part); 5059 Value *Sel = Builder.CreateSelect(Cond, Op0, Op1); 5060 State.set(VPDef, Sel, Part); 5061 addMetadata(Sel, &I); 5062 } 5063 } 5064 5065 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { 5066 // We should not collect Scalars more than once per VF. Right now, this 5067 // function is called from collectUniformsAndScalars(), which already does 5068 // this check. Collecting Scalars for VF=1 does not make any sense. 5069 assert(VF.isVector() && Scalars.find(VF) == Scalars.end() && 5070 "This function should not be visited twice for the same VF"); 5071 5072 SmallSetVector<Instruction *, 8> Worklist; 5073 5074 // These sets are used to seed the analysis with pointers used by memory 5075 // accesses that will remain scalar. 5076 SmallSetVector<Instruction *, 8> ScalarPtrs; 5077 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; 5078 auto *Latch = TheLoop->getLoopLatch(); 5079 5080 // A helper that returns true if the use of Ptr by MemAccess will be scalar. 5081 // The pointer operands of loads and stores will be scalar as long as the 5082 // memory access is not a gather or scatter operation. The value operand of a 5083 // store will remain scalar if the store is scalarized. 
5084 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) { 5085 InstWidening WideningDecision = getWideningDecision(MemAccess, VF); 5086 assert(WideningDecision != CM_Unknown && 5087 "Widening decision should be ready at this moment"); 5088 if (auto *Store = dyn_cast<StoreInst>(MemAccess)) 5089 if (Ptr == Store->getValueOperand()) 5090 return WideningDecision == CM_Scalarize; 5091 assert(Ptr == getLoadStorePointerOperand(MemAccess) && 5092 "Ptr is neither a value or pointer operand"); 5093 return WideningDecision != CM_GatherScatter; 5094 }; 5095 5096 // A helper that returns true if the given value is a bitcast or 5097 // getelementptr instruction contained in the loop. 5098 auto isLoopVaryingBitCastOrGEP = [&](Value *V) { 5099 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) || 5100 isa<GetElementPtrInst>(V)) && 5101 !TheLoop->isLoopInvariant(V); 5102 }; 5103 5104 auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) { 5105 if (!isa<PHINode>(Ptr) || 5106 !Legal->getInductionVars().count(cast<PHINode>(Ptr))) 5107 return false; 5108 auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)]; 5109 if (Induction.getKind() != InductionDescriptor::IK_PtrInduction) 5110 return false; 5111 return isScalarUse(MemAccess, Ptr); 5112 }; 5113 5114 // A helper that evaluates a memory access's use of a pointer. If the 5115 // pointer is actually the pointer induction of a loop, it is being 5116 // inserted into Worklist. If the use will be a scalar use, and the 5117 // pointer is only used by memory accesses, we place the pointer in 5118 // ScalarPtrs. Otherwise, the pointer is placed in PossibleNonScalarPtrs. 5119 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) { 5120 if (isScalarPtrInduction(MemAccess, Ptr)) { 5121 Worklist.insert(cast<Instruction>(Ptr)); 5122 LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr 5123 << "\n"); 5124 5125 Instruction *Update = cast<Instruction>( 5126 cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch)); 5127 5128 // If there is more than one user of Update (Ptr), we shouldn't assume it 5129 // will be scalar after vectorisation as other users of the instruction 5130 // may require widening. Otherwise, add it to ScalarPtrs. 5131 if (Update->hasOneUse() && cast<Value>(*Update->user_begin()) == Ptr) { 5132 ScalarPtrs.insert(Update); 5133 return; 5134 } 5135 } 5136 // We only care about bitcast and getelementptr instructions contained in 5137 // the loop. 5138 if (!isLoopVaryingBitCastOrGEP(Ptr)) 5139 return; 5140 5141 // If the pointer has already been identified as scalar (e.g., if it was 5142 // also identified as uniform), there's nothing to do. 5143 auto *I = cast<Instruction>(Ptr); 5144 if (Worklist.count(I)) 5145 return; 5146 5147 // If the use of the pointer will be a scalar use, and all users of the 5148 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise, 5149 // place the pointer in PossibleNonScalarPtrs. 5150 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) { 5151 return isa<LoadInst>(U) || isa<StoreInst>(U); 5152 })) 5153 ScalarPtrs.insert(I); 5154 else 5155 PossibleNonScalarPtrs.insert(I); 5156 }; 5157 5158 // We seed the scalars analysis with three classes of instructions: (1) 5159 // instructions marked uniform-after-vectorization and (2) bitcast, 5160 // getelementptr and (pointer) phi instructions used by memory accesses 5161 // requiring a scalar use. 
5162 // 5163 // (1) Add to the worklist all instructions that have been identified as 5164 // uniform-after-vectorization. 5165 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end()); 5166 5167 // (2) Add to the worklist all bitcast and getelementptr instructions used by 5168 // memory accesses requiring a scalar use. The pointer operands of loads and 5169 // stores will be scalar as long as the memory accesses is not a gather or 5170 // scatter operation. The value operand of a store will remain scalar if the 5171 // store is scalarized. 5172 for (auto *BB : TheLoop->blocks()) 5173 for (auto &I : *BB) { 5174 if (auto *Load = dyn_cast<LoadInst>(&I)) { 5175 evaluatePtrUse(Load, Load->getPointerOperand()); 5176 } else if (auto *Store = dyn_cast<StoreInst>(&I)) { 5177 evaluatePtrUse(Store, Store->getPointerOperand()); 5178 evaluatePtrUse(Store, Store->getValueOperand()); 5179 } 5180 } 5181 for (auto *I : ScalarPtrs) 5182 if (!PossibleNonScalarPtrs.count(I)) { 5183 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n"); 5184 Worklist.insert(I); 5185 } 5186 5187 // Insert the forced scalars. 5188 // FIXME: Currently widenPHIInstruction() often creates a dead vector 5189 // induction variable when the PHI user is scalarized. 5190 auto ForcedScalar = ForcedScalars.find(VF); 5191 if (ForcedScalar != ForcedScalars.end()) 5192 for (auto *I : ForcedScalar->second) 5193 Worklist.insert(I); 5194 5195 // Expand the worklist by looking through any bitcasts and getelementptr 5196 // instructions we've already identified as scalar. This is similar to the 5197 // expansion step in collectLoopUniforms(); however, here we're only 5198 // expanding to include additional bitcasts and getelementptr instructions. 5199 unsigned Idx = 0; 5200 while (Idx != Worklist.size()) { 5201 Instruction *Dst = Worklist[Idx++]; 5202 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0))) 5203 continue; 5204 auto *Src = cast<Instruction>(Dst->getOperand(0)); 5205 if (llvm::all_of(Src->users(), [&](User *U) -> bool { 5206 auto *J = cast<Instruction>(U); 5207 return !TheLoop->contains(J) || Worklist.count(J) || 5208 ((isa<LoadInst>(J) || isa<StoreInst>(J)) && 5209 isScalarUse(J, Src)); 5210 })) { 5211 Worklist.insert(Src); 5212 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); 5213 } 5214 } 5215 5216 // An induction variable will remain scalar if all users of the induction 5217 // variable and induction variable update remain scalar. 5218 for (auto &Induction : Legal->getInductionVars()) { 5219 auto *Ind = Induction.first; 5220 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 5221 5222 // If tail-folding is applied, the primary induction variable will be used 5223 // to feed a vector compare. 5224 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking()) 5225 continue; 5226 5227 // Determine if all users of the induction variable are scalar after 5228 // vectorization. 5229 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 5230 auto *I = cast<Instruction>(U); 5231 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I); 5232 }); 5233 if (!ScalarInd) 5234 continue; 5235 5236 // Determine if all users of the induction variable update instruction are 5237 // scalar after vectorization. 
5238 auto ScalarIndUpdate = 5239 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 5240 auto *I = cast<Instruction>(U); 5241 return I == Ind || !TheLoop->contains(I) || Worklist.count(I); 5242 }); 5243 if (!ScalarIndUpdate) 5244 continue; 5245 5246 // The induction variable and its update instruction will remain scalar. 5247 Worklist.insert(Ind); 5248 Worklist.insert(IndUpdate); 5249 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 5250 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 5251 << "\n"); 5252 } 5253 5254 Scalars[VF].insert(Worklist.begin(), Worklist.end()); 5255 } 5256 5257 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I) const { 5258 if (!blockNeedsPredication(I->getParent())) 5259 return false; 5260 switch(I->getOpcode()) { 5261 default: 5262 break; 5263 case Instruction::Load: 5264 case Instruction::Store: { 5265 if (!Legal->isMaskRequired(I)) 5266 return false; 5267 auto *Ptr = getLoadStorePointerOperand(I); 5268 auto *Ty = getLoadStoreType(I); 5269 const Align Alignment = getLoadStoreAlignment(I); 5270 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) || 5271 TTI.isLegalMaskedGather(Ty, Alignment)) 5272 : !(isLegalMaskedStore(Ty, Ptr, Alignment) || 5273 TTI.isLegalMaskedScatter(Ty, Alignment)); 5274 } 5275 case Instruction::UDiv: 5276 case Instruction::SDiv: 5277 case Instruction::SRem: 5278 case Instruction::URem: 5279 return mayDivideByZero(*I); 5280 } 5281 return false; 5282 } 5283 5284 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( 5285 Instruction *I, ElementCount VF) { 5286 assert(isAccessInterleaved(I) && "Expecting interleaved access."); 5287 assert(getWideningDecision(I, VF) == CM_Unknown && 5288 "Decision should not be set yet."); 5289 auto *Group = getInterleavedAccessGroup(I); 5290 assert(Group && "Must have a group."); 5291 5292 // If the instruction's allocated size doesn't equal it's type size, it 5293 // requires padding and will be scalarized. 5294 auto &DL = I->getModule()->getDataLayout(); 5295 auto *ScalarTy = getLoadStoreType(I); 5296 if (hasIrregularType(ScalarTy, DL)) 5297 return false; 5298 5299 // Check if masking is required. 5300 // A Group may need masking for one of two reasons: it resides in a block that 5301 // needs predication, or it was decided to use masking to deal with gaps 5302 // (either a gap at the end of a load-access that may result in a speculative 5303 // load, or any gaps in a store-access). 5304 bool PredicatedAccessRequiresMasking = 5305 blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I); 5306 bool LoadAccessWithGapsRequiresEpilogMasking = 5307 isa<LoadInst>(I) && Group->requiresScalarEpilogue() && 5308 !isScalarEpilogueAllowed(); 5309 bool StoreAccessWithGapsRequiresMasking = 5310 isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()); 5311 if (!PredicatedAccessRequiresMasking && 5312 !LoadAccessWithGapsRequiresEpilogMasking && 5313 !StoreAccessWithGapsRequiresMasking) 5314 return true; 5315 5316 // If masked interleaving is required, we expect that the user/target had 5317 // enabled it, because otherwise it either wouldn't have been created or 5318 // it should have been invalidated by the CostModel. 
5319 assert(useMaskedInterleavedAccesses(TTI) && 5320 "Masked interleave-groups for predicated accesses are not enabled."); 5321 5322 if (Group->isReverse()) 5323 return false; 5324 5325 auto *Ty = getLoadStoreType(I); 5326 const Align Alignment = getLoadStoreAlignment(I); 5327 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment) 5328 : TTI.isLegalMaskedStore(Ty, Alignment); 5329 } 5330 5331 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( 5332 Instruction *I, ElementCount VF) { 5333 // Get and ensure we have a valid memory instruction. 5334 assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction"); 5335 5336 auto *Ptr = getLoadStorePointerOperand(I); 5337 auto *ScalarTy = getLoadStoreType(I); 5338 5339 // In order to be widened, the pointer should be consecutive, first of all. 5340 if (!Legal->isConsecutivePtr(ScalarTy, Ptr)) 5341 return false; 5342 5343 // If the instruction is a store located in a predicated block, it will be 5344 // scalarized. 5345 if (isScalarWithPredication(I)) 5346 return false; 5347 5348 // If the instruction's allocated size doesn't equal it's type size, it 5349 // requires padding and will be scalarized. 5350 auto &DL = I->getModule()->getDataLayout(); 5351 if (hasIrregularType(ScalarTy, DL)) 5352 return false; 5353 5354 return true; 5355 } 5356 5357 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { 5358 // We should not collect Uniforms more than once per VF. Right now, 5359 // this function is called from collectUniformsAndScalars(), which 5360 // already does this check. Collecting Uniforms for VF=1 does not make any 5361 // sense. 5362 5363 assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() && 5364 "This function should not be visited twice for the same VF"); 5365 5366 // Visit the list of Uniforms. If we'll not find any uniform value, we'll 5367 // not analyze again. Uniforms.count(VF) will return 1. 5368 Uniforms[VF].clear(); 5369 5370 // We now know that the loop is vectorizable! 5371 // Collect instructions inside the loop that will remain uniform after 5372 // vectorization. 5373 5374 // Global values, params and instructions outside of current loop are out of 5375 // scope. 5376 auto isOutOfScope = [&](Value *V) -> bool { 5377 Instruction *I = dyn_cast<Instruction>(V); 5378 return (!I || !TheLoop->contains(I)); 5379 }; 5380 5381 // Worklist containing uniform instructions demanding lane 0. 5382 SetVector<Instruction *> Worklist; 5383 BasicBlock *Latch = TheLoop->getLoopLatch(); 5384 5385 // Add uniform instructions demanding lane 0 to the worklist. Instructions 5386 // that are scalar with predication must not be considered uniform after 5387 // vectorization, because that would create an erroneous replicating region 5388 // where only a single instance out of VF should be formed. 5389 // TODO: optimize such seldom cases if found important, see PR40816. 5390 auto addToWorklistIfAllowed = [&](Instruction *I) -> void { 5391 if (isOutOfScope(I)) { 5392 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: " 5393 << *I << "\n"); 5394 return; 5395 } 5396 if (isScalarWithPredication(I)) { 5397 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " 5398 << *I << "\n"); 5399 return; 5400 } 5401 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n"); 5402 Worklist.insert(I); 5403 }; 5404 5405 // Start with the conditional branch. If the branch condition is an 5406 // instruction contained in the loop that is only used by the branch, it is 5407 // uniform. 
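// (The typical case is the icmp that feeds the latch branch: only the branch
// consumes it, so a single scalar compare per vector iteration suffices.)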
5408 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 5409 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) 5410 addToWorklistIfAllowed(Cmp); 5411 5412 auto isUniformDecision = [&](Instruction *I, ElementCount VF) { 5413 InstWidening WideningDecision = getWideningDecision(I, VF); 5414 assert(WideningDecision != CM_Unknown && 5415 "Widening decision should be ready at this moment"); 5416 5417 // A uniform memory op is itself uniform. We exclude uniform stores 5418 // here as they demand the last lane, not the first one. 5419 if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) { 5420 assert(WideningDecision == CM_Scalarize); 5421 return true; 5422 } 5423 5424 return (WideningDecision == CM_Widen || 5425 WideningDecision == CM_Widen_Reverse || 5426 WideningDecision == CM_Interleave); 5427 }; 5428 5429 5430 // Returns true if Ptr is the pointer operand of a memory access instruction 5431 // I, and I is known to not require scalarization. 5432 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { 5433 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); 5434 }; 5435 5436 // Holds a list of values which are known to have at least one uniform use. 5437 // Note that there may be other uses which aren't uniform. A "uniform use" 5438 // here is something which only demands lane 0 of the unrolled iterations; 5439 // it does not imply that all lanes produce the same value (e.g. this is not 5440 // the usual meaning of uniform) 5441 SetVector<Value *> HasUniformUse; 5442 5443 // Scan the loop for instructions which are either a) known to have only 5444 // lane 0 demanded or b) are uses which demand only lane 0 of their operand. 5445 for (auto *BB : TheLoop->blocks()) 5446 for (auto &I : *BB) { 5447 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) { 5448 switch (II->getIntrinsicID()) { 5449 case Intrinsic::sideeffect: 5450 case Intrinsic::experimental_noalias_scope_decl: 5451 case Intrinsic::assume: 5452 case Intrinsic::lifetime_start: 5453 case Intrinsic::lifetime_end: 5454 if (TheLoop->hasLoopInvariantOperands(&I)) 5455 addToWorklistIfAllowed(&I); 5456 break; 5457 default: 5458 break; 5459 } 5460 } 5461 5462 // ExtractValue instructions must be uniform, because the operands are 5463 // known to be loop-invariant. 5464 if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) { 5465 assert(isOutOfScope(EVI->getAggregateOperand()) && 5466 "Expected aggregate value to be loop invariant"); 5467 addToWorklistIfAllowed(EVI); 5468 continue; 5469 } 5470 5471 // If there's no pointer operand, there's nothing to do. 5472 auto *Ptr = getLoadStorePointerOperand(&I); 5473 if (!Ptr) 5474 continue; 5475 5476 // A uniform memory op is itself uniform. We exclude uniform stores 5477 // here as they demand the last lane, not the first one. 5478 if (isa<LoadInst>(I) && Legal->isUniformMemOp(I)) 5479 addToWorklistIfAllowed(&I); 5480 5481 if (isUniformDecision(&I, VF)) { 5482 assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check"); 5483 HasUniformUse.insert(Ptr); 5484 } 5485 } 5486 5487 // Add to the worklist any operands which have *only* uniform (e.g. lane 0 5488 // demanding) users. Since loops are assumed to be in LCSSA form, this 5489 // disallows uses outside the loop as well. 
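  // For illustration (hypothetical IR): if %gep is computed inside the loop
  // and its only users are consecutive loads/stores that will be widened,
  // the widened accesses only need the lane-0 value of %gep to form their
  // vector address, so %gep itself can be treated as uniform here.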
5490 for (auto *V : HasUniformUse) { 5491 if (isOutOfScope(V)) 5492 continue; 5493 auto *I = cast<Instruction>(V); 5494 auto UsersAreMemAccesses = 5495 llvm::all_of(I->users(), [&](User *U) -> bool { 5496 return isVectorizedMemAccessUse(cast<Instruction>(U), V); 5497 }); 5498 if (UsersAreMemAccesses) 5499 addToWorklistIfAllowed(I); 5500 } 5501 5502 // Expand Worklist in topological order: whenever a new instruction 5503 // is added , its users should be already inside Worklist. It ensures 5504 // a uniform instruction will only be used by uniform instructions. 5505 unsigned idx = 0; 5506 while (idx != Worklist.size()) { 5507 Instruction *I = Worklist[idx++]; 5508 5509 for (auto OV : I->operand_values()) { 5510 // isOutOfScope operands cannot be uniform instructions. 5511 if (isOutOfScope(OV)) 5512 continue; 5513 // First order recurrence Phi's should typically be considered 5514 // non-uniform. 5515 auto *OP = dyn_cast<PHINode>(OV); 5516 if (OP && Legal->isFirstOrderRecurrence(OP)) 5517 continue; 5518 // If all the users of the operand are uniform, then add the 5519 // operand into the uniform worklist. 5520 auto *OI = cast<Instruction>(OV); 5521 if (llvm::all_of(OI->users(), [&](User *U) -> bool { 5522 auto *J = cast<Instruction>(U); 5523 return Worklist.count(J) || isVectorizedMemAccessUse(J, OI); 5524 })) 5525 addToWorklistIfAllowed(OI); 5526 } 5527 } 5528 5529 // For an instruction to be added into Worklist above, all its users inside 5530 // the loop should also be in Worklist. However, this condition cannot be 5531 // true for phi nodes that form a cyclic dependence. We must process phi 5532 // nodes separately. An induction variable will remain uniform if all users 5533 // of the induction variable and induction variable update remain uniform. 5534 // The code below handles both pointer and non-pointer induction variables. 5535 for (auto &Induction : Legal->getInductionVars()) { 5536 auto *Ind = Induction.first; 5537 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 5538 5539 // Determine if all users of the induction variable are uniform after 5540 // vectorization. 5541 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 5542 auto *I = cast<Instruction>(U); 5543 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 5544 isVectorizedMemAccessUse(I, Ind); 5545 }); 5546 if (!UniformInd) 5547 continue; 5548 5549 // Determine if all users of the induction variable update instruction are 5550 // uniform after vectorization. 5551 auto UniformIndUpdate = 5552 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 5553 auto *I = cast<Instruction>(U); 5554 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 5555 isVectorizedMemAccessUse(I, IndUpdate); 5556 }); 5557 if (!UniformIndUpdate) 5558 continue; 5559 5560 // The induction variable and its update instruction will remain uniform. 5561 addToWorklistIfAllowed(Ind); 5562 addToWorklistIfAllowed(IndUpdate); 5563 } 5564 5565 Uniforms[VF].insert(Worklist.begin(), Worklist.end()); 5566 } 5567 5568 bool LoopVectorizationCostModel::runtimeChecksRequired() { 5569 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); 5570 5571 if (Legal->getRuntimePointerChecking()->Need) { 5572 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz", 5573 "runtime pointer checks needed. 
Enable vectorization of this " 5574 "loop with '#pragma clang loop vectorize(enable)' when " 5575 "compiling with -Os/-Oz", 5576 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5577 return true; 5578 } 5579 5580 if (!PSE.getUnionPredicate().getPredicates().empty()) { 5581 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz", 5582 "runtime SCEV checks needed. Enable vectorization of this " 5583 "loop with '#pragma clang loop vectorize(enable)' when " 5584 "compiling with -Os/-Oz", 5585 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5586 return true; 5587 } 5588 5589 // FIXME: Avoid specializing for stride==1 instead of bailing out. 5590 if (!Legal->getLAI()->getSymbolicStrides().empty()) { 5591 reportVectorizationFailure("Runtime stride check for small trip count", 5592 "runtime stride == 1 checks needed. Enable vectorization of " 5593 "this loop without such check by compiling with -Os/-Oz", 5594 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5595 return true; 5596 } 5597 5598 return false; 5599 } 5600 5601 ElementCount 5602 LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) { 5603 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) 5604 return ElementCount::getScalable(0); 5605 5606 if (Hints->isScalableVectorizationDisabled()) { 5607 reportVectorizationInfo("Scalable vectorization is explicitly disabled", 5608 "ScalableVectorizationDisabled", ORE, TheLoop); 5609 return ElementCount::getScalable(0); 5610 } 5611 5612 LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n"); 5613 5614 auto MaxScalableVF = ElementCount::getScalable( 5615 std::numeric_limits<ElementCount::ScalarTy>::max()); 5616 5617 // Test that the loop-vectorizer can legalize all operations for this MaxVF. 5618 // FIXME: While for scalable vectors this is currently sufficient, this should 5619 // be replaced by a more detailed mechanism that filters out specific VFs, 5620 // instead of invalidating vectorization for a whole set of VFs based on the 5621 // MaxVF. 5622 5623 // Disable scalable vectorization if the loop contains unsupported reductions. 5624 if (!canVectorizeReductions(MaxScalableVF)) { 5625 reportVectorizationInfo( 5626 "Scalable vectorization not supported for the reduction " 5627 "operations found in this loop.", 5628 "ScalableVFUnfeasible", ORE, TheLoop); 5629 return ElementCount::getScalable(0); 5630 } 5631 5632 // Disable scalable vectorization if the loop contains any instructions 5633 // with element types not supported for scalable vectors. 5634 if (any_of(ElementTypesInLoop, [&](Type *Ty) { 5635 return !Ty->isVoidTy() && 5636 !this->TTI.isElementTypeLegalForScalableVector(Ty); 5637 })) { 5638 reportVectorizationInfo("Scalable vectorization is not supported " 5639 "for all element types found in this loop.", 5640 "ScalableVFUnfeasible", ORE, TheLoop); 5641 return ElementCount::getScalable(0); 5642 } 5643 5644 if (Legal->isSafeForAnyVectorWidth()) 5645 return MaxScalableVF; 5646 5647 // Limit MaxScalableVF by the maximum safe dependence distance. 5648 Optional<unsigned> MaxVScale = TTI.getMaxVScale(); 5649 if (!MaxVScale && TheFunction->hasFnAttribute(Attribute::VScaleRange)) { 5650 unsigned VScaleMax = TheFunction->getFnAttribute(Attribute::VScaleRange) 5651 .getVScaleRangeArgs() 5652 .second; 5653 if (VScaleMax > 0) 5654 MaxVScale = VScaleMax; 5655 } 5656 MaxScalableVF = ElementCount::getScalable( 5657 MaxVScale ? 
(MaxSafeElements / MaxVScale.getValue()) : 0); 5658 if (!MaxScalableVF) 5659 reportVectorizationInfo( 5660 "Max legal vector width too small, scalable vectorization " 5661 "unfeasible.", 5662 "ScalableVFUnfeasible", ORE, TheLoop); 5663 5664 return MaxScalableVF; 5665 } 5666 5667 FixedScalableVFPair 5668 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount, 5669 ElementCount UserVF) { 5670 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); 5671 unsigned SmallestType, WidestType; 5672 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); 5673 5674 // Get the maximum safe dependence distance in bits computed by LAA. 5675 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from 5676 // the memory accesses that is most restrictive (involved in the smallest 5677 // dependence distance). 5678 unsigned MaxSafeElements = 5679 PowerOf2Floor(Legal->getMaxSafeVectorWidthInBits() / WidestType); 5680 5681 auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements); 5682 auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements); 5683 5684 LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF 5685 << ".\n"); 5686 LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF 5687 << ".\n"); 5688 5689 // First analyze the UserVF, fall back if the UserVF should be ignored. 5690 if (UserVF) { 5691 auto MaxSafeUserVF = 5692 UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF; 5693 5694 if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) { 5695 // If `VF=vscale x N` is safe, then so is `VF=N` 5696 if (UserVF.isScalable()) 5697 return FixedScalableVFPair( 5698 ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF); 5699 else 5700 return UserVF; 5701 } 5702 5703 assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF)); 5704 5705 // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it 5706 // is better to ignore the hint and let the compiler choose a suitable VF. 5707 if (!UserVF.isScalable()) { 5708 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 5709 << " is unsafe, clamping to max safe VF=" 5710 << MaxSafeFixedVF << ".\n"); 5711 ORE->emit([&]() { 5712 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 5713 TheLoop->getStartLoc(), 5714 TheLoop->getHeader()) 5715 << "User-specified vectorization factor " 5716 << ore::NV("UserVectorizationFactor", UserVF) 5717 << " is unsafe, clamping to maximum safe vectorization factor " 5718 << ore::NV("VectorizationFactor", MaxSafeFixedVF); 5719 }); 5720 return MaxSafeFixedVF; 5721 } 5722 5723 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) { 5724 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 5725 << " is ignored because scalable vectors are not " 5726 "available.\n"); 5727 ORE->emit([&]() { 5728 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 5729 TheLoop->getStartLoc(), 5730 TheLoop->getHeader()) 5731 << "User-specified vectorization factor " 5732 << ore::NV("UserVectorizationFactor", UserVF) 5733 << " is ignored because the target does not support scalable " 5734 "vectors. The compiler will pick a more suitable value."; 5735 }); 5736 } else { 5737 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 5738 << " is unsafe. 
Ignoring scalable UserVF.\n"); 5739 ORE->emit([&]() { 5740 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 5741 TheLoop->getStartLoc(), 5742 TheLoop->getHeader()) 5743 << "User-specified vectorization factor " 5744 << ore::NV("UserVectorizationFactor", UserVF) 5745 << " is unsafe. Ignoring the hint to let the compiler pick a " 5746 "more suitable value."; 5747 }); 5748 } 5749 } 5750 5751 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType 5752 << " / " << WidestType << " bits.\n"); 5753 5754 FixedScalableVFPair Result(ElementCount::getFixed(1), 5755 ElementCount::getScalable(0)); 5756 if (auto MaxVF = getMaximizedVFForTarget(ConstTripCount, SmallestType, 5757 WidestType, MaxSafeFixedVF)) 5758 Result.FixedVF = MaxVF; 5759 5760 if (auto MaxVF = getMaximizedVFForTarget(ConstTripCount, SmallestType, 5761 WidestType, MaxSafeScalableVF)) 5762 if (MaxVF.isScalable()) { 5763 Result.ScalableVF = MaxVF; 5764 LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF 5765 << "\n"); 5766 } 5767 5768 return Result; 5769 } 5770 5771 FixedScalableVFPair 5772 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { 5773 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { 5774 // TODO: It may by useful to do since it's still likely to be dynamically 5775 // uniform if the target can skip. 5776 reportVectorizationFailure( 5777 "Not inserting runtime ptr check for divergent target", 5778 "runtime pointer checks needed. Not enabled for divergent target", 5779 "CantVersionLoopWithDivergentTarget", ORE, TheLoop); 5780 return FixedScalableVFPair::getNone(); 5781 } 5782 5783 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); 5784 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); 5785 if (TC == 1) { 5786 reportVectorizationFailure("Single iteration (non) loop", 5787 "loop trip count is one, irrelevant for vectorization", 5788 "SingleIterationLoop", ORE, TheLoop); 5789 return FixedScalableVFPair::getNone(); 5790 } 5791 5792 switch (ScalarEpilogueStatus) { 5793 case CM_ScalarEpilogueAllowed: 5794 return computeFeasibleMaxVF(TC, UserVF); 5795 case CM_ScalarEpilogueNotAllowedUsePredicate: 5796 LLVM_FALLTHROUGH; 5797 case CM_ScalarEpilogueNotNeededUsePredicate: 5798 LLVM_DEBUG( 5799 dbgs() << "LV: vector predicate hint/switch found.\n" 5800 << "LV: Not allowing scalar epilogue, creating predicated " 5801 << "vector loop.\n"); 5802 break; 5803 case CM_ScalarEpilogueNotAllowedLowTripLoop: 5804 // fallthrough as a special case of OptForSize 5805 case CM_ScalarEpilogueNotAllowedOptSize: 5806 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize) 5807 LLVM_DEBUG( 5808 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n"); 5809 else 5810 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip " 5811 << "count.\n"); 5812 5813 // Bail if runtime checks are required, which are not good when optimising 5814 // for size. 5815 if (runtimeChecksRequired()) 5816 return FixedScalableVFPair::getNone(); 5817 5818 break; 5819 } 5820 5821 // The only loops we can vectorize without a scalar epilogue, are loops with 5822 // a bottom-test and a single exiting block. We'd have to handle the fact 5823 // that not every instruction executes on the last iteration. This will 5824 // require a lane mask which varies through the vector loop body. 
(TODO)
5825   if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
5826     // If there was a tail-folding hint/switch, but we can't fold the tail by
5827     // masking, fall back to a vectorization with a scalar epilogue.
5828     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5829       LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5830                            "scalar epilogue instead.\n");
5831       ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5832       return computeFeasibleMaxVF(TC, UserVF);
5833     }
5834     return FixedScalableVFPair::getNone();
5835   }
5836
5837   // Now try the tail folding.
5838
5839   // Invalidate interleave groups that require an epilogue if we can't mask
5840   // the interleave-group.
5841   if (!useMaskedInterleavedAccesses(TTI)) {
5842     assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
5843            "No decisions should have been taken at this point");
5844     // Note: There is no need to invalidate any cost modeling decisions here, as
5845     // none were taken so far.
5846     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
5847   }
5848
5849   FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF);
5850   // Avoid tail folding if the trip count is known to be a multiple of any VF
5851   // we chose.
5852   // FIXME: The condition below pessimises the case for fixed-width vectors,
5853   // when scalable VFs are also candidates for vectorization.
5854   if (MaxFactors.FixedVF.isVector() && !MaxFactors.ScalableVF) {
5855     ElementCount MaxFixedVF = MaxFactors.FixedVF;
5856     assert((UserVF.isNonZero() || isPowerOf2_32(MaxFixedVF.getFixedValue())) &&
5857            "MaxFixedVF must be a power of 2");
5858     unsigned MaxVFtimesIC = UserIC ? MaxFixedVF.getFixedValue() * UserIC
5859                                    : MaxFixedVF.getFixedValue();
5860     ScalarEvolution *SE = PSE.getSE();
5861     const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
5862     const SCEV *ExitCount = SE->getAddExpr(
5863         BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
5864     const SCEV *Rem = SE->getURemExpr(
5865         SE->applyLoopGuards(ExitCount, TheLoop),
5866         SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
5867     if (Rem->isZero()) {
5868       // Accept MaxFixedVF if we do not have a tail.
5869       LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
5870       return MaxFactors;
5871     }
5872   }
5873
5874   // For scalable vectors, don't use tail folding as this is currently not yet
5875   // supported. The code is likely to have ended up here if the trip count is
5876   // low, in which case it makes sense not to use scalable vectors.
5877   if (MaxFactors.ScalableVF.isVector())
5878     MaxFactors.ScalableVF = ElementCount::getScalable(0);
5879
5880   // If we don't know the precise trip count, or if the trip count that we
5881   // found modulo the vectorization factor is not zero, try to fold the tail
5882   // by masking.
5883   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
5884   if (Legal->prepareToFoldTailByMasking()) {
5885     FoldTailByMasking = true;
5886     return MaxFactors;
5887   }
5888
5889   // If there was a tail-folding hint/switch, but we can't fold the tail by
5890   // masking, fall back to a vectorization with a scalar epilogue.
5891 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { 5892 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " 5893 "scalar epilogue instead.\n"); 5894 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 5895 return MaxFactors; 5896 } 5897 5898 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) { 5899 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n"); 5900 return FixedScalableVFPair::getNone(); 5901 } 5902 5903 if (TC == 0) { 5904 reportVectorizationFailure( 5905 "Unable to calculate the loop count due to complex control flow", 5906 "unable to calculate the loop count due to complex control flow", 5907 "UnknownLoopCountComplexCFG", ORE, TheLoop); 5908 return FixedScalableVFPair::getNone(); 5909 } 5910 5911 reportVectorizationFailure( 5912 "Cannot optimize for size and vectorize at the same time.", 5913 "cannot optimize for size and vectorize at the same time. " 5914 "Enable vectorization of this loop with '#pragma clang loop " 5915 "vectorize(enable)' when compiling with -Os/-Oz", 5916 "NoTailLoopWithOptForSize", ORE, TheLoop); 5917 return FixedScalableVFPair::getNone(); 5918 } 5919 5920 ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( 5921 unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType, 5922 const ElementCount &MaxSafeVF) { 5923 bool ComputeScalableMaxVF = MaxSafeVF.isScalable(); 5924 TypeSize WidestRegister = TTI.getRegisterBitWidth( 5925 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector 5926 : TargetTransformInfo::RGK_FixedWidthVector); 5927 5928 // Convenience function to return the minimum of two ElementCounts. 5929 auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) { 5930 assert((LHS.isScalable() == RHS.isScalable()) && 5931 "Scalable flags must match"); 5932 return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS; 5933 }; 5934 5935 // Ensure MaxVF is a power of 2; the dependence distance bound may not be. 5936 // Note that both WidestRegister and WidestType may not be a powers of 2. 5937 auto MaxVectorElementCount = ElementCount::get( 5938 PowerOf2Floor(WidestRegister.getKnownMinSize() / WidestType), 5939 ComputeScalableMaxVF); 5940 MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF); 5941 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " 5942 << (MaxVectorElementCount * WidestType) << " bits.\n"); 5943 5944 if (!MaxVectorElementCount) { 5945 LLVM_DEBUG(dbgs() << "LV: The target has no " 5946 << (ComputeScalableMaxVF ? "scalable" : "fixed") 5947 << " vector registers.\n"); 5948 return ElementCount::getFixed(1); 5949 } 5950 5951 const auto TripCountEC = ElementCount::getFixed(ConstTripCount); 5952 if (ConstTripCount && 5953 ElementCount::isKnownLE(TripCountEC, MaxVectorElementCount) && 5954 isPowerOf2_32(ConstTripCount)) { 5955 // We need to clamp the VF to be the ConstTripCount. There is no point in 5956 // choosing a higher viable VF as done in the loop below. If 5957 // MaxVectorElementCount is scalable, we only fall back on a fixed VF when 5958 // the TC is less than or equal to the known number of lanes. 
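    // For illustration: with ConstTripCount == 8 and a fixed
    // MaxVectorElementCount of 16, the VF is clamped to 8, since a wider
    // vector could never be filled by the 8 available iterations.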
5959 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: " 5960 << ConstTripCount << "\n"); 5961 return TripCountEC; 5962 } 5963 5964 ElementCount MaxVF = MaxVectorElementCount; 5965 if (TTI.shouldMaximizeVectorBandwidth() || 5966 (MaximizeBandwidth && isScalarEpilogueAllowed())) { 5967 auto MaxVectorElementCountMaxBW = ElementCount::get( 5968 PowerOf2Floor(WidestRegister.getKnownMinSize() / SmallestType), 5969 ComputeScalableMaxVF); 5970 MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF); 5971 5972 // Collect all viable vectorization factors larger than the default MaxVF 5973 // (i.e. MaxVectorElementCount). 5974 SmallVector<ElementCount, 8> VFs; 5975 for (ElementCount VS = MaxVectorElementCount * 2; 5976 ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2) 5977 VFs.push_back(VS); 5978 5979 // For each VF calculate its register usage. 5980 auto RUs = calculateRegisterUsage(VFs); 5981 5982 // Select the largest VF which doesn't require more registers than existing 5983 // ones. 5984 for (int i = RUs.size() - 1; i >= 0; --i) { 5985 bool Selected = true; 5986 for (auto &pair : RUs[i].MaxLocalUsers) { 5987 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5988 if (pair.second > TargetNumRegisters) 5989 Selected = false; 5990 } 5991 if (Selected) { 5992 MaxVF = VFs[i]; 5993 break; 5994 } 5995 } 5996 if (ElementCount MinVF = 5997 TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) { 5998 if (ElementCount::isKnownLT(MaxVF, MinVF)) { 5999 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF 6000 << ") with target's minimum: " << MinVF << '\n'); 6001 MaxVF = MinVF; 6002 } 6003 } 6004 } 6005 return MaxVF; 6006 } 6007 6008 bool LoopVectorizationCostModel::isMoreProfitable( 6009 const VectorizationFactor &A, const VectorizationFactor &B) const { 6010 InstructionCost CostA = A.Cost; 6011 InstructionCost CostB = B.Cost; 6012 6013 unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop); 6014 6015 if (!A.Width.isScalable() && !B.Width.isScalable() && FoldTailByMasking && 6016 MaxTripCount) { 6017 // If we are folding the tail and the trip count is a known (possibly small) 6018 // constant, the trip count will be rounded up to an integer number of 6019 // iterations. The total cost will be PerIterationCost*ceil(TripCount/VF), 6020 // which we compare directly. When not folding the tail, the total cost will 6021 // be PerIterationCost*floor(TC/VF) + Scalar remainder cost, and so is 6022 // approximated with the per-lane cost below instead of using the tripcount 6023 // as here. 6024 auto RTCostA = CostA * divideCeil(MaxTripCount, A.Width.getFixedValue()); 6025 auto RTCostB = CostB * divideCeil(MaxTripCount, B.Width.getFixedValue()); 6026 return RTCostA < RTCostB; 6027 } 6028 6029 // When set to preferred, for now assume vscale may be larger than 1, so 6030 // that scalable vectorization is slightly favorable over fixed-width 6031 // vectorization. 
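  // For illustration: comparing A = {vscale x 4, cost 40} with
  // B = {fixed 4, cost 40}, the '<=' below lets the scalable candidate win
  // the tie, whereas the strict '<' used in the general case would not.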
6032 if (Hints->isScalableVectorizationPreferred()) 6033 if (A.Width.isScalable() && !B.Width.isScalable()) 6034 return (CostA * B.Width.getKnownMinValue()) <= 6035 (CostB * A.Width.getKnownMinValue()); 6036 6037 // To avoid the need for FP division: 6038 // (CostA / A.Width) < (CostB / B.Width) 6039 // <=> (CostA * B.Width) < (CostB * A.Width) 6040 return (CostA * B.Width.getKnownMinValue()) < 6041 (CostB * A.Width.getKnownMinValue()); 6042 } 6043 6044 VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor( 6045 const ElementCountSet &VFCandidates) { 6046 InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first; 6047 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n"); 6048 assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop"); 6049 assert(VFCandidates.count(ElementCount::getFixed(1)) && 6050 "Expected Scalar VF to be a candidate"); 6051 6052 const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost); 6053 VectorizationFactor ChosenFactor = ScalarCost; 6054 6055 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; 6056 if (ForceVectorization && VFCandidates.size() > 1) { 6057 // Ignore scalar width, because the user explicitly wants vectorization. 6058 // Initialize cost to max so that VF = 2 is, at least, chosen during cost 6059 // evaluation. 6060 ChosenFactor.Cost = InstructionCost::getMax(); 6061 } 6062 6063 SmallVector<InstructionVFPair> InvalidCosts; 6064 for (const auto &i : VFCandidates) { 6065 // The cost for scalar VF=1 is already calculated, so ignore it. 6066 if (i.isScalar()) 6067 continue; 6068 6069 VectorizationCostTy C = expectedCost(i, &InvalidCosts); 6070 VectorizationFactor Candidate(i, C.first); 6071 LLVM_DEBUG( 6072 dbgs() << "LV: Vector loop of width " << i << " costs: " 6073 << (Candidate.Cost / Candidate.Width.getKnownMinValue()) 6074 << (i.isScalable() ? " (assuming a minimum vscale of 1)" : "") 6075 << ".\n"); 6076 6077 if (!C.second && !ForceVectorization) { 6078 LLVM_DEBUG( 6079 dbgs() << "LV: Not considering vector loop of width " << i 6080 << " because it will not generate any vector instructions.\n"); 6081 continue; 6082 } 6083 6084 // If profitable add it to ProfitableVF list. 6085 if (isMoreProfitable(Candidate, ScalarCost)) 6086 ProfitableVFs.push_back(Candidate); 6087 6088 if (isMoreProfitable(Candidate, ChosenFactor)) 6089 ChosenFactor = Candidate; 6090 } 6091 6092 // Emit a report of VFs with invalid costs in the loop. 6093 if (!InvalidCosts.empty()) { 6094 // Group the remarks per instruction, keeping the instruction order from 6095 // InvalidCosts. 6096 std::map<Instruction *, unsigned> Numbering; 6097 unsigned I = 0; 6098 for (auto &Pair : InvalidCosts) 6099 if (!Numbering.count(Pair.first)) 6100 Numbering[Pair.first] = I++; 6101 6102 // Sort the list, first on instruction(number) then on VF. 
6103 llvm::sort(InvalidCosts, 6104 [&Numbering](InstructionVFPair &A, InstructionVFPair &B) { 6105 if (Numbering[A.first] != Numbering[B.first]) 6106 return Numbering[A.first] < Numbering[B.first]; 6107 ElementCountComparator ECC; 6108 return ECC(A.second, B.second); 6109 }); 6110 6111 // For a list of ordered instruction-vf pairs: 6112 // [(load, vf1), (load, vf2), (store, vf1)] 6113 // Group the instructions together to emit separate remarks for: 6114 // load (vf1, vf2) 6115 // store (vf1) 6116 auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts); 6117 auto Subset = ArrayRef<InstructionVFPair>(); 6118 do { 6119 if (Subset.empty()) 6120 Subset = Tail.take_front(1); 6121 6122 Instruction *I = Subset.front().first; 6123 6124 // If the next instruction is different, or if there are no other pairs, 6125 // emit a remark for the collated subset. e.g. 6126 // [(load, vf1), (load, vf2))] 6127 // to emit: 6128 // remark: invalid costs for 'load' at VF=(vf, vf2) 6129 if (Subset == Tail || Tail[Subset.size()].first != I) { 6130 std::string OutString; 6131 raw_string_ostream OS(OutString); 6132 assert(!Subset.empty() && "Unexpected empty range"); 6133 OS << "Instruction with invalid costs prevented vectorization at VF=("; 6134 for (auto &Pair : Subset) 6135 OS << (Pair.second == Subset.front().second ? "" : ", ") 6136 << Pair.second; 6137 OS << "):"; 6138 if (auto *CI = dyn_cast<CallInst>(I)) 6139 OS << " call to " << CI->getCalledFunction()->getName(); 6140 else 6141 OS << " " << I->getOpcodeName(); 6142 OS.flush(); 6143 reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I); 6144 Tail = Tail.drop_front(Subset.size()); 6145 Subset = {}; 6146 } else 6147 // Grow the subset by one element 6148 Subset = Tail.take_front(Subset.size() + 1); 6149 } while (!Tail.empty()); 6150 } 6151 6152 if (!EnableCondStoresVectorization && NumPredStores) { 6153 reportVectorizationFailure("There are conditional stores.", 6154 "store that is conditionally executed prevents vectorization", 6155 "ConditionalStore", ORE, TheLoop); 6156 ChosenFactor = ScalarCost; 6157 } 6158 6159 LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() && 6160 ChosenFactor.Cost >= ScalarCost.Cost) dbgs() 6161 << "LV: Vectorization seems to be not beneficial, " 6162 << "but was forced by a user.\n"); 6163 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n"); 6164 return ChosenFactor; 6165 } 6166 6167 bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization( 6168 const Loop &L, ElementCount VF) const { 6169 // Cross iteration phis such as reductions need special handling and are 6170 // currently unsupported. 6171 if (any_of(L.getHeader()->phis(), [&](PHINode &Phi) { 6172 return Legal->isFirstOrderRecurrence(&Phi) || 6173 Legal->isReductionVariable(&Phi); 6174 })) 6175 return false; 6176 6177 // Phis with uses outside of the loop require special handling and are 6178 // currently unsupported. 6179 for (auto &Entry : Legal->getInductionVars()) { 6180 // Look for uses of the value of the induction at the last iteration. 6181 Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch()); 6182 for (User *U : PostInc->users()) 6183 if (!L.contains(cast<Instruction>(U))) 6184 return false; 6185 // Look for uses of penultimate value of the induction. 6186 for (User *U : Entry.first->users()) 6187 if (!L.contains(cast<Instruction>(U))) 6188 return false; 6189 } 6190 6191 // Induction variables that are widened require special handling that is 6192 // currently not supported. 
6193   if (any_of(Legal->getInductionVars(), [&](auto &Entry) {
6194         return !(this->isScalarAfterVectorization(Entry.first, VF) ||
6195                  this->isProfitableToScalarize(Entry.first, VF));
6196       }))
6197     return false;
6198
6199   // Epilogue vectorization code has not been audited to ensure it handles
6200   // non-latch exits properly. It may be fine, but it needs to be audited and
6201   // tested.
6202   if (L.getExitingBlock() != L.getLoopLatch())
6203     return false;
6204
6205   return true;
6206 }
6207
6208 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
6209     const ElementCount VF) const {
6210   // FIXME: We need a much better cost-model to take different parameters such
6211   // as register pressure, code size increase and cost of extra branches into
6212   // account. For now we apply a very crude heuristic and only consider loops
6213   // with vectorization factors larger than a certain value.
6214   // We also consider epilogue vectorization unprofitable for targets that don't
6215   // consider interleaving beneficial (e.g. MVE).
6216   if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1)
6217     return false;
6218   if (VF.getFixedValue() >= EpilogueVectorizationMinVF)
6219     return true;
6220   return false;
6221 }
6222
6223 VectorizationFactor
6224 LoopVectorizationCostModel::selectEpilogueVectorizationFactor(
6225     const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) {
6226   VectorizationFactor Result = VectorizationFactor::Disabled();
6227   if (!EnableEpilogueVectorization) {
6228     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";);
6229     return Result;
6230   }
6231
6232   if (!isScalarEpilogueAllowed()) {
6233     LLVM_DEBUG(
6234         dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is "
6235                   "allowed.\n";);
6236     return Result;
6237   }
6238
6239   // FIXME: This can be fixed for scalable vectors later, because at this stage
6240   // the LoopVectorizer will only consider vectorizing a loop with scalable
6241   // vectors when the loop has a hint to enable vectorization for a given VF.
6242   if (MainLoopVF.isScalable()) {
6243     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization for scalable vectors not "
6244                          "yet supported.\n");
6245     return Result;
6246   }
6247
6248   // Not really a cost consideration, but check for unsupported cases here to
6249   // simplify the logic.
6250 if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) { 6251 LLVM_DEBUG( 6252 dbgs() << "LEV: Unable to vectorize epilogue because the loop is " 6253 "not a supported candidate.\n";); 6254 return Result; 6255 } 6256 6257 if (EpilogueVectorizationForceVF > 1) { 6258 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";); 6259 ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF); 6260 if (LVP.hasPlanWithVFs({MainLoopVF, ForcedEC})) 6261 return {ForcedEC, 0}; 6262 else { 6263 LLVM_DEBUG( 6264 dbgs() 6265 << "LEV: Epilogue vectorization forced factor is not viable.\n";); 6266 return Result; 6267 } 6268 } 6269 6270 if (TheLoop->getHeader()->getParent()->hasOptSize() || 6271 TheLoop->getHeader()->getParent()->hasMinSize()) { 6272 LLVM_DEBUG( 6273 dbgs() 6274 << "LEV: Epilogue vectorization skipped due to opt for size.\n";); 6275 return Result; 6276 } 6277 6278 if (!isEpilogueVectorizationProfitable(MainLoopVF)) 6279 return Result; 6280 6281 for (auto &NextVF : ProfitableVFs) 6282 if (ElementCount::isKnownLT(NextVF.Width, MainLoopVF) && 6283 (Result.Width.getFixedValue() == 1 || 6284 isMoreProfitable(NextVF, Result)) && 6285 LVP.hasPlanWithVFs({MainLoopVF, NextVF.Width})) 6286 Result = NextVF; 6287 6288 if (Result != VectorizationFactor::Disabled()) 6289 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = " 6290 << Result.Width.getFixedValue() << "\n";); 6291 return Result; 6292 } 6293 6294 std::pair<unsigned, unsigned> 6295 LoopVectorizationCostModel::getSmallestAndWidestTypes() { 6296 unsigned MinWidth = -1U; 6297 unsigned MaxWidth = 8; 6298 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 6299 for (Type *T : ElementTypesInLoop) { 6300 MinWidth = std::min<unsigned>( 6301 MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize()); 6302 MaxWidth = std::max<unsigned>( 6303 MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize()); 6304 } 6305 return {MinWidth, MaxWidth}; 6306 } 6307 6308 void LoopVectorizationCostModel::collectElementTypesForWidening() { 6309 ElementTypesInLoop.clear(); 6310 // For each block. 6311 for (BasicBlock *BB : TheLoop->blocks()) { 6312 // For each instruction in the loop. 6313 for (Instruction &I : BB->instructionsWithoutDebug()) { 6314 Type *T = I.getType(); 6315 6316 // Skip ignored values. 6317 if (ValuesToIgnore.count(&I)) 6318 continue; 6319 6320 // Only examine Loads, Stores and PHINodes. 6321 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I)) 6322 continue; 6323 6324 // Examine PHI nodes that are reduction variables. Update the type to 6325 // account for the recurrence type. 6326 if (auto *PN = dyn_cast<PHINode>(&I)) { 6327 if (!Legal->isReductionVariable(PN)) 6328 continue; 6329 const RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[PN]; 6330 if (PreferInLoopReductions || useOrderedReductions(RdxDesc) || 6331 TTI.preferInLoopReduction(RdxDesc.getOpcode(), 6332 RdxDesc.getRecurrenceType(), 6333 TargetTransformInfo::ReductionFlags())) 6334 continue; 6335 T = RdxDesc.getRecurrenceType(); 6336 } 6337 6338 // Examine the stored values. 6339 if (auto *ST = dyn_cast<StoreInst>(&I)) 6340 T = ST->getValueOperand()->getType(); 6341 6342 // Ignore loaded pointer types and stored pointer types that are not 6343 // vectorizable. 6344 // 6345 // FIXME: The check here attempts to predict whether a load or store will 6346 // be vectorized. We only know this for certain after a VF has 6347 // been selected. 
Here, we assume that if an access can be 6348 // vectorized, it will be. We should also look at extending this 6349 // optimization to non-pointer types. 6350 // 6351 if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) && 6352 !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I)) 6353 continue; 6354 6355 ElementTypesInLoop.insert(T); 6356 } 6357 } 6358 } 6359 6360 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, 6361 unsigned LoopCost) { 6362 // -- The interleave heuristics -- 6363 // We interleave the loop in order to expose ILP and reduce the loop overhead. 6364 // There are many micro-architectural considerations that we can't predict 6365 // at this level. For example, frontend pressure (on decode or fetch) due to 6366 // code size, or the number and capabilities of the execution ports. 6367 // 6368 // We use the following heuristics to select the interleave count: 6369 // 1. If the code has reductions, then we interleave to break the cross 6370 // iteration dependency. 6371 // 2. If the loop is really small, then we interleave to reduce the loop 6372 // overhead. 6373 // 3. We don't interleave if we think that we will spill registers to memory 6374 // due to the increased register pressure. 6375 6376 if (!isScalarEpilogueAllowed()) 6377 return 1; 6378 6379 // We used the distance for the interleave count. 6380 if (Legal->getMaxSafeDepDistBytes() != -1U) 6381 return 1; 6382 6383 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); 6384 const bool HasReductions = !Legal->getReductionVars().empty(); 6385 // Do not interleave loops with a relatively small known or estimated trip 6386 // count. But we will interleave when InterleaveSmallLoopScalarReduction is 6387 // enabled, and the code has scalar reductions(HasReductions && VF = 1), 6388 // because with the above conditions interleaving can expose ILP and break 6389 // cross iteration dependences for reductions. 6390 if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) && 6391 !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar())) 6392 return 1; 6393 6394 RegisterUsage R = calculateRegisterUsage({VF})[0]; 6395 // We divide by these constants so assume that we have at least one 6396 // instruction that uses at least one register. 6397 for (auto& pair : R.MaxLocalUsers) { 6398 pair.second = std::max(pair.second, 1U); 6399 } 6400 6401 // We calculate the interleave count using the following formula. 6402 // Subtract the number of loop invariants from the number of available 6403 // registers. These registers are used by all of the interleaved instances. 6404 // Next, divide the remaining registers by the number of registers that is 6405 // required by the loop, in order to estimate how many parallel instances 6406 // fit without causing spills. All of this is rounded down if necessary to be 6407 // a power of two. We want power of two interleave count to simplify any 6408 // addressing operations or alignment considerations. 6409 // We also want power of two interleave counts to ensure that the induction 6410 // variable of the vector loop wraps to zero, when tail is folded by masking; 6411 // this currently happens when OptForSize, in which case IC is set to 1 above. 
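  // For illustration: with 32 registers in a class, 2 of them tied up by
  // loop-invariant values and a maximum local usage of 5 registers, the first
  // estimate below is PowerOf2Floor((32 - 2) / 5) = 4 interleaved copies,
  // before the optional induction-variable adjustment.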
6412 unsigned IC = UINT_MAX; 6413 6414 for (auto& pair : R.MaxLocalUsers) { 6415 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 6416 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters 6417 << " registers of " 6418 << TTI.getRegisterClassName(pair.first) << " register class\n"); 6419 if (VF.isScalar()) { 6420 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) 6421 TargetNumRegisters = ForceTargetNumScalarRegs; 6422 } else { 6423 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) 6424 TargetNumRegisters = ForceTargetNumVectorRegs; 6425 } 6426 unsigned MaxLocalUsers = pair.second; 6427 unsigned LoopInvariantRegs = 0; 6428 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) 6429 LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; 6430 6431 unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers); 6432 // Don't count the induction variable as interleaved. 6433 if (EnableIndVarRegisterHeur) { 6434 TmpIC = 6435 PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) / 6436 std::max(1U, (MaxLocalUsers - 1))); 6437 } 6438 6439 IC = std::min(IC, TmpIC); 6440 } 6441 6442 // Clamp the interleave ranges to reasonable counts. 6443 unsigned MaxInterleaveCount = 6444 TTI.getMaxInterleaveFactor(VF.getKnownMinValue()); 6445 6446 // Check if the user has overridden the max. 6447 if (VF.isScalar()) { 6448 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) 6449 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; 6450 } else { 6451 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) 6452 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; 6453 } 6454 6455 // If trip count is known or estimated compile time constant, limit the 6456 // interleave count to be less than the trip count divided by VF, provided it 6457 // is at least 1. 6458 // 6459 // For scalable vectors we can't know if interleaving is beneficial. It may 6460 // not be beneficial for small loops if none of the lanes in the second vector 6461 // iterations is enabled. However, for larger loops, there is likely to be a 6462 // similar benefit as for fixed-width vectors. For now, we choose to leave 6463 // the InterleaveCount as if vscale is '1', although if some information about 6464 // the vector is known (e.g. min vector size), we can make a better decision. 6465 if (BestKnownTC) { 6466 MaxInterleaveCount = 6467 std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount); 6468 // Make sure MaxInterleaveCount is greater than 0. 6469 MaxInterleaveCount = std::max(1u, MaxInterleaveCount); 6470 } 6471 6472 assert(MaxInterleaveCount > 0 && 6473 "Maximum interleave count must be greater than 0"); 6474 6475 // Clamp the calculated IC to be between the 1 and the max interleave count 6476 // that the target and trip count allows. 6477 if (IC > MaxInterleaveCount) 6478 IC = MaxInterleaveCount; 6479 else 6480 // Make sure IC is greater than 0. 6481 IC = std::max(1u, IC); 6482 6483 assert(IC > 0 && "Interleave count must be greater than 0."); 6484 6485 // If we did not calculate the cost for VF (because the user selected the VF) 6486 // then we calculate the cost of VF here. 
6487 if (LoopCost == 0) { 6488 InstructionCost C = expectedCost(VF).first; 6489 assert(C.isValid() && "Expected to have chosen a VF with valid cost"); 6490 LoopCost = *C.getValue(); 6491 } 6492 6493 assert(LoopCost && "Non-zero loop cost expected"); 6494 6495 // Interleave if we vectorized this loop and there is a reduction that could 6496 // benefit from interleaving. 6497 if (VF.isVector() && HasReductions) { 6498 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); 6499 return IC; 6500 } 6501 6502 // Note that if we've already vectorized the loop we will have done the 6503 // runtime check and so interleaving won't require further checks. 6504 bool InterleavingRequiresRuntimePointerCheck = 6505 (VF.isScalar() && Legal->getRuntimePointerChecking()->Need); 6506 6507 // We want to interleave small loops in order to reduce the loop overhead and 6508 // potentially expose ILP opportunities. 6509 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n' 6510 << "LV: IC is " << IC << '\n' 6511 << "LV: VF is " << VF << '\n'); 6512 const bool AggressivelyInterleaveReductions = 6513 TTI.enableAggressiveInterleaving(HasReductions); 6514 if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) { 6515 // We assume that the cost overhead is 1 and we use the cost model 6516 // to estimate the cost of the loop and interleave until the cost of the 6517 // loop overhead is about 5% of the cost of the loop. 6518 unsigned SmallIC = 6519 std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost)); 6520 6521 // Interleave until store/load ports (estimated by max interleave count) are 6522 // saturated. 6523 unsigned NumStores = Legal->getNumStores(); 6524 unsigned NumLoads = Legal->getNumLoads(); 6525 unsigned StoresIC = IC / (NumStores ? NumStores : 1); 6526 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1); 6527 6528 // There is little point in interleaving for reductions containing selects 6529 // and compares when VF=1 since it may just create more overhead than it's 6530 // worth for loops with small trip counts. This is because we still have to 6531 // do the final reduction after the loop. 6532 bool HasSelectCmpReductions = 6533 HasReductions && 6534 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 6535 const RecurrenceDescriptor &RdxDesc = Reduction.second; 6536 return RecurrenceDescriptor::isSelectCmpRecurrenceKind( 6537 RdxDesc.getRecurrenceKind()); 6538 }); 6539 if (HasSelectCmpReductions) { 6540 LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n"); 6541 return 1; 6542 } 6543 6544 // If we have a scalar reduction (vector reductions are already dealt with 6545 // by this point), we can increase the critical path length if the loop 6546 // we're interleaving is inside another loop. For tree-wise reductions 6547 // set the limit to 2, and for ordered reductions it's best to disable 6548 // interleaving entirely. 
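    // For illustration: a scalar sum reduction in an inner loop interleaved
    // by 8 leaves 8 partial sums to combine after the inner loop on every
    // outer-loop iteration, so the counts below are capped instead.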
6549     if (HasReductions && TheLoop->getLoopDepth() > 1) {
6550       bool HasOrderedReductions =
6551           any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
6552             const RecurrenceDescriptor &RdxDesc = Reduction.second;
6553             return RdxDesc.isOrdered();
6554           });
6555       if (HasOrderedReductions) {
6556         LLVM_DEBUG(
6557             dbgs() << "LV: Not interleaving scalar ordered reductions.\n");
6558         return 1;
6559       }
6560
6561       unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
6562       SmallIC = std::min(SmallIC, F);
6563       StoresIC = std::min(StoresIC, F);
6564       LoadsIC = std::min(LoadsIC, F);
6565     }
6566
6567     if (EnableLoadStoreRuntimeInterleave &&
6568         std::max(StoresIC, LoadsIC) > SmallIC) {
6569       LLVM_DEBUG(
6570           dbgs() << "LV: Interleaving to saturate store or load ports.\n");
6571       return std::max(StoresIC, LoadsIC);
6572     }
6573
6574     // If there are scalar reductions and TTI has enabled aggressive
6575     // interleaving for reductions, we will interleave to expose ILP.
6576     if (InterleaveSmallLoopScalarReduction && VF.isScalar() &&
6577         AggressivelyInterleaveReductions) {
6578       LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
6579       // Interleave no less than SmallIC but not as aggressive as the normal IC
6580       // to satisfy the rare situation when resources are too limited.
6581       return std::max(IC / 2, SmallIC);
6582     } else {
6583       LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
6584       return SmallIC;
6585     }
6586   }
6587
6588   // Interleave if this is a large loop (small loops are already dealt with by
6589   // this point) that could benefit from interleaving.
6590   if (AggressivelyInterleaveReductions) {
6591     LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
6592     return IC;
6593   }
6594
6595   LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
6596   return 1;
6597 }
6598
6599 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
6600 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
6601   // This function calculates the register usage by measuring the highest number
6602   // of values that are alive at a single location. Obviously, this is a very
6603   // rough estimation. We scan the loop in topological order and
6604   // assign a number to each instruction. We use RPO to ensure that defs are
6605   // met before their users. We assume that each instruction that has in-loop
6606   // users starts an interval. We record every time that an in-loop value is
6607   // used, so we have a list of the first and last occurrences of each
6608   // instruction. Next, we transpose this data structure into a multi-map that
6609   // holds the list of intervals that *end* at a specific location. This
6610   // multi-map allows us to perform a linear search. We scan the instructions linearly
6611   // and record each time that a new interval starts, by placing it in a set.
6612   // If we find this value in the multi-map then we remove it from the set.
6613   // The max register usage is the maximum size of the set.
6614   // We also search for instructions that are defined outside the loop, but are
6615   // used inside the loop. We need this number separately from the max-interval
6616   // usage number because, when we unroll, loop-invariant values do not take
6617   // more registers.
6618   LoopBlocksDFS DFS(TheLoop);
6619   DFS.perform(LI);
6620
6621   RegisterUsage RU;
6622
6623   // Each 'key' in the map opens a new interval. The values
6624   // of the map are the index of the 'last seen' usage of the
6625   // instruction that is the key.
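  // For illustration (hypothetical IR): in a block containing
  //   %a = load ...; %b = load ...; %c = add %a, %b; store %c, ...
  // the intervals for %a and %b both end at their use in %c, so both are
  // still open when %c is visited and the maximum local usage is 2.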
6626 using IntervalMap = DenseMap<Instruction *, unsigned>; 6627 6628 // Maps instruction to its index. 6629 SmallVector<Instruction *, 64> IdxToInstr; 6630 // Marks the end of each interval. 6631 IntervalMap EndPoint; 6632 // Saves the list of instruction indices that are used in the loop. 6633 SmallPtrSet<Instruction *, 8> Ends; 6634 // Saves the list of values that are used in the loop but are 6635 // defined outside the loop, such as arguments and constants. 6636 SmallPtrSet<Value *, 8> LoopInvariants; 6637 6638 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 6639 for (Instruction &I : BB->instructionsWithoutDebug()) { 6640 IdxToInstr.push_back(&I); 6641 6642 // Save the end location of each USE. 6643 for (Value *U : I.operands()) { 6644 auto *Instr = dyn_cast<Instruction>(U); 6645 6646 // Ignore non-instruction values such as arguments, constants, etc. 6647 if (!Instr) 6648 continue; 6649 6650 // If this instruction is outside the loop then record it and continue. 6651 if (!TheLoop->contains(Instr)) { 6652 LoopInvariants.insert(Instr); 6653 continue; 6654 } 6655 6656 // Overwrite previous end points. 6657 EndPoint[Instr] = IdxToInstr.size(); 6658 Ends.insert(Instr); 6659 } 6660 } 6661 } 6662 6663 // Saves the list of intervals that end with the index in 'key'. 6664 using InstrList = SmallVector<Instruction *, 2>; 6665 DenseMap<unsigned, InstrList> TransposeEnds; 6666 6667 // Transpose the EndPoints to a list of values that end at each index. 6668 for (auto &Interval : EndPoint) 6669 TransposeEnds[Interval.second].push_back(Interval.first); 6670 6671 SmallPtrSet<Instruction *, 8> OpenIntervals; 6672 SmallVector<RegisterUsage, 8> RUs(VFs.size()); 6673 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size()); 6674 6675 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); 6676 6677 // A lambda that gets the register usage for the given type and VF. 6678 const auto &TTICapture = TTI; 6679 auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned { 6680 if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty)) 6681 return 0; 6682 InstructionCost::CostType RegUsage = 6683 *TTICapture.getRegUsageForType(VectorType::get(Ty, VF)).getValue(); 6684 assert(RegUsage >= 0 && RegUsage <= std::numeric_limits<unsigned>::max() && 6685 "Nonsensical values for register usage."); 6686 return RegUsage; 6687 }; 6688 6689 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { 6690 Instruction *I = IdxToInstr[i]; 6691 6692 // Remove all of the instructions that end at this location. 6693 InstrList &List = TransposeEnds[i]; 6694 for (Instruction *ToRemove : List) 6695 OpenIntervals.erase(ToRemove); 6696 6697 // Ignore instructions that are never used within the loop. 6698 if (!Ends.count(I)) 6699 continue; 6700 6701 // Skip ignored values. 6702 if (ValuesToIgnore.count(I)) 6703 continue; 6704 6705 // For each VF find the maximum usage of registers. 6706 for (unsigned j = 0, e = VFs.size(); j < e; ++j) { 6707 // Count the number of live intervals. 6708 SmallMapVector<unsigned, unsigned, 4> RegUsage; 6709 6710 if (VFs[j].isScalar()) { 6711 for (auto Inst : OpenIntervals) { 6712 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6713 if (RegUsage.find(ClassID) == RegUsage.end()) 6714 RegUsage[ClassID] = 1; 6715 else 6716 RegUsage[ClassID] += 1; 6717 } 6718 } else { 6719 collectUniformsAndScalars(VFs[j]); 6720 for (auto Inst : OpenIntervals) { 6721 // Skip ignored values for VF > 1. 
6722 if (VecValuesToIgnore.count(Inst)) 6723 continue; 6724 if (isScalarAfterVectorization(Inst, VFs[j])) { 6725 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6726 if (RegUsage.find(ClassID) == RegUsage.end()) 6727 RegUsage[ClassID] = 1; 6728 else 6729 RegUsage[ClassID] += 1; 6730 } else { 6731 unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType()); 6732 if (RegUsage.find(ClassID) == RegUsage.end()) 6733 RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]); 6734 else 6735 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); 6736 } 6737 } 6738 } 6739 6740 for (auto& pair : RegUsage) { 6741 if (MaxUsages[j].find(pair.first) != MaxUsages[j].end()) 6742 MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second); 6743 else 6744 MaxUsages[j][pair.first] = pair.second; 6745 } 6746 } 6747 6748 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " 6749 << OpenIntervals.size() << '\n'); 6750 6751 // Add the current instruction to the list of open intervals. 6752 OpenIntervals.insert(I); 6753 } 6754 6755 for (unsigned i = 0, e = VFs.size(); i < e; ++i) { 6756 SmallMapVector<unsigned, unsigned, 4> Invariant; 6757 6758 for (auto Inst : LoopInvariants) { 6759 unsigned Usage = 6760 VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]); 6761 unsigned ClassID = 6762 TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType()); 6763 if (Invariant.find(ClassID) == Invariant.end()) 6764 Invariant[ClassID] = Usage; 6765 else 6766 Invariant[ClassID] += Usage; 6767 } 6768 6769 LLVM_DEBUG({ 6770 dbgs() << "LV(REG): VF = " << VFs[i] << '\n'; 6771 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size() 6772 << " item\n"; 6773 for (const auto &pair : MaxUsages[i]) { 6774 dbgs() << "LV(REG): RegisterClass: " 6775 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6776 << " registers\n"; 6777 } 6778 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size() 6779 << " item\n"; 6780 for (const auto &pair : Invariant) { 6781 dbgs() << "LV(REG): RegisterClass: " 6782 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6783 << " registers\n"; 6784 } 6785 }); 6786 6787 RU.LoopInvariantRegs = Invariant; 6788 RU.MaxLocalUsers = MaxUsages[i]; 6789 RUs[i] = RU; 6790 } 6791 6792 return RUs; 6793 } 6794 6795 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){ 6796 // TODO: Cost model for emulated masked load/store is completely 6797 // broken. This hack guides the cost model to use an artificially 6798 // high enough value to practically disable vectorization with such 6799 // operations, except where previously deployed legality hack allowed 6800 // using very low cost values. This is to avoid regressions coming simply 6801 // from moving "masked load/store" check from legality to cost model. 6802 // Masked Load/Gather emulation was previously never allowed. 6803 // Limited number of Masked Store/Scatter emulation was allowed. 6804 assert(isPredicatedInst(I) && 6805 "Expecting a scalar emulated instruction"); 6806 return isa<LoadInst>(I) || 6807 (isa<StoreInst>(I) && 6808 NumPredStores > NumberOfStoresToPredicate); 6809 } 6810 6811 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) { 6812 // If we aren't vectorizing the loop, or if we've already collected the 6813 // instructions to scalarize, there's nothing to do. Collection may already 6814 // have occurred if we have a user-selected VF and are now computing the 6815 // expected cost for interleaving. 
6816 if (VF.isScalar() || VF.isZero() || 6817 InstsToScalarize.find(VF) != InstsToScalarize.end()) 6818 return; 6819 6820 // Initialize a mapping for VF in InstsToScalarize. If we find that it's 6821 // not profitable to scalarize any instructions, the presence of VF in the 6822 // map will indicate that we've analyzed it already. 6823 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF]; 6824 6825 // Find all the instructions that are scalar with predication in the loop and 6826 // determine if it would be better to not if-convert the blocks they are in. 6827 // If so, we also record the instructions to scalarize. 6828 for (BasicBlock *BB : TheLoop->blocks()) { 6829 if (!blockNeedsPredication(BB)) 6830 continue; 6831 for (Instruction &I : *BB) 6832 if (isScalarWithPredication(&I)) { 6833 ScalarCostsTy ScalarCosts; 6834 // Do not apply discount if scalable, because that would lead to 6835 // invalid scalarization costs. 6836 // Do not apply discount logic if hacked cost is needed 6837 // for emulated masked memrefs. 6838 if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I) && 6839 computePredInstDiscount(&I, ScalarCosts, VF) >= 0) 6840 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end()); 6841 // Remember that BB will remain after vectorization. 6842 PredicatedBBsAfterVectorization.insert(BB); 6843 } 6844 } 6845 } 6846 6847 int LoopVectorizationCostModel::computePredInstDiscount( 6848 Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) { 6849 assert(!isUniformAfterVectorization(PredInst, VF) && 6850 "Instruction marked uniform-after-vectorization will be predicated"); 6851 6852 // Initialize the discount to zero, meaning that the scalar version and the 6853 // vector version cost the same. 6854 InstructionCost Discount = 0; 6855 6856 // Holds instructions to analyze. The instructions we visit are mapped in 6857 // ScalarCosts. Those instructions are the ones that would be scalarized if 6858 // we find that the scalar version costs less. 6859 SmallVector<Instruction *, 8> Worklist; 6860 6861 // Returns true if the given instruction can be scalarized. 6862 auto canBeScalarized = [&](Instruction *I) -> bool { 6863 // We only attempt to scalarize instructions forming a single-use chain 6864 // from the original predicated block that would otherwise be vectorized. 6865 // Although not strictly necessary, we give up on instructions we know will 6866 // already be scalar to avoid traversing chains that are unlikely to be 6867 // beneficial. 6868 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() || 6869 isScalarAfterVectorization(I, VF)) 6870 return false; 6871 6872 // If the instruction is scalar with predication, it will be analyzed 6873 // separately. We ignore it within the context of PredInst. 6874 if (isScalarWithPredication(I)) 6875 return false; 6876 6877 // If any of the instruction's operands are uniform after vectorization, 6878 // the instruction cannot be scalarized. This prevents, for example, a 6879 // masked load from being scalarized. 6880 // 6881 // We assume we will only emit a value for lane zero of an instruction 6882 // marked uniform after vectorization, rather than VF identical values. 6883 // Thus, if we scalarize an instruction that uses a uniform, we would 6884 // create uses of values corresponding to the lanes we aren't emitting code 6885 // for. This behavior can be changed by allowing getScalarValue to clone 6886 // the lane zero values for uniforms rather than asserting.
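    // Illustrative sketch (editorial example, not part of the original
    // source): if %x is uniform-after-vectorization and feeds
    //   %y = add i32 %x, %n
    // then scalarizing %y would require a per-lane copy of %x, but only
    // lane zero of %x is emitted, so the operand check below rejects %y.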
6887 for (Use &U : I->operands()) 6888 if (auto *J = dyn_cast<Instruction>(U.get())) 6889 if (isUniformAfterVectorization(J, VF)) 6890 return false; 6891 6892 // Otherwise, we can scalarize the instruction. 6893 return true; 6894 }; 6895 6896 // Compute the expected cost discount from scalarizing the entire expression 6897 // feeding the predicated instruction. We currently only consider expressions 6898 // that are single-use instruction chains. 6899 Worklist.push_back(PredInst); 6900 while (!Worklist.empty()) { 6901 Instruction *I = Worklist.pop_back_val(); 6902 6903 // If we've already analyzed the instruction, there's nothing to do. 6904 if (ScalarCosts.find(I) != ScalarCosts.end()) 6905 continue; 6906 6907 // Compute the cost of the vector instruction. Note that this cost already 6908 // includes the scalarization overhead of the predicated instruction. 6909 InstructionCost VectorCost = getInstructionCost(I, VF).first; 6910 6911 // Compute the cost of the scalarized instruction. This cost is the cost of 6912 // the instruction as if it wasn't if-converted and instead remained in the 6913 // predicated block. We will scale this cost by block probability after 6914 // computing the scalarization overhead. 6915 InstructionCost ScalarCost = 6916 VF.getFixedValue() * 6917 getInstructionCost(I, ElementCount::getFixed(1)).first; 6918 6919 // Compute the scalarization overhead of needed insertelement instructions 6920 // and phi nodes. 6921 if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) { 6922 ScalarCost += TTI.getScalarizationOverhead( 6923 cast<VectorType>(ToVectorTy(I->getType(), VF)), 6924 APInt::getAllOnes(VF.getFixedValue()), true, false); 6925 ScalarCost += 6926 VF.getFixedValue() * 6927 TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput); 6928 } 6929 6930 // Compute the scalarization overhead of needed extractelement 6931 // instructions. For each of the instruction's operands, if the operand can 6932 // be scalarized, add it to the worklist; otherwise, account for the 6933 // overhead. 6934 for (Use &U : I->operands()) 6935 if (auto *J = dyn_cast<Instruction>(U.get())) { 6936 assert(VectorType::isValidElementType(J->getType()) && 6937 "Instruction has non-scalar type"); 6938 if (canBeScalarized(J)) 6939 Worklist.push_back(J); 6940 else if (needsExtract(J, VF)) { 6941 ScalarCost += TTI.getScalarizationOverhead( 6942 cast<VectorType>(ToVectorTy(J->getType(), VF)), 6943 APInt::getAllOnes(VF.getFixedValue()), false, true); 6944 } 6945 } 6946 6947 // Scale the total scalar cost by block probability. 6948 ScalarCost /= getReciprocalPredBlockProb(); 6949 6950 // Compute the discount. A non-negative discount means the vector version 6951 // of the instruction costs more, and scalarizing would be beneficial. 6952 Discount += VectorCost - ScalarCost; 6953 ScalarCosts[I] = ScalarCost; 6954 } 6955 6956 return *Discount.getValue(); 6957 } 6958 6959 LoopVectorizationCostModel::VectorizationCostTy 6960 LoopVectorizationCostModel::expectedCost( 6961 ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) { 6962 VectorizationCostTy Cost; 6963 6964 // For each block. 6965 for (BasicBlock *BB : TheLoop->blocks()) { 6966 VectorizationCostTy BlockCost; 6967 6968 // For each instruction in the old loop. 6969 for (Instruction &I : BB->instructionsWithoutDebug()) { 6970 // Skip ignored values. 
6971 if (ValuesToIgnore.count(&I) || 6972 (VF.isVector() && VecValuesToIgnore.count(&I))) 6973 continue; 6974 6975 VectorizationCostTy C = getInstructionCost(&I, VF); 6976 6977 // Check if we should override the cost. 6978 if (C.first.isValid() && 6979 ForceTargetInstructionCost.getNumOccurrences() > 0) 6980 C.first = InstructionCost(ForceTargetInstructionCost); 6981 6982 // Keep a list of instructions with invalid costs. 6983 if (Invalid && !C.first.isValid()) 6984 Invalid->emplace_back(&I, VF); 6985 6986 BlockCost.first += C.first; 6987 BlockCost.second |= C.second; 6988 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first 6989 << " for VF " << VF << " For instruction: " << I 6990 << '\n'); 6991 } 6992 6993 // If we are vectorizing a predicated block, it will have been 6994 // if-converted. This means that the block's instructions (aside from 6995 // stores and instructions that may divide by zero) will now be 6996 // unconditionally executed. For the scalar case, we may not always execute 6997 // the predicated block, if it is an if-else block. Thus, scale the block's 6998 // cost by the probability of executing it. blockNeedsPredication from 6999 // Legal is used so as to not include all blocks in tail folded loops. 7000 if (VF.isScalar() && Legal->blockNeedsPredication(BB)) 7001 BlockCost.first /= getReciprocalPredBlockProb(); 7002 7003 Cost.first += BlockCost.first; 7004 Cost.second |= BlockCost.second; 7005 } 7006 7007 return Cost; 7008 } 7009 7010 /// Gets Address Access SCEV after verifying that the access pattern 7011 /// is loop invariant except the induction variable dependence. 7012 /// 7013 /// This SCEV can be sent to the Target in order to estimate the address 7014 /// calculation cost. 7015 static const SCEV *getAddressAccessSCEV( 7016 Value *Ptr, 7017 LoopVectorizationLegality *Legal, 7018 PredicatedScalarEvolution &PSE, 7019 const Loop *TheLoop) { 7020 7021 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); 7022 if (!Gep) 7023 return nullptr; 7024 7025 // We are looking for a gep with all loop invariant indices except for one 7026 // which should be an induction variable. 7027 auto SE = PSE.getSE(); 7028 unsigned NumOperands = Gep->getNumOperands(); 7029 for (unsigned i = 1; i < NumOperands; ++i) { 7030 Value *Opd = Gep->getOperand(i); 7031 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 7032 !Legal->isInductionVariable(Opd)) 7033 return nullptr; 7034 } 7035 7036 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. 7037 return PSE.getSCEV(Ptr); 7038 } 7039 7040 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { 7041 return Legal->hasStride(I->getOperand(0)) || 7042 Legal->hasStride(I->getOperand(1)); 7043 } 7044 7045 InstructionCost 7046 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, 7047 ElementCount VF) { 7048 assert(VF.isVector() && 7049 "Scalarization cost of instruction implies vectorization."); 7050 if (VF.isScalable()) 7051 return InstructionCost::getInvalid(); 7052 7053 Type *ValTy = getLoadStoreType(I); 7054 auto SE = PSE.getSE(); 7055 7056 unsigned AS = getLoadStoreAddressSpace(I); 7057 Value *Ptr = getLoadStorePointerOperand(I); 7058 Type *PtrTy = ToVectorTy(Ptr->getType(), VF); 7059 7060 // Figure out whether the access is strided and get the stride value 7061 // if it's known in compile time 7062 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); 7063 7064 // Get the cost of the scalar memory instruction and address computation. 
7065 InstructionCost Cost = 7066 VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); 7067 7068 // Don't pass *I here, since it is scalar but will actually be part of a 7069 // vectorized loop where the user of it is a vectorized instruction. 7070 const Align Alignment = getLoadStoreAlignment(I); 7071 Cost += VF.getKnownMinValue() * 7072 TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, 7073 AS, TTI::TCK_RecipThroughput); 7074 7075 // Get the overhead of the extractelement and insertelement instructions 7076 // we might create due to scalarization. 7077 Cost += getScalarizationOverhead(I, VF); 7078 7079 // If we have a predicated load/store, it will need extra i1 extracts and 7080 // conditional branches, but may not be executed for each vector lane. Scale 7081 // the cost by the probability of executing the predicated block. 7082 if (isPredicatedInst(I)) { 7083 Cost /= getReciprocalPredBlockProb(); 7084 7085 // Add the cost of an i1 extract and a branch 7086 auto *Vec_i1Ty = 7087 VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF); 7088 Cost += TTI.getScalarizationOverhead( 7089 Vec_i1Ty, APInt::getAllOnes(VF.getKnownMinValue()), 7090 /*Insert=*/false, /*Extract=*/true); 7091 Cost += TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput); 7092 7093 if (useEmulatedMaskMemRefHack(I)) 7094 // Artificially setting to a high enough value to practically disable 7095 // vectorization with such operations. 7096 Cost = 3000000; 7097 } 7098 7099 return Cost; 7100 } 7101 7102 InstructionCost 7103 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, 7104 ElementCount VF) { 7105 Type *ValTy = getLoadStoreType(I); 7106 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 7107 Value *Ptr = getLoadStorePointerOperand(I); 7108 unsigned AS = getLoadStoreAddressSpace(I); 7109 int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr); 7110 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 7111 7112 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 7113 "Stride should be 1 or -1 for consecutive memory access"); 7114 const Align Alignment = getLoadStoreAlignment(I); 7115 InstructionCost Cost = 0; 7116 if (Legal->isMaskRequired(I)) 7117 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 7118 CostKind); 7119 else 7120 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 7121 CostKind, I); 7122 7123 bool Reverse = ConsecutiveStride < 0; 7124 if (Reverse) 7125 Cost += 7126 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0); 7127 return Cost; 7128 } 7129 7130 InstructionCost 7131 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, 7132 ElementCount VF) { 7133 assert(Legal->isUniformMemOp(*I)); 7134 7135 Type *ValTy = getLoadStoreType(I); 7136 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 7137 const Align Alignment = getLoadStoreAlignment(I); 7138 unsigned AS = getLoadStoreAddressSpace(I); 7139 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 7140 if (isa<LoadInst>(I)) { 7141 return TTI.getAddressComputationCost(ValTy) + 7142 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS, 7143 CostKind) + 7144 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); 7145 } 7146 StoreInst *SI = cast<StoreInst>(I); 7147 7148 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); 7149 return TTI.getAddressComputationCost(ValTy) + 7150 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, 
AS, 7151 CostKind) + 7152 (isLoopInvariantStoreValue 7153 ? 0 7154 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, 7155 VF.getKnownMinValue() - 1)); 7156 } 7157 7158 InstructionCost 7159 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, 7160 ElementCount VF) { 7161 Type *ValTy = getLoadStoreType(I); 7162 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 7163 const Align Alignment = getLoadStoreAlignment(I); 7164 const Value *Ptr = getLoadStorePointerOperand(I); 7165 7166 return TTI.getAddressComputationCost(VectorTy) + 7167 TTI.getGatherScatterOpCost( 7168 I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment, 7169 TargetTransformInfo::TCK_RecipThroughput, I); 7170 } 7171 7172 InstructionCost 7173 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, 7174 ElementCount VF) { 7175 // TODO: Once we have support for interleaving with scalable vectors 7176 // we can calculate the cost properly here. 7177 if (VF.isScalable()) 7178 return InstructionCost::getInvalid(); 7179 7180 Type *ValTy = getLoadStoreType(I); 7181 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 7182 unsigned AS = getLoadStoreAddressSpace(I); 7183 7184 auto Group = getInterleavedAccessGroup(I); 7185 assert(Group && "Fail to get an interleaved access group."); 7186 7187 unsigned InterleaveFactor = Group->getFactor(); 7188 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); 7189 7190 // Holds the indices of existing members in the interleaved group. 7191 SmallVector<unsigned, 4> Indices; 7192 for (unsigned IF = 0; IF < InterleaveFactor; IF++) 7193 if (Group->getMember(IF)) 7194 Indices.push_back(IF); 7195 7196 // Calculate the cost of the whole interleaved group. 7197 bool UseMaskForGaps = 7198 (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) || 7199 (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor())); 7200 InstructionCost Cost = TTI.getInterleavedMemoryOpCost( 7201 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(), 7202 AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps); 7203 7204 if (Group->isReverse()) { 7205 // TODO: Add support for reversed masked interleaved access. 7206 assert(!Legal->isMaskRequired(I) && 7207 "Reverse masked interleaved access not supported."); 7208 Cost += 7209 Group->getNumMembers() * 7210 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0); 7211 } 7212 return Cost; 7213 } 7214 7215 Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost( 7216 Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) { 7217 using namespace llvm::PatternMatch; 7218 // Early exit for no inloop reductions 7219 if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty)) 7220 return None; 7221 auto *VectorTy = cast<VectorType>(Ty); 7222 7223 // We are looking for a pattern of, and finding the minimal acceptable cost: 7224 // reduce(mul(ext(A), ext(B))) or 7225 // reduce(mul(A, B)) or 7226 // reduce(ext(A)) or 7227 // reduce(A). 7228 // The basic idea is that we walk down the tree to do that, finding the root 7229 // reduction instruction in InLoopReductionImmediateChains. From there we find 7230 // the pattern of mul/ext and test the cost of the entire pattern vs the cost 7231 // of the components. If the reduction cost is lower then we return it for the 7232 // reduction instruction and 0 for the other instructions in the pattern. 
If 7233 // it is not, we return an invalid cost specifying the original cost method 7234 // should be used. 7235 Instruction *RetI = I; 7236 if (match(RetI, m_ZExtOrSExt(m_Value()))) { 7237 if (!RetI->hasOneUser()) 7238 return None; 7239 RetI = RetI->user_back(); 7240 } 7241 if (match(RetI, m_Mul(m_Value(), m_Value())) && 7242 RetI->user_back()->getOpcode() == Instruction::Add) { 7243 if (!RetI->hasOneUser()) 7244 return None; 7245 RetI = RetI->user_back(); 7246 } 7247 7248 // Test if the found instruction is a reduction, and if not return an invalid 7249 // cost specifying the parent to use the original cost modelling. 7250 if (!InLoopReductionImmediateChains.count(RetI)) 7251 return None; 7252 7253 // Find the reduction this chain is a part of and calculate the basic cost of 7254 // the reduction on its own. 7255 Instruction *LastChain = InLoopReductionImmediateChains[RetI]; 7256 Instruction *ReductionPhi = LastChain; 7257 while (!isa<PHINode>(ReductionPhi)) 7258 ReductionPhi = InLoopReductionImmediateChains[ReductionPhi]; 7259 7260 const RecurrenceDescriptor &RdxDesc = 7261 Legal->getReductionVars()[cast<PHINode>(ReductionPhi)]; 7262 7263 InstructionCost BaseCost = TTI.getArithmeticReductionCost( 7264 RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind); 7265 7266 // If we're using ordered reductions then we can just return the base cost 7267 // here, since getArithmeticReductionCost calculates the full ordered 7268 // reduction cost when FP reassociation is not allowed. 7269 if (useOrderedReductions(RdxDesc)) 7270 return BaseCost; 7271 7272 // Get the operand that was not the reduction chain and match it to one of the 7273 // patterns, returning the better cost if it is found. 7274 Instruction *RedOp = RetI->getOperand(1) == LastChain 7275 ? dyn_cast<Instruction>(RetI->getOperand(0)) 7276 : dyn_cast<Instruction>(RetI->getOperand(1)); 7277 7278 VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy); 7279 7280 Instruction *Op0, *Op1; 7281 if (RedOp && 7282 match(RedOp, 7283 m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) && 7284 match(Op0, m_ZExtOrSExt(m_Value())) && 7285 Op0->getOpcode() == Op1->getOpcode() && 7286 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() && 7287 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) && 7288 (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) { 7289 7290 // Matched reduce(ext(mul(ext(A), ext(B)))) 7291 // Note that the extend opcodes need to all match, or if A==B they will have 7292 // been converted to zext(mul(sext(A), sext(A))) as it is known positive, 7293 // which is equally fine. 7294 bool IsUnsigned = isa<ZExtInst>(Op0); 7295 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy); 7296 auto *MulType = VectorType::get(Op0->getType(), VectorTy); 7297 7298 InstructionCost ExtCost = 7299 TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType, 7300 TTI::CastContextHint::None, CostKind, Op0); 7301 InstructionCost MulCost = 7302 TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind); 7303 InstructionCost Ext2Cost = 7304 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType, 7305 TTI::CastContextHint::None, CostKind, RedOp); 7306 7307 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 7308 /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 7309 CostKind); 7310 7311 if (RedCost.isValid() && 7312 RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost) 7313 return I == RetI ?
RedCost : 0; 7314 } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) && 7315 !TheLoop->isLoopInvariant(RedOp)) { 7316 // Matched reduce(ext(A)) 7317 bool IsUnsigned = isa<ZExtInst>(RedOp); 7318 auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy); 7319 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 7320 /*IsMLA=*/false, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 7321 CostKind); 7322 7323 InstructionCost ExtCost = 7324 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType, 7325 TTI::CastContextHint::None, CostKind, RedOp); 7326 if (RedCost.isValid() && RedCost < BaseCost + ExtCost) 7327 return I == RetI ? RedCost : 0; 7328 } else if (RedOp && 7329 match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) { 7330 if (match(Op0, m_ZExtOrSExt(m_Value())) && 7331 Op0->getOpcode() == Op1->getOpcode() && 7332 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() && 7333 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) { 7334 bool IsUnsigned = isa<ZExtInst>(Op0); 7335 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy); 7336 // Matched reduce(mul(ext, ext)) 7337 InstructionCost ExtCost = 7338 TTI.getCastInstrCost(Op0->getOpcode(), VectorTy, ExtType, 7339 TTI::CastContextHint::None, CostKind, Op0); 7340 InstructionCost MulCost = 7341 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 7342 7343 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 7344 /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 7345 CostKind); 7346 7347 if (RedCost.isValid() && RedCost < ExtCost * 2 + MulCost + BaseCost) 7348 return I == RetI ? RedCost : 0; 7349 } else if (!match(I, m_ZExtOrSExt(m_Value()))) { 7350 // Matched reduce(mul()) 7351 InstructionCost MulCost = 7352 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 7353 7354 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 7355 /*IsMLA=*/true, true, RdxDesc.getRecurrenceType(), VectorTy, 7356 CostKind); 7357 7358 if (RedCost.isValid() && RedCost < MulCost + BaseCost) 7359 return I == RetI ? RedCost : 0; 7360 } 7361 } 7362 7363 return I == RetI ? Optional<InstructionCost>(BaseCost) : None; 7364 } 7365 7366 InstructionCost 7367 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, 7368 ElementCount VF) { 7369 // Calculate scalar cost only. Vectorization cost should be ready at this 7370 // moment. 7371 if (VF.isScalar()) { 7372 Type *ValTy = getLoadStoreType(I); 7373 const Align Alignment = getLoadStoreAlignment(I); 7374 unsigned AS = getLoadStoreAddressSpace(I); 7375 7376 return TTI.getAddressComputationCost(ValTy) + 7377 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, 7378 TTI::TCK_RecipThroughput, I); 7379 } 7380 return getWideningCost(I, VF); 7381 } 7382 7383 LoopVectorizationCostModel::VectorizationCostTy 7384 LoopVectorizationCostModel::getInstructionCost(Instruction *I, 7385 ElementCount VF) { 7386 // If we know that this instruction will remain uniform, check the cost of 7387 // the scalar version. 7388 if (isUniformAfterVectorization(I, VF)) 7389 VF = ElementCount::getFixed(1); 7390 7391 if (VF.isVector() && isProfitableToScalarize(I, VF)) 7392 return VectorizationCostTy(InstsToScalarize[VF][I], false); 7393 7394 // Forced scalars do not have any scalarization overhead. 
7395 auto ForcedScalar = ForcedScalars.find(VF); 7396 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) { 7397 auto InstSet = ForcedScalar->second; 7398 if (InstSet.count(I)) 7399 return VectorizationCostTy( 7400 (getInstructionCost(I, ElementCount::getFixed(1)).first * 7401 VF.getKnownMinValue()), 7402 false); 7403 } 7404 7405 Type *VectorTy; 7406 InstructionCost C = getInstructionCost(I, VF, VectorTy); 7407 7408 bool TypeNotScalarized = 7409 VF.isVector() && VectorTy->isVectorTy() && 7410 TTI.getNumberOfParts(VectorTy) < VF.getKnownMinValue(); 7411 return VectorizationCostTy(C, TypeNotScalarized); 7412 } 7413 7414 InstructionCost 7415 LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, 7416 ElementCount VF) const { 7417 7418 // There is no mechanism yet to create a scalable scalarization loop, 7419 // so this is currently Invalid. 7420 if (VF.isScalable()) 7421 return InstructionCost::getInvalid(); 7422 7423 if (VF.isScalar()) 7424 return 0; 7425 7426 InstructionCost Cost = 0; 7427 Type *RetTy = ToVectorTy(I->getType(), VF); 7428 if (!RetTy->isVoidTy() && 7429 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) 7430 Cost += TTI.getScalarizationOverhead( 7431 cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()), true, 7432 false); 7433 7434 // Some targets keep addresses scalar. 7435 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) 7436 return Cost; 7437 7438 // Some targets support efficient element stores. 7439 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore()) 7440 return Cost; 7441 7442 // Collect operands to consider. 7443 CallInst *CI = dyn_cast<CallInst>(I); 7444 Instruction::op_range Ops = CI ? CI->args() : I->operands(); 7445 7446 // Skip operands that do not require extraction/scalarization and do not incur 7447 // any overhead. 7448 SmallVector<Type *> Tys; 7449 for (auto *V : filterExtractingOperands(Ops, VF)) 7450 Tys.push_back(MaybeVectorizeType(V->getType(), VF)); 7451 return Cost + TTI.getOperandsScalarizationOverhead( 7452 filterExtractingOperands(Ops, VF), Tys); 7453 } 7454 7455 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { 7456 if (VF.isScalar()) 7457 return; 7458 NumPredStores = 0; 7459 for (BasicBlock *BB : TheLoop->blocks()) { 7460 // For each instruction in the old loop. 7461 for (Instruction &I : *BB) { 7462 Value *Ptr = getLoadStorePointerOperand(&I); 7463 if (!Ptr) 7464 continue; 7465 7466 // TODO: We should generate better code and update the cost model for 7467 // predicated uniform stores. Today they are treated as any other 7468 // predicated store (see added test cases in 7469 // invariant-store-vectorization.ll). 7470 if (isa<StoreInst>(&I) && isScalarWithPredication(&I)) 7471 NumPredStores++; 7472 7473 if (Legal->isUniformMemOp(I)) { 7474 // TODO: Avoid replicating loads and stores instead of 7475 // relying on instcombine to remove them. 7476 // Load: Scalar load + broadcast 7477 // Store: Scalar store + isLoopInvariantStoreValue ? 
0 : extract 7478 InstructionCost Cost; 7479 if (isa<StoreInst>(&I) && VF.isScalable() && 7480 isLegalGatherOrScatter(&I)) { 7481 Cost = getGatherScatterCost(&I, VF); 7482 setWideningDecision(&I, VF, CM_GatherScatter, Cost); 7483 } else { 7484 assert((isa<LoadInst>(&I) || !VF.isScalable()) && 7485 "Cannot yet scalarize uniform stores"); 7486 Cost = getUniformMemOpCost(&I, VF); 7487 setWideningDecision(&I, VF, CM_Scalarize, Cost); 7488 } 7489 continue; 7490 } 7491 7492 // We assume that widening is the best solution when possible. 7493 if (memoryInstructionCanBeWidened(&I, VF)) { 7494 InstructionCost Cost = getConsecutiveMemOpCost(&I, VF); 7495 int ConsecutiveStride = Legal->isConsecutivePtr( 7496 getLoadStoreType(&I), getLoadStorePointerOperand(&I)); 7497 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 7498 "Expected consecutive stride."); 7499 InstWidening Decision = 7500 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; 7501 setWideningDecision(&I, VF, Decision, Cost); 7502 continue; 7503 } 7504 7505 // Choose between Interleaving, Gather/Scatter or Scalarization. 7506 InstructionCost InterleaveCost = InstructionCost::getInvalid(); 7507 unsigned NumAccesses = 1; 7508 if (isAccessInterleaved(&I)) { 7509 auto Group = getInterleavedAccessGroup(&I); 7510 assert(Group && "Fail to get an interleaved access group."); 7511 7512 // Make one decision for the whole group. 7513 if (getWideningDecision(&I, VF) != CM_Unknown) 7514 continue; 7515 7516 NumAccesses = Group->getNumMembers(); 7517 if (interleavedAccessCanBeWidened(&I, VF)) 7518 InterleaveCost = getInterleaveGroupCost(&I, VF); 7519 } 7520 7521 InstructionCost GatherScatterCost = 7522 isLegalGatherOrScatter(&I) 7523 ? getGatherScatterCost(&I, VF) * NumAccesses 7524 : InstructionCost::getInvalid(); 7525 7526 InstructionCost ScalarizationCost = 7527 getMemInstScalarizationCost(&I, VF) * NumAccesses; 7528 7529 // Choose better solution for the current VF, 7530 // write down this decision and use it during vectorization. 7531 InstructionCost Cost; 7532 InstWidening Decision; 7533 if (InterleaveCost <= GatherScatterCost && 7534 InterleaveCost < ScalarizationCost) { 7535 Decision = CM_Interleave; 7536 Cost = InterleaveCost; 7537 } else if (GatherScatterCost < ScalarizationCost) { 7538 Decision = CM_GatherScatter; 7539 Cost = GatherScatterCost; 7540 } else { 7541 Decision = CM_Scalarize; 7542 Cost = ScalarizationCost; 7543 } 7544 // If the instructions belongs to an interleave group, the whole group 7545 // receives the same decision. The whole group receives the cost, but 7546 // the cost will actually be assigned to one instruction. 7547 if (auto Group = getInterleavedAccessGroup(&I)) 7548 setWideningDecision(Group, VF, Decision, Cost); 7549 else 7550 setWideningDecision(&I, VF, Decision, Cost); 7551 } 7552 } 7553 7554 // Make sure that any load of address and any other address computation 7555 // remains scalar unless there is gather/scatter support. This avoids 7556 // inevitable extracts into address registers, and also has the benefit of 7557 // activating LSR more, since that pass can't optimize vectorized 7558 // addresses. 7559 if (TTI.prefersVectorizedAddressing()) 7560 return; 7561 7562 // Start with all scalar pointer uses. 
7563 SmallPtrSet<Instruction *, 8> AddrDefs; 7564 for (BasicBlock *BB : TheLoop->blocks()) 7565 for (Instruction &I : *BB) { 7566 Instruction *PtrDef = 7567 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 7568 if (PtrDef && TheLoop->contains(PtrDef) && 7569 getWideningDecision(&I, VF) != CM_GatherScatter) 7570 AddrDefs.insert(PtrDef); 7571 } 7572 7573 // Add all instructions used to generate the addresses. 7574 SmallVector<Instruction *, 4> Worklist; 7575 append_range(Worklist, AddrDefs); 7576 while (!Worklist.empty()) { 7577 Instruction *I = Worklist.pop_back_val(); 7578 for (auto &Op : I->operands()) 7579 if (auto *InstOp = dyn_cast<Instruction>(Op)) 7580 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) && 7581 AddrDefs.insert(InstOp).second) 7582 Worklist.push_back(InstOp); 7583 } 7584 7585 for (auto *I : AddrDefs) { 7586 if (isa<LoadInst>(I)) { 7587 // Setting the desired widening decision should ideally be handled 7588 // by cost functions, but since this involves the task of finding out 7589 // if the loaded register is involved in an address computation, it is 7590 // instead changed here when we know this is the case. 7591 InstWidening Decision = getWideningDecision(I, VF); 7592 if (Decision == CM_Widen || Decision == CM_Widen_Reverse) 7593 // Scalarize a widened load of address. 7594 setWideningDecision( 7595 I, VF, CM_Scalarize, 7596 (VF.getKnownMinValue() * 7597 getMemoryInstructionCost(I, ElementCount::getFixed(1)))); 7598 else if (auto Group = getInterleavedAccessGroup(I)) { 7599 // Scalarize an interleave group of address loads. 7600 for (unsigned I = 0; I < Group->getFactor(); ++I) { 7601 if (Instruction *Member = Group->getMember(I)) 7602 setWideningDecision( 7603 Member, VF, CM_Scalarize, 7604 (VF.getKnownMinValue() * 7605 getMemoryInstructionCost(Member, ElementCount::getFixed(1)))); 7606 } 7607 } 7608 } else 7609 // Make sure I gets scalarized and a cost estimate without 7610 // scalarization overhead. 7611 ForcedScalars[VF].insert(I); 7612 } 7613 } 7614 7615 InstructionCost 7616 LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, 7617 Type *&VectorTy) { 7618 Type *RetTy = I->getType(); 7619 if (canTruncateToMinimalBitwidth(I, VF)) 7620 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); 7621 auto SE = PSE.getSE(); 7622 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 7623 7624 auto hasSingleCopyAfterVectorization = [this](Instruction *I, 7625 ElementCount VF) -> bool { 7626 if (VF.isScalar()) 7627 return true; 7628 7629 auto Scalarized = InstsToScalarize.find(VF); 7630 assert(Scalarized != InstsToScalarize.end() && 7631 "VF not yet analyzed for scalarization profitability"); 7632 return !Scalarized->second.count(I) && 7633 llvm::all_of(I->users(), [&](User *U) { 7634 auto *UI = cast<Instruction>(U); 7635 return !Scalarized->second.count(UI); 7636 }); 7637 }; 7638 (void) hasSingleCopyAfterVectorization; 7639 7640 if (isScalarAfterVectorization(I, VF)) { 7641 // With the exception of GEPs and PHIs, after scalarization there should 7642 // only be one copy of the instruction generated in the loop. This is 7643 // because the VF is either 1, or any instructions that need scalarizing 7644 // have already been dealt with by the time we get here. As a result, 7645 // we don't have to multiply the instruction cost by VF.
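    // Editorial sketch (not from the original source): with VF = 4, a GEP
    // computing per-lane addresses for scalarized loads may legitimately end
    // up with four copies, which is why the assert below exempts GEPs (and
    // PHIs); any other scalar-after-vectorization instruction is expected to
    // have a single copy, so its cost is charged once rather than VF times.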
7646 assert(I->getOpcode() == Instruction::GetElementPtr || 7647 I->getOpcode() == Instruction::PHI || 7648 (I->getOpcode() == Instruction::BitCast && 7649 I->getType()->isPointerTy()) || 7650 hasSingleCopyAfterVectorization(I, VF)); 7651 VectorTy = RetTy; 7652 } else 7653 VectorTy = ToVectorTy(RetTy, VF); 7654 7655 // TODO: We need to estimate the cost of intrinsic calls. 7656 switch (I->getOpcode()) { 7657 case Instruction::GetElementPtr: 7658 // We mark this instruction as zero-cost because the cost of GEPs in 7659 // vectorized code depends on whether the corresponding memory instruction 7660 // is scalarized or not. Therefore, we handle GEPs with the memory 7661 // instruction cost. 7662 return 0; 7663 case Instruction::Br: { 7664 // In cases of scalarized and predicated instructions, there will be VF 7665 // predicated blocks in the vectorized loop. Each branch around these 7666 // blocks requires also an extract of its vector compare i1 element. 7667 bool ScalarPredicatedBB = false; 7668 BranchInst *BI = cast<BranchInst>(I); 7669 if (VF.isVector() && BI->isConditional() && 7670 (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) || 7671 PredicatedBBsAfterVectorization.count(BI->getSuccessor(1)))) 7672 ScalarPredicatedBB = true; 7673 7674 if (ScalarPredicatedBB) { 7675 // Not possible to scalarize scalable vector with predicated instructions. 7676 if (VF.isScalable()) 7677 return InstructionCost::getInvalid(); 7678 // Return cost for branches around scalarized and predicated blocks. 7679 auto *Vec_i1Ty = 7680 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); 7681 return ( 7682 TTI.getScalarizationOverhead( 7683 Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()), false, true) + 7684 (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue())); 7685 } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar()) 7686 // The back-edge branch will remain, as will all scalar branches. 7687 return TTI.getCFInstrCost(Instruction::Br, CostKind); 7688 else 7689 // This branch will be eliminated by if-conversion. 7690 return 0; 7691 // Note: We currently assume zero cost for an unconditional branch inside 7692 // a predicated block since it will become a fall-through, although we 7693 // may decide in the future to call TTI for all branches. 7694 } 7695 case Instruction::PHI: { 7696 auto *Phi = cast<PHINode>(I); 7697 7698 // First-order recurrences are replaced by vector shuffles inside the loop. 7699 // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. 7700 if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi)) 7701 return TTI.getShuffleCost( 7702 TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy), 7703 None, VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1)); 7704 7705 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 7706 // converted into select instructions. We require N - 1 selects per phi 7707 // node, where N is the number of incoming values. 
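    // Illustrative sketch (editorial example, not part of the original
    // source): a three-input phi in a non-header block such as
    //   %p = phi i32 [ %a, %bb0 ], [ %b, %bb1 ], [ %c, %bb2 ]
    // is if-converted into N - 1 = 2 selects, roughly
    //   %s = select i1 %m0, i32 %a, i32 %b
    //   %p = select i1 %m1, i32 %s, i32 %c
    // so two select costs are charged below.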
7708 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) 7709 return (Phi->getNumIncomingValues() - 1) * 7710 TTI.getCmpSelInstrCost( 7711 Instruction::Select, ToVectorTy(Phi->getType(), VF), 7712 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF), 7713 CmpInst::BAD_ICMP_PREDICATE, CostKind); 7714 7715 return TTI.getCFInstrCost(Instruction::PHI, CostKind); 7716 } 7717 case Instruction::UDiv: 7718 case Instruction::SDiv: 7719 case Instruction::URem: 7720 case Instruction::SRem: 7721 // If we have a predicated instruction, it may not be executed for each 7722 // vector lane. Get the scalarization cost and scale this amount by the 7723 // probability of executing the predicated block. If the instruction is not 7724 // predicated, we fall through to the next case. 7725 if (VF.isVector() && isScalarWithPredication(I)) { 7726 InstructionCost Cost = 0; 7727 7728 // These instructions have a non-void type, so account for the phi nodes 7729 // that we will create. This cost is likely to be zero. The phi node 7730 // cost, if any, should be scaled by the block probability because it 7731 // models a copy at the end of each predicated block. 7732 Cost += VF.getKnownMinValue() * 7733 TTI.getCFInstrCost(Instruction::PHI, CostKind); 7734 7735 // The cost of the non-predicated instruction. 7736 Cost += VF.getKnownMinValue() * 7737 TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind); 7738 7739 // The cost of insertelement and extractelement instructions needed for 7740 // scalarization. 7741 Cost += getScalarizationOverhead(I, VF); 7742 7743 // Scale the cost by the probability of executing the predicated blocks. 7744 // This assumes the predicated block for each vector lane is equally 7745 // likely. 7746 return Cost / getReciprocalPredBlockProb(); 7747 } 7748 LLVM_FALLTHROUGH; 7749 case Instruction::Add: 7750 case Instruction::FAdd: 7751 case Instruction::Sub: 7752 case Instruction::FSub: 7753 case Instruction::Mul: 7754 case Instruction::FMul: 7755 case Instruction::FDiv: 7756 case Instruction::FRem: 7757 case Instruction::Shl: 7758 case Instruction::LShr: 7759 case Instruction::AShr: 7760 case Instruction::And: 7761 case Instruction::Or: 7762 case Instruction::Xor: { 7763 // Since we will replace the stride by 1, the multiplication should go away. 7764 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) 7765 return 0; 7766 7767 // Detect reduction patterns 7768 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7769 return *RedCost; 7770 7771 // Certain instructions can be cheaper to vectorize if they have a constant 7772 // second vector operand. One example of this is shifts on x86.
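    // Editorial sketch (not from the original source): for VF = 4,
    //   %r = shl <4 x i32> %x, <i32 3, i32 3, i32 3, i32 3>
    // is typically cheaper on x86 than a shift by a non-uniform vector
    // amount, which is why the kind and properties of Op2 are queried below
    // and passed to getArithmeticInstrCost.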
7773 Value *Op2 = I->getOperand(1); 7774 TargetTransformInfo::OperandValueProperties Op2VP; 7775 TargetTransformInfo::OperandValueKind Op2VK = 7776 TTI.getOperandInfo(Op2, Op2VP); 7777 if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) 7778 Op2VK = TargetTransformInfo::OK_UniformValue; 7779 7780 SmallVector<const Value *, 4> Operands(I->operand_values()); 7781 return TTI.getArithmeticInstrCost( 7782 I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, 7783 Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); 7784 } 7785 case Instruction::FNeg: { 7786 return TTI.getArithmeticInstrCost( 7787 I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, 7788 TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None, 7789 TargetTransformInfo::OP_None, I->getOperand(0), I); 7790 } 7791 case Instruction::Select: { 7792 SelectInst *SI = cast<SelectInst>(I); 7793 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 7794 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 7795 7796 const Value *Op0, *Op1; 7797 using namespace llvm::PatternMatch; 7798 if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) || 7799 match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) { 7800 // select x, y, false --> x & y 7801 // select x, true, y --> x | y 7802 TTI::OperandValueProperties Op1VP = TTI::OP_None; 7803 TTI::OperandValueProperties Op2VP = TTI::OP_None; 7804 TTI::OperandValueKind Op1VK = TTI::getOperandInfo(Op0, Op1VP); 7805 TTI::OperandValueKind Op2VK = TTI::getOperandInfo(Op1, Op2VP); 7806 assert(Op0->getType()->getScalarSizeInBits() == 1 && 7807 Op1->getType()->getScalarSizeInBits() == 1); 7808 7809 SmallVector<const Value *, 2> Operands{Op0, Op1}; 7810 return TTI.getArithmeticInstrCost( 7811 match(I, m_LogicalOr()) ? 
Instruction::Or : Instruction::And, VectorTy, 7812 CostKind, Op1VK, Op2VK, Op1VP, Op2VP, Operands, I); 7813 } 7814 7815 Type *CondTy = SI->getCondition()->getType(); 7816 if (!ScalarCond) 7817 CondTy = VectorType::get(CondTy, VF); 7818 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, 7819 CmpInst::BAD_ICMP_PREDICATE, CostKind, I); 7820 } 7821 case Instruction::ICmp: 7822 case Instruction::FCmp: { 7823 Type *ValTy = I->getOperand(0)->getType(); 7824 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 7825 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 7826 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); 7827 VectorTy = ToVectorTy(ValTy, VF); 7828 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, 7829 CmpInst::BAD_ICMP_PREDICATE, CostKind, I); 7830 } 7831 case Instruction::Store: 7832 case Instruction::Load: { 7833 ElementCount Width = VF; 7834 if (Width.isVector()) { 7835 InstWidening Decision = getWideningDecision(I, Width); 7836 assert(Decision != CM_Unknown && 7837 "CM decision should be taken at this point"); 7838 if (Decision == CM_Scalarize) 7839 Width = ElementCount::getFixed(1); 7840 } 7841 VectorTy = ToVectorTy(getLoadStoreType(I), Width); 7842 return getMemoryInstructionCost(I, VF); 7843 } 7844 case Instruction::BitCast: 7845 if (I->getType()->isPointerTy()) 7846 return 0; 7847 LLVM_FALLTHROUGH; 7848 case Instruction::ZExt: 7849 case Instruction::SExt: 7850 case Instruction::FPToUI: 7851 case Instruction::FPToSI: 7852 case Instruction::FPExt: 7853 case Instruction::PtrToInt: 7854 case Instruction::IntToPtr: 7855 case Instruction::SIToFP: 7856 case Instruction::UIToFP: 7857 case Instruction::Trunc: 7858 case Instruction::FPTrunc: { 7859 // Computes the CastContextHint from a Load/Store instruction. 7860 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint { 7861 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 7862 "Expected a load or a store!"); 7863 7864 if (VF.isScalar() || !TheLoop->contains(I)) 7865 return TTI::CastContextHint::Normal; 7866 7867 switch (getWideningDecision(I, VF)) { 7868 case LoopVectorizationCostModel::CM_GatherScatter: 7869 return TTI::CastContextHint::GatherScatter; 7870 case LoopVectorizationCostModel::CM_Interleave: 7871 return TTI::CastContextHint::Interleave; 7872 case LoopVectorizationCostModel::CM_Scalarize: 7873 case LoopVectorizationCostModel::CM_Widen: 7874 return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked 7875 : TTI::CastContextHint::Normal; 7876 case LoopVectorizationCostModel::CM_Widen_Reverse: 7877 return TTI::CastContextHint::Reversed; 7878 case LoopVectorizationCostModel::CM_Unknown: 7879 llvm_unreachable("Instr did not go through cost modelling?"); 7880 } 7881 7882 llvm_unreachable("Unhandled case!"); 7883 }; 7884 7885 unsigned Opcode = I->getOpcode(); 7886 TTI::CastContextHint CCH = TTI::CastContextHint::None; 7887 // For Trunc, the context is the only user, which must be a StoreInst. 7888 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) { 7889 if (I->hasOneUse()) 7890 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin())) 7891 CCH = ComputeCCH(Store); 7892 } 7893 // For Z/Sext, the context is the operand, which must be a LoadInst. 
7894 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || 7895 Opcode == Instruction::FPExt) { 7896 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0))) 7897 CCH = ComputeCCH(Load); 7898 } 7899 7900 // We optimize the truncation of induction variables having constant 7901 // integer steps. The cost of these truncations is the same as the scalar 7902 // operation. 7903 if (isOptimizableIVTruncate(I, VF)) { 7904 auto *Trunc = cast<TruncInst>(I); 7905 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 7906 Trunc->getSrcTy(), CCH, CostKind, Trunc); 7907 } 7908 7909 // Detect reduction patterns 7910 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7911 return *RedCost; 7912 7913 Type *SrcScalarTy = I->getOperand(0)->getType(); 7914 Type *SrcVecTy = 7915 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; 7916 if (canTruncateToMinimalBitwidth(I, VF)) { 7917 // This cast is going to be shrunk. This may remove the cast or it might 7918 // turn it into slightly different cast. For example, if MinBW == 16, 7919 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". 7920 // 7921 // Calculate the modified src and dest types. 7922 Type *MinVecTy = VectorTy; 7923 if (Opcode == Instruction::Trunc) { 7924 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); 7925 VectorTy = 7926 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7927 } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) { 7928 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); 7929 VectorTy = 7930 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7931 } 7932 } 7933 7934 return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); 7935 } 7936 case Instruction::Call: { 7937 bool NeedToScalarize; 7938 CallInst *CI = cast<CallInst>(I); 7939 InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize); 7940 if (getVectorIntrinsicIDForCall(CI, TLI)) { 7941 InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF); 7942 return std::min(CallCost, IntrinsicCost); 7943 } 7944 return CallCost; 7945 } 7946 case Instruction::ExtractValue: 7947 return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput); 7948 case Instruction::Alloca: 7949 // We cannot easily widen alloca to a scalable alloca, as 7950 // the result would need to be a vector of pointers. 7951 if (VF.isScalable()) 7952 return InstructionCost::getInvalid(); 7953 LLVM_FALLTHROUGH; 7954 default: 7955 // This opcode is unknown. Assume that it is the same as 'mul'. 7956 return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 7957 } // end of switch. 
7958 } 7959 7960 char LoopVectorize::ID = 0; 7961 7962 static const char lv_name[] = "Loop Vectorization"; 7963 7964 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 7965 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 7966 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) 7967 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 7968 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) 7969 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 7970 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 7971 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 7972 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 7973 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 7974 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) 7975 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) 7976 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 7977 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 7978 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy) 7979 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 7980 7981 namespace llvm { 7982 7983 Pass *createLoopVectorizePass() { return new LoopVectorize(); } 7984 7985 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced, 7986 bool VectorizeOnlyWhenForced) { 7987 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced); 7988 } 7989 7990 } // end namespace llvm 7991 7992 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { 7993 // Check if the pointer operand of a load or store instruction is 7994 // consecutive. 7995 if (auto *Ptr = getLoadStorePointerOperand(Inst)) 7996 return Legal->isConsecutivePtr(getLoadStoreType(Inst), Ptr); 7997 return false; 7998 } 7999 8000 void LoopVectorizationCostModel::collectValuesToIgnore() { 8001 // Ignore ephemeral values. 8002 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); 8003 8004 // Ignore type-promoting instructions we identified during reduction 8005 // detection. 8006 for (auto &Reduction : Legal->getReductionVars()) { 8007 RecurrenceDescriptor &RedDes = Reduction.second; 8008 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); 8009 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 8010 } 8011 // Ignore type-casting instructions we identified during induction 8012 // detection. 8013 for (auto &Induction : Legal->getInductionVars()) { 8014 InductionDescriptor &IndDes = Induction.second; 8015 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 8016 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 8017 } 8018 } 8019 8020 void LoopVectorizationCostModel::collectInLoopReductions() { 8021 for (auto &Reduction : Legal->getReductionVars()) { 8022 PHINode *Phi = Reduction.first; 8023 RecurrenceDescriptor &RdxDesc = Reduction.second; 8024 8025 // We don't collect reductions that are type promoted (yet). 8026 if (RdxDesc.getRecurrenceType() != Phi->getType()) 8027 continue; 8028 8029 // If the target would prefer this reduction to happen "in-loop", then we 8030 // want to record it as such. 8031 unsigned Opcode = RdxDesc.getOpcode(); 8032 if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) && 8033 !TTI.preferInLoopReduction(Opcode, Phi->getType(), 8034 TargetTransformInfo::ReductionFlags())) 8035 continue; 8036 8037 // Check that we can correctly put the reductions into the loop, by 8038 // finding the chain of operations that leads from the phi to the loop 8039 // exit value. 
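    // Illustrative sketch (editorial example, not part of the original
    // source): for a loop body performing s += a[i], the chain is roughly
    //   %phi = phi i32 [ 0, %preheader ], [ %add, %latch ]
    //   %add = add i32 %phi, %val
    // getReductionOpChain would return { %add }, and the loop below then
    // records InLoopReductionImmediateChains[%add] = %phi.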
8040 SmallVector<Instruction *, 4> ReductionOperations = 8041 RdxDesc.getReductionOpChain(Phi, TheLoop); 8042 bool InLoop = !ReductionOperations.empty(); 8043 if (InLoop) { 8044 InLoopReductionChains[Phi] = ReductionOperations; 8045 // Add the elements to InLoopReductionImmediateChains for cost modelling. 8046 Instruction *LastChain = Phi; 8047 for (auto *I : ReductionOperations) { 8048 InLoopReductionImmediateChains[I] = LastChain; 8049 LastChain = I; 8050 } 8051 } 8052 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop") 8053 << " reduction for phi: " << *Phi << "\n"); 8054 } 8055 } 8056 8057 // TODO: we could return a pair of values that specify the max VF and 8058 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of 8059 // `buildVPlans(VF, VF)`. We cannot do it because VPlan at the moment 8060 // doesn't have a cost model that can choose which plan to execute if 8061 // more than one is generated. 8062 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits, 8063 LoopVectorizationCostModel &CM) { 8064 unsigned WidestType; 8065 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes(); 8066 return WidestVectorRegBits / WidestType; 8067 } 8068 8069 VectorizationFactor 8070 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { 8071 assert(!UserVF.isScalable() && "scalable vectors not yet supported"); 8072 ElementCount VF = UserVF; 8073 // Outer loop handling: They may require CFG and instruction level 8074 // transformations before even evaluating whether vectorization is profitable. 8075 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 8076 // the vectorization pipeline. 8077 if (!OrigLoop->isInnermost()) { 8078 // If the user doesn't provide a vectorization factor, determine a 8079 // reasonable one. 8080 if (UserVF.isZero()) { 8081 VF = ElementCount::getFixed(determineVPlanVF( 8082 TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector) 8083 .getFixedSize(), 8084 CM)); 8085 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n"); 8086 8087 // Make sure we have a VF > 1 for stress testing. 8088 if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) { 8089 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: " 8090 << "overriding computed VF.\n"); 8091 VF = ElementCount::getFixed(4); 8092 } 8093 } 8094 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 8095 assert(isPowerOf2_32(VF.getKnownMinValue()) && 8096 "VF needs to be a power of two"); 8097 LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "") 8098 << "VF " << VF << " to build VPlans.\n"); 8099 buildVPlans(VF, VF); 8100 8101 // For VPlan build stress testing, we bail out after VPlan construction. 8102 if (VPlanBuildStressTest) 8103 return VectorizationFactor::Disabled(); 8104 8105 return {VF, 0 /*Cost*/}; 8106 } 8107 8108 LLVM_DEBUG( 8109 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the " 8110 "VPlan-native path.\n"); 8111 return VectorizationFactor::Disabled(); 8112 } 8113 8114 Optional<VectorizationFactor> 8115 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { 8116 assert(OrigLoop->isInnermost() && "Inner loop expected."); 8117 FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC); 8118 if (!MaxFactors) // Cases that should not be vectorized or interleaved. 8119 return None; 8120 8121 // Invalidate interleave groups if all blocks of the loop will be predicated.
8122 if (CM.blockNeedsPredication(OrigLoop->getHeader()) && 8123 !useMaskedInterleavedAccesses(*TTI)) { 8124 LLVM_DEBUG( 8125 dbgs() 8126 << "LV: Invalidate all interleaved groups due to fold-tail by masking " 8127 "which requires masked-interleaved support.\n"); 8128 if (CM.InterleaveInfo.invalidateGroups()) 8129 // Invalidating interleave groups also requires invalidating all decisions 8130 // based on them, which includes widening decisions and uniform and scalar 8131 // values. 8132 CM.invalidateCostModelingDecisions(); 8133 } 8134 8135 ElementCount MaxUserVF = 8136 UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF; 8137 bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF); 8138 if (!UserVF.isZero() && UserVFIsLegal) { 8139 assert(isPowerOf2_32(UserVF.getKnownMinValue()) && 8140 "VF needs to be a power of two"); 8141 // Collect the instructions (and their associated costs) that will be more 8142 // profitable to scalarize. 8143 if (CM.selectUserVectorizationFactor(UserVF)) { 8144 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); 8145 CM.collectInLoopReductions(); 8146 buildVPlansWithVPRecipes(UserVF, UserVF); 8147 LLVM_DEBUG(printPlans(dbgs())); 8148 return {{UserVF, 0}}; 8149 } else 8150 reportVectorizationInfo("UserVF ignored because of invalid costs.", 8151 "InvalidCost", ORE, OrigLoop); 8152 } 8153 8154 // Populate the set of Vectorization Factor Candidates. 8155 ElementCountSet VFCandidates; 8156 for (auto VF = ElementCount::getFixed(1); 8157 ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2) 8158 VFCandidates.insert(VF); 8159 for (auto VF = ElementCount::getScalable(1); 8160 ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2) 8161 VFCandidates.insert(VF); 8162 8163 for (const auto &VF : VFCandidates) { 8164 // Collect Uniform and Scalar instructions after vectorization with VF. 8165 CM.collectUniformsAndScalars(VF); 8166 8167 // Collect the instructions (and their associated costs) that will be more 8168 // profitable to scalarize. 8169 if (VF.isVector()) 8170 CM.collectInstsToScalarize(VF); 8171 } 8172 8173 CM.collectInLoopReductions(); 8174 buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF); 8175 buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF); 8176 8177 LLVM_DEBUG(printPlans(dbgs())); 8178 if (!MaxFactors.hasVector()) 8179 return VectorizationFactor::Disabled(); 8180 8181 // Select the optimal vectorization factor. 8182 auto SelectedVF = CM.selectVectorizationFactor(VFCandidates); 8183 8184 // Check if it is profitable to vectorize with runtime checks. 
8185 unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks(); 8186 if (SelectedVF.Width.getKnownMinValue() > 1 && NumRuntimePointerChecks) { 8187 bool PragmaThresholdReached = 8188 NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold; 8189 bool ThresholdReached = 8190 NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold; 8191 if ((ThresholdReached && !Hints.allowReordering()) || 8192 PragmaThresholdReached) { 8193 ORE->emit([&]() { 8194 return OptimizationRemarkAnalysisAliasing( 8195 DEBUG_TYPE, "CantReorderMemOps", OrigLoop->getStartLoc(), 8196 OrigLoop->getHeader()) 8197 << "loop not vectorized: cannot prove it is safe to reorder " 8198 "memory operations"; 8199 }); 8200 LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n"); 8201 Hints.emitRemarkWithHints(); 8202 return VectorizationFactor::Disabled(); 8203 } 8204 } 8205 return SelectedVF; 8206 } 8207 8208 VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const { 8209 assert(count_if(VPlans, 8210 [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) == 8211 1 && 8212 "Best VF has not a single VPlan."); 8213 8214 for (const VPlanPtr &Plan : VPlans) { 8215 if (Plan->hasVF(VF)) 8216 return *Plan.get(); 8217 } 8218 llvm_unreachable("No plan found!"); 8219 } 8220 8221 void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF, 8222 VPlan &BestVPlan, 8223 InnerLoopVectorizer &ILV, 8224 DominatorTree *DT) { 8225 LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF << ", UF=" << BestUF 8226 << '\n'); 8227 8228 // Perform the actual loop transformation. 8229 8230 // 1. Create a new empty loop. Unlink the old loop and connect the new one. 8231 VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan}; 8232 State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(); 8233 State.TripCount = ILV.getOrCreateTripCount(nullptr); 8234 State.CanonicalIV = ILV.Induction; 8235 8236 ILV.printDebugTracesAtStart(); 8237 8238 //===------------------------------------------------===// 8239 // 8240 // Notice: any optimization or new instruction that go 8241 // into the code below should also be implemented in 8242 // the cost-model. 8243 // 8244 //===------------------------------------------------===// 8245 8246 // 2. Copy and widen instructions from the old loop into the new loop. 8247 BestVPlan.execute(&State); 8248 8249 // 3. Fix the vectorized code: take care of header phi's, live-outs, 8250 // predication, updating analyses. 
  ILV.fixVectorizedLoop(State);

  ILV.printDebugTracesAtEnd();
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void LoopVectorizationPlanner::printPlans(raw_ostream &O) {
  for (const auto &Plan : VPlans)
    if (PrintVPlansInDotFormat)
      Plan->printDOT(O);
    else
      Plan->print(O);
}
#endif

void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
    SmallPtrSetImpl<Instruction *> &DeadInstructions) {

  // We create new control-flow for the vectorized loop, so the original exit
  // condition will be dead after vectorization if it is only used by the
  // terminator.
  SmallVector<BasicBlock*> ExitingBlocks;
  OrigLoop->getExitingBlocks(ExitingBlocks);
  for (auto *BB : ExitingBlocks) {
    auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0));
    if (!Cmp || !Cmp->hasOneUse())
      continue;

    // TODO: we should introduce a getUniqueExitingBlocks on Loop
    if (!DeadInstructions.insert(Cmp).second)
      continue;

    // An operand of the icmp is often a dead trunc, used by IndUpdate.
    // TODO: can recurse through operands in general
    for (Value *Op : Cmp->operands()) {
      if (isa<TruncInst>(Op) && Op->hasOneUse())
        DeadInstructions.insert(cast<Instruction>(Op));
    }
  }

  // We create new "steps" for induction variable updates to which the original
  // induction variables map. An original update instruction will be dead if
  // all its users except the induction variable are dead.
  auto *Latch = OrigLoop->getLoopLatch();
  for (auto &Induction : Legal->getInductionVars()) {
    PHINode *Ind = Induction.first;
    auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));

    // If the tail is to be folded by masking, the primary induction variable,
    // if it exists, isn't dead: it will be used for masking. Don't kill it.
    if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction())
      continue;

    if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
          return U == Ind || DeadInstructions.count(cast<Instruction>(U));
        }))
      DeadInstructions.insert(IndUpdate);

    // We record as "Dead" also the type-casting instructions we had identified
    // during induction analysis. We don't need any handling for them in the
    // vectorized loop because we have proven that, under a proper runtime
    // test guarding the vectorized loop, the value of the phi, and the casted
    // value of the phi, are the same. The last instruction in this casting
    // chain will get its scalar/vector/widened def from the
    // scalar/vector/widened def of the respective phi node. Any other casts in
    // the induction def-use chain have no other uses outside the phi update
    // chain, and will be ignored.
    InductionDescriptor &IndDes = Induction.second;
    const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
    DeadInstructions.insert(Casts.begin(), Casts.end());
  }
}

Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }

Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }

Value *InnerLoopUnroller::getStepVector(Value *Val, Value *StartIdx,
                                        Value *Step,
                                        Instruction::BinaryOps BinOp) {
  // When unrolling and the VF is 1, we only need to add a simple scalar.
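  // For example (assuming the usual part-by-part unrolling), with Val = %iv,
  // Step = 1 and StartIdx = 2, this simply produces %iv + 2; no vector splat
  // or step vector is materialized.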
8331 Type *Ty = Val->getType(); 8332 assert(!Ty->isVectorTy() && "Val must be a scalar"); 8333 8334 if (Ty->isFloatingPointTy()) { 8335 // Floating-point operations inherit FMF via the builder's flags. 8336 Value *MulOp = Builder.CreateFMul(StartIdx, Step); 8337 return Builder.CreateBinOp(BinOp, Val, MulOp); 8338 } 8339 return Builder.CreateAdd(Val, Builder.CreateMul(StartIdx, Step), "induction"); 8340 } 8341 8342 static void AddRuntimeUnrollDisableMetaData(Loop *L) { 8343 SmallVector<Metadata *, 4> MDs; 8344 // Reserve first location for self reference to the LoopID metadata node. 8345 MDs.push_back(nullptr); 8346 bool IsUnrollMetadata = false; 8347 MDNode *LoopID = L->getLoopID(); 8348 if (LoopID) { 8349 // First find existing loop unrolling disable metadata. 8350 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { 8351 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); 8352 if (MD) { 8353 const auto *S = dyn_cast<MDString>(MD->getOperand(0)); 8354 IsUnrollMetadata = 8355 S && S->getString().startswith("llvm.loop.unroll.disable"); 8356 } 8357 MDs.push_back(LoopID->getOperand(i)); 8358 } 8359 } 8360 8361 if (!IsUnrollMetadata) { 8362 // Add runtime unroll disable metadata. 8363 LLVMContext &Context = L->getHeader()->getContext(); 8364 SmallVector<Metadata *, 1> DisableOperands; 8365 DisableOperands.push_back( 8366 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 8367 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 8368 MDs.push_back(DisableNode); 8369 MDNode *NewLoopID = MDNode::get(Context, MDs); 8370 // Set operand 0 to refer to the loop id itself. 8371 NewLoopID->replaceOperandWith(0, NewLoopID); 8372 L->setLoopID(NewLoopID); 8373 } 8374 } 8375 8376 //===--------------------------------------------------------------------===// 8377 // EpilogueVectorizerMainLoop 8378 //===--------------------------------------------------------------------===// 8379 8380 /// This function is partially responsible for generating the control flow 8381 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 8382 BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { 8383 MDNode *OrigLoopID = OrigLoop->getLoopID(); 8384 Loop *Lp = createVectorLoopSkeleton(""); 8385 8386 // Generate the code to check the minimum iteration count of the vector 8387 // epilogue (see below). 8388 EPI.EpilogueIterationCountCheck = 8389 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, true); 8390 EPI.EpilogueIterationCountCheck->setName("iter.check"); 8391 8392 // Generate the code to check any assumptions that we've made for SCEV 8393 // expressions. 8394 EPI.SCEVSafetyCheck = emitSCEVChecks(Lp, LoopScalarPreHeader); 8395 8396 // Generate the code that checks at runtime if arrays overlap. We put the 8397 // checks into a separate block to make the more common case of few elements 8398 // faster. 8399 EPI.MemSafetyCheck = emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 8400 8401 // Generate the iteration count check for the main loop, *after* the check 8402 // for the epilogue loop, so that the path-length is shorter for the case 8403 // that goes directly through the vector epilogue. The longer-path length for 8404 // the main loop is compensated for, by the gain from vectorizing the larger 8405 // trip count. Note: the branch will get updated later on when we vectorize 8406 // the epilogue. 
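  // (Rough resulting order of the guards: the epilogue min-iteration check
  // ("iter.check") comes first, followed by the SCEV and memory runtime
  // checks, and finally the main-loop min-iteration check
  // ("vector.main.loop.iter.check"); each can bypass to the scalar
  // preheader.)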
8407 EPI.MainLoopIterationCountCheck = 8408 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false); 8409 8410 // Generate the induction variable. 8411 OldInduction = Legal->getPrimaryInduction(); 8412 Type *IdxTy = Legal->getWidestInductionType(); 8413 Value *StartIdx = ConstantInt::get(IdxTy, 0); 8414 Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF); 8415 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 8416 EPI.VectorTripCount = CountRoundDown; 8417 Induction = 8418 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 8419 getDebugLocFromInstOrOperands(OldInduction)); 8420 8421 // Skip induction resume value creation here because they will be created in 8422 // the second pass. If we created them here, they wouldn't be used anyway, 8423 // because the vplan in the second pass still contains the inductions from the 8424 // original loop. 8425 8426 return completeLoopSkeleton(Lp, OrigLoopID); 8427 } 8428 8429 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { 8430 LLVM_DEBUG({ 8431 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n" 8432 << "Main Loop VF:" << EPI.MainLoopVF 8433 << ", Main Loop UF:" << EPI.MainLoopUF 8434 << ", Epilogue Loop VF:" << EPI.EpilogueVF 8435 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 8436 }); 8437 } 8438 8439 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() { 8440 DEBUG_WITH_TYPE(VerboseDebug, { 8441 dbgs() << "intermediate fn:\n" << *Induction->getFunction() << "\n"; 8442 }); 8443 } 8444 8445 BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck( 8446 Loop *L, BasicBlock *Bypass, bool ForEpilogue) { 8447 assert(L && "Expected valid Loop."); 8448 assert(Bypass && "Expected valid bypass basic block."); 8449 ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF; 8450 unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF; 8451 Value *Count = getOrCreateTripCount(L); 8452 // Reuse existing vector loop preheader for TC checks. 8453 // Note that new preheader block is generated for vector loop. 8454 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 8455 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 8456 8457 // Generate code to check if the loop's trip count is less than VF * UF of the 8458 // main vector loop. 8459 auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF : VF) ? 8460 ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 8461 8462 Value *CheckMinIters = Builder.CreateICmp( 8463 P, Count, getRuntimeVF(Builder, Count->getType(), VFactor * UFactor), 8464 "min.iters.check"); 8465 8466 if (!ForEpilogue) 8467 TCCheckBlock->setName("vector.main.loop.iter.check"); 8468 8469 // Create new preheader for vector loop. 8470 LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), 8471 DT, LI, nullptr, "vector.ph"); 8472 8473 if (ForEpilogue) { 8474 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 8475 DT->getNode(Bypass)->getIDom()) && 8476 "TC check is expected to dominate Bypass"); 8477 8478 // Update dominator for Bypass & LoopExit. 8479 DT->changeImmediateDominator(Bypass, TCCheckBlock); 8480 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF)) 8481 // For loops with multiple exits, there's no edge from the middle block 8482 // to exit blocks (as the epilogue must run) and thus no need to update 8483 // the immediate dominator of the exit blocks. 
8484 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 8485 8486 LoopBypassBlocks.push_back(TCCheckBlock); 8487 8488 // Save the trip count so we don't have to regenerate it in the 8489 // vec.epilog.iter.check. This is safe to do because the trip count 8490 // generated here dominates the vector epilog iter check. 8491 EPI.TripCount = Count; 8492 } 8493 8494 ReplaceInstWithInst( 8495 TCCheckBlock->getTerminator(), 8496 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 8497 8498 return TCCheckBlock; 8499 } 8500 8501 //===--------------------------------------------------------------------===// 8502 // EpilogueVectorizerEpilogueLoop 8503 //===--------------------------------------------------------------------===// 8504 8505 /// This function is partially responsible for generating the control flow 8506 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 8507 BasicBlock * 8508 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { 8509 MDNode *OrigLoopID = OrigLoop->getLoopID(); 8510 Loop *Lp = createVectorLoopSkeleton("vec.epilog."); 8511 8512 // Now, compare the remaining count and if there aren't enough iterations to 8513 // execute the vectorized epilogue skip to the scalar part. 8514 BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader; 8515 VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check"); 8516 LoopVectorPreHeader = 8517 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 8518 LI, nullptr, "vec.epilog.ph"); 8519 emitMinimumVectorEpilogueIterCountCheck(Lp, LoopScalarPreHeader, 8520 VecEpilogueIterationCountCheck); 8521 8522 // Adjust the control flow taking the state info from the main loop 8523 // vectorization into account. 8524 assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck && 8525 "expected this to be saved from the previous pass."); 8526 EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith( 8527 VecEpilogueIterationCountCheck, LoopVectorPreHeader); 8528 8529 DT->changeImmediateDominator(LoopVectorPreHeader, 8530 EPI.MainLoopIterationCountCheck); 8531 8532 EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith( 8533 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8534 8535 if (EPI.SCEVSafetyCheck) 8536 EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith( 8537 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8538 if (EPI.MemSafetyCheck) 8539 EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith( 8540 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8541 8542 DT->changeImmediateDominator( 8543 VecEpilogueIterationCountCheck, 8544 VecEpilogueIterationCountCheck->getSinglePredecessor()); 8545 8546 DT->changeImmediateDominator(LoopScalarPreHeader, 8547 EPI.EpilogueIterationCountCheck); 8548 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF)) 8549 // If there is an epilogue which must run, there's no edge from the 8550 // middle block to exit blocks and thus no need to update the immediate 8551 // dominator of the exit blocks. 8552 DT->changeImmediateDominator(LoopExitBlock, 8553 EPI.EpilogueIterationCountCheck); 8554 8555 // Keep track of bypass blocks, as they feed start values to the induction 8556 // phis in the scalar loop preheader. 
  if (EPI.SCEVSafetyCheck)
    LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck);
  if (EPI.MemSafetyCheck)
    LoopBypassBlocks.push_back(EPI.MemSafetyCheck);
  LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck);

  // Generate a resume induction for the vector epilogue and put it in the
  // vector epilogue preheader.
  Type *IdxTy = Legal->getWidestInductionType();
  PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val",
                                         LoopVectorPreHeader->getFirstNonPHI());
  EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
  EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
                           EPI.MainLoopIterationCountCheck);

  // Generate the induction variable.
  OldInduction = Legal->getPrimaryInduction();
  Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
  Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF);
  Value *StartIdx = EPResumeVal;
  Induction =
      createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
                              getDebugLocFromInstOrOperands(OldInduction));

  // Generate induction resume values. These variables save the new starting
  // indexes for the scalar loop. They are used to test if there are any tail
  // iterations left once the vector loop has completed.
  // Note that when the vectorized epilogue is skipped due to the iteration
  // count check, the resume value for the induction variable comes from the
  // trip count of the main vector loop, hence passing the AdditionalBypass
  // argument.
  createInductionResumeValues(Lp, CountRoundDown,
                              {VecEpilogueIterationCountCheck,
                               EPI.VectorTripCount} /* AdditionalBypass */);

  AddRuntimeUnrollDisableMetaData(Lp);
  return completeLoopSkeleton(Lp, OrigLoopID);
}

BasicBlock *
EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
    Loop *L, BasicBlock *Bypass, BasicBlock *Insert) {

  assert(EPI.TripCount &&
         "Expected trip count to have been saved in the first pass.");
  assert(
      (!isa<Instruction>(EPI.TripCount) ||
       DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
      "saved trip count does not dominate insertion point.");
  Value *TC = EPI.TripCount;
  IRBuilder<> Builder(Insert->getTerminator());
  Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");

  // Generate code to check if the loop's trip count is less than VF * UF of
  // the vector epilogue loop.
  auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF) ?
8613 ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 8614 8615 Value *CheckMinIters = Builder.CreateICmp( 8616 P, Count, 8617 getRuntimeVF(Builder, Count->getType(), EPI.EpilogueVF * EPI.EpilogueUF), 8618 "min.epilog.iters.check"); 8619 8620 ReplaceInstWithInst( 8621 Insert->getTerminator(), 8622 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 8623 8624 LoopBypassBlocks.push_back(Insert); 8625 return Insert; 8626 } 8627 8628 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() { 8629 LLVM_DEBUG({ 8630 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n" 8631 << "Epilogue Loop VF:" << EPI.EpilogueVF 8632 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 8633 }); 8634 } 8635 8636 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() { 8637 DEBUG_WITH_TYPE(VerboseDebug, { 8638 dbgs() << "final fn:\n" << *Induction->getFunction() << "\n"; 8639 }); 8640 } 8641 8642 bool LoopVectorizationPlanner::getDecisionAndClampRange( 8643 const std::function<bool(ElementCount)> &Predicate, VFRange &Range) { 8644 assert(!Range.isEmpty() && "Trying to test an empty VF range."); 8645 bool PredicateAtRangeStart = Predicate(Range.Start); 8646 8647 for (ElementCount TmpVF = Range.Start * 2; 8648 ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2) 8649 if (Predicate(TmpVF) != PredicateAtRangeStart) { 8650 Range.End = TmpVF; 8651 break; 8652 } 8653 8654 return PredicateAtRangeStart; 8655 } 8656 8657 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, 8658 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range 8659 /// of VF's starting at a given VF and extending it as much as possible. Each 8660 /// vectorization decision can potentially shorten this sub-range during 8661 /// buildVPlan(). 8662 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF, 8663 ElementCount MaxVF) { 8664 auto MaxVFPlusOne = MaxVF.getWithIncrement(1); 8665 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { 8666 VFRange SubRange = {VF, MaxVFPlusOne}; 8667 VPlans.push_back(buildVPlan(SubRange)); 8668 VF = SubRange.End; 8669 } 8670 } 8671 8672 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, 8673 VPlanPtr &Plan) { 8674 assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); 8675 8676 // Look for cached value. 8677 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 8678 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); 8679 if (ECEntryIt != EdgeMaskCache.end()) 8680 return ECEntryIt->second; 8681 8682 VPValue *SrcMask = createBlockInMask(Src, Plan); 8683 8684 // The terminator has to be a branch inst! 8685 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 8686 assert(BI && "Unexpected terminator found"); 8687 8688 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) 8689 return EdgeMaskCache[Edge] = SrcMask; 8690 8691 // If source is an exiting block, we know the exit edge is dynamically dead 8692 // in the vector loop, and thus we don't need to restrict the mask. Avoid 8693 // adding uses of an otherwise potentially dead instruction. 8694 if (OrigLoop->isLoopExiting(Src)) 8695 return EdgeMaskCache[Edge] = SrcMask; 8696 8697 VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition()); 8698 assert(EdgeMask && "No Edge Mask found for condition"); 8699 8700 if (BI->getSuccessor(0) != Dst) 8701 EdgeMask = Builder.createNot(EdgeMask); 8702 8703 if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND. 
8704 // The condition is 'SrcMask && EdgeMask', which is equivalent to 8705 // 'select i1 SrcMask, i1 EdgeMask, i1 false'. 8706 // The select version does not introduce new UB if SrcMask is false and 8707 // EdgeMask is poison. Using 'and' here introduces undefined behavior. 8708 VPValue *False = Plan->getOrAddVPValue( 8709 ConstantInt::getFalse(BI->getCondition()->getType())); 8710 EdgeMask = Builder.createSelect(SrcMask, EdgeMask, False); 8711 } 8712 8713 return EdgeMaskCache[Edge] = EdgeMask; 8714 } 8715 8716 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { 8717 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 8718 8719 // Look for cached value. 8720 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); 8721 if (BCEntryIt != BlockMaskCache.end()) 8722 return BCEntryIt->second; 8723 8724 // All-one mask is modelled as no-mask following the convention for masked 8725 // load/store/gather/scatter. Initialize BlockMask to no-mask. 8726 VPValue *BlockMask = nullptr; 8727 8728 if (OrigLoop->getHeader() == BB) { 8729 if (!CM.blockNeedsPredication(BB)) 8730 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. 8731 8732 // Create the block in mask as the first non-phi instruction in the block. 8733 VPBuilder::InsertPointGuard Guard(Builder); 8734 auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi(); 8735 Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint); 8736 8737 // Introduce the early-exit compare IV <= BTC to form header block mask. 8738 // This is used instead of IV < TC because TC may wrap, unlike BTC. 8739 // Start by constructing the desired canonical IV. 8740 VPValue *IV = nullptr; 8741 if (Legal->getPrimaryInduction()) 8742 IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction()); 8743 else { 8744 auto *IVRecipe = new VPWidenCanonicalIVRecipe(); 8745 Builder.getInsertBlock()->insert(IVRecipe, NewInsertionPoint); 8746 IV = IVRecipe; 8747 } 8748 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); 8749 bool TailFolded = !CM.isScalarEpilogueAllowed(); 8750 8751 if (TailFolded && CM.TTI.emitGetActiveLaneMask()) { 8752 // While ActiveLaneMask is a binary op that consumes the loop tripcount 8753 // as a second argument, we only pass the IV here and extract the 8754 // tripcount from the transform state where codegen of the VP instructions 8755 // happen. 8756 BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV}); 8757 } else { 8758 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); 8759 } 8760 return BlockMaskCache[BB] = BlockMask; 8761 } 8762 8763 // This is the block mask. We OR all incoming edges. 8764 for (auto *Predecessor : predecessors(BB)) { 8765 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); 8766 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too. 8767 return BlockMaskCache[BB] = EdgeMask; 8768 8769 if (!BlockMask) { // BlockMask has its initialized nullptr value. 
8770 BlockMask = EdgeMask; 8771 continue; 8772 } 8773 8774 BlockMask = Builder.createOr(BlockMask, EdgeMask); 8775 } 8776 8777 return BlockMaskCache[BB] = BlockMask; 8778 } 8779 8780 VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, 8781 ArrayRef<VPValue *> Operands, 8782 VFRange &Range, 8783 VPlanPtr &Plan) { 8784 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 8785 "Must be called with either a load or store"); 8786 8787 auto willWiden = [&](ElementCount VF) -> bool { 8788 if (VF.isScalar()) 8789 return false; 8790 LoopVectorizationCostModel::InstWidening Decision = 8791 CM.getWideningDecision(I, VF); 8792 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 8793 "CM decision should be taken at this point."); 8794 if (Decision == LoopVectorizationCostModel::CM_Interleave) 8795 return true; 8796 if (CM.isScalarAfterVectorization(I, VF) || 8797 CM.isProfitableToScalarize(I, VF)) 8798 return false; 8799 return Decision != LoopVectorizationCostModel::CM_Scalarize; 8800 }; 8801 8802 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8803 return nullptr; 8804 8805 VPValue *Mask = nullptr; 8806 if (Legal->isMaskRequired(I)) 8807 Mask = createBlockInMask(I->getParent(), Plan); 8808 8809 // Determine if the pointer operand of the access is either consecutive or 8810 // reverse consecutive. 8811 LoopVectorizationCostModel::InstWidening Decision = 8812 CM.getWideningDecision(I, Range.Start); 8813 bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse; 8814 bool Consecutive = 8815 Reverse || Decision == LoopVectorizationCostModel::CM_Widen; 8816 8817 if (LoadInst *Load = dyn_cast<LoadInst>(I)) 8818 return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask, 8819 Consecutive, Reverse); 8820 8821 StoreInst *Store = cast<StoreInst>(I); 8822 return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0], 8823 Mask, Consecutive, Reverse); 8824 } 8825 8826 VPWidenIntOrFpInductionRecipe * 8827 VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi, 8828 ArrayRef<VPValue *> Operands) const { 8829 // Check if this is an integer or fp induction. If so, build the recipe that 8830 // produces its scalar and vector values. 8831 InductionDescriptor II = Legal->getInductionVars().lookup(Phi); 8832 if (II.getKind() == InductionDescriptor::IK_IntInduction || 8833 II.getKind() == InductionDescriptor::IK_FpInduction) { 8834 assert(II.getStartValue() == 8835 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); 8836 const SmallVectorImpl<Instruction *> &Casts = II.getCastInsts(); 8837 return new VPWidenIntOrFpInductionRecipe( 8838 Phi, Operands[0], Casts.empty() ? nullptr : Casts.front()); 8839 } 8840 8841 return nullptr; 8842 } 8843 8844 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate( 8845 TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range, 8846 VPlan &Plan) const { 8847 // Optimize the special case where the source is a constant integer 8848 // induction variable. Notice that we can only optimize the 'trunc' case 8849 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and 8850 // (c) other casts depend on pointer size. 8851 8852 // Determine whether \p K is a truncation based on an induction variable that 8853 // can be optimized. 
8854 auto isOptimizableIVTruncate = 8855 [&](Instruction *K) -> std::function<bool(ElementCount)> { 8856 return [=](ElementCount VF) -> bool { 8857 return CM.isOptimizableIVTruncate(K, VF); 8858 }; 8859 }; 8860 8861 if (LoopVectorizationPlanner::getDecisionAndClampRange( 8862 isOptimizableIVTruncate(I), Range)) { 8863 8864 InductionDescriptor II = 8865 Legal->getInductionVars().lookup(cast<PHINode>(I->getOperand(0))); 8866 VPValue *Start = Plan.getOrAddVPValue(II.getStartValue()); 8867 return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)), 8868 Start, nullptr, I); 8869 } 8870 return nullptr; 8871 } 8872 8873 VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi, 8874 ArrayRef<VPValue *> Operands, 8875 VPlanPtr &Plan) { 8876 // If all incoming values are equal, the incoming VPValue can be used directly 8877 // instead of creating a new VPBlendRecipe. 8878 VPValue *FirstIncoming = Operands[0]; 8879 if (all_of(Operands, [FirstIncoming](const VPValue *Inc) { 8880 return FirstIncoming == Inc; 8881 })) { 8882 return Operands[0]; 8883 } 8884 8885 // We know that all PHIs in non-header blocks are converted into selects, so 8886 // we don't have to worry about the insertion order and we can just use the 8887 // builder. At this point we generate the predication tree. There may be 8888 // duplications since this is a simple recursive scan, but future 8889 // optimizations will clean it up. 8890 SmallVector<VPValue *, 2> OperandsWithMask; 8891 unsigned NumIncoming = Phi->getNumIncomingValues(); 8892 8893 for (unsigned In = 0; In < NumIncoming; In++) { 8894 VPValue *EdgeMask = 8895 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan); 8896 assert((EdgeMask || NumIncoming == 1) && 8897 "Multiple predecessors with one having a full mask"); 8898 OperandsWithMask.push_back(Operands[In]); 8899 if (EdgeMask) 8900 OperandsWithMask.push_back(EdgeMask); 8901 } 8902 return toVPRecipeResult(new VPBlendRecipe(Phi, OperandsWithMask)); 8903 } 8904 8905 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, 8906 ArrayRef<VPValue *> Operands, 8907 VFRange &Range) const { 8908 8909 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8910 [this, CI](ElementCount VF) { return CM.isScalarWithPredication(CI); }, 8911 Range); 8912 8913 if (IsPredicated) 8914 return nullptr; 8915 8916 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8917 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || 8918 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect || 8919 ID == Intrinsic::pseudoprobe || 8920 ID == Intrinsic::experimental_noalias_scope_decl)) 8921 return nullptr; 8922 8923 auto willWiden = [&](ElementCount VF) -> bool { 8924 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8925 // The following case may be scalarized depending on the VF. 8926 // The flag shows whether we use Intrinsic or a usual Call for vectorized 8927 // version of the instruction. 8928 // Is it beneficial to perform intrinsic call compared to lib call? 8929 bool NeedToScalarize = false; 8930 InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); 8931 InstructionCost IntrinsicCost = ID ? 
CM.getVectorIntrinsicCost(CI, VF) : 0; 8932 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 8933 return UseVectorIntrinsic || !NeedToScalarize; 8934 }; 8935 8936 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8937 return nullptr; 8938 8939 ArrayRef<VPValue *> Ops = Operands.take_front(CI->arg_size()); 8940 return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end())); 8941 } 8942 8943 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { 8944 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) && 8945 !isa<StoreInst>(I) && "Instruction should have been handled earlier"); 8946 // Instruction should be widened, unless it is scalar after vectorization, 8947 // scalarization is profitable or it is predicated. 8948 auto WillScalarize = [this, I](ElementCount VF) -> bool { 8949 return CM.isScalarAfterVectorization(I, VF) || 8950 CM.isProfitableToScalarize(I, VF) || CM.isScalarWithPredication(I); 8951 }; 8952 return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize, 8953 Range); 8954 } 8955 8956 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, 8957 ArrayRef<VPValue *> Operands) const { 8958 auto IsVectorizableOpcode = [](unsigned Opcode) { 8959 switch (Opcode) { 8960 case Instruction::Add: 8961 case Instruction::And: 8962 case Instruction::AShr: 8963 case Instruction::BitCast: 8964 case Instruction::FAdd: 8965 case Instruction::FCmp: 8966 case Instruction::FDiv: 8967 case Instruction::FMul: 8968 case Instruction::FNeg: 8969 case Instruction::FPExt: 8970 case Instruction::FPToSI: 8971 case Instruction::FPToUI: 8972 case Instruction::FPTrunc: 8973 case Instruction::FRem: 8974 case Instruction::FSub: 8975 case Instruction::ICmp: 8976 case Instruction::IntToPtr: 8977 case Instruction::LShr: 8978 case Instruction::Mul: 8979 case Instruction::Or: 8980 case Instruction::PtrToInt: 8981 case Instruction::SDiv: 8982 case Instruction::Select: 8983 case Instruction::SExt: 8984 case Instruction::Shl: 8985 case Instruction::SIToFP: 8986 case Instruction::SRem: 8987 case Instruction::Sub: 8988 case Instruction::Trunc: 8989 case Instruction::UDiv: 8990 case Instruction::UIToFP: 8991 case Instruction::URem: 8992 case Instruction::Xor: 8993 case Instruction::ZExt: 8994 return true; 8995 } 8996 return false; 8997 }; 8998 8999 if (!IsVectorizableOpcode(I->getOpcode())) 9000 return nullptr; 9001 9002 // Success: widen this instruction. 9003 return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end())); 9004 } 9005 9006 void VPRecipeBuilder::fixHeaderPhis() { 9007 BasicBlock *OrigLatch = OrigLoop->getLoopLatch(); 9008 for (VPWidenPHIRecipe *R : PhisToFix) { 9009 auto *PN = cast<PHINode>(R->getUnderlyingValue()); 9010 VPRecipeBase *IncR = 9011 getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch))); 9012 R->addOperand(IncR->getVPSingleValue()); 9013 } 9014 } 9015 9016 VPBasicBlock *VPRecipeBuilder::handleReplication( 9017 Instruction *I, VFRange &Range, VPBasicBlock *VPBB, 9018 VPlanPtr &Plan) { 9019 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( 9020 [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); }, 9021 Range); 9022 9023 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 9024 [&](ElementCount VF) { return CM.isPredicatedInst(I); }, Range); 9025 9026 // Even if the instruction is not marked as uniform, there are certain 9027 // intrinsic calls that can be effectively treated as such, so we check for 9028 // them here. 
Conservatively, we only do this for scalable vectors, since 9029 // for fixed-width VFs we can always fall back on full scalarization. 9030 if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) { 9031 switch (cast<IntrinsicInst>(I)->getIntrinsicID()) { 9032 case Intrinsic::assume: 9033 case Intrinsic::lifetime_start: 9034 case Intrinsic::lifetime_end: 9035 // For scalable vectors if one of the operands is variant then we still 9036 // want to mark as uniform, which will generate one instruction for just 9037 // the first lane of the vector. We can't scalarize the call in the same 9038 // way as for fixed-width vectors because we don't know how many lanes 9039 // there are. 9040 // 9041 // The reasons for doing it this way for scalable vectors are: 9042 // 1. For the assume intrinsic generating the instruction for the first 9043 // lane is still be better than not generating any at all. For 9044 // example, the input may be a splat across all lanes. 9045 // 2. For the lifetime start/end intrinsics the pointer operand only 9046 // does anything useful when the input comes from a stack object, 9047 // which suggests it should always be uniform. For non-stack objects 9048 // the effect is to poison the object, which still allows us to 9049 // remove the call. 9050 IsUniform = true; 9051 break; 9052 default: 9053 break; 9054 } 9055 } 9056 9057 auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()), 9058 IsUniform, IsPredicated); 9059 setRecipe(I, Recipe); 9060 Plan->addVPValue(I, Recipe); 9061 9062 // Find if I uses a predicated instruction. If so, it will use its scalar 9063 // value. Avoid hoisting the insert-element which packs the scalar value into 9064 // a vector value, as that happens iff all users use the vector value. 9065 for (VPValue *Op : Recipe->operands()) { 9066 auto *PredR = dyn_cast_or_null<VPPredInstPHIRecipe>(Op->getDef()); 9067 if (!PredR) 9068 continue; 9069 auto *RepR = 9070 cast_or_null<VPReplicateRecipe>(PredR->getOperand(0)->getDef()); 9071 assert(RepR->isPredicated() && 9072 "expected Replicate recipe to be predicated"); 9073 RepR->setAlsoPack(false); 9074 } 9075 9076 // Finalize the recipe for Instr, first if it is not predicated. 9077 if (!IsPredicated) { 9078 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n"); 9079 VPBB->appendRecipe(Recipe); 9080 return VPBB; 9081 } 9082 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); 9083 assert(VPBB->getSuccessors().empty() && 9084 "VPBB has successors when handling predicated replication."); 9085 // Record predicated instructions for above packing optimizations. 9086 VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan); 9087 VPBlockUtils::insertBlockAfter(Region, VPBB); 9088 auto *RegSucc = new VPBasicBlock(); 9089 VPBlockUtils::insertBlockAfter(RegSucc, Region); 9090 return RegSucc; 9091 } 9092 9093 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr, 9094 VPRecipeBase *PredRecipe, 9095 VPlanPtr &Plan) { 9096 // Instructions marked for predication are replicated and placed under an 9097 // if-then construct to prevent side-effects. 9098 9099 // Generate recipes to compute the block mask for this region. 9100 VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan); 9101 9102 // Build the triangular if-then region. 
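  // (Shape, roughly: "pred.<opcode>.entry" branches on the mask either into
  // "pred.<opcode>.if", which holds the replicated instruction, or directly to
  // "pred.<opcode>.continue", which holds the optional phi merging the
  // predicated result.)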
9103 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str(); 9104 assert(Instr->getParent() && "Predicated instruction not in any basic block"); 9105 auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask); 9106 auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); 9107 auto *PHIRecipe = Instr->getType()->isVoidTy() 9108 ? nullptr 9109 : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr)); 9110 if (PHIRecipe) { 9111 Plan->removeVPValueFor(Instr); 9112 Plan->addVPValue(Instr, PHIRecipe); 9113 } 9114 auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); 9115 auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); 9116 VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true); 9117 9118 // Note: first set Entry as region entry and then connect successors starting 9119 // from it in order, to propagate the "parent" of each VPBasicBlock. 9120 VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry); 9121 VPBlockUtils::connectBlocks(Pred, Exit); 9122 9123 return Region; 9124 } 9125 9126 VPRecipeOrVPValueTy 9127 VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, 9128 ArrayRef<VPValue *> Operands, 9129 VFRange &Range, VPlanPtr &Plan) { 9130 // First, check for specific widening recipes that deal with calls, memory 9131 // operations, inductions and Phi nodes. 9132 if (auto *CI = dyn_cast<CallInst>(Instr)) 9133 return toVPRecipeResult(tryToWidenCall(CI, Operands, Range)); 9134 9135 if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr)) 9136 return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan)); 9137 9138 VPRecipeBase *Recipe; 9139 if (auto Phi = dyn_cast<PHINode>(Instr)) { 9140 if (Phi->getParent() != OrigLoop->getHeader()) 9141 return tryToBlend(Phi, Operands, Plan); 9142 if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands))) 9143 return toVPRecipeResult(Recipe); 9144 9145 VPWidenPHIRecipe *PhiRecipe = nullptr; 9146 if (Legal->isReductionVariable(Phi) || Legal->isFirstOrderRecurrence(Phi)) { 9147 VPValue *StartV = Operands[0]; 9148 if (Legal->isReductionVariable(Phi)) { 9149 RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi]; 9150 assert(RdxDesc.getRecurrenceStartValue() == 9151 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); 9152 PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV, 9153 CM.isInLoopReduction(Phi), 9154 CM.useOrderedReductions(RdxDesc)); 9155 } else { 9156 PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV); 9157 } 9158 9159 // Record the incoming value from the backedge, so we can add the incoming 9160 // value from the backedge after all recipes have been created. 9161 recordRecipeOf(cast<Instruction>( 9162 Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch()))); 9163 PhisToFix.push_back(PhiRecipe); 9164 } else { 9165 // TODO: record start and backedge value for remaining pointer induction 9166 // phis. 
      assert(Phi->getType()->isPointerTy() &&
             "only pointer phis should be handled here");
      PhiRecipe = new VPWidenPHIRecipe(Phi);
    }

    return toVPRecipeResult(PhiRecipe);
  }

  if (isa<TruncInst>(Instr) &&
      (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Operands,
                                               Range, *Plan)))
    return toVPRecipeResult(Recipe);

  if (!shouldWiden(Instr, Range))
    return nullptr;

  if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
    return toVPRecipeResult(new VPWidenGEPRecipe(
        GEP, make_range(Operands.begin(), Operands.end()), OrigLoop));

  if (auto *SI = dyn_cast<SelectInst>(Instr)) {
    bool InvariantCond =
        PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
    return toVPRecipeResult(new VPWidenSelectRecipe(
        *SI, make_range(Operands.begin(), Operands.end()), InvariantCond));
  }

  return toVPRecipeResult(tryToWiden(Instr, Operands));
}

void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
                                                        ElementCount MaxVF) {
  assert(OrigLoop->isInnermost() && "Inner loop expected.");

  // Collect instructions from the original loop that will become trivially
  // dead in the vectorized loop. We don't need to vectorize these
  // instructions. For example, original induction update instructions can
  // become dead because we separately emit induction "steps" when generating
  // code for the new loop. Similarly, we create a new latch condition when
  // setting up the structure of the new loop, so the old one can become dead.
  SmallPtrSet<Instruction *, 4> DeadInstructions;
  collectTriviallyDeadInstructions(DeadInstructions);

  // Add assume instructions we need to drop to DeadInstructions, to prevent
  // them from being added to the VPlan.
  // TODO: We only need to drop assumes in blocks that get flattened. If the
  // control flow is preserved, we should keep them.
  auto &ConditionalAssumes = Legal->getConditionalAssumes();
  DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());

  MapVector<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
  // Dead instructions do not need sinking. Remove them from SinkAfter.
  for (Instruction *I : DeadInstructions)
    SinkAfter.erase(I);

  // Cannot sink instructions after dead instructions (there won't be any
  // recipes for them). Instead, find the first non-dead previous instruction.
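  // (For example, if the recorded sink target is a dead trunc that only fed
  // the latch compare, we walk backwards from it until we reach a live
  // instruction and use that as the new sink target instead.)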
  for (auto &P : Legal->getSinkAfter()) {
    Instruction *SinkTarget = P.second;
    Instruction *FirstInst = &*SinkTarget->getParent()->begin();
    (void)FirstInst;
    while (DeadInstructions.contains(SinkTarget)) {
      assert(
          SinkTarget != FirstInst &&
          "Must find a live instruction (at least the one feeding the "
          "first-order recurrence PHI) before reaching beginning of the block");
      SinkTarget = SinkTarget->getPrevNode();
      assert(SinkTarget != P.first &&
             "sink source equals target, no sinking required");
    }
    P.second = SinkTarget;
  }

  auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
  for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
    VFRange SubRange = {VF, MaxVFPlusOne};
    VPlans.push_back(
        buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter));
    VF = SubRange.End;
  }
}

VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
    VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions,
    const MapVector<Instruction *, Instruction *> &SinkAfter) {

  SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;

  VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);

  // ---------------------------------------------------------------------------
  // Pre-construction: record ingredients whose recipes we'll need to further
  // process after constructing the initial VPlan.
  // ---------------------------------------------------------------------------

  // Mark instructions we'll need to sink later and their targets as
  // ingredients whose recipe we'll need to record.
  for (auto &Entry : SinkAfter) {
    RecipeBuilder.recordRecipeOf(Entry.first);
    RecipeBuilder.recordRecipeOf(Entry.second);
  }
  for (auto &Reduction : CM.getInLoopReductionChains()) {
    PHINode *Phi = Reduction.first;
    RecurKind Kind = Legal->getReductionVars()[Phi].getRecurrenceKind();
    const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;

    RecipeBuilder.recordRecipeOf(Phi);
    for (auto &R : ReductionOperations) {
      RecipeBuilder.recordRecipeOf(R);
      // For min/max reductions, where we have a pair of icmp/select, we also
      // need to record the ICmp recipe, so it can be removed later.
      assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
             "Only min/max recurrences allowed for inloop reductions");
      if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
        RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0)));
    }
  }

  // For each interleave group which is relevant for this (possibly trimmed)
  // Range, add it to the set of groups to be later applied to the VPlan and
  // add placeholders for its members' Recipes which we'll be replacing with a
  // single VPInterleaveRecipe.
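  // (For instance, loads of A[2*i] and A[2*i+1] that form a factor-2 group
  // will later be emitted as a single wide load plus shuffles, so here we only
  // note their member instructions.)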
9289 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) { 9290 auto applyIG = [IG, this](ElementCount VF) -> bool { 9291 return (VF.isVector() && // Query is illegal for VF == 1 9292 CM.getWideningDecision(IG->getInsertPos(), VF) == 9293 LoopVectorizationCostModel::CM_Interleave); 9294 }; 9295 if (!getDecisionAndClampRange(applyIG, Range)) 9296 continue; 9297 InterleaveGroups.insert(IG); 9298 for (unsigned i = 0; i < IG->getFactor(); i++) 9299 if (Instruction *Member = IG->getMember(i)) 9300 RecipeBuilder.recordRecipeOf(Member); 9301 }; 9302 9303 // --------------------------------------------------------------------------- 9304 // Build initial VPlan: Scan the body of the loop in a topological order to 9305 // visit each basic block after having visited its predecessor basic blocks. 9306 // --------------------------------------------------------------------------- 9307 9308 // Create a dummy pre-entry VPBasicBlock to start building the VPlan. 9309 auto Plan = std::make_unique<VPlan>(); 9310 9311 // Scan the body of the loop in a topological order to visit each basic block 9312 // after having visited its predecessor basic blocks. 9313 LoopBlocksDFS DFS(OrigLoop); 9314 DFS.perform(LI); 9315 9316 VPBasicBlock *VPBB = nullptr; 9317 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 9318 // Relevant instructions from basic block BB will be grouped into VPRecipe 9319 // ingredients and fill a new VPBasicBlock. 9320 unsigned VPBBsForBB = 0; 9321 auto *FirstVPBBForBB = new VPBasicBlock(BB->getName()); 9322 if (VPBB) 9323 VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB); 9324 else 9325 Plan->setEntry(FirstVPBBForBB); 9326 VPBB = FirstVPBBForBB; 9327 Builder.setInsertPoint(VPBB); 9328 9329 // Introduce each ingredient into VPlan. 9330 // TODO: Model and preserve debug instrinsics in VPlan. 9331 for (Instruction &I : BB->instructionsWithoutDebug()) { 9332 Instruction *Instr = &I; 9333 9334 // First filter out irrelevant instructions, to ensure no recipes are 9335 // built for them. 9336 if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr)) 9337 continue; 9338 9339 SmallVector<VPValue *, 4> Operands; 9340 auto *Phi = dyn_cast<PHINode>(Instr); 9341 if (Phi && Phi->getParent() == OrigLoop->getHeader()) { 9342 Operands.push_back(Plan->getOrAddVPValue( 9343 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()))); 9344 } else { 9345 auto OpRange = Plan->mapToVPValues(Instr->operands()); 9346 Operands = {OpRange.begin(), OpRange.end()}; 9347 } 9348 if (auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe( 9349 Instr, Operands, Range, Plan)) { 9350 // If Instr can be simplified to an existing VPValue, use it. 9351 if (RecipeOrValue.is<VPValue *>()) { 9352 auto *VPV = RecipeOrValue.get<VPValue *>(); 9353 Plan->addVPValue(Instr, VPV); 9354 // If the re-used value is a recipe, register the recipe for the 9355 // instruction, in case the recipe for Instr needs to be recorded. 9356 if (auto *R = dyn_cast_or_null<VPRecipeBase>(VPV->getDef())) 9357 RecipeBuilder.setRecipe(Instr, R); 9358 continue; 9359 } 9360 // Otherwise, add the new recipe. 9361 VPRecipeBase *Recipe = RecipeOrValue.get<VPRecipeBase *>(); 9362 for (auto *Def : Recipe->definedValues()) { 9363 auto *UV = Def->getUnderlyingValue(); 9364 Plan->addVPValue(UV, Def); 9365 } 9366 9367 RecipeBuilder.setRecipe(Instr, Recipe); 9368 if (isa<VPWidenIntOrFpInductionRecipe>(Recipe)) { 9369 // Make sure induction recipes are all kept in the header block. 
9370 // VPWidenIntOrFpInductionRecipe may be generated when reaching a 9371 // Trunc of an induction Phi, where Trunc may not be in the header. 9372 auto *Header = Plan->getEntry()->getEntryBasicBlock(); 9373 Header->insert(Recipe, Header->getFirstNonPhi()); 9374 } else 9375 VPBB->appendRecipe(Recipe); 9376 continue; 9377 } 9378 9379 // Otherwise, if all widening options failed, Instruction is to be 9380 // replicated. This may create a successor for VPBB. 9381 VPBasicBlock *NextVPBB = 9382 RecipeBuilder.handleReplication(Instr, Range, VPBB, Plan); 9383 if (NextVPBB != VPBB) { 9384 VPBB = NextVPBB; 9385 VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++) 9386 : ""); 9387 } 9388 } 9389 } 9390 9391 assert(isa<VPBasicBlock>(Plan->getEntry()) && 9392 !Plan->getEntry()->getEntryBasicBlock()->empty() && 9393 "entry block must be set to a non-empty VPBasicBlock"); 9394 RecipeBuilder.fixHeaderPhis(); 9395 9396 // --------------------------------------------------------------------------- 9397 // Transform initial VPlan: Apply previously taken decisions, in order, to 9398 // bring the VPlan to its final state. 9399 // --------------------------------------------------------------------------- 9400 9401 // Apply Sink-After legal constraints. 9402 auto GetReplicateRegion = [](VPRecipeBase *R) -> VPRegionBlock * { 9403 auto *Region = dyn_cast_or_null<VPRegionBlock>(R->getParent()->getParent()); 9404 if (Region && Region->isReplicator()) { 9405 assert(Region->getNumSuccessors() == 1 && 9406 Region->getNumPredecessors() == 1 && "Expected SESE region!"); 9407 assert(R->getParent()->size() == 1 && 9408 "A recipe in an original replicator region must be the only " 9409 "recipe in its block"); 9410 return Region; 9411 } 9412 return nullptr; 9413 }; 9414 for (auto &Entry : SinkAfter) { 9415 VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first); 9416 VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second); 9417 9418 auto *TargetRegion = GetReplicateRegion(Target); 9419 auto *SinkRegion = GetReplicateRegion(Sink); 9420 if (!SinkRegion) { 9421 // If the sink source is not a replicate region, sink the recipe directly. 9422 if (TargetRegion) { 9423 // The target is in a replication region, make sure to move Sink to 9424 // the block after it, not into the replication region itself. 9425 VPBasicBlock *NextBlock = 9426 cast<VPBasicBlock>(TargetRegion->getSuccessors().front()); 9427 Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi()); 9428 } else 9429 Sink->moveAfter(Target); 9430 continue; 9431 } 9432 9433 // The sink source is in a replicate region. Unhook the region from the CFG. 9434 auto *SinkPred = SinkRegion->getSinglePredecessor(); 9435 auto *SinkSucc = SinkRegion->getSingleSuccessor(); 9436 VPBlockUtils::disconnectBlocks(SinkPred, SinkRegion); 9437 VPBlockUtils::disconnectBlocks(SinkRegion, SinkSucc); 9438 VPBlockUtils::connectBlocks(SinkPred, SinkSucc); 9439 9440 if (TargetRegion) { 9441 // The target recipe is also in a replicate region, move the sink region 9442 // after the target region. 9443 auto *TargetSucc = TargetRegion->getSingleSuccessor(); 9444 VPBlockUtils::disconnectBlocks(TargetRegion, TargetSucc); 9445 VPBlockUtils::connectBlocks(TargetRegion, SinkRegion); 9446 VPBlockUtils::connectBlocks(SinkRegion, TargetSucc); 9447 } else { 9448 // The sink source is in a replicate region, we need to move the whole 9449 // replicate region, which should only contain a single recipe in the 9450 // main block. 
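      // (A sketch of the surgery below: split the target's block right after
      // the target recipe, then re-link the CFG as SplitPred -> SinkRegion ->
      // SplitBlock.)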
9451 auto *SplitBlock = 9452 Target->getParent()->splitAt(std::next(Target->getIterator())); 9453 9454 auto *SplitPred = SplitBlock->getSinglePredecessor(); 9455 9456 VPBlockUtils::disconnectBlocks(SplitPred, SplitBlock); 9457 VPBlockUtils::connectBlocks(SplitPred, SinkRegion); 9458 VPBlockUtils::connectBlocks(SinkRegion, SplitBlock); 9459 if (VPBB == SplitPred) 9460 VPBB = SplitBlock; 9461 } 9462 } 9463 9464 // Adjust the recipes for any inloop reductions. 9465 adjustRecipesForReductions(VPBB, Plan, RecipeBuilder, Range.Start); 9466 9467 // Introduce a recipe to combine the incoming and previous values of a 9468 // first-order recurrence. 9469 for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) { 9470 auto *RecurPhi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R); 9471 if (!RecurPhi) 9472 continue; 9473 9474 auto *RecurSplice = cast<VPInstruction>( 9475 Builder.createNaryOp(VPInstruction::FirstOrderRecurrenceSplice, 9476 {RecurPhi, RecurPhi->getBackedgeValue()})); 9477 9478 VPRecipeBase *PrevRecipe = RecurPhi->getBackedgeRecipe(); 9479 if (auto *Region = GetReplicateRegion(PrevRecipe)) { 9480 VPBasicBlock *Succ = cast<VPBasicBlock>(Region->getSingleSuccessor()); 9481 RecurSplice->moveBefore(*Succ, Succ->getFirstNonPhi()); 9482 } else 9483 RecurSplice->moveAfter(PrevRecipe); 9484 RecurPhi->replaceAllUsesWith(RecurSplice); 9485 // Set the first operand of RecurSplice to RecurPhi again, after replacing 9486 // all users. 9487 RecurSplice->setOperand(0, RecurPhi); 9488 } 9489 9490 // Interleave memory: for each Interleave Group we marked earlier as relevant 9491 // for this VPlan, replace the Recipes widening its memory instructions with a 9492 // single VPInterleaveRecipe at its insertion point. 9493 for (auto IG : InterleaveGroups) { 9494 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( 9495 RecipeBuilder.getRecipe(IG->getInsertPos())); 9496 SmallVector<VPValue *, 4> StoredValues; 9497 for (unsigned i = 0; i < IG->getFactor(); ++i) 9498 if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) { 9499 auto *StoreR = 9500 cast<VPWidenMemoryInstructionRecipe>(RecipeBuilder.getRecipe(SI)); 9501 StoredValues.push_back(StoreR->getStoredValue()); 9502 } 9503 9504 auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues, 9505 Recipe->getMask()); 9506 VPIG->insertBefore(Recipe); 9507 unsigned J = 0; 9508 for (unsigned i = 0; i < IG->getFactor(); ++i) 9509 if (Instruction *Member = IG->getMember(i)) { 9510 if (!Member->getType()->isVoidTy()) { 9511 VPValue *OriginalV = Plan->getVPValue(Member); 9512 Plan->removeVPValueFor(Member); 9513 Plan->addVPValue(Member, VPIG->getVPValue(J)); 9514 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J)); 9515 J++; 9516 } 9517 RecipeBuilder.getRecipe(Member)->eraseFromParent(); 9518 } 9519 } 9520 9521 // From this point onwards, VPlan-to-VPlan transformations may change the plan 9522 // in ways that accessing values using original IR values is incorrect. 
9523 Plan->disableValue2VPValue();
9524
9525 VPlanTransforms::sinkScalarOperands(*Plan);
9526 VPlanTransforms::mergeReplicateRegions(*Plan);
9527
9528 std::string PlanName;
9529 raw_string_ostream RSO(PlanName);
9530 ElementCount VF = Range.Start;
9531 Plan->addVF(VF);
9532 RSO << "Initial VPlan for VF={" << VF;
9533 for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) {
9534 Plan->addVF(VF);
9535 RSO << "," << VF;
9536 }
9537 RSO << "},UF>=1";
9538 RSO.flush();
9539 Plan->setName(PlanName);
9540
9541 return Plan;
9542 }
9543
9544 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
9545 // Outer loop handling: outer loops may require CFG and instruction level
9546 // transformations before even evaluating whether vectorization is profitable.
9547 // Since we cannot modify the incoming IR, we need to build VPlan upfront in
9548 // the vectorization pipeline.
9549 assert(!OrigLoop->isInnermost());
9550 assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
9551
9552 // Create new empty VPlan
9553 auto Plan = std::make_unique<VPlan>();
9554
9555 // Build hierarchical CFG
9556 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
9557 HCFGBuilder.buildHierarchicalCFG();
9558
9559 for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End);
9560 VF *= 2)
9561 Plan->addVF(VF);
9562
9563 if (EnableVPlanPredication) {
9564 VPlanPredicator VPP(*Plan);
9565 VPP.predicate();
9566
9567 // Avoid running transformation to recipes until masked code generation in
9568 // VPlan-native path is in place.
9569 return Plan;
9570 }
9571
9572 SmallPtrSet<Instruction *, 1> DeadInstructions;
9573 VPlanTransforms::VPInstructionsToVPRecipes(OrigLoop, Plan,
9574 Legal->getInductionVars(),
9575 DeadInstructions, *PSE.getSE());
9576 return Plan;
9577 }
9578
9579 // Adjust the recipes for reductions. For in-loop reductions the chain of
9580 // instructions leading from the loop exit instr to the phi needs to be converted
9581 // to reductions, with one operand being vector and the other being the scalar
9582 // reduction chain. For other reductions, a select is introduced between the phi
9583 // and live-out recipes when folding the tail.
9584 void LoopVectorizationPlanner::adjustRecipesForReductions(
9585 VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder,
9586 ElementCount MinVF) {
9587 for (auto &Reduction : CM.getInLoopReductionChains()) {
9588 PHINode *Phi = Reduction.first;
9589 RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
9590 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
9591
9592 if (MinVF.isScalar() && !CM.useOrderedReductions(RdxDesc))
9593 continue;
9594
9595 // ReductionOperations are ordered top-down from the phi's use to the
9596 // LoopExitValue. We keep track of the previous item (the Chain) to tell
9597 // which of the two operands will remain scalar and which will be reduced.
9598 // For minmax the chain will be the select instructions.
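// For example, for an in-loop integer add reduction of the form
//   %phi = phi i32 [ 0, %preheader ], [ %sum, %latch ]
//   %sum = add i32 %phi, %val
// the chain visits %sum: %phi (the Chain) supplies the scalar chain operand
// while %val becomes the vector operand of the VPReductionRecipe built below.
// (The IR names above are illustrative only.)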
9599 Instruction *Chain = Phi; 9600 for (Instruction *R : ReductionOperations) { 9601 VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R); 9602 RecurKind Kind = RdxDesc.getRecurrenceKind(); 9603 9604 VPValue *ChainOp = Plan->getVPValue(Chain); 9605 unsigned FirstOpId; 9606 assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) && 9607 "Only min/max recurrences allowed for inloop reductions"); 9608 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9609 assert(isa<VPWidenSelectRecipe>(WidenRecipe) && 9610 "Expected to replace a VPWidenSelectSC"); 9611 FirstOpId = 1; 9612 } else { 9613 assert((MinVF.isScalar() || isa<VPWidenRecipe>(WidenRecipe)) && 9614 "Expected to replace a VPWidenSC"); 9615 FirstOpId = 0; 9616 } 9617 unsigned VecOpId = 9618 R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId; 9619 VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId)); 9620 9621 auto *CondOp = CM.foldTailByMasking() 9622 ? RecipeBuilder.createBlockInMask(R->getParent(), Plan) 9623 : nullptr; 9624 VPReductionRecipe *RedRecipe = new VPReductionRecipe( 9625 &RdxDesc, R, ChainOp, VecOp, CondOp, TTI); 9626 WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe); 9627 Plan->removeVPValueFor(R); 9628 Plan->addVPValue(R, RedRecipe); 9629 WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator()); 9630 WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe); 9631 WidenRecipe->eraseFromParent(); 9632 9633 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9634 VPRecipeBase *CompareRecipe = 9635 RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0))); 9636 assert(isa<VPWidenRecipe>(CompareRecipe) && 9637 "Expected to replace a VPWidenSC"); 9638 assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 && 9639 "Expected no remaining users"); 9640 CompareRecipe->eraseFromParent(); 9641 } 9642 Chain = R; 9643 } 9644 } 9645 9646 // If tail is folded by masking, introduce selects between the phi 9647 // and the live-out instruction of each reduction, at the end of the latch. 
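// Conceptually each such select computes Red' = select(HeaderMask, Red, Phi),
// so lanes disabled by the fold-tail mask keep the phi's previous value and do
// not feed a spurious element into the final reduction.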
9648 if (CM.foldTailByMasking()) { 9649 for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) { 9650 VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R); 9651 if (!PhiR || PhiR->isInLoop()) 9652 continue; 9653 Builder.setInsertPoint(LatchVPBB); 9654 VPValue *Cond = 9655 RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan); 9656 VPValue *Red = PhiR->getBackedgeValue(); 9657 Builder.createNaryOp(Instruction::Select, {Cond, Red, PhiR}); 9658 } 9659 } 9660 } 9661 9662 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 9663 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, 9664 VPSlotTracker &SlotTracker) const { 9665 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; 9666 IG->getInsertPos()->printAsOperand(O, false); 9667 O << ", "; 9668 getAddr()->printAsOperand(O, SlotTracker); 9669 VPValue *Mask = getMask(); 9670 if (Mask) { 9671 O << ", "; 9672 Mask->printAsOperand(O, SlotTracker); 9673 } 9674 9675 unsigned OpIdx = 0; 9676 for (unsigned i = 0; i < IG->getFactor(); ++i) { 9677 if (!IG->getMember(i)) 9678 continue; 9679 if (getNumStoreOperands() > 0) { 9680 O << "\n" << Indent << " store "; 9681 getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker); 9682 O << " to index " << i; 9683 } else { 9684 O << "\n" << Indent << " "; 9685 getVPValue(OpIdx)->printAsOperand(O, SlotTracker); 9686 O << " = load from index " << i; 9687 } 9688 ++OpIdx; 9689 } 9690 } 9691 #endif 9692 9693 void VPWidenCallRecipe::execute(VPTransformState &State) { 9694 State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this, 9695 *this, State); 9696 } 9697 9698 void VPWidenSelectRecipe::execute(VPTransformState &State) { 9699 State.ILV->widenSelectInstruction(*cast<SelectInst>(getUnderlyingInstr()), 9700 this, *this, InvariantCond, State); 9701 } 9702 9703 void VPWidenRecipe::execute(VPTransformState &State) { 9704 State.ILV->widenInstruction(*getUnderlyingInstr(), this, *this, State); 9705 } 9706 9707 void VPWidenGEPRecipe::execute(VPTransformState &State) { 9708 State.ILV->widenGEP(cast<GetElementPtrInst>(getUnderlyingInstr()), this, 9709 *this, State.UF, State.VF, IsPtrLoopInvariant, 9710 IsIndexLoopInvariant, State); 9711 } 9712 9713 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { 9714 assert(!State.Instance && "Int or FP induction being replicated."); 9715 State.ILV->widenIntOrFpInduction(IV, getStartValue()->getLiveInIRValue(), 9716 getTruncInst(), getVPValue(0), 9717 getCastValue(), State); 9718 } 9719 9720 void VPWidenPHIRecipe::execute(VPTransformState &State) { 9721 State.ILV->widenPHIInstruction(cast<PHINode>(getUnderlyingValue()), this, 9722 State); 9723 } 9724 9725 void VPBlendRecipe::execute(VPTransformState &State) { 9726 State.ILV->setDebugLocFromInst(Phi, &State.Builder); 9727 // We know that all PHIs in non-header blocks are converted into 9728 // selects, so we don't have to worry about the insertion order and we 9729 // can just use the builder. 9730 // At this point we generate the predication tree. There may be 9731 // duplications since this is a simple recursive scan, but future 9732 // optimizations will clean it up. 9733 9734 unsigned NumIncoming = getNumIncomingValues(); 9735 9736 // Generate a sequence of selects of the form: 9737 // SELECT(Mask3, In3, 9738 // SELECT(Mask2, In2, 9739 // SELECT(Mask1, In1, 9740 // In0))) 9741 // Note that Mask0 is never used: lanes for which no path reaches this phi and 9742 // are essentially undef are taken from In0. 
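// Entry[Part] accumulates the nested selects for unroll part Part: the first
// incoming value seeds it, and each further incoming value adds one select
// keyed by the mask of its incoming edge.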
9743 InnerLoopVectorizer::VectorParts Entry(State.UF); 9744 for (unsigned In = 0; In < NumIncoming; ++In) { 9745 for (unsigned Part = 0; Part < State.UF; ++Part) { 9746 // We might have single edge PHIs (blocks) - use an identity 9747 // 'select' for the first PHI operand. 9748 Value *In0 = State.get(getIncomingValue(In), Part); 9749 if (In == 0) 9750 Entry[Part] = In0; // Initialize with the first incoming value. 9751 else { 9752 // Select between the current value and the previous incoming edge 9753 // based on the incoming mask. 9754 Value *Cond = State.get(getMask(In), Part); 9755 Entry[Part] = 9756 State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi"); 9757 } 9758 } 9759 } 9760 for (unsigned Part = 0; Part < State.UF; ++Part) 9761 State.set(this, Entry[Part], Part); 9762 } 9763 9764 void VPInterleaveRecipe::execute(VPTransformState &State) { 9765 assert(!State.Instance && "Interleave group being replicated."); 9766 State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(), 9767 getStoredValues(), getMask()); 9768 } 9769 9770 void VPReductionRecipe::execute(VPTransformState &State) { 9771 assert(!State.Instance && "Reduction being replicated."); 9772 Value *PrevInChain = State.get(getChainOp(), 0); 9773 RecurKind Kind = RdxDesc->getRecurrenceKind(); 9774 bool IsOrdered = State.ILV->useOrderedReductions(*RdxDesc); 9775 // Propagate the fast-math flags carried by the underlying instruction. 9776 IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder); 9777 State.Builder.setFastMathFlags(RdxDesc->getFastMathFlags()); 9778 for (unsigned Part = 0; Part < State.UF; ++Part) { 9779 Value *NewVecOp = State.get(getVecOp(), Part); 9780 if (VPValue *Cond = getCondOp()) { 9781 Value *NewCond = State.get(Cond, Part); 9782 VectorType *VecTy = cast<VectorType>(NewVecOp->getType()); 9783 Value *Iden = RdxDesc->getRecurrenceIdentity( 9784 Kind, VecTy->getElementType(), RdxDesc->getFastMathFlags()); 9785 Value *IdenVec = 9786 State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden); 9787 Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec); 9788 NewVecOp = Select; 9789 } 9790 Value *NewRed; 9791 Value *NextInChain; 9792 if (IsOrdered) { 9793 if (State.VF.isVector()) 9794 NewRed = createOrderedReduction(State.Builder, *RdxDesc, NewVecOp, 9795 PrevInChain); 9796 else 9797 NewRed = State.Builder.CreateBinOp( 9798 (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), PrevInChain, 9799 NewVecOp); 9800 PrevInChain = NewRed; 9801 } else { 9802 PrevInChain = State.get(getChainOp(), Part); 9803 NewRed = createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp); 9804 } 9805 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9806 NextInChain = 9807 createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(), 9808 NewRed, PrevInChain); 9809 } else if (IsOrdered) 9810 NextInChain = NewRed; 9811 else 9812 NextInChain = State.Builder.CreateBinOp( 9813 (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), NewRed, 9814 PrevInChain); 9815 State.set(this, NextInChain, Part); 9816 } 9817 } 9818 9819 void VPReplicateRecipe::execute(VPTransformState &State) { 9820 if (State.Instance) { // Generate a single instance. 9821 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector"); 9822 State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *this, 9823 *State.Instance, IsPredicated, State); 9824 // Insert scalar instance packing it into a vector. 9825 if (AlsoPack && State.VF.isVector()) { 9826 // If we're constructing lane 0, initialize to start from poison. 
9827 if (State.Instance->Lane.isFirstLane()) {
9828 assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
9829 Value *Poison = PoisonValue::get(
9830 VectorType::get(getUnderlyingValue()->getType(), State.VF));
9831 State.set(this, Poison, State.Instance->Part);
9832 }
9833 State.ILV->packScalarIntoVectorValue(this, *State.Instance, State);
9834 }
9835 return;
9836 }
9837
9838 // Generate scalar instances for all VF lanes of all UF parts, unless the
9839 // instruction is uniform, in which case generate only the first lane for each
9840 // of the UF parts.
9841 unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();
9842 assert((!State.VF.isScalable() || IsUniform) &&
9843 "Can't scalarize a scalable vector");
9844 for (unsigned Part = 0; Part < State.UF; ++Part)
9845 for (unsigned Lane = 0; Lane < EndLane; ++Lane)
9846 State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *this,
9847 VPIteration(Part, Lane), IsPredicated,
9848 State);
9849 }
9850
9851 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
9852 assert(State.Instance && "Branch on Mask works only on single instance.");
9853
9854 unsigned Part = State.Instance->Part;
9855 unsigned Lane = State.Instance->Lane.getKnownLane();
9856
9857 Value *ConditionBit = nullptr;
9858 VPValue *BlockInMask = getMask();
9859 if (BlockInMask) {
9860 ConditionBit = State.get(BlockInMask, Part);
9861 if (ConditionBit->getType()->isVectorTy())
9862 ConditionBit = State.Builder.CreateExtractElement(
9863 ConditionBit, State.Builder.getInt32(Lane));
9864 } else // Block in mask is all-one.
9865 ConditionBit = State.Builder.getTrue();
9866
9867 // Replace the temporary unreachable terminator with a new conditional branch,
9868 // whose two destinations will be set later when they are created.
9869 auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
9870 assert(isa<UnreachableInst>(CurrentTerminator) &&
9871 "Expected to replace unreachable terminator with conditional branch.");
9872 auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
9873 CondBr->setSuccessor(0, nullptr);
9874 ReplaceInstWithInst(CurrentTerminator, CondBr);
9875 }
9876
9877 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
9878 assert(State.Instance && "Predicated instruction PHI works per instance.");
9879 Instruction *ScalarPredInst =
9880 cast<Instruction>(State.get(getOperand(0), *State.Instance));
9881 BasicBlock *PredicatedBB = ScalarPredInst->getParent();
9882 BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
9883 assert(PredicatingBB && "Predicated block has no single predecessor.");
9884 assert(isa<VPReplicateRecipe>(getOperand(0)) &&
9885 "operand must be VPReplicateRecipe");
9886
9887 // By current pack/unpack logic we need to generate only a single phi node: if
9888 // a vector value for the predicated instruction exists at this point it means
9889 // the instruction has vector users only, and a phi for the vector value is
9890 // needed. In this case the recipe of the predicated instruction is marked to
9891 // also do that packing, thereby "hoisting" the insert-element sequence.
9892 // Otherwise, a phi node for the scalar value is needed.
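// Rough shape of the IR produced below (names are illustrative only):
//   vector case:  %vphi = phi <N x T> [ %vec.before, %predicating.bb ],
//                                     [ %vec.with.elem, %predicated.bb ]
//   scalar case:  %sphi = phi T [ poison, %predicating.bb ],
//                               [ %scalar.inst, %predicated.bb ]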
9893 unsigned Part = State.Instance->Part; 9894 if (State.hasVectorValue(getOperand(0), Part)) { 9895 Value *VectorValue = State.get(getOperand(0), Part); 9896 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); 9897 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); 9898 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. 9899 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. 9900 if (State.hasVectorValue(this, Part)) 9901 State.reset(this, VPhi, Part); 9902 else 9903 State.set(this, VPhi, Part); 9904 // NOTE: Currently we need to update the value of the operand, so the next 9905 // predicated iteration inserts its generated value in the correct vector. 9906 State.reset(getOperand(0), VPhi, Part); 9907 } else { 9908 Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType(); 9909 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); 9910 Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()), 9911 PredicatingBB); 9912 Phi->addIncoming(ScalarPredInst, PredicatedBB); 9913 if (State.hasScalarValue(this, *State.Instance)) 9914 State.reset(this, Phi, *State.Instance); 9915 else 9916 State.set(this, Phi, *State.Instance); 9917 // NOTE: Currently we need to update the value of the operand, so the next 9918 // predicated iteration inserts its generated value in the correct vector. 9919 State.reset(getOperand(0), Phi, *State.Instance); 9920 } 9921 } 9922 9923 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { 9924 VPValue *StoredValue = isStore() ? getStoredValue() : nullptr; 9925 State.ILV->vectorizeMemoryInstruction( 9926 &Ingredient, State, StoredValue ? nullptr : getVPSingleValue(), getAddr(), 9927 StoredValue, getMask(), Consecutive, Reverse); 9928 } 9929 9930 // Determine how to lower the scalar epilogue, which depends on 1) optimising 9931 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing 9932 // predication, and 4) a TTI hook that analyses whether the loop is suitable 9933 // for predication. 9934 static ScalarEpilogueLowering getScalarEpilogueLowering( 9935 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, 9936 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, 9937 AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, 9938 LoopVectorizationLegality &LVL) { 9939 // 1) OptSize takes precedence over all other options, i.e. if this is set, 9940 // don't look at hints or options, and don't request a scalar epilogue. 9941 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from 9942 // LoopAccessInfo (due to code dependency and not being able to reliably get 9943 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection 9944 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without 9945 // versioning when the vectorization is forced, unlike hasOptSize. So revert 9946 // back to the old way and vectorize with versioning when forced. See D81345.) 
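// In short, the order of precedence is: (1) size optimization (hasOptSize /
// PGSO), then (2) the PreferPredicateOverEpilogue option, then (3) the loop
// predicate hint, and finally (4) the TTI preferPredicateOverEpilogue hook.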
9947 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, 9948 PGSOQueryType::IRPass) && 9949 Hints.getForce() != LoopVectorizeHints::FK_Enabled)) 9950 return CM_ScalarEpilogueNotAllowedOptSize; 9951 9952 // 2) If set, obey the directives 9953 if (PreferPredicateOverEpilogue.getNumOccurrences()) { 9954 switch (PreferPredicateOverEpilogue) { 9955 case PreferPredicateTy::ScalarEpilogue: 9956 return CM_ScalarEpilogueAllowed; 9957 case PreferPredicateTy::PredicateElseScalarEpilogue: 9958 return CM_ScalarEpilogueNotNeededUsePredicate; 9959 case PreferPredicateTy::PredicateOrDontVectorize: 9960 return CM_ScalarEpilogueNotAllowedUsePredicate; 9961 }; 9962 } 9963 9964 // 3) If set, obey the hints 9965 switch (Hints.getPredicate()) { 9966 case LoopVectorizeHints::FK_Enabled: 9967 return CM_ScalarEpilogueNotNeededUsePredicate; 9968 case LoopVectorizeHints::FK_Disabled: 9969 return CM_ScalarEpilogueAllowed; 9970 }; 9971 9972 // 4) if the TTI hook indicates this is profitable, request predication. 9973 if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, 9974 LVL.getLAI())) 9975 return CM_ScalarEpilogueNotNeededUsePredicate; 9976 9977 return CM_ScalarEpilogueAllowed; 9978 } 9979 9980 Value *VPTransformState::get(VPValue *Def, unsigned Part) { 9981 // If Values have been set for this Def return the one relevant for \p Part. 9982 if (hasVectorValue(Def, Part)) 9983 return Data.PerPartOutput[Def][Part]; 9984 9985 if (!hasScalarValue(Def, {Part, 0})) { 9986 Value *IRV = Def->getLiveInIRValue(); 9987 Value *B = ILV->getBroadcastInstrs(IRV); 9988 set(Def, B, Part); 9989 return B; 9990 } 9991 9992 Value *ScalarValue = get(Def, {Part, 0}); 9993 // If we aren't vectorizing, we can just copy the scalar map values over 9994 // to the vector map. 9995 if (VF.isScalar()) { 9996 set(Def, ScalarValue, Part); 9997 return ScalarValue; 9998 } 9999 10000 auto *RepR = dyn_cast<VPReplicateRecipe>(Def); 10001 bool IsUniform = RepR && RepR->isUniform(); 10002 10003 unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1; 10004 // Check if there is a scalar value for the selected lane. 10005 if (!hasScalarValue(Def, {Part, LastLane})) { 10006 // At the moment, VPWidenIntOrFpInductionRecipes can also be uniform. 10007 assert(isa<VPWidenIntOrFpInductionRecipe>(Def->getDef()) && 10008 "unexpected recipe found to be invariant"); 10009 IsUniform = true; 10010 LastLane = 0; 10011 } 10012 10013 auto *LastInst = cast<Instruction>(get(Def, {Part, LastLane})); 10014 // Set the insert point after the last scalarized instruction or after the 10015 // last PHI, if LastInst is a PHI. This ensures the insertelement sequence 10016 // will directly follow the scalar definitions. 10017 auto OldIP = Builder.saveIP(); 10018 auto NewIP = 10019 isa<PHINode>(LastInst) 10020 ? BasicBlock::iterator(LastInst->getParent()->getFirstNonPHI()) 10021 : std::next(BasicBlock::iterator(LastInst)); 10022 Builder.SetInsertPoint(&*NewIP); 10023 10024 // However, if we are vectorizing, we need to construct the vector values. 10025 // If the value is known to be uniform after vectorization, we can just 10026 // broadcast the scalar value corresponding to lane zero for each unroll 10027 // iteration. Otherwise, we construct the vector values using 10028 // insertelement instructions. Since the resulting vectors are stored in 10029 // State, we will only generate the insertelements once. 
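// E.g. for VF = 4 and a non-uniform Def, the packing below builds a chain of
// insertelements (a sketch, with illustrative names):
//   %v0 = insertelement <4 x T> poison, T %s0, i32 0
//   %v1 = insertelement <4 x T> %v0,    T %s1, i32 1
//   %v2 = insertelement <4 x T> %v1,    T %s2, i32 2
//   %v3 = insertelement <4 x T> %v2,    T %s3, i32 3
// and records %v3 as the vector value of Def for this part.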
10030 Value *VectorValue = nullptr; 10031 if (IsUniform) { 10032 VectorValue = ILV->getBroadcastInstrs(ScalarValue); 10033 set(Def, VectorValue, Part); 10034 } else { 10035 // Initialize packing with insertelements to start from undef. 10036 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 10037 Value *Undef = PoisonValue::get(VectorType::get(LastInst->getType(), VF)); 10038 set(Def, Undef, Part); 10039 for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane) 10040 ILV->packScalarIntoVectorValue(Def, {Part, Lane}, *this); 10041 VectorValue = get(Def, Part); 10042 } 10043 Builder.restoreIP(OldIP); 10044 return VectorValue; 10045 } 10046 10047 // Process the loop in the VPlan-native vectorization path. This path builds 10048 // VPlan upfront in the vectorization pipeline, which allows to apply 10049 // VPlan-to-VPlan transformations from the very beginning without modifying the 10050 // input LLVM IR. 10051 static bool processLoopInVPlanNativePath( 10052 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, 10053 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, 10054 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, 10055 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, 10056 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints, 10057 LoopVectorizationRequirements &Requirements) { 10058 10059 if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) { 10060 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n"); 10061 return false; 10062 } 10063 assert(EnableVPlanNativePath && "VPlan-native path is disabled."); 10064 Function *F = L->getHeader()->getParent(); 10065 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); 10066 10067 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 10068 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL); 10069 10070 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, 10071 &Hints, IAI); 10072 // Use the planner for outer loop vectorization. 10073 // TODO: CM is not used at this point inside the planner. Turn CM into an 10074 // optional argument if we don't need it in the future. 10075 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints, 10076 Requirements, ORE); 10077 10078 // Get user vectorization factor. 10079 ElementCount UserVF = Hints.getWidth(); 10080 10081 CM.collectElementTypesForWidening(); 10082 10083 // Plan how to best vectorize, return the best VF and its cost. 10084 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF); 10085 10086 // If we are stress testing VPlan builds, do not attempt to generate vector 10087 // code. Masked vector code generation support will follow soon. 10088 // Also, do not attempt to vectorize if no vector code will be produced. 10089 if (VPlanBuildStressTest || EnableVPlanPredication || 10090 VectorizationFactor::Disabled() == VF) 10091 return false; 10092 10093 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); 10094 10095 { 10096 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, 10097 F->getParent()->getDataLayout()); 10098 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL, 10099 &CM, BFI, PSI, Checks); 10100 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" 10101 << L->getHeader()->getParent()->getName() << "\"\n"); 10102 LVP.executePlan(VF.Width, 1, BestPlan, LB, DT); 10103 } 10104 10105 // Mark the loop as already vectorized to avoid vectorizing again. 
10106 Hints.setAlreadyVectorized(); 10107 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 10108 return true; 10109 } 10110 10111 // Emit a remark if there are stores to floats that required a floating point 10112 // extension. If the vectorized loop was generated with floating point there 10113 // will be a performance penalty from the conversion overhead and the change in 10114 // the vector width. 10115 static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) { 10116 SmallVector<Instruction *, 4> Worklist; 10117 for (BasicBlock *BB : L->getBlocks()) { 10118 for (Instruction &Inst : *BB) { 10119 if (auto *S = dyn_cast<StoreInst>(&Inst)) { 10120 if (S->getValueOperand()->getType()->isFloatTy()) 10121 Worklist.push_back(S); 10122 } 10123 } 10124 } 10125 10126 // Traverse the floating point stores upwards searching, for floating point 10127 // conversions. 10128 SmallPtrSet<const Instruction *, 4> Visited; 10129 SmallPtrSet<const Instruction *, 4> EmittedRemark; 10130 while (!Worklist.empty()) { 10131 auto *I = Worklist.pop_back_val(); 10132 if (!L->contains(I)) 10133 continue; 10134 if (!Visited.insert(I).second) 10135 continue; 10136 10137 // Emit a remark if the floating point store required a floating 10138 // point conversion. 10139 // TODO: More work could be done to identify the root cause such as a 10140 // constant or a function return type and point the user to it. 10141 if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second) 10142 ORE->emit([&]() { 10143 return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision", 10144 I->getDebugLoc(), L->getHeader()) 10145 << "floating point conversion changes vector width. " 10146 << "Mixed floating point precision requires an up/down " 10147 << "cast that will negatively impact performance."; 10148 }); 10149 10150 for (Use &Op : I->operands()) 10151 if (auto *OpI = dyn_cast<Instruction>(Op)) 10152 Worklist.push_back(OpI); 10153 } 10154 } 10155 10156 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts) 10157 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced || 10158 !EnableLoopInterleaving), 10159 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced || 10160 !EnableLoopVectorization) {} 10161 10162 bool LoopVectorizePass::processLoop(Loop *L) { 10163 assert((EnableVPlanNativePath || L->isInnermost()) && 10164 "VPlan-native path is not enabled. Only process inner loops."); 10165 10166 #ifndef NDEBUG 10167 const std::string DebugLocStr = getDebugLocString(L); 10168 #endif /* NDEBUG */ 10169 10170 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \"" 10171 << L->getHeader()->getParent()->getName() << "\" from " 10172 << DebugLocStr << "\n"); 10173 10174 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE); 10175 10176 LLVM_DEBUG( 10177 dbgs() << "LV: Loop hints:" 10178 << " force=" 10179 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled 10180 ? "disabled" 10181 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled 10182 ? "enabled" 10183 : "?")) 10184 << " width=" << Hints.getWidth() 10185 << " interleave=" << Hints.getInterleave() << "\n"); 10186 10187 // Function containing loop 10188 Function *F = L->getHeader()->getParent(); 10189 10190 // Looking at the diagnostic output is the only way to determine if a loop 10191 // was vectorized (other than looking at the IR or machine code), so it 10192 // is important to generate an optimization remark for each loop. Most of 10193 // these messages are generated as OptimizationRemarkAnalysis. 
Remarks 10194 // generated as OptimizationRemark and OptimizationRemarkMissed are 10195 // less verbose reporting vectorized loops and unvectorized loops that may 10196 // benefit from vectorization, respectively. 10197 10198 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) { 10199 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); 10200 return false; 10201 } 10202 10203 PredicatedScalarEvolution PSE(*SE, *L); 10204 10205 // Check if it is legal to vectorize the loop. 10206 LoopVectorizationRequirements Requirements; 10207 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE, 10208 &Requirements, &Hints, DB, AC, BFI, PSI); 10209 if (!LVL.canVectorize(EnableVPlanNativePath)) { 10210 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); 10211 Hints.emitRemarkWithHints(); 10212 return false; 10213 } 10214 10215 // Check the function attributes and profiles to find out if this function 10216 // should be optimized for size. 10217 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 10218 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL); 10219 10220 // Entrance to the VPlan-native vectorization path. Outer loops are processed 10221 // here. They may require CFG and instruction level transformations before 10222 // even evaluating whether vectorization is profitable. Since we cannot modify 10223 // the incoming IR, we need to build VPlan upfront in the vectorization 10224 // pipeline. 10225 if (!L->isInnermost()) 10226 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, 10227 ORE, BFI, PSI, Hints, Requirements); 10228 10229 assert(L->isInnermost() && "Inner loop expected."); 10230 10231 // Check the loop for a trip count threshold: vectorize loops with a tiny trip 10232 // count by optimizing for size, to minimize overheads. 10233 auto ExpectedTC = getSmallBestKnownTC(*SE, L); 10234 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) { 10235 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " 10236 << "This loop is worth vectorizing only if no scalar " 10237 << "iteration overheads are incurred."); 10238 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) 10239 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); 10240 else { 10241 LLVM_DEBUG(dbgs() << "\n"); 10242 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; 10243 } 10244 } 10245 10246 // Check the function attributes to see if implicit floats are allowed. 10247 // FIXME: This check doesn't seem possibly correct -- what if the loop is 10248 // an integer loop and the vector instructions selected are purely integer 10249 // vector instructions? 10250 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { 10251 reportVectorizationFailure( 10252 "Can't vectorize when the NoImplicitFloat attribute is used", 10253 "loop not vectorized due to NoImplicitFloat attribute", 10254 "NoImplicitFloat", ORE, L); 10255 Hints.emitRemarkWithHints(); 10256 return false; 10257 } 10258 10259 // Check if the target supports potentially unsafe FP vectorization. 10260 // FIXME: Add a check for the type of safety issue (denormal, signaling) 10261 // for the target we're vectorizing for, to make sure none of the 10262 // additional fp-math flags can help. 
10263 if (Hints.isPotentiallyUnsafe() && 10264 TTI->isFPVectorizationPotentiallyUnsafe()) { 10265 reportVectorizationFailure( 10266 "Potentially unsafe FP op prevents vectorization", 10267 "loop not vectorized due to unsafe FP support.", 10268 "UnsafeFP", ORE, L); 10269 Hints.emitRemarkWithHints(); 10270 return false; 10271 } 10272 10273 bool AllowOrderedReductions; 10274 // If the flag is set, use that instead and override the TTI behaviour. 10275 if (ForceOrderedReductions.getNumOccurrences() > 0) 10276 AllowOrderedReductions = ForceOrderedReductions; 10277 else 10278 AllowOrderedReductions = TTI->enableOrderedReductions(); 10279 if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) { 10280 ORE->emit([&]() { 10281 auto *ExactFPMathInst = Requirements.getExactFPInst(); 10282 return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps", 10283 ExactFPMathInst->getDebugLoc(), 10284 ExactFPMathInst->getParent()) 10285 << "loop not vectorized: cannot prove it is safe to reorder " 10286 "floating-point operations"; 10287 }); 10288 LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to " 10289 "reorder floating-point operations\n"); 10290 Hints.emitRemarkWithHints(); 10291 return false; 10292 } 10293 10294 bool UseInterleaved = TTI->enableInterleavedAccessVectorization(); 10295 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI()); 10296 10297 // If an override option has been passed in for interleaved accesses, use it. 10298 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0) 10299 UseInterleaved = EnableInterleavedMemAccesses; 10300 10301 // Analyze interleaved memory accesses. 10302 if (UseInterleaved) { 10303 IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI)); 10304 } 10305 10306 // Use the cost model. 10307 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, 10308 F, &Hints, IAI); 10309 CM.collectValuesToIgnore(); 10310 CM.collectElementTypesForWidening(); 10311 10312 // Use the planner for vectorization. 10313 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints, 10314 Requirements, ORE); 10315 10316 // Get user vectorization factor and interleave count. 10317 ElementCount UserVF = Hints.getWidth(); 10318 unsigned UserIC = Hints.getInterleave(); 10319 10320 // Plan how to best vectorize, return the best VF and its cost. 10321 Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC); 10322 10323 VectorizationFactor VF = VectorizationFactor::Disabled(); 10324 unsigned IC = 1; 10325 10326 if (MaybeVF) { 10327 VF = *MaybeVF; 10328 // Select the interleave count. 10329 IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue()); 10330 } 10331 10332 // Identify the diagnostic messages that should be produced. 10333 std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg; 10334 bool VectorizeLoop = true, InterleaveLoop = true; 10335 if (VF.Width.isScalar()) { 10336 LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n"); 10337 VecDiagMsg = std::make_pair( 10338 "VectorizationNotBeneficial", 10339 "the cost-model indicates that vectorization is not beneficial"); 10340 VectorizeLoop = false; 10341 } 10342 10343 if (!MaybeVF && UserIC > 1) { 10344 // Tell the user interleaving was avoided up-front, despite being explicitly 10345 // requested. 
10346 LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
10347 "interleaving should be avoided up front\n");
10348 IntDiagMsg = std::make_pair(
10349 "InterleavingAvoided",
10350 "Ignoring UserIC, because interleaving was avoided up front");
10351 InterleaveLoop = false;
10352 } else if (IC == 1 && UserIC <= 1) {
10353 // Tell the user interleaving is not beneficial.
10354 LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
10355 IntDiagMsg = std::make_pair(
10356 "InterleavingNotBeneficial",
10357 "the cost-model indicates that interleaving is not beneficial");
10358 InterleaveLoop = false;
10359 if (UserIC == 1) {
10360 IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
10361 IntDiagMsg.second +=
10362 " and is explicitly disabled or interleave count is set to 1";
10363 }
10364 } else if (IC > 1 && UserIC == 1) {
10365 // Tell the user interleaving is beneficial, but it is explicitly disabled.
10366 LLVM_DEBUG(
10367 dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
10368 IntDiagMsg = std::make_pair(
10369 "InterleavingBeneficialButDisabled",
10370 "the cost-model indicates that interleaving is beneficial "
10371 "but is explicitly disabled or interleave count is set to 1");
10372 InterleaveLoop = false;
10373 }
10374
10375 // Override IC if user provided an interleave count.
10376 IC = UserIC > 0 ? UserIC : IC;
10377
10378 // Emit diagnostic messages, if any.
10379 const char *VAPassName = Hints.vectorizeAnalysisPassName();
10380 if (!VectorizeLoop && !InterleaveLoop) {
10381 // Do not vectorize or interleave the loop.
10382 ORE->emit([&]() {
10383 return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
10384 L->getStartLoc(), L->getHeader())
10385 << VecDiagMsg.second;
10386 });
10387 ORE->emit([&]() {
10388 return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
10389 L->getStartLoc(), L->getHeader())
10390 << IntDiagMsg.second;
10391 });
10392 return false;
10393 } else if (!VectorizeLoop && InterleaveLoop) {
10394 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10395 ORE->emit([&]() {
10396 return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
10397 L->getStartLoc(), L->getHeader())
10398 << VecDiagMsg.second;
10399 });
10400 } else if (VectorizeLoop && !InterleaveLoop) {
10401 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10402 << ") in " << DebugLocStr << '\n');
10403 ORE->emit([&]() {
10404 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
10405 L->getStartLoc(), L->getHeader())
10406 << IntDiagMsg.second;
10407 });
10408 } else if (VectorizeLoop && InterleaveLoop) {
10409 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10410 << ") in " << DebugLocStr << '\n');
10411 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10412 }
10413
10414 bool DisableRuntimeUnroll = false;
10415 MDNode *OrigLoopID = L->getLoopID();
10416 {
10417 // Optimistically generate runtime checks. Drop them if they turn out not to
10418 // be profitable. Limit the scope of Checks, so the cleanup happens
10419 // immediately after vector code generation is done.
10420 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, 10421 F->getParent()->getDataLayout()); 10422 if (!VF.Width.isScalar() || IC > 1) 10423 Checks.Create(L, *LVL.getLAI(), PSE.getUnionPredicate()); 10424 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); 10425 10426 using namespace ore; 10427 if (!VectorizeLoop) { 10428 assert(IC > 1 && "interleave count should not be 1 or 0"); 10429 // If we decided that it is not legal to vectorize the loop, then 10430 // interleave it. 10431 InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, 10432 &CM, BFI, PSI, Checks); 10433 LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT); 10434 10435 ORE->emit([&]() { 10436 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(), 10437 L->getHeader()) 10438 << "interleaved loop (interleaved count: " 10439 << NV("InterleaveCount", IC) << ")"; 10440 }); 10441 } else { 10442 // If we decided that it is *legal* to vectorize the loop, then do it. 10443 10444 // Consider vectorizing the epilogue too if it's profitable. 10445 VectorizationFactor EpilogueVF = 10446 CM.selectEpilogueVectorizationFactor(VF.Width, LVP); 10447 if (EpilogueVF.Width.isVector()) { 10448 10449 // The first pass vectorizes the main loop and creates a scalar epilogue 10450 // to be vectorized by executing the plan (potentially with a different 10451 // factor) again shortly afterwards. 10452 EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1); 10453 EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, 10454 EPI, &LVL, &CM, BFI, PSI, Checks); 10455 10456 LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, BestPlan, MainILV, DT); 10457 ++LoopsVectorized; 10458 10459 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */); 10460 formLCSSARecursively(*L, *DT, LI, SE); 10461 10462 // Second pass vectorizes the epilogue and adjusts the control flow 10463 // edges from the first pass. 10464 EPI.MainLoopVF = EPI.EpilogueVF; 10465 EPI.MainLoopUF = EPI.EpilogueUF; 10466 EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC, 10467 ORE, EPI, &LVL, &CM, BFI, PSI, 10468 Checks); 10469 LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestPlan, EpilogILV, 10470 DT); 10471 ++LoopsEpilogueVectorized; 10472 10473 if (!MainILV.areSafetyChecksAdded()) 10474 DisableRuntimeUnroll = true; 10475 } else { 10476 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC, 10477 &LVL, &CM, BFI, PSI, Checks); 10478 LVP.executePlan(VF.Width, IC, BestPlan, LB, DT); 10479 ++LoopsVectorized; 10480 10481 // Add metadata to disable runtime unrolling a scalar loop when there 10482 // are no runtime checks about strides and memory. A scalar loop that is 10483 // rarely used is not worth unrolling. 10484 if (!LB.areSafetyChecksAdded()) 10485 DisableRuntimeUnroll = true; 10486 } 10487 // Report the vectorization decision. 
10488 ORE->emit([&]() { 10489 return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(), 10490 L->getHeader()) 10491 << "vectorized loop (vectorization width: " 10492 << NV("VectorizationFactor", VF.Width) 10493 << ", interleaved count: " << NV("InterleaveCount", IC) << ")"; 10494 }); 10495 } 10496 10497 if (ORE->allowExtraAnalysis(LV_NAME)) 10498 checkMixedPrecision(L, ORE); 10499 } 10500 10501 Optional<MDNode *> RemainderLoopID = 10502 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 10503 LLVMLoopVectorizeFollowupEpilogue}); 10504 if (RemainderLoopID.hasValue()) { 10505 L->setLoopID(RemainderLoopID.getValue()); 10506 } else { 10507 if (DisableRuntimeUnroll) 10508 AddRuntimeUnrollDisableMetaData(L); 10509 10510 // Mark the loop as already vectorized to avoid vectorizing again. 10511 Hints.setAlreadyVectorized(); 10512 } 10513 10514 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 10515 return true; 10516 } 10517 10518 LoopVectorizeResult LoopVectorizePass::runImpl( 10519 Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_, 10520 DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_, 10521 DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_, 10522 std::function<const LoopAccessInfo &(Loop &)> &GetLAA_, 10523 OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) { 10524 SE = &SE_; 10525 LI = &LI_; 10526 TTI = &TTI_; 10527 DT = &DT_; 10528 BFI = &BFI_; 10529 TLI = TLI_; 10530 AA = &AA_; 10531 AC = &AC_; 10532 GetLAA = &GetLAA_; 10533 DB = &DB_; 10534 ORE = &ORE_; 10535 PSI = PSI_; 10536 10537 // Don't attempt if 10538 // 1. the target claims to have no vector registers, and 10539 // 2. interleaving won't help ILP. 10540 // 10541 // The second condition is necessary because, even if the target has no 10542 // vector registers, loop vectorization may still enable scalar 10543 // interleaving. 10544 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) && 10545 TTI->getMaxInterleaveFactor(1) < 2) 10546 return LoopVectorizeResult(false, false); 10547 10548 bool Changed = false, CFGChanged = false; 10549 10550 // The vectorizer requires loops to be in simplified form. 10551 // Since simplification may add new inner loops, it has to run before the 10552 // legality and profitability checks. This means running the loop vectorizer 10553 // will simplify all loops, regardless of whether anything end up being 10554 // vectorized. 10555 for (auto &L : *LI) 10556 Changed |= CFGChanged |= 10557 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */); 10558 10559 // Build up a worklist of inner-loops to vectorize. This is necessary as 10560 // the act of vectorizing or partially unrolling a loop creates new loops 10561 // and can invalidate iterators across the loops. 10562 SmallVector<Loop *, 8> Worklist; 10563 10564 for (Loop *L : *LI) 10565 collectSupportedLoops(*L, LI, ORE, Worklist); 10566 10567 LoopsAnalyzed += Worklist.size(); 10568 10569 // Now walk the identified inner loops. 10570 while (!Worklist.empty()) { 10571 Loop *L = Worklist.pop_back_val(); 10572 10573 // For the inner loops we actually process, form LCSSA to simplify the 10574 // transform. 10575 Changed |= formLCSSARecursively(*L, *DT, LI, SE); 10576 10577 Changed |= CFGChanged |= processLoop(L); 10578 } 10579 10580 // Process each loop nest in the function. 
10581 return LoopVectorizeResult(Changed, CFGChanged); 10582 } 10583 10584 PreservedAnalyses LoopVectorizePass::run(Function &F, 10585 FunctionAnalysisManager &AM) { 10586 auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F); 10587 auto &LI = AM.getResult<LoopAnalysis>(F); 10588 auto &TTI = AM.getResult<TargetIRAnalysis>(F); 10589 auto &DT = AM.getResult<DominatorTreeAnalysis>(F); 10590 auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F); 10591 auto &TLI = AM.getResult<TargetLibraryAnalysis>(F); 10592 auto &AA = AM.getResult<AAManager>(F); 10593 auto &AC = AM.getResult<AssumptionAnalysis>(F); 10594 auto &DB = AM.getResult<DemandedBitsAnalysis>(F); 10595 auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F); 10596 10597 auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager(); 10598 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 10599 [&](Loop &L) -> const LoopAccessInfo & { 10600 LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, 10601 TLI, TTI, nullptr, nullptr, nullptr}; 10602 return LAM.getResult<LoopAccessAnalysis>(L, AR); 10603 }; 10604 auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F); 10605 ProfileSummaryInfo *PSI = 10606 MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent()); 10607 LoopVectorizeResult Result = 10608 runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI); 10609 if (!Result.MadeAnyChange) 10610 return PreservedAnalyses::all(); 10611 PreservedAnalyses PA; 10612 10613 // We currently do not preserve loopinfo/dominator analyses with outer loop 10614 // vectorization. Until this is addressed, mark these analyses as preserved 10615 // only for non-VPlan-native path. 10616 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 10617 if (!EnableVPlanNativePath) { 10618 PA.preserve<LoopAnalysis>(); 10619 PA.preserve<DominatorTreeAnalysis>(); 10620 } 10621 if (!Result.MadeCFGChange) 10622 PA.preserveSet<CFGAnalyses>(); 10623 return PA; 10624 } 10625 10626 void LoopVectorizePass::printPipeline( 10627 raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) { 10628 static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline( 10629 OS, MapClassName2PassName); 10630 10631 OS << "<"; 10632 OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;"; 10633 OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;"; 10634 OS << ">"; 10635 } 10636