//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
//  Data for SIMD
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
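//
// As a purely illustrative example (not tied to any particular target), the
// widening transformation turns a scalar loop such as
//
//   for (int i = 0; i < n; ++i)
//     a[i] = b[i] + c[i];
//
// into a loop whose induction variable advances by the vectorization factor
// (VF) and whose body operates on whole vectors at a time:
//
//   for (int i = 0; i + 4 <= n; i += 4)   // assuming VF = 4
//     a[i:i+4] = b[i:i+4] + c[i:i+4];     // wide load / add / store
//
// The leftover iterations run in a scalar epilogue loop, or are masked away
// when the tail is folded into the vector body.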
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "LoopVectorizationPlanner.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanHCFGBuilder.h"
#include "VPlanPredicator.h"
#include "VPlanTransforms.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/InstructionCost.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
"llvm/Transforms/Utils/SizeOpts.h" 144 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h" 145 #include <algorithm> 146 #include <cassert> 147 #include <cstdint> 148 #include <cstdlib> 149 #include <functional> 150 #include <iterator> 151 #include <limits> 152 #include <memory> 153 #include <string> 154 #include <tuple> 155 #include <utility> 156 157 using namespace llvm; 158 159 #define LV_NAME "loop-vectorize" 160 #define DEBUG_TYPE LV_NAME 161 162 #ifndef NDEBUG 163 const char VerboseDebug[] = DEBUG_TYPE "-verbose"; 164 #endif 165 166 /// @{ 167 /// Metadata attribute names 168 const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all"; 169 const char LLVMLoopVectorizeFollowupVectorized[] = 170 "llvm.loop.vectorize.followup_vectorized"; 171 const char LLVMLoopVectorizeFollowupEpilogue[] = 172 "llvm.loop.vectorize.followup_epilogue"; 173 /// @} 174 175 STATISTIC(LoopsVectorized, "Number of loops vectorized"); 176 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization"); 177 STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized"); 178 179 static cl::opt<bool> EnableEpilogueVectorization( 180 "enable-epilogue-vectorization", cl::init(true), cl::Hidden, 181 cl::desc("Enable vectorization of epilogue loops.")); 182 183 static cl::opt<unsigned> EpilogueVectorizationForceVF( 184 "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden, 185 cl::desc("When epilogue vectorization is enabled, and a value greater than " 186 "1 is specified, forces the given VF for all applicable epilogue " 187 "loops.")); 188 189 static cl::opt<unsigned> EpilogueVectorizationMinVF( 190 "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden, 191 cl::desc("Only loops with vectorization factor equal to or larger than " 192 "the specified value are considered for epilogue vectorization.")); 193 194 /// Loops with a known constant trip count below this number are vectorized only 195 /// if no scalar iteration overheads are incurred. 196 static cl::opt<unsigned> TinyTripCountVectorThreshold( 197 "vectorizer-min-trip-count", cl::init(16), cl::Hidden, 198 cl::desc("Loops with a constant trip count that is smaller than this " 199 "value are vectorized only if no scalar iteration overheads " 200 "are incurred.")); 201 202 static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold( 203 "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden, 204 cl::desc("The maximum allowed number of runtime memory checks with a " 205 "vectorize(enable) pragma.")); 206 207 // Option prefer-predicate-over-epilogue indicates that an epilogue is undesired, 208 // that predication is preferred, and this lists all options. I.e., the 209 // vectorizer will try to fold the tail-loop (epilogue) into the vector body 210 // and predicate the instructions accordingly. 

// Option prefer-predicate-over-epilogue indicates that an epilogue is
// undesired, that predication is preferred, and this lists all options. I.e.,
// the vectorizer will try to fold the tail-loop (epilogue) into the vector
// body and predicate the instructions accordingly. If tail-folding fails,
// there are different fallback strategies depending on these values:
namespace PreferPredicateTy {
enum Option {
  ScalarEpilogue = 0,
  PredicateElseScalarEpilogue,
  PredicateOrDontVectorize
};
} // namespace PreferPredicateTy

static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
    "prefer-predicate-over-epilogue",
    cl::init(PreferPredicateTy::ScalarEpilogue),
    cl::Hidden,
    cl::desc("Tail-folding and predication preferences over creating a scalar "
             "epilogue loop."),
    cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
                          "scalar-epilogue",
                          "Don't tail-predicate loops, create scalar epilogue"),
               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
                          "predicate-else-scalar-epilogue",
                          "prefer tail-folding, create scalar epilogue if tail "
                          "folding fails."),
               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
                          "predicate-dont-vectorize",
                          "prefer tail-folding, don't attempt vectorization if "
                          "tail-folding fails.")));

static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in the loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a "
             "loop"));

static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
             "below this number"));

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));
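
// For illustration only: "interleaving" here refers to unrolling the vector
// loop by an interleave count (IC) so that independent vector operations can
// overlap in the pipeline. With VF = 4 and IC = 2, one iteration of the
// generated loop conceptually performs
//
//   %wide.load0 = load <4 x i32>, ...   ; part 0
//   %wide.load1 = load <4 x i32>, ...   ; part 1, the next 4 elements
//   ...                                 ; two independent chains of work
//
// and the induction variable is advanced by VF * IC = 8 elements.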
Mostly " 280 "useful for getting consistent testing.")); 281 282 static cl::opt<bool> ForceTargetSupportsScalableVectors( 283 "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden, 284 cl::desc( 285 "Pretend that scalable vectors are supported, even if the target does " 286 "not support them. This flag should only be used for testing.")); 287 288 static cl::opt<unsigned> SmallLoopCost( 289 "small-loop-cost", cl::init(20), cl::Hidden, 290 cl::desc( 291 "The cost of a loop that is considered 'small' by the interleaver.")); 292 293 static cl::opt<bool> LoopVectorizeWithBlockFrequency( 294 "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden, 295 cl::desc("Enable the use of the block frequency analysis to access PGO " 296 "heuristics minimizing code growth in cold regions and being more " 297 "aggressive in hot regions.")); 298 299 // Runtime interleave loops for load/store throughput. 300 static cl::opt<bool> EnableLoadStoreRuntimeInterleave( 301 "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden, 302 cl::desc( 303 "Enable runtime interleaving until load/store ports are saturated")); 304 305 /// Interleave small loops with scalar reductions. 306 static cl::opt<bool> InterleaveSmallLoopScalarReduction( 307 "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden, 308 cl::desc("Enable interleaving for loops with small iteration counts that " 309 "contain scalar reductions to expose ILP.")); 310 311 /// The number of stores in a loop that are allowed to need predication. 312 static cl::opt<unsigned> NumberOfStoresToPredicate( 313 "vectorize-num-stores-pred", cl::init(1), cl::Hidden, 314 cl::desc("Max number of stores to be predicated behind an if.")); 315 316 static cl::opt<bool> EnableIndVarRegisterHeur( 317 "enable-ind-var-reg-heur", cl::init(true), cl::Hidden, 318 cl::desc("Count the induction variable only once when interleaving")); 319 320 static cl::opt<bool> EnableCondStoresVectorization( 321 "enable-cond-stores-vec", cl::init(true), cl::Hidden, 322 cl::desc("Enable if predication of stores during vectorization.")); 323 324 static cl::opt<unsigned> MaxNestedScalarReductionIC( 325 "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden, 326 cl::desc("The maximum interleave count to use when interleaving a scalar " 327 "reduction in a nested loop.")); 328 329 static cl::opt<bool> 330 PreferInLoopReductions("prefer-inloop-reductions", cl::init(false), 331 cl::Hidden, 332 cl::desc("Prefer in-loop vector reductions, " 333 "overriding the targets preference.")); 334 335 cl::opt<bool> ForceOrderedReductions( 336 "force-ordered-reductions", cl::init(false), cl::Hidden, 337 cl::desc("Enable the vectorisation of loops with in-order (strict) " 338 "FP reductions")); 339 340 static cl::opt<bool> PreferPredicatedReductionSelect( 341 "prefer-predicated-reduction-select", cl::init(false), cl::Hidden, 342 cl::desc( 343 "Prefer predicating a reduction operation over an after loop select.")); 344 345 cl::opt<bool> EnableVPlanNativePath( 346 "enable-vplan-native-path", cl::init(false), cl::Hidden, 347 cl::desc("Enable VPlan-native vectorization path with " 348 "support for outer loop vectorization.")); 349 350 // FIXME: Remove this switch once we have divergence analysis. Currently we 351 // assume divergent non-backedge branches when this switch is true. 

cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));

// FIXME: Remove this switch once we have divergence analysis. Currently we
// assume divergent non-backedge branches when this switch is true.
cl::opt<bool> EnableVPlanPredication(
    "enable-vplan-predication", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path predicator with "
             "support for outer loop vectorization."));

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

cl::opt<bool> PrintVPlansInDotFormat(
    "vplan-print-in-dot-format", cl::init(false), cl::Hidden,
    cl::desc("Use dot format instead of plain text when dumping VPlans"));

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type.
static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
  // Determine if an array of N elements of type Ty is "bitcast compatible"
  // with a <N x Ty> vector.
  // This is only true if there is no padding between the array elements.
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}
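
// For example (illustrative, target/DataLayout dependent): x86_fp80 typically
// has a type size of 80 bits but an alloc size of 96 or 128 bits, so an array
// of x86_fp80 is not bitcast-compatible with a vector of x86_fp80 and the
// type is considered irregular; i32 (32-bit size and alloc size) is regular.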

/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

/// Returns "best known" trip count for the specified loop \p L as defined by
/// the following procedure:
///   1) Returns exact trip count if it is known.
///   2) Returns expected trip count according to profile data if any.
///   3) Returns upper bound estimate if it is known.
///   4) Returns None if all of the above failed.
static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
  // Check if exact trip count is known.
  if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
    return ExpectedTC;

  // Check if there is an expected trip count available from profile data.
  if (LoopVectorizeWithBlockFrequency)
    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
      return EstimatedTC;

  // Check if upper bound estimate is known.
  if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
    return ExpectedTC;

  return None;
}

// Forward declare GeneratedRTChecks.
class GeneratedRTChecks;

namespace llvm {

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found for a given vectorization factor.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                      ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
        PSI(PSI), RTChecks(RTChecks) {
    // Query this against the original loop and save it here because the
    // profile of the original loop header may change as the transformation
    // happens.
    OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
        OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
  }

  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop that will contain vectorized instructions later
  /// on, while the old loop will be used as the scalar remainder. Control flow
  /// is generated around the vectorized (and scalar epilogue) loops consisting
  /// of various checks and bypasses. Return the pre-header block of the new
  /// loop.
  /// In the case of epilogue vectorization, this function is overridden to
  /// handle the more complex control flow around the loops.
  virtual BasicBlock *createVectorizedLoopSkeleton();
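
  // A rough sketch (illustrative, simplified) of the control flow the skeleton
  // produces; the exact shape depends on which runtime checks are needed:
  //
  //   bypass checks (min iteration count, SCEV, memory)
  //        |                          \
  //   vector preheader                 \
  //        |                            v
  //   vector loop --> middle block --> scalar preheader --> scalar loop
  //                        |                                     |
  //                        +-----------> exit block <------------+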

  /// Widen a single instruction within the innermost loop.
  void widenInstruction(Instruction &I, VPValue *Def, VPUser &Operands,
                        VPTransformState &State);

  /// Widen a single call instruction within the innermost loop.
  void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
                            VPTransformState &State);

  /// Widen a single select instruction within the innermost loop.
  void widenSelectInstruction(SelectInst &I, VPValue *VPDef, VPUser &Operands,
                              bool InvariantCond, VPTransformState &State);

  /// Fix the vectorized code, taking care of header phis, live-outs, and more.
  void fixVectorizedLoop(VPTransformState &State);

  // Return true if any runtime check is added.
  bool areSafetyChecksAdded() { return AddedSafetyChecks; }

  /// A type for vectorized values in the new loop. Each value from the
  /// original loop, when vectorized, is represented by UF vector values in the
  /// new unrolled loop, where UF is the unroll factor.
  using VectorParts = SmallVector<Value *, 2>;

  /// Vectorize a single GetElementPtrInst based on information gathered and
  /// decisions taken during planning.
  void widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, VPUser &Indices,
                unsigned UF, ElementCount VF, bool IsPtrLoopInvariant,
                SmallBitVector &IsIndexLoopInvariant, VPTransformState &State);

  /// Vectorize a single first-order recurrence or pointer induction PHINode in
  /// a block. This method handles the induction variable canonicalization. It
  /// supports both VF = 1 for unrolled loops and arbitrary length vectors.
  void widenPHIInstruction(Instruction *PN, VPWidenPHIRecipe *PhiR,
                           VPTransformState &State);

  /// A helper function to scalarize a single Instruction in the innermost
  /// loop. Generates a sequence of scalar instances for each lane between \p
  /// MinLane and \p MaxLane, times each part between \p MinPart and \p
  /// MaxPart, inclusive. Uses the VPValue operands from \p Operands instead of
  /// \p Instr's operands.
  void scalarizeInstruction(Instruction *Instr, VPValue *Def, VPUser &Operands,
                            const VPIteration &Instance, bool IfPredicateInstr,
                            VPTransformState &State);

  /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
  /// is provided, the integer induction variable will first be truncated to
  /// the corresponding type.
  void widenIntOrFpInduction(PHINode *IV, Value *Start, TruncInst *Trunc,
                             VPValue *Def, VPValue *CastDef,
                             VPTransformState &State);

  /// Construct the vector value of a scalarized value \p V one lane at a time.
  void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance,
                                 VPTransformState &State);

  /// Try to vectorize interleaved access group \p Group with the base address
  /// given in \p Addr, optionally masking the vector operations if \p
  /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
  /// values in the vectorized loop.
  void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
                                ArrayRef<VPValue *> VPDefs,
                                VPTransformState &State, VPValue *Addr,
                                ArrayRef<VPValue *> StoredValues,
                                VPValue *BlockInMask = nullptr);

  /// Vectorize Load and Store instructions with the base address given in \p
  /// Addr, optionally masking the vector operations if \p BlockInMask is
  /// non-null. Use \p State to translate given VPValues to IR values in the
  /// vectorized loop.
  void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
                                  VPValue *Def, VPValue *Addr,
                                  VPValue *StoredValue, VPValue *BlockInMask);

  /// Set the debug location in the builder \p Ptr using the debug location in
  /// \p V. If \p Ptr is None then it uses the class member's Builder.
  void setDebugLocFromInst(const Value *V,
                           Optional<IRBuilder<> *> CustomBuilder = None);

  /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
  void fixNonInductionPHIs(VPTransformState &State);

  /// Returns true if the reordering of FP operations is not allowed, but we
  /// are able to vectorize with strict in-order reductions for the given
  /// RdxDesc.
  bool useOrderedReductions(RecurrenceDescriptor &RdxDesc);

  /// Create a broadcast instruction. This method generates a broadcast
  /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, ...
  /// this is needed because each iteration in the loop corresponds to a SIMD
  /// element.
  virtual Value *getBroadcastInstrs(Value *V);

protected:
  friend class LoopVectorizationPlanner;

  /// A small list of PHINodes.
  using PhiVector = SmallVector<PHINode *, 4>;

  /// A type for scalarized values in the new loop. Each value from the
  /// original loop, when scalarized, is represented by UF x VF scalar values
  /// in the new unrolled loop, where UF is the unroll factor and VF is the
  /// vectorization factor.
  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;

  /// Set up the values of the IVs correctly when exiting the vector loop.
  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                    Value *CountRoundDown, Value *EndValue,
                    BasicBlock *MiddleBlock);

  /// Create a new induction variable inside L.
  PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
                                   Value *Step, Instruction *DL);

  /// Handle all cross-iteration phis in the header.
  void fixCrossIterationPHIs(VPTransformState &State);

  /// Create the exit value of first order recurrences in the middle block and
  /// update their users.
  void fixFirstOrderRecurrence(VPWidenPHIRecipe *PhiR, VPTransformState &State);

  /// Create code for the loop exit value of the reduction.
  void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State);

  /// Clear NSW/NUW flags from reduction instructions if necessary.
  void clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc,
                               VPTransformState &State);

  /// Fixup the LCSSA phi nodes in the unique exit block. This simply
  /// means we need to add the appropriate incoming value from the middle
  /// block as exiting edges from the scalar epilogue loop (if present) are
  /// already in place, and we exit the vector loop exclusively to the middle
  /// block.
  void fixLCSSAPHIs(VPTransformState &State);

  /// Iteratively sink the scalarized operands of a predicated instruction into
  /// the block that was created for it.
  void sinkScalarOperands(Instruction *PredInst);

  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
  /// represented as.
  void truncateToMinimalBitwidths(VPTransformState &State);

  /// This function adds
  ///   (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
  /// to each vector element of Val. The sequence starts at StartIndex.
  /// \p Opcode is relevant for FP induction variables.
  virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                               Instruction::BinaryOps Opcode =
                                   Instruction::BinaryOpsEnd);
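
  // For instance (illustrative only): with a broadcast value
  // Val = <%x, %x, %x, %x>, StartIdx = 0 and Step = %s, getStepVector
  // produces <%x + 0*%s, %x + 1*%s, %x + 2*%s, %x + 3*%s>.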

  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
  /// variable on which to base the steps, \p Step is the size of the step, and
  /// \p EntryVal is the value from the original loop that maps to the steps.
  /// Note that \p EntryVal doesn't have to be an induction variable - it
  /// can also be a truncate instruction.
  void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
                        const InductionDescriptor &ID, VPValue *Def,
                        VPValue *CastDef, VPTransformState &State);

  /// Create a vector induction phi node based on an existing scalar one. \p
  /// EntryVal is the value from the original loop that maps to the vector phi
  /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
  /// truncate instruction, instead of widening the original IV, we widen a
  /// version of the IV truncated to \p EntryVal's type.
  void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
                                       Value *Step, Value *Start,
                                       Instruction *EntryVal, VPValue *Def,
                                       VPValue *CastDef,
                                       VPTransformState &State);

  /// Returns true if an instruction \p I should be scalarized instead of
  /// vectorized for the chosen vectorization factor.
  bool shouldScalarizeInstruction(Instruction *I) const;

  /// Returns true if we should generate a scalar version of \p IV.
  bool needsScalarInduction(Instruction *IV) const;

  /// If there is a cast involved in the induction variable \p ID, which should
  /// be ignored in the vectorized loop body, this function records the
  /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
  /// cast. We had already proved that the casted Phi is equal to the uncasted
  /// Phi in the vectorized loop (under a runtime guard), and therefore
  /// there is no need to vectorize the cast - the same value can be used in
  /// the vector loop for both the Phi and the cast.
  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
  /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
  ///
  /// \p EntryVal is the value from the original loop that maps to the vector
  /// phi node and is used to distinguish what is the IV currently being
  /// processed - original one (if \p EntryVal is a phi corresponding to the
  /// original IV) or the "newly-created" one based on the proof mentioned
  /// above (see also buildScalarSteps() and createVectorIntOrFpInductionPHI()).
  /// In the latter case \p EntryVal is a TruncInst and we must not record
  /// anything for that IV, but it's error-prone to expect callers of this
  /// routine to care about that, hence this explicit parameter.
  void recordVectorLoopValueForInductionCast(
      const InductionDescriptor &ID, const Instruction *EntryVal,
      Value *VectorLoopValue, VPValue *CastDef, VPTransformState &State,
      unsigned Part, unsigned Lane = UINT_MAX);

  /// Generate a shuffle sequence that will reverse the vector Vec.
  virtual Value *reverseVector(Value *Vec);

  /// Returns (and creates if needed) the original loop trip count.
  Value *getOrCreateTripCount(Loop *NewLoop);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(Loop *NewLoop);
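
  // Worked example (illustrative): with an original trip count of 10, VF = 4
  // and UF = 1, the vector trip count is 10 - (10 % 4) = 8, so the vector
  // loop executes 2 iterations and the remaining 2 iterations run in the
  // scalar epilogue (unless the tail is folded).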

  /// Returns a bitcasted value to the requested vector type.
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                const DataLayout &DL);

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct. Returns the block containing the checks or
  /// nullptr if no checks have been added.
  BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  /// Returns the block containing the checks or nullptr if no checks have been
  /// added.
  BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);

  /// Compute the transformed value of Index at offset StartValue using step
  /// StepValue.
  /// For integer induction, returns StartValue + Index * StepValue.
  /// For pointer induction, returns StartValue[Index * StepValue].
  /// FIXME: The newly created binary instructions should contain nsw/nuw
  /// flags, which can be found from the original scalar operations.
  Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
                              const DataLayout &DL,
                              const InductionDescriptor &ID) const;
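
  // For example (illustrative): for an integer induction with StartValue = 7
  // and StepValue = 3, an Index of 4 is transformed to 7 + 4 * 3 = 19; for a
  // pointer induction it would instead be the address &StartValue[4 * 3].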

  /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
  /// vector loop preheader, middle block and scalar preheader. Also
  /// allocate a loop object for the new vector loop and return it.
  Loop *createVectorLoopSkeleton(StringRef Prefix);

  /// Create new phi nodes for the induction variables to resume iteration
  /// count in the scalar epilogue, from where the vectorized loop left off
  /// (given by \p VectorTripCount).
  /// In cases where the loop skeleton is more complicated (e.g. epilogue
  /// vectorization) and the resume values can come from an additional bypass
  /// block, the \p AdditionalBypass pair provides information about the bypass
  /// block and the end value on the edge from bypass to this loop.
  void createInductionResumeValues(
      Loop *L, Value *VectorTripCount,
      std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});

  /// Complete the loop skeleton by adding debug MDs, creating appropriate
  /// conditional branches in the middle block, preparing the builder and
  /// running the verifier. Take in the vector loop \p L as argument, and
  /// return the preheader of the completed vector loop.
  BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);

  /// Add additional metadata to \p To that was not present on \p Orig.
  ///
  /// Currently this is used to add the noalias annotations based on the
  /// inserted memchecks. Use this for instructions that are *cloned* into the
  /// vector loop.
  void addNewMetadata(Instruction *To, const Instruction *Orig);

  /// Add metadata from one instruction to another.
  ///
  /// This includes both the original MDs from \p From and additional ones
  /// (\see addNewMetadata). Use this for *newly created* instructions in the
  /// vector loop.
  void addMetadata(Instruction *To, Instruction *From);

  /// Similar to the previous function but it adds the metadata to a
  /// vector of instructions.
  void addMetadata(ArrayRef<Value *> To, Instruction *From);

  /// Allow subclasses to override and print debug traces before/after vplan
  /// execution, when trace information is requested.
  virtual void printDebugTracesAtStart(){};
  virtual void printDebugTracesAtEnd(){};

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Alias Analysis.
  AAResults *AA;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// LoopVersioning. It's only set up (non-null) if memchecks were
  /// used.
  ///
  /// This is currently only used to add no-alias metadata based on the
  /// memchecks. The actual versioning is performed manually.
  std::unique_ptr<LoopVersioning> LVer;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  ElementCount VF;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;

  /// The unique ExitBlock of the scalar loop if one exists. Note that
  /// there can be multiple exiting edges reaching this block.
  BasicBlock *LoopExitBlock;

  /// The vector loop body.
  BasicBlock *LoopVectorBody;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// The new Induction variable which was added to the new block.
  PHINode *Induction = nullptr;

  /// The induction variable of the old basic block.
  PHINode *OldInduction = nullptr;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
  // fixed up at the end of vector code generation.
  SmallVector<PHINode *, 8> OrigPHIsToFix;

  /// BFI and PSI are used to check for profile guided size optimizations.
  BlockFrequencyInfo *BFI;
  ProfileSummaryInfo *PSI;

  // Whether this loop should be optimized for size based on profile guided
  // size optimizations.
  bool OptForSizeBasedOnProfile;

  /// Structure to hold information about generated runtime checks, responsible
  /// for cleaning the checks, if vectorization turns out unprofitable.
  GeneratedRTChecks &RTChecks;
};

class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                    ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            ElementCount::getFixed(1), UnrollFactor, LVL, CM,
                            BFI, PSI, Check) {}

private:
  Value *getBroadcastInstrs(Value *V) override;
  Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                       Instruction::BinaryOps Opcode =
                           Instruction::BinaryOpsEnd) override;
  Value *reverseVector(Value *Vec) override;
};

/// Encapsulate information regarding vectorization of a loop and its epilogue.
/// This information is meant to be updated and used across two stages of
/// epilogue vectorization.
struct EpilogueLoopVectorizationInfo {
  ElementCount MainLoopVF = ElementCount::getFixed(0);
  unsigned MainLoopUF = 0;
  ElementCount EpilogueVF = ElementCount::getFixed(0);
  unsigned EpilogueUF = 0;
  BasicBlock *MainLoopIterationCountCheck = nullptr;
  BasicBlock *EpilogueIterationCountCheck = nullptr;
  BasicBlock *SCEVSafetyCheck = nullptr;
  BasicBlock *MemSafetyCheck = nullptr;
  Value *TripCount = nullptr;
  Value *VectorTripCount = nullptr;

  EpilogueLoopVectorizationInfo(unsigned MVF, unsigned MUF, unsigned EVF,
                                unsigned EUF)
      : MainLoopVF(ElementCount::getFixed(MVF)), MainLoopUF(MUF),
        EpilogueVF(ElementCount::getFixed(EVF)), EpilogueUF(EUF) {
    assert(EUF == 1 &&
           "A high UF for the epilogue loop is likely not beneficial.");
  }
};
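
// For orientation (a simplified sketch, not a normative description): with
// epilogue vectorization the generated code contains up to three loops, e.g.
// a main vector loop with MainLoopVF = 16, an epilogue vector loop with
// EpilogueVF = 8 that handles most of the remainder, and finally a scalar
// remainder loop for the last few iterations. EpilogueLoopVectorizationInfo
// carries the chosen factors and the blocks/values shared between the two
// vectorization passes that build this structure.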

/// An extension of the inner loop vectorizer that creates a skeleton for a
/// vectorized loop that has its epilogue (residual) also vectorized.
/// The idea is to run the vplan on a given loop twice, first to set up the
/// skeleton and vectorize the main loop, and second to complete the skeleton
/// from the first step and vectorize the epilogue. This is achieved by
/// deriving two concrete strategy classes from this base class and invoking
/// them in succession from the loop vectorizer planner.
class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
public:
  InnerLoopAndEpilogueVectorizer(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI,
                            Checks),
        EPI(EPI) {}

  // Override this function to handle the more complex control flow around the
  // three loops.
  BasicBlock *createVectorizedLoopSkeleton() final override {
    return createEpilogueVectorizedLoopSkeleton();
  }

  /// The interface for creating a vectorized skeleton using one of two
  /// different strategies, each corresponding to one execution of the vplan
  /// as described above.
  virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0;

  /// Holds and updates state information required to vectorize the main loop
  /// and its epilogue in two separate passes. This setup helps us avoid
  /// regenerating and recomputing runtime safety checks. It also helps us to
  /// shorten the iteration-count-check path length for the cases where the
  /// iteration count of the loop is so small that the main vector loop is
  /// completely skipped.
  EpilogueLoopVectorizationInfo &EPI;
};

/// A specialized derived class of inner loop vectorizer that performs
/// vectorization of *main* loops in the process of vectorizing loops and their
/// epilogues.
class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerMainLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Check)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Check) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *main loop* strategy (i.e. the first pass of vplan execution).
  BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check once for the main loop (when \p
  /// ForEpilogue is false) and once for the epilogue loop (when \p
  /// ForEpilogue is true).
  BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass,
                                             bool ForEpilogue);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};

// A specialized derived class of inner loop vectorizer that performs
// vectorization of *epilogue* loops in the process of vectorizing loops and
// their epilogues.
class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerEpilogueLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Checks) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *epilogue loop* strategy (i.e. the second pass of vplan execution).
  BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check after the main vector loop has
  /// finished to see if there are any iterations left to execute by either
  /// the vector epilogue or the scalar epilogue.
  BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L,
                                                      BasicBlock *Bypass,
                                                      BasicBlock *Insert);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};
} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its
/// operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return I;

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I;

  for (Use &Op : I->operands()) {
    if (Instruction *OpInst = dyn_cast<Instruction>(Op))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }

  return I;
}

void InnerLoopVectorizer::setDebugLocFromInst(
    const Value *V, Optional<IRBuilder<> *> CustomBuilder) {
  IRBuilder<> *B = (CustomBuilder == None) ? &Builder : *CustomBuilder;
  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(V)) {
    const DILocation *DIL = Inst->getDebugLoc();

    // When an FSDiscriminator is enabled, we don't need to add the multiply
    // factors to the discriminators.
    if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
        !isa<DbgInfoIntrinsic>(Inst) && !EnableFSDiscriminator) {
      // FIXME: For scalable vectors, assume vscale=1.
      auto NewDIL =
          DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
      if (NewDIL)
        B->SetCurrentDebugLocation(NewDIL.getValue());
      else
        LLVM_DEBUG(dbgs()
                   << "Failed to create new discriminator: "
                   << DIL->getFilename() << " Line: " << DIL->getLine());
    } else
      B->SetCurrentDebugLocation(DIL);
  } else
    B->SetCurrentDebugLocation(DebugLoc());
}

/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
/// is passed, the message relates to that particular instruction.
#ifndef NDEBUG
static void debugVectorizationMessage(const StringRef Prefix,
                                      const StringRef DebugMsg,
                                      Instruction *I) {
  dbgs() << "LV: " << Prefix << DebugMsg;
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}
#endif

/// Create an analysis remark that explains why vectorization failed
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
/// RemarkName is the identifier for the remark. If \p I is passed it is an
/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
/// the location of the remark. \return the remark object that can be
/// streamed to.
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
                                                   StringRef RemarkName,
                                                   Loop *TheLoop,
                                                   Instruction *I) {
  Value *CodeRegion = TheLoop->getHeader();
  DebugLoc DL = TheLoop->getStartLoc();

  if (I) {
    CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, fall back to
    // using the loop's.
    if (I->getDebugLoc())
      DL = I->getDebugLoc();
  }

  return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
}

/// Return a value for Step multiplied by VF.
static Value *createStepForVF(IRBuilder<> &B, Constant *Step, ElementCount VF) {
  assert(isa<ConstantInt>(Step) && "Expected an integer step");
  Constant *StepVal = ConstantInt::get(
      Step->getType(),
      cast<ConstantInt>(Step)->getSExtValue() * VF.getKnownMinValue());
  return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
}

namespace llvm {

/// Return the runtime value for VF.
Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF) {
  Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue());
  return VF.isScalable() ? B.CreateVScale(EC) : EC;
}

void reportVectorizationFailure(const StringRef DebugMsg,
                                const StringRef OREMsg, const StringRef ORETag,
                                OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                                Instruction *I) {
  LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(
      createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
      << "loop not vectorized: " << OREMsg);
}

void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
                             OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                             Instruction *I) {
  LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(
      createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
      << Msg);
}

} // end namespace llvm

#ifndef NDEBUG
/// \return string containing a file name and a line # for the given loop.
static std::string getDebugLocString(const Loop *L) {
  std::string Result;
  if (L) {
    raw_string_ostream OS(Result);
    if (const DebugLoc LoopDbgLoc = L->getStartLoc())
      LoopDbgLoc.print(OS);
    else
      // Just print the module name.
      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
    OS.flush();
  }
  return Result;
}
#endif

void InnerLoopVectorizer::addNewMetadata(Instruction *To,
                                         const Instruction *Orig) {
  // If the loop was versioned with memchecks, add the corresponding no-alias
  // metadata.
  if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
    LVer->annotateInstWithNoAlias(To, Orig);
}

void InnerLoopVectorizer::addMetadata(Instruction *To,
                                      Instruction *From) {
  propagateMetadata(To, From);
  addNewMetadata(To, From);
}

void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
                                      Instruction *From) {
  for (Value *V : To) {
    if (Instruction *I = dyn_cast<Instruction>(V))
      addMetadata(I, From);
  }
}

namespace llvm {

// Loop vectorization cost-model hints how the scalar epilogue loop should be
// lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorisation with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate,

  // Directive indicating we must either tail fold or not vectorize
  CM_ScalarEpilogueNotAllowedUsePredicate
};

/// ElementCountComparator creates a total ordering for ElementCount
/// for the purposes of using it in a set structure.
struct ElementCountComparator {
  bool operator()(const ElementCount &LHS, const ElementCount &RHS) const {
    return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) <
           std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue());
  }
};
using ElementCountSet = SmallSet<ElementCount, 16, ElementCountComparator>;

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because of
/// a number of reasons. In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
class LoopVectorizationCostModel {
public:
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {}

  /// \return An upper bound for the vectorization factors (both fixed and
  /// scalable). If the factors are 0, vectorization and interleaving should be
  /// avoided up front.
  FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
  bool runtimeChecksRequired();

  /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every VF in \p CandidateVFs. If UserVF is not ZERO
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
  VectorizationFactor
  selectVectorizationFactor(const ElementCountSet &CandidateVFs);

  VectorizationFactor
  selectEpilogueVectorizationFactor(const ElementCount MaxVF,
                                    const LoopVectorizationPlanner &LVP);

  /// Setup cost-based decisions for user vectorization factor.
  /// \return true if the UserVF is a feasible VF to be chosen.
  bool selectUserVectorizationFactor(ElementCount UserVF) {
    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);
    return expectedCost(UserVF).first.isValid();
  }

  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// \return The desired interleave count.
  /// If interleave count has been specified by metadata it will be returned.
  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
VF and LoopCost 1278 /// are the selected vectorization factor and the cost of the selected VF. 1279 unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost); 1280 1281 /// Memory access instruction may be vectorized in more than one way. 1282 /// Form of instruction after vectorization depends on cost. 1283 /// This function takes cost-based decisions for Load/Store instructions 1284 /// and collects them in a map. This decisions map is used for building 1285 /// the lists of loop-uniform and loop-scalar instructions. 1286 /// The calculated cost is saved with widening decision in order to 1287 /// avoid redundant calculations. 1288 void setCostBasedWideningDecision(ElementCount VF); 1289 1290 /// A struct that represents some properties of the register usage 1291 /// of a loop. 1292 struct RegisterUsage { 1293 /// Holds the number of loop invariant values that are used in the loop. 1294 /// The key is ClassID of target-provided register class. 1295 SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs; 1296 /// Holds the maximum number of concurrent live intervals in the loop. 1297 /// The key is ClassID of target-provided register class. 1298 SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers; 1299 }; 1300 1301 /// \return Returns information about the register usages of the loop for the 1302 /// given vectorization factors. 1303 SmallVector<RegisterUsage, 8> 1304 calculateRegisterUsage(ArrayRef<ElementCount> VFs); 1305 1306 /// Collect values we want to ignore in the cost model. 1307 void collectValuesToIgnore(); 1308 1309 /// Collect all element types in the loop for which widening is needed. 1310 void collectElementTypesForWidening(); 1311 1312 /// Split reductions into those that happen in the loop, and those that happen 1313 /// outside. In loop reductions are collected into InLoopReductionChains. 1314 void collectInLoopReductions(); 1315 1316 /// Returns true if we should use strict in-order reductions for the given 1317 /// RdxDesc. This is true if the -enable-strict-reductions flag is passed, 1318 /// the IsOrdered flag of RdxDesc is set and we do not allow reordering 1319 /// of FP operations. 1320 bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) { 1321 return ForceOrderedReductions && !Hints->allowReordering() && 1322 RdxDesc.isOrdered(); 1323 } 1324 1325 /// \returns The smallest bitwidth each instruction can be represented with. 1326 /// The vector equivalents of these instructions should be truncated to this 1327 /// type. 1328 const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const { 1329 return MinBWs; 1330 } 1331 1332 /// \returns True if it is more profitable to scalarize instruction \p I for 1333 /// vectorization factor \p VF. 1334 bool isProfitableToScalarize(Instruction *I, ElementCount VF) const { 1335 assert(VF.isVector() && 1336 "Profitable to scalarize relevant only for VF > 1."); 1337 1338 // Cost model is not run in the VPlan-native path - return conservative 1339 // result until this changes. 1340 if (EnableVPlanNativePath) 1341 return false; 1342 1343 auto Scalars = InstsToScalarize.find(VF); 1344 assert(Scalars != InstsToScalarize.end() && 1345 "VF not yet analyzed for scalarization profitability"); 1346 return Scalars->second.find(I) != Scalars->second.end(); 1347 } 1348 1349 /// Returns true if \p I is known to be uniform after vectorization. 
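  /// For example, the pointer operand of a consecutive or interleaved memory
  /// access is uniform: a single scalar address per unrolled vector iteration
  /// is enough (see collectLoopUniforms for the full analysis).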
  bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
    if (VF.isScalar())
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto UniformsPerVF = Uniforms.find(VF);
    assert(UniformsPerVF != Uniforms.end() &&
           "VF not yet analyzed for uniformity");
    return UniformsPerVF->second.count(I);
  }

  /// Returns true if \p I is known to be scalar after vectorization.
  bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
    if (VF.isScalar())
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto ScalarsPerVF = Scalars.find(VF);
    assert(ScalarsPerVF != Scalars.end() &&
           "Scalar values are not calculated for VF");
    return ScalarsPerVF->second.count(I);
  }

  /// \returns True if instruction \p I can be truncated to a smaller bitwidth
  /// for vectorization factor \p VF.
  bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
    return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
           !isProfitableToScalarize(I, VF) &&
           !isScalarAfterVectorization(I, VF);
  }

  /// Decision that was taken during cost calculation for memory instruction.
  enum InstWidening {
    CM_Unknown,
    CM_Widen,         // For consecutive accesses with stride +1.
    CM_Widen_Reverse, // For consecutive accesses with stride -1.
    CM_Interleave,
    CM_GatherScatter,
    CM_Scalarize
  };

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// instruction \p I and vector width \p VF.
  void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
                           InstructionCost Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
  }

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// interleaving group \p Grp and vector width \p VF.
  void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
                           ElementCount VF, InstWidening W,
                           InstructionCost Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    /// Broadcast this decision to all instructions inside the group.
    /// But the cost will be assigned to one instruction only.
    for (unsigned i = 0; i < Grp->getFactor(); ++i) {
      if (auto *I = Grp->getMember(i)) {
        if (Grp->getInsertPos() == I)
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
        else
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
      }
    }
  }

  /// Return the cost model decision for the given instruction \p I and vector
  /// width \p VF. Return CM_Unknown if this instruction did not pass
  /// through the cost modeling.
  InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
    assert(VF.isVector() && "Expected VF to be a vector VF");
    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
1432 if (EnableVPlanNativePath) 1433 return CM_GatherScatter; 1434 1435 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF); 1436 auto Itr = WideningDecisions.find(InstOnVF); 1437 if (Itr == WideningDecisions.end()) 1438 return CM_Unknown; 1439 return Itr->second.first; 1440 } 1441 1442 /// Return the vectorization cost for the given instruction \p I and vector 1443 /// width \p VF. 1444 InstructionCost getWideningCost(Instruction *I, ElementCount VF) { 1445 assert(VF.isVector() && "Expected VF >=2"); 1446 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF); 1447 assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() && 1448 "The cost is not calculated"); 1449 return WideningDecisions[InstOnVF].second; 1450 } 1451 1452 /// Return True if instruction \p I is an optimizable truncate whose operand 1453 /// is an induction variable. Such a truncate will be removed by adding a new 1454 /// induction variable with the destination type. 1455 bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) { 1456 // If the instruction is not a truncate, return false. 1457 auto *Trunc = dyn_cast<TruncInst>(I); 1458 if (!Trunc) 1459 return false; 1460 1461 // Get the source and destination types of the truncate. 1462 Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF); 1463 Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF); 1464 1465 // If the truncate is free for the given types, return false. Replacing a 1466 // free truncate with an induction variable would add an induction variable 1467 // update instruction to each iteration of the loop. We exclude from this 1468 // check the primary induction variable since it will need an update 1469 // instruction regardless. 1470 Value *Op = Trunc->getOperand(0); 1471 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy)) 1472 return false; 1473 1474 // If the truncated value is not an induction variable, return false. 1475 return Legal->isInductionPhi(Op); 1476 } 1477 1478 /// Collects the instructions to scalarize for each predicated instruction in 1479 /// the loop. 1480 void collectInstsToScalarize(ElementCount VF); 1481 1482 /// Collect Uniform and Scalar values for the given \p VF. 1483 /// The sets depend on CM decision for Load/Store instructions 1484 /// that may be vectorized as interleave, gather-scatter or scalarized. 1485 void collectUniformsAndScalars(ElementCount VF) { 1486 // Do the analysis once. 1487 if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end()) 1488 return; 1489 setCostBasedWideningDecision(VF); 1490 collectLoopUniforms(VF); 1491 collectLoopScalars(VF); 1492 } 1493 1494 /// Returns true if the target machine supports masked store operation 1495 /// for the given \p DataType and kind of access to \p Ptr. 1496 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const { 1497 return Legal->isConsecutivePtr(Ptr) && 1498 TTI.isLegalMaskedStore(DataType, Alignment); 1499 } 1500 1501 /// Returns true if the target machine supports masked load operation 1502 /// for the given \p DataType and kind of access to \p Ptr. 1503 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const { 1504 return Legal->isConsecutivePtr(Ptr) && 1505 TTI.isLegalMaskedLoad(DataType, Alignment); 1506 } 1507 1508 /// Returns true if the target machine can represent \p V as a masked gather 1509 /// or scatter operation. 
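  /// A typical candidate is an indexed access such as 'A[B[i]]', whose pointer
  /// is not consecutive and therefore can only be widened as a gather (load)
  /// or scatter (store) if the target supports it.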
  bool isLegalGatherOrScatter(Value *V) {
    bool LI = isa<LoadInst>(V);
    bool SI = isa<StoreInst>(V);
    if (!LI && !SI)
      return false;
    auto *Ty = getLoadStoreType(V);
    Align Align = getLoadStoreAlignment(V);
    return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
           (SI && TTI.isLegalMaskedScatter(Ty, Align));
  }

  /// Returns true if the target machine supports all of the reduction
  /// variables found for the given VF.
  bool canVectorizeReductions(ElementCount VF) const {
    return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
      const RecurrenceDescriptor &RdxDesc = Reduction.second;
      return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
    }));
  }

  /// Returns true if \p I is an instruction that will be scalarized with
  /// predication. Such instructions include conditional stores and
  /// instructions that may divide by zero.
  /// If a non-zero VF has been calculated, we check if I will be scalarized
  /// with predication for that VF.
  bool isScalarWithPredication(Instruction *I) const;

  // Returns true if \p I is an instruction that will be predicated either
  // through scalar predication or masked load/store or masked gather/scatter.
  // Superset of instructions that return true for isScalarWithPredication.
  bool isPredicatedInst(Instruction *I) {
    if (!blockNeedsPredication(I->getParent()))
      return false;
    // Loads and stores that need some form of masked operation are predicated
    // instructions.
    if (isa<LoadInst>(I) || isa<StoreInst>(I))
      return Legal->isMaskRequired(I);
    return isScalarWithPredication(I);
  }

  /// Returns true if \p I is a memory instruction with consecutive memory
  /// access that can be widened.
  bool
  memoryInstructionCanBeWidened(Instruction *I,
                                ElementCount VF = ElementCount::getFixed(1));

  /// Returns true if \p I is a memory instruction in an interleaved-group
  /// of memory accesses that can be vectorized with wide vector loads/stores
  /// and shuffles.
  bool
  interleavedAccessCanBeWidened(Instruction *I,
                                ElementCount VF = ElementCount::getFixed(1));

  /// Check if \p Instr belongs to any interleaved access group.
  bool isAccessInterleaved(Instruction *Instr) {
    return InterleaveInfo.isInterleaved(Instr);
  }

  /// Get the interleaved access group that \p Instr belongs to.
  const InterleaveGroup<Instruction> *
  getInterleavedAccessGroup(Instruction *Instr) {
    return InterleaveInfo.getInterleaveGroup(Instr);
  }

  /// Returns true if we're required to use a scalar epilogue for at least
  /// the final iteration of the original loop.
  bool requiresScalarEpilogue(ElementCount VF) const {
    if (!isScalarEpilogueAllowed())
      return false;
    // If we might exit from anywhere but the latch, must run the exiting
    // iteration in scalar form.
    if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
      return true;
    return VF.isVector() && InterleaveInfo.requiresScalarEpilogue();
  }

  /// Returns true if a scalar epilogue is not allowed due to optsize or a
  /// loop hint annotation.
  bool isScalarEpilogueAllowed() const {
    return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
  }

  /// Returns true if all loop blocks should be masked to fold tail loop.
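  /// When the tail is folded, the vector loop also executes the final
  /// TripCount % VF iterations under a mask instead of branching to a scalar
  /// remainder loop.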
  bool foldTailByMasking() const { return FoldTailByMasking; }

  bool blockNeedsPredication(BasicBlock *BB) const {
    return foldTailByMasking() || Legal->blockNeedsPredication(BB);
  }

  /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
  /// nodes to the chain of instructions representing the reductions. Uses a
  /// MapVector to ensure deterministic iteration order.
  using ReductionChainMap =
      SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;

  /// Return the chain of instructions representing an inloop reduction.
  const ReductionChainMap &getInLoopReductionChains() const {
    return InLoopReductionChains;
  }

  /// Returns true if the Phi is part of an inloop reduction.
  bool isInLoopReduction(PHINode *Phi) const {
    return InLoopReductionChains.count(Phi);
  }

  /// Estimate cost of an intrinsic call instruction CI if it were vectorized
  /// with factor VF. Return the cost of the instruction, including
  /// scalarization overhead if it's needed.
  InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;

  /// Estimate cost of a call instruction CI if it were vectorized with factor
  /// VF. Return the cost of the instruction, including scalarization overhead
  /// if it's needed. The flag NeedToScalarize shows if the call needs to be
  /// scalarized, i.e. either a vector version isn't available or it is too
  /// expensive.
  InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF,
                                    bool &NeedToScalarize) const;

  /// Returns true if the per-lane cost of VectorizationFactor A is lower than
  /// that of B.
  bool isMoreProfitable(const VectorizationFactor &A,
                        const VectorizationFactor &B) const;

  /// Invalidates decisions already taken by the cost model.
  void invalidateCostModelingDecisions() {
    WideningDecisions.clear();
    Uniforms.clear();
    Scalars.clear();
  }

private:
  unsigned NumPredStores = 0;

  /// \return An upper bound for the vectorization factors for both
  /// fixed and scalable vectorization, where the minimum-known number of
  /// elements is a power-of-2 larger than zero. If scalable vectorization is
  /// disabled or unsupported, then the scalable part will be equal to
  /// ElementCount::getScalable(0).
  FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount,
                                           ElementCount UserVF);

  /// \return the maximized element count based on the target's vector
  /// registers and the loop trip-count, but limited to a maximum safe VF.
  /// This is a helper function of computeFeasibleMaxVF.
  /// FIXME: MaxSafeVF is currently passed by reference to avoid some obscure
  /// issue that occurred on one of the buildbots which cannot be reproduced
  /// without having access to the proprietary compiler (see comments on
  /// D98509). The issue is currently under investigation and this workaround
  /// will be removed as soon as possible.
  ElementCount getMaximizedVFForTarget(unsigned ConstTripCount,
                                       unsigned SmallestType,
                                       unsigned WidestType,
                                       const ElementCount &MaxSafeVF);

  /// \return the maximum legal scalable VF, based on the safe max number
  /// of elements.
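  /// Roughly speaking (an illustrative sketch), the known-minimum element
  /// count of any legal scalable VF must satisfy
  /// KnownMin * (largest runtime vscale) <= MaxSafeElements; e.g. with
  /// MaxSafeElements = 32 and vscale bounded by 4, at most vscale x 8.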
  ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);

  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
  using VectorizationCostTy = std::pair<InstructionCost, bool>;

  /// Returns the expected execution cost. The unit of the cost does
  /// not matter because we use the 'cost' units to compare different
  /// vector widths. The cost that is returned is *not* normalized by
  /// the factor width. If \p Invalid is not nullptr, this function
  /// will add a pair(Instruction*, ElementCount) to \p Invalid for
  /// each instruction that has an Invalid cost for the given VF.
  using InstructionVFPair = std::pair<Instruction *, ElementCount>;
  VectorizationCostTy
  expectedCost(ElementCount VF,
               SmallVectorImpl<InstructionVFPair> *Invalid = nullptr);

  /// Returns the execution time cost of an instruction for a given vector
  /// width. Vector width of one means scalar.
  VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);

  /// The cost-computation logic from getInstructionCost which provides
  /// the vector type as an output parameter.
  InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
                                     Type *&VectorTy);

  /// Return the cost of instructions in an inloop reduction pattern, if I is
  /// part of that pattern.
  Optional<InstructionCost>
  getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
                          TTI::TargetCostKind CostKind);

  /// Calculate vectorization cost of memory instruction \p I.
  InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);

  /// The cost computation for scalarized memory instruction.
  InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);

  /// The cost computation for interleaving group of memory instructions.
  InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);

  /// The cost computation for Gather/Scatter instruction.
  InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);

  /// The cost computation for widening instruction \p I with consecutive
  /// memory access.
  InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);

  /// The cost calculation for Load/Store instruction \p I with uniform pointer -
  /// Load: scalar load + broadcast.
  /// Store: scalar store + (loop invariant value stored? 0 : extract of last
  /// element)
  InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);

  /// Estimate the overhead of scalarizing an instruction. This is a
  /// convenience wrapper for the type-based getScalarizationOverhead API.
  InstructionCost getScalarizationOverhead(Instruction *I,
                                           ElementCount VF) const;

  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
  bool isConsecutiveLoadOrStore(Instruction *I);

  /// Returns true if an artificially high cost for emulated masked memrefs
  /// should be used.
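  /// "Emulated" here means the target has no native masked load/store (or
  /// gather/scatter) for the access, so a predicated access would have to be
  /// scalarized and wrapped in branches; the artificially high cost steers
  /// the vectorizer away from generating such code.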
  bool useEmulatedMaskMemRefHack(Instruction *I);

  /// Map of scalar integer values to the smallest bitwidth they can be legally
  /// represented as. The vector equivalents of these values should be truncated
  /// to this type.
  MapVector<Instruction *, uint64_t> MinBWs;

  /// A type representing the costs for instructions if they were to be
  /// scalarized rather than vectorized. The entries are Instruction-Cost
  /// pairs.
  using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;

  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as predicated blocks.
  SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;

  /// Records whether it is allowed to have the original scalar loop execute at
  /// least once. This may be needed as a fallback loop in case runtime
  /// aliasing/dependence checks fail, or to handle the tail/remainder
  /// iterations when the trip count is unknown or not divisible by the VF,
  /// or as a peel-loop to handle gaps in interleave-groups.
  /// Under optsize and when the trip count is very small we don't allow any
  /// iterations to execute in the scalar loop.
  ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;

  /// All blocks of loop are to be masked to fold tail of scalar iterations.
  bool FoldTailByMasking = false;

  /// A map holding scalar costs for different vectorization factors. The
  /// presence of a cost for an instruction in the mapping indicates that the
  /// instruction will be scalarized when vectorizing with the associated
  /// vectorization factor. The entries are VF-ScalarCostTy pairs.
  DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;

  /// Holds the instructions known to be uniform after vectorization.
  /// The data is collected per VF.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;

  /// Holds the instructions known to be scalar after vectorization.
  /// The data is collected per VF.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;

  /// Holds the instructions (address computations) that are forced to be
  /// scalarized.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;

  /// PHINodes of the reductions that should be expanded in-loop along with
  /// their associated chains of reduction operations, in program order from
  /// top (PHI) to bottom.
  ReductionChainMap InLoopReductionChains;

  /// A Map of inloop reduction operations and their immediate chain operand.
  /// FIXME: This can be removed once reductions can be costed correctly in
  /// vplan. This was added to allow quick lookup to the inloop operations,
  /// without having to loop through InLoopReductionChains.
  DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;

  /// Returns the expected difference in cost from scalarizing the expression
  /// feeding a predicated instruction \p PredInst. The instructions to
  /// scalarize and their scalar costs are collected in \p ScalarCosts. A
  /// non-negative return value implies the expression will be scalarized.
  /// Currently, only single-use chains are considered for scalarization.
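  /// For example, if a predicated udiv must be scalarized anyway, it may be
  /// cheaper to also scalarize a single-use operand feeding it than to keep
  /// that operand vectorized and pay for lane extracts inside the predicated
  /// block.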
1796 int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts, 1797 ElementCount VF); 1798 1799 /// Collect the instructions that are uniform after vectorization. An 1800 /// instruction is uniform if we represent it with a single scalar value in 1801 /// the vectorized loop corresponding to each vector iteration. Examples of 1802 /// uniform instructions include pointer operands of consecutive or 1803 /// interleaved memory accesses. Note that although uniformity implies an 1804 /// instruction will be scalar, the reverse is not true. In general, a 1805 /// scalarized instruction will be represented by VF scalar values in the 1806 /// vectorized loop, each corresponding to an iteration of the original 1807 /// scalar loop. 1808 void collectLoopUniforms(ElementCount VF); 1809 1810 /// Collect the instructions that are scalar after vectorization. An 1811 /// instruction is scalar if it is known to be uniform or will be scalarized 1812 /// during vectorization. Non-uniform scalarized instructions will be 1813 /// represented by VF values in the vectorized loop, each corresponding to an 1814 /// iteration of the original scalar loop. 1815 void collectLoopScalars(ElementCount VF); 1816 1817 /// Keeps cost model vectorization decision and cost for instructions. 1818 /// Right now it is used for memory instructions only. 1819 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>, 1820 std::pair<InstWidening, InstructionCost>>; 1821 1822 DecisionList WideningDecisions; 1823 1824 /// Returns true if \p V is expected to be vectorized and it needs to be 1825 /// extracted. 1826 bool needsExtract(Value *V, ElementCount VF) const { 1827 Instruction *I = dyn_cast<Instruction>(V); 1828 if (VF.isScalar() || !I || !TheLoop->contains(I) || 1829 TheLoop->isLoopInvariant(I)) 1830 return false; 1831 1832 // Assume we can vectorize V (and hence we need extraction) if the 1833 // scalars are not computed yet. This can happen, because it is called 1834 // via getScalarizationOverhead from setCostBasedWideningDecision, before 1835 // the scalars are collected. That should be a safe assumption in most 1836 // cases, because we check if the operands have vectorizable types 1837 // beforehand in LoopVectorizationLegality. 1838 return Scalars.find(VF) == Scalars.end() || 1839 !isScalarAfterVectorization(I, VF); 1840 }; 1841 1842 /// Returns a range containing only operands needing to be extracted. 1843 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, 1844 ElementCount VF) const { 1845 return SmallVector<Value *, 4>(make_filter_range( 1846 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); 1847 } 1848 1849 /// Determines if we have the infrastructure to vectorize loop \p L and its 1850 /// epilogue, assuming the main loop is vectorized by \p VF. 1851 bool isCandidateForEpilogueVectorization(const Loop &L, 1852 const ElementCount VF) const; 1853 1854 /// Returns true if epilogue vectorization is considered profitable, and 1855 /// false otherwise. 1856 /// \p VF is the vectorization factor chosen for the original loop. 1857 bool isEpilogueVectorizationProfitable(const ElementCount VF) const; 1858 1859 public: 1860 /// The loop that we evaluate. 1861 Loop *TheLoop; 1862 1863 /// Predicated scalar evolution analysis. 1864 PredicatedScalarEvolution &PSE; 1865 1866 /// Loop Info analysis. 1867 LoopInfo *LI; 1868 1869 /// Vectorization legality. 1870 LoopVectorizationLegality *Legal; 1871 1872 /// Vector target information. 
  const TargetTransformInfo &TTI;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Demanded bits analysis.
  DemandedBits *DB;

  /// Assumption cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  const Function *TheFunction;

  /// Loop Vectorize Hint.
  const LoopVectorizeHints *Hints;

  /// The interleave access information contains groups of interleaved accesses
  /// with the same stride and close to each other.
  InterleavedAccessInfo &InterleaveInfo;

  /// Values to ignore in the cost model.
  SmallPtrSet<const Value *, 16> ValuesToIgnore;

  /// Values to ignore in the cost model when VF > 1.
  SmallPtrSet<const Value *, 16> VecValuesToIgnore;

  /// All element types found in the loop.
  SmallPtrSet<Type *, 16> ElementTypesInLoop;

  /// Profitable vector factors.
  SmallVector<VectorizationFactor, 8> ProfitableVFs;
};
} // end namespace llvm

/// Helper struct to manage generating runtime checks for vectorization.
///
/// The runtime checks are created up-front in temporary blocks to allow better
/// estimating the cost and un-linked from the existing IR. After deciding to
/// vectorize, the checks are moved back. If deciding not to vectorize, the
/// temporary blocks are completely removed.
class GeneratedRTChecks {
  /// Basic block which contains the generated SCEV checks, if any.
  BasicBlock *SCEVCheckBlock = nullptr;

  /// The value representing the result of the generated SCEV checks. If it is
  /// nullptr, either no SCEV checks have been generated or they have been used.
  Value *SCEVCheckCond = nullptr;

  /// Basic block which contains the generated memory runtime checks, if any.
  BasicBlock *MemCheckBlock = nullptr;

  /// The value representing the result of the generated memory runtime checks.
  /// If it is nullptr, either no memory runtime checks have been generated or
  /// they have been used.
  Instruction *MemRuntimeCheckCond = nullptr;

  DominatorTree *DT;
  LoopInfo *LI;

  SCEVExpander SCEVExp;
  SCEVExpander MemCheckExp;

public:
  GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
                    const DataLayout &DL)
      : DT(DT), LI(LI), SCEVExp(SE, DL, "scev.check"),
        MemCheckExp(SE, DL, "scev.check") {}

  /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
  /// accurately estimate the cost of the runtime checks. The blocks are
  /// un-linked from the IR and are added back during vector code generation.
  /// If there is no vector code generation, the check blocks are removed
  /// completely.
  void Create(Loop *L, const LoopAccessInfo &LAI,
              const SCEVUnionPredicate &UnionPred) {

    BasicBlock *LoopHeader = L->getHeader();
    BasicBlock *Preheader = L->getLoopPreheader();

    // Use SplitBlock to create blocks for SCEV & memory runtime checks to
    // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
    // may be used by SCEVExpander. The blocks will be un-linked from their
    // predecessors and removed from LI & DT at the end of the function.
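    // On return, the preheader once again branches directly to the loop
    // header, while the check blocks are left detached (terminated by
    // 'unreachable') until emitSCEVChecks/emitMemRuntimeChecks splice them
    // back in, or the destructor deletes them if vectorization is abandoned.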
1959 if (!UnionPred.isAlwaysTrue()) { 1960 SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI, 1961 nullptr, "vector.scevcheck"); 1962 1963 SCEVCheckCond = SCEVExp.expandCodeForPredicate( 1964 &UnionPred, SCEVCheckBlock->getTerminator()); 1965 } 1966 1967 const auto &RtPtrChecking = *LAI.getRuntimePointerChecking(); 1968 if (RtPtrChecking.Need) { 1969 auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader; 1970 MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr, 1971 "vector.memcheck"); 1972 1973 std::tie(std::ignore, MemRuntimeCheckCond) = 1974 addRuntimeChecks(MemCheckBlock->getTerminator(), L, 1975 RtPtrChecking.getChecks(), MemCheckExp); 1976 assert(MemRuntimeCheckCond && 1977 "no RT checks generated although RtPtrChecking " 1978 "claimed checks are required"); 1979 } 1980 1981 if (!MemCheckBlock && !SCEVCheckBlock) 1982 return; 1983 1984 // Unhook the temporary block with the checks, update various places 1985 // accordingly. 1986 if (SCEVCheckBlock) 1987 SCEVCheckBlock->replaceAllUsesWith(Preheader); 1988 if (MemCheckBlock) 1989 MemCheckBlock->replaceAllUsesWith(Preheader); 1990 1991 if (SCEVCheckBlock) { 1992 SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 1993 new UnreachableInst(Preheader->getContext(), SCEVCheckBlock); 1994 Preheader->getTerminator()->eraseFromParent(); 1995 } 1996 if (MemCheckBlock) { 1997 MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 1998 new UnreachableInst(Preheader->getContext(), MemCheckBlock); 1999 Preheader->getTerminator()->eraseFromParent(); 2000 } 2001 2002 DT->changeImmediateDominator(LoopHeader, Preheader); 2003 if (MemCheckBlock) { 2004 DT->eraseNode(MemCheckBlock); 2005 LI->removeBlock(MemCheckBlock); 2006 } 2007 if (SCEVCheckBlock) { 2008 DT->eraseNode(SCEVCheckBlock); 2009 LI->removeBlock(SCEVCheckBlock); 2010 } 2011 } 2012 2013 /// Remove the created SCEV & memory runtime check blocks & instructions, if 2014 /// unused. 2015 ~GeneratedRTChecks() { 2016 SCEVExpanderCleaner SCEVCleaner(SCEVExp, *DT); 2017 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp, *DT); 2018 if (!SCEVCheckCond) 2019 SCEVCleaner.markResultUsed(); 2020 2021 if (!MemRuntimeCheckCond) 2022 MemCheckCleaner.markResultUsed(); 2023 2024 if (MemRuntimeCheckCond) { 2025 auto &SE = *MemCheckExp.getSE(); 2026 // Memory runtime check generation creates compares that use expanded 2027 // values. Remove them before running the SCEVExpanderCleaners. 2028 for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) { 2029 if (MemCheckExp.isInsertedInstruction(&I)) 2030 continue; 2031 SE.forgetValue(&I); 2032 SE.eraseValueFromMap(&I); 2033 I.eraseFromParent(); 2034 } 2035 } 2036 MemCheckCleaner.cleanup(); 2037 SCEVCleaner.cleanup(); 2038 2039 if (SCEVCheckCond) 2040 SCEVCheckBlock->eraseFromParent(); 2041 if (MemRuntimeCheckCond) 2042 MemCheckBlock->eraseFromParent(); 2043 } 2044 2045 /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and 2046 /// adjusts the branches to branch to the vector preheader or \p Bypass, 2047 /// depending on the generated condition. 
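  /// Returns the check block, or nullptr if no SCEV checks were generated or
  /// the check condition folds to a constant false (i.e. the checks are
  /// trivially satisfied and no bypass is needed).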
2048 BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass, 2049 BasicBlock *LoopVectorPreHeader, 2050 BasicBlock *LoopExitBlock) { 2051 if (!SCEVCheckCond) 2052 return nullptr; 2053 if (auto *C = dyn_cast<ConstantInt>(SCEVCheckCond)) 2054 if (C->isZero()) 2055 return nullptr; 2056 2057 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2058 2059 BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock); 2060 // Create new preheader for vector loop. 2061 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 2062 PL->addBasicBlockToLoop(SCEVCheckBlock, *LI); 2063 2064 SCEVCheckBlock->getTerminator()->eraseFromParent(); 2065 SCEVCheckBlock->moveBefore(LoopVectorPreHeader); 2066 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2067 SCEVCheckBlock); 2068 2069 DT->addNewBlock(SCEVCheckBlock, Pred); 2070 DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock); 2071 2072 ReplaceInstWithInst( 2073 SCEVCheckBlock->getTerminator(), 2074 BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheckCond)); 2075 // Mark the check as used, to prevent it from being removed during cleanup. 2076 SCEVCheckCond = nullptr; 2077 return SCEVCheckBlock; 2078 } 2079 2080 /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts 2081 /// the branches to branch to the vector preheader or \p Bypass, depending on 2082 /// the generated condition. 2083 BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass, 2084 BasicBlock *LoopVectorPreHeader) { 2085 // Check if we generated code that checks in runtime if arrays overlap. 2086 if (!MemRuntimeCheckCond) 2087 return nullptr; 2088 2089 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2090 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2091 MemCheckBlock); 2092 2093 DT->addNewBlock(MemCheckBlock, Pred); 2094 DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock); 2095 MemCheckBlock->moveBefore(LoopVectorPreHeader); 2096 2097 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 2098 PL->addBasicBlockToLoop(MemCheckBlock, *LI); 2099 2100 ReplaceInstWithInst( 2101 MemCheckBlock->getTerminator(), 2102 BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond)); 2103 MemCheckBlock->getTerminator()->setDebugLoc( 2104 Pred->getTerminator()->getDebugLoc()); 2105 2106 // Mark the check as used, to prevent it from being removed during cleanup. 2107 MemRuntimeCheckCond = nullptr; 2108 return MemCheckBlock; 2109 } 2110 }; 2111 2112 // Return true if \p OuterLp is an outer loop annotated with hints for explicit 2113 // vectorization. The loop needs to be annotated with #pragma omp simd 2114 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the 2115 // vector length information is not provided, vectorization is not considered 2116 // explicit. Interleave hints are not allowed either. These limitations will be 2117 // relaxed in the future. 2118 // Please, note that we are currently forced to abuse the pragma 'clang 2119 // vectorize' semantics. This pragma provides *auto-vectorization hints* 2120 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' 2121 // provides *explicit vectorization hints* (LV can bypass legal checks and 2122 // assume that vectorization is legal). However, both hints are implemented 2123 // using the same metadata (llvm.loop.vectorize, processed by 2124 // LoopVectorizeHints). This will be fixed in the future when the native IR 2125 // representation for pragma 'omp simd' is introduced. 
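// An outer loop that this function would accept looks roughly like the
// following (an illustrative sketch, applicable when the VPlan-native path is
// enabled):
//   #pragma clang loop vectorize(enable) vectorize_width(4)
//   for (int i = 0; i < N; ++i)      // explicitly annotated outer loop
//     for (int j = 0; j < M; ++j)
//       A[i][j] += B[i][j];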
2126 static bool isExplicitVecOuterLoop(Loop *OuterLp, 2127 OptimizationRemarkEmitter *ORE) { 2128 assert(!OuterLp->isInnermost() && "This is not an outer loop"); 2129 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); 2130 2131 // Only outer loops with an explicit vectorization hint are supported. 2132 // Unannotated outer loops are ignored. 2133 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) 2134 return false; 2135 2136 Function *Fn = OuterLp->getHeader()->getParent(); 2137 if (!Hints.allowVectorization(Fn, OuterLp, 2138 true /*VectorizeOnlyWhenForced*/)) { 2139 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); 2140 return false; 2141 } 2142 2143 if (Hints.getInterleave() > 1) { 2144 // TODO: Interleave support is future work. 2145 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " 2146 "outer loops.\n"); 2147 Hints.emitRemarkWithHints(); 2148 return false; 2149 } 2150 2151 return true; 2152 } 2153 2154 static void collectSupportedLoops(Loop &L, LoopInfo *LI, 2155 OptimizationRemarkEmitter *ORE, 2156 SmallVectorImpl<Loop *> &V) { 2157 // Collect inner loops and outer loops without irreducible control flow. For 2158 // now, only collect outer loops that have explicit vectorization hints. If we 2159 // are stress testing the VPlan H-CFG construction, we collect the outermost 2160 // loop of every loop nest. 2161 if (L.isInnermost() || VPlanBuildStressTest || 2162 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { 2163 LoopBlocksRPO RPOT(&L); 2164 RPOT.perform(LI); 2165 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { 2166 V.push_back(&L); 2167 // TODO: Collect inner loops inside marked outer loops in case 2168 // vectorization fails for the outer loop. Do not invoke 2169 // 'containsIrreducibleCFG' again for inner loops when the outer loop is 2170 // already known to be reducible. We can use an inherited attribute for 2171 // that. 2172 return; 2173 } 2174 } 2175 for (Loop *InnerL : L) 2176 collectSupportedLoops(*InnerL, LI, ORE, V); 2177 } 2178 2179 namespace { 2180 2181 /// The LoopVectorize Pass. 2182 struct LoopVectorize : public FunctionPass { 2183 /// Pass identification, replacement for typeid 2184 static char ID; 2185 2186 LoopVectorizePass Impl; 2187 2188 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false, 2189 bool VectorizeOnlyWhenForced = false) 2190 : FunctionPass(ID), 2191 Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) { 2192 initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); 2193 } 2194 2195 bool runOnFunction(Function &F) override { 2196 if (skipFunction(F)) 2197 return false; 2198 2199 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 2200 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 2201 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 2202 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 2203 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); 2204 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); 2205 auto *TLI = TLIP ? 
&TLIP->getTLI(F) : nullptr; 2206 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); 2207 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 2208 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); 2209 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); 2210 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 2211 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 2212 2213 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 2214 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; 2215 2216 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC, 2217 GetLAA, *ORE, PSI).MadeAnyChange; 2218 } 2219 2220 void getAnalysisUsage(AnalysisUsage &AU) const override { 2221 AU.addRequired<AssumptionCacheTracker>(); 2222 AU.addRequired<BlockFrequencyInfoWrapperPass>(); 2223 AU.addRequired<DominatorTreeWrapperPass>(); 2224 AU.addRequired<LoopInfoWrapperPass>(); 2225 AU.addRequired<ScalarEvolutionWrapperPass>(); 2226 AU.addRequired<TargetTransformInfoWrapperPass>(); 2227 AU.addRequired<AAResultsWrapperPass>(); 2228 AU.addRequired<LoopAccessLegacyAnalysis>(); 2229 AU.addRequired<DemandedBitsWrapperPass>(); 2230 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 2231 AU.addRequired<InjectTLIMappingsLegacy>(); 2232 2233 // We currently do not preserve loopinfo/dominator analyses with outer loop 2234 // vectorization. Until this is addressed, mark these analyses as preserved 2235 // only for non-VPlan-native path. 2236 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 2237 if (!EnableVPlanNativePath) { 2238 AU.addPreserved<LoopInfoWrapperPass>(); 2239 AU.addPreserved<DominatorTreeWrapperPass>(); 2240 } 2241 2242 AU.addPreserved<BasicAAWrapperPass>(); 2243 AU.addPreserved<GlobalsAAWrapperPass>(); 2244 AU.addRequired<ProfileSummaryInfoWrapperPass>(); 2245 } 2246 }; 2247 2248 } // end anonymous namespace 2249 2250 //===----------------------------------------------------------------------===// 2251 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 2252 // LoopVectorizationCostModel and LoopVectorizationPlanner. 2253 //===----------------------------------------------------------------------===// 2254 2255 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { 2256 // We need to place the broadcast of invariant variables outside the loop, 2257 // but only if it's proven safe to do so. Else, broadcast will be inside 2258 // vector loop body. 2259 Instruction *Instr = dyn_cast<Instruction>(V); 2260 bool SafeToHoist = OrigLoop->isLoopInvariant(V) && 2261 (!Instr || 2262 DT->dominates(Instr->getParent(), LoopVectorPreHeader)); 2263 // Place the code for broadcasting invariant variables in the new preheader. 2264 IRBuilder<>::InsertPointGuard Guard(Builder); 2265 if (SafeToHoist) 2266 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2267 2268 // Broadcast the scalar into all locations in the vector. 
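  // For a fixed VF of 4 and an i32 value %v, the splat lowers to roughly the
  // following IR (an illustrative sketch of what IRBuilder emits):
  //   %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %v, i32 0
  //   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert,
  //                          <4 x i32> poison, <4 x i32> zeroinitializer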
2269 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); 2270 2271 return Shuf; 2272 } 2273 2274 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( 2275 const InductionDescriptor &II, Value *Step, Value *Start, 2276 Instruction *EntryVal, VPValue *Def, VPValue *CastDef, 2277 VPTransformState &State) { 2278 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 2279 "Expected either an induction phi-node or a truncate of it!"); 2280 2281 // Construct the initial value of the vector IV in the vector loop preheader 2282 auto CurrIP = Builder.saveIP(); 2283 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2284 if (isa<TruncInst>(EntryVal)) { 2285 assert(Start->getType()->isIntegerTy() && 2286 "Truncation requires an integer type"); 2287 auto *TruncType = cast<IntegerType>(EntryVal->getType()); 2288 Step = Builder.CreateTrunc(Step, TruncType); 2289 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); 2290 } 2291 Value *SplatStart = Builder.CreateVectorSplat(VF, Start); 2292 Value *SteppedStart = 2293 getStepVector(SplatStart, 0, Step, II.getInductionOpcode()); 2294 2295 // We create vector phi nodes for both integer and floating-point induction 2296 // variables. Here, we determine the kind of arithmetic we will perform. 2297 Instruction::BinaryOps AddOp; 2298 Instruction::BinaryOps MulOp; 2299 if (Step->getType()->isIntegerTy()) { 2300 AddOp = Instruction::Add; 2301 MulOp = Instruction::Mul; 2302 } else { 2303 AddOp = II.getInductionOpcode(); 2304 MulOp = Instruction::FMul; 2305 } 2306 2307 // Multiply the vectorization factor by the step using integer or 2308 // floating-point arithmetic as appropriate. 2309 Type *StepType = Step->getType(); 2310 if (Step->getType()->isFloatingPointTy()) 2311 StepType = IntegerType::get(StepType->getContext(), 2312 StepType->getScalarSizeInBits()); 2313 Value *RuntimeVF = getRuntimeVF(Builder, StepType, VF); 2314 if (Step->getType()->isFloatingPointTy()) 2315 RuntimeVF = Builder.CreateSIToFP(RuntimeVF, Step->getType()); 2316 Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF); 2317 2318 // Create a vector splat to use in the induction update. 2319 // 2320 // FIXME: If the step is non-constant, we create the vector splat with 2321 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't 2322 // handle a constant vector splat. 2323 Value *SplatVF = isa<Constant>(Mul) 2324 ? ConstantVector::getSplat(VF, cast<Constant>(Mul)) 2325 : Builder.CreateVectorSplat(VF, Mul); 2326 Builder.restoreIP(CurrIP); 2327 2328 // We may need to add the step a number of times, depending on the unroll 2329 // factor. The last of those goes into the PHI. 2330 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", 2331 &*LoopVectorBody->getFirstInsertionPt()); 2332 VecInd->setDebugLoc(EntryVal->getDebugLoc()); 2333 Instruction *LastInduction = VecInd; 2334 for (unsigned Part = 0; Part < UF; ++Part) { 2335 State.set(Def, LastInduction, Part); 2336 2337 if (isa<TruncInst>(EntryVal)) 2338 addMetadata(LastInduction, EntryVal); 2339 recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, CastDef, 2340 State, Part); 2341 2342 LastInduction = cast<Instruction>( 2343 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")); 2344 LastInduction->setDebugLoc(EntryVal->getDebugLoc()); 2345 } 2346 2347 // Move the last step to the end of the latch block. This ensures consistent 2348 // placement of all induction updates. 
2349 auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 2350 auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator()); 2351 auto *ICmp = cast<Instruction>(Br->getCondition()); 2352 LastInduction->moveBefore(ICmp); 2353 LastInduction->setName("vec.ind.next"); 2354 2355 VecInd->addIncoming(SteppedStart, LoopVectorPreHeader); 2356 VecInd->addIncoming(LastInduction, LoopVectorLatch); 2357 } 2358 2359 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const { 2360 return Cost->isScalarAfterVectorization(I, VF) || 2361 Cost->isProfitableToScalarize(I, VF); 2362 } 2363 2364 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const { 2365 if (shouldScalarizeInstruction(IV)) 2366 return true; 2367 auto isScalarInst = [&](User *U) -> bool { 2368 auto *I = cast<Instruction>(U); 2369 return (OrigLoop->contains(I) && shouldScalarizeInstruction(I)); 2370 }; 2371 return llvm::any_of(IV->users(), isScalarInst); 2372 } 2373 2374 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast( 2375 const InductionDescriptor &ID, const Instruction *EntryVal, 2376 Value *VectorLoopVal, VPValue *CastDef, VPTransformState &State, 2377 unsigned Part, unsigned Lane) { 2378 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 2379 "Expected either an induction phi-node or a truncate of it!"); 2380 2381 // This induction variable is not the phi from the original loop but the 2382 // newly-created IV based on the proof that casted Phi is equal to the 2383 // uncasted Phi in the vectorized loop (under a runtime guard possibly). It 2384 // re-uses the same InductionDescriptor that original IV uses but we don't 2385 // have to do any recording in this case - that is done when original IV is 2386 // processed. 2387 if (isa<TruncInst>(EntryVal)) 2388 return; 2389 2390 const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts(); 2391 if (Casts.empty()) 2392 return; 2393 // Only the first Cast instruction in the Casts vector is of interest. 2394 // The rest of the Casts (if exist) have no uses outside the 2395 // induction update chain itself. 2396 if (Lane < UINT_MAX) 2397 State.set(CastDef, VectorLoopVal, VPIteration(Part, Lane)); 2398 else 2399 State.set(CastDef, VectorLoopVal, Part); 2400 } 2401 2402 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start, 2403 TruncInst *Trunc, VPValue *Def, 2404 VPValue *CastDef, 2405 VPTransformState &State) { 2406 assert((IV->getType()->isIntegerTy() || IV != OldInduction) && 2407 "Primary induction variable must have an integer type"); 2408 2409 auto II = Legal->getInductionVars().find(IV); 2410 assert(II != Legal->getInductionVars().end() && "IV is not an induction"); 2411 2412 auto ID = II->second; 2413 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); 2414 2415 // The value from the original loop to which we are mapping the new induction 2416 // variable. 2417 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; 2418 2419 auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 2420 2421 // Generate code for the induction step. 
Note that induction steps are 2422 // required to be loop-invariant 2423 auto CreateStepValue = [&](const SCEV *Step) -> Value * { 2424 assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) && 2425 "Induction step should be loop invariant"); 2426 if (PSE.getSE()->isSCEVable(IV->getType())) { 2427 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 2428 return Exp.expandCodeFor(Step, Step->getType(), 2429 LoopVectorPreHeader->getTerminator()); 2430 } 2431 return cast<SCEVUnknown>(Step)->getValue(); 2432 }; 2433 2434 // The scalar value to broadcast. This is derived from the canonical 2435 // induction variable. If a truncation type is given, truncate the canonical 2436 // induction variable and step. Otherwise, derive these values from the 2437 // induction descriptor. 2438 auto CreateScalarIV = [&](Value *&Step) -> Value * { 2439 Value *ScalarIV = Induction; 2440 if (IV != OldInduction) { 2441 ScalarIV = IV->getType()->isIntegerTy() 2442 ? Builder.CreateSExtOrTrunc(Induction, IV->getType()) 2443 : Builder.CreateCast(Instruction::SIToFP, Induction, 2444 IV->getType()); 2445 ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID); 2446 ScalarIV->setName("offset.idx"); 2447 } 2448 if (Trunc) { 2449 auto *TruncType = cast<IntegerType>(Trunc->getType()); 2450 assert(Step->getType()->isIntegerTy() && 2451 "Truncation requires an integer step"); 2452 ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType); 2453 Step = Builder.CreateTrunc(Step, TruncType); 2454 } 2455 return ScalarIV; 2456 }; 2457 2458 // Create the vector values from the scalar IV, in the absence of creating a 2459 // vector IV. 2460 auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) { 2461 Value *Broadcasted = getBroadcastInstrs(ScalarIV); 2462 for (unsigned Part = 0; Part < UF; ++Part) { 2463 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2464 Value *EntryPart = 2465 getStepVector(Broadcasted, VF.getKnownMinValue() * Part, Step, 2466 ID.getInductionOpcode()); 2467 State.set(Def, EntryPart, Part); 2468 if (Trunc) 2469 addMetadata(EntryPart, Trunc); 2470 recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, CastDef, 2471 State, Part); 2472 } 2473 }; 2474 2475 // Fast-math-flags propagate from the original induction instruction. 2476 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 2477 if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp())) 2478 Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags()); 2479 2480 // Now do the actual transformations, and start with creating the step value. 2481 Value *Step = CreateStepValue(ID.getStep()); 2482 if (VF.isZero() || VF.isScalar()) { 2483 Value *ScalarIV = CreateScalarIV(Step); 2484 CreateSplatIV(ScalarIV, Step); 2485 return; 2486 } 2487 2488 // Determine if we want a scalar version of the induction variable. This is 2489 // true if the induction variable itself is not widened, or if it has at 2490 // least one user in the loop that is not widened. 2491 auto NeedsScalarIV = needsScalarInduction(EntryVal); 2492 if (!NeedsScalarIV) { 2493 createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef, 2494 State); 2495 return; 2496 } 2497 2498 // Try to create a new independent vector induction variable. If we can't 2499 // create the phi node, we will splat the scalar induction variable in each 2500 // loop iteration. 
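  // Note that in this branch both forms may be materialized: the vector phi
  // feeds the widened users, while the scalar steps built below feed any
  // users that will still be scalarized.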
2501 if (!shouldScalarizeInstruction(EntryVal)) { 2502 createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef, 2503 State); 2504 Value *ScalarIV = CreateScalarIV(Step); 2505 // Create scalar steps that can be used by instructions we will later 2506 // scalarize. Note that the addition of the scalar steps will not increase 2507 // the number of instructions in the loop in the common case prior to 2508 // InstCombine. We will be trading one vector extract for each scalar step. 2509 buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State); 2510 return; 2511 } 2512 2513 // All IV users are scalar instructions, so only emit a scalar IV, not a 2514 // vectorised IV. Except when we tail-fold, then the splat IV feeds the 2515 // predicate used by the masked loads/stores. 2516 Value *ScalarIV = CreateScalarIV(Step); 2517 if (!Cost->isScalarEpilogueAllowed()) 2518 CreateSplatIV(ScalarIV, Step); 2519 buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State); 2520 } 2521 2522 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step, 2523 Instruction::BinaryOps BinOp) { 2524 // Create and check the types. 2525 auto *ValVTy = cast<VectorType>(Val->getType()); 2526 ElementCount VLen = ValVTy->getElementCount(); 2527 2528 Type *STy = Val->getType()->getScalarType(); 2529 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && 2530 "Induction Step must be an integer or FP"); 2531 assert(Step->getType() == STy && "Step has wrong type"); 2532 2533 SmallVector<Constant *, 8> Indices; 2534 2535 // Create a vector of consecutive numbers from zero to VF. 2536 VectorType *InitVecValVTy = ValVTy; 2537 Type *InitVecValSTy = STy; 2538 if (STy->isFloatingPointTy()) { 2539 InitVecValSTy = 2540 IntegerType::get(STy->getContext(), STy->getScalarSizeInBits()); 2541 InitVecValVTy = VectorType::get(InitVecValSTy, VLen); 2542 } 2543 Value *InitVec = Builder.CreateStepVector(InitVecValVTy); 2544 2545 // Add on StartIdx 2546 Value *StartIdxSplat = Builder.CreateVectorSplat( 2547 VLen, ConstantInt::get(InitVecValSTy, StartIdx)); 2548 InitVec = Builder.CreateAdd(InitVec, StartIdxSplat); 2549 2550 if (STy->isIntegerTy()) { 2551 Step = Builder.CreateVectorSplat(VLen, Step); 2552 assert(Step->getType() == Val->getType() && "Invalid step vec"); 2553 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 2554 // which can be found from the original scalar operations. 2555 Step = Builder.CreateMul(InitVec, Step); 2556 return Builder.CreateAdd(Val, Step, "induction"); 2557 } 2558 2559 // Floating point induction. 2560 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && 2561 "Binary Opcode should be specified for FP induction"); 2562 InitVec = Builder.CreateUIToFP(InitVec, ValVTy); 2563 Step = Builder.CreateVectorSplat(VLen, Step); 2564 Value *MulOp = Builder.CreateFMul(InitVec, Step); 2565 return Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); 2566 } 2567 2568 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, 2569 Instruction *EntryVal, 2570 const InductionDescriptor &ID, 2571 VPValue *Def, VPValue *CastDef, 2572 VPTransformState &State) { 2573 // We shouldn't have to build scalar steps if we aren't vectorizing. 2574 assert(VF.isVector() && "VF should be greater than one"); 2575 // Get the value type and ensure it and the step have the same integer type. 
2576 Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); 2577 assert(ScalarIVTy == Step->getType() && 2578 "Val and Step should have the same type"); 2579 2580 // We build scalar steps for both integer and floating-point induction 2581 // variables. Here, we determine the kind of arithmetic we will perform. 2582 Instruction::BinaryOps AddOp; 2583 Instruction::BinaryOps MulOp; 2584 if (ScalarIVTy->isIntegerTy()) { 2585 AddOp = Instruction::Add; 2586 MulOp = Instruction::Mul; 2587 } else { 2588 AddOp = ID.getInductionOpcode(); 2589 MulOp = Instruction::FMul; 2590 } 2591 2592 // Determine the number of scalars we need to generate for each unroll 2593 // iteration. If EntryVal is uniform, we only need to generate the first 2594 // lane. Otherwise, we generate all VF values. 2595 bool IsUniform = 2596 Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF); 2597 unsigned Lanes = IsUniform ? 1 : VF.getKnownMinValue(); 2598 // Compute the scalar steps and save the results in State. 2599 Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(), 2600 ScalarIVTy->getScalarSizeInBits()); 2601 Type *VecIVTy = nullptr; 2602 Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr; 2603 if (!IsUniform && VF.isScalable()) { 2604 VecIVTy = VectorType::get(ScalarIVTy, VF); 2605 UnitStepVec = Builder.CreateStepVector(VectorType::get(IntStepTy, VF)); 2606 SplatStep = Builder.CreateVectorSplat(VF, Step); 2607 SplatIV = Builder.CreateVectorSplat(VF, ScalarIV); 2608 } 2609 2610 for (unsigned Part = 0; Part < UF; ++Part) { 2611 Value *StartIdx0 = 2612 createStepForVF(Builder, ConstantInt::get(IntStepTy, Part), VF); 2613 2614 if (!IsUniform && VF.isScalable()) { 2615 auto *SplatStartIdx = Builder.CreateVectorSplat(VF, StartIdx0); 2616 auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec); 2617 if (ScalarIVTy->isFloatingPointTy()) 2618 InitVec = Builder.CreateSIToFP(InitVec, VecIVTy); 2619 auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep); 2620 auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul); 2621 State.set(Def, Add, Part); 2622 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State, 2623 Part); 2624 // It's useful to record the lane values too for the known minimum number 2625 // of elements so we do those below. This improves the code quality when 2626 // trying to extract the first element, for example. 2627 } 2628 2629 if (ScalarIVTy->isFloatingPointTy()) 2630 StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy); 2631 2632 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 2633 Value *StartIdx = Builder.CreateBinOp( 2634 AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane)); 2635 // The step returned by `createStepForVF` is a runtime-evaluated value 2636 // when VF is scalable. Otherwise, it should be folded into a Constant. 
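      // For example (a sketch), with UF = 2 and a fixed VF of 4, StartIdx0
      // for Part 1 folds to the constant 4, whereas for VF = vscale x 4 it
      // remains a runtime expression of the form 4 * vscale.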
2637 assert((VF.isScalable() || isa<Constant>(StartIdx)) && 2638 "Expected StartIdx to be folded to a constant when VF is not " 2639 "scalable"); 2640 auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step); 2641 auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul); 2642 State.set(Def, Add, VPIteration(Part, Lane)); 2643 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State, 2644 Part, Lane); 2645 } 2646 } 2647 } 2648 2649 void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def, 2650 const VPIteration &Instance, 2651 VPTransformState &State) { 2652 Value *ScalarInst = State.get(Def, Instance); 2653 Value *VectorValue = State.get(Def, Instance.Part); 2654 VectorValue = Builder.CreateInsertElement( 2655 VectorValue, ScalarInst, 2656 Instance.Lane.getAsRuntimeExpr(State.Builder, VF)); 2657 State.set(Def, VectorValue, Instance.Part); 2658 } 2659 2660 Value *InnerLoopVectorizer::reverseVector(Value *Vec) { 2661 assert(Vec->getType()->isVectorTy() && "Invalid type"); 2662 return Builder.CreateVectorReverse(Vec, "reverse"); 2663 } 2664 2665 // Return whether we allow using masked interleave-groups (for dealing with 2666 // strided loads/stores that reside in predicated blocks, or for dealing 2667 // with gaps). 2668 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { 2669 // If an override option has been passed in for interleaved accesses, use it. 2670 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) 2671 return EnableMaskedInterleavedMemAccesses; 2672 2673 return TTI.enableMaskedInterleavedAccessVectorization(); 2674 } 2675 2676 // Try to vectorize the interleave group that \p Instr belongs to. 2677 // 2678 // E.g. Translate following interleaved load group (factor = 3): 2679 // for (i = 0; i < N; i+=3) { 2680 // R = Pic[i]; // Member of index 0 2681 // G = Pic[i+1]; // Member of index 1 2682 // B = Pic[i+2]; // Member of index 2 2683 // ... // do something to R, G, B 2684 // } 2685 // To: 2686 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2687 // %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements 2688 // %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements 2689 // %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements 2690 // 2691 // Or translate following interleaved store group (factor = 3): 2692 // for (i = 0; i < N; i+=3) { 2693 // ... do something to R, G, B 2694 // Pic[i] = R; // Member of index 0 2695 // Pic[i+1] = G; // Member of index 1 2696 // Pic[i+2] = B; // Member of index 2 2697 // } 2698 // To: 2699 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> 2700 // %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u> 2701 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, 2702 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements 2703 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B 2704 void InnerLoopVectorizer::vectorizeInterleaveGroup( 2705 const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs, 2706 VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues, 2707 VPValue *BlockInMask) { 2708 Instruction *Instr = Group->getInsertPos(); 2709 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2710 2711 // Prepare for the vector type of the interleaved load/store. 
2712 Type *ScalarTy = getLoadStoreType(Instr); 2713 unsigned InterleaveFactor = Group->getFactor(); 2714 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2715 auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor); 2716 2717 // Prepare for the new pointers. 2718 SmallVector<Value *, 2> AddrParts; 2719 unsigned Index = Group->getIndex(Instr); 2720 2721 // TODO: extend the masked interleaved-group support to reversed access. 2722 assert((!BlockInMask || !Group->isReverse()) && 2723 "Reversed masked interleave-group not supported."); 2724 2725 // If the group is reverse, adjust the index to refer to the last vector lane 2726 // instead of the first. We adjust the index from the first vector lane, 2727 // rather than directly getting the pointer for lane VF - 1, because the 2728 // pointer operand of the interleaved access is supposed to be uniform. For 2729 // uniform instructions, we're only required to generate a value for the 2730 // first vector lane in each unroll iteration. 2731 if (Group->isReverse()) 2732 Index += (VF.getKnownMinValue() - 1) * Group->getFactor(); 2733 2734 for (unsigned Part = 0; Part < UF; Part++) { 2735 Value *AddrPart = State.get(Addr, VPIteration(Part, 0)); 2736 setDebugLocFromInst(AddrPart); 2737 2738 // Notice current instruction could be any index. Need to adjust the address 2739 // to the member of index 0. 2740 // 2741 // E.g. a = A[i+1]; // Member of index 1 (Current instruction) 2742 // b = A[i]; // Member of index 0 2743 // Current pointer is pointed to A[i+1], adjust it to A[i]. 2744 // 2745 // E.g. A[i+1] = a; // Member of index 1 2746 // A[i] = b; // Member of index 0 2747 // A[i+2] = c; // Member of index 2 (Current instruction) 2748 // Current pointer is pointed to A[i+2], adjust it to A[i]. 2749 2750 bool InBounds = false; 2751 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts())) 2752 InBounds = gep->isInBounds(); 2753 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index)); 2754 cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds); 2755 2756 // Cast to the vector pointer type. 2757 unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace(); 2758 Type *PtrTy = VecTy->getPointerTo(AddressSpace); 2759 AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy)); 2760 } 2761 2762 setDebugLocFromInst(Instr); 2763 Value *PoisonVec = PoisonValue::get(VecTy); 2764 2765 Value *MaskForGaps = nullptr; 2766 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) { 2767 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); 2768 assert(MaskForGaps && "Mask for Gaps is required but it is null"); 2769 } 2770 2771 // Vectorize the interleaved load group. 2772 if (isa<LoadInst>(Instr)) { 2773 // For each unroll part, create a wide load for the group. 2774 SmallVector<Value *, 2> NewLoads; 2775 for (unsigned Part = 0; Part < UF; Part++) { 2776 Instruction *NewLoad; 2777 if (BlockInMask || MaskForGaps) { 2778 assert(useMaskedInterleavedAccesses(*TTI) && 2779 "masked interleaved groups are not allowed."); 2780 Value *GroupMask = MaskForGaps; 2781 if (BlockInMask) { 2782 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2783 Value *ShuffledMask = Builder.CreateShuffleVector( 2784 BlockInMaskPart, 2785 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2786 "interleaved.mask"); 2787 GroupMask = MaskForGaps 2788 ? 
Builder.CreateBinOp(Instruction::And, ShuffledMask, 2789 MaskForGaps) 2790 : ShuffledMask; 2791 } 2792 NewLoad = 2793 Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(), 2794 GroupMask, PoisonVec, "wide.masked.vec"); 2795 } 2796 else 2797 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], 2798 Group->getAlign(), "wide.vec"); 2799 Group->addMetadata(NewLoad); 2800 NewLoads.push_back(NewLoad); 2801 } 2802 2803 // For each member in the group, shuffle out the appropriate data from the 2804 // wide loads. 2805 unsigned J = 0; 2806 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2807 Instruction *Member = Group->getMember(I); 2808 2809 // Skip the gaps in the group. 2810 if (!Member) 2811 continue; 2812 2813 auto StrideMask = 2814 createStrideMask(I, InterleaveFactor, VF.getKnownMinValue()); 2815 for (unsigned Part = 0; Part < UF; Part++) { 2816 Value *StridedVec = Builder.CreateShuffleVector( 2817 NewLoads[Part], StrideMask, "strided.vec"); 2818 2819 // If this member has different type, cast the result type. 2820 if (Member->getType() != ScalarTy) { 2821 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2822 VectorType *OtherVTy = VectorType::get(Member->getType(), VF); 2823 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); 2824 } 2825 2826 if (Group->isReverse()) 2827 StridedVec = reverseVector(StridedVec); 2828 2829 State.set(VPDefs[J], StridedVec, Part); 2830 } 2831 ++J; 2832 } 2833 return; 2834 } 2835 2836 // The sub vector type for current instruction. 2837 auto *SubVT = VectorType::get(ScalarTy, VF); 2838 2839 // Vectorize the interleaved store group. 2840 for (unsigned Part = 0; Part < UF; Part++) { 2841 // Collect the stored vector from each member. 2842 SmallVector<Value *, 4> StoredVecs; 2843 for (unsigned i = 0; i < InterleaveFactor; i++) { 2844 // Interleaved store group doesn't allow a gap, so each index has a member 2845 assert(Group->getMember(i) && "Fail to get a member from an interleaved store group"); 2846 2847 Value *StoredVec = State.get(StoredValues[i], Part); 2848 2849 if (Group->isReverse()) 2850 StoredVec = reverseVector(StoredVec); 2851 2852 // If this member has different type, cast it to a unified type. 2853 2854 if (StoredVec->getType() != SubVT) 2855 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL); 2856 2857 StoredVecs.push_back(StoredVec); 2858 } 2859 2860 // Concatenate all vectors into a wide vector. 2861 Value *WideVec = concatenateVectors(Builder, StoredVecs); 2862 2863 // Interleave the elements in the wide vector. 2864 Value *IVec = Builder.CreateShuffleVector( 2865 WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor), 2866 "interleaved.vec"); 2867 2868 Instruction *NewStoreInstr; 2869 if (BlockInMask) { 2870 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2871 Value *ShuffledMask = Builder.CreateShuffleVector( 2872 BlockInMaskPart, 2873 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2874 "interleaved.mask"); 2875 NewStoreInstr = Builder.CreateMaskedStore( 2876 IVec, AddrParts[Part], Group->getAlign(), ShuffledMask); 2877 } 2878 else 2879 NewStoreInstr = 2880 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign()); 2881 2882 Group->addMetadata(NewStoreInstr); 2883 } 2884 } 2885 2886 void InnerLoopVectorizer::vectorizeMemoryInstruction( 2887 Instruction *Instr, VPTransformState &State, VPValue *Def, VPValue *Addr, 2888 VPValue *StoredValue, VPValue *BlockInMask) { 2889 // Attempt to issue a wide load. 
2890 LoadInst *LI = dyn_cast<LoadInst>(Instr); 2891 StoreInst *SI = dyn_cast<StoreInst>(Instr); 2892 2893 assert((LI || SI) && "Invalid Load/Store instruction"); 2894 assert((!SI || StoredValue) && "No stored value provided for widened store"); 2895 assert((!LI || !StoredValue) && "Stored value provided for widened load"); 2896 2897 LoopVectorizationCostModel::InstWidening Decision = 2898 Cost->getWideningDecision(Instr, VF); 2899 assert((Decision == LoopVectorizationCostModel::CM_Widen || 2900 Decision == LoopVectorizationCostModel::CM_Widen_Reverse || 2901 Decision == LoopVectorizationCostModel::CM_GatherScatter) && 2902 "CM decision is not to widen the memory instruction"); 2903 2904 Type *ScalarDataTy = getLoadStoreType(Instr); 2905 2906 auto *DataTy = VectorType::get(ScalarDataTy, VF); 2907 const Align Alignment = getLoadStoreAlignment(Instr); 2908 2909 // Determine if the pointer operand of the access is either consecutive or 2910 // reverse consecutive. 2911 bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse); 2912 bool ConsecutiveStride = 2913 Reverse || (Decision == LoopVectorizationCostModel::CM_Widen); 2914 bool CreateGatherScatter = 2915 (Decision == LoopVectorizationCostModel::CM_GatherScatter); 2916 2917 // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector 2918 // gather/scatter. Otherwise Decision should have been to Scalarize. 2919 assert((ConsecutiveStride || CreateGatherScatter) && 2920 "The instruction should be scalarized"); 2921 (void)ConsecutiveStride; 2922 2923 VectorParts BlockInMaskParts(UF); 2924 bool isMaskRequired = BlockInMask; 2925 if (isMaskRequired) 2926 for (unsigned Part = 0; Part < UF; ++Part) 2927 BlockInMaskParts[Part] = State.get(BlockInMask, Part); 2928 2929 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { 2930 // Calculate the pointer for the specific unroll-part. 2931 GetElementPtrInst *PartPtr = nullptr; 2932 2933 bool InBounds = false; 2934 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) 2935 InBounds = gep->isInBounds(); 2936 if (Reverse) { 2937 // If the address is consecutive but reversed, then the 2938 // wide store needs to start at the last vector element. 2939 // RunTimeVF = VScale * VF.getKnownMinValue() 2940 // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue() 2941 Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), VF); 2942 // NumElt = -Part * RunTimeVF 2943 Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF); 2944 // LastLane = 1 - RunTimeVF 2945 Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF); 2946 PartPtr = 2947 cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt)); 2948 PartPtr->setIsInBounds(InBounds); 2949 PartPtr = cast<GetElementPtrInst>( 2950 Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane)); 2951 PartPtr->setIsInBounds(InBounds); 2952 if (isMaskRequired) // Reverse of a null all-one mask is a null mask. 
2953 BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]); 2954 } else { 2955 Value *Increment = createStepForVF(Builder, Builder.getInt32(Part), VF); 2956 PartPtr = cast<GetElementPtrInst>( 2957 Builder.CreateGEP(ScalarDataTy, Ptr, Increment)); 2958 PartPtr->setIsInBounds(InBounds); 2959 } 2960 2961 unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); 2962 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); 2963 }; 2964 2965 // Handle Stores: 2966 if (SI) { 2967 setDebugLocFromInst(SI); 2968 2969 for (unsigned Part = 0; Part < UF; ++Part) { 2970 Instruction *NewSI = nullptr; 2971 Value *StoredVal = State.get(StoredValue, Part); 2972 if (CreateGatherScatter) { 2973 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 2974 Value *VectorGep = State.get(Addr, Part); 2975 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, 2976 MaskPart); 2977 } else { 2978 if (Reverse) { 2979 // If we store to reverse consecutive memory locations, then we need 2980 // to reverse the order of elements in the stored value. 2981 StoredVal = reverseVector(StoredVal); 2982 // We don't want to update the value in the map as it might be used in 2983 // another expression. So don't call resetVectorValue(StoredVal). 2984 } 2985 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0))); 2986 if (isMaskRequired) 2987 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, 2988 BlockInMaskParts[Part]); 2989 else 2990 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); 2991 } 2992 addMetadata(NewSI, SI); 2993 } 2994 return; 2995 } 2996 2997 // Handle loads. 2998 assert(LI && "Must have a load instruction"); 2999 setDebugLocFromInst(LI); 3000 for (unsigned Part = 0; Part < UF; ++Part) { 3001 Value *NewLI; 3002 if (CreateGatherScatter) { 3003 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 3004 Value *VectorGep = State.get(Addr, Part); 3005 NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart, 3006 nullptr, "wide.masked.gather"); 3007 addMetadata(NewLI, LI); 3008 } else { 3009 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0))); 3010 if (isMaskRequired) 3011 NewLI = Builder.CreateMaskedLoad( 3012 DataTy, VecPtr, Alignment, BlockInMaskParts[Part], 3013 PoisonValue::get(DataTy), "wide.masked.load"); 3014 else 3015 NewLI = 3016 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); 3017 3018 // Add metadata to the load, but setVectorValue to the reverse shuffle. 3019 addMetadata(NewLI, LI); 3020 if (Reverse) 3021 NewLI = reverseVector(NewLI); 3022 } 3023 3024 State.set(Def, NewLI, Part); 3025 } 3026 } 3027 3028 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPValue *Def, 3029 VPUser &User, 3030 const VPIteration &Instance, 3031 bool IfPredicateInstr, 3032 VPTransformState &State) { 3033 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 3034 3035 // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for 3036 // the first lane and part. 3037 if (isa<NoAliasScopeDeclInst>(Instr)) 3038 if (!Instance.isFirstIteration()) 3039 return; 3040 3041 setDebugLocFromInst(Instr); 3042 3043 // Does this instruction return a value ? 
3044 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 3045 3046 Instruction *Cloned = Instr->clone(); 3047 if (!IsVoidRetTy) 3048 Cloned->setName(Instr->getName() + ".cloned"); 3049 3050 State.Builder.SetInsertPoint(Builder.GetInsertBlock(), 3051 Builder.GetInsertPoint()); 3052 // Replace the operands of the cloned instructions with their scalar 3053 // equivalents in the new loop. 3054 for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) { 3055 auto *Operand = dyn_cast<Instruction>(Instr->getOperand(op)); 3056 auto InputInstance = Instance; 3057 if (!Operand || !OrigLoop->contains(Operand) || 3058 (Cost->isUniformAfterVectorization(Operand, State.VF))) 3059 InputInstance.Lane = VPLane::getFirstLane(); 3060 auto *NewOp = State.get(User.getOperand(op), InputInstance); 3061 Cloned->setOperand(op, NewOp); 3062 } 3063 addNewMetadata(Cloned, Instr); 3064 3065 // Place the cloned scalar in the new loop. 3066 Builder.Insert(Cloned); 3067 3068 State.set(Def, Cloned, Instance); 3069 3070 // If we just cloned a new assumption, add it the assumption cache. 3071 if (auto *II = dyn_cast<AssumeInst>(Cloned)) 3072 AC->registerAssumption(II); 3073 3074 // End if-block. 3075 if (IfPredicateInstr) 3076 PredicatedInstructions.push_back(Cloned); 3077 } 3078 3079 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start, 3080 Value *End, Value *Step, 3081 Instruction *DL) { 3082 BasicBlock *Header = L->getHeader(); 3083 BasicBlock *Latch = L->getLoopLatch(); 3084 // As we're just creating this loop, it's possible no latch exists 3085 // yet. If so, use the header as this will be a single block loop. 3086 if (!Latch) 3087 Latch = Header; 3088 3089 IRBuilder<> B(&*Header->getFirstInsertionPt()); 3090 Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction); 3091 setDebugLocFromInst(OldInst, &B); 3092 auto *Induction = B.CreatePHI(Start->getType(), 2, "index"); 3093 3094 B.SetInsertPoint(Latch->getTerminator()); 3095 setDebugLocFromInst(OldInst, &B); 3096 3097 // Create i+1 and fill the PHINode. 3098 // 3099 // If the tail is not folded, we know that End - Start >= Step (either 3100 // statically or through the minimum iteration checks). We also know that both 3101 // Start % Step == 0 and End % Step == 0. We exit the vector loop if %IV + 3102 // %Step == %End. Hence we must exit the loop before %IV + %Step unsigned 3103 // overflows and we can mark the induction increment as NUW. 3104 Value *Next = B.CreateAdd(Induction, Step, "index.next", 3105 /*NUW=*/!Cost->foldTailByMasking(), /*NSW=*/false); 3106 Induction->addIncoming(Start, L->getLoopPreheader()); 3107 Induction->addIncoming(Next, Latch); 3108 // Create the compare. 3109 Value *ICmp = B.CreateICmpEQ(Next, End); 3110 B.CreateCondBr(ICmp, L->getUniqueExitBlock(), Header); 3111 3112 // Now we have two terminators. Remove the old one from the block. 3113 Latch->getTerminator()->eraseFromParent(); 3114 3115 return Induction; 3116 } 3117 3118 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { 3119 if (TripCount) 3120 return TripCount; 3121 3122 assert(L && "Create Trip Count for null loop."); 3123 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 3124 // Find the loop boundaries. 
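  // For illustration with a hypothetical loop running i = 0 .. n-1: the
  // backedge-taken count reported by SCEV is n - 1, and the trip count
  // materialized below is that count plus one, i.e. n.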
3125 ScalarEvolution *SE = PSE.getSE(); 3126 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 3127 assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && 3128 "Invalid loop count"); 3129 3130 Type *IdxTy = Legal->getWidestInductionType(); 3131 assert(IdxTy && "No type for induction"); 3132 3133 // The exit count might have the type of i64 while the phi is i32. This can 3134 // happen if we have an induction variable that is sign extended before the 3135 // compare. The only way that we get a backedge taken count is that the 3136 // induction variable was signed and as such will not overflow. In such a case 3137 // truncation is legal. 3138 if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) > 3139 IdxTy->getPrimitiveSizeInBits()) 3140 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy); 3141 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); 3142 3143 // Get the total trip count from the count by adding 1. 3144 const SCEV *ExitCount = SE->getAddExpr( 3145 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 3146 3147 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); 3148 3149 // Expand the trip count and place the new instructions in the preheader. 3150 // Notice that the pre-header does not change, only the loop body. 3151 SCEVExpander Exp(*SE, DL, "induction"); 3152 3153 // Count holds the overall loop count (N). 3154 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), 3155 L->getLoopPreheader()->getTerminator()); 3156 3157 if (TripCount->getType()->isPointerTy()) 3158 TripCount = 3159 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", 3160 L->getLoopPreheader()->getTerminator()); 3161 3162 return TripCount; 3163 } 3164 3165 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { 3166 if (VectorTripCount) 3167 return VectorTripCount; 3168 3169 Value *TC = getOrCreateTripCount(L); 3170 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 3171 3172 Type *Ty = TC->getType(); 3173 // This is where we can make the step a runtime constant. 3174 Value *Step = createStepForVF(Builder, ConstantInt::get(Ty, UF), VF); 3175 3176 // If the tail is to be folded by masking, round the number of iterations N 3177 // up to a multiple of Step instead of rounding down. This is done by first 3178 // adding Step-1 and then rounding down. Note that it's ok if this addition 3179 // overflows: the vector induction variable will eventually wrap to zero given 3180 // that it starts at zero and its Step is a power of two; the loop will then 3181 // exit, with the last early-exit vector comparison also producing all-true. 3182 if (Cost->foldTailByMasking()) { 3183 assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && 3184 "VF*UF must be a power of 2 when folding tail by masking"); 3185 assert(!VF.isScalable() && 3186 "Tail folding not yet supported for scalable vectors"); 3187 TC = Builder.CreateAdd( 3188 TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up"); 3189 } 3190 3191 // Now we need to generate the expression for the part of the loop that the 3192 // vectorized body will execute. This is equal to N - (N % Step) if scalar 3193 // iterations are not required for correctness, or N - Step, otherwise. Step 3194 // is equal to the vectorization factor (number of SIMD elements) times the 3195 // unroll factor (number of SIMD instructions). 
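  // A worked example with illustrative numbers: for N = 100, VF = 4 and
  // UF = 2 (so Step = 8), the code below produces roughly
  //   %n.mod.vf = urem i64 100, 8          ; = 4
  //   %n.vec    = sub i64 100, %n.mod.vf   ; = 96
  // leaving 4 iterations for the scalar remainder. If a scalar epilogue is
  // required and the remainder were 0 (e.g. N = 96), it is bumped to Step so
  // that a full 8 iterations remain for the scalar loop.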
3196 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); 3197 3198 // There are cases where we *must* run at least one iteration in the remainder 3199 // loop. See the cost model for when this can happen. If the step evenly 3200 // divides the trip count, we set the remainder to be equal to the step. If 3201 // the step does not evenly divide the trip count, no adjustment is necessary 3202 // since there will already be scalar iterations. Note that the minimum 3203 // iterations check ensures that N >= Step. 3204 if (Cost->requiresScalarEpilogue(VF)) { 3205 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); 3206 R = Builder.CreateSelect(IsZero, Step, R); 3207 } 3208 3209 VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); 3210 3211 return VectorTripCount; 3212 } 3213 3214 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, 3215 const DataLayout &DL) { 3216 // Verify that V is a vector type with same number of elements as DstVTy. 3217 auto *DstFVTy = cast<FixedVectorType>(DstVTy); 3218 unsigned VF = DstFVTy->getNumElements(); 3219 auto *SrcVecTy = cast<FixedVectorType>(V->getType()); 3220 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); 3221 Type *SrcElemTy = SrcVecTy->getElementType(); 3222 Type *DstElemTy = DstFVTy->getElementType(); 3223 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 3224 "Vector elements must have same size"); 3225 3226 // Do a direct cast if element types are castable. 3227 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 3228 return Builder.CreateBitOrPointerCast(V, DstFVTy); 3229 } 3230 // V cannot be directly casted to desired vector type. 3231 // May happen when V is a floating point vector but DstVTy is a vector of 3232 // pointers or vice-versa. Handle this using a two-step bitcast using an 3233 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 3234 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 3235 "Only one type should be a pointer type"); 3236 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 3237 "Only one type should be a floating point type"); 3238 Type *IntTy = 3239 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 3240 auto *VecIntTy = FixedVectorType::get(IntTy, VF); 3241 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 3242 return Builder.CreateBitOrPointerCast(CastVal, DstFVTy); 3243 } 3244 3245 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, 3246 BasicBlock *Bypass) { 3247 Value *Count = getOrCreateTripCount(L); 3248 // Reuse existing vector loop preheader for TC checks. 3249 // Note that new preheader block is generated for vector loop. 3250 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 3251 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 3252 3253 // Generate code to check if the loop's trip count is less than VF * UF, or 3254 // equal to it in case a scalar epilogue is required; this implies that the 3255 // vector trip count is zero. This check also covers the case where adding one 3256 // to the backedge-taken count overflowed leading to an incorrect trip count 3257 // of zero. In this case we will also jump to the scalar loop. 3258 auto P = Cost->requiresScalarEpilogue(VF) ? ICmpInst::ICMP_ULE 3259 : ICmpInst::ICMP_ULT; 3260 3261 // If tail is to be folded, vector loop takes care of all iterations. 
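  // A sketch of the guard emitted when the tail is not folded (illustrative
  // names; assuming VF = 4, UF = 2 and a required scalar epilogue, hence
  // ICMP_ULE):
  //   %min.iters.check = icmp ule i64 %count, 8
  //   br i1 %min.iters.check, label %scalar.ph, label %vector.ph
  // When the tail is folded, the condition below simply stays 'false'.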
3262 Value *CheckMinIters = Builder.getFalse(); 3263 if (!Cost->foldTailByMasking()) { 3264 Value *Step = 3265 createStepForVF(Builder, ConstantInt::get(Count->getType(), UF), VF); 3266 CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check"); 3267 } 3268 // Create new preheader for vector loop. 3269 LoopVectorPreHeader = 3270 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, 3271 "vector.ph"); 3272 3273 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 3274 DT->getNode(Bypass)->getIDom()) && 3275 "TC check is expected to dominate Bypass"); 3276 3277 // Update dominator for Bypass & LoopExit (if needed). 3278 DT->changeImmediateDominator(Bypass, TCCheckBlock); 3279 if (!Cost->requiresScalarEpilogue(VF)) 3280 // If there is an epilogue which must run, there's no edge from the 3281 // middle block to exit blocks and thus no need to update the immediate 3282 // dominator of the exit blocks. 3283 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 3284 3285 ReplaceInstWithInst( 3286 TCCheckBlock->getTerminator(), 3287 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 3288 LoopBypassBlocks.push_back(TCCheckBlock); 3289 } 3290 3291 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { 3292 3293 BasicBlock *const SCEVCheckBlock = 3294 RTChecks.emitSCEVChecks(L, Bypass, LoopVectorPreHeader, LoopExitBlock); 3295 if (!SCEVCheckBlock) 3296 return nullptr; 3297 3298 assert(!(SCEVCheckBlock->getParent()->hasOptSize() || 3299 (OptForSizeBasedOnProfile && 3300 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) && 3301 "Cannot SCEV check stride or overflow when optimizing for size"); 3302 3303 3304 // Update dominator only if this is first RT check. 3305 if (LoopBypassBlocks.empty()) { 3306 DT->changeImmediateDominator(Bypass, SCEVCheckBlock); 3307 if (!Cost->requiresScalarEpilogue(VF)) 3308 // If there is an epilogue which must run, there's no edge from the 3309 // middle block to exit blocks and thus no need to update the immediate 3310 // dominator of the exit blocks. 3311 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock); 3312 } 3313 3314 LoopBypassBlocks.push_back(SCEVCheckBlock); 3315 AddedSafetyChecks = true; 3316 return SCEVCheckBlock; 3317 } 3318 3319 BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, 3320 BasicBlock *Bypass) { 3321 // VPlan-native path does not do any analysis for runtime checks currently. 3322 if (EnableVPlanNativePath) 3323 return nullptr; 3324 3325 BasicBlock *const MemCheckBlock = 3326 RTChecks.emitMemRuntimeChecks(L, Bypass, LoopVectorPreHeader); 3327 3328 // Check if we generated code that checks in runtime if arrays overlap. We put 3329 // the checks into a separate block to make the more common case of few 3330 // elements faster. 
3331 if (!MemCheckBlock) 3332 return nullptr; 3333 3334 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) { 3335 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && 3336 "Cannot emit memory checks when optimizing for size, unless forced " 3337 "to vectorize."); 3338 ORE->emit([&]() { 3339 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize", 3340 L->getStartLoc(), L->getHeader()) 3341 << "Code-size may be reduced by not forcing " 3342 "vectorization, or by source-code modifications " 3343 "eliminating the need for runtime checks " 3344 "(e.g., adding 'restrict')."; 3345 }); 3346 } 3347 3348 LoopBypassBlocks.push_back(MemCheckBlock); 3349 3350 AddedSafetyChecks = true; 3351 3352 // We currently don't use LoopVersioning for the actual loop cloning but we 3353 // still use it to add the noalias metadata. 3354 LVer = std::make_unique<LoopVersioning>( 3355 *Legal->getLAI(), 3356 Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, 3357 DT, PSE.getSE()); 3358 LVer->prepareNoAliasMetadata(); 3359 return MemCheckBlock; 3360 } 3361 3362 Value *InnerLoopVectorizer::emitTransformedIndex( 3363 IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL, 3364 const InductionDescriptor &ID) const { 3365 3366 SCEVExpander Exp(*SE, DL, "induction"); 3367 auto Step = ID.getStep(); 3368 auto StartValue = ID.getStartValue(); 3369 assert(Index->getType()->getScalarType() == Step->getType() && 3370 "Index scalar type does not match StepValue type"); 3371 3372 // Note: the IR at this point is broken. We cannot use SE to create any new 3373 // SCEV and then expand it, hoping that SCEV's simplification will give us 3374 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may 3375 // lead to various SCEV crashes. So all we can do is to use builder and rely 3376 // on InstCombine for future simplifications. Here we handle some trivial 3377 // cases only. 3378 auto CreateAdd = [&B](Value *X, Value *Y) { 3379 assert(X->getType() == Y->getType() && "Types don't match!"); 3380 if (auto *CX = dyn_cast<ConstantInt>(X)) 3381 if (CX->isZero()) 3382 return Y; 3383 if (auto *CY = dyn_cast<ConstantInt>(Y)) 3384 if (CY->isZero()) 3385 return X; 3386 return B.CreateAdd(X, Y); 3387 }; 3388 3389 // We allow X to be a vector type, in which case Y will potentially be 3390 // splatted into a vector with the same element count. 3391 auto CreateMul = [&B](Value *X, Value *Y) { 3392 assert(X->getType()->getScalarType() == Y->getType() && 3393 "Types don't match!"); 3394 if (auto *CX = dyn_cast<ConstantInt>(X)) 3395 if (CX->isOne()) 3396 return Y; 3397 if (auto *CY = dyn_cast<ConstantInt>(Y)) 3398 if (CY->isOne()) 3399 return X; 3400 VectorType *XVTy = dyn_cast<VectorType>(X->getType()); 3401 if (XVTy && !isa<VectorType>(Y->getType())) 3402 Y = B.CreateVectorSplat(XVTy->getElementCount(), Y); 3403 return B.CreateMul(X, Y); 3404 }; 3405 3406 // Get a suitable insert point for SCEV expansion. For blocks in the vector 3407 // loop, choose the end of the vector loop header (=LoopVectorBody), because 3408 // the DomTree is not kept up-to-date for additional blocks generated in the 3409 // vector loop. By using the header as insertion point, we guarantee that the 3410 // expanded instructions dominate all their uses. 
3411 auto GetInsertPoint = [this, &B]() { 3412 BasicBlock *InsertBB = B.GetInsertPoint()->getParent(); 3413 if (InsertBB != LoopVectorBody && 3414 LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB)) 3415 return LoopVectorBody->getTerminator(); 3416 return &*B.GetInsertPoint(); 3417 }; 3418 3419 switch (ID.getKind()) { 3420 case InductionDescriptor::IK_IntInduction: { 3421 assert(!isa<VectorType>(Index->getType()) && 3422 "Vector indices not supported for integer inductions yet"); 3423 assert(Index->getType() == StartValue->getType() && 3424 "Index type does not match StartValue type"); 3425 if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne()) 3426 return B.CreateSub(StartValue, Index); 3427 auto *Offset = CreateMul( 3428 Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint())); 3429 return CreateAdd(StartValue, Offset); 3430 } 3431 case InductionDescriptor::IK_PtrInduction: { 3432 assert(isa<SCEVConstant>(Step) && 3433 "Expected constant step for pointer induction"); 3434 return B.CreateGEP( 3435 StartValue->getType()->getPointerElementType(), StartValue, 3436 CreateMul(Index, 3437 Exp.expandCodeFor(Step, Index->getType()->getScalarType(), 3438 GetInsertPoint()))); 3439 } 3440 case InductionDescriptor::IK_FpInduction: { 3441 assert(!isa<VectorType>(Index->getType()) && 3442 "Vector indices not supported for FP inductions yet"); 3443 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); 3444 auto InductionBinOp = ID.getInductionBinOp(); 3445 assert(InductionBinOp && 3446 (InductionBinOp->getOpcode() == Instruction::FAdd || 3447 InductionBinOp->getOpcode() == Instruction::FSub) && 3448 "Original bin op should be defined for FP induction"); 3449 3450 Value *StepValue = cast<SCEVUnknown>(Step)->getValue(); 3451 Value *MulExp = B.CreateFMul(StepValue, Index); 3452 return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, 3453 "induction"); 3454 } 3455 case InductionDescriptor::IK_NoInduction: 3456 return nullptr; 3457 } 3458 llvm_unreachable("invalid enum"); 3459 } 3460 3461 Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { 3462 LoopScalarBody = OrigLoop->getHeader(); 3463 LoopVectorPreHeader = OrigLoop->getLoopPreheader(); 3464 assert(LoopVectorPreHeader && "Invalid loop structure"); 3465 LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr 3466 assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF)) && 3467 "multiple exit loop without required epilogue?"); 3468 3469 LoopMiddleBlock = 3470 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 3471 LI, nullptr, Twine(Prefix) + "middle.block"); 3472 LoopScalarPreHeader = 3473 SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI, 3474 nullptr, Twine(Prefix) + "scalar.ph"); 3475 3476 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3477 3478 // Set up the middle block terminator. Two cases: 3479 // 1) If we know that we must execute the scalar epilogue, emit an 3480 // unconditional branch. 3481 // 2) Otherwise, we must have a single unique exit block (due to how we 3482 // implement the multiple exit case). In this case, set up a conditonal 3483 // branch from the middle block to the loop scalar preheader, and the 3484 // exit block. completeLoopSkeleton will update the condition to use an 3485 // iteration check, if required to decide whether to execute the remainder. 3486 BranchInst *BrInst = Cost->requiresScalarEpilogue(VF) ? 
      BranchInst::Create(LoopScalarPreHeader) :
      BranchInst::Create(LoopExitBlock, LoopScalarPreHeader,
                         Builder.getTrue());
  BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
  ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);

  // We intentionally don't let SplitBlock update LoopInfo since
  // LoopVectorBody should belong to a different loop than LoopVectorPreHeader.
  // LoopVectorBody is explicitly added to the correct place a few lines later.
  LoopVectorBody =
      SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
                 nullptr, nullptr, Twine(Prefix) + "vector.body");

  // Update dominator for loop exit.
  if (!Cost->requiresScalarEpilogue(VF))
    // If there is an epilogue which must run, there's no edge from the
    // middle block to exit blocks and thus no need to update the immediate
    // dominator of the exit blocks.
    DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);

  // Create and register the new vector loop.
  Loop *Lp = LI->AllocateLoop();
  Loop *ParentLoop = OrigLoop->getParentLoop();

  // Insert the new loop into the loop nest and register the new basic blocks
  // before calling any utilities such as SCEV that require valid LoopInfo.
  if (ParentLoop) {
    ParentLoop->addChildLoop(Lp);
  } else {
    LI->addTopLevelLoop(Lp);
  }
  Lp->addBasicBlockToLoop(LoopVectorBody, *LI);
  return Lp;
}

void InnerLoopVectorizer::createInductionResumeValues(
    Loop *L, Value *VectorTripCount,
    std::pair<BasicBlock *, Value *> AdditionalBypass) {
  assert(VectorTripCount && L && "Expected valid arguments");
  assert(((AdditionalBypass.first && AdditionalBypass.second) ||
          (!AdditionalBypass.first && !AdditionalBypass.second)) &&
         "Inconsistent information about additional bypass.");
  // We are going to resume the execution of the scalar loop.
  // Go over all of the induction variables that we found and fix the
  // PHIs that are left in the scalar version of the loop.
  // The starting values of PHI nodes depend on the counter of the last
  // iteration in the vectorized loop.
  // If we come from a bypass edge then we need to start from the original
  // start value.
  for (auto &InductionEntry : Legal->getInductionVars()) {
    PHINode *OrigPhi = InductionEntry.first;
    InductionDescriptor II = InductionEntry.second;

    // Create phi nodes to merge from the backedge-taken check block.
    PHINode *BCResumeVal =
        PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
                        LoopScalarPreHeader->getTerminator());
    // Copy original phi DL over to the new one.
    BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
    Value *&EndValue = IVEndValues[OrigPhi];
    Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
    if (OrigPhi == OldInduction) {
      // We know what the end value is.
      EndValue = VectorTripCount;
    } else {
      IRBuilder<> B(L->getLoopPreheader()->getTerminator());

      // Fast-math-flags propagate from the original induction instruction.
3555 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) 3556 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); 3557 3558 Type *StepType = II.getStep()->getType(); 3559 Instruction::CastOps CastOp = 3560 CastInst::getCastOpcode(VectorTripCount, true, StepType, true); 3561 Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd"); 3562 const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout(); 3563 EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3564 EndValue->setName("ind.end"); 3565 3566 // Compute the end value for the additional bypass (if applicable). 3567 if (AdditionalBypass.first) { 3568 B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt())); 3569 CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true, 3570 StepType, true); 3571 CRD = 3572 B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd"); 3573 EndValueFromAdditionalBypass = 3574 emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3575 EndValueFromAdditionalBypass->setName("ind.end"); 3576 } 3577 } 3578 // The new PHI merges the original incoming value, in case of a bypass, 3579 // or the value at the end of the vectorized loop. 3580 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); 3581 3582 // Fix the scalar body counter (PHI node). 3583 // The old induction's phi node in the scalar body needs the truncated 3584 // value. 3585 for (BasicBlock *BB : LoopBypassBlocks) 3586 BCResumeVal->addIncoming(II.getStartValue(), BB); 3587 3588 if (AdditionalBypass.first) 3589 BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first, 3590 EndValueFromAdditionalBypass); 3591 3592 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); 3593 } 3594 } 3595 3596 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L, 3597 MDNode *OrigLoopID) { 3598 assert(L && "Expected valid loop."); 3599 3600 // The trip counts should be cached by now. 3601 Value *Count = getOrCreateTripCount(L); 3602 Value *VectorTripCount = getOrCreateVectorTripCount(L); 3603 3604 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3605 3606 // Add a check in the middle block to see if we have completed 3607 // all of the iterations in the first vector loop. Three cases: 3608 // 1) If we require a scalar epilogue, there is no conditional branch as 3609 // we unconditionally branch to the scalar preheader. Do nothing. 3610 // 2) If (N - N%VF) == N, then we *don't* need to run the remainder. 3611 // Thus if tail is to be folded, we know we don't need to run the 3612 // remainder and we can use the previous value for the condition (true). 3613 // 3) Otherwise, construct a runtime check. 3614 if (!Cost->requiresScalarEpilogue(VF) && !Cost->foldTailByMasking()) { 3615 Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, 3616 Count, VectorTripCount, "cmp.n", 3617 LoopMiddleBlock->getTerminator()); 3618 3619 // Here we use the same DebugLoc as the scalar loop latch terminator instead 3620 // of the corresponding compare because they may have ended up with 3621 // different line numbers and we want to avoid awkward line stepping while 3622 // debugging. Eg. if the compare has got a line number inside the loop. 3623 CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3624 cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN); 3625 } 3626 3627 // Get ready to start creating new instructions into the vectorized body. 
3628 assert(LoopVectorPreHeader == L->getLoopPreheader() && 3629 "Inconsistent vector loop preheader"); 3630 Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); 3631 3632 Optional<MDNode *> VectorizedLoopID = 3633 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 3634 LLVMLoopVectorizeFollowupVectorized}); 3635 if (VectorizedLoopID.hasValue()) { 3636 L->setLoopID(VectorizedLoopID.getValue()); 3637 3638 // Do not setAlreadyVectorized if loop attributes have been defined 3639 // explicitly. 3640 return LoopVectorPreHeader; 3641 } 3642 3643 // Keep all loop hints from the original loop on the vector loop (we'll 3644 // replace the vectorizer-specific hints below). 3645 if (MDNode *LID = OrigLoop->getLoopID()) 3646 L->setLoopID(LID); 3647 3648 LoopVectorizeHints Hints(L, true, *ORE); 3649 Hints.setAlreadyVectorized(); 3650 3651 #ifdef EXPENSIVE_CHECKS 3652 assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 3653 LI->verify(*DT); 3654 #endif 3655 3656 return LoopVectorPreHeader; 3657 } 3658 3659 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { 3660 /* 3661 In this function we generate a new loop. The new loop will contain 3662 the vectorized instructions while the old loop will continue to run the 3663 scalar remainder. 3664 3665 [ ] <-- loop iteration number check. 3666 / | 3667 / v 3668 | [ ] <-- vector loop bypass (may consist of multiple blocks). 3669 | / | 3670 | / v 3671 || [ ] <-- vector pre header. 3672 |/ | 3673 | v 3674 | [ ] \ 3675 | [ ]_| <-- vector loop. 3676 | | 3677 | v 3678 \ -[ ] <--- middle-block. 3679 \/ | 3680 /\ v 3681 | ->[ ] <--- new preheader. 3682 | | 3683 (opt) v <-- edge from middle to exit iff epilogue is not required. 3684 | [ ] \ 3685 | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue). 3686 \ | 3687 \ v 3688 >[ ] <-- exit block(s). 3689 ... 3690 */ 3691 3692 // Get the metadata of the original loop before it gets modified. 3693 MDNode *OrigLoopID = OrigLoop->getLoopID(); 3694 3695 // Workaround! Compute the trip count of the original loop and cache it 3696 // before we start modifying the CFG. This code has a systemic problem 3697 // wherein it tries to run analysis over partially constructed IR; this is 3698 // wrong, and not simply for SCEV. The trip count of the original loop 3699 // simply happens to be prone to hitting this in practice. In theory, we 3700 // can hit the same issue for any SCEV, or ValueTracking query done during 3701 // mutation. See PR49900. 3702 getOrCreateTripCount(OrigLoop); 3703 3704 // Create an empty vector loop, and prepare basic blocks for the runtime 3705 // checks. 3706 Loop *Lp = createVectorLoopSkeleton(""); 3707 3708 // Now, compare the new count to zero. If it is zero skip the vector loop and 3709 // jump to the scalar loop. This check also covers the case where the 3710 // backedge-taken count is uint##_max: adding one to it will overflow leading 3711 // to an incorrect trip count of zero. In this (rare) case we will also jump 3712 // to the scalar loop. 3713 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader); 3714 3715 // Generate the code to check any assumptions that we've made for SCEV 3716 // expressions. 3717 emitSCEVChecks(Lp, LoopScalarPreHeader); 3718 3719 // Generate the code that checks in runtime if arrays overlap. We put the 3720 // checks into a separate block to make the more common case of few elements 3721 // faster. 
  emitMemRuntimeChecks(Lp, LoopScalarPreHeader);

  // Some loops have a single integer induction variable, while other loops
  // don't. One example is C++ iterators that often have multiple pointer
  // induction variables. In the code below we also support a case where we
  // don't have a single induction variable.
  //
  // We try to obtain an induction variable from the original loop as hard
  // as possible. However, if we don't find one that:
  //   - is an integer
  //   - counts from zero, stepping by one
  //   - is the size of the widest induction variable type
  // then we create a new one.
  OldInduction = Legal->getPrimaryInduction();
  Type *IdxTy = Legal->getWidestInductionType();
  Value *StartIdx = ConstantInt::get(IdxTy, 0);
  // The loop step is equal to the vectorization factor (num of SIMD elements)
  // times the unroll factor (num of SIMD instructions).
  Builder.SetInsertPoint(&*Lp->getHeader()->getFirstInsertionPt());
  Value *Step = createStepForVF(Builder, ConstantInt::get(IdxTy, UF), VF);
  Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
  Induction =
      createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
                              getDebugLocFromInstOrOperands(OldInduction));

  // Emit phis for the new starting index of the scalar loop.
  createInductionResumeValues(Lp, CountRoundDown);

  return completeLoopSkeleton(Lp, OrigLoopID);
}

// Fix up external users of the induction variable. At this point, we are
// in LCSSA form, with all external PHIs that use the IV having one input value,
// coming from the remainder loop. We need those PHIs to also have a correct
// value for the IV when arriving directly from the middle block.
void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
                                       const InductionDescriptor &II,
                                       Value *CountRoundDown, Value *EndValue,
                                       BasicBlock *MiddleBlock) {
  // There are two kinds of external IV usages - those that use the value
  // computed in the last iteration (the PHI) and those that use the
  // penultimate value (the value that feeds into the phi from the loop latch).
  // We allow both, but they, obviously, have different values.

  assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");

  DenseMap<Value *, Value *> MissingVals;

  // An external user of the last iteration's value should see the value that
  // the remainder loop uses to initialize its own IV.
  Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
  for (User *U : PostInc->users()) {
    Instruction *UI = cast<Instruction>(U);
    if (!OrigLoop->contains(UI)) {
      assert(isa<PHINode>(UI) && "Expected LCSSA form");
      MissingVals[UI] = EndValue;
    }
  }

  // An external user of the penultimate value needs to see EndValue - Step.
  // The simplest way to get this is to recompute it from the constituent
  // SCEVs, that is Start + (Step * (CRD - 1)).
  for (User *U : OrigPhi->users()) {
    auto *UI = cast<Instruction>(U);
    if (!OrigLoop->contains(UI)) {
      const DataLayout &DL =
          OrigLoop->getHeader()->getModule()->getDataLayout();
      assert(isa<PHINode>(UI) && "Expected LCSSA form");

      IRBuilder<> B(MiddleBlock->getTerminator());

      // Fast-math-flags propagate from the original induction instruction.
3794 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) 3795 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); 3796 3797 Value *CountMinusOne = B.CreateSub( 3798 CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1)); 3799 Value *CMO = 3800 !II.getStep()->getType()->isIntegerTy() 3801 ? B.CreateCast(Instruction::SIToFP, CountMinusOne, 3802 II.getStep()->getType()) 3803 : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType()); 3804 CMO->setName("cast.cmo"); 3805 Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II); 3806 Escape->setName("ind.escape"); 3807 MissingVals[UI] = Escape; 3808 } 3809 } 3810 3811 for (auto &I : MissingVals) { 3812 PHINode *PHI = cast<PHINode>(I.first); 3813 // One corner case we have to handle is two IVs "chasing" each-other, 3814 // that is %IV2 = phi [...], [ %IV1, %latch ] 3815 // In this case, if IV1 has an external use, we need to avoid adding both 3816 // "last value of IV1" and "penultimate value of IV2". So, verify that we 3817 // don't already have an incoming value for the middle block. 3818 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) 3819 PHI->addIncoming(I.second, MiddleBlock); 3820 } 3821 } 3822 3823 namespace { 3824 3825 struct CSEDenseMapInfo { 3826 static bool canHandle(const Instruction *I) { 3827 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || 3828 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); 3829 } 3830 3831 static inline Instruction *getEmptyKey() { 3832 return DenseMapInfo<Instruction *>::getEmptyKey(); 3833 } 3834 3835 static inline Instruction *getTombstoneKey() { 3836 return DenseMapInfo<Instruction *>::getTombstoneKey(); 3837 } 3838 3839 static unsigned getHashValue(const Instruction *I) { 3840 assert(canHandle(I) && "Unknown instruction!"); 3841 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), 3842 I->value_op_end())); 3843 } 3844 3845 static bool isEqual(const Instruction *LHS, const Instruction *RHS) { 3846 if (LHS == getEmptyKey() || RHS == getEmptyKey() || 3847 LHS == getTombstoneKey() || RHS == getTombstoneKey()) 3848 return LHS == RHS; 3849 return LHS->isIdenticalTo(RHS); 3850 } 3851 }; 3852 3853 } // end anonymous namespace 3854 3855 ///Perform cse of induction variable instructions. 3856 static void cse(BasicBlock *BB) { 3857 // Perform simple cse. 3858 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 3859 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) { 3860 Instruction *In = &*I++; 3861 3862 if (!CSEDenseMapInfo::canHandle(In)) 3863 continue; 3864 3865 // Check if we can replace this instruction with any of the 3866 // visited instructions. 3867 if (Instruction *V = CSEMap.lookup(In)) { 3868 In->replaceAllUsesWith(V); 3869 In->eraseFromParent(); 3870 continue; 3871 } 3872 3873 CSEMap[In] = In; 3874 } 3875 } 3876 3877 InstructionCost 3878 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF, 3879 bool &NeedToScalarize) const { 3880 Function *F = CI->getCalledFunction(); 3881 Type *ScalarRetTy = CI->getType(); 3882 SmallVector<Type *, 4> Tys, ScalarTys; 3883 for (auto &ArgOp : CI->arg_operands()) 3884 ScalarTys.push_back(ArgOp->getType()); 3885 3886 // Estimate cost of scalarized vector call. The source operands are assumed 3887 // to be vectors, so we need to extract individual elements from there, 3888 // execute VF scalar calls, and then gather the result into the vector return 3889 // value. 
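  // A rough illustration with made-up costs (not taken from any TTI table):
  // for VF = 4, a scalar call cost of 10 and a scalarization overhead of 6,
  // the scalarized estimate computed below is 4 * 10 + 6 = 46; if a vector
  // library variant costs, say, 18, it is cheaper and NeedToScalarize is
  // cleared.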
3890 InstructionCost ScalarCallCost = 3891 TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput); 3892 if (VF.isScalar()) 3893 return ScalarCallCost; 3894 3895 // Compute corresponding vector type for return value and arguments. 3896 Type *RetTy = ToVectorTy(ScalarRetTy, VF); 3897 for (Type *ScalarTy : ScalarTys) 3898 Tys.push_back(ToVectorTy(ScalarTy, VF)); 3899 3900 // Compute costs of unpacking argument values for the scalar calls and 3901 // packing the return values to a vector. 3902 InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF); 3903 3904 InstructionCost Cost = 3905 ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost; 3906 3907 // If we can't emit a vector call for this function, then the currently found 3908 // cost is the cost we need to return. 3909 NeedToScalarize = true; 3910 VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 3911 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); 3912 3913 if (!TLI || CI->isNoBuiltin() || !VecFunc) 3914 return Cost; 3915 3916 // If the corresponding vector cost is cheaper, return its cost. 3917 InstructionCost VectorCallCost = 3918 TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput); 3919 if (VectorCallCost < Cost) { 3920 NeedToScalarize = false; 3921 Cost = VectorCallCost; 3922 } 3923 return Cost; 3924 } 3925 3926 static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) { 3927 if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy())) 3928 return Elt; 3929 return VectorType::get(Elt, VF); 3930 } 3931 3932 InstructionCost 3933 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, 3934 ElementCount VF) const { 3935 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 3936 assert(ID && "Expected intrinsic call!"); 3937 Type *RetTy = MaybeVectorizeType(CI->getType(), VF); 3938 FastMathFlags FMF; 3939 if (auto *FPMO = dyn_cast<FPMathOperator>(CI)) 3940 FMF = FPMO->getFastMathFlags(); 3941 3942 SmallVector<const Value *> Arguments(CI->arg_begin(), CI->arg_end()); 3943 FunctionType *FTy = CI->getCalledFunction()->getFunctionType(); 3944 SmallVector<Type *> ParamTys; 3945 std::transform(FTy->param_begin(), FTy->param_end(), 3946 std::back_inserter(ParamTys), 3947 [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); }); 3948 3949 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF, 3950 dyn_cast<IntrinsicInst>(CI)); 3951 return TTI.getIntrinsicInstrCost(CostAttrs, 3952 TargetTransformInfo::TCK_RecipThroughput); 3953 } 3954 3955 static Type *smallestIntegerVectorType(Type *T1, Type *T2) { 3956 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3957 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3958 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; 3959 } 3960 3961 static Type *largestIntegerVectorType(Type *T1, Type *T2) { 3962 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3963 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3964 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; 3965 } 3966 3967 void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) { 3968 // For every instruction `I` in MinBWs, truncate the operands, create a 3969 // truncated version of `I` and reextend its result. InstCombine runs 3970 // later and will remove any ext/trunc pairs. 
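  // A sketch of the rewrite for one widened add whose minimal bit width is 8
  // (illustrative value names, VF = 4):
  //   before:  %a = add <4 x i32> %x, %y
  //   after:   %x.t = trunc <4 x i32> %x to <4 x i8>
  //            %y.t = trunc <4 x i32> %y to <4 x i8>
  //            %a.t = add <4 x i8> %x.t, %y.t
  //            %a   = zext <4 x i8> %a.t to <4 x i32>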
3971 SmallPtrSet<Value *, 4> Erased; 3972 for (const auto &KV : Cost->getMinimalBitwidths()) { 3973 // If the value wasn't vectorized, we must maintain the original scalar 3974 // type. The absence of the value from State indicates that it 3975 // wasn't vectorized. 3976 VPValue *Def = State.Plan->getVPValue(KV.first); 3977 if (!State.hasAnyVectorValue(Def)) 3978 continue; 3979 for (unsigned Part = 0; Part < UF; ++Part) { 3980 Value *I = State.get(Def, Part); 3981 if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I)) 3982 continue; 3983 Type *OriginalTy = I->getType(); 3984 Type *ScalarTruncatedTy = 3985 IntegerType::get(OriginalTy->getContext(), KV.second); 3986 auto *TruncatedTy = VectorType::get( 3987 ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getElementCount()); 3988 if (TruncatedTy == OriginalTy) 3989 continue; 3990 3991 IRBuilder<> B(cast<Instruction>(I)); 3992 auto ShrinkOperand = [&](Value *V) -> Value * { 3993 if (auto *ZI = dyn_cast<ZExtInst>(V)) 3994 if (ZI->getSrcTy() == TruncatedTy) 3995 return ZI->getOperand(0); 3996 return B.CreateZExtOrTrunc(V, TruncatedTy); 3997 }; 3998 3999 // The actual instruction modification depends on the instruction type, 4000 // unfortunately. 4001 Value *NewI = nullptr; 4002 if (auto *BO = dyn_cast<BinaryOperator>(I)) { 4003 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)), 4004 ShrinkOperand(BO->getOperand(1))); 4005 4006 // Any wrapping introduced by shrinking this operation shouldn't be 4007 // considered undefined behavior. So, we can't unconditionally copy 4008 // arithmetic wrapping flags to NewI. 4009 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false); 4010 } else if (auto *CI = dyn_cast<ICmpInst>(I)) { 4011 NewI = 4012 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)), 4013 ShrinkOperand(CI->getOperand(1))); 4014 } else if (auto *SI = dyn_cast<SelectInst>(I)) { 4015 NewI = B.CreateSelect(SI->getCondition(), 4016 ShrinkOperand(SI->getTrueValue()), 4017 ShrinkOperand(SI->getFalseValue())); 4018 } else if (auto *CI = dyn_cast<CastInst>(I)) { 4019 switch (CI->getOpcode()) { 4020 default: 4021 llvm_unreachable("Unhandled cast!"); 4022 case Instruction::Trunc: 4023 NewI = ShrinkOperand(CI->getOperand(0)); 4024 break; 4025 case Instruction::SExt: 4026 NewI = B.CreateSExtOrTrunc( 4027 CI->getOperand(0), 4028 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 4029 break; 4030 case Instruction::ZExt: 4031 NewI = B.CreateZExtOrTrunc( 4032 CI->getOperand(0), 4033 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 4034 break; 4035 } 4036 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { 4037 auto Elements0 = 4038 cast<VectorType>(SI->getOperand(0)->getType())->getElementCount(); 4039 auto *O0 = B.CreateZExtOrTrunc( 4040 SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0)); 4041 auto Elements1 = 4042 cast<VectorType>(SI->getOperand(1)->getType())->getElementCount(); 4043 auto *O1 = B.CreateZExtOrTrunc( 4044 SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1)); 4045 4046 NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask()); 4047 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) { 4048 // Don't do anything with the operands, just extend the result. 
4049 continue; 4050 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { 4051 auto Elements = 4052 cast<VectorType>(IE->getOperand(0)->getType())->getElementCount(); 4053 auto *O0 = B.CreateZExtOrTrunc( 4054 IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 4055 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); 4056 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); 4057 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { 4058 auto Elements = 4059 cast<VectorType>(EE->getOperand(0)->getType())->getElementCount(); 4060 auto *O0 = B.CreateZExtOrTrunc( 4061 EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 4062 NewI = B.CreateExtractElement(O0, EE->getOperand(2)); 4063 } else { 4064 // If we don't know what to do, be conservative and don't do anything. 4065 continue; 4066 } 4067 4068 // Lastly, extend the result. 4069 NewI->takeName(cast<Instruction>(I)); 4070 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); 4071 I->replaceAllUsesWith(Res); 4072 cast<Instruction>(I)->eraseFromParent(); 4073 Erased.insert(I); 4074 State.reset(Def, Res, Part); 4075 } 4076 } 4077 4078 // We'll have created a bunch of ZExts that are now parentless. Clean up. 4079 for (const auto &KV : Cost->getMinimalBitwidths()) { 4080 // If the value wasn't vectorized, we must maintain the original scalar 4081 // type. The absence of the value from State indicates that it 4082 // wasn't vectorized. 4083 VPValue *Def = State.Plan->getVPValue(KV.first); 4084 if (!State.hasAnyVectorValue(Def)) 4085 continue; 4086 for (unsigned Part = 0; Part < UF; ++Part) { 4087 Value *I = State.get(Def, Part); 4088 ZExtInst *Inst = dyn_cast<ZExtInst>(I); 4089 if (Inst && Inst->use_empty()) { 4090 Value *NewI = Inst->getOperand(0); 4091 Inst->eraseFromParent(); 4092 State.reset(Def, NewI, Part); 4093 } 4094 } 4095 } 4096 } 4097 4098 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) { 4099 // Insert truncates and extends for any truncated instructions as hints to 4100 // InstCombine. 4101 if (VF.isVector()) 4102 truncateToMinimalBitwidths(State); 4103 4104 // Fix widened non-induction PHIs by setting up the PHI operands. 4105 if (OrigPHIsToFix.size()) { 4106 assert(EnableVPlanNativePath && 4107 "Unexpected non-induction PHIs for fixup in non VPlan-native path"); 4108 fixNonInductionPHIs(State); 4109 } 4110 4111 // At this point every instruction in the original loop is widened to a 4112 // vector form. Now we need to fix the recurrences in the loop. These PHI 4113 // nodes are currently empty because we did not want to introduce cycles. 4114 // This is the second stage of vectorizing recurrences. 4115 fixCrossIterationPHIs(State); 4116 4117 // Forget the original basic block. 4118 PSE.getSE()->forgetLoop(OrigLoop); 4119 4120 // If we inserted an edge from the middle block to the unique exit block, 4121 // update uses outside the loop (phis) to account for the newly inserted 4122 // edge. 4123 if (!Cost->requiresScalarEpilogue(VF)) { 4124 // Fix-up external users of the induction variables. 4125 for (auto &Entry : Legal->getInductionVars()) 4126 fixupIVUsers(Entry.first, Entry.second, 4127 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)), 4128 IVEndValues[Entry.first], LoopMiddleBlock); 4129 4130 fixLCSSAPHIs(State); 4131 } 4132 4133 for (Instruction *PI : PredicatedInstructions) 4134 sinkScalarOperands(&*PI); 4135 4136 // Remove redundant induction instructions. 
  cse(LoopVectorBody);

  // Set/update profile weights for the vector and remainder loops as original
  // loop iterations are now distributed among them. Note that the original
  // loop, represented by LoopScalarBody, becomes the remainder loop after
  // vectorization.
  //
  // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
  // end up getting a slightly roughened result but that should be OK since
  // profile is not inherently precise anyway. Note also that a possible bypass
  // of vector code caused by legality checks is ignored, assigning all the
  // weight to the vector loop, optimistically.
  //
  // For scalable vectorization we can't know at compile time how many
  // iterations of the loop are handled in one vector iteration, so instead
  // assume a pessimistic vscale of '1'.
  setProfileInfoAfterUnrolling(
      LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody),
      LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF);
}

void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) {
  // In order to support recurrences we need to be able to vectorize Phi nodes.
  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
  // stage #2: We now need to fix the recurrences by adding incoming edges to
  // the currently empty PHI nodes. At this point every instruction in the
  // original loop is widened to a vector form so we can use them to construct
  // the incoming edges.
  VPBasicBlock *Header = State.Plan->getEntry()->getEntryBasicBlock();
  for (VPRecipeBase &R : Header->phis()) {
    if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R))
      fixReduction(ReductionPhi, State);
    else if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
      fixFirstOrderRecurrence(FOR, State);
  }
}

void InnerLoopVectorizer::fixFirstOrderRecurrence(VPWidenPHIRecipe *PhiR,
                                                  VPTransformState &State) {
  // This is the second phase of vectorizing first-order recurrences. An
  // overview of the transformation is described below. Suppose we have the
  // following loop.
  //
  //   for (int i = 0; i < n; ++i)
  //     b[i] = a[i] - a[i - 1];
  //
  // There is a first-order recurrence on "a". For this loop, the shorthand
  // scalar IR looks like:
  //
  //   scalar.ph:
  //     s_init = a[-1]
  //     br scalar.body
  //
  //   scalar.body:
  //     i = phi [0, scalar.ph], [i+1, scalar.body]
  //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
  //     s2 = a[i]
  //     b[i] = s2 - s1
  //     br cond, scalar.body, ...
  //
  // In this example, s1 is a recurrence because its value depends on the
  // previous iteration. In the first phase of vectorization, we created a
  // vector phi v1 for s1. We now complete the vectorization and produce the
  // shorthand vector IR shown below (for VF = 4, UF = 1).
4200 // 4201 // vector.ph: 4202 // v_init = vector(..., ..., ..., a[-1]) 4203 // br vector.body 4204 // 4205 // vector.body 4206 // i = phi [0, vector.ph], [i+4, vector.body] 4207 // v1 = phi [v_init, vector.ph], [v2, vector.body] 4208 // v2 = a[i, i+1, i+2, i+3]; 4209 // v3 = vector(v1(3), v2(0, 1, 2)) 4210 // b[i, i+1, i+2, i+3] = v2 - v3 4211 // br cond, vector.body, middle.block 4212 // 4213 // middle.block: 4214 // x = v2(3) 4215 // br scalar.ph 4216 // 4217 // scalar.ph: 4218 // s_init = phi [x, middle.block], [a[-1], otherwise] 4219 // br scalar.body 4220 // 4221 // After execution completes the vector loop, we extract the next value of 4222 // the recurrence (x) to use as the initial value in the scalar loop. 4223 4224 // Extract the last vector element in the middle block. This will be the 4225 // initial value for the recurrence when jumping to the scalar loop. 4226 VPValue *PreviousDef = PhiR->getBackedgeValue(); 4227 Value *Incoming = State.get(PreviousDef, UF - 1); 4228 auto *ExtractForScalar = Incoming; 4229 auto *IdxTy = Builder.getInt32Ty(); 4230 if (VF.isVector()) { 4231 auto *One = ConstantInt::get(IdxTy, 1); 4232 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4233 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF); 4234 auto *LastIdx = Builder.CreateSub(RuntimeVF, One); 4235 ExtractForScalar = Builder.CreateExtractElement(ExtractForScalar, LastIdx, 4236 "vector.recur.extract"); 4237 } 4238 // Extract the second last element in the middle block if the 4239 // Phi is used outside the loop. We need to extract the phi itself 4240 // and not the last element (the phi update in the current iteration). This 4241 // will be the value when jumping to the exit block from the LoopMiddleBlock, 4242 // when the scalar loop is not run at all. 4243 Value *ExtractForPhiUsedOutsideLoop = nullptr; 4244 if (VF.isVector()) { 4245 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF); 4246 auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2)); 4247 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement( 4248 Incoming, Idx, "vector.recur.extract.for.phi"); 4249 } else if (UF > 1) 4250 // When loop is unrolled without vectorizing, initialize 4251 // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value 4252 // of `Incoming`. This is analogous to the vectorized case above: extracting 4253 // the second last element when VF > 1. 4254 ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2); 4255 4256 // Fix the initial value of the original recurrence in the scalar loop. 4257 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin()); 4258 PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue()); 4259 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init"); 4260 auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue(); 4261 for (auto *BB : predecessors(LoopScalarPreHeader)) { 4262 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit; 4263 Start->addIncoming(Incoming, BB); 4264 } 4265 4266 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start); 4267 Phi->setName("scalar.recur"); 4268 4269 // Finally, fix users of the recurrence outside the loop. The users will need 4270 // either the last value of the scalar recurrence or the last value of the 4271 // vector recurrence we extracted in the middle block. Since the loop is in 4272 // LCSSA form, we just need to find all the phi nodes for the original scalar 4273 // recurrence in the exit block, and then add an edge for the middle block. 
  // Note that LCSSA does not imply single entry when the original scalar loop
  // had multiple exiting edges (as we always run the last iteration in the
  // scalar epilogue); in that case, there is no edge from middle to exit and
  // thus no phis need to be updated.
  if (!Cost->requiresScalarEpilogue(VF))
    for (PHINode &LCSSAPhi : LoopExitBlock->phis())
      if (any_of(LCSSAPhi.incoming_values(),
                 [Phi](Value *V) { return V == Phi; }))
        LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
}

void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
                                       VPTransformState &State) {
  PHINode *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
  // Get its reduction variable descriptor.
  assert(Legal->isReductionVariable(OrigPhi) &&
         "Unable to find the reduction variable");
  const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();

  RecurKind RK = RdxDesc.getRecurrenceKind();
  TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
  Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
  setDebugLocFromInst(ReductionStartValue);

  VPValue *LoopExitInstDef = State.Plan->getVPValue(LoopExitInst);
  // This is the vector-clone of the value that leaves the loop.
  Type *VecTy = State.get(LoopExitInstDef, 0)->getType();

  // Wrap flags are in general invalid after vectorization, clear them.
  clearReductionWrapFlags(RdxDesc, State);

  // Before each round, move the insertion point right between
  // the PHIs and the values we are going to write.
  // This allows us to write both PHINodes and the extractelement
  // instructions.
  Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());

  setDebugLocFromInst(LoopExitInst);

  Type *PhiTy = OrigPhi->getType();
  // If tail is folded by masking, the vector value to leave the loop should be
  // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
  // instead of the former. For an inloop reduction the reduction will already
  // be predicated, and does not need to be handled here.
  if (Cost->foldTailByMasking() && !PhiR->isInLoop()) {
    for (unsigned Part = 0; Part < UF; ++Part) {
      Value *VecLoopExitInst = State.get(LoopExitInstDef, Part);
      Value *Sel = nullptr;
      for (User *U : VecLoopExitInst->users()) {
        if (isa<SelectInst>(U)) {
          assert(!Sel && "Reduction exit feeding two selects");
          Sel = U;
        } else
          assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
      }
      assert(Sel && "Reduction exit feeds no select");
      State.reset(LoopExitInstDef, Sel, Part);

      // If the target can create a predicated operator for the reduction at no
      // extra cost in the loop (for example a predicated vadd), it can be
      // cheaper for the select to remain in the loop than be sunk out of it,
      // and so use the select value for the phi instead of the old
      // LoopExitValue.
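      // In shorthand (the names here are illustrative, not the emitted IR),
      // the loop then carries
      //   %vec.phi  = phi [ %init, %ph ], [ %pred.rdx, %latch ]
      //   %rdx      = add %vec.phi, %val
      //   %pred.rdx = select %mask, %rdx, %vec.phi
      // so a target with a cheap predicated add can keep the select inside the
      // loop instead of sinking it past the loop.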
4337 if (PreferPredicatedReductionSelect || 4338 TTI->preferPredicatedReductionSelect( 4339 RdxDesc.getOpcode(), PhiTy, 4340 TargetTransformInfo::ReductionFlags())) { 4341 auto *VecRdxPhi = 4342 cast<PHINode>(State.get(PhiR->getVPSingleValue(), Part)); 4343 VecRdxPhi->setIncomingValueForBlock( 4344 LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel); 4345 } 4346 } 4347 } 4348 4349 // If the vector reduction can be performed in a smaller type, we truncate 4350 // then extend the loop exit value to enable InstCombine to evaluate the 4351 // entire expression in the smaller type. 4352 if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) { 4353 assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!"); 4354 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); 4355 Builder.SetInsertPoint( 4356 LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator()); 4357 VectorParts RdxParts(UF); 4358 for (unsigned Part = 0; Part < UF; ++Part) { 4359 RdxParts[Part] = State.get(LoopExitInstDef, Part); 4360 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4361 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy) 4362 : Builder.CreateZExt(Trunc, VecTy); 4363 for (Value::user_iterator UI = RdxParts[Part]->user_begin(); 4364 UI != RdxParts[Part]->user_end();) 4365 if (*UI != Trunc) { 4366 (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd); 4367 RdxParts[Part] = Extnd; 4368 } else { 4369 ++UI; 4370 } 4371 } 4372 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 4373 for (unsigned Part = 0; Part < UF; ++Part) { 4374 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4375 State.reset(LoopExitInstDef, RdxParts[Part], Part); 4376 } 4377 } 4378 4379 // Reduce all of the unrolled parts into a single vector. 4380 Value *ReducedPartRdx = State.get(LoopExitInstDef, 0); 4381 unsigned Op = RecurrenceDescriptor::getOpcode(RK); 4382 4383 // The middle block terminator has already been assigned a DebugLoc here (the 4384 // OrigLoop's single latch terminator). We want the whole middle block to 4385 // appear to execute on this line because: (a) it is all compiler generated, 4386 // (b) these instructions are always executed after evaluating the latch 4387 // conditional branch, and (c) other passes may add new predecessors which 4388 // terminate on this line. This is the easiest way to ensure we don't 4389 // accidentally cause an extra step back into the loop while debugging. 4390 setDebugLocFromInst(LoopMiddleBlock->getTerminator()); 4391 if (PhiR->isOrdered()) 4392 ReducedPartRdx = State.get(LoopExitInstDef, UF - 1); 4393 else { 4394 // Floating-point operations should have some FMF to enable the reduction. 4395 IRBuilderBase::FastMathFlagGuard FMFG(Builder); 4396 Builder.setFastMathFlags(RdxDesc.getFastMathFlags()); 4397 for (unsigned Part = 1; Part < UF; ++Part) { 4398 Value *RdxPart = State.get(LoopExitInstDef, Part); 4399 if (Op != Instruction::ICmp && Op != Instruction::FCmp) { 4400 ReducedPartRdx = Builder.CreateBinOp( 4401 (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx"); 4402 } else { 4403 ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart); 4404 } 4405 } 4406 } 4407 4408 // Create the reduction after the loop. Note that inloop reductions create the 4409 // target reduction in the loop using a Reduction recipe. 
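  // As an illustrative sketch (not the exact emitted IR): for an add reduction
  // the unrolled parts have already been combined above into a single vector
  // (named "bin.rdx"), and the call below collapses that vector to one scalar,
  // e.g. via an llvm.vector.reduce.add-style reduction, before it is fed to
  // the scalar remainder loop and any users outside the loop.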
4410 if (VF.isVector() && !PhiR->isInLoop()) { 4411 ReducedPartRdx = 4412 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx); 4413 // If the reduction can be performed in a smaller type, we need to extend 4414 // the reduction to the wider type before we branch to the original loop. 4415 if (PhiTy != RdxDesc.getRecurrenceType()) 4416 ReducedPartRdx = RdxDesc.isSigned() 4417 ? Builder.CreateSExt(ReducedPartRdx, PhiTy) 4418 : Builder.CreateZExt(ReducedPartRdx, PhiTy); 4419 } 4420 4421 // Create a phi node that merges control-flow from the backedge-taken check 4422 // block and the middle block. 4423 PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx", 4424 LoopScalarPreHeader->getTerminator()); 4425 for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) 4426 BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]); 4427 BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); 4428 4429 // Now, we need to fix the users of the reduction variable 4430 // inside and outside of the scalar remainder loop. 4431 4432 // We know that the loop is in LCSSA form. We need to update the PHI nodes 4433 // in the exit blocks. See comment on analogous loop in 4434 // fixFirstOrderRecurrence for a more complete explaination of the logic. 4435 if (!Cost->requiresScalarEpilogue(VF)) 4436 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) 4437 if (any_of(LCSSAPhi.incoming_values(), 4438 [LoopExitInst](Value *V) { return V == LoopExitInst; })) 4439 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock); 4440 4441 // Fix the scalar loop reduction variable with the incoming reduction sum 4442 // from the vector body and from the backedge value. 4443 int IncomingEdgeBlockIdx = 4444 OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch()); 4445 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); 4446 // Pick the other block. 4447 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); 4448 OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); 4449 OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); 4450 } 4451 4452 void InnerLoopVectorizer::clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc, 4453 VPTransformState &State) { 4454 RecurKind RK = RdxDesc.getRecurrenceKind(); 4455 if (RK != RecurKind::Add && RK != RecurKind::Mul) 4456 return; 4457 4458 Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr(); 4459 assert(LoopExitInstr && "null loop exit instruction"); 4460 SmallVector<Instruction *, 8> Worklist; 4461 SmallPtrSet<Instruction *, 8> Visited; 4462 Worklist.push_back(LoopExitInstr); 4463 Visited.insert(LoopExitInstr); 4464 4465 while (!Worklist.empty()) { 4466 Instruction *Cur = Worklist.pop_back_val(); 4467 if (isa<OverflowingBinaryOperator>(Cur)) 4468 for (unsigned Part = 0; Part < UF; ++Part) { 4469 Value *V = State.get(State.Plan->getVPValue(Cur), Part); 4470 cast<Instruction>(V)->dropPoisonGeneratingFlags(); 4471 } 4472 4473 for (User *U : Cur->users()) { 4474 Instruction *UI = cast<Instruction>(U); 4475 if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) && 4476 Visited.insert(UI).second) 4477 Worklist.push_back(UI); 4478 } 4479 } 4480 } 4481 4482 void InnerLoopVectorizer::fixLCSSAPHIs(VPTransformState &State) { 4483 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 4484 if (LCSSAPhi.getBasicBlockIndex(LoopMiddleBlock) != -1) 4485 // Some phis were already hand updated by the reduction and recurrence 4486 // code above, leave them alone. 
      continue;

    auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
    // Non-instruction incoming values will have only one value.

    VPLane Lane = VPLane::getFirstLane();
    if (isa<Instruction>(IncomingValue) &&
        !Cost->isUniformAfterVectorization(cast<Instruction>(IncomingValue),
                                           VF))
      Lane = VPLane::getLastLaneForVF(VF);

    // Can be a loop invariant incoming value or the last scalar value to be
    // extracted from the vectorized loop.
    Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
    Value *lastIncomingValue =
        OrigLoop->isLoopInvariant(IncomingValue)
            ? IncomingValue
            : State.get(State.Plan->getVPValue(IncomingValue),
                        VPIteration(UF - 1, Lane));
    LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
  }
}

void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
  // The basic block and loop containing the predicated instruction.
  auto *PredBB = PredInst->getParent();
  auto *VectorLoop = LI->getLoopFor(PredBB);

  // Initialize a worklist with the operands of the predicated instruction.
  SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());

  // Holds instructions that we need to analyze again. An instruction may be
  // reanalyzed if we don't yet know if we can sink it or not.
  SmallVector<Instruction *, 8> InstsToReanalyze;

  // Returns true if a given use occurs in the predicated block. Phi nodes use
  // their operands in their corresponding predecessor blocks.
  auto isBlockOfUsePredicated = [&](Use &U) -> bool {
    auto *I = cast<Instruction>(U.getUser());
    BasicBlock *BB = I->getParent();
    if (auto *Phi = dyn_cast<PHINode>(I))
      BB = Phi->getIncomingBlock(
          PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
    return BB == PredBB;
  };

  // Iteratively sink the scalarized operands of the predicated instruction
  // into the block we created for it. When an instruction is sunk, its
  // operands are then added to the worklist. The algorithm ends when a pass
  // through the worklist fails to sink a single instruction.
  bool Changed;
  do {
    // Add the instructions that need to be reanalyzed to the worklist, and
    // reset the changed indicator.
    Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
    InstsToReanalyze.clear();
    Changed = false;

    while (!Worklist.empty()) {
      auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());

      // We can't sink an instruction if it is a phi node, is not in the loop,
      // or may have side effects.
      if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) ||
          I->mayHaveSideEffects())
        continue;

      // If the instruction is already in PredBB, check if we can sink its
      // operands. In that case, VPlan's sinkScalarOperands() succeeded in
      // sinking the scalar instruction I, hence it appears in PredBB; but it
      // may have failed to sink I's operands (recursively), which we try
      // (again) here.
      if (I->getParent() == PredBB) {
        Worklist.insert(I->op_begin(), I->op_end());
        continue;
      }

      // It's legal to sink the instruction if all its uses occur in the
      // predicated block. Otherwise, there's nothing to do yet, and we may
      // need to reanalyze the instruction.
4567 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) { 4568 InstsToReanalyze.push_back(I); 4569 continue; 4570 } 4571 4572 // Move the instruction to the beginning of the predicated block, and add 4573 // it's operands to the worklist. 4574 I->moveBefore(&*PredBB->getFirstInsertionPt()); 4575 Worklist.insert(I->op_begin(), I->op_end()); 4576 4577 // The sinking may have enabled other instructions to be sunk, so we will 4578 // need to iterate. 4579 Changed = true; 4580 } 4581 } while (Changed); 4582 } 4583 4584 void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) { 4585 for (PHINode *OrigPhi : OrigPHIsToFix) { 4586 VPWidenPHIRecipe *VPPhi = 4587 cast<VPWidenPHIRecipe>(State.Plan->getVPValue(OrigPhi)); 4588 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0)); 4589 // Make sure the builder has a valid insert point. 4590 Builder.SetInsertPoint(NewPhi); 4591 for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) { 4592 VPValue *Inc = VPPhi->getIncomingValue(i); 4593 VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i); 4594 NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]); 4595 } 4596 } 4597 } 4598 4599 bool InnerLoopVectorizer::useOrderedReductions(RecurrenceDescriptor &RdxDesc) { 4600 return Cost->useOrderedReductions(RdxDesc); 4601 } 4602 4603 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, 4604 VPUser &Operands, unsigned UF, 4605 ElementCount VF, bool IsPtrLoopInvariant, 4606 SmallBitVector &IsIndexLoopInvariant, 4607 VPTransformState &State) { 4608 // Construct a vector GEP by widening the operands of the scalar GEP as 4609 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP 4610 // results in a vector of pointers when at least one operand of the GEP 4611 // is vector-typed. Thus, to keep the representation compact, we only use 4612 // vector-typed operands for loop-varying values. 4613 4614 if (VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { 4615 // If we are vectorizing, but the GEP has only loop-invariant operands, 4616 // the GEP we build (by only using vector-typed operands for 4617 // loop-varying values) would be a scalar pointer. Thus, to ensure we 4618 // produce a vector of pointers, we need to either arbitrarily pick an 4619 // operand to broadcast, or broadcast a clone of the original GEP. 4620 // Here, we broadcast a clone of the original. 4621 // 4622 // TODO: If at some point we decide to scalarize instructions having 4623 // loop-invariant operands, this special case will no longer be 4624 // required. We would add the scalarization decision to 4625 // collectLoopScalars() and teach getVectorValue() to broadcast 4626 // the lane-zero scalar value. 4627 auto *Clone = Builder.Insert(GEP->clone()); 4628 for (unsigned Part = 0; Part < UF; ++Part) { 4629 Value *EntryPart = Builder.CreateVectorSplat(VF, Clone); 4630 State.set(VPDef, EntryPart, Part); 4631 addMetadata(EntryPart, GEP); 4632 } 4633 } else { 4634 // If the GEP has at least one loop-varying operand, we are sure to 4635 // produce a vector of pointers. But if we are only unrolling, we want 4636 // to produce a scalar GEP for each unroll part. Thus, the GEP we 4637 // produce with the code below will be scalar (if VF == 1) or vector 4638 // (otherwise). Note that for the unroll-only case, we still maintain 4639 // values in the vector mapping with initVector, as we do for other 4640 // instructions. 4641 for (unsigned Part = 0; Part < UF; ++Part) { 4642 // The pointer operand of the new GEP. 
If it's loop-invariant, we 4643 // won't broadcast it. 4644 auto *Ptr = IsPtrLoopInvariant 4645 ? State.get(Operands.getOperand(0), VPIteration(0, 0)) 4646 : State.get(Operands.getOperand(0), Part); 4647 4648 // Collect all the indices for the new GEP. If any index is 4649 // loop-invariant, we won't broadcast it. 4650 SmallVector<Value *, 4> Indices; 4651 for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) { 4652 VPValue *Operand = Operands.getOperand(I); 4653 if (IsIndexLoopInvariant[I - 1]) 4654 Indices.push_back(State.get(Operand, VPIteration(0, 0))); 4655 else 4656 Indices.push_back(State.get(Operand, Part)); 4657 } 4658 4659 // Create the new GEP. Note that this GEP may be a scalar if VF == 1, 4660 // but it should be a vector, otherwise. 4661 auto *NewGEP = 4662 GEP->isInBounds() 4663 ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr, 4664 Indices) 4665 : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices); 4666 assert((VF.isScalar() || NewGEP->getType()->isVectorTy()) && 4667 "NewGEP is not a pointer vector"); 4668 State.set(VPDef, NewGEP, Part); 4669 addMetadata(NewGEP, GEP); 4670 } 4671 } 4672 } 4673 4674 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, 4675 VPWidenPHIRecipe *PhiR, 4676 VPTransformState &State) { 4677 PHINode *P = cast<PHINode>(PN); 4678 if (EnableVPlanNativePath) { 4679 // Currently we enter here in the VPlan-native path for non-induction 4680 // PHIs where all control flow is uniform. We simply widen these PHIs. 4681 // Create a vector phi with no operands - the vector phi operands will be 4682 // set at the end of vector code generation. 4683 Type *VecTy = (State.VF.isScalar()) 4684 ? PN->getType() 4685 : VectorType::get(PN->getType(), State.VF); 4686 Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi"); 4687 State.set(PhiR, VecPhi, 0); 4688 OrigPHIsToFix.push_back(P); 4689 4690 return; 4691 } 4692 4693 assert(PN->getParent() == OrigLoop->getHeader() && 4694 "Non-header phis should have been handled elsewhere"); 4695 4696 // In order to support recurrences we need to be able to vectorize Phi nodes. 4697 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 4698 // stage #1: We create a new vector PHI node with no incoming edges. We'll use 4699 // this value when we vectorize all of the instructions that use the PHI. 4700 4701 assert(!Legal->isReductionVariable(P) && 4702 "reductions should be handled elsewhere"); 4703 4704 setDebugLocFromInst(P); 4705 4706 // This PHINode must be an induction variable. 4707 // Make sure that we know about it. 4708 assert(Legal->getInductionVars().count(P) && "Not an induction variable"); 4709 4710 InductionDescriptor II = Legal->getInductionVars().lookup(P); 4711 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 4712 4713 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 4714 // which can be found from the original scalar operations. 4715 switch (II.getKind()) { 4716 case InductionDescriptor::IK_NoInduction: 4717 llvm_unreachable("Unknown induction"); 4718 case InductionDescriptor::IK_IntInduction: 4719 case InductionDescriptor::IK_FpInduction: 4720 llvm_unreachable("Integer/fp induction is handled elsewhere."); 4721 case InductionDescriptor::IK_PtrInduction: { 4722 // Handle the pointer induction variable case. 
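    // Illustrative example (shorthand): for a unit-stride pointer IV p that is
    // scalar after vectorization, with VF = 4 and UF = 1 the code below emits
    // one "next.gep" scalar per needed lane, corresponding to p + (i + 0),
    // p + (i + 1), p + (i + 2) and p + (i + 3) in the original loop; if the IV
    // is also uniform, only lane 0 is generated.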
4723 assert(P->getType()->isPointerTy() && "Unexpected type."); 4724 4725 if (Cost->isScalarAfterVectorization(P, State.VF)) { 4726 // This is the normalized GEP that starts counting at zero. 4727 Value *PtrInd = 4728 Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType()); 4729 // Determine the number of scalars we need to generate for each unroll 4730 // iteration. If the instruction is uniform, we only need to generate the 4731 // first lane. Otherwise, we generate all VF values. 4732 bool IsUniform = Cost->isUniformAfterVectorization(P, State.VF); 4733 unsigned Lanes = IsUniform ? 1 : State.VF.getKnownMinValue(); 4734 4735 bool NeedsVectorIndex = !IsUniform && VF.isScalable(); 4736 Value *UnitStepVec = nullptr, *PtrIndSplat = nullptr; 4737 if (NeedsVectorIndex) { 4738 Type *VecIVTy = VectorType::get(PtrInd->getType(), VF); 4739 UnitStepVec = Builder.CreateStepVector(VecIVTy); 4740 PtrIndSplat = Builder.CreateVectorSplat(VF, PtrInd); 4741 } 4742 4743 for (unsigned Part = 0; Part < UF; ++Part) { 4744 Value *PartStart = createStepForVF( 4745 Builder, ConstantInt::get(PtrInd->getType(), Part), VF); 4746 4747 if (NeedsVectorIndex) { 4748 Value *PartStartSplat = Builder.CreateVectorSplat(VF, PartStart); 4749 Value *Indices = Builder.CreateAdd(PartStartSplat, UnitStepVec); 4750 Value *GlobalIndices = Builder.CreateAdd(PtrIndSplat, Indices); 4751 Value *SclrGep = 4752 emitTransformedIndex(Builder, GlobalIndices, PSE.getSE(), DL, II); 4753 SclrGep->setName("next.gep"); 4754 State.set(PhiR, SclrGep, Part); 4755 // We've cached the whole vector, which means we can support the 4756 // extraction of any lane. 4757 continue; 4758 } 4759 4760 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 4761 Value *Idx = Builder.CreateAdd( 4762 PartStart, ConstantInt::get(PtrInd->getType(), Lane)); 4763 Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); 4764 Value *SclrGep = 4765 emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II); 4766 SclrGep->setName("next.gep"); 4767 State.set(PhiR, SclrGep, VPIteration(Part, Lane)); 4768 } 4769 } 4770 return; 4771 } 4772 assert(isa<SCEVConstant>(II.getStep()) && 4773 "Induction step not a SCEV constant!"); 4774 Type *PhiType = II.getStep()->getType(); 4775 4776 // Build a pointer phi 4777 Value *ScalarStartValue = II.getStartValue(); 4778 Type *ScStValueType = ScalarStartValue->getType(); 4779 PHINode *NewPointerPhi = 4780 PHINode::Create(ScStValueType, 2, "pointer.phi", Induction); 4781 NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader); 4782 4783 // A pointer induction, performed by using a gep 4784 BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 4785 Instruction *InductionLoc = LoopLatch->getTerminator(); 4786 const SCEV *ScalarStep = II.getStep(); 4787 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 4788 Value *ScalarStepValue = 4789 Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc); 4790 Value *RuntimeVF = getRuntimeVF(Builder, PhiType, VF); 4791 Value *NumUnrolledElems = 4792 Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF)); 4793 Value *InductionGEP = GetElementPtrInst::Create( 4794 ScStValueType->getPointerElementType(), NewPointerPhi, 4795 Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind", 4796 InductionLoc); 4797 NewPointerPhi->addIncoming(InductionGEP, LoopLatch); 4798 4799 // Create UF many actual address geps that use the pointer 4800 // phi as base and a vectorized version of the step value 4801 // (<step*0, ..., step*N>) as offset. 
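    // For example (illustrative, fixed-width shorthand): with VF = 4 and
    // UF = 2, part 0 uses offsets <0, 1, 2, 3> * step and part 1 uses offsets
    // <4, 5, 6, 7> * step, both relative to the pointer phi.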
4802 for (unsigned Part = 0; Part < State.UF; ++Part) { 4803 Type *VecPhiType = VectorType::get(PhiType, State.VF); 4804 Value *StartOffsetScalar = 4805 Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part)); 4806 Value *StartOffset = 4807 Builder.CreateVectorSplat(State.VF, StartOffsetScalar); 4808 // Create a vector of consecutive numbers from zero to VF. 4809 StartOffset = 4810 Builder.CreateAdd(StartOffset, Builder.CreateStepVector(VecPhiType)); 4811 4812 Value *GEP = Builder.CreateGEP( 4813 ScStValueType->getPointerElementType(), NewPointerPhi, 4814 Builder.CreateMul( 4815 StartOffset, Builder.CreateVectorSplat(State.VF, ScalarStepValue), 4816 "vector.gep")); 4817 State.set(PhiR, GEP, Part); 4818 } 4819 } 4820 } 4821 } 4822 4823 /// A helper function for checking whether an integer division-related 4824 /// instruction may divide by zero (in which case it must be predicated if 4825 /// executed conditionally in the scalar code). 4826 /// TODO: It may be worthwhile to generalize and check isKnownNonZero(). 4827 /// Non-zero divisors that are non compile-time constants will not be 4828 /// converted into multiplication, so we will still end up scalarizing 4829 /// the division, but can do so w/o predication. 4830 static bool mayDivideByZero(Instruction &I) { 4831 assert((I.getOpcode() == Instruction::UDiv || 4832 I.getOpcode() == Instruction::SDiv || 4833 I.getOpcode() == Instruction::URem || 4834 I.getOpcode() == Instruction::SRem) && 4835 "Unexpected instruction"); 4836 Value *Divisor = I.getOperand(1); 4837 auto *CInt = dyn_cast<ConstantInt>(Divisor); 4838 return !CInt || CInt->isZero(); 4839 } 4840 4841 void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def, 4842 VPUser &User, 4843 VPTransformState &State) { 4844 switch (I.getOpcode()) { 4845 case Instruction::Call: 4846 case Instruction::Br: 4847 case Instruction::PHI: 4848 case Instruction::GetElementPtr: 4849 case Instruction::Select: 4850 llvm_unreachable("This instruction is handled by a different recipe."); 4851 case Instruction::UDiv: 4852 case Instruction::SDiv: 4853 case Instruction::SRem: 4854 case Instruction::URem: 4855 case Instruction::Add: 4856 case Instruction::FAdd: 4857 case Instruction::Sub: 4858 case Instruction::FSub: 4859 case Instruction::FNeg: 4860 case Instruction::Mul: 4861 case Instruction::FMul: 4862 case Instruction::FDiv: 4863 case Instruction::FRem: 4864 case Instruction::Shl: 4865 case Instruction::LShr: 4866 case Instruction::AShr: 4867 case Instruction::And: 4868 case Instruction::Or: 4869 case Instruction::Xor: { 4870 // Just widen unops and binops. 4871 setDebugLocFromInst(&I); 4872 4873 for (unsigned Part = 0; Part < UF; ++Part) { 4874 SmallVector<Value *, 2> Ops; 4875 for (VPValue *VPOp : User.operands()) 4876 Ops.push_back(State.get(VPOp, Part)); 4877 4878 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); 4879 4880 if (auto *VecOp = dyn_cast<Instruction>(V)) 4881 VecOp->copyIRFlags(&I); 4882 4883 // Use this vector value for all users of the original instruction. 4884 State.set(Def, V, Part); 4885 addMetadata(V, &I); 4886 } 4887 4888 break; 4889 } 4890 case Instruction::ICmp: 4891 case Instruction::FCmp: { 4892 // Widen compares. Generate vector compares. 
4893 bool FCmp = (I.getOpcode() == Instruction::FCmp); 4894 auto *Cmp = cast<CmpInst>(&I); 4895 setDebugLocFromInst(Cmp); 4896 for (unsigned Part = 0; Part < UF; ++Part) { 4897 Value *A = State.get(User.getOperand(0), Part); 4898 Value *B = State.get(User.getOperand(1), Part); 4899 Value *C = nullptr; 4900 if (FCmp) { 4901 // Propagate fast math flags. 4902 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 4903 Builder.setFastMathFlags(Cmp->getFastMathFlags()); 4904 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); 4905 } else { 4906 C = Builder.CreateICmp(Cmp->getPredicate(), A, B); 4907 } 4908 State.set(Def, C, Part); 4909 addMetadata(C, &I); 4910 } 4911 4912 break; 4913 } 4914 4915 case Instruction::ZExt: 4916 case Instruction::SExt: 4917 case Instruction::FPToUI: 4918 case Instruction::FPToSI: 4919 case Instruction::FPExt: 4920 case Instruction::PtrToInt: 4921 case Instruction::IntToPtr: 4922 case Instruction::SIToFP: 4923 case Instruction::UIToFP: 4924 case Instruction::Trunc: 4925 case Instruction::FPTrunc: 4926 case Instruction::BitCast: { 4927 auto *CI = cast<CastInst>(&I); 4928 setDebugLocFromInst(CI); 4929 4930 /// Vectorize casts. 4931 Type *DestTy = 4932 (VF.isScalar()) ? CI->getType() : VectorType::get(CI->getType(), VF); 4933 4934 for (unsigned Part = 0; Part < UF; ++Part) { 4935 Value *A = State.get(User.getOperand(0), Part); 4936 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); 4937 State.set(Def, Cast, Part); 4938 addMetadata(Cast, &I); 4939 } 4940 break; 4941 } 4942 default: 4943 // This instruction is not vectorized by simple widening. 4944 LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); 4945 llvm_unreachable("Unhandled instruction!"); 4946 } // end of switch. 4947 } 4948 4949 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, 4950 VPUser &ArgOperands, 4951 VPTransformState &State) { 4952 assert(!isa<DbgInfoIntrinsic>(I) && 4953 "DbgInfoIntrinsic should have been dropped during VPlan construction"); 4954 setDebugLocFromInst(&I); 4955 4956 Module *M = I.getParent()->getParent()->getParent(); 4957 auto *CI = cast<CallInst>(&I); 4958 4959 SmallVector<Type *, 4> Tys; 4960 for (Value *ArgOperand : CI->arg_operands()) 4961 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue())); 4962 4963 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 4964 4965 // The flag shows whether we use Intrinsic or a usual Call for vectorized 4966 // version of the instruction. 4967 // Is it beneficial to perform intrinsic call compared to lib call? 4968 bool NeedToScalarize = false; 4969 InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); 4970 InstructionCost IntrinsicCost = ID ? Cost->getVectorIntrinsicCost(CI, VF) : 0; 4971 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 4972 assert((UseVectorIntrinsic || !NeedToScalarize) && 4973 "Instruction should be scalarized elsewhere."); 4974 assert((IntrinsicCost.isValid() || CallCost.isValid()) && 4975 "Either the intrinsic cost or vector call cost must be valid"); 4976 4977 for (unsigned Part = 0; Part < UF; ++Part) { 4978 SmallVector<Type *, 2> TysForDecl = {CI->getType()}; 4979 SmallVector<Value *, 4> Args; 4980 for (auto &I : enumerate(ArgOperands.operands())) { 4981 // Some intrinsics have a scalar argument - don't replace it with a 4982 // vector. 
4983 Value *Arg; 4984 if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index())) 4985 Arg = State.get(I.value(), Part); 4986 else { 4987 Arg = State.get(I.value(), VPIteration(0, 0)); 4988 if (hasVectorInstrinsicOverloadedScalarOpd(ID, I.index())) 4989 TysForDecl.push_back(Arg->getType()); 4990 } 4991 Args.push_back(Arg); 4992 } 4993 4994 Function *VectorF; 4995 if (UseVectorIntrinsic) { 4996 // Use vector version of the intrinsic. 4997 if (VF.isVector()) 4998 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); 4999 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); 5000 assert(VectorF && "Can't retrieve vector intrinsic."); 5001 } else { 5002 // Use vector version of the function call. 5003 const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 5004 #ifndef NDEBUG 5005 assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr && 5006 "Can't create vector function."); 5007 #endif 5008 VectorF = VFDatabase(*CI).getVectorizedFunction(Shape); 5009 } 5010 SmallVector<OperandBundleDef, 1> OpBundles; 5011 CI->getOperandBundlesAsDefs(OpBundles); 5012 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles); 5013 5014 if (isa<FPMathOperator>(V)) 5015 V->copyFastMathFlags(CI); 5016 5017 State.set(Def, V, Part); 5018 addMetadata(V, &I); 5019 } 5020 } 5021 5022 void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, VPValue *VPDef, 5023 VPUser &Operands, 5024 bool InvariantCond, 5025 VPTransformState &State) { 5026 setDebugLocFromInst(&I); 5027 5028 // The condition can be loop invariant but still defined inside the 5029 // loop. This means that we can't just use the original 'cond' value. 5030 // We have to take the 'vectorized' value and pick the first lane. 5031 // Instcombine will make this a no-op. 5032 auto *InvarCond = InvariantCond 5033 ? State.get(Operands.getOperand(0), VPIteration(0, 0)) 5034 : nullptr; 5035 5036 for (unsigned Part = 0; Part < UF; ++Part) { 5037 Value *Cond = 5038 InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part); 5039 Value *Op0 = State.get(Operands.getOperand(1), Part); 5040 Value *Op1 = State.get(Operands.getOperand(2), Part); 5041 Value *Sel = Builder.CreateSelect(Cond, Op0, Op1); 5042 State.set(VPDef, Sel, Part); 5043 addMetadata(Sel, &I); 5044 } 5045 } 5046 5047 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { 5048 // We should not collect Scalars more than once per VF. Right now, this 5049 // function is called from collectUniformsAndScalars(), which already does 5050 // this check. Collecting Scalars for VF=1 does not make any sense. 5051 assert(VF.isVector() && Scalars.find(VF) == Scalars.end() && 5052 "This function should not be visited twice for the same VF"); 5053 5054 SmallSetVector<Instruction *, 8> Worklist; 5055 5056 // These sets are used to seed the analysis with pointers used by memory 5057 // accesses that will remain scalar. 5058 SmallSetVector<Instruction *, 8> ScalarPtrs; 5059 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; 5060 auto *Latch = TheLoop->getLoopLatch(); 5061 5062 // A helper that returns true if the use of Ptr by MemAccess will be scalar. 5063 // The pointer operands of loads and stores will be scalar as long as the 5064 // memory access is not a gather or scatter operation. The value operand of a 5065 // store will remain scalar if the store is scalarized. 
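  // For example, the address of a consecutive load that will be widened stays
  // scalar, whereas an address feeding a gather/scatter must itself become a
  // vector of pointers and is therefore not a scalar use.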
  auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
    InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
    assert(WideningDecision != CM_Unknown &&
           "Widening decision should be ready at this moment");
    if (auto *Store = dyn_cast<StoreInst>(MemAccess))
      if (Ptr == Store->getValueOperand())
        return WideningDecision == CM_Scalarize;
    assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
           "Ptr is neither a value nor a pointer operand");
    return WideningDecision != CM_GatherScatter;
  };

  // A helper that returns true if the given value is a bitcast or
  // getelementptr instruction contained in the loop.
  auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
    return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
            isa<GetElementPtrInst>(V)) &&
           !TheLoop->isLoopInvariant(V);
  };

  auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) {
    if (!isa<PHINode>(Ptr) ||
        !Legal->getInductionVars().count(cast<PHINode>(Ptr)))
      return false;
    auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)];
    if (Induction.getKind() != InductionDescriptor::IK_PtrInduction)
      return false;
    return isScalarUse(MemAccess, Ptr);
  };

  // A helper that evaluates a memory access's use of a pointer. If the
  // pointer is actually the pointer induction of a loop, it is inserted
  // into Worklist. If the use will be a scalar use, and the pointer is
  // only used by memory accesses, we place the pointer in ScalarPtrs.
  // Otherwise, the pointer is placed in PossibleNonScalarPtrs.
  auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
    if (isScalarPtrInduction(MemAccess, Ptr)) {
      Worklist.insert(cast<Instruction>(Ptr));
      LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr
                        << "\n");

      Instruction *Update = cast<Instruction>(
          cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch));
      ScalarPtrs.insert(Update);
      return;
    }
    // We only care about bitcast and getelementptr instructions contained in
    // the loop.
    if (!isLoopVaryingBitCastOrGEP(Ptr))
      return;

    // If the pointer has already been identified as scalar (e.g., if it was
    // also identified as uniform), there's nothing to do.
    auto *I = cast<Instruction>(Ptr);
    if (Worklist.count(I))
      return;

    // If the use of the pointer will be a scalar use, and all users of the
    // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
    // place the pointer in PossibleNonScalarPtrs.
    if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
          return isa<LoadInst>(U) || isa<StoreInst>(U);
        }))
      ScalarPtrs.insert(I);
    else
      PossibleNonScalarPtrs.insert(I);
  };

  // We seed the scalars analysis with two classes of instructions: (1)
  // instructions marked uniform-after-vectorization and (2) bitcast,
  // getelementptr and (pointer) phi instructions used by memory accesses
  // requiring a scalar use.
  //
  // (1) Add to the worklist all instructions that have been identified as
  // uniform-after-vectorization.
  Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());

  // (2) Add to the worklist all bitcast and getelementptr instructions used by
  // memory accesses requiring a scalar use. The pointer operands of loads and
  // stores will be scalar as long as the memory access is not a gather or
  // scatter operation. The value operand of a store will remain scalar if the
  // store is scalarized.
  for (auto *BB : TheLoop->blocks())
    for (auto &I : *BB) {
      if (auto *Load = dyn_cast<LoadInst>(&I)) {
        evaluatePtrUse(Load, Load->getPointerOperand());
      } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
        evaluatePtrUse(Store, Store->getPointerOperand());
        evaluatePtrUse(Store, Store->getValueOperand());
      }
    }
  for (auto *I : ScalarPtrs)
    if (!PossibleNonScalarPtrs.count(I)) {
      LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
      Worklist.insert(I);
    }

  // Insert the forced scalars.
  // FIXME: Currently widenPHIInstruction() often creates a dead vector
  // induction variable when the PHI user is scalarized.
  auto ForcedScalar = ForcedScalars.find(VF);
  if (ForcedScalar != ForcedScalars.end())
    for (auto *I : ForcedScalar->second)
      Worklist.insert(I);

  // Expand the worklist by looking through any bitcasts and getelementptr
  // instructions we've already identified as scalar. This is similar to the
  // expansion step in collectLoopUniforms(); however, here we're only
  // expanding to include additional bitcasts and getelementptr instructions.
  unsigned Idx = 0;
  while (Idx != Worklist.size()) {
    Instruction *Dst = Worklist[Idx++];
    if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
      continue;
    auto *Src = cast<Instruction>(Dst->getOperand(0));
    if (llvm::all_of(Src->users(), [&](User *U) -> bool {
          auto *J = cast<Instruction>(U);
          return !TheLoop->contains(J) || Worklist.count(J) ||
                 ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
                  isScalarUse(J, Src));
        })) {
      Worklist.insert(Src);
      LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
    }
  }

  // An induction variable will remain scalar if all users of the induction
  // variable and induction variable update remain scalar.
  for (auto &Induction : Legal->getInductionVars()) {
    auto *Ind = Induction.first;
    auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));

    // If tail-folding is applied, the primary induction variable will be used
    // to feed a vector compare.
    if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
      continue;

    // Determine if all users of the induction variable are scalar after
    // vectorization.
    auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
      auto *I = cast<Instruction>(U);
      return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
    });
    if (!ScalarInd)
      continue;

    // Determine if all users of the induction variable update instruction are
    // scalar after vectorization.
    auto ScalarIndUpdate =
        llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
          auto *I = cast<Instruction>(U);
          return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
        });
    if (!ScalarIndUpdate)
      continue;

    // The induction variable and its update instruction will remain scalar.
5223 Worklist.insert(Ind); 5224 Worklist.insert(IndUpdate); 5225 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 5226 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 5227 << "\n"); 5228 } 5229 5230 Scalars[VF].insert(Worklist.begin(), Worklist.end()); 5231 } 5232 5233 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I) const { 5234 if (!blockNeedsPredication(I->getParent())) 5235 return false; 5236 switch(I->getOpcode()) { 5237 default: 5238 break; 5239 case Instruction::Load: 5240 case Instruction::Store: { 5241 if (!Legal->isMaskRequired(I)) 5242 return false; 5243 auto *Ptr = getLoadStorePointerOperand(I); 5244 auto *Ty = getLoadStoreType(I); 5245 const Align Alignment = getLoadStoreAlignment(I); 5246 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) || 5247 TTI.isLegalMaskedGather(Ty, Alignment)) 5248 : !(isLegalMaskedStore(Ty, Ptr, Alignment) || 5249 TTI.isLegalMaskedScatter(Ty, Alignment)); 5250 } 5251 case Instruction::UDiv: 5252 case Instruction::SDiv: 5253 case Instruction::SRem: 5254 case Instruction::URem: 5255 return mayDivideByZero(*I); 5256 } 5257 return false; 5258 } 5259 5260 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( 5261 Instruction *I, ElementCount VF) { 5262 assert(isAccessInterleaved(I) && "Expecting interleaved access."); 5263 assert(getWideningDecision(I, VF) == CM_Unknown && 5264 "Decision should not be set yet."); 5265 auto *Group = getInterleavedAccessGroup(I); 5266 assert(Group && "Must have a group."); 5267 5268 // If the instruction's allocated size doesn't equal it's type size, it 5269 // requires padding and will be scalarized. 5270 auto &DL = I->getModule()->getDataLayout(); 5271 auto *ScalarTy = getLoadStoreType(I); 5272 if (hasIrregularType(ScalarTy, DL)) 5273 return false; 5274 5275 // Check if masking is required. 5276 // A Group may need masking for one of two reasons: it resides in a block that 5277 // needs predication, or it was decided to use masking to deal with gaps. 5278 bool PredicatedAccessRequiresMasking = 5279 Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I); 5280 bool AccessWithGapsRequiresMasking = 5281 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); 5282 if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking) 5283 return true; 5284 5285 // If masked interleaving is required, we expect that the user/target had 5286 // enabled it, because otherwise it either wouldn't have been created or 5287 // it should have been invalidated by the CostModel. 5288 assert(useMaskedInterleavedAccesses(TTI) && 5289 "Masked interleave-groups for predicated accesses are not enabled."); 5290 5291 auto *Ty = getLoadStoreType(I); 5292 const Align Alignment = getLoadStoreAlignment(I); 5293 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment) 5294 : TTI.isLegalMaskedStore(Ty, Alignment); 5295 } 5296 5297 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( 5298 Instruction *I, ElementCount VF) { 5299 // Get and ensure we have a valid memory instruction. 5300 LoadInst *LI = dyn_cast<LoadInst>(I); 5301 StoreInst *SI = dyn_cast<StoreInst>(I); 5302 assert((LI || SI) && "Invalid memory instruction"); 5303 5304 auto *Ptr = getLoadStorePointerOperand(I); 5305 5306 // In order to be widened, the pointer should be consecutive, first of all. 
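  // For example, A[i] with a unit-stride induction variable i is consecutive
  // and can become a single wide load/store, whereas A[2 * i] or A[B[i]] is
  // not and would need a gather/scatter or scalarization instead.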
  if (!Legal->isConsecutivePtr(Ptr))
    return false;

  // If the instruction is a store located in a predicated block, it will be
  // scalarized.
  if (isScalarWithPredication(I))
    return false;

  // If the instruction's allocated size doesn't equal its type size, it
  // requires padding and will be scalarized.
  auto &DL = I->getModule()->getDataLayout();
  auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
  if (hasIrregularType(ScalarTy, DL))
    return false;

  return true;
}

void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
  // We should not collect Uniforms more than once per VF. Right now,
  // this function is called from collectUniformsAndScalars(), which
  // already does this check. Collecting Uniforms for VF=1 does not make any
  // sense.

  assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() &&
         "This function should not be visited twice for the same VF");

  // Visit the list of Uniforms. If we do not find any uniform value, we will
  // not analyze it again: Uniforms.count(VF) will return 1.
  Uniforms[VF].clear();

  // We now know that the loop is vectorizable!
  // Collect instructions inside the loop that will remain uniform after
  // vectorization.

  // Global values, params and instructions outside of the current loop are
  // out of scope.
  auto isOutOfScope = [&](Value *V) -> bool {
    Instruction *I = dyn_cast<Instruction>(V);
    return (!I || !TheLoop->contains(I));
  };

  SetVector<Instruction *> Worklist;
  BasicBlock *Latch = TheLoop->getLoopLatch();

  // Instructions that are scalar with predication must not be considered
  // uniform after vectorization, because that would create an erroneous
  // replicating region where only a single instance out of VF should be
  // formed.
  // TODO: optimize such seldom cases if found important, see PR40816.
  auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
    if (isOutOfScope(I)) {
      LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
                        << *I << "\n");
      return;
    }
    if (isScalarWithPredication(I)) {
      LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
                        << *I << "\n");
      return;
    }
    LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
    Worklist.insert(I);
  };

  // Start with the conditional branch. If the branch condition is an
  // instruction contained in the loop that is only used by the branch, it is
  // uniform.
  auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
  if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
    addToWorklistIfAllowed(Cmp);

  auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
    InstWidening WideningDecision = getWideningDecision(I, VF);
    assert(WideningDecision != CM_Unknown &&
           "Widening decision should be ready at this moment");

    // A uniform memory op is itself uniform. We exclude uniform stores
    // here as they demand the last lane, not the first one.
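    // E.g. a load from a loop-invariant address yields the same value for all
    // lanes, so lane 0 suffices; a store to a loop-invariant address must keep
    // the value of the final iteration, i.e. the last lane.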
5385 if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) { 5386 assert(WideningDecision == CM_Scalarize); 5387 return true; 5388 } 5389 5390 return (WideningDecision == CM_Widen || 5391 WideningDecision == CM_Widen_Reverse || 5392 WideningDecision == CM_Interleave); 5393 }; 5394 5395 5396 // Returns true if Ptr is the pointer operand of a memory access instruction 5397 // I, and I is known to not require scalarization. 5398 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { 5399 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); 5400 }; 5401 5402 // Holds a list of values which are known to have at least one uniform use. 5403 // Note that there may be other uses which aren't uniform. A "uniform use" 5404 // here is something which only demands lane 0 of the unrolled iterations; 5405 // it does not imply that all lanes produce the same value (e.g. this is not 5406 // the usual meaning of uniform) 5407 SetVector<Value *> HasUniformUse; 5408 5409 // Scan the loop for instructions which are either a) known to have only 5410 // lane 0 demanded or b) are uses which demand only lane 0 of their operand. 5411 for (auto *BB : TheLoop->blocks()) 5412 for (auto &I : *BB) { 5413 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) { 5414 switch (II->getIntrinsicID()) { 5415 case Intrinsic::sideeffect: 5416 case Intrinsic::experimental_noalias_scope_decl: 5417 case Intrinsic::assume: 5418 case Intrinsic::lifetime_start: 5419 case Intrinsic::lifetime_end: 5420 if (TheLoop->hasLoopInvariantOperands(&I)) 5421 addToWorklistIfAllowed(&I); 5422 break; 5423 default: 5424 break; 5425 } 5426 } 5427 5428 // ExtractValue instructions must be uniform, because the operands are 5429 // known to be loop-invariant. 5430 if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) { 5431 assert(isOutOfScope(EVI->getAggregateOperand()) && 5432 "Expected aggregate value to be loop invariant"); 5433 addToWorklistIfAllowed(EVI); 5434 continue; 5435 } 5436 5437 // If there's no pointer operand, there's nothing to do. 5438 auto *Ptr = getLoadStorePointerOperand(&I); 5439 if (!Ptr) 5440 continue; 5441 5442 // A uniform memory op is itself uniform. We exclude uniform stores 5443 // here as they demand the last lane, not the first one. 5444 if (isa<LoadInst>(I) && Legal->isUniformMemOp(I)) 5445 addToWorklistIfAllowed(&I); 5446 5447 if (isUniformDecision(&I, VF)) { 5448 assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check"); 5449 HasUniformUse.insert(Ptr); 5450 } 5451 } 5452 5453 // Add to the worklist any operands which have *only* uniform (e.g. lane 0 5454 // demanding) users. Since loops are assumed to be in LCSSA form, this 5455 // disallows uses outside the loop as well. 5456 for (auto *V : HasUniformUse) { 5457 if (isOutOfScope(V)) 5458 continue; 5459 auto *I = cast<Instruction>(V); 5460 auto UsersAreMemAccesses = 5461 llvm::all_of(I->users(), [&](User *U) -> bool { 5462 return isVectorizedMemAccessUse(cast<Instruction>(U), V); 5463 }); 5464 if (UsersAreMemAccesses) 5465 addToWorklistIfAllowed(I); 5466 } 5467 5468 // Expand Worklist in topological order: whenever a new instruction 5469 // is added , its users should be already inside Worklist. It ensures 5470 // a uniform instruction will only be used by uniform instructions. 5471 unsigned idx = 0; 5472 while (idx != Worklist.size()) { 5473 Instruction *I = Worklist[idx++]; 5474 5475 for (auto OV : I->operand_values()) { 5476 // isOutOfScope operands cannot be uniform instructions. 
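    // Illustrative example (hypothetical): if %gep is used only as the
    // pointer operand of a load that will be widened (a "vectorized mem
    // access use"), then only lane 0 of %gep is ever required, so %gep can
    // be added to the worklist once all of its users are known to be
    // uniform. Out-of-scope operands (arguments, globals, or instructions
    // defined outside the loop) are skipped below because uniformity is only
    // tracked for instructions inside the loop.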
5477 if (isOutOfScope(OV)) 5478 continue; 5479 // First order recurrence Phi's should typically be considered 5480 // non-uniform. 5481 auto *OP = dyn_cast<PHINode>(OV); 5482 if (OP && Legal->isFirstOrderRecurrence(OP)) 5483 continue; 5484 // If all the users of the operand are uniform, then add the 5485 // operand into the uniform worklist. 5486 auto *OI = cast<Instruction>(OV); 5487 if (llvm::all_of(OI->users(), [&](User *U) -> bool { 5488 auto *J = cast<Instruction>(U); 5489 return Worklist.count(J) || isVectorizedMemAccessUse(J, OI); 5490 })) 5491 addToWorklistIfAllowed(OI); 5492 } 5493 } 5494 5495 // For an instruction to be added into Worklist above, all its users inside 5496 // the loop should also be in Worklist. However, this condition cannot be 5497 // true for phi nodes that form a cyclic dependence. We must process phi 5498 // nodes separately. An induction variable will remain uniform if all users 5499 // of the induction variable and induction variable update remain uniform. 5500 // The code below handles both pointer and non-pointer induction variables. 5501 for (auto &Induction : Legal->getInductionVars()) { 5502 auto *Ind = Induction.first; 5503 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 5504 5505 // Determine if all users of the induction variable are uniform after 5506 // vectorization. 5507 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 5508 auto *I = cast<Instruction>(U); 5509 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 5510 isVectorizedMemAccessUse(I, Ind); 5511 }); 5512 if (!UniformInd) 5513 continue; 5514 5515 // Determine if all users of the induction variable update instruction are 5516 // uniform after vectorization. 5517 auto UniformIndUpdate = 5518 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 5519 auto *I = cast<Instruction>(U); 5520 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 5521 isVectorizedMemAccessUse(I, IndUpdate); 5522 }); 5523 if (!UniformIndUpdate) 5524 continue; 5525 5526 // The induction variable and its update instruction will remain uniform. 5527 addToWorklistIfAllowed(Ind); 5528 addToWorklistIfAllowed(IndUpdate); 5529 } 5530 5531 Uniforms[VF].insert(Worklist.begin(), Worklist.end()); 5532 } 5533 5534 bool LoopVectorizationCostModel::runtimeChecksRequired() { 5535 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); 5536 5537 if (Legal->getRuntimePointerChecking()->Need) { 5538 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz", 5539 "runtime pointer checks needed. Enable vectorization of this " 5540 "loop with '#pragma clang loop vectorize(enable)' when " 5541 "compiling with -Os/-Oz", 5542 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5543 return true; 5544 } 5545 5546 if (!PSE.getUnionPredicate().getPredicates().empty()) { 5547 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz", 5548 "runtime SCEV checks needed. Enable vectorization of this " 5549 "loop with '#pragma clang loop vectorize(enable)' when " 5550 "compiling with -Os/-Oz", 5551 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5552 return true; 5553 } 5554 5555 // FIXME: Avoid specializing for stride==1 instead of bailing out. 5556 if (!Legal->getLAI()->getSymbolicStrides().empty()) { 5557 reportVectorizationFailure("Runtime stride check for small trip count", 5558 "runtime stride == 1 checks needed. 
Enable vectorization of " 5559 "this loop without such check by compiling with -Os/-Oz", 5560 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5561 return true; 5562 } 5563 5564 return false; 5565 } 5566 5567 ElementCount 5568 LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) { 5569 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) { 5570 reportVectorizationInfo( 5571 "Disabling scalable vectorization, because target does not " 5572 "support scalable vectors.", 5573 "ScalableVectorsUnsupported", ORE, TheLoop); 5574 return ElementCount::getScalable(0); 5575 } 5576 5577 if (Hints->isScalableVectorizationDisabled()) { 5578 reportVectorizationInfo("Scalable vectorization is explicitly disabled", 5579 "ScalableVectorizationDisabled", ORE, TheLoop); 5580 return ElementCount::getScalable(0); 5581 } 5582 5583 auto MaxScalableVF = ElementCount::getScalable( 5584 std::numeric_limits<ElementCount::ScalarTy>::max()); 5585 5586 // Test that the loop-vectorizer can legalize all operations for this MaxVF. 5587 // FIXME: While for scalable vectors this is currently sufficient, this should 5588 // be replaced by a more detailed mechanism that filters out specific VFs, 5589 // instead of invalidating vectorization for a whole set of VFs based on the 5590 // MaxVF. 5591 5592 // Disable scalable vectorization if the loop contains unsupported reductions. 5593 if (!canVectorizeReductions(MaxScalableVF)) { 5594 reportVectorizationInfo( 5595 "Scalable vectorization not supported for the reduction " 5596 "operations found in this loop.", 5597 "ScalableVFUnfeasible", ORE, TheLoop); 5598 return ElementCount::getScalable(0); 5599 } 5600 5601 // Disable scalable vectorization if the loop contains any instructions 5602 // with element types not supported for scalable vectors. 5603 if (any_of(ElementTypesInLoop, [&](Type *Ty) { 5604 return !Ty->isVoidTy() && 5605 !this->TTI.isElementTypeLegalForScalableVector(Ty); 5606 })) { 5607 reportVectorizationInfo("Scalable vectorization is not supported " 5608 "for all element types found in this loop.", 5609 "ScalableVFUnfeasible", ORE, TheLoop); 5610 return ElementCount::getScalable(0); 5611 } 5612 5613 if (Legal->isSafeForAnyVectorWidth()) 5614 return MaxScalableVF; 5615 5616 // Limit MaxScalableVF by the maximum safe dependence distance. 5617 Optional<unsigned> MaxVScale = TTI.getMaxVScale(); 5618 MaxScalableVF = ElementCount::getScalable( 5619 MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0); 5620 if (!MaxScalableVF) 5621 reportVectorizationInfo( 5622 "Max legal vector width too small, scalable vectorization " 5623 "unfeasible.", 5624 "ScalableVFUnfeasible", ORE, TheLoop); 5625 5626 return MaxScalableVF; 5627 } 5628 5629 FixedScalableVFPair 5630 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount, 5631 ElementCount UserVF) { 5632 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); 5633 unsigned SmallestType, WidestType; 5634 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); 5635 5636 // Get the maximum safe dependence distance in bits computed by LAA. 5637 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from 5638 // the memory accesses that is most restrictive (involved in the smallest 5639 // dependence distance). 
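  // Worked example with illustrative numbers: if LAA reports a maximum safe
  // vector width of 256 bits and the widest loaded/stored type in the loop
  // is i32 (32 bits), then MaxSafeElements = PowerOf2Floor(256 / 32) = 8,
  // i.e. at most 8 elements can be processed per vector iteration without
  // violating the recorded dependence distance.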
5640 unsigned MaxSafeElements = 5641 PowerOf2Floor(Legal->getMaxSafeVectorWidthInBits() / WidestType); 5642 5643 auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements); 5644 auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements); 5645 5646 LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF 5647 << ".\n"); 5648 LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF 5649 << ".\n"); 5650 5651 // First analyze the UserVF, fall back if the UserVF should be ignored. 5652 if (UserVF) { 5653 auto MaxSafeUserVF = 5654 UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF; 5655 5656 if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) { 5657 // If `VF=vscale x N` is safe, then so is `VF=N` 5658 if (UserVF.isScalable()) 5659 return FixedScalableVFPair( 5660 ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF); 5661 else 5662 return UserVF; 5663 } 5664 5665 assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF)); 5666 5667 // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it 5668 // is better to ignore the hint and let the compiler choose a suitable VF. 5669 if (!UserVF.isScalable()) { 5670 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 5671 << " is unsafe, clamping to max safe VF=" 5672 << MaxSafeFixedVF << ".\n"); 5673 ORE->emit([&]() { 5674 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 5675 TheLoop->getStartLoc(), 5676 TheLoop->getHeader()) 5677 << "User-specified vectorization factor " 5678 << ore::NV("UserVectorizationFactor", UserVF) 5679 << " is unsafe, clamping to maximum safe vectorization factor " 5680 << ore::NV("VectorizationFactor", MaxSafeFixedVF); 5681 }); 5682 return MaxSafeFixedVF; 5683 } 5684 5685 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 5686 << " is unsafe. Ignoring scalable UserVF.\n"); 5687 ORE->emit([&]() { 5688 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 5689 TheLoop->getStartLoc(), 5690 TheLoop->getHeader()) 5691 << "User-specified vectorization factor " 5692 << ore::NV("UserVectorizationFactor", UserVF) 5693 << " is unsafe. Ignoring the hint to let the compiler pick a " 5694 "suitable VF."; 5695 }); 5696 } 5697 5698 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType 5699 << " / " << WidestType << " bits.\n"); 5700 5701 FixedScalableVFPair Result(ElementCount::getFixed(1), 5702 ElementCount::getScalable(0)); 5703 if (auto MaxVF = getMaximizedVFForTarget(ConstTripCount, SmallestType, 5704 WidestType, MaxSafeFixedVF)) 5705 Result.FixedVF = MaxVF; 5706 5707 if (auto MaxVF = getMaximizedVFForTarget(ConstTripCount, SmallestType, 5708 WidestType, MaxSafeScalableVF)) 5709 if (MaxVF.isScalable()) { 5710 Result.ScalableVF = MaxVF; 5711 LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF 5712 << "\n"); 5713 } 5714 5715 return Result; 5716 } 5717 5718 FixedScalableVFPair 5719 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { 5720 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { 5721 // TODO: It may by useful to do since it's still likely to be dynamically 5722 // uniform if the target can skip. 5723 reportVectorizationFailure( 5724 "Not inserting runtime ptr check for divergent target", 5725 "runtime pointer checks needed. 
Not enabled for divergent target", 5726 "CantVersionLoopWithDivergentTarget", ORE, TheLoop); 5727 return FixedScalableVFPair::getNone(); 5728 } 5729 5730 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); 5731 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); 5732 if (TC == 1) { 5733 reportVectorizationFailure("Single iteration (non) loop", 5734 "loop trip count is one, irrelevant for vectorization", 5735 "SingleIterationLoop", ORE, TheLoop); 5736 return FixedScalableVFPair::getNone(); 5737 } 5738 5739 switch (ScalarEpilogueStatus) { 5740 case CM_ScalarEpilogueAllowed: 5741 return computeFeasibleMaxVF(TC, UserVF); 5742 case CM_ScalarEpilogueNotAllowedUsePredicate: 5743 LLVM_FALLTHROUGH; 5744 case CM_ScalarEpilogueNotNeededUsePredicate: 5745 LLVM_DEBUG( 5746 dbgs() << "LV: vector predicate hint/switch found.\n" 5747 << "LV: Not allowing scalar epilogue, creating predicated " 5748 << "vector loop.\n"); 5749 break; 5750 case CM_ScalarEpilogueNotAllowedLowTripLoop: 5751 // fallthrough as a special case of OptForSize 5752 case CM_ScalarEpilogueNotAllowedOptSize: 5753 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize) 5754 LLVM_DEBUG( 5755 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n"); 5756 else 5757 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip " 5758 << "count.\n"); 5759 5760 // Bail if runtime checks are required, which are not good when optimising 5761 // for size. 5762 if (runtimeChecksRequired()) 5763 return FixedScalableVFPair::getNone(); 5764 5765 break; 5766 } 5767 5768 // The only loops we can vectorize without a scalar epilogue are loops with 5769 // a bottom-test and a single exiting block. We'd have to handle the fact 5770 // that not every instruction executes on the last iteration. This will 5771 // require a lane mask which varies through the vector loop body. (TODO) 5772 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) { 5773 // If there was a tail-folding hint/switch, but we can't fold the tail by 5774 // masking, fall back to a vectorization with a scalar epilogue. 5775 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { 5776 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " 5777 "scalar epilogue instead.\n"); 5778 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 5779 return computeFeasibleMaxVF(TC, UserVF); 5780 } 5781 return FixedScalableVFPair::getNone(); 5782 } 5783 5784 // Now try the tail folding. 5785 5786 // Invalidate interleave groups that require an epilogue if we can't mask 5787 // the interleave-group. 5788 if (!useMaskedInterleavedAccesses(TTI)) { 5789 assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() && 5790 "No decisions should have been taken at this point"); 5791 // Note: There is no need to invalidate any cost modeling decisions here, as 5792 // none were taken so far. 5793 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue(); 5794 } 5795 5796 FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF); 5797 // Avoid tail folding if the trip count is known to be a multiple of any VF 5798 // we chose. 5799 // FIXME: The condition below pessimises the case for fixed-width vectors, 5800 // when scalable VFs are also candidates for vectorization.
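  // Worked example with illustrative numbers: for a trip count of 128 with
  // MaxFixedVF = 8 and a user interleave count of 2, MaxVFtimesIC = 16 and
  // 128 urem 16 == 0, so no scalar tail can remain for any chosen VF and the
  // loop is vectorized without tail folding. With a trip count of 100 the
  // remainder is nonzero and the code falls through to the tail-folding and
  // scalar-epilogue decisions further down.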
5801 if (MaxFactors.FixedVF.isVector() && !MaxFactors.ScalableVF) { 5802 ElementCount MaxFixedVF = MaxFactors.FixedVF; 5803 assert((UserVF.isNonZero() || isPowerOf2_32(MaxFixedVF.getFixedValue())) && 5804 "MaxFixedVF must be a power of 2"); 5805 unsigned MaxVFtimesIC = UserIC ? MaxFixedVF.getFixedValue() * UserIC 5806 : MaxFixedVF.getFixedValue(); 5807 ScalarEvolution *SE = PSE.getSE(); 5808 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 5809 const SCEV *ExitCount = SE->getAddExpr( 5810 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 5811 const SCEV *Rem = SE->getURemExpr( 5812 SE->applyLoopGuards(ExitCount, TheLoop), 5813 SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC)); 5814 if (Rem->isZero()) { 5815 // Accept MaxFixedVF if we do not have a tail. 5816 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); 5817 return MaxFactors; 5818 } 5819 } 5820 5821 // For scalable vectors, don't use tail folding as this is currently not yet 5822 // supported. The code is likely to have ended up here if the tripcount is 5823 // low, in which case it makes sense not to use scalable vectors. 5824 if (MaxFactors.ScalableVF.isVector()) 5825 MaxFactors.ScalableVF = ElementCount::getScalable(0); 5826 5827 // If we don't know the precise trip count, or if the trip count that we 5828 // found modulo the vectorization factor is not zero, try to fold the tail 5829 // by masking. 5830 // FIXME: look for a smaller MaxVF that does divide TC rather than masking. 5831 if (Legal->prepareToFoldTailByMasking()) { 5832 FoldTailByMasking = true; 5833 return MaxFactors; 5834 } 5835 5836 // If there was a tail-folding hint/switch, but we can't fold the tail by 5837 // masking, fallback to a vectorization with a scalar epilogue. 5838 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { 5839 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " 5840 "scalar epilogue instead.\n"); 5841 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 5842 return MaxFactors; 5843 } 5844 5845 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) { 5846 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n"); 5847 return FixedScalableVFPair::getNone(); 5848 } 5849 5850 if (TC == 0) { 5851 reportVectorizationFailure( 5852 "Unable to calculate the loop count due to complex control flow", 5853 "unable to calculate the loop count due to complex control flow", 5854 "UnknownLoopCountComplexCFG", ORE, TheLoop); 5855 return FixedScalableVFPair::getNone(); 5856 } 5857 5858 reportVectorizationFailure( 5859 "Cannot optimize for size and vectorize at the same time.", 5860 "cannot optimize for size and vectorize at the same time. " 5861 "Enable vectorization of this loop with '#pragma clang loop " 5862 "vectorize(enable)' when compiling with -Os/-Oz", 5863 "NoTailLoopWithOptForSize", ORE, TheLoop); 5864 return FixedScalableVFPair::getNone(); 5865 } 5866 5867 ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( 5868 unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType, 5869 const ElementCount &MaxSafeVF) { 5870 bool ComputeScalableMaxVF = MaxSafeVF.isScalable(); 5871 TypeSize WidestRegister = TTI.getRegisterBitWidth( 5872 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector 5873 : TargetTransformInfo::RGK_FixedWidthVector); 5874 5875 // Convenience function to return the minimum of two ElementCounts. 
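  // The helper below is used to clamp the register-derived element count by
  // MaxSafeVF. Worked example with illustrative numbers: with 256-bit vector
  // registers and a widest in-loop type of i32, the initial element count is
  // PowerOf2Floor(256 / 32) = 8 lanes; if MaxSafeVF is only 4 because of a
  // dependence distance, the minimum of the two, 4, is what survives as
  // MaxVectorElementCount.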
5876 auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) { 5877 assert((LHS.isScalable() == RHS.isScalable()) && 5878 "Scalable flags must match"); 5879 return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS; 5880 }; 5881 5882 // Ensure MaxVF is a power of 2; the dependence distance bound may not be. 5883 // Note that neither WidestRegister nor WidestType need be a power of 2. 5884 auto MaxVectorElementCount = ElementCount::get( 5885 PowerOf2Floor(WidestRegister.getKnownMinSize() / WidestType), 5886 ComputeScalableMaxVF); 5887 MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF); 5888 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " 5889 << (MaxVectorElementCount * WidestType) << " bits.\n"); 5890 5891 if (!MaxVectorElementCount) { 5892 LLVM_DEBUG(dbgs() << "LV: The target has no " 5893 << (ComputeScalableMaxVF ? "scalable" : "fixed") 5894 << " vector registers.\n"); 5895 return ElementCount::getFixed(1); 5896 } 5897 5898 const auto TripCountEC = ElementCount::getFixed(ConstTripCount); 5899 if (ConstTripCount && 5900 ElementCount::isKnownLE(TripCountEC, MaxVectorElementCount) && 5901 isPowerOf2_32(ConstTripCount)) { 5902 // We need to clamp the VF to be the ConstTripCount. There is no point in 5903 // choosing a higher viable VF as done in the loop below. If 5904 // MaxVectorElementCount is scalable, we only fall back on a fixed VF when 5905 // the TC is less than or equal to the known number of lanes. 5906 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: " 5907 << ConstTripCount << "\n"); 5908 return TripCountEC; 5909 } 5910 5911 ElementCount MaxVF = MaxVectorElementCount; 5912 if (TTI.shouldMaximizeVectorBandwidth() || 5913 (MaximizeBandwidth && isScalarEpilogueAllowed())) { 5914 auto MaxVectorElementCountMaxBW = ElementCount::get( 5915 PowerOf2Floor(WidestRegister.getKnownMinSize() / SmallestType), 5916 ComputeScalableMaxVF); 5917 MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF); 5918 5919 // Collect all viable vectorization factors larger than the default MaxVF 5920 // (i.e. MaxVectorElementCount). 5921 SmallVector<ElementCount, 8> VFs; 5922 for (ElementCount VS = MaxVectorElementCount * 2; 5923 ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2) 5924 VFs.push_back(VS); 5925 5926 // For each VF calculate its register usage. 5927 auto RUs = calculateRegisterUsage(VFs); 5928 5929 // Select the largest VF which doesn't require more registers than the 5930 // target provides.
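    // Illustrative example (hypothetical numbers): if the candidate VFs are
    // {16, 32} and the register-usage estimate for VF=32 needs more vector
    // registers than TTI.getNumberOfRegisters() reports for that class, the
    // loop below rejects VF=32 and settles on VF=16, the largest candidate
    // whose per-class usage still fits.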
5931 for (int i = RUs.size() - 1; i >= 0; --i) { 5932 bool Selected = true; 5933 for (auto &pair : RUs[i].MaxLocalUsers) { 5934 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5935 if (pair.second > TargetNumRegisters) 5936 Selected = false; 5937 } 5938 if (Selected) { 5939 MaxVF = VFs[i]; 5940 break; 5941 } 5942 } 5943 if (ElementCount MinVF = 5944 TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) { 5945 if (ElementCount::isKnownLT(MaxVF, MinVF)) { 5946 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF 5947 << ") with target's minimum: " << MinVF << '\n'); 5948 MaxVF = MinVF; 5949 } 5950 } 5951 } 5952 return MaxVF; 5953 } 5954 5955 bool LoopVectorizationCostModel::isMoreProfitable( 5956 const VectorizationFactor &A, const VectorizationFactor &B) const { 5957 InstructionCost CostA = A.Cost; 5958 InstructionCost CostB = B.Cost; 5959 5960 unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop); 5961 5962 if (!A.Width.isScalable() && !B.Width.isScalable() && FoldTailByMasking && 5963 MaxTripCount) { 5964 // If we are folding the tail and the trip count is a known (possibly small) 5965 // constant, the trip count will be rounded up to an integer number of 5966 // iterations. The total cost will be PerIterationCost*ceil(TripCount/VF), 5967 // which we compare directly. When not folding the tail, the total cost will 5968 // be PerIterationCost*floor(TC/VF) + Scalar remainder cost, and so is 5969 // approximated with the per-lane cost below instead of using the tripcount 5970 // as here. 5971 auto RTCostA = CostA * divideCeil(MaxTripCount, A.Width.getFixedValue()); 5972 auto RTCostB = CostB * divideCeil(MaxTripCount, B.Width.getFixedValue()); 5973 return RTCostA < RTCostB; 5974 } 5975 5976 // When set to preferred, for now assume vscale may be larger than 1, so 5977 // that scalable vectorization is slightly favorable over fixed-width 5978 // vectorization. 5979 if (Hints->isScalableVectorizationPreferred()) 5980 if (A.Width.isScalable() && !B.Width.isScalable()) 5981 return (CostA * B.Width.getKnownMinValue()) <= 5982 (CostB * A.Width.getKnownMinValue()); 5983 5984 // To avoid the need for FP division: 5985 // (CostA / A.Width) < (CostB / B.Width) 5986 // <=> (CostA * B.Width) < (CostB * A.Width) 5987 return (CostA * B.Width.getKnownMinValue()) < 5988 (CostB * A.Width.getKnownMinValue()); 5989 } 5990 5991 VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor( 5992 const ElementCountSet &VFCandidates) { 5993 InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first; 5994 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n"); 5995 assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop"); 5996 assert(VFCandidates.count(ElementCount::getFixed(1)) && 5997 "Expected Scalar VF to be a candidate"); 5998 5999 const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost); 6000 VectorizationFactor ChosenFactor = ScalarCost; 6001 6002 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; 6003 if (ForceVectorization && VFCandidates.size() > 1) { 6004 // Ignore scalar width, because the user explicitly wants vectorization. 6005 // Initialize cost to max so that VF = 2 is, at least, chosen during cost 6006 // evaluation. 
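  // Seeding ChosenFactor.Cost with the maximal cost below means the first
  // vector candidate always compares as more profitable in isMoreProfitable,
  // so a forced vectorization never keeps the scalar factor merely because
  // no candidate beats it. Worked example of the comparison itself
  // (illustrative numbers): VF=4 with cost 20 against VF=8 with cost 36 is
  // decided by 36 * 4 = 144 versus 20 * 8 = 160, so VF=8 wins on per-lane
  // cost.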
6007 ChosenFactor.Cost = InstructionCost::getMax(); 6008 } 6009 6010 SmallVector<InstructionVFPair> InvalidCosts; 6011 for (const auto &i : VFCandidates) { 6012 // The cost for scalar VF=1 is already calculated, so ignore it. 6013 if (i.isScalar()) 6014 continue; 6015 6016 VectorizationCostTy C = expectedCost(i, &InvalidCosts); 6017 VectorizationFactor Candidate(i, C.first); 6018 LLVM_DEBUG( 6019 dbgs() << "LV: Vector loop of width " << i << " costs: " 6020 << (Candidate.Cost / Candidate.Width.getKnownMinValue()) 6021 << (i.isScalable() ? " (assuming a minimum vscale of 1)" : "") 6022 << ".\n"); 6023 6024 if (!C.second && !ForceVectorization) { 6025 LLVM_DEBUG( 6026 dbgs() << "LV: Not considering vector loop of width " << i 6027 << " because it will not generate any vector instructions.\n"); 6028 continue; 6029 } 6030 6031 // If profitable add it to ProfitableVF list. 6032 if (isMoreProfitable(Candidate, ScalarCost)) 6033 ProfitableVFs.push_back(Candidate); 6034 6035 if (isMoreProfitable(Candidate, ChosenFactor)) 6036 ChosenFactor = Candidate; 6037 } 6038 6039 // Emit a report of VFs with invalid costs in the loop. 6040 if (!InvalidCosts.empty()) { 6041 // Group the remarks per instruction, keeping the instruction order from 6042 // InvalidCosts. 6043 std::map<Instruction *, unsigned> Numbering; 6044 unsigned I = 0; 6045 for (auto &Pair : InvalidCosts) 6046 if (!Numbering.count(Pair.first)) 6047 Numbering[Pair.first] = I++; 6048 6049 // Sort the list, first on instruction(number) then on VF. 6050 llvm::sort(InvalidCosts, 6051 [&Numbering](InstructionVFPair &A, InstructionVFPair &B) { 6052 if (Numbering[A.first] != Numbering[B.first]) 6053 return Numbering[A.first] < Numbering[B.first]; 6054 ElementCountComparator ECC; 6055 return ECC(A.second, B.second); 6056 }); 6057 6058 // For a list of ordered instruction-vf pairs: 6059 // [(load, vf1), (load, vf2), (store, vf1)] 6060 // Group the instructions together to emit separate remarks for: 6061 // load (vf1, vf2) 6062 // store (vf1) 6063 auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts); 6064 auto Subset = ArrayRef<InstructionVFPair>(); 6065 do { 6066 if (Subset.empty()) 6067 Subset = Tail.take_front(1); 6068 6069 Instruction *I = Subset.front().first; 6070 6071 // If the next instruction is different, or if there are no other pairs, 6072 // emit a remark for the collated subset. e.g. 6073 // [(load, vf1), (load, vf2))] 6074 // to emit: 6075 // remark: invalid costs for 'load' at VF=(vf, vf2) 6076 if (Subset == Tail || Tail[Subset.size()].first != I) { 6077 std::string OutString; 6078 raw_string_ostream OS(OutString); 6079 assert(!Subset.empty() && "Unexpected empty range"); 6080 OS << "Instruction with invalid costs prevented vectorization at VF=("; 6081 for (auto &Pair : Subset) 6082 OS << (Pair.second == Subset.front().second ? 
"" : ", ") 6083 << Pair.second; 6084 OS << "):"; 6085 if (auto *CI = dyn_cast<CallInst>(I)) 6086 OS << " call to " << CI->getCalledFunction()->getName(); 6087 else 6088 OS << " " << I->getOpcodeName(); 6089 OS.flush(); 6090 reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I); 6091 Tail = Tail.drop_front(Subset.size()); 6092 Subset = {}; 6093 } else 6094 // Grow the subset by one element 6095 Subset = Tail.take_front(Subset.size() + 1); 6096 } while (!Tail.empty()); 6097 } 6098 6099 if (!EnableCondStoresVectorization && NumPredStores) { 6100 reportVectorizationFailure("There are conditional stores.", 6101 "store that is conditionally executed prevents vectorization", 6102 "ConditionalStore", ORE, TheLoop); 6103 ChosenFactor = ScalarCost; 6104 } 6105 6106 LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() && 6107 ChosenFactor.Cost >= ScalarCost.Cost) dbgs() 6108 << "LV: Vectorization seems to be not beneficial, " 6109 << "but was forced by a user.\n"); 6110 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n"); 6111 return ChosenFactor; 6112 } 6113 6114 bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization( 6115 const Loop &L, ElementCount VF) const { 6116 // Cross iteration phis such as reductions need special handling and are 6117 // currently unsupported. 6118 if (any_of(L.getHeader()->phis(), [&](PHINode &Phi) { 6119 return Legal->isFirstOrderRecurrence(&Phi) || 6120 Legal->isReductionVariable(&Phi); 6121 })) 6122 return false; 6123 6124 // Phis with uses outside of the loop require special handling and are 6125 // currently unsupported. 6126 for (auto &Entry : Legal->getInductionVars()) { 6127 // Look for uses of the value of the induction at the last iteration. 6128 Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch()); 6129 for (User *U : PostInc->users()) 6130 if (!L.contains(cast<Instruction>(U))) 6131 return false; 6132 // Look for uses of penultimate value of the induction. 6133 for (User *U : Entry.first->users()) 6134 if (!L.contains(cast<Instruction>(U))) 6135 return false; 6136 } 6137 6138 // Induction variables that are widened require special handling that is 6139 // currently not supported. 6140 if (any_of(Legal->getInductionVars(), [&](auto &Entry) { 6141 return !(this->isScalarAfterVectorization(Entry.first, VF) || 6142 this->isProfitableToScalarize(Entry.first, VF)); 6143 })) 6144 return false; 6145 6146 // Epilogue vectorization code has not been auditted to ensure it handles 6147 // non-latch exits properly. It may be fine, but it needs auditted and 6148 // tested. 6149 if (L.getExitingBlock() != L.getLoopLatch()) 6150 return false; 6151 6152 return true; 6153 } 6154 6155 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable( 6156 const ElementCount VF) const { 6157 // FIXME: We need a much better cost-model to take different parameters such 6158 // as register pressure, code size increase and cost of extra branches into 6159 // account. For now we apply a very crude heuristic and only consider loops 6160 // with vectorization factors larger than a certain value. 6161 // We also consider epilogue vectorization unprofitable for targets that don't 6162 // consider interleaving beneficial (eg. MVE). 
6163 if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1) 6164 return false; 6165 if (VF.getFixedValue() >= EpilogueVectorizationMinVF) 6166 return true; 6167 return false; 6168 } 6169 6170 VectorizationFactor 6171 LoopVectorizationCostModel::selectEpilogueVectorizationFactor( 6172 const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) { 6173 VectorizationFactor Result = VectorizationFactor::Disabled(); 6174 if (!EnableEpilogueVectorization) { 6175 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";); 6176 return Result; 6177 } 6178 6179 if (!isScalarEpilogueAllowed()) { 6180 LLVM_DEBUG( 6181 dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is " 6182 "allowed.\n";); 6183 return Result; 6184 } 6185 6186 // FIXME: This can be fixed for scalable vectors later, because at this stage 6187 // the LoopVectorizer will only consider vectorizing a loop with scalable 6188 // vectors when the loop has a hint to enable vectorization for a given VF. 6189 if (MainLoopVF.isScalable()) { 6190 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization for scalable vectors not " 6191 "yet supported.\n"); 6192 return Result; 6193 } 6194 6195 // Not really a cost consideration, but check for unsupported cases here to 6196 // simplify the logic. 6197 if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) { 6198 LLVM_DEBUG( 6199 dbgs() << "LEV: Unable to vectorize epilogue because the loop is " 6200 "not a supported candidate.\n";); 6201 return Result; 6202 } 6203 6204 if (EpilogueVectorizationForceVF > 1) { 6205 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";); 6206 if (LVP.hasPlanWithVFs( 6207 {MainLoopVF, ElementCount::getFixed(EpilogueVectorizationForceVF)})) 6208 return {ElementCount::getFixed(EpilogueVectorizationForceVF), 0}; 6209 else { 6210 LLVM_DEBUG( 6211 dbgs() 6212 << "LEV: Epilogue vectorization forced factor is not viable.\n";); 6213 return Result; 6214 } 6215 } 6216 6217 if (TheLoop->getHeader()->getParent()->hasOptSize() || 6218 TheLoop->getHeader()->getParent()->hasMinSize()) { 6219 LLVM_DEBUG( 6220 dbgs() 6221 << "LEV: Epilogue vectorization skipped due to opt for size.\n";); 6222 return Result; 6223 } 6224 6225 if (!isEpilogueVectorizationProfitable(MainLoopVF)) 6226 return Result; 6227 6228 for (auto &NextVF : ProfitableVFs) 6229 if (ElementCount::isKnownLT(NextVF.Width, MainLoopVF) && 6230 (Result.Width.getFixedValue() == 1 || 6231 isMoreProfitable(NextVF, Result)) && 6232 LVP.hasPlanWithVFs({MainLoopVF, NextVF.Width})) 6233 Result = NextVF; 6234 6235 if (Result != VectorizationFactor::Disabled()) 6236 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = " 6237 << Result.Width.getFixedValue() << "\n";); 6238 return Result; 6239 } 6240 6241 std::pair<unsigned, unsigned> 6242 LoopVectorizationCostModel::getSmallestAndWidestTypes() { 6243 unsigned MinWidth = -1U; 6244 unsigned MaxWidth = 8; 6245 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 6246 for (Type *T : ElementTypesInLoop) { 6247 MinWidth = std::min<unsigned>( 6248 MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize()); 6249 MaxWidth = std::max<unsigned>( 6250 MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize()); 6251 } 6252 return {MinWidth, MaxWidth}; 6253 } 6254 6255 void LoopVectorizationCostModel::collectElementTypesForWidening() { 6256 ElementTypesInLoop.clear(); 6257 // For each block. 6258 for (BasicBlock *BB : TheLoop->blocks()) { 6259 // For each instruction in the loop. 
6260 for (Instruction &I : BB->instructionsWithoutDebug()) { 6261 Type *T = I.getType(); 6262 6263 // Skip ignored values. 6264 if (ValuesToIgnore.count(&I)) 6265 continue; 6266 6267 // Only examine Loads, Stores and PHINodes. 6268 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I)) 6269 continue; 6270 6271 // Examine PHI nodes that are reduction variables. Update the type to 6272 // account for the recurrence type. 6273 if (auto *PN = dyn_cast<PHINode>(&I)) { 6274 if (!Legal->isReductionVariable(PN)) 6275 continue; 6276 const RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[PN]; 6277 if (PreferInLoopReductions || useOrderedReductions(RdxDesc) || 6278 TTI.preferInLoopReduction(RdxDesc.getOpcode(), 6279 RdxDesc.getRecurrenceType(), 6280 TargetTransformInfo::ReductionFlags())) 6281 continue; 6282 T = RdxDesc.getRecurrenceType(); 6283 } 6284 6285 // Examine the stored values. 6286 if (auto *ST = dyn_cast<StoreInst>(&I)) 6287 T = ST->getValueOperand()->getType(); 6288 6289 // Ignore loaded pointer types and stored pointer types that are not 6290 // vectorizable. 6291 // 6292 // FIXME: The check here attempts to predict whether a load or store will 6293 // be vectorized. We only know this for certain after a VF has 6294 // been selected. Here, we assume that if an access can be 6295 // vectorized, it will be. We should also look at extending this 6296 // optimization to non-pointer types. 6297 // 6298 if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) && 6299 !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I)) 6300 continue; 6301 6302 ElementTypesInLoop.insert(T); 6303 } 6304 } 6305 } 6306 6307 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, 6308 unsigned LoopCost) { 6309 // -- The interleave heuristics -- 6310 // We interleave the loop in order to expose ILP and reduce the loop overhead. 6311 // There are many micro-architectural considerations that we can't predict 6312 // at this level. For example, frontend pressure (on decode or fetch) due to 6313 // code size, or the number and capabilities of the execution ports. 6314 // 6315 // We use the following heuristics to select the interleave count: 6316 // 1. If the code has reductions, then we interleave to break the cross 6317 // iteration dependency. 6318 // 2. If the loop is really small, then we interleave to reduce the loop 6319 // overhead. 6320 // 3. We don't interleave if we think that we will spill registers to memory 6321 // due to the increased register pressure. 6322 6323 if (!isScalarEpilogueAllowed()) 6324 return 1; 6325 6326 // We used the distance for the interleave count. 6327 if (Legal->getMaxSafeDepDistBytes() != -1U) 6328 return 1; 6329 6330 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); 6331 const bool HasReductions = !Legal->getReductionVars().empty(); 6332 // Do not interleave loops with a relatively small known or estimated trip 6333 // count. But we will interleave when InterleaveSmallLoopScalarReduction is 6334 // enabled, and the code has scalar reductions(HasReductions && VF = 1), 6335 // because with the above conditions interleaving can expose ILP and break 6336 // cross iteration dependences for reductions. 
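  // Illustrative example (hypothetical values): if the best known trip count
  // is 8 and that is below TinyTripCountInterleaveThreshold, the check below
  // returns an interleave count of 1, unless
  // InterleaveSmallLoopScalarReduction is set and the loop is a scalar
  // (VF=1) reduction, in which case interleaving is still allowed so the
  // reduction's loop-carried dependence can be broken.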
6337 if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) && 6338 !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar())) 6339 return 1; 6340 6341 RegisterUsage R = calculateRegisterUsage({VF})[0]; 6342 // We divide by these constants so assume that we have at least one 6343 // instruction that uses at least one register. 6344 for (auto& pair : R.MaxLocalUsers) { 6345 pair.second = std::max(pair.second, 1U); 6346 } 6347 6348 // We calculate the interleave count using the following formula. 6349 // Subtract the number of loop invariants from the number of available 6350 // registers. These registers are used by all of the interleaved instances. 6351 // Next, divide the remaining registers by the number of registers that is 6352 // required by the loop, in order to estimate how many parallel instances 6353 // fit without causing spills. All of this is rounded down if necessary to be 6354 // a power of two. We want power of two interleave count to simplify any 6355 // addressing operations or alignment considerations. 6356 // We also want power of two interleave counts to ensure that the induction 6357 // variable of the vector loop wraps to zero, when tail is folded by masking; 6358 // this currently happens when OptForSize, in which case IC is set to 1 above. 6359 unsigned IC = UINT_MAX; 6360 6361 for (auto& pair : R.MaxLocalUsers) { 6362 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 6363 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters 6364 << " registers of " 6365 << TTI.getRegisterClassName(pair.first) << " register class\n"); 6366 if (VF.isScalar()) { 6367 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) 6368 TargetNumRegisters = ForceTargetNumScalarRegs; 6369 } else { 6370 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) 6371 TargetNumRegisters = ForceTargetNumVectorRegs; 6372 } 6373 unsigned MaxLocalUsers = pair.second; 6374 unsigned LoopInvariantRegs = 0; 6375 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) 6376 LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; 6377 6378 unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers); 6379 // Don't count the induction variable as interleaved. 6380 if (EnableIndVarRegisterHeur) { 6381 TmpIC = 6382 PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) / 6383 std::max(1U, (MaxLocalUsers - 1))); 6384 } 6385 6386 IC = std::min(IC, TmpIC); 6387 } 6388 6389 // Clamp the interleave ranges to reasonable counts. 6390 unsigned MaxInterleaveCount = 6391 TTI.getMaxInterleaveFactor(VF.getKnownMinValue()); 6392 6393 // Check if the user has overridden the max. 6394 if (VF.isScalar()) { 6395 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) 6396 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; 6397 } else { 6398 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) 6399 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; 6400 } 6401 6402 // If trip count is known or estimated compile time constant, limit the 6403 // interleave count to be less than the trip count divided by VF, provided it 6404 // is at least 1. 6405 // 6406 // For scalable vectors we can't know if interleaving is beneficial. It may 6407 // not be beneficial for small loops if none of the lanes in the second vector 6408 // iterations is enabled. However, for larger loops, there is likely to be a 6409 // similar benefit as for fixed-width vectors. 
For now, we choose to leave 6410 // the InterleaveCount as if vscale is '1', although if some information about 6411 // the vector is known (e.g. min vector size), we can make a better decision. 6412 if (BestKnownTC) { 6413 MaxInterleaveCount = 6414 std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount); 6415 // Make sure MaxInterleaveCount is greater than 0. 6416 MaxInterleaveCount = std::max(1u, MaxInterleaveCount); 6417 } 6418 6419 assert(MaxInterleaveCount > 0 && 6420 "Maximum interleave count must be greater than 0"); 6421 6422 // Clamp the calculated IC to be between the 1 and the max interleave count 6423 // that the target and trip count allows. 6424 if (IC > MaxInterleaveCount) 6425 IC = MaxInterleaveCount; 6426 else 6427 // Make sure IC is greater than 0. 6428 IC = std::max(1u, IC); 6429 6430 assert(IC > 0 && "Interleave count must be greater than 0."); 6431 6432 // If we did not calculate the cost for VF (because the user selected the VF) 6433 // then we calculate the cost of VF here. 6434 if (LoopCost == 0) { 6435 InstructionCost C = expectedCost(VF).first; 6436 assert(C.isValid() && "Expected to have chosen a VF with valid cost"); 6437 LoopCost = *C.getValue(); 6438 } 6439 6440 assert(LoopCost && "Non-zero loop cost expected"); 6441 6442 // Interleave if we vectorized this loop and there is a reduction that could 6443 // benefit from interleaving. 6444 if (VF.isVector() && HasReductions) { 6445 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); 6446 return IC; 6447 } 6448 6449 // Note that if we've already vectorized the loop we will have done the 6450 // runtime check and so interleaving won't require further checks. 6451 bool InterleavingRequiresRuntimePointerCheck = 6452 (VF.isScalar() && Legal->getRuntimePointerChecking()->Need); 6453 6454 // We want to interleave small loops in order to reduce the loop overhead and 6455 // potentially expose ILP opportunities. 6456 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n' 6457 << "LV: IC is " << IC << '\n' 6458 << "LV: VF is " << VF << '\n'); 6459 const bool AggressivelyInterleaveReductions = 6460 TTI.enableAggressiveInterleaving(HasReductions); 6461 if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) { 6462 // We assume that the cost overhead is 1 and we use the cost model 6463 // to estimate the cost of the loop and interleave until the cost of the 6464 // loop overhead is about 5% of the cost of the loop. 6465 unsigned SmallIC = 6466 std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost)); 6467 6468 // Interleave until store/load ports (estimated by max interleave count) are 6469 // saturated. 6470 unsigned NumStores = Legal->getNumStores(); 6471 unsigned NumLoads = Legal->getNumLoads(); 6472 unsigned StoresIC = IC / (NumStores ? NumStores : 1); 6473 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1); 6474 6475 // If we have a scalar reduction (vector reductions are already dealt with 6476 // by this point), we can increase the critical path length if the loop 6477 // we're interleaving is inside another loop. For tree-wise reductions 6478 // set the limit to 2, and for ordered reductions it's best to disable 6479 // interleaving entirely. 
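    // Worked example with illustrative numbers: with IC = 8, a loop cost of
    // 4 and SmallLoopCost = 20, SmallIC = min(8, PowerOf2Floor(20 / 4)) = 4;
    // with 2 stores and 1 load in the loop, StoresIC = 8 / 2 = 4 and
    // LoadsIC = 8 / 1 = 8. The nested-reduction handling below may clamp all
    // three further, and ordered reductions give up on interleaving
    // altogether.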
6480 if (HasReductions && TheLoop->getLoopDepth() > 1) { 6481 bool HasOrderedReductions = 6482 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 6483 const RecurrenceDescriptor &RdxDesc = Reduction.second; 6484 return RdxDesc.isOrdered(); 6485 }); 6486 if (HasOrderedReductions) { 6487 LLVM_DEBUG( 6488 dbgs() << "LV: Not interleaving scalar ordered reductions.\n"); 6489 return 1; 6490 } 6491 6492 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC); 6493 SmallIC = std::min(SmallIC, F); 6494 StoresIC = std::min(StoresIC, F); 6495 LoadsIC = std::min(LoadsIC, F); 6496 } 6497 6498 if (EnableLoadStoreRuntimeInterleave && 6499 std::max(StoresIC, LoadsIC) > SmallIC) { 6500 LLVM_DEBUG( 6501 dbgs() << "LV: Interleaving to saturate store or load ports.\n"); 6502 return std::max(StoresIC, LoadsIC); 6503 } 6504 6505 // If there are scalar reductions and TTI has enabled aggressive 6506 // interleaving for reductions, we will interleave to expose ILP. 6507 if (InterleaveSmallLoopScalarReduction && VF.isScalar() && 6508 AggressivelyInterleaveReductions) { 6509 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 6510 // Interleave no less than SmallIC but not as aggressive as the normal IC 6511 // to satisfy the rare situation when resources are too limited. 6512 return std::max(IC / 2, SmallIC); 6513 } else { 6514 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n"); 6515 return SmallIC; 6516 } 6517 } 6518 6519 // Interleave if this is a large loop (small loops are already dealt with by 6520 // this point) that could benefit from interleaving. 6521 if (AggressivelyInterleaveReductions) { 6522 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 6523 return IC; 6524 } 6525 6526 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n"); 6527 return 1; 6528 } 6529 6530 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8> 6531 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) { 6532 // This function calculates the register usage by measuring the highest number 6533 // of values that are alive at a single location. Obviously, this is a very 6534 // rough estimation. We scan the loop in a topological order in order and 6535 // assign a number to each instruction. We use RPO to ensure that defs are 6536 // met before their users. We assume that each instruction that has in-loop 6537 // users starts an interval. We record every time that an in-loop value is 6538 // used, so we have a list of the first and last occurrences of each 6539 // instruction. Next, we transpose this data structure into a multi map that 6540 // holds the list of intervals that *end* at a specific location. This multi 6541 // map allows us to perform a linear search. We scan the instructions linearly 6542 // and record each time that a new interval starts, by placing it in a set. 6543 // If we find this value in the multi-map then we remove it from the set. 6544 // The max register usage is the maximum size of the set. 6545 // We also search for instructions that are defined outside the loop, but are 6546 // used inside the loop. We need this number separately from the max-interval 6547 // usage number because when we unroll, loop-invariant values do not take 6548 // more register. 6549 LoopBlocksDFS DFS(TheLoop); 6550 DFS.perform(LI); 6551 6552 RegisterUsage RU; 6553 6554 // Each 'key' in the map opens a new interval. The values 6555 // of the map are the index of the 'last seen' usage of the 6556 // instruction that is the key. 
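  // Illustrative example (hypothetical IR): for the straight-line sequence
  //   0: %x = load ...
  //   1: %y = add %x, 1
  //   2: %z = mul %x, %y
  // both %x and %y are still live while the mul at index 2 is visited, so
  // two intervals are open at once; the maximum number of simultaneously
  // open intervals is the rough per-class register estimate recorded below.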
6557 using IntervalMap = DenseMap<Instruction *, unsigned>; 6558 6559 // Maps instruction to its index. 6560 SmallVector<Instruction *, 64> IdxToInstr; 6561 // Marks the end of each interval. 6562 IntervalMap EndPoint; 6563 // Saves the list of instruction indices that are used in the loop. 6564 SmallPtrSet<Instruction *, 8> Ends; 6565 // Saves the list of values that are used in the loop but are 6566 // defined outside the loop, such as arguments and constants. 6567 SmallPtrSet<Value *, 8> LoopInvariants; 6568 6569 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 6570 for (Instruction &I : BB->instructionsWithoutDebug()) { 6571 IdxToInstr.push_back(&I); 6572 6573 // Save the end location of each USE. 6574 for (Value *U : I.operands()) { 6575 auto *Instr = dyn_cast<Instruction>(U); 6576 6577 // Ignore non-instruction values such as arguments, constants, etc. 6578 if (!Instr) 6579 continue; 6580 6581 // If this instruction is outside the loop then record it and continue. 6582 if (!TheLoop->contains(Instr)) { 6583 LoopInvariants.insert(Instr); 6584 continue; 6585 } 6586 6587 // Overwrite previous end points. 6588 EndPoint[Instr] = IdxToInstr.size(); 6589 Ends.insert(Instr); 6590 } 6591 } 6592 } 6593 6594 // Saves the list of intervals that end with the index in 'key'. 6595 using InstrList = SmallVector<Instruction *, 2>; 6596 DenseMap<unsigned, InstrList> TransposeEnds; 6597 6598 // Transpose the EndPoints to a list of values that end at each index. 6599 for (auto &Interval : EndPoint) 6600 TransposeEnds[Interval.second].push_back(Interval.first); 6601 6602 SmallPtrSet<Instruction *, 8> OpenIntervals; 6603 SmallVector<RegisterUsage, 8> RUs(VFs.size()); 6604 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size()); 6605 6606 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); 6607 6608 // A lambda that gets the register usage for the given type and VF. 6609 const auto &TTICapture = TTI; 6610 auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned { 6611 if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty)) 6612 return 0; 6613 InstructionCost::CostType RegUsage = 6614 *TTICapture.getRegUsageForType(VectorType::get(Ty, VF)).getValue(); 6615 assert(RegUsage >= 0 && RegUsage <= std::numeric_limits<unsigned>::max() && 6616 "Nonsensical values for register usage."); 6617 return RegUsage; 6618 }; 6619 6620 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { 6621 Instruction *I = IdxToInstr[i]; 6622 6623 // Remove all of the instructions that end at this location. 6624 InstrList &List = TransposeEnds[i]; 6625 for (Instruction *ToRemove : List) 6626 OpenIntervals.erase(ToRemove); 6627 6628 // Ignore instructions that are never used within the loop. 6629 if (!Ends.count(I)) 6630 continue; 6631 6632 // Skip ignored values. 6633 if (ValuesToIgnore.count(I)) 6634 continue; 6635 6636 // For each VF find the maximum usage of registers. 6637 for (unsigned j = 0, e = VFs.size(); j < e; ++j) { 6638 // Count the number of live intervals. 6639 SmallMapVector<unsigned, unsigned, 4> RegUsage; 6640 6641 if (VFs[j].isScalar()) { 6642 for (auto Inst : OpenIntervals) { 6643 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6644 if (RegUsage.find(ClassID) == RegUsage.end()) 6645 RegUsage[ClassID] = 1; 6646 else 6647 RegUsage[ClassID] += 1; 6648 } 6649 } else { 6650 collectUniformsAndScalars(VFs[j]); 6651 for (auto Inst : OpenIntervals) { 6652 // Skip ignored values for VF > 1. 
6653 if (VecValuesToIgnore.count(Inst)) 6654 continue; 6655 if (isScalarAfterVectorization(Inst, VFs[j])) { 6656 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6657 if (RegUsage.find(ClassID) == RegUsage.end()) 6658 RegUsage[ClassID] = 1; 6659 else 6660 RegUsage[ClassID] += 1; 6661 } else { 6662 unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType()); 6663 if (RegUsage.find(ClassID) == RegUsage.end()) 6664 RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]); 6665 else 6666 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); 6667 } 6668 } 6669 } 6670 6671 for (auto& pair : RegUsage) { 6672 if (MaxUsages[j].find(pair.first) != MaxUsages[j].end()) 6673 MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second); 6674 else 6675 MaxUsages[j][pair.first] = pair.second; 6676 } 6677 } 6678 6679 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " 6680 << OpenIntervals.size() << '\n'); 6681 6682 // Add the current instruction to the list of open intervals. 6683 OpenIntervals.insert(I); 6684 } 6685 6686 for (unsigned i = 0, e = VFs.size(); i < e; ++i) { 6687 SmallMapVector<unsigned, unsigned, 4> Invariant; 6688 6689 for (auto Inst : LoopInvariants) { 6690 unsigned Usage = 6691 VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]); 6692 unsigned ClassID = 6693 TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType()); 6694 if (Invariant.find(ClassID) == Invariant.end()) 6695 Invariant[ClassID] = Usage; 6696 else 6697 Invariant[ClassID] += Usage; 6698 } 6699 6700 LLVM_DEBUG({ 6701 dbgs() << "LV(REG): VF = " << VFs[i] << '\n'; 6702 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size() 6703 << " item\n"; 6704 for (const auto &pair : MaxUsages[i]) { 6705 dbgs() << "LV(REG): RegisterClass: " 6706 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6707 << " registers\n"; 6708 } 6709 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size() 6710 << " item\n"; 6711 for (const auto &pair : Invariant) { 6712 dbgs() << "LV(REG): RegisterClass: " 6713 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6714 << " registers\n"; 6715 } 6716 }); 6717 6718 RU.LoopInvariantRegs = Invariant; 6719 RU.MaxLocalUsers = MaxUsages[i]; 6720 RUs[i] = RU; 6721 } 6722 6723 return RUs; 6724 } 6725 6726 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){ 6727 // TODO: Cost model for emulated masked load/store is completely 6728 // broken. This hack guides the cost model to use an artificially 6729 // high enough value to practically disable vectorization with such 6730 // operations, except where previously deployed legality hack allowed 6731 // using very low cost values. This is to avoid regressions coming simply 6732 // from moving "masked load/store" check from legality to cost model. 6733 // Masked Load/Gather emulation was previously never allowed. 6734 // Limited number of Masked Store/Scatter emulation was allowed. 6735 assert(isPredicatedInst(I) && 6736 "Expecting a scalar emulated instruction"); 6737 return isa<LoadInst>(I) || 6738 (isa<StoreInst>(I) && 6739 NumPredStores > NumberOfStoresToPredicate); 6740 } 6741 6742 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) { 6743 // If we aren't vectorizing the loop, or if we've already collected the 6744 // instructions to scalarize, there's nothing to do. Collection may already 6745 // have occurred if we have a user-selected VF and are now computing the 6746 // expected cost for interleaving. 
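  // Illustrative example (hypothetical source, not a test case): for
  //   if (c[i]) a[i] = a[i] / b[i];
  // the division is scalar-with-predication, and computePredInstDiscount
  // below decides whether the single-use chain feeding it is cheaper kept as
  // scalar, predicated code than as if-converted vector code; if so, those
  // instructions are recorded in InstsToScalarize[VF].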
6747 if (VF.isScalar() || VF.isZero() || 6748 InstsToScalarize.find(VF) != InstsToScalarize.end()) 6749 return; 6750 6751 // Initialize a mapping for VF in InstsToScalalarize. If we find that it's 6752 // not profitable to scalarize any instructions, the presence of VF in the 6753 // map will indicate that we've analyzed it already. 6754 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF]; 6755 6756 // Find all the instructions that are scalar with predication in the loop and 6757 // determine if it would be better to not if-convert the blocks they are in. 6758 // If so, we also record the instructions to scalarize. 6759 for (BasicBlock *BB : TheLoop->blocks()) { 6760 if (!blockNeedsPredication(BB)) 6761 continue; 6762 for (Instruction &I : *BB) 6763 if (isScalarWithPredication(&I)) { 6764 ScalarCostsTy ScalarCosts; 6765 // Do not apply discount if scalable, because that would lead to 6766 // invalid scalarization costs. 6767 // Do not apply discount logic if hacked cost is needed 6768 // for emulated masked memrefs. 6769 if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I) && 6770 computePredInstDiscount(&I, ScalarCosts, VF) >= 0) 6771 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end()); 6772 // Remember that BB will remain after vectorization. 6773 PredicatedBBsAfterVectorization.insert(BB); 6774 } 6775 } 6776 } 6777 6778 int LoopVectorizationCostModel::computePredInstDiscount( 6779 Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) { 6780 assert(!isUniformAfterVectorization(PredInst, VF) && 6781 "Instruction marked uniform-after-vectorization will be predicated"); 6782 6783 // Initialize the discount to zero, meaning that the scalar version and the 6784 // vector version cost the same. 6785 InstructionCost Discount = 0; 6786 6787 // Holds instructions to analyze. The instructions we visit are mapped in 6788 // ScalarCosts. Those instructions are the ones that would be scalarized if 6789 // we find that the scalar version costs less. 6790 SmallVector<Instruction *, 8> Worklist; 6791 6792 // Returns true if the given instruction can be scalarized. 6793 auto canBeScalarized = [&](Instruction *I) -> bool { 6794 // We only attempt to scalarize instructions forming a single-use chain 6795 // from the original predicated block that would otherwise be vectorized. 6796 // Although not strictly necessary, we give up on instructions we know will 6797 // already be scalar to avoid traversing chains that are unlikely to be 6798 // beneficial. 6799 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() || 6800 isScalarAfterVectorization(I, VF)) 6801 return false; 6802 6803 // If the instruction is scalar with predication, it will be analyzed 6804 // separately. We ignore it within the context of PredInst. 6805 if (isScalarWithPredication(I)) 6806 return false; 6807 6808 // If any of the instruction's operands are uniform after vectorization, 6809 // the instruction cannot be scalarized. This prevents, for example, a 6810 // masked load from being scalarized. 6811 // 6812 // We assume we will only emit a value for lane zero of an instruction 6813 // marked uniform after vectorization, rather than VF identical values. 6814 // Thus, if we scalarize an instruction that uses a uniform, we would 6815 // create uses of values corresponding to the lanes we aren't emitting code 6816 // for. This behavior can be changed by allowing getScalarValue to clone 6817 // the lane zero values for uniforms rather than asserting. 
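    // Illustrative example (hypothetical): if %addr is uniform after
    // vectorization, only its lane-0 value is materialized; scalarizing a
    // user of %addr would then require lanes 1..VF-1 that were never
    // created, which is exactly what the operand check below rejects.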
6818 for (Use &U : I->operands()) 6819 if (auto *J = dyn_cast<Instruction>(U.get())) 6820 if (isUniformAfterVectorization(J, VF)) 6821 return false; 6822 6823 // Otherwise, we can scalarize the instruction. 6824 return true; 6825 }; 6826 6827 // Compute the expected cost discount from scalarizing the entire expression 6828 // feeding the predicated instruction. We currently only consider expressions 6829 // that are single-use instruction chains. 6830 Worklist.push_back(PredInst); 6831 while (!Worklist.empty()) { 6832 Instruction *I = Worklist.pop_back_val(); 6833 6834 // If we've already analyzed the instruction, there's nothing to do. 6835 if (ScalarCosts.find(I) != ScalarCosts.end()) 6836 continue; 6837 6838 // Compute the cost of the vector instruction. Note that this cost already 6839 // includes the scalarization overhead of the predicated instruction. 6840 InstructionCost VectorCost = getInstructionCost(I, VF).first; 6841 6842 // Compute the cost of the scalarized instruction. This cost is the cost of 6843 // the instruction as if it wasn't if-converted and instead remained in the 6844 // predicated block. We will scale this cost by block probability after 6845 // computing the scalarization overhead. 6846 InstructionCost ScalarCost = 6847 VF.getFixedValue() * 6848 getInstructionCost(I, ElementCount::getFixed(1)).first; 6849 6850 // Compute the scalarization overhead of needed insertelement instructions 6851 // and phi nodes. 6852 if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) { 6853 ScalarCost += TTI.getScalarizationOverhead( 6854 cast<VectorType>(ToVectorTy(I->getType(), VF)), 6855 APInt::getAllOnesValue(VF.getFixedValue()), true, false); 6856 ScalarCost += 6857 VF.getFixedValue() * 6858 TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput); 6859 } 6860 6861 // Compute the scalarization overhead of needed extractelement 6862 // instructions. For each of the instruction's operands, if the operand can 6863 // be scalarized, add it to the worklist; otherwise, account for the 6864 // overhead. 6865 for (Use &U : I->operands()) 6866 if (auto *J = dyn_cast<Instruction>(U.get())) { 6867 assert(VectorType::isValidElementType(J->getType()) && 6868 "Instruction has non-scalar type"); 6869 if (canBeScalarized(J)) 6870 Worklist.push_back(J); 6871 else if (needsExtract(J, VF)) { 6872 ScalarCost += TTI.getScalarizationOverhead( 6873 cast<VectorType>(ToVectorTy(J->getType(), VF)), 6874 APInt::getAllOnesValue(VF.getFixedValue()), false, true); 6875 } 6876 } 6877 6878 // Scale the total scalar cost by block probability. 6879 ScalarCost /= getReciprocalPredBlockProb(); 6880 6881 // Compute the discount. A non-negative discount means the vector version 6882 // of the instruction costs more, and scalarizing would be beneficial. 6883 Discount += VectorCost - ScalarCost; 6884 ScalarCosts[I] = ScalarCost; 6885 } 6886 6887 return *Discount.getValue(); 6888 } 6889 6890 LoopVectorizationCostModel::VectorizationCostTy 6891 LoopVectorizationCostModel::expectedCost( 6892 ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) { 6893 VectorizationCostTy Cost; 6894 6895 // For each block. 6896 for (BasicBlock *BB : TheLoop->blocks()) { 6897 VectorizationCostTy BlockCost; 6898 6899 // For each instruction in the old loop. 6900 for (Instruction &I : BB->instructionsWithoutDebug()) { 6901 // Skip ignored values. 
6902 if (ValuesToIgnore.count(&I) || 6903 (VF.isVector() && VecValuesToIgnore.count(&I))) 6904 continue; 6905 6906 VectorizationCostTy C = getInstructionCost(&I, VF); 6907 6908 // Check if we should override the cost. 6909 if (C.first.isValid() && 6910 ForceTargetInstructionCost.getNumOccurrences() > 0) 6911 C.first = InstructionCost(ForceTargetInstructionCost); 6912 6913 // Keep a list of instructions with invalid costs. 6914 if (Invalid && !C.first.isValid()) 6915 Invalid->emplace_back(&I, VF); 6916 6917 BlockCost.first += C.first; 6918 BlockCost.second |= C.second; 6919 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first 6920 << " for VF " << VF << " For instruction: " << I 6921 << '\n'); 6922 } 6923 6924 // If we are vectorizing a predicated block, it will have been 6925 // if-converted. This means that the block's instructions (aside from 6926 // stores and instructions that may divide by zero) will now be 6927 // unconditionally executed. For the scalar case, we may not always execute 6928 // the predicated block, if it is an if-else block. Thus, scale the block's 6929 // cost by the probability of executing it. blockNeedsPredication from 6930 // Legal is used so as to not include all blocks in tail folded loops. 6931 if (VF.isScalar() && Legal->blockNeedsPredication(BB)) 6932 BlockCost.first /= getReciprocalPredBlockProb(); 6933 6934 Cost.first += BlockCost.first; 6935 Cost.second |= BlockCost.second; 6936 } 6937 6938 return Cost; 6939 } 6940 6941 /// Gets Address Access SCEV after verifying that the access pattern 6942 /// is loop invariant except the induction variable dependence. 6943 /// 6944 /// This SCEV can be sent to the Target in order to estimate the address 6945 /// calculation cost. 6946 static const SCEV *getAddressAccessSCEV( 6947 Value *Ptr, 6948 LoopVectorizationLegality *Legal, 6949 PredicatedScalarEvolution &PSE, 6950 const Loop *TheLoop) { 6951 6952 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); 6953 if (!Gep) 6954 return nullptr; 6955 6956 // We are looking for a gep with all loop invariant indices except for one 6957 // which should be an induction variable. 6958 auto SE = PSE.getSE(); 6959 unsigned NumOperands = Gep->getNumOperands(); 6960 for (unsigned i = 1; i < NumOperands; ++i) { 6961 Value *Opd = Gep->getOperand(i); 6962 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 6963 !Legal->isInductionVariable(Opd)) 6964 return nullptr; 6965 } 6966 6967 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. 6968 return PSE.getSCEV(Ptr); 6969 } 6970 6971 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { 6972 return Legal->hasStride(I->getOperand(0)) || 6973 Legal->hasStride(I->getOperand(1)); 6974 } 6975 6976 InstructionCost 6977 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, 6978 ElementCount VF) { 6979 assert(VF.isVector() && 6980 "Scalarization cost of instruction implies vectorization."); 6981 if (VF.isScalable()) 6982 return InstructionCost::getInvalid(); 6983 6984 Type *ValTy = getLoadStoreType(I); 6985 auto SE = PSE.getSE(); 6986 6987 unsigned AS = getLoadStoreAddressSpace(I); 6988 Value *Ptr = getLoadStorePointerOperand(I); 6989 Type *PtrTy = ToVectorTy(Ptr->getType(), VF); 6990 6991 // Figure out whether the access is strided and get the stride value 6992 // if it's known in compile time 6993 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); 6994 6995 // Get the cost of the scalar memory instruction and address computation. 
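  // Rough sketch of the total computed below (approximate, not an exact
  // formula):
  //   Cost ~= VF * (address computation + scalar memory op)
  //           + insert/extract overhead from getScalarizationOverhead(I, VF)
  // For a predicated access, this sum is then divided by the reciprocal block
  // probability before the i1 extract and branch costs are added.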
6996 InstructionCost Cost = 6997 VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); 6998 6999 // Don't pass *I here, since it is scalar but will actually be part of a 7000 // vectorized loop where the user of it is a vectorized instruction. 7001 const Align Alignment = getLoadStoreAlignment(I); 7002 Cost += VF.getKnownMinValue() * 7003 TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, 7004 AS, TTI::TCK_RecipThroughput); 7005 7006 // Get the overhead of the extractelement and insertelement instructions 7007 // we might create due to scalarization. 7008 Cost += getScalarizationOverhead(I, VF); 7009 7010 // If we have a predicated load/store, it will need extra i1 extracts and 7011 // conditional branches, but may not be executed for each vector lane. Scale 7012 // the cost by the probability of executing the predicated block. 7013 if (isPredicatedInst(I)) { 7014 Cost /= getReciprocalPredBlockProb(); 7015 7016 // Add the cost of an i1 extract and a branch 7017 auto *Vec_i1Ty = 7018 VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF); 7019 Cost += TTI.getScalarizationOverhead( 7020 Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()), 7021 /*Insert=*/false, /*Extract=*/true); 7022 Cost += TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput); 7023 7024 if (useEmulatedMaskMemRefHack(I)) 7025 // Artificially setting to a high enough value to practically disable 7026 // vectorization with such operations. 7027 Cost = 3000000; 7028 } 7029 7030 return Cost; 7031 } 7032 7033 InstructionCost 7034 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, 7035 ElementCount VF) { 7036 Type *ValTy = getLoadStoreType(I); 7037 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 7038 Value *Ptr = getLoadStorePointerOperand(I); 7039 unsigned AS = getLoadStoreAddressSpace(I); 7040 int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); 7041 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 7042 7043 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 7044 "Stride should be 1 or -1 for consecutive memory access"); 7045 const Align Alignment = getLoadStoreAlignment(I); 7046 InstructionCost Cost = 0; 7047 if (Legal->isMaskRequired(I)) 7048 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 7049 CostKind); 7050 else 7051 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 7052 CostKind, I); 7053 7054 bool Reverse = ConsecutiveStride < 0; 7055 if (Reverse) 7056 Cost += 7057 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0); 7058 return Cost; 7059 } 7060 7061 InstructionCost 7062 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, 7063 ElementCount VF) { 7064 assert(Legal->isUniformMemOp(*I)); 7065 7066 Type *ValTy = getLoadStoreType(I); 7067 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 7068 const Align Alignment = getLoadStoreAlignment(I); 7069 unsigned AS = getLoadStoreAddressSpace(I); 7070 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 7071 if (isa<LoadInst>(I)) { 7072 return TTI.getAddressComputationCost(ValTy) + 7073 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS, 7074 CostKind) + 7075 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); 7076 } 7077 StoreInst *SI = cast<StoreInst>(I); 7078 7079 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); 7080 return TTI.getAddressComputationCost(ValTy) + 7081 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, 
7082 CostKind) + 7083 (isLoopInvariantStoreValue 7084 ? 0 7085 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, 7086 VF.getKnownMinValue() - 1)); 7087 } 7088 7089 InstructionCost 7090 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, 7091 ElementCount VF) { 7092 Type *ValTy = getLoadStoreType(I); 7093 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 7094 const Align Alignment = getLoadStoreAlignment(I); 7095 const Value *Ptr = getLoadStorePointerOperand(I); 7096 7097 return TTI.getAddressComputationCost(VectorTy) + 7098 TTI.getGatherScatterOpCost( 7099 I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment, 7100 TargetTransformInfo::TCK_RecipThroughput, I); 7101 } 7102 7103 InstructionCost 7104 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, 7105 ElementCount VF) { 7106 // TODO: Once we have support for interleaving with scalable vectors 7107 // we can calculate the cost properly here. 7108 if (VF.isScalable()) 7109 return InstructionCost::getInvalid(); 7110 7111 Type *ValTy = getLoadStoreType(I); 7112 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 7113 unsigned AS = getLoadStoreAddressSpace(I); 7114 7115 auto Group = getInterleavedAccessGroup(I); 7116 assert(Group && "Fail to get an interleaved access group."); 7117 7118 unsigned InterleaveFactor = Group->getFactor(); 7119 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); 7120 7121 // Holds the indices of existing members in an interleaved load group. 7122 // An interleaved store group doesn't need this as it doesn't allow gaps. 7123 SmallVector<unsigned, 4> Indices; 7124 if (isa<LoadInst>(I)) { 7125 for (unsigned i = 0; i < InterleaveFactor; i++) 7126 if (Group->getMember(i)) 7127 Indices.push_back(i); 7128 } 7129 7130 // Calculate the cost of the whole interleaved group. 7131 bool UseMaskForGaps = 7132 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); 7133 InstructionCost Cost = TTI.getInterleavedMemoryOpCost( 7134 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(), 7135 AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps); 7136 7137 if (Group->isReverse()) { 7138 // TODO: Add support for reversed masked interleaved access. 7139 assert(!Legal->isMaskRequired(I) && 7140 "Reverse masked interleaved access not supported."); 7141 Cost += 7142 Group->getNumMembers() * 7143 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0); 7144 } 7145 return Cost; 7146 } 7147 7148 Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost( 7149 Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) { 7150 using namespace llvm::PatternMatch; 7151 // Early exit for no inloop reductions 7152 if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty)) 7153 return None; 7154 auto *VectorTy = cast<VectorType>(Ty); 7155 7156 // We are looking for a pattern of, and finding the minimal acceptable cost: 7157 // reduce(mul(ext(A), ext(B))) or 7158 // reduce(mul(A, B)) or 7159 // reduce(ext(A)) or 7160 // reduce(A). 7161 // The basic idea is that we walk down the tree to do that, finding the root 7162 // reduction instruction in InLoopReductionImmediateChains. From there we find 7163 // the pattern of mul/ext and test the cost of the entire pattern vs the cost 7164 // of the components. If the reduction cost is lower then we return it for the 7165 // reduction instruction and 0 for the other instructions in the pattern. 
If 7166 // it is not we return an invalid cost specifying the orignal cost method 7167 // should be used. 7168 Instruction *RetI = I; 7169 if (match(RetI, m_ZExtOrSExt(m_Value()))) { 7170 if (!RetI->hasOneUser()) 7171 return None; 7172 RetI = RetI->user_back(); 7173 } 7174 if (match(RetI, m_Mul(m_Value(), m_Value())) && 7175 RetI->user_back()->getOpcode() == Instruction::Add) { 7176 if (!RetI->hasOneUser()) 7177 return None; 7178 RetI = RetI->user_back(); 7179 } 7180 7181 // Test if the found instruction is a reduction, and if not return an invalid 7182 // cost specifying the parent to use the original cost modelling. 7183 if (!InLoopReductionImmediateChains.count(RetI)) 7184 return None; 7185 7186 // Find the reduction this chain is a part of and calculate the basic cost of 7187 // the reduction on its own. 7188 Instruction *LastChain = InLoopReductionImmediateChains[RetI]; 7189 Instruction *ReductionPhi = LastChain; 7190 while (!isa<PHINode>(ReductionPhi)) 7191 ReductionPhi = InLoopReductionImmediateChains[ReductionPhi]; 7192 7193 const RecurrenceDescriptor &RdxDesc = 7194 Legal->getReductionVars()[cast<PHINode>(ReductionPhi)]; 7195 7196 InstructionCost BaseCost = TTI.getArithmeticReductionCost( 7197 RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind); 7198 7199 // If we're using ordered reductions then we can just return the base cost 7200 // here, since getArithmeticReductionCost calculates the full ordered 7201 // reduction cost when FP reassociation is not allowed. 7202 if (useOrderedReductions(RdxDesc)) 7203 return BaseCost; 7204 7205 // Get the operand that was not the reduction chain and match it to one of the 7206 // patterns, returning the better cost if it is found. 7207 Instruction *RedOp = RetI->getOperand(1) == LastChain 7208 ? dyn_cast<Instruction>(RetI->getOperand(0)) 7209 : dyn_cast<Instruction>(RetI->getOperand(1)); 7210 7211 VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy); 7212 7213 Instruction *Op0, *Op1; 7214 if (RedOp && 7215 match(RedOp, 7216 m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) && 7217 match(Op0, m_ZExtOrSExt(m_Value())) && 7218 Op0->getOpcode() == Op1->getOpcode() && 7219 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() && 7220 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) && 7221 (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) { 7222 7223 // Matched reduce(ext(mul(ext(A), ext(B))) 7224 // Note that the extend opcodes need to all match, or if A==B they will have 7225 // been converted to zext(mul(sext(A), sext(A))) as it is known positive, 7226 // which is equally fine. 7227 bool IsUnsigned = isa<ZExtInst>(Op0); 7228 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy); 7229 auto *MulType = VectorType::get(Op0->getType(), VectorTy); 7230 7231 InstructionCost ExtCost = 7232 TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType, 7233 TTI::CastContextHint::None, CostKind, Op0); 7234 InstructionCost MulCost = 7235 TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind); 7236 InstructionCost Ext2Cost = 7237 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType, 7238 TTI::CastContextHint::None, CostKind, RedOp); 7239 7240 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 7241 /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 7242 CostKind); 7243 7244 if (RedCost.isValid() && 7245 RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost) 7246 return I == RetI ? 
RedCost : 0; 7247 } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) && 7248 !TheLoop->isLoopInvariant(RedOp)) { 7249 // Matched reduce(ext(A)) 7250 bool IsUnsigned = isa<ZExtInst>(RedOp); 7251 auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy); 7252 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 7253 /*IsMLA=*/false, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 7254 CostKind); 7255 7256 InstructionCost ExtCost = 7257 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType, 7258 TTI::CastContextHint::None, CostKind, RedOp); 7259 if (RedCost.isValid() && RedCost < BaseCost + ExtCost) 7260 return I == RetI ? RedCost : 0; 7261 } else if (RedOp && 7262 match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) { 7263 if (match(Op0, m_ZExtOrSExt(m_Value())) && 7264 Op0->getOpcode() == Op1->getOpcode() && 7265 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() && 7266 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) { 7267 bool IsUnsigned = isa<ZExtInst>(Op0); 7268 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy); 7269 // Matched reduce(mul(ext, ext)) 7270 InstructionCost ExtCost = 7271 TTI.getCastInstrCost(Op0->getOpcode(), VectorTy, ExtType, 7272 TTI::CastContextHint::None, CostKind, Op0); 7273 InstructionCost MulCost = 7274 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 7275 7276 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 7277 /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 7278 CostKind); 7279 7280 if (RedCost.isValid() && RedCost < ExtCost * 2 + MulCost + BaseCost) 7281 return I == RetI ? RedCost : 0; 7282 } else if (!match(I, m_ZExtOrSExt(m_Value()))) { 7283 // Matched reduce(mul()) 7284 InstructionCost MulCost = 7285 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 7286 7287 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 7288 /*IsMLA=*/true, true, RdxDesc.getRecurrenceType(), VectorTy, 7289 CostKind); 7290 7291 if (RedCost.isValid() && RedCost < MulCost + BaseCost) 7292 return I == RetI ? RedCost : 0; 7293 } 7294 } 7295 7296 return I == RetI ? Optional<InstructionCost>(BaseCost) : None; 7297 } 7298 7299 InstructionCost 7300 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, 7301 ElementCount VF) { 7302 // Calculate scalar cost only. Vectorization cost should be ready at this 7303 // moment. 7304 if (VF.isScalar()) { 7305 Type *ValTy = getLoadStoreType(I); 7306 const Align Alignment = getLoadStoreAlignment(I); 7307 unsigned AS = getLoadStoreAddressSpace(I); 7308 7309 return TTI.getAddressComputationCost(ValTy) + 7310 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, 7311 TTI::TCK_RecipThroughput, I); 7312 } 7313 return getWideningCost(I, VF); 7314 } 7315 7316 LoopVectorizationCostModel::VectorizationCostTy 7317 LoopVectorizationCostModel::getInstructionCost(Instruction *I, 7318 ElementCount VF) { 7319 // If we know that this instruction will remain uniform, check the cost of 7320 // the scalar version. 7321 if (isUniformAfterVectorization(I, VF)) 7322 VF = ElementCount::getFixed(1); 7323 7324 if (VF.isVector() && isProfitableToScalarize(I, VF)) 7325 return VectorizationCostTy(InstsToScalarize[VF][I], false); 7326 7327 // Forced scalars do not have any scalarization overhead. 
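  // Their cost below is simply the scalar cost multiplied by VF, with no
  // insert/extract overhead added, since the surrounding code keeps such
  // instructions scalar.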
7328 auto ForcedScalar = ForcedScalars.find(VF); 7329 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) { 7330 auto InstSet = ForcedScalar->second; 7331 if (InstSet.count(I)) 7332 return VectorizationCostTy( 7333 (getInstructionCost(I, ElementCount::getFixed(1)).first * 7334 VF.getKnownMinValue()), 7335 false); 7336 } 7337 7338 Type *VectorTy; 7339 InstructionCost C = getInstructionCost(I, VF, VectorTy); 7340 7341 bool TypeNotScalarized = 7342 VF.isVector() && VectorTy->isVectorTy() && 7343 TTI.getNumberOfParts(VectorTy) < VF.getKnownMinValue(); 7344 return VectorizationCostTy(C, TypeNotScalarized); 7345 } 7346 7347 InstructionCost 7348 LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, 7349 ElementCount VF) const { 7350 7351 // There is no mechanism yet to create a scalable scalarization loop, 7352 // so this is currently Invalid. 7353 if (VF.isScalable()) 7354 return InstructionCost::getInvalid(); 7355 7356 if (VF.isScalar()) 7357 return 0; 7358 7359 InstructionCost Cost = 0; 7360 Type *RetTy = ToVectorTy(I->getType(), VF); 7361 if (!RetTy->isVoidTy() && 7362 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) 7363 Cost += TTI.getScalarizationOverhead( 7364 cast<VectorType>(RetTy), APInt::getAllOnesValue(VF.getKnownMinValue()), 7365 true, false); 7366 7367 // Some targets keep addresses scalar. 7368 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) 7369 return Cost; 7370 7371 // Some targets support efficient element stores. 7372 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore()) 7373 return Cost; 7374 7375 // Collect operands to consider. 7376 CallInst *CI = dyn_cast<CallInst>(I); 7377 Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands(); 7378 7379 // Skip operands that do not require extraction/scalarization and do not incur 7380 // any overhead. 7381 SmallVector<Type *> Tys; 7382 for (auto *V : filterExtractingOperands(Ops, VF)) 7383 Tys.push_back(MaybeVectorizeType(V->getType(), VF)); 7384 return Cost + TTI.getOperandsScalarizationOverhead( 7385 filterExtractingOperands(Ops, VF), Tys); 7386 } 7387 7388 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { 7389 if (VF.isScalar()) 7390 return; 7391 NumPredStores = 0; 7392 for (BasicBlock *BB : TheLoop->blocks()) { 7393 // For each instruction in the old loop. 7394 for (Instruction &I : *BB) { 7395 Value *Ptr = getLoadStorePointerOperand(&I); 7396 if (!Ptr) 7397 continue; 7398 7399 // TODO: We should generate better code and update the cost model for 7400 // predicated uniform stores. Today they are treated as any other 7401 // predicated store (see added test cases in 7402 // invariant-store-vectorization.ll). 7403 if (isa<StoreInst>(&I) && isScalarWithPredication(&I)) 7404 NumPredStores++; 7405 7406 if (Legal->isUniformMemOp(I)) { 7407 // TODO: Avoid replicating loads and stores instead of 7408 // relying on instcombine to remove them. 7409 // Load: Scalar load + broadcast 7410 // Store: Scalar store + isLoopInvariantStoreValue ? 
0 : extract 7411 InstructionCost Cost; 7412 if (isa<StoreInst>(&I) && VF.isScalable() && 7413 isLegalGatherOrScatter(&I)) { 7414 Cost = getGatherScatterCost(&I, VF); 7415 setWideningDecision(&I, VF, CM_GatherScatter, Cost); 7416 } else { 7417 assert((isa<LoadInst>(&I) || !VF.isScalable()) && 7418 "Cannot yet scalarize uniform stores"); 7419 Cost = getUniformMemOpCost(&I, VF); 7420 setWideningDecision(&I, VF, CM_Scalarize, Cost); 7421 } 7422 continue; 7423 } 7424 7425 // We assume that widening is the best solution when possible. 7426 if (memoryInstructionCanBeWidened(&I, VF)) { 7427 InstructionCost Cost = getConsecutiveMemOpCost(&I, VF); 7428 int ConsecutiveStride = 7429 Legal->isConsecutivePtr(getLoadStorePointerOperand(&I)); 7430 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 7431 "Expected consecutive stride."); 7432 InstWidening Decision = 7433 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; 7434 setWideningDecision(&I, VF, Decision, Cost); 7435 continue; 7436 } 7437 7438 // Choose between Interleaving, Gather/Scatter or Scalarization. 7439 InstructionCost InterleaveCost = InstructionCost::getInvalid(); 7440 unsigned NumAccesses = 1; 7441 if (isAccessInterleaved(&I)) { 7442 auto Group = getInterleavedAccessGroup(&I); 7443 assert(Group && "Fail to get an interleaved access group."); 7444 7445 // Make one decision for the whole group. 7446 if (getWideningDecision(&I, VF) != CM_Unknown) 7447 continue; 7448 7449 NumAccesses = Group->getNumMembers(); 7450 if (interleavedAccessCanBeWidened(&I, VF)) 7451 InterleaveCost = getInterleaveGroupCost(&I, VF); 7452 } 7453 7454 InstructionCost GatherScatterCost = 7455 isLegalGatherOrScatter(&I) 7456 ? getGatherScatterCost(&I, VF) * NumAccesses 7457 : InstructionCost::getInvalid(); 7458 7459 InstructionCost ScalarizationCost = 7460 getMemInstScalarizationCost(&I, VF) * NumAccesses; 7461 7462 // Choose better solution for the current VF, 7463 // write down this decision and use it during vectorization. 7464 InstructionCost Cost; 7465 InstWidening Decision; 7466 if (InterleaveCost <= GatherScatterCost && 7467 InterleaveCost < ScalarizationCost) { 7468 Decision = CM_Interleave; 7469 Cost = InterleaveCost; 7470 } else if (GatherScatterCost < ScalarizationCost) { 7471 Decision = CM_GatherScatter; 7472 Cost = GatherScatterCost; 7473 } else { 7474 Decision = CM_Scalarize; 7475 Cost = ScalarizationCost; 7476 } 7477 // If the instructions belongs to an interleave group, the whole group 7478 // receives the same decision. The whole group receives the cost, but 7479 // the cost will actually be assigned to one instruction. 7480 if (auto Group = getInterleavedAccessGroup(&I)) 7481 setWideningDecision(Group, VF, Decision, Cost); 7482 else 7483 setWideningDecision(&I, VF, Decision, Cost); 7484 } 7485 } 7486 7487 // Make sure that any load of address and any other address computation 7488 // remains scalar unless there is gather/scatter support. This avoids 7489 // inevitable extracts into address registers, and also has the benefit of 7490 // activating LSR more, since that pass can't optimize vectorized 7491 // addresses. 7492 if (TTI.prefersVectorizedAddressing()) 7493 return; 7494 7495 // Start with all scalar pointer uses. 
  SmallPtrSet<Instruction *, 8> AddrDefs;
  for (BasicBlock *BB : TheLoop->blocks())
    for (Instruction &I : *BB) {
      Instruction *PtrDef =
          dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
      if (PtrDef && TheLoop->contains(PtrDef) &&
          getWideningDecision(&I, VF) != CM_GatherScatter)
        AddrDefs.insert(PtrDef);
    }

  // Add all instructions used to generate the addresses.
  SmallVector<Instruction *, 4> Worklist;
  append_range(Worklist, AddrDefs);
  while (!Worklist.empty()) {
    Instruction *I = Worklist.pop_back_val();
    for (auto &Op : I->operands())
      if (auto *InstOp = dyn_cast<Instruction>(Op))
        if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
            AddrDefs.insert(InstOp).second)
          Worklist.push_back(InstOp);
  }

  for (auto *I : AddrDefs) {
    if (isa<LoadInst>(I)) {
      // Setting the desired widening decision should ideally be handled by
      // cost functions, but since this involves the task of finding out
      // if the loaded register is involved in an address computation, it is
      // instead changed here when we know this is the case.
      InstWidening Decision = getWideningDecision(I, VF);
      if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
        // Scalarize a widened load of address.
        setWideningDecision(
            I, VF, CM_Scalarize,
            (VF.getKnownMinValue() *
             getMemoryInstructionCost(I, ElementCount::getFixed(1))));
      else if (auto Group = getInterleavedAccessGroup(I)) {
        // Scalarize an interleave group of address loads.
        for (unsigned I = 0; I < Group->getFactor(); ++I) {
          if (Instruction *Member = Group->getMember(I))
            setWideningDecision(
                Member, VF, CM_Scalarize,
                (VF.getKnownMinValue() *
                 getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
        }
      }
    } else
      // Make sure I gets scalarized and a cost estimate without
      // scalarization overhead.
      ForcedScalars[VF].insert(I);
  }
}

InstructionCost
LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
                                               Type *&VectorTy) {
  Type *RetTy = I->getType();
  if (canTruncateToMinimalBitwidth(I, VF))
    RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
  auto SE = PSE.getSE();
  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;

  auto hasSingleCopyAfterVectorization = [this](Instruction *I,
                                                ElementCount VF) -> bool {
    if (VF.isScalar())
      return true;

    auto Scalarized = InstsToScalarize.find(VF);
    assert(Scalarized != InstsToScalarize.end() &&
           "VF not yet analyzed for scalarization profitability");
    return !Scalarized->second.count(I) &&
           llvm::all_of(I->users(), [&](User *U) {
             auto *UI = cast<Instruction>(U);
             return !Scalarized->second.count(UI);
           });
  };
  (void) hasSingleCopyAfterVectorization;

  if (isScalarAfterVectorization(I, VF)) {
    // With the exception of GEPs and PHIs, after scalarization there should
    // only be one copy of the instruction generated in the loop. This is
    // because the VF is either 1, or any instructions that need scalarizing
    // have already been dealt with by the time we get here. As a result,
    // it means we don't have to multiply the instruction cost by VF.
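    // For instance (illustrative only): a scalar induction update such as
    //   %iv.next = add nuw nsw i64 %iv, 1
    // ends up as a single scalar add in the vectorized loop rather than one
    // copy per lane, so its cost is not multiplied by VF here; replicated
    // GEPs and PHIs are the exceptions tolerated by the assert below.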
7579 assert(I->getOpcode() == Instruction::GetElementPtr || 7580 I->getOpcode() == Instruction::PHI || 7581 (I->getOpcode() == Instruction::BitCast && 7582 I->getType()->isPointerTy()) || 7583 hasSingleCopyAfterVectorization(I, VF)); 7584 VectorTy = RetTy; 7585 } else 7586 VectorTy = ToVectorTy(RetTy, VF); 7587 7588 // TODO: We need to estimate the cost of intrinsic calls. 7589 switch (I->getOpcode()) { 7590 case Instruction::GetElementPtr: 7591 // We mark this instruction as zero-cost because the cost of GEPs in 7592 // vectorized code depends on whether the corresponding memory instruction 7593 // is scalarized or not. Therefore, we handle GEPs with the memory 7594 // instruction cost. 7595 return 0; 7596 case Instruction::Br: { 7597 // In cases of scalarized and predicated instructions, there will be VF 7598 // predicated blocks in the vectorized loop. Each branch around these 7599 // blocks requires also an extract of its vector compare i1 element. 7600 bool ScalarPredicatedBB = false; 7601 BranchInst *BI = cast<BranchInst>(I); 7602 if (VF.isVector() && BI->isConditional() && 7603 (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) || 7604 PredicatedBBsAfterVectorization.count(BI->getSuccessor(1)))) 7605 ScalarPredicatedBB = true; 7606 7607 if (ScalarPredicatedBB) { 7608 // Not possible to scalarize scalable vector with predicated instructions. 7609 if (VF.isScalable()) 7610 return InstructionCost::getInvalid(); 7611 // Return cost for branches around scalarized and predicated blocks. 7612 auto *Vec_i1Ty = 7613 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); 7614 return ( 7615 TTI.getScalarizationOverhead( 7616 Vec_i1Ty, APInt::getAllOnesValue(VF.getFixedValue()), false, 7617 true) + 7618 (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue())); 7619 } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar()) 7620 // The back-edge branch will remain, as will all scalar branches. 7621 return TTI.getCFInstrCost(Instruction::Br, CostKind); 7622 else 7623 // This branch will be eliminated by if-conversion. 7624 return 0; 7625 // Note: We currently assume zero cost for an unconditional branch inside 7626 // a predicated block since it will become a fall-through, although we 7627 // may decide in the future to call TTI for all branches. 7628 } 7629 case Instruction::PHI: { 7630 auto *Phi = cast<PHINode>(I); 7631 7632 // First-order recurrences are replaced by vector shuffles inside the loop. 7633 // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. 7634 if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi)) 7635 return TTI.getShuffleCost( 7636 TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy), 7637 None, VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1)); 7638 7639 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 7640 // converted into select instructions. We require N - 1 selects per phi 7641 // node, where N is the number of incoming values. 
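    // For example, a phi merging three if-converted values,
    //   %r = phi i32 [ %a, %bb0 ], [ %b, %bb1 ], [ %c, %bb2 ]
    // becomes two vector selects, so the cost below is (3 - 1) * select cost.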
7642 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) 7643 return (Phi->getNumIncomingValues() - 1) * 7644 TTI.getCmpSelInstrCost( 7645 Instruction::Select, ToVectorTy(Phi->getType(), VF), 7646 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF), 7647 CmpInst::BAD_ICMP_PREDICATE, CostKind); 7648 7649 return TTI.getCFInstrCost(Instruction::PHI, CostKind); 7650 } 7651 case Instruction::UDiv: 7652 case Instruction::SDiv: 7653 case Instruction::URem: 7654 case Instruction::SRem: 7655 // If we have a predicated instruction, it may not be executed for each 7656 // vector lane. Get the scalarization cost and scale this amount by the 7657 // probability of executing the predicated block. If the instruction is not 7658 // predicated, we fall through to the next case. 7659 if (VF.isVector() && isScalarWithPredication(I)) { 7660 InstructionCost Cost = 0; 7661 7662 // These instructions have a non-void type, so account for the phi nodes 7663 // that we will create. This cost is likely to be zero. The phi node 7664 // cost, if any, should be scaled by the block probability because it 7665 // models a copy at the end of each predicated block. 7666 Cost += VF.getKnownMinValue() * 7667 TTI.getCFInstrCost(Instruction::PHI, CostKind); 7668 7669 // The cost of the non-predicated instruction. 7670 Cost += VF.getKnownMinValue() * 7671 TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind); 7672 7673 // The cost of insertelement and extractelement instructions needed for 7674 // scalarization. 7675 Cost += getScalarizationOverhead(I, VF); 7676 7677 // Scale the cost by the probability of executing the predicated blocks. 7678 // This assumes the predicated block for each vector lane is equally 7679 // likely. 7680 return Cost / getReciprocalPredBlockProb(); 7681 } 7682 LLVM_FALLTHROUGH; 7683 case Instruction::Add: 7684 case Instruction::FAdd: 7685 case Instruction::Sub: 7686 case Instruction::FSub: 7687 case Instruction::Mul: 7688 case Instruction::FMul: 7689 case Instruction::FDiv: 7690 case Instruction::FRem: 7691 case Instruction::Shl: 7692 case Instruction::LShr: 7693 case Instruction::AShr: 7694 case Instruction::And: 7695 case Instruction::Or: 7696 case Instruction::Xor: { 7697 // Since we will replace the stride by 1 the multiplication should go away. 7698 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) 7699 return 0; 7700 7701 // Detect reduction patterns 7702 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7703 return *RedCost; 7704 7705 // Certain instructions can be cheaper to vectorize if they have a constant 7706 // second vector operand. One example of this are shifts on x86. 
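    // For example, a shift whose amount is the same for every lane (a constant
    // or otherwise loop-invariant value) can often use a cheaper immediate or
    // scalar-amount form on x86; that is why the operand kind is refined to
    // OK_UniformValue below when Legal->isUniform(Op2) holds.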
7707 Value *Op2 = I->getOperand(1); 7708 TargetTransformInfo::OperandValueProperties Op2VP; 7709 TargetTransformInfo::OperandValueKind Op2VK = 7710 TTI.getOperandInfo(Op2, Op2VP); 7711 if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) 7712 Op2VK = TargetTransformInfo::OK_UniformValue; 7713 7714 SmallVector<const Value *, 4> Operands(I->operand_values()); 7715 return TTI.getArithmeticInstrCost( 7716 I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, 7717 Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); 7718 } 7719 case Instruction::FNeg: { 7720 return TTI.getArithmeticInstrCost( 7721 I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, 7722 TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None, 7723 TargetTransformInfo::OP_None, I->getOperand(0), I); 7724 } 7725 case Instruction::Select: { 7726 SelectInst *SI = cast<SelectInst>(I); 7727 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 7728 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 7729 7730 const Value *Op0, *Op1; 7731 using namespace llvm::PatternMatch; 7732 if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) || 7733 match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) { 7734 // select x, y, false --> x & y 7735 // select x, true, y --> x | y 7736 TTI::OperandValueProperties Op1VP = TTI::OP_None; 7737 TTI::OperandValueProperties Op2VP = TTI::OP_None; 7738 TTI::OperandValueKind Op1VK = TTI::getOperandInfo(Op0, Op1VP); 7739 TTI::OperandValueKind Op2VK = TTI::getOperandInfo(Op1, Op2VP); 7740 assert(Op0->getType()->getScalarSizeInBits() == 1 && 7741 Op1->getType()->getScalarSizeInBits() == 1); 7742 7743 SmallVector<const Value *, 2> Operands{Op0, Op1}; 7744 return TTI.getArithmeticInstrCost( 7745 match(I, m_LogicalOr()) ? 
Instruction::Or : Instruction::And, VectorTy, 7746 CostKind, Op1VK, Op2VK, Op1VP, Op2VP, Operands, I); 7747 } 7748 7749 Type *CondTy = SI->getCondition()->getType(); 7750 if (!ScalarCond) 7751 CondTy = VectorType::get(CondTy, VF); 7752 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, 7753 CmpInst::BAD_ICMP_PREDICATE, CostKind, I); 7754 } 7755 case Instruction::ICmp: 7756 case Instruction::FCmp: { 7757 Type *ValTy = I->getOperand(0)->getType(); 7758 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 7759 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 7760 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); 7761 VectorTy = ToVectorTy(ValTy, VF); 7762 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, 7763 CmpInst::BAD_ICMP_PREDICATE, CostKind, I); 7764 } 7765 case Instruction::Store: 7766 case Instruction::Load: { 7767 ElementCount Width = VF; 7768 if (Width.isVector()) { 7769 InstWidening Decision = getWideningDecision(I, Width); 7770 assert(Decision != CM_Unknown && 7771 "CM decision should be taken at this point"); 7772 if (Decision == CM_Scalarize) 7773 Width = ElementCount::getFixed(1); 7774 } 7775 VectorTy = ToVectorTy(getLoadStoreType(I), Width); 7776 return getMemoryInstructionCost(I, VF); 7777 } 7778 case Instruction::BitCast: 7779 if (I->getType()->isPointerTy()) 7780 return 0; 7781 LLVM_FALLTHROUGH; 7782 case Instruction::ZExt: 7783 case Instruction::SExt: 7784 case Instruction::FPToUI: 7785 case Instruction::FPToSI: 7786 case Instruction::FPExt: 7787 case Instruction::PtrToInt: 7788 case Instruction::IntToPtr: 7789 case Instruction::SIToFP: 7790 case Instruction::UIToFP: 7791 case Instruction::Trunc: 7792 case Instruction::FPTrunc: { 7793 // Computes the CastContextHint from a Load/Store instruction. 7794 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint { 7795 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 7796 "Expected a load or a store!"); 7797 7798 if (VF.isScalar() || !TheLoop->contains(I)) 7799 return TTI::CastContextHint::Normal; 7800 7801 switch (getWideningDecision(I, VF)) { 7802 case LoopVectorizationCostModel::CM_GatherScatter: 7803 return TTI::CastContextHint::GatherScatter; 7804 case LoopVectorizationCostModel::CM_Interleave: 7805 return TTI::CastContextHint::Interleave; 7806 case LoopVectorizationCostModel::CM_Scalarize: 7807 case LoopVectorizationCostModel::CM_Widen: 7808 return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked 7809 : TTI::CastContextHint::Normal; 7810 case LoopVectorizationCostModel::CM_Widen_Reverse: 7811 return TTI::CastContextHint::Reversed; 7812 case LoopVectorizationCostModel::CM_Unknown: 7813 llvm_unreachable("Instr did not go through cost modelling?"); 7814 } 7815 7816 llvm_unreachable("Unhandled case!"); 7817 }; 7818 7819 unsigned Opcode = I->getOpcode(); 7820 TTI::CastContextHint CCH = TTI::CastContextHint::None; 7821 // For Trunc, the context is the only user, which must be a StoreInst. 7822 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) { 7823 if (I->hasOneUse()) 7824 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin())) 7825 CCH = ComputeCCH(Store); 7826 } 7827 // For Z/Sext, the context is the operand, which must be a LoadInst. 
7828 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || 7829 Opcode == Instruction::FPExt) { 7830 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0))) 7831 CCH = ComputeCCH(Load); 7832 } 7833 7834 // We optimize the truncation of induction variables having constant 7835 // integer steps. The cost of these truncations is the same as the scalar 7836 // operation. 7837 if (isOptimizableIVTruncate(I, VF)) { 7838 auto *Trunc = cast<TruncInst>(I); 7839 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 7840 Trunc->getSrcTy(), CCH, CostKind, Trunc); 7841 } 7842 7843 // Detect reduction patterns 7844 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7845 return *RedCost; 7846 7847 Type *SrcScalarTy = I->getOperand(0)->getType(); 7848 Type *SrcVecTy = 7849 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; 7850 if (canTruncateToMinimalBitwidth(I, VF)) { 7851 // This cast is going to be shrunk. This may remove the cast or it might 7852 // turn it into slightly different cast. For example, if MinBW == 16, 7853 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". 7854 // 7855 // Calculate the modified src and dest types. 7856 Type *MinVecTy = VectorTy; 7857 if (Opcode == Instruction::Trunc) { 7858 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); 7859 VectorTy = 7860 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7861 } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) { 7862 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); 7863 VectorTy = 7864 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7865 } 7866 } 7867 7868 return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); 7869 } 7870 case Instruction::Call: { 7871 bool NeedToScalarize; 7872 CallInst *CI = cast<CallInst>(I); 7873 InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize); 7874 if (getVectorIntrinsicIDForCall(CI, TLI)) { 7875 InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF); 7876 return std::min(CallCost, IntrinsicCost); 7877 } 7878 return CallCost; 7879 } 7880 case Instruction::ExtractValue: 7881 return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput); 7882 case Instruction::Alloca: 7883 // We cannot easily widen alloca to a scalable alloca, as 7884 // the result would need to be a vector of pointers. 7885 if (VF.isScalable()) 7886 return InstructionCost::getInvalid(); 7887 LLVM_FALLTHROUGH; 7888 default: 7889 // This opcode is unknown. Assume that it is the same as 'mul'. 7890 return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 7891 } // end of switch. 
7892 } 7893 7894 char LoopVectorize::ID = 0; 7895 7896 static const char lv_name[] = "Loop Vectorization"; 7897 7898 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 7899 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 7900 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) 7901 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 7902 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) 7903 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 7904 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 7905 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 7906 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 7907 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 7908 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) 7909 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) 7910 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 7911 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 7912 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy) 7913 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 7914 7915 namespace llvm { 7916 7917 Pass *createLoopVectorizePass() { return new LoopVectorize(); } 7918 7919 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced, 7920 bool VectorizeOnlyWhenForced) { 7921 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced); 7922 } 7923 7924 } // end namespace llvm 7925 7926 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { 7927 // Check if the pointer operand of a load or store instruction is 7928 // consecutive. 7929 if (auto *Ptr = getLoadStorePointerOperand(Inst)) 7930 return Legal->isConsecutivePtr(Ptr); 7931 return false; 7932 } 7933 7934 void LoopVectorizationCostModel::collectValuesToIgnore() { 7935 // Ignore ephemeral values. 7936 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); 7937 7938 // Ignore type-promoting instructions we identified during reduction 7939 // detection. 7940 for (auto &Reduction : Legal->getReductionVars()) { 7941 RecurrenceDescriptor &RedDes = Reduction.second; 7942 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); 7943 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7944 } 7945 // Ignore type-casting instructions we identified during induction 7946 // detection. 7947 for (auto &Induction : Legal->getInductionVars()) { 7948 InductionDescriptor &IndDes = Induction.second; 7949 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 7950 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7951 } 7952 } 7953 7954 void LoopVectorizationCostModel::collectInLoopReductions() { 7955 for (auto &Reduction : Legal->getReductionVars()) { 7956 PHINode *Phi = Reduction.first; 7957 RecurrenceDescriptor &RdxDesc = Reduction.second; 7958 7959 // We don't collect reductions that are type promoted (yet). 7960 if (RdxDesc.getRecurrenceType() != Phi->getType()) 7961 continue; 7962 7963 // If the target would prefer this reduction to happen "in-loop", then we 7964 // want to record it as such. 7965 unsigned Opcode = RdxDesc.getOpcode(); 7966 if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) && 7967 !TTI.preferInLoopReduction(Opcode, Phi->getType(), 7968 TargetTransformInfo::ReductionFlags())) 7969 continue; 7970 7971 // Check that we can correctly put the reductions into the loop, by 7972 // finding the chain of operations that leads from the phi to the loop 7973 // exit value. 
7974 SmallVector<Instruction *, 4> ReductionOperations = 7975 RdxDesc.getReductionOpChain(Phi, TheLoop); 7976 bool InLoop = !ReductionOperations.empty(); 7977 if (InLoop) { 7978 InLoopReductionChains[Phi] = ReductionOperations; 7979 // Add the elements to InLoopReductionImmediateChains for cost modelling. 7980 Instruction *LastChain = Phi; 7981 for (auto *I : ReductionOperations) { 7982 InLoopReductionImmediateChains[I] = LastChain; 7983 LastChain = I; 7984 } 7985 } 7986 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop") 7987 << " reduction for phi: " << *Phi << "\n"); 7988 } 7989 } 7990 7991 // TODO: we could return a pair of values that specify the max VF and 7992 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of 7993 // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment 7994 // doesn't have a cost model that can choose which plan to execute if 7995 // more than one is generated. 7996 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits, 7997 LoopVectorizationCostModel &CM) { 7998 unsigned WidestType; 7999 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes(); 8000 return WidestVectorRegBits / WidestType; 8001 } 8002 8003 VectorizationFactor 8004 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { 8005 assert(!UserVF.isScalable() && "scalable vectors not yet supported"); 8006 ElementCount VF = UserVF; 8007 // Outer loop handling: They may require CFG and instruction level 8008 // transformations before even evaluating whether vectorization is profitable. 8009 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 8010 // the vectorization pipeline. 8011 if (!OrigLoop->isInnermost()) { 8012 // If the user doesn't provide a vectorization factor, determine a 8013 // reasonable one. 8014 if (UserVF.isZero()) { 8015 VF = ElementCount::getFixed(determineVPlanVF( 8016 TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector) 8017 .getFixedSize(), 8018 CM)); 8019 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n"); 8020 8021 // Make sure we have a VF > 1 for stress testing. 8022 if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) { 8023 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: " 8024 << "overriding computed VF.\n"); 8025 VF = ElementCount::getFixed(4); 8026 } 8027 } 8028 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 8029 assert(isPowerOf2_32(VF.getKnownMinValue()) && 8030 "VF needs to be a power of two"); 8031 LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "") 8032 << "VF " << VF << " to build VPlans.\n"); 8033 buildVPlans(VF, VF); 8034 8035 // For VPlan build stress testing, we bail out after VPlan construction. 8036 if (VPlanBuildStressTest) 8037 return VectorizationFactor::Disabled(); 8038 8039 return {VF, 0 /*Cost*/}; 8040 } 8041 8042 LLVM_DEBUG( 8043 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the " 8044 "VPlan-native path.\n"); 8045 return VectorizationFactor::Disabled(); 8046 } 8047 8048 Optional<VectorizationFactor> 8049 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { 8050 assert(OrigLoop->isInnermost() && "Inner loop expected."); 8051 FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC); 8052 if (!MaxFactors) // Cases that should not to be vectorized nor interleaved. 8053 return None; 8054 8055 // Invalidate interleave groups if all blocks of loop will be predicated. 
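  // When the tail is folded by masking, every member of an interleave group
  // would need a masked wide load/store; without target support for masked
  // interleaved accesses (useMaskedInterleavedAccesses) the groups cannot be
  // kept, so they are invalidated below.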
8056 if (CM.blockNeedsPredication(OrigLoop->getHeader()) && 8057 !useMaskedInterleavedAccesses(*TTI)) { 8058 LLVM_DEBUG( 8059 dbgs() 8060 << "LV: Invalidate all interleaved groups due to fold-tail by masking " 8061 "which requires masked-interleaved support.\n"); 8062 if (CM.InterleaveInfo.invalidateGroups()) 8063 // Invalidating interleave groups also requires invalidating all decisions 8064 // based on them, which includes widening decisions and uniform and scalar 8065 // values. 8066 CM.invalidateCostModelingDecisions(); 8067 } 8068 8069 ElementCount MaxUserVF = 8070 UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF; 8071 bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF); 8072 if (!UserVF.isZero() && UserVFIsLegal) { 8073 assert(isPowerOf2_32(UserVF.getKnownMinValue()) && 8074 "VF needs to be a power of two"); 8075 // Collect the instructions (and their associated costs) that will be more 8076 // profitable to scalarize. 8077 if (CM.selectUserVectorizationFactor(UserVF)) { 8078 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); 8079 CM.collectInLoopReductions(); 8080 buildVPlansWithVPRecipes(UserVF, UserVF); 8081 LLVM_DEBUG(printPlans(dbgs())); 8082 return {{UserVF, 0}}; 8083 } else 8084 reportVectorizationInfo("UserVF ignored because of invalid costs.", 8085 "InvalidCost", ORE, OrigLoop); 8086 } 8087 8088 // Populate the set of Vectorization Factor Candidates. 8089 ElementCountSet VFCandidates; 8090 for (auto VF = ElementCount::getFixed(1); 8091 ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2) 8092 VFCandidates.insert(VF); 8093 for (auto VF = ElementCount::getScalable(1); 8094 ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2) 8095 VFCandidates.insert(VF); 8096 8097 for (const auto &VF : VFCandidates) { 8098 // Collect Uniform and Scalar instructions after vectorization with VF. 8099 CM.collectUniformsAndScalars(VF); 8100 8101 // Collect the instructions (and their associated costs) that will be more 8102 // profitable to scalarize. 8103 if (VF.isVector()) 8104 CM.collectInstsToScalarize(VF); 8105 } 8106 8107 CM.collectInLoopReductions(); 8108 buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF); 8109 buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF); 8110 8111 LLVM_DEBUG(printPlans(dbgs())); 8112 if (!MaxFactors.hasVector()) 8113 return VectorizationFactor::Disabled(); 8114 8115 // Select the optimal vectorization factor. 8116 auto SelectedVF = CM.selectVectorizationFactor(VFCandidates); 8117 8118 // Check if it is profitable to vectorize with runtime checks. 
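  // Roughly, the heuristic below compares the number of required runtime
  // pointer checks against RuntimeMemoryCheckThreshold (or the pragma-specific
  // threshold); if the limit is exceeded and reordering is not explicitly
  // allowed, the checks are deemed too expensive and vectorization is
  // abandoned.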
8119 unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks(); 8120 if (SelectedVF.Width.getKnownMinValue() > 1 && NumRuntimePointerChecks) { 8121 bool PragmaThresholdReached = 8122 NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold; 8123 bool ThresholdReached = 8124 NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold; 8125 if ((ThresholdReached && !Hints.allowReordering()) || 8126 PragmaThresholdReached) { 8127 ORE->emit([&]() { 8128 return OptimizationRemarkAnalysisAliasing( 8129 DEBUG_TYPE, "CantReorderMemOps", OrigLoop->getStartLoc(), 8130 OrigLoop->getHeader()) 8131 << "loop not vectorized: cannot prove it is safe to reorder " 8132 "memory operations"; 8133 }); 8134 LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n"); 8135 Hints.emitRemarkWithHints(); 8136 return VectorizationFactor::Disabled(); 8137 } 8138 } 8139 return SelectedVF; 8140 } 8141 8142 void LoopVectorizationPlanner::setBestPlan(ElementCount VF, unsigned UF) { 8143 LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF 8144 << '\n'); 8145 BestVF = VF; 8146 BestUF = UF; 8147 8148 erase_if(VPlans, [VF](const VPlanPtr &Plan) { 8149 return !Plan->hasVF(VF); 8150 }); 8151 assert(VPlans.size() == 1 && "Best VF has not a single VPlan."); 8152 } 8153 8154 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV, 8155 DominatorTree *DT) { 8156 // Perform the actual loop transformation. 8157 8158 // 1. Create a new empty loop. Unlink the old loop and connect the new one. 8159 assert(BestVF.hasValue() && "Vectorization Factor is missing"); 8160 assert(VPlans.size() == 1 && "Not a single VPlan to execute."); 8161 8162 VPTransformState State{ 8163 *BestVF, BestUF, LI, DT, ILV.Builder, &ILV, VPlans.front().get()}; 8164 State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(); 8165 State.TripCount = ILV.getOrCreateTripCount(nullptr); 8166 State.CanonicalIV = ILV.Induction; 8167 8168 ILV.printDebugTracesAtStart(); 8169 8170 //===------------------------------------------------===// 8171 // 8172 // Notice: any optimization or new instruction that go 8173 // into the code below should also be implemented in 8174 // the cost-model. 8175 // 8176 //===------------------------------------------------===// 8177 8178 // 2. Copy and widen instructions from the old loop into the new loop. 8179 VPlans.front()->execute(&State); 8180 8181 // 3. Fix the vectorized code: take care of header phi's, live-outs, 8182 // predication, updating analyses. 
  ILV.fixVectorizedLoop(State);

  ILV.printDebugTracesAtEnd();
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void LoopVectorizationPlanner::printPlans(raw_ostream &O) {
  for (const auto &Plan : VPlans)
    if (PrintVPlansInDotFormat)
      Plan->printDOT(O);
    else
      Plan->print(O);
}
#endif

void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
    SmallPtrSetImpl<Instruction *> &DeadInstructions) {

  // We create new control-flow for the vectorized loop, so the original exit
  // conditions will be dead after vectorization if they are only used by the
  // terminator.
  SmallVector<BasicBlock*> ExitingBlocks;
  OrigLoop->getExitingBlocks(ExitingBlocks);
  for (auto *BB : ExitingBlocks) {
    auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0));
    if (!Cmp || !Cmp->hasOneUse())
      continue;

    // TODO: we should introduce a getUniqueExitingBlocks on Loop
    if (!DeadInstructions.insert(Cmp).second)
      continue;

    // An operand of the icmp is often a dead trunc, used by IndUpdate.
    // TODO: can recurse through operands in general
    for (Value *Op : Cmp->operands()) {
      if (isa<TruncInst>(Op) && Op->hasOneUse())
        DeadInstructions.insert(cast<Instruction>(Op));
    }
  }

  // We create new "steps" for induction variable updates to which the original
  // induction variables map. An original update instruction will be dead if
  // all its users except the induction variable are dead.
  auto *Latch = OrigLoop->getLoopLatch();
  for (auto &Induction : Legal->getInductionVars()) {
    PHINode *Ind = Induction.first;
    auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));

    // If the tail is to be folded by masking, the primary induction variable,
    // if it exists, isn't dead: it will be used for masking. Don't kill it.
    if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction())
      continue;

    if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
          return U == Ind || DeadInstructions.count(cast<Instruction>(U));
        }))
      DeadInstructions.insert(IndUpdate);

    // We record as "Dead" also the type-casting instructions we had identified
    // during induction analysis. We don't need any handling for them in the
    // vectorized loop because we have proven that, under a proper runtime
    // test guarding the vectorized loop, the value of the phi, and the casted
    // value of the phi, are the same. The last instruction in this casting chain
    // will get its scalar/vector/widened def from the scalar/vector/widened def
    // of the respective phi node. Any other casts in the induction def-use chain
    // have no other uses outside the phi update chain, and will be ignored.
    InductionDescriptor &IndDes = Induction.second;
    const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
    DeadInstructions.insert(Casts.begin(), Casts.end());
  }
}

Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }

Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }

Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step,
                                        Instruction::BinaryOps BinOp) {
  // When unrolling and the VF is 1, we only need to add a simple scalar.
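  // Illustrative example: for an integer IV with Val = %iv, Step = %step and
  // StartIdx = 2 (the third unrolled part), the code below produces
  //   %induction = add i64 %iv, (2 * %step)
  // i.e. Val + StartIdx * Step, instead of building a vector step.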
8262 Type *Ty = Val->getType(); 8263 assert(!Ty->isVectorTy() && "Val must be a scalar"); 8264 8265 if (Ty->isFloatingPointTy()) { 8266 Constant *C = ConstantFP::get(Ty, (double)StartIdx); 8267 8268 // Floating-point operations inherit FMF via the builder's flags. 8269 Value *MulOp = Builder.CreateFMul(C, Step); 8270 return Builder.CreateBinOp(BinOp, Val, MulOp); 8271 } 8272 Constant *C = ConstantInt::get(Ty, StartIdx); 8273 return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction"); 8274 } 8275 8276 static void AddRuntimeUnrollDisableMetaData(Loop *L) { 8277 SmallVector<Metadata *, 4> MDs; 8278 // Reserve first location for self reference to the LoopID metadata node. 8279 MDs.push_back(nullptr); 8280 bool IsUnrollMetadata = false; 8281 MDNode *LoopID = L->getLoopID(); 8282 if (LoopID) { 8283 // First find existing loop unrolling disable metadata. 8284 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { 8285 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); 8286 if (MD) { 8287 const auto *S = dyn_cast<MDString>(MD->getOperand(0)); 8288 IsUnrollMetadata = 8289 S && S->getString().startswith("llvm.loop.unroll.disable"); 8290 } 8291 MDs.push_back(LoopID->getOperand(i)); 8292 } 8293 } 8294 8295 if (!IsUnrollMetadata) { 8296 // Add runtime unroll disable metadata. 8297 LLVMContext &Context = L->getHeader()->getContext(); 8298 SmallVector<Metadata *, 1> DisableOperands; 8299 DisableOperands.push_back( 8300 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 8301 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 8302 MDs.push_back(DisableNode); 8303 MDNode *NewLoopID = MDNode::get(Context, MDs); 8304 // Set operand 0 to refer to the loop id itself. 8305 NewLoopID->replaceOperandWith(0, NewLoopID); 8306 L->setLoopID(NewLoopID); 8307 } 8308 } 8309 8310 //===--------------------------------------------------------------------===// 8311 // EpilogueVectorizerMainLoop 8312 //===--------------------------------------------------------------------===// 8313 8314 /// This function is partially responsible for generating the control flow 8315 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 8316 BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { 8317 MDNode *OrigLoopID = OrigLoop->getLoopID(); 8318 Loop *Lp = createVectorLoopSkeleton(""); 8319 8320 // Generate the code to check the minimum iteration count of the vector 8321 // epilogue (see below). 8322 EPI.EpilogueIterationCountCheck = 8323 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, true); 8324 EPI.EpilogueIterationCountCheck->setName("iter.check"); 8325 8326 // Generate the code to check any assumptions that we've made for SCEV 8327 // expressions. 8328 EPI.SCEVSafetyCheck = emitSCEVChecks(Lp, LoopScalarPreHeader); 8329 8330 // Generate the code that checks at runtime if arrays overlap. We put the 8331 // checks into a separate block to make the more common case of few elements 8332 // faster. 8333 EPI.MemSafetyCheck = emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 8334 8335 // Generate the iteration count check for the main loop, *after* the check 8336 // for the epilogue loop, so that the path-length is shorter for the case 8337 // that goes directly through the vector epilogue. The longer-path length for 8338 // the main loop is compensated for, by the gain from vectorizing the larger 8339 // trip count. Note: the branch will get updated later on when we vectorize 8340 // the epilogue. 
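  // Taken together, the two guards pick one of three paths based on the trip
  // count. A simplified standalone model (plain integers, hypothetical names,
  // ignoring the ULE/ULT distinction used when a scalar epilogue is required):
  //
  //   #include <cstdint>
  //   #include <string>
  //   // TC: trip count, EpiStep = EpilogueVF * EpilogueUF,
  //   // MainStep = MainVF * MainUF.
  //   std::string selectPath(uint64_t TC, uint64_t EpiStep, uint64_t MainStep) {
  //     if (TC < EpiStep)                      // iter.check fails
  //       return "scalar loop only";
  //     if (TC < MainStep)                     // main loop iter check fails
  //       return "vector epilogue, then scalar remainder";
  //     return "main vector loop, then vector epilogue, then scalar remainder";
  //   }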
8341 EPI.MainLoopIterationCountCheck = 8342 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false); 8343 8344 // Generate the induction variable. 8345 OldInduction = Legal->getPrimaryInduction(); 8346 Type *IdxTy = Legal->getWidestInductionType(); 8347 Value *StartIdx = ConstantInt::get(IdxTy, 0); 8348 Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF); 8349 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 8350 EPI.VectorTripCount = CountRoundDown; 8351 Induction = 8352 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 8353 getDebugLocFromInstOrOperands(OldInduction)); 8354 8355 // Skip induction resume value creation here because they will be created in 8356 // the second pass. If we created them here, they wouldn't be used anyway, 8357 // because the vplan in the second pass still contains the inductions from the 8358 // original loop. 8359 8360 return completeLoopSkeleton(Lp, OrigLoopID); 8361 } 8362 8363 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { 8364 LLVM_DEBUG({ 8365 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n" 8366 << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue() 8367 << ", Main Loop UF:" << EPI.MainLoopUF 8368 << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue() 8369 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 8370 }); 8371 } 8372 8373 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() { 8374 DEBUG_WITH_TYPE(VerboseDebug, { 8375 dbgs() << "intermediate fn:\n" << *Induction->getFunction() << "\n"; 8376 }); 8377 } 8378 8379 BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck( 8380 Loop *L, BasicBlock *Bypass, bool ForEpilogue) { 8381 assert(L && "Expected valid Loop."); 8382 assert(Bypass && "Expected valid bypass basic block."); 8383 unsigned VFactor = 8384 ForEpilogue ? EPI.EpilogueVF.getKnownMinValue() : VF.getKnownMinValue(); 8385 unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF; 8386 Value *Count = getOrCreateTripCount(L); 8387 // Reuse existing vector loop preheader for TC checks. 8388 // Note that new preheader block is generated for vector loop. 8389 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 8390 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 8391 8392 // Generate code to check if the loop's trip count is less than VF * UF of the 8393 // main vector loop. 8394 auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF : VF) ? 8395 ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 8396 8397 Value *CheckMinIters = Builder.CreateICmp( 8398 P, Count, ConstantInt::get(Count->getType(), VFactor * UFactor), 8399 "min.iters.check"); 8400 8401 if (!ForEpilogue) 8402 TCCheckBlock->setName("vector.main.loop.iter.check"); 8403 8404 // Create new preheader for vector loop. 8405 LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), 8406 DT, LI, nullptr, "vector.ph"); 8407 8408 if (ForEpilogue) { 8409 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 8410 DT->getNode(Bypass)->getIDom()) && 8411 "TC check is expected to dominate Bypass"); 8412 8413 // Update dominator for Bypass & LoopExit. 8414 DT->changeImmediateDominator(Bypass, TCCheckBlock); 8415 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF)) 8416 // For loops with multiple exits, there's no edge from the middle block 8417 // to exit blocks (as the epilogue must run) and thus no need to update 8418 // the immediate dominator of the exit blocks. 
8419 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 8420 8421 LoopBypassBlocks.push_back(TCCheckBlock); 8422 8423 // Save the trip count so we don't have to regenerate it in the 8424 // vec.epilog.iter.check. This is safe to do because the trip count 8425 // generated here dominates the vector epilog iter check. 8426 EPI.TripCount = Count; 8427 } 8428 8429 ReplaceInstWithInst( 8430 TCCheckBlock->getTerminator(), 8431 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 8432 8433 return TCCheckBlock; 8434 } 8435 8436 //===--------------------------------------------------------------------===// 8437 // EpilogueVectorizerEpilogueLoop 8438 //===--------------------------------------------------------------------===// 8439 8440 /// This function is partially responsible for generating the control flow 8441 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 8442 BasicBlock * 8443 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { 8444 MDNode *OrigLoopID = OrigLoop->getLoopID(); 8445 Loop *Lp = createVectorLoopSkeleton("vec.epilog."); 8446 8447 // Now, compare the remaining count and if there aren't enough iterations to 8448 // execute the vectorized epilogue skip to the scalar part. 8449 BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader; 8450 VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check"); 8451 LoopVectorPreHeader = 8452 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 8453 LI, nullptr, "vec.epilog.ph"); 8454 emitMinimumVectorEpilogueIterCountCheck(Lp, LoopScalarPreHeader, 8455 VecEpilogueIterationCountCheck); 8456 8457 // Adjust the control flow taking the state info from the main loop 8458 // vectorization into account. 8459 assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck && 8460 "expected this to be saved from the previous pass."); 8461 EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith( 8462 VecEpilogueIterationCountCheck, LoopVectorPreHeader); 8463 8464 DT->changeImmediateDominator(LoopVectorPreHeader, 8465 EPI.MainLoopIterationCountCheck); 8466 8467 EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith( 8468 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8469 8470 if (EPI.SCEVSafetyCheck) 8471 EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith( 8472 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8473 if (EPI.MemSafetyCheck) 8474 EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith( 8475 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8476 8477 DT->changeImmediateDominator( 8478 VecEpilogueIterationCountCheck, 8479 VecEpilogueIterationCountCheck->getSinglePredecessor()); 8480 8481 DT->changeImmediateDominator(LoopScalarPreHeader, 8482 EPI.EpilogueIterationCountCheck); 8483 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF)) 8484 // If there is an epilogue which must run, there's no edge from the 8485 // middle block to exit blocks and thus no need to update the immediate 8486 // dominator of the exit blocks. 8487 DT->changeImmediateDominator(LoopExitBlock, 8488 EPI.EpilogueIterationCountCheck); 8489 8490 // Keep track of bypass blocks, as they feed start values to the induction 8491 // phis in the scalar loop preheader. 
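  // In particular, the epilogue's resume induction created below starts either
  // at the main loop's vector trip count (when control reaches the epilogue
  // from the main vector loop) or at 0 (when the main vector loop was
  // bypassed). A minimal standalone model (plain integers, hypothetical name):
  //
  //   #include <cstdint>
  //   uint64_t epilogueResumeIndex(bool CameFromMainVectorLoop,
  //                                uint64_t MainVectorTripCount) {
  //     return CameFromMainVectorLoop ? MainVectorTripCount : 0;
  //   }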
8492 if (EPI.SCEVSafetyCheck) 8493 LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck); 8494 if (EPI.MemSafetyCheck) 8495 LoopBypassBlocks.push_back(EPI.MemSafetyCheck); 8496 LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck); 8497 8498 // Generate a resume induction for the vector epilogue and put it in the 8499 // vector epilogue preheader 8500 Type *IdxTy = Legal->getWidestInductionType(); 8501 PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val", 8502 LoopVectorPreHeader->getFirstNonPHI()); 8503 EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck); 8504 EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0), 8505 EPI.MainLoopIterationCountCheck); 8506 8507 // Generate the induction variable. 8508 OldInduction = Legal->getPrimaryInduction(); 8509 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 8510 Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF); 8511 Value *StartIdx = EPResumeVal; 8512 Induction = 8513 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 8514 getDebugLocFromInstOrOperands(OldInduction)); 8515 8516 // Generate induction resume values. These variables save the new starting 8517 // indexes for the scalar loop. They are used to test if there are any tail 8518 // iterations left once the vector loop has completed. 8519 // Note that when the vectorized epilogue is skipped due to iteration count 8520 // check, then the resume value for the induction variable comes from 8521 // the trip count of the main vector loop, hence passing the AdditionalBypass 8522 // argument. 8523 createInductionResumeValues(Lp, CountRoundDown, 8524 {VecEpilogueIterationCountCheck, 8525 EPI.VectorTripCount} /* AdditionalBypass */); 8526 8527 AddRuntimeUnrollDisableMetaData(Lp); 8528 return completeLoopSkeleton(Lp, OrigLoopID); 8529 } 8530 8531 BasicBlock * 8532 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck( 8533 Loop *L, BasicBlock *Bypass, BasicBlock *Insert) { 8534 8535 assert(EPI.TripCount && 8536 "Expected trip count to have been safed in the first pass."); 8537 assert( 8538 (!isa<Instruction>(EPI.TripCount) || 8539 DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) && 8540 "saved trip count does not dominate insertion point."); 8541 Value *TC = EPI.TripCount; 8542 IRBuilder<> Builder(Insert->getTerminator()); 8543 Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining"); 8544 8545 // Generate code to check if the loop's trip count is less than VF * UF of the 8546 // vector epilogue loop. 8547 auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF) ? 
8548 ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 8549 8550 Value *CheckMinIters = Builder.CreateICmp( 8551 P, Count, 8552 ConstantInt::get(Count->getType(), 8553 EPI.EpilogueVF.getKnownMinValue() * EPI.EpilogueUF), 8554 "min.epilog.iters.check"); 8555 8556 ReplaceInstWithInst( 8557 Insert->getTerminator(), 8558 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 8559 8560 LoopBypassBlocks.push_back(Insert); 8561 return Insert; 8562 } 8563 8564 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() { 8565 LLVM_DEBUG({ 8566 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n" 8567 << "Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue() 8568 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 8569 }); 8570 } 8571 8572 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() { 8573 DEBUG_WITH_TYPE(VerboseDebug, { 8574 dbgs() << "final fn:\n" << *Induction->getFunction() << "\n"; 8575 }); 8576 } 8577 8578 bool LoopVectorizationPlanner::getDecisionAndClampRange( 8579 const std::function<bool(ElementCount)> &Predicate, VFRange &Range) { 8580 assert(!Range.isEmpty() && "Trying to test an empty VF range."); 8581 bool PredicateAtRangeStart = Predicate(Range.Start); 8582 8583 for (ElementCount TmpVF = Range.Start * 2; 8584 ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2) 8585 if (Predicate(TmpVF) != PredicateAtRangeStart) { 8586 Range.End = TmpVF; 8587 break; 8588 } 8589 8590 return PredicateAtRangeStart; 8591 } 8592 8593 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, 8594 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range 8595 /// of VF's starting at a given VF and extending it as much as possible. Each 8596 /// vectorization decision can potentially shorten this sub-range during 8597 /// buildVPlan(). 8598 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF, 8599 ElementCount MaxVF) { 8600 auto MaxVFPlusOne = MaxVF.getWithIncrement(1); 8601 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { 8602 VFRange SubRange = {VF, MaxVFPlusOne}; 8603 VPlans.push_back(buildVPlan(SubRange)); 8604 VF = SubRange.End; 8605 } 8606 } 8607 8608 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, 8609 VPlanPtr &Plan) { 8610 assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); 8611 8612 // Look for cached value. 8613 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 8614 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); 8615 if (ECEntryIt != EdgeMaskCache.end()) 8616 return ECEntryIt->second; 8617 8618 VPValue *SrcMask = createBlockInMask(Src, Plan); 8619 8620 // The terminator has to be a branch inst! 8621 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 8622 assert(BI && "Unexpected terminator found"); 8623 8624 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) 8625 return EdgeMaskCache[Edge] = SrcMask; 8626 8627 // If source is an exiting block, we know the exit edge is dynamically dead 8628 // in the vector loop, and thus we don't need to restrict the mask. Avoid 8629 // adding uses of an otherwise potentially dead instruction. 
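  // When the edge mask does have to be combined with the block-in mask further
  // below, the conjunction SrcMask && EdgeMask is built as a select rather than
  // an 'and', so a poison EdgeMask cannot leak through a false SrcMask. A
  // simplified standalone three-valued model of that difference (hypothetical
  // types; not LLVM's actual poison machinery):
  //
  //   enum class MaskBit { False, True, Poison };
  //   // 'and' propagates poison even when the other operand is false ...
  //   MaskBit andBits(MaskBit A, MaskBit B) {
  //     if (A == MaskBit::Poison || B == MaskBit::Poison)
  //       return MaskBit::Poison;
  //     return (A == MaskBit::True && B == MaskBit::True) ? MaskBit::True
  //                                                       : MaskBit::False;
  //   }
  //   // ... whereas select only propagates poison from the arm it picks.
  //   MaskBit selectBits(MaskBit Cond, MaskBit TrueVal, MaskBit FalseVal) {
  //     if (Cond == MaskBit::Poison)
  //       return MaskBit::Poison;
  //     return Cond == MaskBit::True ? TrueVal : FalseVal;
  //   }
  //
  // selectBits(False, Poison, False) == False, while andBits(False, Poison) is
  // Poison, which is why the select form is preferred for the mask.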
8630 if (OrigLoop->isLoopExiting(Src)) 8631 return EdgeMaskCache[Edge] = SrcMask; 8632 8633 VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition()); 8634 assert(EdgeMask && "No Edge Mask found for condition"); 8635 8636 if (BI->getSuccessor(0) != Dst) 8637 EdgeMask = Builder.createNot(EdgeMask); 8638 8639 if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND. 8640 // The condition is 'SrcMask && EdgeMask', which is equivalent to 8641 // 'select i1 SrcMask, i1 EdgeMask, i1 false'. 8642 // The select version does not introduce new UB if SrcMask is false and 8643 // EdgeMask is poison. Using 'and' here introduces undefined behavior. 8644 VPValue *False = Plan->getOrAddVPValue( 8645 ConstantInt::getFalse(BI->getCondition()->getType())); 8646 EdgeMask = Builder.createSelect(SrcMask, EdgeMask, False); 8647 } 8648 8649 return EdgeMaskCache[Edge] = EdgeMask; 8650 } 8651 8652 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { 8653 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 8654 8655 // Look for cached value. 8656 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); 8657 if (BCEntryIt != BlockMaskCache.end()) 8658 return BCEntryIt->second; 8659 8660 // All-one mask is modelled as no-mask following the convention for masked 8661 // load/store/gather/scatter. Initialize BlockMask to no-mask. 8662 VPValue *BlockMask = nullptr; 8663 8664 if (OrigLoop->getHeader() == BB) { 8665 if (!CM.blockNeedsPredication(BB)) 8666 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. 8667 8668 // Create the block in mask as the first non-phi instruction in the block. 8669 VPBuilder::InsertPointGuard Guard(Builder); 8670 auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi(); 8671 Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint); 8672 8673 // Introduce the early-exit compare IV <= BTC to form header block mask. 8674 // This is used instead of IV < TC because TC may wrap, unlike BTC. 8675 // Start by constructing the desired canonical IV. 8676 VPValue *IV = nullptr; 8677 if (Legal->getPrimaryInduction()) 8678 IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction()); 8679 else { 8680 auto IVRecipe = new VPWidenCanonicalIVRecipe(); 8681 Builder.getInsertBlock()->insert(IVRecipe, NewInsertionPoint); 8682 IV = IVRecipe->getVPSingleValue(); 8683 } 8684 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); 8685 bool TailFolded = !CM.isScalarEpilogueAllowed(); 8686 8687 if (TailFolded && CM.TTI.emitGetActiveLaneMask()) { 8688 // While ActiveLaneMask is a binary op that consumes the loop tripcount 8689 // as a second argument, we only pass the IV here and extract the 8690 // tripcount from the transform state where codegen of the VP instructions 8691 // happen. 8692 BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV}); 8693 } else { 8694 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); 8695 } 8696 return BlockMaskCache[BB] = BlockMask; 8697 } 8698 8699 // This is the block mask. We OR all incoming edges. 8700 for (auto *Predecessor : predecessors(BB)) { 8701 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); 8702 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too. 8703 return BlockMaskCache[BB] = EdgeMask; 8704 8705 if (!BlockMask) { // BlockMask has its initialized nullptr value. 
8706 BlockMask = EdgeMask; 8707 continue; 8708 } 8709 8710 BlockMask = Builder.createOr(BlockMask, EdgeMask); 8711 } 8712 8713 return BlockMaskCache[BB] = BlockMask; 8714 } 8715 8716 VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, 8717 ArrayRef<VPValue *> Operands, 8718 VFRange &Range, 8719 VPlanPtr &Plan) { 8720 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 8721 "Must be called with either a load or store"); 8722 8723 auto willWiden = [&](ElementCount VF) -> bool { 8724 if (VF.isScalar()) 8725 return false; 8726 LoopVectorizationCostModel::InstWidening Decision = 8727 CM.getWideningDecision(I, VF); 8728 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 8729 "CM decision should be taken at this point."); 8730 if (Decision == LoopVectorizationCostModel::CM_Interleave) 8731 return true; 8732 if (CM.isScalarAfterVectorization(I, VF) || 8733 CM.isProfitableToScalarize(I, VF)) 8734 return false; 8735 return Decision != LoopVectorizationCostModel::CM_Scalarize; 8736 }; 8737 8738 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8739 return nullptr; 8740 8741 VPValue *Mask = nullptr; 8742 if (Legal->isMaskRequired(I)) 8743 Mask = createBlockInMask(I->getParent(), Plan); 8744 8745 if (LoadInst *Load = dyn_cast<LoadInst>(I)) 8746 return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask); 8747 8748 StoreInst *Store = cast<StoreInst>(I); 8749 return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0], 8750 Mask); 8751 } 8752 8753 VPWidenIntOrFpInductionRecipe * 8754 VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi, 8755 ArrayRef<VPValue *> Operands) const { 8756 // Check if this is an integer or fp induction. If so, build the recipe that 8757 // produces its scalar and vector values. 8758 InductionDescriptor II = Legal->getInductionVars().lookup(Phi); 8759 if (II.getKind() == InductionDescriptor::IK_IntInduction || 8760 II.getKind() == InductionDescriptor::IK_FpInduction) { 8761 assert(II.getStartValue() == 8762 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); 8763 const SmallVectorImpl<Instruction *> &Casts = II.getCastInsts(); 8764 return new VPWidenIntOrFpInductionRecipe( 8765 Phi, Operands[0], Casts.empty() ? nullptr : Casts.front()); 8766 } 8767 8768 return nullptr; 8769 } 8770 8771 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate( 8772 TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range, 8773 VPlan &Plan) const { 8774 // Optimize the special case where the source is a constant integer 8775 // induction variable. Notice that we can only optimize the 'trunc' case 8776 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and 8777 // (c) other casts depend on pointer size. 8778 8779 // Determine whether \p K is a truncation based on an induction variable that 8780 // can be optimized. 
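  // The predicate built below is handed to
  // LoopVectorizationPlanner::getDecisionAndClampRange, which returns the
  // predicate's value at Range.Start and clamps Range.End at the first
  // power-of-two VF where the answer flips. A minimal standalone sketch over
  // plain unsigned VFs (hypothetical names, fixed-width widths only):
  //
  //   #include <functional>
  //   struct SimpleVFRange { unsigned Start; unsigned End; }; // [Start, End)
  //   bool decideAndClamp(const std::function<bool(unsigned)> &Pred,
  //                       SimpleVFRange &Range) {
  //     bool AtStart = Pred(Range.Start);
  //     for (unsigned VF = Range.Start * 2; VF < Range.End; VF *= 2)
  //       if (Pred(VF) != AtStart) {
  //         Range.End = VF; // All VFs left in [Start, End) share one decision.
  //         break;
  //       }
  //     return AtStart;
  //   }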
8781 auto isOptimizableIVTruncate = 8782 [&](Instruction *K) -> std::function<bool(ElementCount)> { 8783 return [=](ElementCount VF) -> bool { 8784 return CM.isOptimizableIVTruncate(K, VF); 8785 }; 8786 }; 8787 8788 if (LoopVectorizationPlanner::getDecisionAndClampRange( 8789 isOptimizableIVTruncate(I), Range)) { 8790 8791 InductionDescriptor II = 8792 Legal->getInductionVars().lookup(cast<PHINode>(I->getOperand(0))); 8793 VPValue *Start = Plan.getOrAddVPValue(II.getStartValue()); 8794 return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)), 8795 Start, nullptr, I); 8796 } 8797 return nullptr; 8798 } 8799 8800 VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi, 8801 ArrayRef<VPValue *> Operands, 8802 VPlanPtr &Plan) { 8803 // If all incoming values are equal, the incoming VPValue can be used directly 8804 // instead of creating a new VPBlendRecipe. 8805 VPValue *FirstIncoming = Operands[0]; 8806 if (all_of(Operands, [FirstIncoming](const VPValue *Inc) { 8807 return FirstIncoming == Inc; 8808 })) { 8809 return Operands[0]; 8810 } 8811 8812 // We know that all PHIs in non-header blocks are converted into selects, so 8813 // we don't have to worry about the insertion order and we can just use the 8814 // builder. At this point we generate the predication tree. There may be 8815 // duplications since this is a simple recursive scan, but future 8816 // optimizations will clean it up. 8817 SmallVector<VPValue *, 2> OperandsWithMask; 8818 unsigned NumIncoming = Phi->getNumIncomingValues(); 8819 8820 for (unsigned In = 0; In < NumIncoming; In++) { 8821 VPValue *EdgeMask = 8822 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan); 8823 assert((EdgeMask || NumIncoming == 1) && 8824 "Multiple predecessors with one having a full mask"); 8825 OperandsWithMask.push_back(Operands[In]); 8826 if (EdgeMask) 8827 OperandsWithMask.push_back(EdgeMask); 8828 } 8829 return toVPRecipeResult(new VPBlendRecipe(Phi, OperandsWithMask)); 8830 } 8831 8832 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, 8833 ArrayRef<VPValue *> Operands, 8834 VFRange &Range) const { 8835 8836 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8837 [this, CI](ElementCount VF) { return CM.isScalarWithPredication(CI); }, 8838 Range); 8839 8840 if (IsPredicated) 8841 return nullptr; 8842 8843 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8844 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || 8845 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect || 8846 ID == Intrinsic::pseudoprobe || 8847 ID == Intrinsic::experimental_noalias_scope_decl)) 8848 return nullptr; 8849 8850 auto willWiden = [&](ElementCount VF) -> bool { 8851 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8852 // The following case may be scalarized depending on the VF. 8853 // The flag shows whether we use Intrinsic or a usual Call for vectorized 8854 // version of the instruction. 8855 // Is it beneficial to perform intrinsic call compared to lib call? 8856 bool NeedToScalarize = false; 8857 InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); 8858 InstructionCost IntrinsicCost = ID ? 
CM.getVectorIntrinsicCost(CI, VF) : 0; 8859 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 8860 return UseVectorIntrinsic || !NeedToScalarize; 8861 }; 8862 8863 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8864 return nullptr; 8865 8866 ArrayRef<VPValue *> Ops = Operands.take_front(CI->getNumArgOperands()); 8867 return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end())); 8868 } 8869 8870 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { 8871 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) && 8872 !isa<StoreInst>(I) && "Instruction should have been handled earlier"); 8873 // Instruction should be widened, unless it is scalar after vectorization, 8874 // scalarization is profitable or it is predicated. 8875 auto WillScalarize = [this, I](ElementCount VF) -> bool { 8876 return CM.isScalarAfterVectorization(I, VF) || 8877 CM.isProfitableToScalarize(I, VF) || CM.isScalarWithPredication(I); 8878 }; 8879 return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize, 8880 Range); 8881 } 8882 8883 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, 8884 ArrayRef<VPValue *> Operands) const { 8885 auto IsVectorizableOpcode = [](unsigned Opcode) { 8886 switch (Opcode) { 8887 case Instruction::Add: 8888 case Instruction::And: 8889 case Instruction::AShr: 8890 case Instruction::BitCast: 8891 case Instruction::FAdd: 8892 case Instruction::FCmp: 8893 case Instruction::FDiv: 8894 case Instruction::FMul: 8895 case Instruction::FNeg: 8896 case Instruction::FPExt: 8897 case Instruction::FPToSI: 8898 case Instruction::FPToUI: 8899 case Instruction::FPTrunc: 8900 case Instruction::FRem: 8901 case Instruction::FSub: 8902 case Instruction::ICmp: 8903 case Instruction::IntToPtr: 8904 case Instruction::LShr: 8905 case Instruction::Mul: 8906 case Instruction::Or: 8907 case Instruction::PtrToInt: 8908 case Instruction::SDiv: 8909 case Instruction::Select: 8910 case Instruction::SExt: 8911 case Instruction::Shl: 8912 case Instruction::SIToFP: 8913 case Instruction::SRem: 8914 case Instruction::Sub: 8915 case Instruction::Trunc: 8916 case Instruction::UDiv: 8917 case Instruction::UIToFP: 8918 case Instruction::URem: 8919 case Instruction::Xor: 8920 case Instruction::ZExt: 8921 return true; 8922 } 8923 return false; 8924 }; 8925 8926 if (!IsVectorizableOpcode(I->getOpcode())) 8927 return nullptr; 8928 8929 // Success: widen this instruction. 
8930 return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end())); 8931 } 8932 8933 void VPRecipeBuilder::fixHeaderPhis() { 8934 BasicBlock *OrigLatch = OrigLoop->getLoopLatch(); 8935 for (VPWidenPHIRecipe *R : PhisToFix) { 8936 auto *PN = cast<PHINode>(R->getUnderlyingValue()); 8937 VPRecipeBase *IncR = 8938 getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch))); 8939 R->addOperand(IncR->getVPSingleValue()); 8940 } 8941 } 8942 8943 VPBasicBlock *VPRecipeBuilder::handleReplication( 8944 Instruction *I, VFRange &Range, VPBasicBlock *VPBB, 8945 VPlanPtr &Plan) { 8946 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( 8947 [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); }, 8948 Range); 8949 8950 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8951 [&](ElementCount VF) { return CM.isPredicatedInst(I); }, Range); 8952 8953 // Even if the instruction is not marked as uniform, there are certain 8954 // intrinsic calls that can be effectively treated as such, so we check for 8955 // them here. Conservatively, we only do this for scalable vectors, since 8956 // for fixed-width VFs we can always fall back on full scalarization. 8957 if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) { 8958 switch (cast<IntrinsicInst>(I)->getIntrinsicID()) { 8959 case Intrinsic::assume: 8960 case Intrinsic::lifetime_start: 8961 case Intrinsic::lifetime_end: 8962 // For scalable vectors if one of the operands is variant then we still 8963 // want to mark as uniform, which will generate one instruction for just 8964 // the first lane of the vector. We can't scalarize the call in the same 8965 // way as for fixed-width vectors because we don't know how many lanes 8966 // there are. 8967 // 8968 // The reasons for doing it this way for scalable vectors are: 8969 // 1. For the assume intrinsic generating the instruction for the first 8970 // lane is still be better than not generating any at all. For 8971 // example, the input may be a splat across all lanes. 8972 // 2. For the lifetime start/end intrinsics the pointer operand only 8973 // does anything useful when the input comes from a stack object, 8974 // which suggests it should always be uniform. For non-stack objects 8975 // the effect is to poison the object, which still allows us to 8976 // remove the call. 8977 IsUniform = true; 8978 break; 8979 default: 8980 break; 8981 } 8982 } 8983 8984 auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()), 8985 IsUniform, IsPredicated); 8986 setRecipe(I, Recipe); 8987 Plan->addVPValue(I, Recipe); 8988 8989 // Find if I uses a predicated instruction. If so, it will use its scalar 8990 // value. Avoid hoisting the insert-element which packs the scalar value into 8991 // a vector value, as that happens iff all users use the vector value. 8992 for (VPValue *Op : Recipe->operands()) { 8993 auto *PredR = dyn_cast_or_null<VPPredInstPHIRecipe>(Op->getDef()); 8994 if (!PredR) 8995 continue; 8996 auto *RepR = 8997 cast_or_null<VPReplicateRecipe>(PredR->getOperand(0)->getDef()); 8998 assert(RepR->isPredicated() && 8999 "expected Replicate recipe to be predicated"); 9000 RepR->setAlsoPack(false); 9001 } 9002 9003 // Finalize the recipe for Instr, first if it is not predicated. 
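  // Whether predicated or not, the replicate recipe created above will later
  // expand into UF x (IsUniform ? 1 : VF) scalar instances of the instruction.
  // A simplified standalone sketch of that expansion (hypothetical callback,
  // fixed-width VF only):
  //
  //   #include <functional>
  //   void forEachReplicatedInstance(
  //       unsigned UF, unsigned VF, bool IsUniform,
  //       const std::function<void(unsigned Part, unsigned Lane)> &Emit) {
  //     unsigned Lanes = IsUniform ? 1 : VF;
  //     for (unsigned Part = 0; Part < UF; ++Part)
  //       for (unsigned Lane = 0; Lane < Lanes; ++Lane)
  //         Emit(Part, Lane); // one scalar copy per (part, lane) instance
  //   }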
9004 if (!IsPredicated) { 9005 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n"); 9006 VPBB->appendRecipe(Recipe); 9007 return VPBB; 9008 } 9009 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); 9010 assert(VPBB->getSuccessors().empty() && 9011 "VPBB has successors when handling predicated replication."); 9012 // Record predicated instructions for above packing optimizations. 9013 VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan); 9014 VPBlockUtils::insertBlockAfter(Region, VPBB); 9015 auto *RegSucc = new VPBasicBlock(); 9016 VPBlockUtils::insertBlockAfter(RegSucc, Region); 9017 return RegSucc; 9018 } 9019 9020 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr, 9021 VPRecipeBase *PredRecipe, 9022 VPlanPtr &Plan) { 9023 // Instructions marked for predication are replicated and placed under an 9024 // if-then construct to prevent side-effects. 9025 9026 // Generate recipes to compute the block mask for this region. 9027 VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan); 9028 9029 // Build the triangular if-then region. 9030 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str(); 9031 assert(Instr->getParent() && "Predicated instruction not in any basic block"); 9032 auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask); 9033 auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); 9034 auto *PHIRecipe = Instr->getType()->isVoidTy() 9035 ? nullptr 9036 : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr)); 9037 if (PHIRecipe) { 9038 Plan->removeVPValueFor(Instr); 9039 Plan->addVPValue(Instr, PHIRecipe); 9040 } 9041 auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); 9042 auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); 9043 VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true); 9044 9045 // Note: first set Entry as region entry and then connect successors starting 9046 // from it in order, to propagate the "parent" of each VPBasicBlock. 9047 VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry); 9048 VPBlockUtils::connectBlocks(Pred, Exit); 9049 9050 return Region; 9051 } 9052 9053 VPRecipeOrVPValueTy 9054 VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, 9055 ArrayRef<VPValue *> Operands, 9056 VFRange &Range, VPlanPtr &Plan) { 9057 // First, check for specific widening recipes that deal with calls, memory 9058 // operations, inductions and Phi nodes. 
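  // The dispatch below tries the specialized recipes in a fixed order before
  // falling back to generic widening or replication. A condensed standalone
  // sketch of that precedence (hypothetical enum and precomputed traits; the
  // real code also lets several of these steps bail out to replication):
  //
  //   enum class RecipeKind { Call, Memory, Blend, Induction, HeaderPhi,
  //                           IVTruncate, Replicate, GEP, Select, GenericWiden };
  //   struct InstrTraits {
  //     bool IsCall, IsLoadOrStore, IsPhi, IsHeaderPhi, IsInductionPhi,
  //         IsOptimizableIVTruncate, ShouldWiden, IsGEP, IsSelect;
  //   };
  //   RecipeKind classify(const InstrTraits &T) {
  //     if (T.IsCall)
  //       return RecipeKind::Call;
  //     if (T.IsLoadOrStore)
  //       return RecipeKind::Memory;
  //     if (T.IsPhi) {
  //       if (!T.IsHeaderPhi)
  //         return RecipeKind::Blend;
  //       return T.IsInductionPhi ? RecipeKind::Induction : RecipeKind::HeaderPhi;
  //     }
  //     if (T.IsOptimizableIVTruncate)
  //       return RecipeKind::IVTruncate;
  //     if (!T.ShouldWiden)
  //       return RecipeKind::Replicate;
  //     if (T.IsGEP)
  //       return RecipeKind::GEP;
  //     if (T.IsSelect)
  //       return RecipeKind::Select;
  //     return RecipeKind::GenericWiden;
  //   }
  //
  // HeaderPhi here stands for the reduction / first-order recurrence / pointer
  // phi recipes created for non-induction header phis.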
9059 if (auto *CI = dyn_cast<CallInst>(Instr)) 9060 return toVPRecipeResult(tryToWidenCall(CI, Operands, Range)); 9061 9062 if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr)) 9063 return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan)); 9064 9065 VPRecipeBase *Recipe; 9066 if (auto Phi = dyn_cast<PHINode>(Instr)) { 9067 if (Phi->getParent() != OrigLoop->getHeader()) 9068 return tryToBlend(Phi, Operands, Plan); 9069 if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands))) 9070 return toVPRecipeResult(Recipe); 9071 9072 VPWidenPHIRecipe *PhiRecipe = nullptr; 9073 if (Legal->isReductionVariable(Phi) || Legal->isFirstOrderRecurrence(Phi)) { 9074 VPValue *StartV = Operands[0]; 9075 if (Legal->isReductionVariable(Phi)) { 9076 RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi]; 9077 assert(RdxDesc.getRecurrenceStartValue() == 9078 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); 9079 PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV, 9080 CM.isInLoopReduction(Phi), 9081 CM.useOrderedReductions(RdxDesc)); 9082 } else { 9083 PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV); 9084 } 9085 9086 // Record the incoming value from the backedge, so we can add the incoming 9087 // value from the backedge after all recipes have been created. 9088 recordRecipeOf(cast<Instruction>( 9089 Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch()))); 9090 PhisToFix.push_back(PhiRecipe); 9091 } else { 9092 // TODO: record start and backedge value for remaining pointer induction 9093 // phis. 9094 assert(Phi->getType()->isPointerTy() && 9095 "only pointer phis should be handled here"); 9096 PhiRecipe = new VPWidenPHIRecipe(Phi); 9097 } 9098 9099 return toVPRecipeResult(PhiRecipe); 9100 } 9101 9102 if (isa<TruncInst>(Instr) && 9103 (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Operands, 9104 Range, *Plan))) 9105 return toVPRecipeResult(Recipe); 9106 9107 if (!shouldWiden(Instr, Range)) 9108 return nullptr; 9109 9110 if (auto GEP = dyn_cast<GetElementPtrInst>(Instr)) 9111 return toVPRecipeResult(new VPWidenGEPRecipe( 9112 GEP, make_range(Operands.begin(), Operands.end()), OrigLoop)); 9113 9114 if (auto *SI = dyn_cast<SelectInst>(Instr)) { 9115 bool InvariantCond = 9116 PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop); 9117 return toVPRecipeResult(new VPWidenSelectRecipe( 9118 *SI, make_range(Operands.begin(), Operands.end()), InvariantCond)); 9119 } 9120 9121 return toVPRecipeResult(tryToWiden(Instr, Operands)); 9122 } 9123 9124 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, 9125 ElementCount MaxVF) { 9126 assert(OrigLoop->isInnermost() && "Inner loop expected."); 9127 9128 // Collect instructions from the original loop that will become trivially dead 9129 // in the vectorized loop. We don't need to vectorize these instructions. For 9130 // example, original induction update instructions can become dead because we 9131 // separately emit induction "steps" when generating code for the new loop. 9132 // Similarly, we create a new latch condition when setting up the structure 9133 // of the new loop, so the old one can become dead. 9134 SmallPtrSet<Instruction *, 4> DeadInstructions; 9135 collectTriviallyDeadInstructions(DeadInstructions); 9136 9137 // Add assume instructions we need to drop to DeadInstructions, to prevent 9138 // them from being added to the VPlan. 9139 // TODO: We only need to drop assumes in blocks that get flattend. 
If the 9140 // control flow is preserved, we should keep them. 9141 auto &ConditionalAssumes = Legal->getConditionalAssumes(); 9142 DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end()); 9143 9144 MapVector<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter(); 9145 // Dead instructions do not need sinking. Remove them from SinkAfter. 9146 for (Instruction *I : DeadInstructions) 9147 SinkAfter.erase(I); 9148 9149 // Cannot sink instructions after dead instructions (there won't be any 9150 // recipes for them). Instead, find the first non-dead previous instruction. 9151 for (auto &P : Legal->getSinkAfter()) { 9152 Instruction *SinkTarget = P.second; 9153 Instruction *FirstInst = &*SinkTarget->getParent()->begin(); 9154 (void)FirstInst; 9155 while (DeadInstructions.contains(SinkTarget)) { 9156 assert( 9157 SinkTarget != FirstInst && 9158 "Must find a live instruction (at least the one feeding the " 9159 "first-order recurrence PHI) before reaching beginning of the block"); 9160 SinkTarget = SinkTarget->getPrevNode(); 9161 assert(SinkTarget != P.first && 9162 "sink source equals target, no sinking required"); 9163 } 9164 P.second = SinkTarget; 9165 } 9166 9167 auto MaxVFPlusOne = MaxVF.getWithIncrement(1); 9168 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { 9169 VFRange SubRange = {VF, MaxVFPlusOne}; 9170 VPlans.push_back( 9171 buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter)); 9172 VF = SubRange.End; 9173 } 9174 } 9175 9176 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( 9177 VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions, 9178 const MapVector<Instruction *, Instruction *> &SinkAfter) { 9179 9180 SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups; 9181 9182 VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder); 9183 9184 // --------------------------------------------------------------------------- 9185 // Pre-construction: record ingredients whose recipes we'll need to further 9186 // process after constructing the initial VPlan. 9187 // --------------------------------------------------------------------------- 9188 9189 // Mark instructions we'll need to sink later and their targets as 9190 // ingredients whose recipe we'll need to record. 9191 for (auto &Entry : SinkAfter) { 9192 RecipeBuilder.recordRecipeOf(Entry.first); 9193 RecipeBuilder.recordRecipeOf(Entry.second); 9194 } 9195 for (auto &Reduction : CM.getInLoopReductionChains()) { 9196 PHINode *Phi = Reduction.first; 9197 RecurKind Kind = Legal->getReductionVars()[Phi].getRecurrenceKind(); 9198 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second; 9199 9200 RecipeBuilder.recordRecipeOf(Phi); 9201 for (auto &R : ReductionOperations) { 9202 RecipeBuilder.recordRecipeOf(R); 9203 // For min/max reducitons, where we have a pair of icmp/select, we also 9204 // need to record the ICmp recipe, so it can be removed later. 9205 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) 9206 RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0))); 9207 } 9208 } 9209 9210 // For each interleave group which is relevant for this (possibly trimmed) 9211 // Range, add it to the set of groups to be later applied to the VPlan and add 9212 // placeholders for its members' Recipes which we'll be replacing with a 9213 // single VPInterleaveRecipe. 
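  // Once a group with interleave factor Factor is widened, lane Lane of the
  // member at index MemberIdx is served by element Lane * Factor + MemberIdx of
  // the single wide access. A minimal standalone sketch of that index mapping
  // (hypothetical helper, assuming the usual strided group layout):
  //
  //   unsigned wideElementIndex(unsigned Lane, unsigned MemberIdx,
  //                             unsigned Factor) {
  //     return Lane * Factor + MemberIdx;
  //   }
  //
  // E.g. with Factor = 2 (even/odd accesses) and VF = 4, member 1 occupies
  // elements 1, 3, 5 and 7 of the wide vector.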
9214 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) { 9215 auto applyIG = [IG, this](ElementCount VF) -> bool { 9216 return (VF.isVector() && // Query is illegal for VF == 1 9217 CM.getWideningDecision(IG->getInsertPos(), VF) == 9218 LoopVectorizationCostModel::CM_Interleave); 9219 }; 9220 if (!getDecisionAndClampRange(applyIG, Range)) 9221 continue; 9222 InterleaveGroups.insert(IG); 9223 for (unsigned i = 0; i < IG->getFactor(); i++) 9224 if (Instruction *Member = IG->getMember(i)) 9225 RecipeBuilder.recordRecipeOf(Member); 9226 }; 9227 9228 // --------------------------------------------------------------------------- 9229 // Build initial VPlan: Scan the body of the loop in a topological order to 9230 // visit each basic block after having visited its predecessor basic blocks. 9231 // --------------------------------------------------------------------------- 9232 9233 // Create a dummy pre-entry VPBasicBlock to start building the VPlan. 9234 auto Plan = std::make_unique<VPlan>(); 9235 VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry"); 9236 Plan->setEntry(VPBB); 9237 9238 // Scan the body of the loop in a topological order to visit each basic block 9239 // after having visited its predecessor basic blocks. 9240 LoopBlocksDFS DFS(OrigLoop); 9241 DFS.perform(LI); 9242 9243 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 9244 // Relevant instructions from basic block BB will be grouped into VPRecipe 9245 // ingredients and fill a new VPBasicBlock. 9246 unsigned VPBBsForBB = 0; 9247 auto *FirstVPBBForBB = new VPBasicBlock(BB->getName()); 9248 VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB); 9249 VPBB = FirstVPBBForBB; 9250 Builder.setInsertPoint(VPBB); 9251 9252 // Introduce each ingredient into VPlan. 9253 // TODO: Model and preserve debug instrinsics in VPlan. 9254 for (Instruction &I : BB->instructionsWithoutDebug()) { 9255 Instruction *Instr = &I; 9256 9257 // First filter out irrelevant instructions, to ensure no recipes are 9258 // built for them. 9259 if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr)) 9260 continue; 9261 9262 SmallVector<VPValue *, 4> Operands; 9263 auto *Phi = dyn_cast<PHINode>(Instr); 9264 if (Phi && Phi->getParent() == OrigLoop->getHeader()) { 9265 Operands.push_back(Plan->getOrAddVPValue( 9266 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()))); 9267 } else { 9268 auto OpRange = Plan->mapToVPValues(Instr->operands()); 9269 Operands = {OpRange.begin(), OpRange.end()}; 9270 } 9271 if (auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe( 9272 Instr, Operands, Range, Plan)) { 9273 // If Instr can be simplified to an existing VPValue, use it. 9274 if (RecipeOrValue.is<VPValue *>()) { 9275 auto *VPV = RecipeOrValue.get<VPValue *>(); 9276 Plan->addVPValue(Instr, VPV); 9277 // If the re-used value is a recipe, register the recipe for the 9278 // instruction, in case the recipe for Instr needs to be recorded. 9279 if (auto *R = dyn_cast_or_null<VPRecipeBase>(VPV->getDef())) 9280 RecipeBuilder.setRecipe(Instr, R); 9281 continue; 9282 } 9283 // Otherwise, add the new recipe. 9284 VPRecipeBase *Recipe = RecipeOrValue.get<VPRecipeBase *>(); 9285 for (auto *Def : Recipe->definedValues()) { 9286 auto *UV = Def->getUnderlyingValue(); 9287 Plan->addVPValue(UV, Def); 9288 } 9289 9290 RecipeBuilder.setRecipe(Instr, Recipe); 9291 VPBB->appendRecipe(Recipe); 9292 continue; 9293 } 9294 9295 // Otherwise, if all widening options failed, Instruction is to be 9296 // replicated. This may create a successor for VPBB. 
9297 VPBasicBlock *NextVPBB = 9298 RecipeBuilder.handleReplication(Instr, Range, VPBB, Plan); 9299 if (NextVPBB != VPBB) { 9300 VPBB = NextVPBB; 9301 VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++) 9302 : ""); 9303 } 9304 } 9305 } 9306 9307 RecipeBuilder.fixHeaderPhis(); 9308 9309 // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks 9310 // may also be empty, such as the last one VPBB, reflecting original 9311 // basic-blocks with no recipes. 9312 VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry()); 9313 assert(PreEntry->empty() && "Expecting empty pre-entry block."); 9314 VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor()); 9315 VPBlockUtils::disconnectBlocks(PreEntry, Entry); 9316 delete PreEntry; 9317 9318 // --------------------------------------------------------------------------- 9319 // Transform initial VPlan: Apply previously taken decisions, in order, to 9320 // bring the VPlan to its final state. 9321 // --------------------------------------------------------------------------- 9322 9323 // Apply Sink-After legal constraints. 9324 auto GetReplicateRegion = [](VPRecipeBase *R) -> VPRegionBlock * { 9325 auto *Region = dyn_cast_or_null<VPRegionBlock>(R->getParent()->getParent()); 9326 if (Region && Region->isReplicator()) { 9327 assert(Region->getNumSuccessors() == 1 && 9328 Region->getNumPredecessors() == 1 && "Expected SESE region!"); 9329 assert(R->getParent()->size() == 1 && 9330 "A recipe in an original replicator region must be the only " 9331 "recipe in its block"); 9332 return Region; 9333 } 9334 return nullptr; 9335 }; 9336 for (auto &Entry : SinkAfter) { 9337 VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first); 9338 VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second); 9339 9340 auto *TargetRegion = GetReplicateRegion(Target); 9341 auto *SinkRegion = GetReplicateRegion(Sink); 9342 if (!SinkRegion) { 9343 // If the sink source is not a replicate region, sink the recipe directly. 9344 if (TargetRegion) { 9345 // The target is in a replication region, make sure to move Sink to 9346 // the block after it, not into the replication region itself. 9347 VPBasicBlock *NextBlock = 9348 cast<VPBasicBlock>(TargetRegion->getSuccessors().front()); 9349 Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi()); 9350 } else 9351 Sink->moveAfter(Target); 9352 continue; 9353 } 9354 9355 // The sink source is in a replicate region. Unhook the region from the CFG. 9356 auto *SinkPred = SinkRegion->getSinglePredecessor(); 9357 auto *SinkSucc = SinkRegion->getSingleSuccessor(); 9358 VPBlockUtils::disconnectBlocks(SinkPred, SinkRegion); 9359 VPBlockUtils::disconnectBlocks(SinkRegion, SinkSucc); 9360 VPBlockUtils::connectBlocks(SinkPred, SinkSucc); 9361 9362 if (TargetRegion) { 9363 // The target recipe is also in a replicate region, move the sink region 9364 // after the target region. 9365 auto *TargetSucc = TargetRegion->getSingleSuccessor(); 9366 VPBlockUtils::disconnectBlocks(TargetRegion, TargetSucc); 9367 VPBlockUtils::connectBlocks(TargetRegion, SinkRegion); 9368 VPBlockUtils::connectBlocks(SinkRegion, TargetSucc); 9369 } else { 9370 // The sink source is in a replicate region, we need to move the whole 9371 // replicate region, which should only contain a single recipe in the 9372 // main block. 
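    // The rewiring below is essentially a splice: the single-entry single-exit
    // sink region is unhooked from its predecessor/successor and reinserted
    // right after the target position. A toy standalone analogy on a singly
    // linked chain of blocks (hypothetical structure, not the VPBlockUtils API):
    //
    //   struct ToyBlock { ToyBlock *Succ = nullptr; };
    //   void spliceAfter(ToyBlock *RegionPred, ToyBlock *Region,
    //                    ToyBlock *InsertAfter) {
    //     RegionPred->Succ = Region->Succ;   // bridge over the unhooked region
    //     Region->Succ = InsertAfter->Succ;  // region now precedes the old succ
    //     InsertAfter->Succ = Region;        // ... and follows the target block
    //   }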
9373 auto *SplitBlock = 9374 Target->getParent()->splitAt(std::next(Target->getIterator())); 9375 9376 auto *SplitPred = SplitBlock->getSinglePredecessor(); 9377 9378 VPBlockUtils::disconnectBlocks(SplitPred, SplitBlock); 9379 VPBlockUtils::connectBlocks(SplitPred, SinkRegion); 9380 VPBlockUtils::connectBlocks(SinkRegion, SplitBlock); 9381 if (VPBB == SplitPred) 9382 VPBB = SplitBlock; 9383 } 9384 } 9385 9386 // Introduce a recipe to combine the incoming and previous values of a 9387 // first-order recurrence. 9388 for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) { 9389 auto *RecurPhi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R); 9390 if (!RecurPhi) 9391 continue; 9392 9393 auto *RecurSplice = cast<VPInstruction>( 9394 Builder.createNaryOp(VPInstruction::FirstOrderRecurrenceSplice, 9395 {RecurPhi, RecurPhi->getBackedgeValue()})); 9396 9397 VPRecipeBase *PrevRecipe = RecurPhi->getBackedgeRecipe(); 9398 if (auto *Region = GetReplicateRegion(PrevRecipe)) { 9399 VPBasicBlock *Succ = cast<VPBasicBlock>(Region->getSingleSuccessor()); 9400 RecurSplice->moveBefore(*Succ, Succ->getFirstNonPhi()); 9401 } else 9402 RecurSplice->moveAfter(PrevRecipe); 9403 RecurPhi->replaceAllUsesWith(RecurSplice); 9404 // Set the first operand of RecurSplice to RecurPhi again, after replacing 9405 // all users. 9406 RecurSplice->setOperand(0, RecurPhi); 9407 } 9408 9409 // Interleave memory: for each Interleave Group we marked earlier as relevant 9410 // for this VPlan, replace the Recipes widening its memory instructions with a 9411 // single VPInterleaveRecipe at its insertion point. 9412 for (auto IG : InterleaveGroups) { 9413 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( 9414 RecipeBuilder.getRecipe(IG->getInsertPos())); 9415 SmallVector<VPValue *, 4> StoredValues; 9416 for (unsigned i = 0; i < IG->getFactor(); ++i) 9417 if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) { 9418 auto *StoreR = 9419 cast<VPWidenMemoryInstructionRecipe>(RecipeBuilder.getRecipe(SI)); 9420 StoredValues.push_back(StoreR->getStoredValue()); 9421 } 9422 9423 auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues, 9424 Recipe->getMask()); 9425 VPIG->insertBefore(Recipe); 9426 unsigned J = 0; 9427 for (unsigned i = 0; i < IG->getFactor(); ++i) 9428 if (Instruction *Member = IG->getMember(i)) { 9429 if (!Member->getType()->isVoidTy()) { 9430 VPValue *OriginalV = Plan->getVPValue(Member); 9431 Plan->removeVPValueFor(Member); 9432 Plan->addVPValue(Member, VPIG->getVPValue(J)); 9433 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J)); 9434 J++; 9435 } 9436 RecipeBuilder.getRecipe(Member)->eraseFromParent(); 9437 } 9438 } 9439 9440 // Adjust the recipes for any inloop reductions. 9441 adjustRecipesForInLoopReductions(Plan, RecipeBuilder, Range.Start); 9442 9443 // Finally, if tail is folded by masking, introduce selects between the phi 9444 // and the live-out instruction of each reduction, at the end of the latch. 
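  // Per lane, such a select keeps the previous partial result wherever the
  // loop mask is off, so masked-out tail lanes do not perturb the reduction.
  // A minimal standalone sketch over plain arrays (assumed fixed VF of 4,
  // hypothetical helper):
  //
  //   #include <array>
  //   constexpr unsigned SketchVF = 4;
  //   std::array<int, SketchVF>
  //   latchSelect(const std::array<bool, SketchVF> &Mask,
  //               const std::array<int, SketchVF> &NewRed,
  //               const std::array<int, SketchVF> &PhiVal) {
  //     std::array<int, SketchVF> Out{};
  //     for (unsigned L = 0; L < SketchVF; ++L)
  //       Out[L] = Mask[L] ? NewRed[L] : PhiVal[L]; // inactive lanes keep old
  //     return Out;
  //   }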
9445 if (CM.foldTailByMasking() && !Legal->getReductionVars().empty()) { 9446 Builder.setInsertPoint(VPBB); 9447 auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan); 9448 for (auto &Reduction : Legal->getReductionVars()) { 9449 if (CM.isInLoopReduction(Reduction.first)) 9450 continue; 9451 VPValue *Phi = Plan->getOrAddVPValue(Reduction.first); 9452 VPValue *Red = Plan->getOrAddVPValue(Reduction.second.getLoopExitInstr()); 9453 Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi}); 9454 } 9455 } 9456 9457 VPlanTransforms::sinkScalarOperands(*Plan); 9458 VPlanTransforms::mergeReplicateRegions(*Plan); 9459 9460 std::string PlanName; 9461 raw_string_ostream RSO(PlanName); 9462 ElementCount VF = Range.Start; 9463 Plan->addVF(VF); 9464 RSO << "Initial VPlan for VF={" << VF; 9465 for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) { 9466 Plan->addVF(VF); 9467 RSO << "," << VF; 9468 } 9469 RSO << "},UF>=1"; 9470 RSO.flush(); 9471 Plan->setName(PlanName); 9472 9473 return Plan; 9474 } 9475 9476 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { 9477 // Outer loop handling: They may require CFG and instruction level 9478 // transformations before even evaluating whether vectorization is profitable. 9479 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 9480 // the vectorization pipeline. 9481 assert(!OrigLoop->isInnermost()); 9482 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 9483 9484 // Create new empty VPlan 9485 auto Plan = std::make_unique<VPlan>(); 9486 9487 // Build hierarchical CFG 9488 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan); 9489 HCFGBuilder.buildHierarchicalCFG(); 9490 9491 for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End); 9492 VF *= 2) 9493 Plan->addVF(VF); 9494 9495 if (EnableVPlanPredication) { 9496 VPlanPredicator VPP(*Plan); 9497 VPP.predicate(); 9498 9499 // Avoid running transformation to recipes until masked code generation in 9500 // VPlan-native path is in place. 9501 return Plan; 9502 } 9503 9504 SmallPtrSet<Instruction *, 1> DeadInstructions; 9505 VPlanTransforms::VPInstructionsToVPRecipes(OrigLoop, Plan, 9506 Legal->getInductionVars(), 9507 DeadInstructions, *PSE.getSE()); 9508 return Plan; 9509 } 9510 9511 // Adjust the recipes for any inloop reductions. The chain of instructions 9512 // leading from the loop exit instr to the phi need to be converted to 9513 // reductions, with one operand being vector and the other being the scalar 9514 // reduction chain. 9515 void LoopVectorizationPlanner::adjustRecipesForInLoopReductions( 9516 VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, ElementCount MinVF) { 9517 for (auto &Reduction : CM.getInLoopReductionChains()) { 9518 PHINode *Phi = Reduction.first; 9519 RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi]; 9520 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second; 9521 9522 if (MinVF.isScalar() && !CM.useOrderedReductions(RdxDesc)) 9523 continue; 9524 9525 // ReductionOperations are orders top-down from the phi's use to the 9526 // LoopExitValue. We keep a track of the previous item (the Chain) to tell 9527 // which of the two operands will remain scalar and which will be reduced. 9528 // For minmax the chain will be the select instructions. 
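    // For a non-minmax in-loop reduction each widened operation in the chain is
    // replaced by a VPReductionRecipe that folds its vector operand into the
    // running scalar chain. A simplified standalone model of one add-reduction
    // step (assumed fixed VF of 4, plain ints, hypothetical helper):
    //
    //   #include <array>
    //   constexpr unsigned ModelVF = 4;
    //   // One chain step: Chain' = Chain + horizontal_sum(VecOp).
    //   int reductionStep(int Chain, const std::array<int, ModelVF> &VecOp) {
    //     int Sum = 0;
    //     for (int V : VecOp)
    //       Sum += V;
    //     return Chain + Sum;
    //   }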
9529 Instruction *Chain = Phi; 9530 for (Instruction *R : ReductionOperations) { 9531 VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R); 9532 RecurKind Kind = RdxDesc.getRecurrenceKind(); 9533 9534 VPValue *ChainOp = Plan->getVPValue(Chain); 9535 unsigned FirstOpId; 9536 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9537 assert(isa<VPWidenSelectRecipe>(WidenRecipe) && 9538 "Expected to replace a VPWidenSelectSC"); 9539 FirstOpId = 1; 9540 } else { 9541 assert((MinVF.isScalar() || isa<VPWidenRecipe>(WidenRecipe)) && 9542 "Expected to replace a VPWidenSC"); 9543 FirstOpId = 0; 9544 } 9545 unsigned VecOpId = 9546 R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId; 9547 VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId)); 9548 9549 auto *CondOp = CM.foldTailByMasking() 9550 ? RecipeBuilder.createBlockInMask(R->getParent(), Plan) 9551 : nullptr; 9552 VPReductionRecipe *RedRecipe = new VPReductionRecipe( 9553 &RdxDesc, R, ChainOp, VecOp, CondOp, TTI); 9554 WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe); 9555 Plan->removeVPValueFor(R); 9556 Plan->addVPValue(R, RedRecipe); 9557 WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator()); 9558 WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe); 9559 WidenRecipe->eraseFromParent(); 9560 9561 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9562 VPRecipeBase *CompareRecipe = 9563 RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0))); 9564 assert(isa<VPWidenRecipe>(CompareRecipe) && 9565 "Expected to replace a VPWidenSC"); 9566 assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 && 9567 "Expected no remaining users"); 9568 CompareRecipe->eraseFromParent(); 9569 } 9570 Chain = R; 9571 } 9572 } 9573 } 9574 9575 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 9576 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, 9577 VPSlotTracker &SlotTracker) const { 9578 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; 9579 IG->getInsertPos()->printAsOperand(O, false); 9580 O << ", "; 9581 getAddr()->printAsOperand(O, SlotTracker); 9582 VPValue *Mask = getMask(); 9583 if (Mask) { 9584 O << ", "; 9585 Mask->printAsOperand(O, SlotTracker); 9586 } 9587 9588 unsigned OpIdx = 0; 9589 for (unsigned i = 0; i < IG->getFactor(); ++i) { 9590 if (!IG->getMember(i)) 9591 continue; 9592 if (getNumStoreOperands() > 0) { 9593 O << "\n" << Indent << " store "; 9594 getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker); 9595 O << " to index " << i; 9596 } else { 9597 O << "\n" << Indent << " "; 9598 getVPValue(OpIdx)->printAsOperand(O, SlotTracker); 9599 O << " = load from index " << i; 9600 } 9601 ++OpIdx; 9602 } 9603 } 9604 #endif 9605 9606 void VPWidenCallRecipe::execute(VPTransformState &State) { 9607 State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this, 9608 *this, State); 9609 } 9610 9611 void VPWidenSelectRecipe::execute(VPTransformState &State) { 9612 State.ILV->widenSelectInstruction(*cast<SelectInst>(getUnderlyingInstr()), 9613 this, *this, InvariantCond, State); 9614 } 9615 9616 void VPWidenRecipe::execute(VPTransformState &State) { 9617 State.ILV->widenInstruction(*getUnderlyingInstr(), this, *this, State); 9618 } 9619 9620 void VPWidenGEPRecipe::execute(VPTransformState &State) { 9621 State.ILV->widenGEP(cast<GetElementPtrInst>(getUnderlyingInstr()), this, 9622 *this, State.UF, State.VF, IsPtrLoopInvariant, 9623 IsIndexLoopInvariant, State); 9624 } 9625 9626 void 
VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { 9627 assert(!State.Instance && "Int or FP induction being replicated."); 9628 State.ILV->widenIntOrFpInduction(IV, getStartValue()->getLiveInIRValue(), 9629 getTruncInst(), getVPValue(0), 9630 getCastValue(), State); 9631 } 9632 9633 void VPWidenPHIRecipe::execute(VPTransformState &State) { 9634 State.ILV->widenPHIInstruction(cast<PHINode>(getUnderlyingValue()), this, 9635 State); 9636 } 9637 9638 void VPBlendRecipe::execute(VPTransformState &State) { 9639 State.ILV->setDebugLocFromInst(Phi, &State.Builder); 9640 // We know that all PHIs in non-header blocks are converted into 9641 // selects, so we don't have to worry about the insertion order and we 9642 // can just use the builder. 9643 // At this point we generate the predication tree. There may be 9644 // duplications since this is a simple recursive scan, but future 9645 // optimizations will clean it up. 9646 9647 unsigned NumIncoming = getNumIncomingValues(); 9648 9649 // Generate a sequence of selects of the form: 9650 // SELECT(Mask3, In3, 9651 // SELECT(Mask2, In2, 9652 // SELECT(Mask1, In1, 9653 // In0))) 9654 // Note that Mask0 is never used: lanes for which no path reaches this phi and 9655 // are essentially undef are taken from In0. 9656 InnerLoopVectorizer::VectorParts Entry(State.UF); 9657 for (unsigned In = 0; In < NumIncoming; ++In) { 9658 for (unsigned Part = 0; Part < State.UF; ++Part) { 9659 // We might have single edge PHIs (blocks) - use an identity 9660 // 'select' for the first PHI operand. 9661 Value *In0 = State.get(getIncomingValue(In), Part); 9662 if (In == 0) 9663 Entry[Part] = In0; // Initialize with the first incoming value. 9664 else { 9665 // Select between the current value and the previous incoming edge 9666 // based on the incoming mask. 
9667 Value *Cond = State.get(getMask(In), Part); 9668 Entry[Part] = 9669 State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi"); 9670 } 9671 } 9672 } 9673 for (unsigned Part = 0; Part < State.UF; ++Part) 9674 State.set(this, Entry[Part], Part); 9675 } 9676 9677 void VPInterleaveRecipe::execute(VPTransformState &State) { 9678 assert(!State.Instance && "Interleave group being replicated."); 9679 State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(), 9680 getStoredValues(), getMask()); 9681 } 9682 9683 void VPReductionRecipe::execute(VPTransformState &State) { 9684 assert(!State.Instance && "Reduction being replicated."); 9685 Value *PrevInChain = State.get(getChainOp(), 0); 9686 for (unsigned Part = 0; Part < State.UF; ++Part) { 9687 RecurKind Kind = RdxDesc->getRecurrenceKind(); 9688 bool IsOrdered = State.ILV->useOrderedReductions(*RdxDesc); 9689 Value *NewVecOp = State.get(getVecOp(), Part); 9690 if (VPValue *Cond = getCondOp()) { 9691 Value *NewCond = State.get(Cond, Part); 9692 VectorType *VecTy = cast<VectorType>(NewVecOp->getType()); 9693 Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity( 9694 Kind, VecTy->getElementType(), RdxDesc->getFastMathFlags()); 9695 Constant *IdenVec = 9696 ConstantVector::getSplat(VecTy->getElementCount(), Iden); 9697 Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec); 9698 NewVecOp = Select; 9699 } 9700 Value *NewRed; 9701 Value *NextInChain; 9702 if (IsOrdered) { 9703 if (State.VF.isVector()) 9704 NewRed = createOrderedReduction(State.Builder, *RdxDesc, NewVecOp, 9705 PrevInChain); 9706 else 9707 NewRed = State.Builder.CreateBinOp( 9708 (Instruction::BinaryOps)getUnderlyingInstr()->getOpcode(), 9709 PrevInChain, NewVecOp); 9710 PrevInChain = NewRed; 9711 } else { 9712 PrevInChain = State.get(getChainOp(), Part); 9713 NewRed = createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp); 9714 } 9715 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9716 NextInChain = 9717 createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(), 9718 NewRed, PrevInChain); 9719 } else if (IsOrdered) 9720 NextInChain = NewRed; 9721 else { 9722 NextInChain = State.Builder.CreateBinOp( 9723 (Instruction::BinaryOps)getUnderlyingInstr()->getOpcode(), NewRed, 9724 PrevInChain); 9725 } 9726 State.set(this, NextInChain, Part); 9727 } 9728 } 9729 9730 void VPReplicateRecipe::execute(VPTransformState &State) { 9731 if (State.Instance) { // Generate a single instance. 9732 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector"); 9733 State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *this, 9734 *State.Instance, IsPredicated, State); 9735 // Insert scalar instance packing it into a vector. 9736 if (AlsoPack && State.VF.isVector()) { 9737 // If we're constructing lane 0, initialize to start from poison. 9738 if (State.Instance->Lane.isFirstLane()) { 9739 assert(!State.VF.isScalable() && "VF is assumed to be non scalable."); 9740 Value *Poison = PoisonValue::get( 9741 VectorType::get(getUnderlyingValue()->getType(), State.VF)); 9742 State.set(this, Poison, State.Instance->Part); 9743 } 9744 State.ILV->packScalarIntoVectorValue(this, *State.Instance, State); 9745 } 9746 return; 9747 } 9748 9749 // Generate scalar instances for all VF lanes of all UF parts, unless the 9750 // instruction is uniform inwhich case generate only the first lane for each 9751 // of the UF parts. 9752 unsigned EndLane = IsUniform ? 
1 : State.VF.getKnownMinValue(); 9753 assert((!State.VF.isScalable() || IsUniform) && 9754 "Can't scalarize a scalable vector"); 9755 for (unsigned Part = 0; Part < State.UF; ++Part) 9756 for (unsigned Lane = 0; Lane < EndLane; ++Lane) 9757 State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *this, 9758 VPIteration(Part, Lane), IsPredicated, 9759 State); 9760 } 9761 9762 void VPBranchOnMaskRecipe::execute(VPTransformState &State) { 9763 assert(State.Instance && "Branch on Mask works only on single instance."); 9764 9765 unsigned Part = State.Instance->Part; 9766 unsigned Lane = State.Instance->Lane.getKnownLane(); 9767 9768 Value *ConditionBit = nullptr; 9769 VPValue *BlockInMask = getMask(); 9770 if (BlockInMask) { 9771 ConditionBit = State.get(BlockInMask, Part); 9772 if (ConditionBit->getType()->isVectorTy()) 9773 ConditionBit = State.Builder.CreateExtractElement( 9774 ConditionBit, State.Builder.getInt32(Lane)); 9775 } else // Block in mask is all-one. 9776 ConditionBit = State.Builder.getTrue(); 9777 9778 // Replace the temporary unreachable terminator with a new conditional branch, 9779 // whose two destinations will be set later when they are created. 9780 auto *CurrentTerminator = State.CFG.PrevBB->getTerminator(); 9781 assert(isa<UnreachableInst>(CurrentTerminator) && 9782 "Expected to replace unreachable terminator with conditional branch."); 9783 auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit); 9784 CondBr->setSuccessor(0, nullptr); 9785 ReplaceInstWithInst(CurrentTerminator, CondBr); 9786 } 9787 9788 void VPPredInstPHIRecipe::execute(VPTransformState &State) { 9789 assert(State.Instance && "Predicated instruction PHI works per instance."); 9790 Instruction *ScalarPredInst = 9791 cast<Instruction>(State.get(getOperand(0), *State.Instance)); 9792 BasicBlock *PredicatedBB = ScalarPredInst->getParent(); 9793 BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor(); 9794 assert(PredicatingBB && "Predicated block has no single predecessor."); 9795 assert(isa<VPReplicateRecipe>(getOperand(0)) && 9796 "operand must be VPReplicateRecipe"); 9797 9798 // By current pack/unpack logic we need to generate only a single phi node: if 9799 // a vector value for the predicated instruction exists at this point it means 9800 // the instruction has vector users only, and a phi for the vector value is 9801 // needed. In this case the recipe of the predicated instruction is marked to 9802 // also do that packing, thereby "hoisting" the insert-element sequence. 9803 // Otherwise, a phi node for the scalar value is needed. 9804 unsigned Part = State.Instance->Part; 9805 if (State.hasVectorValue(getOperand(0), Part)) { 9806 Value *VectorValue = State.get(getOperand(0), Part); 9807 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); 9808 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); 9809 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. 9810 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. 9811 if (State.hasVectorValue(this, Part)) 9812 State.reset(this, VPhi, Part); 9813 else 9814 State.set(this, VPhi, Part); 9815 // NOTE: Currently we need to update the value of the operand, so the next 9816 // predicated iteration inserts its generated value in the correct vector. 
9817 State.reset(getOperand(0), VPhi, Part); 9818 } else { 9819 Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType(); 9820 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); 9821 Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()), 9822 PredicatingBB); 9823 Phi->addIncoming(ScalarPredInst, PredicatedBB); 9824 if (State.hasScalarValue(this, *State.Instance)) 9825 State.reset(this, Phi, *State.Instance); 9826 else 9827 State.set(this, Phi, *State.Instance); 9828 // NOTE: Currently we need to update the value of the operand, so the next 9829 // predicated iteration inserts its generated value in the correct vector. 9830 State.reset(getOperand(0), Phi, *State.Instance); 9831 } 9832 } 9833 9834 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { 9835 VPValue *StoredValue = isStore() ? getStoredValue() : nullptr; 9836 State.ILV->vectorizeMemoryInstruction( 9837 &Ingredient, State, StoredValue ? nullptr : getVPSingleValue(), getAddr(), 9838 StoredValue, getMask()); 9839 } 9840 9841 // Determine how to lower the scalar epilogue, which depends on 1) optimising 9842 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing 9843 // predication, and 4) a TTI hook that analyses whether the loop is suitable 9844 // for predication. 9845 static ScalarEpilogueLowering getScalarEpilogueLowering( 9846 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, 9847 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, 9848 AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, 9849 LoopVectorizationLegality &LVL) { 9850 // 1) OptSize takes precedence over all other options, i.e. if this is set, 9851 // don't look at hints or options, and don't request a scalar epilogue. 9852 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from 9853 // LoopAccessInfo (due to code dependency and not being able to reliably get 9854 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection 9855 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without 9856 // versioning when the vectorization is forced, unlike hasOptSize. So revert 9857 // back to the old way and vectorize with versioning when forced. See D81345.) 9858 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, 9859 PGSOQueryType::IRPass) && 9860 Hints.getForce() != LoopVectorizeHints::FK_Enabled)) 9861 return CM_ScalarEpilogueNotAllowedOptSize; 9862 9863 // 2) If set, obey the directives 9864 if (PreferPredicateOverEpilogue.getNumOccurrences()) { 9865 switch (PreferPredicateOverEpilogue) { 9866 case PreferPredicateTy::ScalarEpilogue: 9867 return CM_ScalarEpilogueAllowed; 9868 case PreferPredicateTy::PredicateElseScalarEpilogue: 9869 return CM_ScalarEpilogueNotNeededUsePredicate; 9870 case PreferPredicateTy::PredicateOrDontVectorize: 9871 return CM_ScalarEpilogueNotAllowedUsePredicate; 9872 }; 9873 } 9874 9875 // 3) If set, obey the hints 9876 switch (Hints.getPredicate()) { 9877 case LoopVectorizeHints::FK_Enabled: 9878 return CM_ScalarEpilogueNotNeededUsePredicate; 9879 case LoopVectorizeHints::FK_Disabled: 9880 return CM_ScalarEpilogueAllowed; 9881 }; 9882 9883 // 4) if the TTI hook indicates this is profitable, request predication. 
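// (For example, targets with hardware loop predication, such as ARM MVE
// tail-predicated loops, may implement preferPredicateOverEpilogue to
// request a folded tail instead of a scalar remainder loop.)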
9884 if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, 9885 LVL.getLAI())) 9886 return CM_ScalarEpilogueNotNeededUsePredicate; 9887 9888 return CM_ScalarEpilogueAllowed; 9889 } 9890 9891 Value *VPTransformState::get(VPValue *Def, unsigned Part) { 9892 // If Values have been set for this Def return the one relevant for \p Part. 9893 if (hasVectorValue(Def, Part)) 9894 return Data.PerPartOutput[Def][Part]; 9895 9896 if (!hasScalarValue(Def, {Part, 0})) { 9897 Value *IRV = Def->getLiveInIRValue(); 9898 Value *B = ILV->getBroadcastInstrs(IRV); 9899 set(Def, B, Part); 9900 return B; 9901 } 9902 9903 Value *ScalarValue = get(Def, {Part, 0}); 9904 // If we aren't vectorizing, we can just copy the scalar map values over 9905 // to the vector map. 9906 if (VF.isScalar()) { 9907 set(Def, ScalarValue, Part); 9908 return ScalarValue; 9909 } 9910 9911 auto *RepR = dyn_cast<VPReplicateRecipe>(Def); 9912 bool IsUniform = RepR && RepR->isUniform(); 9913 9914 unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1; 9915 // Check if there is a scalar value for the selected lane. 9916 if (!hasScalarValue(Def, {Part, LastLane})) { 9917 // At the moment, VPWidenIntOrFpInductionRecipes can also be uniform. 9918 assert(isa<VPWidenIntOrFpInductionRecipe>(Def->getDef()) && 9919 "unexpected recipe found to be invariant"); 9920 IsUniform = true; 9921 LastLane = 0; 9922 } 9923 9924 auto *LastInst = cast<Instruction>(get(Def, {Part, LastLane})); 9925 // Set the insert point after the last scalarized instruction or after the 9926 // last PHI, if LastInst is a PHI. This ensures the insertelement sequence 9927 // will directly follow the scalar definitions. 9928 auto OldIP = Builder.saveIP(); 9929 auto NewIP = 9930 isa<PHINode>(LastInst) 9931 ? BasicBlock::iterator(LastInst->getParent()->getFirstNonPHI()) 9932 : std::next(BasicBlock::iterator(LastInst)); 9933 Builder.SetInsertPoint(&*NewIP); 9934 9935 // However, if we are vectorizing, we need to construct the vector values. 9936 // If the value is known to be uniform after vectorization, we can just 9937 // broadcast the scalar value corresponding to lane zero for each unroll 9938 // iteration. Otherwise, we construct the vector values using 9939 // insertelement instructions. Since the resulting vectors are stored in 9940 // State, we will only generate the insertelements once. 9941 Value *VectorValue = nullptr; 9942 if (IsUniform) { 9943 VectorValue = ILV->getBroadcastInstrs(ScalarValue); 9944 set(Def, VectorValue, Part); 9945 } else { 9946 // Initialize packing with insertelements to start from undef. 9947 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 9948 Value *Undef = PoisonValue::get(VectorType::get(LastInst->getType(), VF)); 9949 set(Def, Undef, Part); 9950 for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane) 9951 ILV->packScalarIntoVectorValue(Def, {Part, Lane}, *this); 9952 VectorValue = get(Def, Part); 9953 } 9954 Builder.restoreIP(OldIP); 9955 return VectorValue; 9956 } 9957 9958 // Process the loop in the VPlan-native vectorization path. This path builds 9959 // VPlan upfront in the vectorization pipeline, which allows to apply 9960 // VPlan-to-VPlan transformations from the very beginning without modifying the 9961 // input LLVM IR. 
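// The path is gated by the EnableVPlanNativePath cl::opt (typically enabled
// with something like -mllvm -enable-vplan-native-path) and is intended for
// development of outer-loop vectorization rather than default use.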
9962 static bool processLoopInVPlanNativePath( 9963 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, 9964 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, 9965 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, 9966 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, 9967 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints, 9968 LoopVectorizationRequirements &Requirements) { 9969 9970 if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) { 9971 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n"); 9972 return false; 9973 } 9974 assert(EnableVPlanNativePath && "VPlan-native path is disabled."); 9975 Function *F = L->getHeader()->getParent(); 9976 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); 9977 9978 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 9979 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL); 9980 9981 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, 9982 &Hints, IAI); 9983 // Use the planner for outer loop vectorization. 9984 // TODO: CM is not used at this point inside the planner. Turn CM into an 9985 // optional argument if we don't need it in the future. 9986 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints, 9987 Requirements, ORE); 9988 9989 // Get user vectorization factor. 9990 ElementCount UserVF = Hints.getWidth(); 9991 9992 CM.collectElementTypesForWidening(); 9993 9994 // Plan how to best vectorize, return the best VF and its cost. 9995 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF); 9996 9997 // If we are stress testing VPlan builds, do not attempt to generate vector 9998 // code. Masked vector code generation support will follow soon. 9999 // Also, do not attempt to vectorize if no vector code will be produced. 10000 if (VPlanBuildStressTest || EnableVPlanPredication || 10001 VectorizationFactor::Disabled() == VF) 10002 return false; 10003 10004 LVP.setBestPlan(VF.Width, 1); 10005 10006 { 10007 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, 10008 F->getParent()->getDataLayout()); 10009 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL, 10010 &CM, BFI, PSI, Checks); 10011 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" 10012 << L->getHeader()->getParent()->getName() << "\"\n"); 10013 LVP.executePlan(LB, DT); 10014 } 10015 10016 // Mark the loop as already vectorized to avoid vectorizing again. 10017 Hints.setAlreadyVectorized(); 10018 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 10019 return true; 10020 } 10021 10022 // Emit a remark if there are stores to floats that required a floating point 10023 // extension. If the vectorized loop was generated with floating point there 10024 // will be a performance penalty from the conversion overhead and the change in 10025 // the vector width. 10026 static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) { 10027 SmallVector<Instruction *, 4> Worklist; 10028 for (BasicBlock *BB : L->getBlocks()) { 10029 for (Instruction &Inst : *BB) { 10030 if (auto *S = dyn_cast<StoreInst>(&Inst)) { 10031 if (S->getValueOperand()->getType()->isFloatTy()) 10032 Worklist.push_back(S); 10033 } 10034 } 10035 } 10036 10037 // Traverse the floating point stores upwards searching, for floating point 10038 // conversions. 
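// For example (illustrative), a loop such as
//   float F[N], G[N]; double D;
//   for (i = 0; i < n; i++) F[i] = D * G[i];
// extends G[i] to double and truncates the product back to float for the
// store; the fpext found while walking up from that store triggers the
// remark below about the conversion changing the vector width.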
10039 SmallPtrSet<const Instruction *, 4> Visited; 10040 SmallPtrSet<const Instruction *, 4> EmittedRemark; 10041 while (!Worklist.empty()) { 10042 auto *I = Worklist.pop_back_val(); 10043 if (!L->contains(I)) 10044 continue; 10045 if (!Visited.insert(I).second) 10046 continue; 10047 10048 // Emit a remark if the floating point store required a floating 10049 // point conversion. 10050 // TODO: More work could be done to identify the root cause such as a 10051 // constant or a function return type and point the user to it. 10052 if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second) 10053 ORE->emit([&]() { 10054 return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision", 10055 I->getDebugLoc(), L->getHeader()) 10056 << "floating point conversion changes vector width. " 10057 << "Mixed floating point precision requires an up/down " 10058 << "cast that will negatively impact performance."; 10059 }); 10060 10061 for (Use &Op : I->operands()) 10062 if (auto *OpI = dyn_cast<Instruction>(Op)) 10063 Worklist.push_back(OpI); 10064 } 10065 } 10066 10067 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts) 10068 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced || 10069 !EnableLoopInterleaving), 10070 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced || 10071 !EnableLoopVectorization) {} 10072 10073 bool LoopVectorizePass::processLoop(Loop *L) { 10074 assert((EnableVPlanNativePath || L->isInnermost()) && 10075 "VPlan-native path is not enabled. Only process inner loops."); 10076 10077 #ifndef NDEBUG 10078 const std::string DebugLocStr = getDebugLocString(L); 10079 #endif /* NDEBUG */ 10080 10081 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \"" 10082 << L->getHeader()->getParent()->getName() << "\" from " 10083 << DebugLocStr << "\n"); 10084 10085 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE); 10086 10087 LLVM_DEBUG( 10088 dbgs() << "LV: Loop hints:" 10089 << " force=" 10090 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled 10091 ? "disabled" 10092 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled 10093 ? "enabled" 10094 : "?")) 10095 << " width=" << Hints.getWidth() 10096 << " interleave=" << Hints.getInterleave() << "\n"); 10097 10098 // Function containing loop 10099 Function *F = L->getHeader()->getParent(); 10100 10101 // Looking at the diagnostic output is the only way to determine if a loop 10102 // was vectorized (other than looking at the IR or machine code), so it 10103 // is important to generate an optimization remark for each loop. Most of 10104 // these messages are generated as OptimizationRemarkAnalysis. Remarks 10105 // generated as OptimizationRemark and OptimizationRemarkMissed are 10106 // less verbose reporting vectorized loops and unvectorized loops that may 10107 // benefit from vectorization, respectively. 10108 10109 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) { 10110 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); 10111 return false; 10112 } 10113 10114 PredicatedScalarEvolution PSE(*SE, *L); 10115 10116 // Check if it is legal to vectorize the loop. 
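// For example (illustrative), a loop with an unprovable loop-carried
// dependence such as
//   for (i = 1; i < n; i++) A[i] = A[i - K] + B[i];   // K unknown
// cannot be proven safe and is rejected here, with the remark emitted below
// explaining why.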
10117 LoopVectorizationRequirements Requirements; 10118 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE, 10119 &Requirements, &Hints, DB, AC, BFI, PSI); 10120 if (!LVL.canVectorize(EnableVPlanNativePath)) { 10121 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); 10122 Hints.emitRemarkWithHints(); 10123 return false; 10124 } 10125 10126 // Check the function attributes and profiles to find out if this function 10127 // should be optimized for size. 10128 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 10129 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL); 10130 10131 // Entrance to the VPlan-native vectorization path. Outer loops are processed 10132 // here. They may require CFG and instruction level transformations before 10133 // even evaluating whether vectorization is profitable. Since we cannot modify 10134 // the incoming IR, we need to build VPlan upfront in the vectorization 10135 // pipeline. 10136 if (!L->isInnermost()) 10137 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, 10138 ORE, BFI, PSI, Hints, Requirements); 10139 10140 assert(L->isInnermost() && "Inner loop expected."); 10141 10142 // Check the loop for a trip count threshold: vectorize loops with a tiny trip 10143 // count by optimizing for size, to minimize overheads. 10144 auto ExpectedTC = getSmallBestKnownTC(*SE, L); 10145 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) { 10146 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " 10147 << "This loop is worth vectorizing only if no scalar " 10148 << "iteration overheads are incurred."); 10149 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) 10150 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); 10151 else { 10152 LLVM_DEBUG(dbgs() << "\n"); 10153 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; 10154 } 10155 } 10156 10157 // Check the function attributes to see if implicit floats are allowed. 10158 // FIXME: This check doesn't seem possibly correct -- what if the loop is 10159 // an integer loop and the vector instructions selected are purely integer 10160 // vector instructions? 10161 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { 10162 reportVectorizationFailure( 10163 "Can't vectorize when the NoImplicitFloat attribute is used", 10164 "loop not vectorized due to NoImplicitFloat attribute", 10165 "NoImplicitFloat", ORE, L); 10166 Hints.emitRemarkWithHints(); 10167 return false; 10168 } 10169 10170 // Check if the target supports potentially unsafe FP vectorization. 10171 // FIXME: Add a check for the type of safety issue (denormal, signaling) 10172 // for the target we're vectorizing for, to make sure none of the 10173 // additional fp-math flags can help. 
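// Roughly: the hints mark the loop potentially unsafe when it contains FP
// operations whose results could change under vector evaluation, and the
// TTI hook reports targets (e.g. SIMD units that flush denormals to zero)
// where vector FP semantics differ from the scalar semantics.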
10174 if (Hints.isPotentiallyUnsafe() && 10175 TTI->isFPVectorizationPotentiallyUnsafe()) { 10176 reportVectorizationFailure( 10177 "Potentially unsafe FP op prevents vectorization", 10178 "loop not vectorized due to unsafe FP support.", 10179 "UnsafeFP", ORE, L); 10180 Hints.emitRemarkWithHints(); 10181 return false; 10182 } 10183 10184 if (!LVL.canVectorizeFPMath(ForceOrderedReductions)) { 10185 ORE->emit([&]() { 10186 auto *ExactFPMathInst = Requirements.getExactFPInst(); 10187 return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps", 10188 ExactFPMathInst->getDebugLoc(), 10189 ExactFPMathInst->getParent()) 10190 << "loop not vectorized: cannot prove it is safe to reorder " 10191 "floating-point operations"; 10192 }); 10193 LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to " 10194 "reorder floating-point operations\n"); 10195 Hints.emitRemarkWithHints(); 10196 return false; 10197 } 10198 10199 bool UseInterleaved = TTI->enableInterleavedAccessVectorization(); 10200 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI()); 10201 10202 // If an override option has been passed in for interleaved accesses, use it. 10203 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0) 10204 UseInterleaved = EnableInterleavedMemAccesses; 10205 10206 // Analyze interleaved memory accesses. 10207 if (UseInterleaved) { 10208 IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI)); 10209 } 10210 10211 // Use the cost model. 10212 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, 10213 F, &Hints, IAI); 10214 CM.collectValuesToIgnore(); 10215 CM.collectElementTypesForWidening(); 10216 10217 // Use the planner for vectorization. 10218 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints, 10219 Requirements, ORE); 10220 10221 // Get user vectorization factor and interleave count. 10222 ElementCount UserVF = Hints.getWidth(); 10223 unsigned UserIC = Hints.getInterleave(); 10224 10225 // Plan how to best vectorize, return the best VF and its cost. 10226 Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC); 10227 10228 VectorizationFactor VF = VectorizationFactor::Disabled(); 10229 unsigned IC = 1; 10230 10231 if (MaybeVF) { 10232 VF = *MaybeVF; 10233 // Select the interleave count. 10234 IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue()); 10235 } 10236 10237 // Identify the diagnostic messages that should be produced. 10238 std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg; 10239 bool VectorizeLoop = true, InterleaveLoop = true; 10240 if (VF.Width.isScalar()) { 10241 LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n"); 10242 VecDiagMsg = std::make_pair( 10243 "VectorizationNotBeneficial", 10244 "the cost-model indicates that vectorization is not beneficial"); 10245 VectorizeLoop = false; 10246 } 10247 10248 if (!MaybeVF && UserIC > 1) { 10249 // Tell the user interleaving was avoided up-front, despite being explicitly 10250 // requested. 10251 LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and " 10252 "interleaving should be avoided up front\n"); 10253 IntDiagMsg = std::make_pair( 10254 "InterleavingAvoided", 10255 "Ignoring UserIC, because interleaving was avoided up front"); 10256 InterleaveLoop = false; 10257 } else if (IC == 1 && UserIC <= 1) { 10258 // Tell the user interleaving is not beneficial. 
10259 LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n"); 10260 IntDiagMsg = std::make_pair( 10261 "InterleavingNotBeneficial", 10262 "the cost-model indicates that interleaving is not beneficial"); 10263 InterleaveLoop = false; 10264 if (UserIC == 1) { 10265 IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled"; 10266 IntDiagMsg.second += 10267 " and is explicitly disabled or interleave count is set to 1"; 10268 } 10269 } else if (IC > 1 && UserIC == 1) { 10270 // Tell the user interleaving is beneficial, but it explicitly disabled. 10271 LLVM_DEBUG( 10272 dbgs() << "LV: Interleaving is beneficial but is explicitly disabled."); 10273 IntDiagMsg = std::make_pair( 10274 "InterleavingBeneficialButDisabled", 10275 "the cost-model indicates that interleaving is beneficial " 10276 "but is explicitly disabled or interleave count is set to 1"); 10277 InterleaveLoop = false; 10278 } 10279 10280 // Override IC if user provided an interleave count. 10281 IC = UserIC > 0 ? UserIC : IC; 10282 10283 // Emit diagnostic messages, if any. 10284 const char *VAPassName = Hints.vectorizeAnalysisPassName(); 10285 if (!VectorizeLoop && !InterleaveLoop) { 10286 // Do not vectorize or interleaving the loop. 10287 ORE->emit([&]() { 10288 return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first, 10289 L->getStartLoc(), L->getHeader()) 10290 << VecDiagMsg.second; 10291 }); 10292 ORE->emit([&]() { 10293 return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first, 10294 L->getStartLoc(), L->getHeader()) 10295 << IntDiagMsg.second; 10296 }); 10297 return false; 10298 } else if (!VectorizeLoop && InterleaveLoop) { 10299 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 10300 ORE->emit([&]() { 10301 return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first, 10302 L->getStartLoc(), L->getHeader()) 10303 << VecDiagMsg.second; 10304 }); 10305 } else if (VectorizeLoop && !InterleaveLoop) { 10306 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width 10307 << ") in " << DebugLocStr << '\n'); 10308 ORE->emit([&]() { 10309 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first, 10310 L->getStartLoc(), L->getHeader()) 10311 << IntDiagMsg.second; 10312 }); 10313 } else if (VectorizeLoop && InterleaveLoop) { 10314 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width 10315 << ") in " << DebugLocStr << '\n'); 10316 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 10317 } 10318 10319 bool DisableRuntimeUnroll = false; 10320 MDNode *OrigLoopID = L->getLoopID(); 10321 { 10322 // Optimistically generate runtime checks. Drop them if they turn out to not 10323 // be profitable. Limit the scope of Checks, so the cleanup happens 10324 // immediately after vector codegeneration is done. 10325 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, 10326 F->getParent()->getDataLayout()); 10327 if (!VF.Width.isScalar() || IC > 1) 10328 Checks.Create(L, *LVL.getLAI(), PSE.getUnionPredicate()); 10329 LVP.setBestPlan(VF.Width, IC); 10330 10331 using namespace ore; 10332 if (!VectorizeLoop) { 10333 assert(IC > 1 && "interleave count should not be 1 or 0"); 10334 // If we decided that it is not legal to vectorize the loop, then 10335 // interleave it. 
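// Interleaving without vectorization amounts to unrolling by IC with the
// iterations interleaved; for example (illustrative), with IC == 2
//   for (i = 0; i < n; i++) Sum += A[i];
// is emitted roughly as
//   for (i = 0; i < n; i += 2) { Sum0 += A[i]; Sum1 += A[i + 1]; }
// with the partial sums combined after the loop.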
10336 InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, 10337 &CM, BFI, PSI, Checks); 10338 LVP.executePlan(Unroller, DT); 10339 10340 ORE->emit([&]() { 10341 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(), 10342 L->getHeader()) 10343 << "interleaved loop (interleaved count: " 10344 << NV("InterleaveCount", IC) << ")"; 10345 }); 10346 } else { 10347 // If we decided that it is *legal* to vectorize the loop, then do it. 10348 10349 // Consider vectorizing the epilogue too if it's profitable. 10350 VectorizationFactor EpilogueVF = 10351 CM.selectEpilogueVectorizationFactor(VF.Width, LVP); 10352 if (EpilogueVF.Width.isVector()) { 10353 10354 // The first pass vectorizes the main loop and creates a scalar epilogue 10355 // to be vectorized by executing the plan (potentially with a different 10356 // factor) again shortly afterwards. 10357 EpilogueLoopVectorizationInfo EPI(VF.Width.getKnownMinValue(), IC, 10358 EpilogueVF.Width.getKnownMinValue(), 10359 1); 10360 EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, 10361 EPI, &LVL, &CM, BFI, PSI, Checks); 10362 10363 LVP.setBestPlan(EPI.MainLoopVF, EPI.MainLoopUF); 10364 LVP.executePlan(MainILV, DT); 10365 ++LoopsVectorized; 10366 10367 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */); 10368 formLCSSARecursively(*L, *DT, LI, SE); 10369 10370 // Second pass vectorizes the epilogue and adjusts the control flow 10371 // edges from the first pass. 10372 LVP.setBestPlan(EPI.EpilogueVF, EPI.EpilogueUF); 10373 EPI.MainLoopVF = EPI.EpilogueVF; 10374 EPI.MainLoopUF = EPI.EpilogueUF; 10375 EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC, 10376 ORE, EPI, &LVL, &CM, BFI, PSI, 10377 Checks); 10378 LVP.executePlan(EpilogILV, DT); 10379 ++LoopsEpilogueVectorized; 10380 10381 if (!MainILV.areSafetyChecksAdded()) 10382 DisableRuntimeUnroll = true; 10383 } else { 10384 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC, 10385 &LVL, &CM, BFI, PSI, Checks); 10386 LVP.executePlan(LB, DT); 10387 ++LoopsVectorized; 10388 10389 // Add metadata to disable runtime unrolling a scalar loop when there 10390 // are no runtime checks about strides and memory. A scalar loop that is 10391 // rarely used is not worth unrolling. 10392 if (!LB.areSafetyChecksAdded()) 10393 DisableRuntimeUnroll = true; 10394 } 10395 // Report the vectorization decision. 10396 ORE->emit([&]() { 10397 return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(), 10398 L->getHeader()) 10399 << "vectorized loop (vectorization width: " 10400 << NV("VectorizationFactor", VF.Width) 10401 << ", interleaved count: " << NV("InterleaveCount", IC) << ")"; 10402 }); 10403 } 10404 10405 if (ORE->allowExtraAnalysis(LV_NAME)) 10406 checkMixedPrecision(L, ORE); 10407 } 10408 10409 Optional<MDNode *> RemainderLoopID = 10410 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 10411 LLVMLoopVectorizeFollowupEpilogue}); 10412 if (RemainderLoopID.hasValue()) { 10413 L->setLoopID(RemainderLoopID.getValue()); 10414 } else { 10415 if (DisableRuntimeUnroll) 10416 AddRuntimeUnrollDisableMetaData(L); 10417 10418 // Mark the loop as already vectorized to avoid vectorizing again. 
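// (setAlreadyVectorized() does this by rewriting the loop ID metadata, e.g.
// attaching "llvm.loop.isvectorized", so later runs of the vectorizer leave
// the transformed loop alone.)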
10419 Hints.setAlreadyVectorized(); 10420 } 10421 10422 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 10423 return true; 10424 } 10425 10426 LoopVectorizeResult LoopVectorizePass::runImpl( 10427 Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_, 10428 DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_, 10429 DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_, 10430 std::function<const LoopAccessInfo &(Loop &)> &GetLAA_, 10431 OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) { 10432 SE = &SE_; 10433 LI = &LI_; 10434 TTI = &TTI_; 10435 DT = &DT_; 10436 BFI = &BFI_; 10437 TLI = TLI_; 10438 AA = &AA_; 10439 AC = &AC_; 10440 GetLAA = &GetLAA_; 10441 DB = &DB_; 10442 ORE = &ORE_; 10443 PSI = PSI_; 10444 10445 // Don't attempt if 10446 // 1. the target claims to have no vector registers, and 10447 // 2. interleaving won't help ILP. 10448 // 10449 // The second condition is necessary because, even if the target has no 10450 // vector registers, loop vectorization may still enable scalar 10451 // interleaving. 10452 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) && 10453 TTI->getMaxInterleaveFactor(1) < 2) 10454 return LoopVectorizeResult(false, false); 10455 10456 bool Changed = false, CFGChanged = false; 10457 10458 // The vectorizer requires loops to be in simplified form. 10459 // Since simplification may add new inner loops, it has to run before the 10460 // legality and profitability checks. This means running the loop vectorizer 10461 // will simplify all loops, regardless of whether anything end up being 10462 // vectorized. 10463 for (auto &L : *LI) 10464 Changed |= CFGChanged |= 10465 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */); 10466 10467 // Build up a worklist of inner-loops to vectorize. This is necessary as 10468 // the act of vectorizing or partially unrolling a loop creates new loops 10469 // and can invalidate iterators across the loops. 10470 SmallVector<Loop *, 8> Worklist; 10471 10472 for (Loop *L : *LI) 10473 collectSupportedLoops(*L, LI, ORE, Worklist); 10474 10475 LoopsAnalyzed += Worklist.size(); 10476 10477 // Now walk the identified inner loops. 10478 while (!Worklist.empty()) { 10479 Loop *L = Worklist.pop_back_val(); 10480 10481 // For the inner loops we actually process, form LCSSA to simplify the 10482 // transform. 10483 Changed |= formLCSSARecursively(*L, *DT, LI, SE); 10484 10485 Changed |= CFGChanged |= processLoop(L); 10486 } 10487 10488 // Process each loop nest in the function. 10489 return LoopVectorizeResult(Changed, CFGChanged); 10490 } 10491 10492 PreservedAnalyses LoopVectorizePass::run(Function &F, 10493 FunctionAnalysisManager &AM) { 10494 auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F); 10495 auto &LI = AM.getResult<LoopAnalysis>(F); 10496 auto &TTI = AM.getResult<TargetIRAnalysis>(F); 10497 auto &DT = AM.getResult<DominatorTreeAnalysis>(F); 10498 auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F); 10499 auto &TLI = AM.getResult<TargetLibraryAnalysis>(F); 10500 auto &AA = AM.getResult<AAManager>(F); 10501 auto &AC = AM.getResult<AssumptionAnalysis>(F); 10502 auto &DB = AM.getResult<DemandedBitsAnalysis>(F); 10503 auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F); 10504 MemorySSA *MSSA = EnableMSSALoopDependency 10505 ? 
&AM.getResult<MemorySSAAnalysis>(F).getMSSA() 10506 : nullptr; 10507 10508 auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager(); 10509 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 10510 [&](Loop &L) -> const LoopAccessInfo & { 10511 LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, 10512 TLI, TTI, nullptr, MSSA}; 10513 return LAM.getResult<LoopAccessAnalysis>(L, AR); 10514 }; 10515 auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F); 10516 ProfileSummaryInfo *PSI = 10517 MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent()); 10518 LoopVectorizeResult Result = 10519 runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI); 10520 if (!Result.MadeAnyChange) 10521 return PreservedAnalyses::all(); 10522 PreservedAnalyses PA; 10523 10524 // We currently do not preserve loopinfo/dominator analyses with outer loop 10525 // vectorization. Until this is addressed, mark these analyses as preserved 10526 // only for non-VPlan-native path. 10527 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 10528 if (!EnableVPlanNativePath) { 10529 PA.preserve<LoopAnalysis>(); 10530 PA.preserve<DominatorTreeAnalysis>(); 10531 } 10532 if (!Result.MadeCFGChange) 10533 PA.preserveSet<CFGAnalyses>(); 10534 return PA; 10535 } 10536