//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
//  Data for SIMD
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
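//
// For illustration only (this sketch is not part of the original header): a
// scalar loop such as
//
//   for (int i = 0; i < n; ++i)
//     a[i] = b[i] + c[i];
//
// is conceptually rewritten, for a vectorization factor of 4, into one wide
// iteration per four scalar iterations plus a scalar remainder:
//
//   for (i = 0; i + 4 <= n; i += 4)
//     a[i..i+3] = b[i..i+3] + c[i..i+3];   // single SIMD iteration
//   for (; i < n; ++i)                     // scalar epilogue
//     a[i] = b[i] + c[i];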
53 // 54 //===----------------------------------------------------------------------===// 55 56 #include "llvm/Transforms/Vectorize/LoopVectorize.h" 57 #include "LoopVectorizationPlanner.h" 58 #include "VPRecipeBuilder.h" 59 #include "VPlan.h" 60 #include "VPlanHCFGBuilder.h" 61 #include "VPlanPredicator.h" 62 #include "VPlanTransforms.h" 63 #include "llvm/ADT/APInt.h" 64 #include "llvm/ADT/ArrayRef.h" 65 #include "llvm/ADT/DenseMap.h" 66 #include "llvm/ADT/DenseMapInfo.h" 67 #include "llvm/ADT/Hashing.h" 68 #include "llvm/ADT/MapVector.h" 69 #include "llvm/ADT/None.h" 70 #include "llvm/ADT/Optional.h" 71 #include "llvm/ADT/STLExtras.h" 72 #include "llvm/ADT/SmallPtrSet.h" 73 #include "llvm/ADT/SmallSet.h" 74 #include "llvm/ADT/SmallVector.h" 75 #include "llvm/ADT/Statistic.h" 76 #include "llvm/ADT/StringRef.h" 77 #include "llvm/ADT/Twine.h" 78 #include "llvm/ADT/iterator_range.h" 79 #include "llvm/Analysis/AssumptionCache.h" 80 #include "llvm/Analysis/BasicAliasAnalysis.h" 81 #include "llvm/Analysis/BlockFrequencyInfo.h" 82 #include "llvm/Analysis/CFG.h" 83 #include "llvm/Analysis/CodeMetrics.h" 84 #include "llvm/Analysis/DemandedBits.h" 85 #include "llvm/Analysis/GlobalsModRef.h" 86 #include "llvm/Analysis/LoopAccessAnalysis.h" 87 #include "llvm/Analysis/LoopAnalysisManager.h" 88 #include "llvm/Analysis/LoopInfo.h" 89 #include "llvm/Analysis/LoopIterator.h" 90 #include "llvm/Analysis/OptimizationRemarkEmitter.h" 91 #include "llvm/Analysis/ProfileSummaryInfo.h" 92 #include "llvm/Analysis/ScalarEvolution.h" 93 #include "llvm/Analysis/ScalarEvolutionExpressions.h" 94 #include "llvm/Analysis/TargetLibraryInfo.h" 95 #include "llvm/Analysis/TargetTransformInfo.h" 96 #include "llvm/Analysis/VectorUtils.h" 97 #include "llvm/IR/Attributes.h" 98 #include "llvm/IR/BasicBlock.h" 99 #include "llvm/IR/CFG.h" 100 #include "llvm/IR/Constant.h" 101 #include "llvm/IR/Constants.h" 102 #include "llvm/IR/DataLayout.h" 103 #include "llvm/IR/DebugInfoMetadata.h" 104 #include "llvm/IR/DebugLoc.h" 105 #include "llvm/IR/DerivedTypes.h" 106 #include "llvm/IR/DiagnosticInfo.h" 107 #include "llvm/IR/Dominators.h" 108 #include "llvm/IR/Function.h" 109 #include "llvm/IR/IRBuilder.h" 110 #include "llvm/IR/InstrTypes.h" 111 #include "llvm/IR/Instruction.h" 112 #include "llvm/IR/Instructions.h" 113 #include "llvm/IR/IntrinsicInst.h" 114 #include "llvm/IR/Intrinsics.h" 115 #include "llvm/IR/LLVMContext.h" 116 #include "llvm/IR/Metadata.h" 117 #include "llvm/IR/Module.h" 118 #include "llvm/IR/Operator.h" 119 #include "llvm/IR/PatternMatch.h" 120 #include "llvm/IR/Type.h" 121 #include "llvm/IR/Use.h" 122 #include "llvm/IR/User.h" 123 #include "llvm/IR/Value.h" 124 #include "llvm/IR/ValueHandle.h" 125 #include "llvm/IR/Verifier.h" 126 #include "llvm/InitializePasses.h" 127 #include "llvm/Pass.h" 128 #include "llvm/Support/Casting.h" 129 #include "llvm/Support/CommandLine.h" 130 #include "llvm/Support/Compiler.h" 131 #include "llvm/Support/Debug.h" 132 #include "llvm/Support/ErrorHandling.h" 133 #include "llvm/Support/InstructionCost.h" 134 #include "llvm/Support/MathExtras.h" 135 #include "llvm/Support/raw_ostream.h" 136 #include "llvm/Transforms/Utils/BasicBlockUtils.h" 137 #include "llvm/Transforms/Utils/InjectTLIMappings.h" 138 #include "llvm/Transforms/Utils/LoopSimplify.h" 139 #include "llvm/Transforms/Utils/LoopUtils.h" 140 #include "llvm/Transforms/Utils/LoopVersioning.h" 141 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" 142 #include "llvm/Transforms/Utils/SizeOpts.h" 143 #include 
"llvm/Transforms/Vectorize/LoopVectorizationLegality.h" 144 #include <algorithm> 145 #include <cassert> 146 #include <cstdint> 147 #include <cstdlib> 148 #include <functional> 149 #include <iterator> 150 #include <limits> 151 #include <memory> 152 #include <string> 153 #include <tuple> 154 #include <utility> 155 156 using namespace llvm; 157 158 #define LV_NAME "loop-vectorize" 159 #define DEBUG_TYPE LV_NAME 160 161 #ifndef NDEBUG 162 const char VerboseDebug[] = DEBUG_TYPE "-verbose"; 163 #endif 164 165 /// @{ 166 /// Metadata attribute names 167 const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all"; 168 const char LLVMLoopVectorizeFollowupVectorized[] = 169 "llvm.loop.vectorize.followup_vectorized"; 170 const char LLVMLoopVectorizeFollowupEpilogue[] = 171 "llvm.loop.vectorize.followup_epilogue"; 172 /// @} 173 174 STATISTIC(LoopsVectorized, "Number of loops vectorized"); 175 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization"); 176 STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized"); 177 178 static cl::opt<bool> EnableEpilogueVectorization( 179 "enable-epilogue-vectorization", cl::init(true), cl::Hidden, 180 cl::desc("Enable vectorization of epilogue loops.")); 181 182 static cl::opt<unsigned> EpilogueVectorizationForceVF( 183 "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden, 184 cl::desc("When epilogue vectorization is enabled, and a value greater than " 185 "1 is specified, forces the given VF for all applicable epilogue " 186 "loops.")); 187 188 static cl::opt<unsigned> EpilogueVectorizationMinVF( 189 "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden, 190 cl::desc("Only loops with vectorization factor equal to or larger than " 191 "the specified value are considered for epilogue vectorization.")); 192 193 /// Loops with a known constant trip count below this number are vectorized only 194 /// if no scalar iteration overheads are incurred. 195 static cl::opt<unsigned> TinyTripCountVectorThreshold( 196 "vectorizer-min-trip-count", cl::init(16), cl::Hidden, 197 cl::desc("Loops with a constant trip count that is smaller than this " 198 "value are vectorized only if no scalar iteration overheads " 199 "are incurred.")); 200 201 static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold( 202 "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden, 203 cl::desc("The maximum allowed number of runtime memory checks with a " 204 "vectorize(enable) pragma.")); 205 206 // Option prefer-predicate-over-epilogue indicates that an epilogue is undesired, 207 // that predication is preferred, and this lists all options. I.e., the 208 // vectorizer will try to fold the tail-loop (epilogue) into the vector body 209 // and predicate the instructions accordingly. 
If tail-folding fails, there are
// different fallback strategies depending on these values:
namespace PreferPredicateTy {
enum Option {
  ScalarEpilogue = 0,
  PredicateElseScalarEpilogue,
  PredicateOrDontVectorize
};
} // namespace PreferPredicateTy

static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
    "prefer-predicate-over-epilogue",
    cl::init(PreferPredicateTy::ScalarEpilogue),
    cl::Hidden,
    cl::desc("Tail-folding and predication preferences over creating a scalar "
             "epilogue loop."),
    cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
                          "scalar-epilogue",
                          "Don't tail-predicate loops, create scalar epilogue"),
               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
                          "predicate-else-scalar-epilogue",
                          "prefer tail-folding, create scalar epilogue if tail "
                          "folding fails."),
               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
                          "predicate-dont-vectorize",
                          "prefer tail-folding; don't attempt vectorization if "
                          "tail-folding fails.")));

static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));

static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
             "below this number"));

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

static cl::opt<bool> ForceTargetSupportsScalableVectors(
    "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
    cl::desc(
        "Pretend that scalable vectors are supported, even if the target does "
        "not support them. This flag should only be used for testing."));

static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// Interleave small loops with scalar reductions.
static cl::opt<bool> InterleaveSmallLoopScalarReduction(
    "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
    cl::desc("Enable interleaving for loops with small iteration counts that "
             "contain scalar reductions to expose ILP."));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

static cl::opt<bool>
    PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
                           cl::Hidden,
                           cl::desc("Prefer in-loop vector reductions, "
                                    "overriding the target's preference."));

cl::opt<bool> ForceOrderedReductions(
    "force-ordered-reductions", cl::init(false), cl::Hidden,
    cl::desc("Enable the vectorization of loops with in-order (strict) "
             "FP reductions"));

static cl::opt<bool> PreferPredicatedReductionSelect(
    "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
    cl::desc(
        "Prefer predicating a reduction operation over an after loop select."));

cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));

// FIXME: Remove this switch once we have divergence analysis. Currently we
// assume divergent non-backedge branches when this switch is true.
cl::opt<bool> EnableVPlanPredication(
    "enable-vplan-predication", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path predicator with "
             "support for outer loop vectorization."));

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

cl::opt<bool> PrintVPlansInDotFormat(
    "vplan-print-in-dot-format", cl::init(false), cl::Hidden,
    cl::desc("Use dot format instead of plain text when dumping VPlans"));

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type.
static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
  // Determine if an array of N elements of type Ty is "bitcast compatible"
  // with a <N x Ty> vector.
  // This is only true if there is no padding between the array elements.
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}

/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

/// Returns the "best known" trip count for the specified loop \p L as defined
/// by the following procedure:
///   1) Returns exact trip count if it is known.
///   2) Returns expected trip count according to profile data if any.
///   3) Returns upper bound estimate if it is known.
///   4) Returns None if all of the above failed.
static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
  // Check if exact trip count is known.
  if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
    return ExpectedTC;

  // Check if there is an expected trip count available from profile data.
  if (LoopVectorizeWithBlockFrequency)
    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
      return EstimatedTC;

  // Check if upper bound estimate is known.
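  // (Illustrative note: even when the exact trip count is unknown, SCEV may
  // still know a constant upper bound, e.g. one implied by the bit width or
  // known range of the loop's induction variable.)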
  if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
    return ExpectedTC;

  return None;
}

// Forward declare GeneratedRTChecks.
class GeneratedRTChecks;

namespace llvm {

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found for a given vectorization factor.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                      ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
        PSI(PSI), RTChecks(RTChecks) {
    // Query this against the original loop and save it here because the profile
    // of the original loop header may change as the transformation happens.
    OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
        OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
  }

  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop that will contain vectorized instructions later
  /// on, while the old loop will be used as the scalar remainder. Control flow
  /// is generated around the vectorized (and scalar epilogue) loops consisting
  /// of various checks and bypasses. Return the pre-header block of the new
  /// loop.
  /// In the case of epilogue vectorization, this function is overridden to
  /// handle the more complex control flow around the loops.
  virtual BasicBlock *createVectorizedLoopSkeleton();

  /// Widen a single instruction within the innermost loop.
  void widenInstruction(Instruction &I, VPValue *Def, VPUser &Operands,
                        VPTransformState &State);

  /// Widen a single call instruction within the innermost loop.
  void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
                            VPTransformState &State);

  /// Widen a single select instruction within the innermost loop.
  void widenSelectInstruction(SelectInst &I, VPValue *VPDef, VPUser &Operands,
                              bool InvariantCond, VPTransformState &State);

  /// Fix the vectorized code, taking care of header phis, live-outs, and more.
  void fixVectorizedLoop(VPTransformState &State);

  // Return true if any runtime check is added.
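  // (Runtime checks here means the SCEV predicate checks and memory runtime
  // checks emitted by emitSCEVChecks and emitMemRuntimeChecks below.)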
492 bool areSafetyChecksAdded() { return AddedSafetyChecks; } 493 494 /// A type for vectorized values in the new loop. Each value from the 495 /// original loop, when vectorized, is represented by UF vector values in the 496 /// new unrolled loop, where UF is the unroll factor. 497 using VectorParts = SmallVector<Value *, 2>; 498 499 /// Vectorize a single GetElementPtrInst based on information gathered and 500 /// decisions taken during planning. 501 void widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, VPUser &Indices, 502 unsigned UF, ElementCount VF, bool IsPtrLoopInvariant, 503 SmallBitVector &IsIndexLoopInvariant, VPTransformState &State); 504 505 /// Vectorize a single first-order recurrence or pointer induction PHINode in 506 /// a block. This method handles the induction variable canonicalization. It 507 /// supports both VF = 1 for unrolled loops and arbitrary length vectors. 508 void widenPHIInstruction(Instruction *PN, VPWidenPHIRecipe *PhiR, 509 VPTransformState &State); 510 511 /// A helper function to scalarize a single Instruction in the innermost loop. 512 /// Generates a sequence of scalar instances for each lane between \p MinLane 513 /// and \p MaxLane, times each part between \p MinPart and \p MaxPart, 514 /// inclusive. Uses the VPValue operands from \p Operands instead of \p 515 /// Instr's operands. 516 void scalarizeInstruction(Instruction *Instr, VPValue *Def, VPUser &Operands, 517 const VPIteration &Instance, bool IfPredicateInstr, 518 VPTransformState &State); 519 520 /// Widen an integer or floating-point induction variable \p IV. If \p Trunc 521 /// is provided, the integer induction variable will first be truncated to 522 /// the corresponding type. 523 void widenIntOrFpInduction(PHINode *IV, Value *Start, TruncInst *Trunc, 524 VPValue *Def, VPValue *CastDef, 525 VPTransformState &State); 526 527 /// Construct the vector value of a scalarized value \p V one lane at a time. 528 void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance, 529 VPTransformState &State); 530 531 /// Try to vectorize interleaved access group \p Group with the base address 532 /// given in \p Addr, optionally masking the vector operations if \p 533 /// BlockInMask is non-null. Use \p State to translate given VPValues to IR 534 /// values in the vectorized loop. 535 void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group, 536 ArrayRef<VPValue *> VPDefs, 537 VPTransformState &State, VPValue *Addr, 538 ArrayRef<VPValue *> StoredValues, 539 VPValue *BlockInMask = nullptr); 540 541 /// Vectorize Load and Store instructions with the base address given in \p 542 /// Addr, optionally masking the vector operations if \p BlockInMask is 543 /// non-null. Use \p State to translate given VPValues to IR values in the 544 /// vectorized loop. 545 void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State, 546 VPValue *Def, VPValue *Addr, 547 VPValue *StoredValue, VPValue *BlockInMask); 548 549 /// Set the debug location in the builder \p Ptr using the debug location in 550 /// \p V. If \p Ptr is None then it uses the class member's Builder. 551 void setDebugLocFromInst(const Value *V, 552 Optional<IRBuilder<> *> CustomBuilder = None); 553 554 /// Fix the non-induction PHIs in the OrigPHIsToFix vector. 555 void fixNonInductionPHIs(VPTransformState &State); 556 557 /// Returns true if the reordering of FP operations is not allowed, but we are 558 /// able to vectorize with strict in-order reductions for the given RdxDesc. 
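  /// (Illustrative example: a floating-point sum `s += a[i]` compiled without
  /// reassociation must keep its original evaluation order, so it can only be
  /// vectorized as a strict in-order reduction.)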
559 bool useOrderedReductions(RecurrenceDescriptor &RdxDesc); 560 561 /// Create a broadcast instruction. This method generates a broadcast 562 /// instruction (shuffle) for loop invariant values and for the induction 563 /// value. If this is the induction variable then we extend it to N, N+1, ... 564 /// this is needed because each iteration in the loop corresponds to a SIMD 565 /// element. 566 virtual Value *getBroadcastInstrs(Value *V); 567 568 protected: 569 friend class LoopVectorizationPlanner; 570 571 /// A small list of PHINodes. 572 using PhiVector = SmallVector<PHINode *, 4>; 573 574 /// A type for scalarized values in the new loop. Each value from the 575 /// original loop, when scalarized, is represented by UF x VF scalar values 576 /// in the new unrolled loop, where UF is the unroll factor and VF is the 577 /// vectorization factor. 578 using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>; 579 580 /// Set up the values of the IVs correctly when exiting the vector loop. 581 void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II, 582 Value *CountRoundDown, Value *EndValue, 583 BasicBlock *MiddleBlock); 584 585 /// Create a new induction variable inside L. 586 PHINode *createInductionVariable(Loop *L, Value *Start, Value *End, 587 Value *Step, Instruction *DL); 588 589 /// Handle all cross-iteration phis in the header. 590 void fixCrossIterationPHIs(VPTransformState &State); 591 592 /// Create the exit value of first order recurrences in the middle block and 593 /// update their users. 594 void fixFirstOrderRecurrence(VPWidenPHIRecipe *PhiR, VPTransformState &State); 595 596 /// Create code for the loop exit value of the reduction. 597 void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State); 598 599 /// Clear NSW/NUW flags from reduction instructions if necessary. 600 void clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc, 601 VPTransformState &State); 602 603 /// Fixup the LCSSA phi nodes in the unique exit block. This simply 604 /// means we need to add the appropriate incoming value from the middle 605 /// block as exiting edges from the scalar epilogue loop (if present) are 606 /// already in place, and we exit the vector loop exclusively to the middle 607 /// block. 608 void fixLCSSAPHIs(VPTransformState &State); 609 610 /// Iteratively sink the scalarized operands of a predicated instruction into 611 /// the block that was created for it. 612 void sinkScalarOperands(Instruction *PredInst); 613 614 /// Shrinks vector element sizes to the smallest bitwidth they can be legally 615 /// represented as. 616 void truncateToMinimalBitwidths(VPTransformState &State); 617 618 /// This function adds 619 /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...) 620 /// to each vector element of Val. The sequence starts at StartIndex. 621 /// \p Opcode is relevant for FP induction variable. 622 virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step, 623 Instruction::BinaryOps Opcode = 624 Instruction::BinaryOpsEnd); 625 626 /// Compute scalar induction steps. \p ScalarIV is the scalar induction 627 /// variable on which to base the steps, \p Step is the size of the step, and 628 /// \p EntryVal is the value from the original loop that maps to the steps. 629 /// Note that \p EntryVal doesn't have to be an induction variable - it 630 /// can also be a truncate instruction. 
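  /// (Illustrative example: for a scalar IV `i`, Step = 2 and VF = 4, part 0
  /// produces the per-lane values i, i+2, i+4 and i+6.)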
631 void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal, 632 const InductionDescriptor &ID, VPValue *Def, 633 VPValue *CastDef, VPTransformState &State); 634 635 /// Create a vector induction phi node based on an existing scalar one. \p 636 /// EntryVal is the value from the original loop that maps to the vector phi 637 /// node, and \p Step is the loop-invariant step. If \p EntryVal is a 638 /// truncate instruction, instead of widening the original IV, we widen a 639 /// version of the IV truncated to \p EntryVal's type. 640 void createVectorIntOrFpInductionPHI(const InductionDescriptor &II, 641 Value *Step, Value *Start, 642 Instruction *EntryVal, VPValue *Def, 643 VPValue *CastDef, 644 VPTransformState &State); 645 646 /// Returns true if an instruction \p I should be scalarized instead of 647 /// vectorized for the chosen vectorization factor. 648 bool shouldScalarizeInstruction(Instruction *I) const; 649 650 /// Returns true if we should generate a scalar version of \p IV. 651 bool needsScalarInduction(Instruction *IV) const; 652 653 /// If there is a cast involved in the induction variable \p ID, which should 654 /// be ignored in the vectorized loop body, this function records the 655 /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the 656 /// cast. We had already proved that the casted Phi is equal to the uncasted 657 /// Phi in the vectorized loop (under a runtime guard), and therefore 658 /// there is no need to vectorize the cast - the same value can be used in the 659 /// vector loop for both the Phi and the cast. 660 /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified, 661 /// Otherwise, \p VectorLoopValue is a widened/vectorized value. 662 /// 663 /// \p EntryVal is the value from the original loop that maps to the vector 664 /// phi node and is used to distinguish what is the IV currently being 665 /// processed - original one (if \p EntryVal is a phi corresponding to the 666 /// original IV) or the "newly-created" one based on the proof mentioned above 667 /// (see also buildScalarSteps() and createVectorIntOrFPInductionPHI()). In the 668 /// latter case \p EntryVal is a TruncInst and we must not record anything for 669 /// that IV, but it's error-prone to expect callers of this routine to care 670 /// about that, hence this explicit parameter. 671 void recordVectorLoopValueForInductionCast( 672 const InductionDescriptor &ID, const Instruction *EntryVal, 673 Value *VectorLoopValue, VPValue *CastDef, VPTransformState &State, 674 unsigned Part, unsigned Lane = UINT_MAX); 675 676 /// Generate a shuffle sequence that will reverse the vector Vec. 677 virtual Value *reverseVector(Value *Vec); 678 679 /// Returns (and creates if needed) the original loop trip count. 680 Value *getOrCreateTripCount(Loop *NewLoop); 681 682 /// Returns (and creates if needed) the trip count of the widened loop. 683 Value *getOrCreateVectorTripCount(Loop *NewLoop); 684 685 /// Returns a bitcasted value to the requested vector type. 686 /// Also handles bitcasts of vector<float> <-> vector<pointer> types. 687 Value *createBitOrPointerCast(Value *V, VectorType *DstVTy, 688 const DataLayout &DL); 689 690 /// Emit a bypass check to see if the vector trip count is zero, including if 691 /// it overflows. 692 void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass); 693 694 /// Emit a bypass check to see if all of the SCEV assumptions we've 695 /// had to make are correct. 
Returns the block containing the checks or 696 /// nullptr if no checks have been added. 697 BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass); 698 699 /// Emit bypass checks to check any memory assumptions we may have made. 700 /// Returns the block containing the checks or nullptr if no checks have been 701 /// added. 702 BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass); 703 704 /// Compute the transformed value of Index at offset StartValue using step 705 /// StepValue. 706 /// For integer induction, returns StartValue + Index * StepValue. 707 /// For pointer induction, returns StartValue[Index * StepValue]. 708 /// FIXME: The newly created binary instructions should contain nsw/nuw 709 /// flags, which can be found from the original scalar operations. 710 Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE, 711 const DataLayout &DL, 712 const InductionDescriptor &ID) const; 713 714 /// Emit basic blocks (prefixed with \p Prefix) for the iteration check, 715 /// vector loop preheader, middle block and scalar preheader. Also 716 /// allocate a loop object for the new vector loop and return it. 717 Loop *createVectorLoopSkeleton(StringRef Prefix); 718 719 /// Create new phi nodes for the induction variables to resume iteration count 720 /// in the scalar epilogue, from where the vectorized loop left off (given by 721 /// \p VectorTripCount). 722 /// In cases where the loop skeleton is more complicated (eg. epilogue 723 /// vectorization) and the resume values can come from an additional bypass 724 /// block, the \p AdditionalBypass pair provides information about the bypass 725 /// block and the end value on the edge from bypass to this loop. 726 void createInductionResumeValues( 727 Loop *L, Value *VectorTripCount, 728 std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr}); 729 730 /// Complete the loop skeleton by adding debug MDs, creating appropriate 731 /// conditional branches in the middle block, preparing the builder and 732 /// running the verifier. Take in the vector loop \p L as argument, and return 733 /// the preheader of the completed vector loop. 734 BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID); 735 736 /// Add additional metadata to \p To that was not present on \p Orig. 737 /// 738 /// Currently this is used to add the noalias annotations based on the 739 /// inserted memchecks. Use this for instructions that are *cloned* into the 740 /// vector loop. 741 void addNewMetadata(Instruction *To, const Instruction *Orig); 742 743 /// Add metadata from one instruction to another. 744 /// 745 /// This includes both the original MDs from \p From and additional ones (\see 746 /// addNewMetadata). Use this for *newly created* instructions in the vector 747 /// loop. 748 void addMetadata(Instruction *To, Instruction *From); 749 750 /// Similar to the previous function but it adds the metadata to a 751 /// vector of instructions. 752 void addMetadata(ArrayRef<Value *> To, Instruction *From); 753 754 /// Allow subclasses to override and print debug traces before/after vplan 755 /// execution, when trace information is requested. 756 virtual void printDebugTracesAtStart(){}; 757 virtual void printDebugTracesAtEnd(){}; 758 759 /// The original loop. 760 Loop *OrigLoop; 761 762 /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies 763 /// dynamic knowledge to simplify SCEV expressions and converts them to a 764 /// more usable form. 
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Alias Analysis.
  AAResults *AA;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// LoopVersioning. It's only set up (non-null) if memchecks were
  /// used.
  ///
  /// This is currently only used to add no-alias metadata based on the
  /// memchecks. The actual versioning is performed manually.
  std::unique_ptr<LoopVersioning> LVer;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  ElementCount VF;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use.
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;

  /// The unique ExitBlock of the scalar loop if one exists. Note that
  /// there can be multiple exiting edges reaching this block.
  BasicBlock *LoopExitBlock;

  /// The vector loop body.
  BasicBlock *LoopVectorBody;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// The new induction variable which was added to the new block.
  PHINode *Induction = nullptr;

  /// The induction variable of the old basic block.
  PHINode *OldInduction = nullptr;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF)).
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
  // fixed up at the end of vector code generation.
  SmallVector<PHINode *, 8> OrigPHIsToFix;

  /// BFI and PSI are used to check for profile-guided size optimizations.
  BlockFrequencyInfo *BFI;
  ProfileSummaryInfo *PSI;

  // Whether this loop should be optimized for size based on profile-guided
  // size optimizations.
  bool OptForSizeBasedOnProfile;

  /// Structure to hold information about generated runtime checks, responsible
  /// for cleaning the checks, if vectorization turns out unprofitable.
872 GeneratedRTChecks &RTChecks; 873 }; 874 875 class InnerLoopUnroller : public InnerLoopVectorizer { 876 public: 877 InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE, 878 LoopInfo *LI, DominatorTree *DT, 879 const TargetLibraryInfo *TLI, 880 const TargetTransformInfo *TTI, AssumptionCache *AC, 881 OptimizationRemarkEmitter *ORE, unsigned UnrollFactor, 882 LoopVectorizationLegality *LVL, 883 LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, 884 ProfileSummaryInfo *PSI, GeneratedRTChecks &Check) 885 : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 886 ElementCount::getFixed(1), UnrollFactor, LVL, CM, 887 BFI, PSI, Check) {} 888 889 private: 890 Value *getBroadcastInstrs(Value *V) override; 891 Value *getStepVector(Value *Val, int StartIdx, Value *Step, 892 Instruction::BinaryOps Opcode = 893 Instruction::BinaryOpsEnd) override; 894 Value *reverseVector(Value *Vec) override; 895 }; 896 897 /// Encapsulate information regarding vectorization of a loop and its epilogue. 898 /// This information is meant to be updated and used across two stages of 899 /// epilogue vectorization. 900 struct EpilogueLoopVectorizationInfo { 901 ElementCount MainLoopVF = ElementCount::getFixed(0); 902 unsigned MainLoopUF = 0; 903 ElementCount EpilogueVF = ElementCount::getFixed(0); 904 unsigned EpilogueUF = 0; 905 BasicBlock *MainLoopIterationCountCheck = nullptr; 906 BasicBlock *EpilogueIterationCountCheck = nullptr; 907 BasicBlock *SCEVSafetyCheck = nullptr; 908 BasicBlock *MemSafetyCheck = nullptr; 909 Value *TripCount = nullptr; 910 Value *VectorTripCount = nullptr; 911 912 EpilogueLoopVectorizationInfo(unsigned MVF, unsigned MUF, unsigned EVF, 913 unsigned EUF) 914 : MainLoopVF(ElementCount::getFixed(MVF)), MainLoopUF(MUF), 915 EpilogueVF(ElementCount::getFixed(EVF)), EpilogueUF(EUF) { 916 assert(EUF == 1 && 917 "A high UF for the epilogue loop is likely not beneficial."); 918 } 919 }; 920 921 /// An extension of the inner loop vectorizer that creates a skeleton for a 922 /// vectorized loop that has its epilogue (residual) also vectorized. 923 /// The idea is to run the vplan on a given loop twice, firstly to setup the 924 /// skeleton and vectorize the main loop, and secondly to complete the skeleton 925 /// from the first step and vectorize the epilogue. This is achieved by 926 /// deriving two concrete strategy classes from this base class and invoking 927 /// them in succession from the loop vectorizer planner. 928 class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer { 929 public: 930 InnerLoopAndEpilogueVectorizer( 931 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, 932 DominatorTree *DT, const TargetLibraryInfo *TLI, 933 const TargetTransformInfo *TTI, AssumptionCache *AC, 934 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, 935 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, 936 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, 937 GeneratedRTChecks &Checks) 938 : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 939 EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI, 940 Checks), 941 EPI(EPI) {} 942 943 // Override this function to handle the more complex control flow around the 944 // three loops. 
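// (The three loops are the main vector loop, the epilogue vector loop and the
// remaining scalar epilogue loop.)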
945 BasicBlock *createVectorizedLoopSkeleton() final override { 946 return createEpilogueVectorizedLoopSkeleton(); 947 } 948 949 /// The interface for creating a vectorized skeleton using one of two 950 /// different strategies, each corresponding to one execution of the vplan 951 /// as described above. 952 virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0; 953 954 /// Holds and updates state information required to vectorize the main loop 955 /// and its epilogue in two separate passes. This setup helps us avoid 956 /// regenerating and recomputing runtime safety checks. It also helps us to 957 /// shorten the iteration-count-check path length for the cases where the 958 /// iteration count of the loop is so small that the main vector loop is 959 /// completely skipped. 960 EpilogueLoopVectorizationInfo &EPI; 961 }; 962 963 /// A specialized derived class of inner loop vectorizer that performs 964 /// vectorization of *main* loops in the process of vectorizing loops and their 965 /// epilogues. 966 class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer { 967 public: 968 EpilogueVectorizerMainLoop( 969 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, 970 DominatorTree *DT, const TargetLibraryInfo *TLI, 971 const TargetTransformInfo *TTI, AssumptionCache *AC, 972 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, 973 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, 974 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, 975 GeneratedRTChecks &Check) 976 : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 977 EPI, LVL, CM, BFI, PSI, Check) {} 978 /// Implements the interface for creating a vectorized skeleton using the 979 /// *main loop* strategy (ie the first pass of vplan execution). 980 BasicBlock *createEpilogueVectorizedLoopSkeleton() final override; 981 982 protected: 983 /// Emits an iteration count bypass check once for the main loop (when \p 984 /// ForEpilogue is false) and once for the epilogue loop (when \p 985 /// ForEpilogue is true). 986 BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass, 987 bool ForEpilogue); 988 void printDebugTracesAtStart() override; 989 void printDebugTracesAtEnd() override; 990 }; 991 992 // A specialized derived class of inner loop vectorizer that performs 993 // vectorization of *epilogue* loops in the process of vectorizing loops and 994 // their epilogues. 995 class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer { 996 public: 997 EpilogueVectorizerEpilogueLoop( 998 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, 999 DominatorTree *DT, const TargetLibraryInfo *TLI, 1000 const TargetTransformInfo *TTI, AssumptionCache *AC, 1001 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, 1002 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, 1003 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, 1004 GeneratedRTChecks &Checks) 1005 : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1006 EPI, LVL, CM, BFI, PSI, Checks) {} 1007 /// Implements the interface for creating a vectorized skeleton using the 1008 /// *epilogue loop* strategy (ie the second pass of vplan execution). 
  BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check after the main vector loop has
  /// finished to see if there are any iterations left to execute by either
  /// the vector epilogue or the scalar epilogue.
  BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L,
                                                      BasicBlock *Bypass,
                                                      BasicBlock *Insert);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};
} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its
/// operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return I;

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I;

  for (Use &Op : I->operands()) {
    if (Instruction *OpInst = dyn_cast<Instruction>(Op))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }

  return I;
}

void InnerLoopVectorizer::setDebugLocFromInst(
    const Value *V, Optional<IRBuilder<> *> CustomBuilder) {
  IRBuilder<> *B = (CustomBuilder == None) ? &Builder : *CustomBuilder;
  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(V)) {
    const DILocation *DIL = Inst->getDebugLoc();

    // When a FSDiscriminator is enabled, we don't need to add the multiply
    // factors to the discriminators.
    if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
        !isa<DbgInfoIntrinsic>(Inst) && !EnableFSDiscriminator) {
      // FIXME: For scalable vectors, assume vscale=1.
      auto NewDIL =
          DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
      if (NewDIL)
        B->SetCurrentDebugLocation(NewDIL.getValue());
      else
        LLVM_DEBUG(dbgs()
                   << "Failed to create new discriminator: "
                   << DIL->getFilename() << " Line: " << DIL->getLine());
    } else
      B->SetCurrentDebugLocation(DIL);
  } else
    B->SetCurrentDebugLocation(DebugLoc());
}

/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
/// is passed, the message relates to that particular instruction.
#ifndef NDEBUG
static void debugVectorizationMessage(const StringRef Prefix,
                                      const StringRef DebugMsg,
                                      Instruction *I) {
  dbgs() << "LV: " << Prefix << DebugMsg;
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}
#endif

/// Create an analysis remark that explains why vectorization failed.
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
/// RemarkName is the identifier for the remark. If \p I is passed it is an
/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
/// the location of the remark. \return the remark object that can be
/// streamed to.
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
                                                   StringRef RemarkName,
                                                   Loop *TheLoop,
                                                   Instruction *I) {
  Value *CodeRegion = TheLoop->getHeader();
  DebugLoc DL = TheLoop->getStartLoc();

  if (I) {
    CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, fall back to
    // using the loop's.
    if (I->getDebugLoc())
      DL = I->getDebugLoc();
  }

  return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
}

/// Return a value for Step multiplied by VF.
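/// (Illustrative example: for Step = 2 and a fixed VF of 4 this returns the
/// constant 8; for a scalable VF of <vscale x 4> it returns 8 * vscale.)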
1106 static Value *createStepForVF(IRBuilder<> &B, Constant *Step, ElementCount VF) { 1107 assert(isa<ConstantInt>(Step) && "Expected an integer step"); 1108 Constant *StepVal = ConstantInt::get( 1109 Step->getType(), 1110 cast<ConstantInt>(Step)->getSExtValue() * VF.getKnownMinValue()); 1111 return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal; 1112 } 1113 1114 namespace llvm { 1115 1116 /// Return the runtime value for VF. 1117 Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF) { 1118 Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue()); 1119 return VF.isScalable() ? B.CreateVScale(EC) : EC; 1120 } 1121 1122 void reportVectorizationFailure(const StringRef DebugMsg, 1123 const StringRef OREMsg, const StringRef ORETag, 1124 OptimizationRemarkEmitter *ORE, Loop *TheLoop, 1125 Instruction *I) { 1126 LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I)); 1127 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE); 1128 ORE->emit( 1129 createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I) 1130 << "loop not vectorized: " << OREMsg); 1131 } 1132 1133 void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag, 1134 OptimizationRemarkEmitter *ORE, Loop *TheLoop, 1135 Instruction *I) { 1136 LLVM_DEBUG(debugVectorizationMessage("", Msg, I)); 1137 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE); 1138 ORE->emit( 1139 createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I) 1140 << Msg); 1141 } 1142 1143 } // end namespace llvm 1144 1145 #ifndef NDEBUG 1146 /// \return string containing a file name and a line # for the given loop. 1147 static std::string getDebugLocString(const Loop *L) { 1148 std::string Result; 1149 if (L) { 1150 raw_string_ostream OS(Result); 1151 if (const DebugLoc LoopDbgLoc = L->getStartLoc()) 1152 LoopDbgLoc.print(OS); 1153 else 1154 // Just print the module name. 1155 OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier(); 1156 OS.flush(); 1157 } 1158 return Result; 1159 } 1160 #endif 1161 1162 void InnerLoopVectorizer::addNewMetadata(Instruction *To, 1163 const Instruction *Orig) { 1164 // If the loop was versioned with memchecks, add the corresponding no-alias 1165 // metadata. 1166 if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig))) 1167 LVer->annotateInstWithNoAlias(To, Orig); 1168 } 1169 1170 void InnerLoopVectorizer::addMetadata(Instruction *To, 1171 Instruction *From) { 1172 propagateMetadata(To, From); 1173 addNewMetadata(To, From); 1174 } 1175 1176 void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To, 1177 Instruction *From) { 1178 for (Value *V : To) { 1179 if (Instruction *I = dyn_cast<Instruction>(V)) 1180 addMetadata(I, From); 1181 } 1182 } 1183 1184 namespace llvm { 1185 1186 // Loop vectorization cost-model hints how the scalar epilogue loop should be 1187 // lowered. 1188 enum ScalarEpilogueLowering { 1189 1190 // The default: allowing scalar epilogues. 1191 CM_ScalarEpilogueAllowed, 1192 1193 // Vectorization with OptForSize: don't allow epilogues. 1194 CM_ScalarEpilogueNotAllowedOptSize, 1195 1196 // A special case of vectorisation with OptForSize: loops with a very small 1197 // trip count are considered for vectorization under OptForSize, thereby 1198 // making sure the cost of their loop body is dominant, free of runtime 1199 // guards and scalar iteration overheads. 1200 CM_ScalarEpilogueNotAllowedLowTripLoop, 1201 1202 // Loop hint predicate indicating an epilogue is undesired. 
1203 CM_ScalarEpilogueNotNeededUsePredicate, 1204 1205 // Directive indicating we must either tail fold or not vectorize 1206 CM_ScalarEpilogueNotAllowedUsePredicate 1207 }; 1208 1209 /// ElementCountComparator creates a total ordering for ElementCount 1210 /// for the purposes of using it in a set structure. 1211 struct ElementCountComparator { 1212 bool operator()(const ElementCount &LHS, const ElementCount &RHS) const { 1213 return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) < 1214 std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue()); 1215 } 1216 }; 1217 using ElementCountSet = SmallSet<ElementCount, 16, ElementCountComparator>; 1218 1219 /// LoopVectorizationCostModel - estimates the expected speedups due to 1220 /// vectorization. 1221 /// In many cases vectorization is not profitable. This can happen because of 1222 /// a number of reasons. In this class we mainly attempt to predict the 1223 /// expected speedup/slowdowns due to the supported instruction set. We use the 1224 /// TargetTransformInfo to query the different backends for the cost of 1225 /// different operations. 1226 class LoopVectorizationCostModel { 1227 public: 1228 LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L, 1229 PredicatedScalarEvolution &PSE, LoopInfo *LI, 1230 LoopVectorizationLegality *Legal, 1231 const TargetTransformInfo &TTI, 1232 const TargetLibraryInfo *TLI, DemandedBits *DB, 1233 AssumptionCache *AC, 1234 OptimizationRemarkEmitter *ORE, const Function *F, 1235 const LoopVectorizeHints *Hints, 1236 InterleavedAccessInfo &IAI) 1237 : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), 1238 TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F), 1239 Hints(Hints), InterleaveInfo(IAI) {} 1240 1241 /// \return An upper bound for the vectorization factors (both fixed and 1242 /// scalable). If the factors are 0, vectorization and interleaving should be 1243 /// avoided up front. 1244 FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC); 1245 1246 /// \return True if runtime checks are required for vectorization, and false 1247 /// otherwise. 1248 bool runtimeChecksRequired(); 1249 1250 /// \return The most profitable vectorization factor and the cost of that VF. 1251 /// This method checks every VF in \p CandidateVFs. If UserVF is not ZERO 1252 /// then this vectorization factor will be selected if vectorization is 1253 /// possible. 1254 VectorizationFactor 1255 selectVectorizationFactor(const ElementCountSet &CandidateVFs); 1256 1257 VectorizationFactor 1258 selectEpilogueVectorizationFactor(const ElementCount MaxVF, 1259 const LoopVectorizationPlanner &LVP); 1260 1261 /// Setup cost-based decisions for user vectorization factor. 1262 /// \return true if the UserVF is a feasible VF to be chosen. 1263 bool selectUserVectorizationFactor(ElementCount UserVF) { 1264 collectUniformsAndScalars(UserVF); 1265 collectInstsToScalarize(UserVF); 1266 return expectedCost(UserVF).first.isValid(); 1267 } 1268 1269 /// \return The size (in bits) of the smallest and widest types in the code 1270 /// that needs to be vectorized. We ignore values that remain scalar such as 1271 /// 64 bit loop indices. 1272 std::pair<unsigned, unsigned> getSmallestAndWidestTypes(); 1273 1274 /// \return The desired interleave count. 1275 /// If interleave count has been specified by metadata it will be returned. 1276 /// Otherwise, the interleave count is computed and returned. 
VF and LoopCost 1277 /// are the selected vectorization factor and the cost of the selected VF. 1278 unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost); 1279 1280 /// Memory access instruction may be vectorized in more than one way. 1281 /// Form of instruction after vectorization depends on cost. 1282 /// This function takes cost-based decisions for Load/Store instructions 1283 /// and collects them in a map. This decisions map is used for building 1284 /// the lists of loop-uniform and loop-scalar instructions. 1285 /// The calculated cost is saved with widening decision in order to 1286 /// avoid redundant calculations. 1287 void setCostBasedWideningDecision(ElementCount VF); 1288 1289 /// A struct that represents some properties of the register usage 1290 /// of a loop. 1291 struct RegisterUsage { 1292 /// Holds the number of loop invariant values that are used in the loop. 1293 /// The key is ClassID of target-provided register class. 1294 SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs; 1295 /// Holds the maximum number of concurrent live intervals in the loop. 1296 /// The key is ClassID of target-provided register class. 1297 SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers; 1298 }; 1299 1300 /// \return Returns information about the register usages of the loop for the 1301 /// given vectorization factors. 1302 SmallVector<RegisterUsage, 8> 1303 calculateRegisterUsage(ArrayRef<ElementCount> VFs); 1304 1305 /// Collect values we want to ignore in the cost model. 1306 void collectValuesToIgnore(); 1307 1308 /// Collect all element types in the loop for which widening is needed. 1309 void collectElementTypesForWidening(); 1310 1311 /// Split reductions into those that happen in the loop, and those that happen 1312 /// outside. In loop reductions are collected into InLoopReductionChains. 1313 void collectInLoopReductions(); 1314 1315 /// Returns true if we should use strict in-order reductions for the given 1316 /// RdxDesc. This is true if the -enable-strict-reductions flag is passed, 1317 /// the IsOrdered flag of RdxDesc is set and we do not allow reordering 1318 /// of FP operations. 1319 bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) { 1320 return ForceOrderedReductions && !Hints->allowReordering() && 1321 RdxDesc.isOrdered(); 1322 } 1323 1324 /// \returns The smallest bitwidth each instruction can be represented with. 1325 /// The vector equivalents of these instructions should be truncated to this 1326 /// type. 1327 const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const { 1328 return MinBWs; 1329 } 1330 1331 /// \returns True if it is more profitable to scalarize instruction \p I for 1332 /// vectorization factor \p VF. 1333 bool isProfitableToScalarize(Instruction *I, ElementCount VF) const { 1334 assert(VF.isVector() && 1335 "Profitable to scalarize relevant only for VF > 1."); 1336 1337 // Cost model is not run in the VPlan-native path - return conservative 1338 // result until this changes. 1339 if (EnableVPlanNativePath) 1340 return false; 1341 1342 auto Scalars = InstsToScalarize.find(VF); 1343 assert(Scalars != InstsToScalarize.end() && 1344 "VF not yet analyzed for scalarization profitability"); 1345 return Scalars->second.find(I) != Scalars->second.end(); 1346 } 1347 1348 /// Returns true if \p I is known to be uniform after vectorization. 
1349 bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const { 1350 if (VF.isScalar()) 1351 return true; 1352 1353 // Cost model is not run in the VPlan-native path - return conservative 1354 // result until this changes. 1355 if (EnableVPlanNativePath) 1356 return false; 1357 1358 auto UniformsPerVF = Uniforms.find(VF); 1359 assert(UniformsPerVF != Uniforms.end() && 1360 "VF not yet analyzed for uniformity"); 1361 return UniformsPerVF->second.count(I); 1362 } 1363 1364 /// Returns true if \p I is known to be scalar after vectorization. 1365 bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const { 1366 if (VF.isScalar()) 1367 return true; 1368 1369 // Cost model is not run in the VPlan-native path - return conservative 1370 // result until this changes. 1371 if (EnableVPlanNativePath) 1372 return false; 1373 1374 auto ScalarsPerVF = Scalars.find(VF); 1375 assert(ScalarsPerVF != Scalars.end() && 1376 "Scalar values are not calculated for VF"); 1377 return ScalarsPerVF->second.count(I); 1378 } 1379 1380 /// \returns True if instruction \p I can be truncated to a smaller bitwidth 1381 /// for vectorization factor \p VF. 1382 bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const { 1383 return VF.isVector() && MinBWs.find(I) != MinBWs.end() && 1384 !isProfitableToScalarize(I, VF) && 1385 !isScalarAfterVectorization(I, VF); 1386 } 1387 1388 /// Decision that was taken during cost calculation for memory instruction. 1389 enum InstWidening { 1390 CM_Unknown, 1391 CM_Widen, // For consecutive accesses with stride +1. 1392 CM_Widen_Reverse, // For consecutive accesses with stride -1. 1393 CM_Interleave, 1394 CM_GatherScatter, 1395 CM_Scalarize 1396 }; 1397 1398 /// Save vectorization decision \p W and \p Cost taken by the cost model for 1399 /// instruction \p I and vector width \p VF. 1400 void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W, 1401 InstructionCost Cost) { 1402 assert(VF.isVector() && "Expected VF >=2"); 1403 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost); 1404 } 1405 1406 /// Save vectorization decision \p W and \p Cost taken by the cost model for 1407 /// interleaving group \p Grp and vector width \p VF. 1408 void setWideningDecision(const InterleaveGroup<Instruction> *Grp, 1409 ElementCount VF, InstWidening W, 1410 InstructionCost Cost) { 1411 assert(VF.isVector() && "Expected VF >=2"); 1412 /// Broadcast this decicion to all instructions inside the group. 1413 /// But the cost will be assigned to one instruction only. 1414 for (unsigned i = 0; i < Grp->getFactor(); ++i) { 1415 if (auto *I = Grp->getMember(i)) { 1416 if (Grp->getInsertPos() == I) 1417 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost); 1418 else 1419 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0); 1420 } 1421 } 1422 } 1423 1424 /// Return the cost model decision for the given instruction \p I and vector 1425 /// width \p VF. Return CM_Unknown if this instruction did not pass 1426 /// through the cost modeling. 1427 InstWidening getWideningDecision(Instruction *I, ElementCount VF) const { 1428 assert(VF.isVector() && "Expected VF to be a vector VF"); 1429 // Cost model is not run in the VPlan-native path - return conservative 1430 // result until this changes. 
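    // Illustrative example (placeholder names, not actual code in this file):
    // decisions are keyed on the (instruction, VF) pair, so the same memory
    // access may be lowered differently for different candidate VFs, e.g.
    //   InstWidening W4 = CM.getWideningDecision(Store, ElementCount::getFixed(4));
    //   InstWidening W8 = CM.getWideningDecision(Store, ElementCount::getFixed(8));
    // where W4 may be CM_Widen while W8 is CM_Scalarize. In the VPlan-native
    // path handled below, CM_GatherScatter is the conservative answer because
    // it makes no assumption about the access pattern.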
1431 if (EnableVPlanNativePath) 1432 return CM_GatherScatter; 1433 1434 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF); 1435 auto Itr = WideningDecisions.find(InstOnVF); 1436 if (Itr == WideningDecisions.end()) 1437 return CM_Unknown; 1438 return Itr->second.first; 1439 } 1440 1441 /// Return the vectorization cost for the given instruction \p I and vector 1442 /// width \p VF. 1443 InstructionCost getWideningCost(Instruction *I, ElementCount VF) { 1444 assert(VF.isVector() && "Expected VF >=2"); 1445 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF); 1446 assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() && 1447 "The cost is not calculated"); 1448 return WideningDecisions[InstOnVF].second; 1449 } 1450 1451 /// Return True if instruction \p I is an optimizable truncate whose operand 1452 /// is an induction variable. Such a truncate will be removed by adding a new 1453 /// induction variable with the destination type. 1454 bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) { 1455 // If the instruction is not a truncate, return false. 1456 auto *Trunc = dyn_cast<TruncInst>(I); 1457 if (!Trunc) 1458 return false; 1459 1460 // Get the source and destination types of the truncate. 1461 Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF); 1462 Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF); 1463 1464 // If the truncate is free for the given types, return false. Replacing a 1465 // free truncate with an induction variable would add an induction variable 1466 // update instruction to each iteration of the loop. We exclude from this 1467 // check the primary induction variable since it will need an update 1468 // instruction regardless. 1469 Value *Op = Trunc->getOperand(0); 1470 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy)) 1471 return false; 1472 1473 // If the truncated value is not an induction variable, return false. 1474 return Legal->isInductionPhi(Op); 1475 } 1476 1477 /// Collects the instructions to scalarize for each predicated instruction in 1478 /// the loop. 1479 void collectInstsToScalarize(ElementCount VF); 1480 1481 /// Collect Uniform and Scalar values for the given \p VF. 1482 /// The sets depend on CM decision for Load/Store instructions 1483 /// that may be vectorized as interleave, gather-scatter or scalarized. 1484 void collectUniformsAndScalars(ElementCount VF) { 1485 // Do the analysis once. 1486 if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end()) 1487 return; 1488 setCostBasedWideningDecision(VF); 1489 collectLoopUniforms(VF); 1490 collectLoopScalars(VF); 1491 } 1492 1493 /// Returns true if the target machine supports masked store operation 1494 /// for the given \p DataType and kind of access to \p Ptr. 1495 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const { 1496 return Legal->isConsecutivePtr(Ptr) && 1497 TTI.isLegalMaskedStore(DataType, Alignment); 1498 } 1499 1500 /// Returns true if the target machine supports masked load operation 1501 /// for the given \p DataType and kind of access to \p Ptr. 1502 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const { 1503 return Legal->isConsecutivePtr(Ptr) && 1504 TTI.isLegalMaskedLoad(DataType, Alignment); 1505 } 1506 1507 /// Returns true if the target machine can represent \p V as a masked gather 1508 /// or scatter operation. 
1509 bool isLegalGatherOrScatter(Value *V) { 1510 bool LI = isa<LoadInst>(V); 1511 bool SI = isa<StoreInst>(V); 1512 if (!LI && !SI) 1513 return false; 1514 auto *Ty = getLoadStoreType(V); 1515 Align Align = getLoadStoreAlignment(V); 1516 return (LI && TTI.isLegalMaskedGather(Ty, Align)) || 1517 (SI && TTI.isLegalMaskedScatter(Ty, Align)); 1518 } 1519 1520 /// Returns true if the target machine supports all of the reduction 1521 /// variables found for the given VF. 1522 bool canVectorizeReductions(ElementCount VF) const { 1523 return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 1524 const RecurrenceDescriptor &RdxDesc = Reduction.second; 1525 return TTI.isLegalToVectorizeReduction(RdxDesc, VF); 1526 })); 1527 } 1528 1529 /// Returns true if \p I is an instruction that will be scalarized with 1530 /// predication. Such instructions include conditional stores and 1531 /// instructions that may divide by zero. 1532 /// If a non-zero VF has been calculated, we check if I will be scalarized 1533 /// predication for that VF. 1534 bool isScalarWithPredication(Instruction *I) const; 1535 1536 // Returns true if \p I is an instruction that will be predicated either 1537 // through scalar predication or masked load/store or masked gather/scatter. 1538 // Superset of instructions that return true for isScalarWithPredication. 1539 bool isPredicatedInst(Instruction *I) { 1540 if (!blockNeedsPredication(I->getParent())) 1541 return false; 1542 // Loads and stores that need some form of masked operation are predicated 1543 // instructions. 1544 if (isa<LoadInst>(I) || isa<StoreInst>(I)) 1545 return Legal->isMaskRequired(I); 1546 return isScalarWithPredication(I); 1547 } 1548 1549 /// Returns true if \p I is a memory instruction with consecutive memory 1550 /// access that can be widened. 1551 bool 1552 memoryInstructionCanBeWidened(Instruction *I, 1553 ElementCount VF = ElementCount::getFixed(1)); 1554 1555 /// Returns true if \p I is a memory instruction in an interleaved-group 1556 /// of memory accesses that can be vectorized with wide vector loads/stores 1557 /// and shuffles. 1558 bool 1559 interleavedAccessCanBeWidened(Instruction *I, 1560 ElementCount VF = ElementCount::getFixed(1)); 1561 1562 /// Check if \p Instr belongs to any interleaved access group. 1563 bool isAccessInterleaved(Instruction *Instr) { 1564 return InterleaveInfo.isInterleaved(Instr); 1565 } 1566 1567 /// Get the interleaved access group that \p Instr belongs to. 1568 const InterleaveGroup<Instruction> * 1569 getInterleavedAccessGroup(Instruction *Instr) { 1570 return InterleaveInfo.getInterleaveGroup(Instr); 1571 } 1572 1573 /// Returns true if we're required to use a scalar epilogue for at least 1574 /// the final iteration of the original loop. 1575 bool requiresScalarEpilogue(ElementCount VF) const { 1576 if (!isScalarEpilogueAllowed()) 1577 return false; 1578 // If we might exit from anywhere but the latch, must run the exiting 1579 // iteration in scalar form. 1580 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) 1581 return true; 1582 return VF.isVector() && InterleaveInfo.requiresScalarEpilogue(); 1583 } 1584 1585 /// Returns true if a scalar epilogue is not allowed due to optsize or a 1586 /// loop hint annotation. 1587 bool isScalarEpilogueAllowed() const { 1588 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed; 1589 } 1590 1591 /// Returns true if all loop blocks should be masked to fold tail loop. 
  bool foldTailByMasking() const { return FoldTailByMasking; }

  bool blockNeedsPredication(BasicBlock *BB) const {
    return foldTailByMasking() || Legal->blockNeedsPredication(BB);
  }

  /// A SmallMapVector to store the in-loop reduction op chains, mapping phi
  /// nodes to the chain of instructions representing the reductions. Uses a
  /// MapVector to ensure deterministic iteration order.
  using ReductionChainMap =
      SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;

  /// Return the chain of instructions representing an in-loop reduction.
  const ReductionChainMap &getInLoopReductionChains() const {
    return InLoopReductionChains;
  }

  /// Returns true if the Phi is part of an in-loop reduction.
  bool isInLoopReduction(PHINode *Phi) const {
    return InLoopReductionChains.count(Phi);
  }

  /// Estimate the cost of an intrinsic call instruction CI if it were
  /// vectorized with factor VF. Return the cost of the instruction, including
  /// scalarization overhead if it's needed.
  InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;

  /// Estimate the cost of a call instruction CI if it were vectorized with
  /// factor VF. Return the cost of the instruction, including scalarization
  /// overhead if it's needed. The flag NeedToScalarize shows if the call needs
  /// to be scalarized, i.e. either a vector version isn't available or it is
  /// too expensive.
  InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF,
                                    bool &NeedToScalarize) const;

  /// Returns true if the per-lane cost of VectorizationFactor A is lower than
  /// that of B.
  bool isMoreProfitable(const VectorizationFactor &A,
                        const VectorizationFactor &B) const;

  /// Invalidates decisions already taken by the cost model.
  void invalidateCostModelingDecisions() {
    WideningDecisions.clear();
    Uniforms.clear();
    Scalars.clear();
  }

private:
  unsigned NumPredStores = 0;

  /// \return An upper bound for the vectorization factors for both
  /// fixed and scalable vectorization, where the minimum-known number of
  /// elements is a power-of-2 larger than zero. If scalable vectorization is
  /// disabled or unsupported, then the scalable part will be equal to
  /// ElementCount::getScalable(0).
  FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount,
                                           ElementCount UserVF);

  /// \return the maximized element count based on the target's vector
  /// registers and the loop trip-count, but limited to a maximum safe VF.
  /// This is a helper function of computeFeasibleMaxVF.
  /// FIXME: MaxSafeVF is currently passed by reference to avoid some obscure
  /// issue that occurred on one of the buildbots which cannot be reproduced
  /// without having access to the proprietary compiler (see comments on
  /// D98509). The issue is currently under investigation and this workaround
  /// will be removed as soon as possible.
  ElementCount getMaximizedVFForTarget(unsigned ConstTripCount,
                                       unsigned SmallestType,
                                       unsigned WidestType,
                                       const ElementCount &MaxSafeVF);

  /// \return the maximum legal scalable VF, based on the safe max number
  /// of elements.
  ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);

  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
  using VectorizationCostTy = std::pair<InstructionCost, bool>;

  /// Returns the expected execution cost. The unit of the cost does
  /// not matter because we use the 'cost' units to compare different
  /// vector widths. The cost that is returned is *not* normalized by
  /// the factor width. If \p Invalid is not nullptr, this function
  /// will add a pair(Instruction*, ElementCount) to \p Invalid for
  /// each instruction that has an Invalid cost for the given VF.
  using InstructionVFPair = std::pair<Instruction *, ElementCount>;
  VectorizationCostTy
  expectedCost(ElementCount VF,
               SmallVectorImpl<InstructionVFPair> *Invalid = nullptr);

  /// Returns the execution time cost of an instruction for a given vector
  /// width. Vector width of one means scalar.
  VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);

  /// The cost-computation logic from getInstructionCost which provides
  /// the vector type as an output parameter.
  InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
                                     Type *&VectorTy);

  /// Return the cost of instructions in an in-loop reduction pattern, if I is
  /// part of that pattern.
  Optional<InstructionCost>
  getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
                          TTI::TargetCostKind CostKind);

  /// Calculate the vectorization cost of memory instruction \p I.
  InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);

  /// The cost computation for a scalarized memory instruction.
  InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);

  /// The cost computation for an interleaving group of memory instructions.
  InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);

  /// The cost computation for a Gather/Scatter instruction.
  InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);

  /// The cost computation for widening instruction \p I with consecutive
  /// memory access.
  InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);

  /// The cost calculation for Load/Store instruction \p I with a uniform
  /// pointer -
  /// Load: scalar load + broadcast.
  /// Store: scalar store + (loop invariant value stored? 0 : extract of last
  /// element)
  InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);

  /// Estimate the overhead of scalarizing an instruction. This is a
  /// convenience wrapper for the type-based getScalarizationOverhead API.
  InstructionCost getScalarizationOverhead(Instruction *I,
                                           ElementCount VF) const;

  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
  bool isConsecutiveLoadOrStore(Instruction *I);

  /// Returns true if an artificially high cost for emulated masked memrefs
  /// should be used.
  bool useEmulatedMaskMemRefHack(Instruction *I);

  /// Map of scalar integer values to the smallest bitwidth they can be legally
  /// represented as. The vector equivalents of these values should be truncated
  /// to this type.
  MapVector<Instruction *, uint64_t> MinBWs;

  /// A type representing the costs for instructions if they were to be
  /// scalarized rather than vectorized. The entries are Instruction-Cost
  /// pairs.
  using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;

  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as a predicated block.
  SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;

  /// Records whether it is allowed to have the original scalar loop execute at
  /// least once. This may be needed as a fallback loop in case runtime
  /// aliasing/dependence checks fail, or to handle the tail/remainder
  /// iterations when the trip count is unknown or doesn't divide by the VF,
  /// or as a peel-loop to handle gaps in interleave-groups.
  /// Under optsize and when the trip count is very small we don't allow any
  /// iterations to execute in the scalar loop.
  ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;

  /// All blocks of the loop are to be masked to fold the tail of the scalar
  /// iterations.
  bool FoldTailByMasking = false;

  /// A map holding scalar costs for different vectorization factors. The
  /// presence of a cost for an instruction in the mapping indicates that the
  /// instruction will be scalarized when vectorizing with the associated
  /// vectorization factor. The entries are VF-ScalarCostTy pairs.
  DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;

  /// Holds the instructions known to be uniform after vectorization.
  /// The data is collected per VF.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;

  /// Holds the instructions known to be scalar after vectorization.
  /// The data is collected per VF.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;

  /// Holds the instructions (address computations) that are forced to be
  /// scalarized.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;

  /// PHINodes of the reductions that should be expanded in-loop along with
  /// their associated chains of reduction operations, in program order from
  /// top (PHI) to bottom.
  ReductionChainMap InLoopReductionChains;

  /// A map of in-loop reduction operations and their immediate chain operand.
  /// FIXME: This can be removed once reductions can be costed correctly in
  /// VPlan. This was added to allow quick lookup of the in-loop operations,
  /// without having to loop through InLoopReductionChains.
  DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;

  /// Returns the expected difference in cost from scalarizing the expression
  /// feeding a predicated instruction \p PredInst. The instructions to
  /// scalarize and their scalar costs are collected in \p ScalarCosts. A
  /// non-negative return value implies the expression will be scalarized.
  /// Currently, only single-use chains are considered for scalarization.
1795 int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts, 1796 ElementCount VF); 1797 1798 /// Collect the instructions that are uniform after vectorization. An 1799 /// instruction is uniform if we represent it with a single scalar value in 1800 /// the vectorized loop corresponding to each vector iteration. Examples of 1801 /// uniform instructions include pointer operands of consecutive or 1802 /// interleaved memory accesses. Note that although uniformity implies an 1803 /// instruction will be scalar, the reverse is not true. In general, a 1804 /// scalarized instruction will be represented by VF scalar values in the 1805 /// vectorized loop, each corresponding to an iteration of the original 1806 /// scalar loop. 1807 void collectLoopUniforms(ElementCount VF); 1808 1809 /// Collect the instructions that are scalar after vectorization. An 1810 /// instruction is scalar if it is known to be uniform or will be scalarized 1811 /// during vectorization. Non-uniform scalarized instructions will be 1812 /// represented by VF values in the vectorized loop, each corresponding to an 1813 /// iteration of the original scalar loop. 1814 void collectLoopScalars(ElementCount VF); 1815 1816 /// Keeps cost model vectorization decision and cost for instructions. 1817 /// Right now it is used for memory instructions only. 1818 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>, 1819 std::pair<InstWidening, InstructionCost>>; 1820 1821 DecisionList WideningDecisions; 1822 1823 /// Returns true if \p V is expected to be vectorized and it needs to be 1824 /// extracted. 1825 bool needsExtract(Value *V, ElementCount VF) const { 1826 Instruction *I = dyn_cast<Instruction>(V); 1827 if (VF.isScalar() || !I || !TheLoop->contains(I) || 1828 TheLoop->isLoopInvariant(I)) 1829 return false; 1830 1831 // Assume we can vectorize V (and hence we need extraction) if the 1832 // scalars are not computed yet. This can happen, because it is called 1833 // via getScalarizationOverhead from setCostBasedWideningDecision, before 1834 // the scalars are collected. That should be a safe assumption in most 1835 // cases, because we check if the operands have vectorizable types 1836 // beforehand in LoopVectorizationLegality. 1837 return Scalars.find(VF) == Scalars.end() || 1838 !isScalarAfterVectorization(I, VF); 1839 }; 1840 1841 /// Returns a range containing only operands needing to be extracted. 1842 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, 1843 ElementCount VF) const { 1844 return SmallVector<Value *, 4>(make_filter_range( 1845 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); 1846 } 1847 1848 /// Determines if we have the infrastructure to vectorize loop \p L and its 1849 /// epilogue, assuming the main loop is vectorized by \p VF. 1850 bool isCandidateForEpilogueVectorization(const Loop &L, 1851 const ElementCount VF) const; 1852 1853 /// Returns true if epilogue vectorization is considered profitable, and 1854 /// false otherwise. 1855 /// \p VF is the vectorization factor chosen for the original loop. 1856 bool isEpilogueVectorizationProfitable(const ElementCount VF) const; 1857 1858 public: 1859 /// The loop that we evaluate. 1860 Loop *TheLoop; 1861 1862 /// Predicated scalar evolution analysis. 1863 PredicatedScalarEvolution &PSE; 1864 1865 /// Loop Info analysis. 1866 LoopInfo *LI; 1867 1868 /// Vectorization legality. 1869 LoopVectorizationLegality *Legal; 1870 1871 /// Vector target information. 
1872 const TargetTransformInfo &TTI; 1873 1874 /// Target Library Info. 1875 const TargetLibraryInfo *TLI; 1876 1877 /// Demanded bits analysis. 1878 DemandedBits *DB; 1879 1880 /// Assumption cache. 1881 AssumptionCache *AC; 1882 1883 /// Interface to emit optimization remarks. 1884 OptimizationRemarkEmitter *ORE; 1885 1886 const Function *TheFunction; 1887 1888 /// Loop Vectorize Hint. 1889 const LoopVectorizeHints *Hints; 1890 1891 /// The interleave access information contains groups of interleaved accesses 1892 /// with the same stride and close to each other. 1893 InterleavedAccessInfo &InterleaveInfo; 1894 1895 /// Values to ignore in the cost model. 1896 SmallPtrSet<const Value *, 16> ValuesToIgnore; 1897 1898 /// Values to ignore in the cost model when VF > 1. 1899 SmallPtrSet<const Value *, 16> VecValuesToIgnore; 1900 1901 /// All element types found in the loop. 1902 SmallPtrSet<Type *, 16> ElementTypesInLoop; 1903 1904 /// Profitable vector factors. 1905 SmallVector<VectorizationFactor, 8> ProfitableVFs; 1906 }; 1907 } // end namespace llvm 1908 1909 /// Helper struct to manage generating runtime checks for vectorization. 1910 /// 1911 /// The runtime checks are created up-front in temporary blocks to allow better 1912 /// estimating the cost and un-linked from the existing IR. After deciding to 1913 /// vectorize, the checks are moved back. If deciding not to vectorize, the 1914 /// temporary blocks are completely removed. 1915 class GeneratedRTChecks { 1916 /// Basic block which contains the generated SCEV checks, if any. 1917 BasicBlock *SCEVCheckBlock = nullptr; 1918 1919 /// The value representing the result of the generated SCEV checks. If it is 1920 /// nullptr, either no SCEV checks have been generated or they have been used. 1921 Value *SCEVCheckCond = nullptr; 1922 1923 /// Basic block which contains the generated memory runtime checks, if any. 1924 BasicBlock *MemCheckBlock = nullptr; 1925 1926 /// The value representing the result of the generated memory runtime checks. 1927 /// If it is nullptr, either no memory runtime checks have been generated or 1928 /// they have been used. 1929 Instruction *MemRuntimeCheckCond = nullptr; 1930 1931 DominatorTree *DT; 1932 LoopInfo *LI; 1933 1934 SCEVExpander SCEVExp; 1935 SCEVExpander MemCheckExp; 1936 1937 public: 1938 GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI, 1939 const DataLayout &DL) 1940 : DT(DT), LI(LI), SCEVExp(SE, DL, "scev.check"), 1941 MemCheckExp(SE, DL, "scev.check") {} 1942 1943 /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can 1944 /// accurately estimate the cost of the runtime checks. The blocks are 1945 /// un-linked from the IR and is added back during vector code generation. If 1946 /// there is no vector code generation, the check blocks are removed 1947 /// completely. 1948 void Create(Loop *L, const LoopAccessInfo &LAI, 1949 const SCEVUnionPredicate &UnionPred) { 1950 1951 BasicBlock *LoopHeader = L->getHeader(); 1952 BasicBlock *Preheader = L->getLoopPreheader(); 1953 1954 // Use SplitBlock to create blocks for SCEV & memory runtime checks to 1955 // ensure the blocks are properly added to LoopInfo & DominatorTree. Those 1956 // may be used by SCEVExpander. The blocks will be un-linked from their 1957 // predecessors and removed from LI & DT at the end of the function. 
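    // Illustrative layout (hypothetical, assuming both check kinds are needed):
    // the temporary CFG built below is
    //   Preheader -> vector.scevcheck -> vector.memcheck -> original successor
    // and both check blocks are unhooked again at the end of Create(), so only
    // their cost, not their control flow, remains visible to the caller.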
1958 if (!UnionPred.isAlwaysTrue()) { 1959 SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI, 1960 nullptr, "vector.scevcheck"); 1961 1962 SCEVCheckCond = SCEVExp.expandCodeForPredicate( 1963 &UnionPred, SCEVCheckBlock->getTerminator()); 1964 } 1965 1966 const auto &RtPtrChecking = *LAI.getRuntimePointerChecking(); 1967 if (RtPtrChecking.Need) { 1968 auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader; 1969 MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr, 1970 "vector.memcheck"); 1971 1972 std::tie(std::ignore, MemRuntimeCheckCond) = 1973 addRuntimeChecks(MemCheckBlock->getTerminator(), L, 1974 RtPtrChecking.getChecks(), MemCheckExp); 1975 assert(MemRuntimeCheckCond && 1976 "no RT checks generated although RtPtrChecking " 1977 "claimed checks are required"); 1978 } 1979 1980 if (!MemCheckBlock && !SCEVCheckBlock) 1981 return; 1982 1983 // Unhook the temporary block with the checks, update various places 1984 // accordingly. 1985 if (SCEVCheckBlock) 1986 SCEVCheckBlock->replaceAllUsesWith(Preheader); 1987 if (MemCheckBlock) 1988 MemCheckBlock->replaceAllUsesWith(Preheader); 1989 1990 if (SCEVCheckBlock) { 1991 SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 1992 new UnreachableInst(Preheader->getContext(), SCEVCheckBlock); 1993 Preheader->getTerminator()->eraseFromParent(); 1994 } 1995 if (MemCheckBlock) { 1996 MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 1997 new UnreachableInst(Preheader->getContext(), MemCheckBlock); 1998 Preheader->getTerminator()->eraseFromParent(); 1999 } 2000 2001 DT->changeImmediateDominator(LoopHeader, Preheader); 2002 if (MemCheckBlock) { 2003 DT->eraseNode(MemCheckBlock); 2004 LI->removeBlock(MemCheckBlock); 2005 } 2006 if (SCEVCheckBlock) { 2007 DT->eraseNode(SCEVCheckBlock); 2008 LI->removeBlock(SCEVCheckBlock); 2009 } 2010 } 2011 2012 /// Remove the created SCEV & memory runtime check blocks & instructions, if 2013 /// unused. 2014 ~GeneratedRTChecks() { 2015 SCEVExpanderCleaner SCEVCleaner(SCEVExp, *DT); 2016 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp, *DT); 2017 if (!SCEVCheckCond) 2018 SCEVCleaner.markResultUsed(); 2019 2020 if (!MemRuntimeCheckCond) 2021 MemCheckCleaner.markResultUsed(); 2022 2023 if (MemRuntimeCheckCond) { 2024 auto &SE = *MemCheckExp.getSE(); 2025 // Memory runtime check generation creates compares that use expanded 2026 // values. Remove them before running the SCEVExpanderCleaners. 2027 for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) { 2028 if (MemCheckExp.isInsertedInstruction(&I)) 2029 continue; 2030 SE.forgetValue(&I); 2031 SE.eraseValueFromMap(&I); 2032 I.eraseFromParent(); 2033 } 2034 } 2035 MemCheckCleaner.cleanup(); 2036 SCEVCleaner.cleanup(); 2037 2038 if (SCEVCheckCond) 2039 SCEVCheckBlock->eraseFromParent(); 2040 if (MemRuntimeCheckCond) 2041 MemCheckBlock->eraseFromParent(); 2042 } 2043 2044 /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and 2045 /// adjusts the branches to branch to the vector preheader or \p Bypass, 2046 /// depending on the generated condition. 
2047 BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass, 2048 BasicBlock *LoopVectorPreHeader, 2049 BasicBlock *LoopExitBlock) { 2050 if (!SCEVCheckCond) 2051 return nullptr; 2052 if (auto *C = dyn_cast<ConstantInt>(SCEVCheckCond)) 2053 if (C->isZero()) 2054 return nullptr; 2055 2056 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2057 2058 BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock); 2059 // Create new preheader for vector loop. 2060 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 2061 PL->addBasicBlockToLoop(SCEVCheckBlock, *LI); 2062 2063 SCEVCheckBlock->getTerminator()->eraseFromParent(); 2064 SCEVCheckBlock->moveBefore(LoopVectorPreHeader); 2065 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2066 SCEVCheckBlock); 2067 2068 DT->addNewBlock(SCEVCheckBlock, Pred); 2069 DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock); 2070 2071 ReplaceInstWithInst( 2072 SCEVCheckBlock->getTerminator(), 2073 BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheckCond)); 2074 // Mark the check as used, to prevent it from being removed during cleanup. 2075 SCEVCheckCond = nullptr; 2076 return SCEVCheckBlock; 2077 } 2078 2079 /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts 2080 /// the branches to branch to the vector preheader or \p Bypass, depending on 2081 /// the generated condition. 2082 BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass, 2083 BasicBlock *LoopVectorPreHeader) { 2084 // Check if we generated code that checks in runtime if arrays overlap. 2085 if (!MemRuntimeCheckCond) 2086 return nullptr; 2087 2088 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2089 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2090 MemCheckBlock); 2091 2092 DT->addNewBlock(MemCheckBlock, Pred); 2093 DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock); 2094 MemCheckBlock->moveBefore(LoopVectorPreHeader); 2095 2096 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 2097 PL->addBasicBlockToLoop(MemCheckBlock, *LI); 2098 2099 ReplaceInstWithInst( 2100 MemCheckBlock->getTerminator(), 2101 BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond)); 2102 MemCheckBlock->getTerminator()->setDebugLoc( 2103 Pred->getTerminator()->getDebugLoc()); 2104 2105 // Mark the check as used, to prevent it from being removed during cleanup. 2106 MemRuntimeCheckCond = nullptr; 2107 return MemCheckBlock; 2108 } 2109 }; 2110 2111 // Return true if \p OuterLp is an outer loop annotated with hints for explicit 2112 // vectorization. The loop needs to be annotated with #pragma omp simd 2113 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the 2114 // vector length information is not provided, vectorization is not considered 2115 // explicit. Interleave hints are not allowed either. These limitations will be 2116 // relaxed in the future. 2117 // Please, note that we are currently forced to abuse the pragma 'clang 2118 // vectorize' semantics. This pragma provides *auto-vectorization hints* 2119 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' 2120 // provides *explicit vectorization hints* (LV can bypass legal checks and 2121 // assume that vectorization is legal). However, both hints are implemented 2122 // using the same metadata (llvm.loop.vectorize, processed by 2123 // LoopVectorizeHints). This will be fixed in the future when the native IR 2124 // representation for pragma 'omp simd' is introduced. 
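// Illustrative source-level example (not from this file): an outer loop that
// qualifies could be annotated as
//
//   #pragma clang loop vectorize(enable) vectorize_width(4)
//   for (unsigned i = 0; i < N; ++i)   // explicitly annotated outer loop
//     for (unsigned j = 0; j < M; ++j) // inner loop
//       A[i][j] = 0;
//
// i.e. the hint must carry an explicit vector length and no interleave hint.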
2125 static bool isExplicitVecOuterLoop(Loop *OuterLp, 2126 OptimizationRemarkEmitter *ORE) { 2127 assert(!OuterLp->isInnermost() && "This is not an outer loop"); 2128 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); 2129 2130 // Only outer loops with an explicit vectorization hint are supported. 2131 // Unannotated outer loops are ignored. 2132 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) 2133 return false; 2134 2135 Function *Fn = OuterLp->getHeader()->getParent(); 2136 if (!Hints.allowVectorization(Fn, OuterLp, 2137 true /*VectorizeOnlyWhenForced*/)) { 2138 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); 2139 return false; 2140 } 2141 2142 if (Hints.getInterleave() > 1) { 2143 // TODO: Interleave support is future work. 2144 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " 2145 "outer loops.\n"); 2146 Hints.emitRemarkWithHints(); 2147 return false; 2148 } 2149 2150 return true; 2151 } 2152 2153 static void collectSupportedLoops(Loop &L, LoopInfo *LI, 2154 OptimizationRemarkEmitter *ORE, 2155 SmallVectorImpl<Loop *> &V) { 2156 // Collect inner loops and outer loops without irreducible control flow. For 2157 // now, only collect outer loops that have explicit vectorization hints. If we 2158 // are stress testing the VPlan H-CFG construction, we collect the outermost 2159 // loop of every loop nest. 2160 if (L.isInnermost() || VPlanBuildStressTest || 2161 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { 2162 LoopBlocksRPO RPOT(&L); 2163 RPOT.perform(LI); 2164 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { 2165 V.push_back(&L); 2166 // TODO: Collect inner loops inside marked outer loops in case 2167 // vectorization fails for the outer loop. Do not invoke 2168 // 'containsIrreducibleCFG' again for inner loops when the outer loop is 2169 // already known to be reducible. We can use an inherited attribute for 2170 // that. 2171 return; 2172 } 2173 } 2174 for (Loop *InnerL : L) 2175 collectSupportedLoops(*InnerL, LI, ORE, V); 2176 } 2177 2178 namespace { 2179 2180 /// The LoopVectorize Pass. 2181 struct LoopVectorize : public FunctionPass { 2182 /// Pass identification, replacement for typeid 2183 static char ID; 2184 2185 LoopVectorizePass Impl; 2186 2187 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false, 2188 bool VectorizeOnlyWhenForced = false) 2189 : FunctionPass(ID), 2190 Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) { 2191 initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); 2192 } 2193 2194 bool runOnFunction(Function &F) override { 2195 if (skipFunction(F)) 2196 return false; 2197 2198 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 2199 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 2200 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 2201 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 2202 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); 2203 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); 2204 auto *TLI = TLIP ? 
&TLIP->getTLI(F) : nullptr; 2205 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); 2206 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 2207 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); 2208 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); 2209 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 2210 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 2211 2212 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 2213 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; 2214 2215 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC, 2216 GetLAA, *ORE, PSI).MadeAnyChange; 2217 } 2218 2219 void getAnalysisUsage(AnalysisUsage &AU) const override { 2220 AU.addRequired<AssumptionCacheTracker>(); 2221 AU.addRequired<BlockFrequencyInfoWrapperPass>(); 2222 AU.addRequired<DominatorTreeWrapperPass>(); 2223 AU.addRequired<LoopInfoWrapperPass>(); 2224 AU.addRequired<ScalarEvolutionWrapperPass>(); 2225 AU.addRequired<TargetTransformInfoWrapperPass>(); 2226 AU.addRequired<AAResultsWrapperPass>(); 2227 AU.addRequired<LoopAccessLegacyAnalysis>(); 2228 AU.addRequired<DemandedBitsWrapperPass>(); 2229 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 2230 AU.addRequired<InjectTLIMappingsLegacy>(); 2231 2232 // We currently do not preserve loopinfo/dominator analyses with outer loop 2233 // vectorization. Until this is addressed, mark these analyses as preserved 2234 // only for non-VPlan-native path. 2235 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 2236 if (!EnableVPlanNativePath) { 2237 AU.addPreserved<LoopInfoWrapperPass>(); 2238 AU.addPreserved<DominatorTreeWrapperPass>(); 2239 } 2240 2241 AU.addPreserved<BasicAAWrapperPass>(); 2242 AU.addPreserved<GlobalsAAWrapperPass>(); 2243 AU.addRequired<ProfileSummaryInfoWrapperPass>(); 2244 } 2245 }; 2246 2247 } // end anonymous namespace 2248 2249 //===----------------------------------------------------------------------===// 2250 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 2251 // LoopVectorizationCostModel and LoopVectorizationPlanner. 2252 //===----------------------------------------------------------------------===// 2253 2254 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { 2255 // We need to place the broadcast of invariant variables outside the loop, 2256 // but only if it's proven safe to do so. Else, broadcast will be inside 2257 // vector loop body. 2258 Instruction *Instr = dyn_cast<Instruction>(V); 2259 bool SafeToHoist = OrigLoop->isLoopInvariant(V) && 2260 (!Instr || 2261 DT->dominates(Instr->getParent(), LoopVectorPreHeader)); 2262 // Place the code for broadcasting invariant variables in the new preheader. 2263 IRBuilder<>::InsertPointGuard Guard(Builder); 2264 if (SafeToHoist) 2265 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2266 2267 // Broadcast the scalar into all locations in the vector. 
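  // For a fixed VF such as 4 and an i32 value %x, the splat below produces a
  // <4 x i32> with every lane equal to %x; for a scalable VF it produces the
  // equivalent scalable-vector splat. When SafeToHoist holds, emitting it in
  // the preheader means the broadcast is executed once rather than on every
  // vector iteration. (Illustrative note; %x and the width are hypothetical.)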
2268 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); 2269 2270 return Shuf; 2271 } 2272 2273 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( 2274 const InductionDescriptor &II, Value *Step, Value *Start, 2275 Instruction *EntryVal, VPValue *Def, VPValue *CastDef, 2276 VPTransformState &State) { 2277 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 2278 "Expected either an induction phi-node or a truncate of it!"); 2279 2280 // Construct the initial value of the vector IV in the vector loop preheader 2281 auto CurrIP = Builder.saveIP(); 2282 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2283 if (isa<TruncInst>(EntryVal)) { 2284 assert(Start->getType()->isIntegerTy() && 2285 "Truncation requires an integer type"); 2286 auto *TruncType = cast<IntegerType>(EntryVal->getType()); 2287 Step = Builder.CreateTrunc(Step, TruncType); 2288 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); 2289 } 2290 Value *SplatStart = Builder.CreateVectorSplat(VF, Start); 2291 Value *SteppedStart = 2292 getStepVector(SplatStart, 0, Step, II.getInductionOpcode()); 2293 2294 // We create vector phi nodes for both integer and floating-point induction 2295 // variables. Here, we determine the kind of arithmetic we will perform. 2296 Instruction::BinaryOps AddOp; 2297 Instruction::BinaryOps MulOp; 2298 if (Step->getType()->isIntegerTy()) { 2299 AddOp = Instruction::Add; 2300 MulOp = Instruction::Mul; 2301 } else { 2302 AddOp = II.getInductionOpcode(); 2303 MulOp = Instruction::FMul; 2304 } 2305 2306 // Multiply the vectorization factor by the step using integer or 2307 // floating-point arithmetic as appropriate. 2308 Type *StepType = Step->getType(); 2309 if (Step->getType()->isFloatingPointTy()) 2310 StepType = IntegerType::get(StepType->getContext(), 2311 StepType->getScalarSizeInBits()); 2312 Value *RuntimeVF = getRuntimeVF(Builder, StepType, VF); 2313 if (Step->getType()->isFloatingPointTy()) 2314 RuntimeVF = Builder.CreateSIToFP(RuntimeVF, Step->getType()); 2315 Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF); 2316 2317 // Create a vector splat to use in the induction update. 2318 // 2319 // FIXME: If the step is non-constant, we create the vector splat with 2320 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't 2321 // handle a constant vector splat. 2322 Value *SplatVF = isa<Constant>(Mul) 2323 ? ConstantVector::getSplat(VF, cast<Constant>(Mul)) 2324 : Builder.CreateVectorSplat(VF, Mul); 2325 Builder.restoreIP(CurrIP); 2326 2327 // We may need to add the step a number of times, depending on the unroll 2328 // factor. The last of those goes into the PHI. 2329 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", 2330 &*LoopVectorBody->getFirstInsertionPt()); 2331 VecInd->setDebugLoc(EntryVal->getDebugLoc()); 2332 Instruction *LastInduction = VecInd; 2333 for (unsigned Part = 0; Part < UF; ++Part) { 2334 State.set(Def, LastInduction, Part); 2335 2336 if (isa<TruncInst>(EntryVal)) 2337 addMetadata(LastInduction, EntryVal); 2338 recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, CastDef, 2339 State, Part); 2340 2341 LastInduction = cast<Instruction>( 2342 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")); 2343 LastInduction->setDebugLoc(EntryVal->getDebugLoc()); 2344 } 2345 2346 // Move the last step to the end of the latch block. This ensures consistent 2347 // placement of all induction updates. 
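  // Illustrative example (hypothetical values): with UF = 2, a fixed VF of 4,
  // and an i32 IV with step 1, SplatVF is <4 x i32> <4, 4, 4, 4>; part 0 uses
  // the phi value <i, i+1, i+2, i+3>, part 1 uses the "step.add" result
  // <i+4, ..., i+7>, and the final value moved to the latch below feeds the
  // phi back as "vec.ind.next", advancing it by UF * VF = 8 per vector
  // iteration.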
2348 auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 2349 auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator()); 2350 auto *ICmp = cast<Instruction>(Br->getCondition()); 2351 LastInduction->moveBefore(ICmp); 2352 LastInduction->setName("vec.ind.next"); 2353 2354 VecInd->addIncoming(SteppedStart, LoopVectorPreHeader); 2355 VecInd->addIncoming(LastInduction, LoopVectorLatch); 2356 } 2357 2358 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const { 2359 return Cost->isScalarAfterVectorization(I, VF) || 2360 Cost->isProfitableToScalarize(I, VF); 2361 } 2362 2363 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const { 2364 if (shouldScalarizeInstruction(IV)) 2365 return true; 2366 auto isScalarInst = [&](User *U) -> bool { 2367 auto *I = cast<Instruction>(U); 2368 return (OrigLoop->contains(I) && shouldScalarizeInstruction(I)); 2369 }; 2370 return llvm::any_of(IV->users(), isScalarInst); 2371 } 2372 2373 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast( 2374 const InductionDescriptor &ID, const Instruction *EntryVal, 2375 Value *VectorLoopVal, VPValue *CastDef, VPTransformState &State, 2376 unsigned Part, unsigned Lane) { 2377 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 2378 "Expected either an induction phi-node or a truncate of it!"); 2379 2380 // This induction variable is not the phi from the original loop but the 2381 // newly-created IV based on the proof that casted Phi is equal to the 2382 // uncasted Phi in the vectorized loop (under a runtime guard possibly). It 2383 // re-uses the same InductionDescriptor that original IV uses but we don't 2384 // have to do any recording in this case - that is done when original IV is 2385 // processed. 2386 if (isa<TruncInst>(EntryVal)) 2387 return; 2388 2389 const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts(); 2390 if (Casts.empty()) 2391 return; 2392 // Only the first Cast instruction in the Casts vector is of interest. 2393 // The rest of the Casts (if exist) have no uses outside the 2394 // induction update chain itself. 2395 if (Lane < UINT_MAX) 2396 State.set(CastDef, VectorLoopVal, VPIteration(Part, Lane)); 2397 else 2398 State.set(CastDef, VectorLoopVal, Part); 2399 } 2400 2401 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start, 2402 TruncInst *Trunc, VPValue *Def, 2403 VPValue *CastDef, 2404 VPTransformState &State) { 2405 assert((IV->getType()->isIntegerTy() || IV != OldInduction) && 2406 "Primary induction variable must have an integer type"); 2407 2408 auto II = Legal->getInductionVars().find(IV); 2409 assert(II != Legal->getInductionVars().end() && "IV is not an induction"); 2410 2411 auto ID = II->second; 2412 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); 2413 2414 // The value from the original loop to which we are mapping the new induction 2415 // variable. 2416 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; 2417 2418 auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 2419 2420 // Generate code for the induction step. 
Note that induction steps are 2421 // required to be loop-invariant 2422 auto CreateStepValue = [&](const SCEV *Step) -> Value * { 2423 assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) && 2424 "Induction step should be loop invariant"); 2425 if (PSE.getSE()->isSCEVable(IV->getType())) { 2426 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 2427 return Exp.expandCodeFor(Step, Step->getType(), 2428 LoopVectorPreHeader->getTerminator()); 2429 } 2430 return cast<SCEVUnknown>(Step)->getValue(); 2431 }; 2432 2433 // The scalar value to broadcast. This is derived from the canonical 2434 // induction variable. If a truncation type is given, truncate the canonical 2435 // induction variable and step. Otherwise, derive these values from the 2436 // induction descriptor. 2437 auto CreateScalarIV = [&](Value *&Step) -> Value * { 2438 Value *ScalarIV = Induction; 2439 if (IV != OldInduction) { 2440 ScalarIV = IV->getType()->isIntegerTy() 2441 ? Builder.CreateSExtOrTrunc(Induction, IV->getType()) 2442 : Builder.CreateCast(Instruction::SIToFP, Induction, 2443 IV->getType()); 2444 ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID); 2445 ScalarIV->setName("offset.idx"); 2446 } 2447 if (Trunc) { 2448 auto *TruncType = cast<IntegerType>(Trunc->getType()); 2449 assert(Step->getType()->isIntegerTy() && 2450 "Truncation requires an integer step"); 2451 ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType); 2452 Step = Builder.CreateTrunc(Step, TruncType); 2453 } 2454 return ScalarIV; 2455 }; 2456 2457 // Create the vector values from the scalar IV, in the absence of creating a 2458 // vector IV. 2459 auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) { 2460 Value *Broadcasted = getBroadcastInstrs(ScalarIV); 2461 for (unsigned Part = 0; Part < UF; ++Part) { 2462 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2463 Value *EntryPart = 2464 getStepVector(Broadcasted, VF.getKnownMinValue() * Part, Step, 2465 ID.getInductionOpcode()); 2466 State.set(Def, EntryPart, Part); 2467 if (Trunc) 2468 addMetadata(EntryPart, Trunc); 2469 recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, CastDef, 2470 State, Part); 2471 } 2472 }; 2473 2474 // Fast-math-flags propagate from the original induction instruction. 2475 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 2476 if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp())) 2477 Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags()); 2478 2479 // Now do the actual transformations, and start with creating the step value. 2480 Value *Step = CreateStepValue(ID.getStep()); 2481 if (VF.isZero() || VF.isScalar()) { 2482 Value *ScalarIV = CreateScalarIV(Step); 2483 CreateSplatIV(ScalarIV, Step); 2484 return; 2485 } 2486 2487 // Determine if we want a scalar version of the induction variable. This is 2488 // true if the induction variable itself is not widened, or if it has at 2489 // least one user in the loop that is not widened. 2490 auto NeedsScalarIV = needsScalarInduction(EntryVal); 2491 if (!NeedsScalarIV) { 2492 createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef, 2493 State); 2494 return; 2495 } 2496 2497 // Try to create a new independent vector induction variable. If we can't 2498 // create the phi node, we will splat the scalar induction variable in each 2499 // loop iteration. 
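  // In summary, for a vector VF the strategy is three-way: if no user needs a
  // scalar IV, only the vector IV created above is emitted; if the IV itself
  // is widened but some users remain scalar, the branch below emits the vector
  // IV plus per-lane scalar steps; otherwise only a scalar IV and scalar steps
  // are emitted, with an additional splat when tail folding needs it to feed
  // the mask. (Added summary of the surrounding logic.)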
2500 if (!shouldScalarizeInstruction(EntryVal)) { 2501 createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef, 2502 State); 2503 Value *ScalarIV = CreateScalarIV(Step); 2504 // Create scalar steps that can be used by instructions we will later 2505 // scalarize. Note that the addition of the scalar steps will not increase 2506 // the number of instructions in the loop in the common case prior to 2507 // InstCombine. We will be trading one vector extract for each scalar step. 2508 buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State); 2509 return; 2510 } 2511 2512 // All IV users are scalar instructions, so only emit a scalar IV, not a 2513 // vectorised IV. Except when we tail-fold, then the splat IV feeds the 2514 // predicate used by the masked loads/stores. 2515 Value *ScalarIV = CreateScalarIV(Step); 2516 if (!Cost->isScalarEpilogueAllowed()) 2517 CreateSplatIV(ScalarIV, Step); 2518 buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State); 2519 } 2520 2521 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step, 2522 Instruction::BinaryOps BinOp) { 2523 // Create and check the types. 2524 auto *ValVTy = cast<VectorType>(Val->getType()); 2525 ElementCount VLen = ValVTy->getElementCount(); 2526 2527 Type *STy = Val->getType()->getScalarType(); 2528 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && 2529 "Induction Step must be an integer or FP"); 2530 assert(Step->getType() == STy && "Step has wrong type"); 2531 2532 SmallVector<Constant *, 8> Indices; 2533 2534 // Create a vector of consecutive numbers from zero to VF. 2535 VectorType *InitVecValVTy = ValVTy; 2536 Type *InitVecValSTy = STy; 2537 if (STy->isFloatingPointTy()) { 2538 InitVecValSTy = 2539 IntegerType::get(STy->getContext(), STy->getScalarSizeInBits()); 2540 InitVecValVTy = VectorType::get(InitVecValSTy, VLen); 2541 } 2542 Value *InitVec = Builder.CreateStepVector(InitVecValVTy); 2543 2544 // Add on StartIdx 2545 Value *StartIdxSplat = Builder.CreateVectorSplat( 2546 VLen, ConstantInt::get(InitVecValSTy, StartIdx)); 2547 InitVec = Builder.CreateAdd(InitVec, StartIdxSplat); 2548 2549 if (STy->isIntegerTy()) { 2550 Step = Builder.CreateVectorSplat(VLen, Step); 2551 assert(Step->getType() == Val->getType() && "Invalid step vec"); 2552 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 2553 // which can be found from the original scalar operations. 2554 Step = Builder.CreateMul(InitVec, Step); 2555 return Builder.CreateAdd(Val, Step, "induction"); 2556 } 2557 2558 // Floating point induction. 2559 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && 2560 "Binary Opcode should be specified for FP induction"); 2561 InitVec = Builder.CreateUIToFP(InitVec, ValVTy); 2562 Step = Builder.CreateVectorSplat(VLen, Step); 2563 Value *MulOp = Builder.CreateFMul(InitVec, Step); 2564 return Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); 2565 } 2566 2567 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, 2568 Instruction *EntryVal, 2569 const InductionDescriptor &ID, 2570 VPValue *Def, VPValue *CastDef, 2571 VPTransformState &State) { 2572 // We shouldn't have to build scalar steps if we aren't vectorizing. 2573 assert(VF.isVector() && "VF should be greater than one"); 2574 // Get the value type and ensure it and the step have the same integer type. 
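  // Illustrative example (hypothetical values): for a fixed VF of 4, UF = 2,
  // an integer step of 1 and a scalar IV value %iv, the lanes produced below
  // are %iv + 0 .. %iv + 3 for part 0 and %iv + 4 .. %iv + 7 for part 1; if
  // EntryVal is uniform, only lane 0 of each part is generated.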
2575 Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); 2576 assert(ScalarIVTy == Step->getType() && 2577 "Val and Step should have the same type"); 2578 2579 // We build scalar steps for both integer and floating-point induction 2580 // variables. Here, we determine the kind of arithmetic we will perform. 2581 Instruction::BinaryOps AddOp; 2582 Instruction::BinaryOps MulOp; 2583 if (ScalarIVTy->isIntegerTy()) { 2584 AddOp = Instruction::Add; 2585 MulOp = Instruction::Mul; 2586 } else { 2587 AddOp = ID.getInductionOpcode(); 2588 MulOp = Instruction::FMul; 2589 } 2590 2591 // Determine the number of scalars we need to generate for each unroll 2592 // iteration. If EntryVal is uniform, we only need to generate the first 2593 // lane. Otherwise, we generate all VF values. 2594 bool IsUniform = 2595 Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF); 2596 unsigned Lanes = IsUniform ? 1 : VF.getKnownMinValue(); 2597 // Compute the scalar steps and save the results in State. 2598 Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(), 2599 ScalarIVTy->getScalarSizeInBits()); 2600 Type *VecIVTy = nullptr; 2601 Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr; 2602 if (!IsUniform && VF.isScalable()) { 2603 VecIVTy = VectorType::get(ScalarIVTy, VF); 2604 UnitStepVec = Builder.CreateStepVector(VectorType::get(IntStepTy, VF)); 2605 SplatStep = Builder.CreateVectorSplat(VF, Step); 2606 SplatIV = Builder.CreateVectorSplat(VF, ScalarIV); 2607 } 2608 2609 for (unsigned Part = 0; Part < UF; ++Part) { 2610 Value *StartIdx0 = 2611 createStepForVF(Builder, ConstantInt::get(IntStepTy, Part), VF); 2612 2613 if (!IsUniform && VF.isScalable()) { 2614 auto *SplatStartIdx = Builder.CreateVectorSplat(VF, StartIdx0); 2615 auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec); 2616 if (ScalarIVTy->isFloatingPointTy()) 2617 InitVec = Builder.CreateSIToFP(InitVec, VecIVTy); 2618 auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep); 2619 auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul); 2620 State.set(Def, Add, Part); 2621 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State, 2622 Part); 2623 // It's useful to record the lane values too for the known minimum number 2624 // of elements so we do those below. This improves the code quality when 2625 // trying to extract the first element, for example. 2626 } 2627 2628 if (ScalarIVTy->isFloatingPointTy()) 2629 StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy); 2630 2631 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 2632 Value *StartIdx = Builder.CreateBinOp( 2633 AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane)); 2634 // The step returned by `createStepForVF` is a runtime-evaluated value 2635 // when VF is scalable. Otherwise, it should be folded into a Constant. 
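      // Illustrative note (hypothetical values): for a fixed VF of 4 and
      // Part = 1, StartIdx0 is the constant 4 and the add with the lane
      // constant folds away, so StartIdx is a plain constant; only a scalable
      // VF leaves a vscale-based expression, which is what the assertion
      // below checks.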
2636 assert((VF.isScalable() || isa<Constant>(StartIdx)) && 2637 "Expected StartIdx to be folded to a constant when VF is not " 2638 "scalable"); 2639 auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step); 2640 auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul); 2641 State.set(Def, Add, VPIteration(Part, Lane)); 2642 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State, 2643 Part, Lane); 2644 } 2645 } 2646 } 2647 2648 void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def, 2649 const VPIteration &Instance, 2650 VPTransformState &State) { 2651 Value *ScalarInst = State.get(Def, Instance); 2652 Value *VectorValue = State.get(Def, Instance.Part); 2653 VectorValue = Builder.CreateInsertElement( 2654 VectorValue, ScalarInst, 2655 Instance.Lane.getAsRuntimeExpr(State.Builder, VF)); 2656 State.set(Def, VectorValue, Instance.Part); 2657 } 2658 2659 Value *InnerLoopVectorizer::reverseVector(Value *Vec) { 2660 assert(Vec->getType()->isVectorTy() && "Invalid type"); 2661 return Builder.CreateVectorReverse(Vec, "reverse"); 2662 } 2663 2664 // Return whether we allow using masked interleave-groups (for dealing with 2665 // strided loads/stores that reside in predicated blocks, or for dealing 2666 // with gaps). 2667 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { 2668 // If an override option has been passed in for interleaved accesses, use it. 2669 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) 2670 return EnableMaskedInterleavedMemAccesses; 2671 2672 return TTI.enableMaskedInterleavedAccessVectorization(); 2673 } 2674 2675 // Try to vectorize the interleave group that \p Instr belongs to. 2676 // 2677 // E.g. Translate following interleaved load group (factor = 3): 2678 // for (i = 0; i < N; i+=3) { 2679 // R = Pic[i]; // Member of index 0 2680 // G = Pic[i+1]; // Member of index 1 2681 // B = Pic[i+2]; // Member of index 2 2682 // ... // do something to R, G, B 2683 // } 2684 // To: 2685 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2686 // %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements 2687 // %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements 2688 // %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements 2689 // 2690 // Or translate following interleaved store group (factor = 3): 2691 // for (i = 0; i < N; i+=3) { 2692 // ... do something to R, G, B 2693 // Pic[i] = R; // Member of index 0 2694 // Pic[i+1] = G; // Member of index 1 2695 // Pic[i+2] = B; // Member of index 2 2696 // } 2697 // To: 2698 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> 2699 // %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u> 2700 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, 2701 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements 2702 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B 2703 void InnerLoopVectorizer::vectorizeInterleaveGroup( 2704 const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs, 2705 VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues, 2706 VPValue *BlockInMask) { 2707 Instruction *Instr = Group->getInsertPos(); 2708 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2709 2710 // Prepare for the vector type of the interleaved load/store. 
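  // As a purely illustrative example, a factor-3 group of i32 members
  // vectorized at a fixed VF of 4 is widened below into a single
  // <12 x i32> access, i.e. VF * InterleaveFactor elements of the member
  // scalar type.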
  Type *ScalarTy = getLoadStoreType(Instr);
  unsigned InterleaveFactor = Group->getFactor();
  assert(!VF.isScalable() && "scalable vectors not yet supported.");
  auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);

  // Prepare for the new pointers.
  SmallVector<Value *, 2> AddrParts;
  unsigned Index = Group->getIndex(Instr);

  // TODO: extend the masked interleaved-group support to reversed access.
  assert((!BlockInMask || !Group->isReverse()) &&
         "Reversed masked interleave-group not supported.");

  // If the group is reverse, adjust the index to refer to the last vector lane
  // instead of the first. We adjust the index from the first vector lane,
  // rather than directly getting the pointer for lane VF - 1, because the
  // pointer operand of the interleaved access is supposed to be uniform. For
  // uniform instructions, we're only required to generate a value for the
  // first vector lane in each unroll iteration.
  if (Group->isReverse())
    Index += (VF.getKnownMinValue() - 1) * Group->getFactor();

  for (unsigned Part = 0; Part < UF; Part++) {
    Value *AddrPart = State.get(Addr, VPIteration(Part, 0));
    setDebugLocFromInst(AddrPart);

    // Note that the current instruction may be a member of any index in the
    // group. The address needs to be adjusted to the member of index 0.
    //
    // E.g.  a = A[i+1];     // Member of index 1 (current instruction)
    //       b = A[i];       // Member of index 0
    // The current pointer points to A[i+1]; adjust it to A[i].
    //
    // E.g.  A[i+1] = a;     // Member of index 1
    //       A[i]   = b;     // Member of index 0
    //       A[i+2] = c;     // Member of index 2 (current instruction)
    // The current pointer points to A[i+2]; adjust it to A[i].

    bool InBounds = false;
    if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
      InBounds = gep->isInBounds();
    AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
    cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);

    // Cast to the vector pointer type.
    unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
    Type *PtrTy = VecTy->getPointerTo(AddressSpace);
    AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
  }

  setDebugLocFromInst(Instr);
  Value *PoisonVec = PoisonValue::get(VecTy);

  Value *MaskForGaps = nullptr;
  if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
    MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
    assert(MaskForGaps && "Mask for Gaps is required but it is null");
  }

  // Vectorize the interleaved load group.
  if (isa<LoadInst>(Instr)) {
    // For each unroll part, create a wide load for the group.
    SmallVector<Value *, 2> NewLoads;
    for (unsigned Part = 0; Part < UF; Part++) {
      Instruction *NewLoad;
      if (BlockInMask || MaskForGaps) {
        assert(useMaskedInterleavedAccesses(*TTI) &&
               "masked interleaved groups are not allowed.");
        Value *GroupMask = MaskForGaps;
        if (BlockInMask) {
          Value *BlockInMaskPart = State.get(BlockInMask, Part);
          Value *ShuffledMask = Builder.CreateShuffleVector(
              BlockInMaskPart,
              createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
              "interleaved.mask");
          GroupMask = MaskForGaps
                          ?
Builder.CreateBinOp(Instruction::And, ShuffledMask, 2788 MaskForGaps) 2789 : ShuffledMask; 2790 } 2791 NewLoad = 2792 Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(), 2793 GroupMask, PoisonVec, "wide.masked.vec"); 2794 } 2795 else 2796 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], 2797 Group->getAlign(), "wide.vec"); 2798 Group->addMetadata(NewLoad); 2799 NewLoads.push_back(NewLoad); 2800 } 2801 2802 // For each member in the group, shuffle out the appropriate data from the 2803 // wide loads. 2804 unsigned J = 0; 2805 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2806 Instruction *Member = Group->getMember(I); 2807 2808 // Skip the gaps in the group. 2809 if (!Member) 2810 continue; 2811 2812 auto StrideMask = 2813 createStrideMask(I, InterleaveFactor, VF.getKnownMinValue()); 2814 for (unsigned Part = 0; Part < UF; Part++) { 2815 Value *StridedVec = Builder.CreateShuffleVector( 2816 NewLoads[Part], StrideMask, "strided.vec"); 2817 2818 // If this member has different type, cast the result type. 2819 if (Member->getType() != ScalarTy) { 2820 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2821 VectorType *OtherVTy = VectorType::get(Member->getType(), VF); 2822 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); 2823 } 2824 2825 if (Group->isReverse()) 2826 StridedVec = reverseVector(StridedVec); 2827 2828 State.set(VPDefs[J], StridedVec, Part); 2829 } 2830 ++J; 2831 } 2832 return; 2833 } 2834 2835 // The sub vector type for current instruction. 2836 auto *SubVT = VectorType::get(ScalarTy, VF); 2837 2838 // Vectorize the interleaved store group. 2839 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); 2840 assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) && 2841 "masked interleaved groups are not allowed."); 2842 assert((!MaskForGaps || !VF.isScalable()) && 2843 "masking gaps for scalable vectors is not yet supported."); 2844 for (unsigned Part = 0; Part < UF; Part++) { 2845 // Collect the stored vector from each member. 2846 SmallVector<Value *, 4> StoredVecs; 2847 for (unsigned i = 0; i < InterleaveFactor; i++) { 2848 assert((Group->getMember(i) || MaskForGaps) && 2849 "Fail to get a member from an interleaved store group"); 2850 Instruction *Member = Group->getMember(i); 2851 2852 // Skip the gaps in the group. 2853 if (!Member) { 2854 Value *Undef = PoisonValue::get(SubVT); 2855 StoredVecs.push_back(Undef); 2856 continue; 2857 } 2858 2859 Value *StoredVec = State.get(StoredValues[i], Part); 2860 2861 if (Group->isReverse()) 2862 StoredVec = reverseVector(StoredVec); 2863 2864 // If this member has different type, cast it to a unified type. 2865 2866 if (StoredVec->getType() != SubVT) 2867 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL); 2868 2869 StoredVecs.push_back(StoredVec); 2870 } 2871 2872 // Concatenate all vectors into a wide vector. 2873 Value *WideVec = concatenateVectors(Builder, StoredVecs); 2874 2875 // Interleave the elements in the wide vector. 
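    // For example (fixed VF = 4, factor = 3), the concatenated wide vector
    //   <R0,R1,R2,R3, G0,G1,G2,G3, B0,B1,B2,B3>
    // shuffled with the interleave mask <0,4,8, 1,5,9, 2,6,10, 3,7,11>
    // yields
    //   <R0,G0,B0, R1,G1,B1, R2,G2,B2, R3,G3,B3>,
    // which matches the original memory layout of the store group.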
2876 Value *IVec = Builder.CreateShuffleVector( 2877 WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor), 2878 "interleaved.vec"); 2879 2880 Instruction *NewStoreInstr; 2881 if (BlockInMask || MaskForGaps) { 2882 Value *GroupMask = MaskForGaps; 2883 if (BlockInMask) { 2884 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2885 Value *ShuffledMask = Builder.CreateShuffleVector( 2886 BlockInMaskPart, 2887 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2888 "interleaved.mask"); 2889 GroupMask = MaskForGaps ? Builder.CreateBinOp(Instruction::And, 2890 ShuffledMask, MaskForGaps) 2891 : ShuffledMask; 2892 } 2893 NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part], 2894 Group->getAlign(), GroupMask); 2895 } else 2896 NewStoreInstr = 2897 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign()); 2898 2899 Group->addMetadata(NewStoreInstr); 2900 } 2901 } 2902 2903 void InnerLoopVectorizer::vectorizeMemoryInstruction( 2904 Instruction *Instr, VPTransformState &State, VPValue *Def, VPValue *Addr, 2905 VPValue *StoredValue, VPValue *BlockInMask) { 2906 // Attempt to issue a wide load. 2907 LoadInst *LI = dyn_cast<LoadInst>(Instr); 2908 StoreInst *SI = dyn_cast<StoreInst>(Instr); 2909 2910 assert((LI || SI) && "Invalid Load/Store instruction"); 2911 assert((!SI || StoredValue) && "No stored value provided for widened store"); 2912 assert((!LI || !StoredValue) && "Stored value provided for widened load"); 2913 2914 LoopVectorizationCostModel::InstWidening Decision = 2915 Cost->getWideningDecision(Instr, VF); 2916 assert((Decision == LoopVectorizationCostModel::CM_Widen || 2917 Decision == LoopVectorizationCostModel::CM_Widen_Reverse || 2918 Decision == LoopVectorizationCostModel::CM_GatherScatter) && 2919 "CM decision is not to widen the memory instruction"); 2920 2921 Type *ScalarDataTy = getLoadStoreType(Instr); 2922 2923 auto *DataTy = VectorType::get(ScalarDataTy, VF); 2924 const Align Alignment = getLoadStoreAlignment(Instr); 2925 2926 // Determine if the pointer operand of the access is either consecutive or 2927 // reverse consecutive. 2928 bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse); 2929 bool ConsecutiveStride = 2930 Reverse || (Decision == LoopVectorizationCostModel::CM_Widen); 2931 bool CreateGatherScatter = 2932 (Decision == LoopVectorizationCostModel::CM_GatherScatter); 2933 2934 // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector 2935 // gather/scatter. Otherwise Decision should have been to Scalarize. 2936 assert((ConsecutiveStride || CreateGatherScatter) && 2937 "The instruction should be scalarized"); 2938 (void)ConsecutiveStride; 2939 2940 VectorParts BlockInMaskParts(UF); 2941 bool isMaskRequired = BlockInMask; 2942 if (isMaskRequired) 2943 for (unsigned Part = 0; Part < UF; ++Part) 2944 BlockInMaskParts[Part] = State.get(BlockInMask, Part); 2945 2946 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { 2947 // Calculate the pointer for the specific unroll-part. 2948 GetElementPtrInst *PartPtr = nullptr; 2949 2950 bool InBounds = false; 2951 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) 2952 InBounds = gep->isInBounds(); 2953 if (Reverse) { 2954 // If the address is consecutive but reversed, then the 2955 // wide store needs to start at the last vector element. 
2956 // RunTimeVF = VScale * VF.getKnownMinValue() 2957 // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue() 2958 Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), VF); 2959 // NumElt = -Part * RunTimeVF 2960 Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF); 2961 // LastLane = 1 - RunTimeVF 2962 Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF); 2963 PartPtr = 2964 cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt)); 2965 PartPtr->setIsInBounds(InBounds); 2966 PartPtr = cast<GetElementPtrInst>( 2967 Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane)); 2968 PartPtr->setIsInBounds(InBounds); 2969 if (isMaskRequired) // Reverse of a null all-one mask is a null mask. 2970 BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]); 2971 } else { 2972 Value *Increment = createStepForVF(Builder, Builder.getInt32(Part), VF); 2973 PartPtr = cast<GetElementPtrInst>( 2974 Builder.CreateGEP(ScalarDataTy, Ptr, Increment)); 2975 PartPtr->setIsInBounds(InBounds); 2976 } 2977 2978 unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); 2979 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); 2980 }; 2981 2982 // Handle Stores: 2983 if (SI) { 2984 setDebugLocFromInst(SI); 2985 2986 for (unsigned Part = 0; Part < UF; ++Part) { 2987 Instruction *NewSI = nullptr; 2988 Value *StoredVal = State.get(StoredValue, Part); 2989 if (CreateGatherScatter) { 2990 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 2991 Value *VectorGep = State.get(Addr, Part); 2992 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, 2993 MaskPart); 2994 } else { 2995 if (Reverse) { 2996 // If we store to reverse consecutive memory locations, then we need 2997 // to reverse the order of elements in the stored value. 2998 StoredVal = reverseVector(StoredVal); 2999 // We don't want to update the value in the map as it might be used in 3000 // another expression. So don't call resetVectorValue(StoredVal). 3001 } 3002 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0))); 3003 if (isMaskRequired) 3004 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, 3005 BlockInMaskParts[Part]); 3006 else 3007 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); 3008 } 3009 addMetadata(NewSI, SI); 3010 } 3011 return; 3012 } 3013 3014 // Handle loads. 3015 assert(LI && "Must have a load instruction"); 3016 setDebugLocFromInst(LI); 3017 for (unsigned Part = 0; Part < UF; ++Part) { 3018 Value *NewLI; 3019 if (CreateGatherScatter) { 3020 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 3021 Value *VectorGep = State.get(Addr, Part); 3022 NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart, 3023 nullptr, "wide.masked.gather"); 3024 addMetadata(NewLI, LI); 3025 } else { 3026 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0))); 3027 if (isMaskRequired) 3028 NewLI = Builder.CreateMaskedLoad( 3029 DataTy, VecPtr, Alignment, BlockInMaskParts[Part], 3030 PoisonValue::get(DataTy), "wide.masked.load"); 3031 else 3032 NewLI = 3033 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); 3034 3035 // Add metadata to the load, but setVectorValue to the reverse shuffle. 
      addMetadata(NewLI, LI);
      if (Reverse)
        NewLI = reverseVector(NewLI);
    }

    State.set(Def, NewLI, Part);
  }
}

void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPValue *Def,
                                               VPUser &User,
                                               const VPIteration &Instance,
                                               bool IfPredicateInstr,
                                               VPTransformState &State) {
  assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");

  // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for
  // the first lane and part.
  if (isa<NoAliasScopeDeclInst>(Instr))
    if (!Instance.isFirstIteration())
      return;

  setDebugLocFromInst(Instr);

  // Does this instruction return a value?
  bool IsVoidRetTy = Instr->getType()->isVoidTy();

  Instruction *Cloned = Instr->clone();
  if (!IsVoidRetTy)
    Cloned->setName(Instr->getName() + ".cloned");

  State.Builder.SetInsertPoint(Builder.GetInsertBlock(),
                               Builder.GetInsertPoint());
  // Replace the operands of the cloned instruction with their scalar
  // equivalents in the new loop.
  for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) {
    auto *Operand = dyn_cast<Instruction>(Instr->getOperand(op));
    auto InputInstance = Instance;
    if (!Operand || !OrigLoop->contains(Operand) ||
        (Cost->isUniformAfterVectorization(Operand, State.VF)))
      InputInstance.Lane = VPLane::getFirstLane();
    auto *NewOp = State.get(User.getOperand(op), InputInstance);
    Cloned->setOperand(op, NewOp);
  }
  addNewMetadata(Cloned, Instr);

  // Place the cloned scalar in the new loop.
  Builder.Insert(Cloned);

  State.set(Def, Cloned, Instance);

  // If we just cloned a new assumption, add it to the assumption cache.
  if (auto *II = dyn_cast<AssumeInst>(Cloned))
    AC->registerAssumption(II);

  // End if-block.
  if (IfPredicateInstr)
    PredicatedInstructions.push_back(Cloned);
}

PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
                                                      Value *End, Value *Step,
                                                      Instruction *DL) {
  BasicBlock *Header = L->getHeader();
  BasicBlock *Latch = L->getLoopLatch();
  // As we're just creating this loop, it's possible no latch exists
  // yet. If so, use the header as this will be a single-block loop.
  if (!Latch)
    Latch = Header;

  IRBuilder<> B(&*Header->getFirstInsertionPt());
  Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
  setDebugLocFromInst(OldInst, &B);
  auto *Induction = B.CreatePHI(Start->getType(), 2, "index");

  B.SetInsertPoint(Latch->getTerminator());
  setDebugLocFromInst(OldInst, &B);

  // Create i+1 and fill the PHINode.
  //
  // If the tail is not folded, we know that End - Start >= Step (either
  // statically or through the minimum iteration checks). We also know that both
  // Start % Step == 0 and End % Step == 0. We exit the vector loop if %IV +
  // %Step == %End. Hence we must exit the loop before %IV + %Step unsigned
  // overflows, and we can mark the induction increment as NUW.
  Value *Next = B.CreateAdd(Induction, Step, "index.next",
                            /*NUW=*/!Cost->foldTailByMasking(), /*NSW=*/false);
  Induction->addIncoming(Start, L->getLoopPreheader());
  Induction->addIncoming(Next, Latch);
  // Create the compare.
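  // For reference, the skeleton produced by this function looks roughly like
  // the following (block and value names are illustrative only):
  //   vector.body:
  //     %index = phi i64 [ %start, %vector.ph ], [ %index.next, %vector.body ]
  //     %index.next = add i64 %index, %step        ; nuw unless tail-folding
  //     %cmp = icmp eq i64 %index.next, %end
  //     br i1 %cmp, label %exit, label %vector.body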
3126 Value *ICmp = B.CreateICmpEQ(Next, End); 3127 B.CreateCondBr(ICmp, L->getUniqueExitBlock(), Header); 3128 3129 // Now we have two terminators. Remove the old one from the block. 3130 Latch->getTerminator()->eraseFromParent(); 3131 3132 return Induction; 3133 } 3134 3135 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { 3136 if (TripCount) 3137 return TripCount; 3138 3139 assert(L && "Create Trip Count for null loop."); 3140 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 3141 // Find the loop boundaries. 3142 ScalarEvolution *SE = PSE.getSE(); 3143 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 3144 assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && 3145 "Invalid loop count"); 3146 3147 Type *IdxTy = Legal->getWidestInductionType(); 3148 assert(IdxTy && "No type for induction"); 3149 3150 // The exit count might have the type of i64 while the phi is i32. This can 3151 // happen if we have an induction variable that is sign extended before the 3152 // compare. The only way that we get a backedge taken count is that the 3153 // induction variable was signed and as such will not overflow. In such a case 3154 // truncation is legal. 3155 if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) > 3156 IdxTy->getPrimitiveSizeInBits()) 3157 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy); 3158 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); 3159 3160 // Get the total trip count from the count by adding 1. 3161 const SCEV *ExitCount = SE->getAddExpr( 3162 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 3163 3164 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); 3165 3166 // Expand the trip count and place the new instructions in the preheader. 3167 // Notice that the pre-header does not change, only the loop body. 3168 SCEVExpander Exp(*SE, DL, "induction"); 3169 3170 // Count holds the overall loop count (N). 3171 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), 3172 L->getLoopPreheader()->getTerminator()); 3173 3174 if (TripCount->getType()->isPointerTy()) 3175 TripCount = 3176 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", 3177 L->getLoopPreheader()->getTerminator()); 3178 3179 return TripCount; 3180 } 3181 3182 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { 3183 if (VectorTripCount) 3184 return VectorTripCount; 3185 3186 Value *TC = getOrCreateTripCount(L); 3187 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 3188 3189 Type *Ty = TC->getType(); 3190 // This is where we can make the step a runtime constant. 3191 Value *Step = createStepForVF(Builder, ConstantInt::get(Ty, UF), VF); 3192 3193 // If the tail is to be folded by masking, round the number of iterations N 3194 // up to a multiple of Step instead of rounding down. This is done by first 3195 // adding Step-1 and then rounding down. Note that it's ok if this addition 3196 // overflows: the vector induction variable will eventually wrap to zero given 3197 // that it starts at zero and its Step is a power of two; the loop will then 3198 // exit, with the last early-exit vector comparison also producing all-true. 
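  // A small worked example (numbers are illustrative only): with a trip count
  // of 10 and VF * UF = 8 under tail folding, TC is first bumped to 17, the
  // remainder computed below is 1, and the vector trip count becomes 16, so
  // two masked vector iterations cover all 10 scalar iterations.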
3199 if (Cost->foldTailByMasking()) { 3200 assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && 3201 "VF*UF must be a power of 2 when folding tail by masking"); 3202 assert(!VF.isScalable() && 3203 "Tail folding not yet supported for scalable vectors"); 3204 TC = Builder.CreateAdd( 3205 TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up"); 3206 } 3207 3208 // Now we need to generate the expression for the part of the loop that the 3209 // vectorized body will execute. This is equal to N - (N % Step) if scalar 3210 // iterations are not required for correctness, or N - Step, otherwise. Step 3211 // is equal to the vectorization factor (number of SIMD elements) times the 3212 // unroll factor (number of SIMD instructions). 3213 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); 3214 3215 // There are cases where we *must* run at least one iteration in the remainder 3216 // loop. See the cost model for when this can happen. If the step evenly 3217 // divides the trip count, we set the remainder to be equal to the step. If 3218 // the step does not evenly divide the trip count, no adjustment is necessary 3219 // since there will already be scalar iterations. Note that the minimum 3220 // iterations check ensures that N >= Step. 3221 if (Cost->requiresScalarEpilogue(VF)) { 3222 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); 3223 R = Builder.CreateSelect(IsZero, Step, R); 3224 } 3225 3226 VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); 3227 3228 return VectorTripCount; 3229 } 3230 3231 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, 3232 const DataLayout &DL) { 3233 // Verify that V is a vector type with same number of elements as DstVTy. 3234 auto *DstFVTy = cast<FixedVectorType>(DstVTy); 3235 unsigned VF = DstFVTy->getNumElements(); 3236 auto *SrcVecTy = cast<FixedVectorType>(V->getType()); 3237 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); 3238 Type *SrcElemTy = SrcVecTy->getElementType(); 3239 Type *DstElemTy = DstFVTy->getElementType(); 3240 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 3241 "Vector elements must have same size"); 3242 3243 // Do a direct cast if element types are castable. 3244 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 3245 return Builder.CreateBitOrPointerCast(V, DstFVTy); 3246 } 3247 // V cannot be directly casted to desired vector type. 3248 // May happen when V is a floating point vector but DstVTy is a vector of 3249 // pointers or vice-versa. Handle this using a two-step bitcast using an 3250 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 3251 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 3252 "Only one type should be a pointer type"); 3253 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 3254 "Only one type should be a floating point type"); 3255 Type *IntTy = 3256 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 3257 auto *VecIntTy = FixedVectorType::get(IntTy, VF); 3258 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 3259 return Builder.CreateBitOrPointerCast(CastVal, DstFVTy); 3260 } 3261 3262 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, 3263 BasicBlock *Bypass) { 3264 Value *Count = getOrCreateTripCount(L); 3265 // Reuse existing vector loop preheader for TC checks. 3266 // Note that new preheader block is generated for vector loop. 
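  // For instance (illustrative values), with a fixed VF of 4 and UF of 2 the
  // check built below is "icmp ult i64 %count, 8" (or ule when a scalar
  // epilogue is required), branching to the scalar loop when it holds.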
3267 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 3268 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 3269 3270 // Generate code to check if the loop's trip count is less than VF * UF, or 3271 // equal to it in case a scalar epilogue is required; this implies that the 3272 // vector trip count is zero. This check also covers the case where adding one 3273 // to the backedge-taken count overflowed leading to an incorrect trip count 3274 // of zero. In this case we will also jump to the scalar loop. 3275 auto P = Cost->requiresScalarEpilogue(VF) ? ICmpInst::ICMP_ULE 3276 : ICmpInst::ICMP_ULT; 3277 3278 // If tail is to be folded, vector loop takes care of all iterations. 3279 Value *CheckMinIters = Builder.getFalse(); 3280 if (!Cost->foldTailByMasking()) { 3281 Value *Step = 3282 createStepForVF(Builder, ConstantInt::get(Count->getType(), UF), VF); 3283 CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check"); 3284 } 3285 // Create new preheader for vector loop. 3286 LoopVectorPreHeader = 3287 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, 3288 "vector.ph"); 3289 3290 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 3291 DT->getNode(Bypass)->getIDom()) && 3292 "TC check is expected to dominate Bypass"); 3293 3294 // Update dominator for Bypass & LoopExit (if needed). 3295 DT->changeImmediateDominator(Bypass, TCCheckBlock); 3296 if (!Cost->requiresScalarEpilogue(VF)) 3297 // If there is an epilogue which must run, there's no edge from the 3298 // middle block to exit blocks and thus no need to update the immediate 3299 // dominator of the exit blocks. 3300 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 3301 3302 ReplaceInstWithInst( 3303 TCCheckBlock->getTerminator(), 3304 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 3305 LoopBypassBlocks.push_back(TCCheckBlock); 3306 } 3307 3308 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { 3309 3310 BasicBlock *const SCEVCheckBlock = 3311 RTChecks.emitSCEVChecks(L, Bypass, LoopVectorPreHeader, LoopExitBlock); 3312 if (!SCEVCheckBlock) 3313 return nullptr; 3314 3315 assert(!(SCEVCheckBlock->getParent()->hasOptSize() || 3316 (OptForSizeBasedOnProfile && 3317 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) && 3318 "Cannot SCEV check stride or overflow when optimizing for size"); 3319 3320 3321 // Update dominator only if this is first RT check. 3322 if (LoopBypassBlocks.empty()) { 3323 DT->changeImmediateDominator(Bypass, SCEVCheckBlock); 3324 if (!Cost->requiresScalarEpilogue(VF)) 3325 // If there is an epilogue which must run, there's no edge from the 3326 // middle block to exit blocks and thus no need to update the immediate 3327 // dominator of the exit blocks. 3328 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock); 3329 } 3330 3331 LoopBypassBlocks.push_back(SCEVCheckBlock); 3332 AddedSafetyChecks = true; 3333 return SCEVCheckBlock; 3334 } 3335 3336 BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, 3337 BasicBlock *Bypass) { 3338 // VPlan-native path does not do any analysis for runtime checks currently. 3339 if (EnableVPlanNativePath) 3340 return nullptr; 3341 3342 BasicBlock *const MemCheckBlock = 3343 RTChecks.emitMemRuntimeChecks(L, Bypass, LoopVectorPreHeader); 3344 3345 // Check if we generated code that checks in runtime if arrays overlap. We put 3346 // the checks into a separate block to make the more common case of few 3347 // elements faster. 
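  // Conceptually (the details live in LoopAccessAnalysis), each emitted check
  // verifies that the address ranges touched by two pointer groups do not
  // overlap, roughly "A.end <= B.start || B.end <= A.start".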
3348 if (!MemCheckBlock) 3349 return nullptr; 3350 3351 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) { 3352 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && 3353 "Cannot emit memory checks when optimizing for size, unless forced " 3354 "to vectorize."); 3355 ORE->emit([&]() { 3356 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize", 3357 L->getStartLoc(), L->getHeader()) 3358 << "Code-size may be reduced by not forcing " 3359 "vectorization, or by source-code modifications " 3360 "eliminating the need for runtime checks " 3361 "(e.g., adding 'restrict')."; 3362 }); 3363 } 3364 3365 LoopBypassBlocks.push_back(MemCheckBlock); 3366 3367 AddedSafetyChecks = true; 3368 3369 // We currently don't use LoopVersioning for the actual loop cloning but we 3370 // still use it to add the noalias metadata. 3371 LVer = std::make_unique<LoopVersioning>( 3372 *Legal->getLAI(), 3373 Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, 3374 DT, PSE.getSE()); 3375 LVer->prepareNoAliasMetadata(); 3376 return MemCheckBlock; 3377 } 3378 3379 Value *InnerLoopVectorizer::emitTransformedIndex( 3380 IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL, 3381 const InductionDescriptor &ID) const { 3382 3383 SCEVExpander Exp(*SE, DL, "induction"); 3384 auto Step = ID.getStep(); 3385 auto StartValue = ID.getStartValue(); 3386 assert(Index->getType()->getScalarType() == Step->getType() && 3387 "Index scalar type does not match StepValue type"); 3388 3389 // Note: the IR at this point is broken. We cannot use SE to create any new 3390 // SCEV and then expand it, hoping that SCEV's simplification will give us 3391 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may 3392 // lead to various SCEV crashes. So all we can do is to use builder and rely 3393 // on InstCombine for future simplifications. Here we handle some trivial 3394 // cases only. 3395 auto CreateAdd = [&B](Value *X, Value *Y) { 3396 assert(X->getType() == Y->getType() && "Types don't match!"); 3397 if (auto *CX = dyn_cast<ConstantInt>(X)) 3398 if (CX->isZero()) 3399 return Y; 3400 if (auto *CY = dyn_cast<ConstantInt>(Y)) 3401 if (CY->isZero()) 3402 return X; 3403 return B.CreateAdd(X, Y); 3404 }; 3405 3406 // We allow X to be a vector type, in which case Y will potentially be 3407 // splatted into a vector with the same element count. 3408 auto CreateMul = [&B](Value *X, Value *Y) { 3409 assert(X->getType()->getScalarType() == Y->getType() && 3410 "Types don't match!"); 3411 if (auto *CX = dyn_cast<ConstantInt>(X)) 3412 if (CX->isOne()) 3413 return Y; 3414 if (auto *CY = dyn_cast<ConstantInt>(Y)) 3415 if (CY->isOne()) 3416 return X; 3417 VectorType *XVTy = dyn_cast<VectorType>(X->getType()); 3418 if (XVTy && !isa<VectorType>(Y->getType())) 3419 Y = B.CreateVectorSplat(XVTy->getElementCount(), Y); 3420 return B.CreateMul(X, Y); 3421 }; 3422 3423 // Get a suitable insert point for SCEV expansion. For blocks in the vector 3424 // loop, choose the end of the vector loop header (=LoopVectorBody), because 3425 // the DomTree is not kept up-to-date for additional blocks generated in the 3426 // vector loop. By using the header as insertion point, we guarantee that the 3427 // expanded instructions dominate all their uses. 
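  // (For reference, the value produced by the switch further below is, with
  // Start and Step taken from the induction descriptor:
  //   IK_IntInduction: Start + Index * Step
  //   IK_PtrInduction: &Start[Index * Step]
  //   IK_FpInduction:  Start fadd/fsub Index * Step.)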
  auto GetInsertPoint = [this, &B]() {
    BasicBlock *InsertBB = B.GetInsertPoint()->getParent();
    if (InsertBB != LoopVectorBody &&
        LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB))
      return LoopVectorBody->getTerminator();
    return &*B.GetInsertPoint();
  };

  switch (ID.getKind()) {
  case InductionDescriptor::IK_IntInduction: {
    assert(!isa<VectorType>(Index->getType()) &&
           "Vector indices not supported for integer inductions yet");
    assert(Index->getType() == StartValue->getType() &&
           "Index type does not match StartValue type");
    if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
      return B.CreateSub(StartValue, Index);
    auto *Offset = CreateMul(
        Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()));
    return CreateAdd(StartValue, Offset);
  }
  case InductionDescriptor::IK_PtrInduction: {
    assert(isa<SCEVConstant>(Step) &&
           "Expected constant step for pointer induction");
    return B.CreateGEP(
        StartValue->getType()->getPointerElementType(), StartValue,
        CreateMul(Index,
                  Exp.expandCodeFor(Step, Index->getType()->getScalarType(),
                                    GetInsertPoint())));
  }
  case InductionDescriptor::IK_FpInduction: {
    assert(!isa<VectorType>(Index->getType()) &&
           "Vector indices not supported for FP inductions yet");
    assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
    auto InductionBinOp = ID.getInductionBinOp();
    assert(InductionBinOp &&
           (InductionBinOp->getOpcode() == Instruction::FAdd ||
            InductionBinOp->getOpcode() == Instruction::FSub) &&
           "Original bin op should be defined for FP induction");

    Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
    Value *MulExp = B.CreateFMul(StepValue, Index);
    return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
                         "induction");
  }
  case InductionDescriptor::IK_NoInduction:
    return nullptr;
  }
  llvm_unreachable("invalid enum");
}

Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
  LoopScalarBody = OrigLoop->getHeader();
  LoopVectorPreHeader = OrigLoop->getLoopPreheader();
  assert(LoopVectorPreHeader && "Invalid loop structure");
  LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
  assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF)) &&
         "multiple exit loop without required epilogue?");

  LoopMiddleBlock =
      SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
                 LI, nullptr, Twine(Prefix) + "middle.block");
  LoopScalarPreHeader =
      SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
                 nullptr, Twine(Prefix) + "scalar.ph");

  auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();

  // Set up the middle block terminator. Two cases:
  // 1) If we know that we must execute the scalar epilogue, emit an
  //    unconditional branch.
  // 2) Otherwise, we must have a single unique exit block (due to how we
  //    implement the multiple exit case). In this case, set up a conditional
  //    branch from the middle block to the loop scalar preheader, and the
  //    exit block. completeLoopSkeleton will update the condition to use an
  //    iteration check, if required to decide whether to execute the remainder.
  BranchInst *BrInst = Cost->requiresScalarEpilogue(VF) ?
3504 BranchInst::Create(LoopScalarPreHeader) : 3505 BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, 3506 Builder.getTrue()); 3507 BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3508 ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst); 3509 3510 // We intentionally don't let SplitBlock to update LoopInfo since 3511 // LoopVectorBody should belong to another loop than LoopVectorPreHeader. 3512 // LoopVectorBody is explicitly added to the correct place few lines later. 3513 LoopVectorBody = 3514 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 3515 nullptr, nullptr, Twine(Prefix) + "vector.body"); 3516 3517 // Update dominator for loop exit. 3518 if (!Cost->requiresScalarEpilogue(VF)) 3519 // If there is an epilogue which must run, there's no edge from the 3520 // middle block to exit blocks and thus no need to update the immediate 3521 // dominator of the exit blocks. 3522 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); 3523 3524 // Create and register the new vector loop. 3525 Loop *Lp = LI->AllocateLoop(); 3526 Loop *ParentLoop = OrigLoop->getParentLoop(); 3527 3528 // Insert the new loop into the loop nest and register the new basic blocks 3529 // before calling any utilities such as SCEV that require valid LoopInfo. 3530 if (ParentLoop) { 3531 ParentLoop->addChildLoop(Lp); 3532 } else { 3533 LI->addTopLevelLoop(Lp); 3534 } 3535 Lp->addBasicBlockToLoop(LoopVectorBody, *LI); 3536 return Lp; 3537 } 3538 3539 void InnerLoopVectorizer::createInductionResumeValues( 3540 Loop *L, Value *VectorTripCount, 3541 std::pair<BasicBlock *, Value *> AdditionalBypass) { 3542 assert(VectorTripCount && L && "Expected valid arguments"); 3543 assert(((AdditionalBypass.first && AdditionalBypass.second) || 3544 (!AdditionalBypass.first && !AdditionalBypass.second)) && 3545 "Inconsistent information about additional bypass."); 3546 // We are going to resume the execution of the scalar loop. 3547 // Go over all of the induction variables that we found and fix the 3548 // PHIs that are left in the scalar version of the loop. 3549 // The starting values of PHI nodes depend on the counter of the last 3550 // iteration in the vectorized loop. 3551 // If we come from a bypass edge then we need to start from the original 3552 // start value. 3553 for (auto &InductionEntry : Legal->getInductionVars()) { 3554 PHINode *OrigPhi = InductionEntry.first; 3555 InductionDescriptor II = InductionEntry.second; 3556 3557 // Create phi nodes to merge from the backedge-taken check block. 3558 PHINode *BCResumeVal = 3559 PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", 3560 LoopScalarPreHeader->getTerminator()); 3561 // Copy original phi DL over to the new one. 3562 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); 3563 Value *&EndValue = IVEndValues[OrigPhi]; 3564 Value *EndValueFromAdditionalBypass = AdditionalBypass.second; 3565 if (OrigPhi == OldInduction) { 3566 // We know what the end value is. 3567 EndValue = VectorTripCount; 3568 } else { 3569 IRBuilder<> B(L->getLoopPreheader()->getTerminator()); 3570 3571 // Fast-math-flags propagate from the original induction instruction. 
3572 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) 3573 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); 3574 3575 Type *StepType = II.getStep()->getType(); 3576 Instruction::CastOps CastOp = 3577 CastInst::getCastOpcode(VectorTripCount, true, StepType, true); 3578 Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd"); 3579 const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout(); 3580 EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3581 EndValue->setName("ind.end"); 3582 3583 // Compute the end value for the additional bypass (if applicable). 3584 if (AdditionalBypass.first) { 3585 B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt())); 3586 CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true, 3587 StepType, true); 3588 CRD = 3589 B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd"); 3590 EndValueFromAdditionalBypass = 3591 emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3592 EndValueFromAdditionalBypass->setName("ind.end"); 3593 } 3594 } 3595 // The new PHI merges the original incoming value, in case of a bypass, 3596 // or the value at the end of the vectorized loop. 3597 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); 3598 3599 // Fix the scalar body counter (PHI node). 3600 // The old induction's phi node in the scalar body needs the truncated 3601 // value. 3602 for (BasicBlock *BB : LoopBypassBlocks) 3603 BCResumeVal->addIncoming(II.getStartValue(), BB); 3604 3605 if (AdditionalBypass.first) 3606 BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first, 3607 EndValueFromAdditionalBypass); 3608 3609 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); 3610 } 3611 } 3612 3613 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L, 3614 MDNode *OrigLoopID) { 3615 assert(L && "Expected valid loop."); 3616 3617 // The trip counts should be cached by now. 3618 Value *Count = getOrCreateTripCount(L); 3619 Value *VectorTripCount = getOrCreateVectorTripCount(L); 3620 3621 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3622 3623 // Add a check in the middle block to see if we have completed 3624 // all of the iterations in the first vector loop. Three cases: 3625 // 1) If we require a scalar epilogue, there is no conditional branch as 3626 // we unconditionally branch to the scalar preheader. Do nothing. 3627 // 2) If (N - N%VF) == N, then we *don't* need to run the remainder. 3628 // Thus if tail is to be folded, we know we don't need to run the 3629 // remainder and we can use the previous value for the condition (true). 3630 // 3) Otherwise, construct a runtime check. 3631 if (!Cost->requiresScalarEpilogue(VF) && !Cost->foldTailByMasking()) { 3632 Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, 3633 Count, VectorTripCount, "cmp.n", 3634 LoopMiddleBlock->getTerminator()); 3635 3636 // Here we use the same DebugLoc as the scalar loop latch terminator instead 3637 // of the corresponding compare because they may have ended up with 3638 // different line numbers and we want to avoid awkward line stepping while 3639 // debugging. Eg. if the compare has got a line number inside the loop. 3640 CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3641 cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN); 3642 } 3643 3644 // Get ready to start creating new instructions into the vectorized body. 
3645 assert(LoopVectorPreHeader == L->getLoopPreheader() && 3646 "Inconsistent vector loop preheader"); 3647 Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); 3648 3649 Optional<MDNode *> VectorizedLoopID = 3650 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 3651 LLVMLoopVectorizeFollowupVectorized}); 3652 if (VectorizedLoopID.hasValue()) { 3653 L->setLoopID(VectorizedLoopID.getValue()); 3654 3655 // Do not setAlreadyVectorized if loop attributes have been defined 3656 // explicitly. 3657 return LoopVectorPreHeader; 3658 } 3659 3660 // Keep all loop hints from the original loop on the vector loop (we'll 3661 // replace the vectorizer-specific hints below). 3662 if (MDNode *LID = OrigLoop->getLoopID()) 3663 L->setLoopID(LID); 3664 3665 LoopVectorizeHints Hints(L, true, *ORE); 3666 Hints.setAlreadyVectorized(); 3667 3668 #ifdef EXPENSIVE_CHECKS 3669 assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 3670 LI->verify(*DT); 3671 #endif 3672 3673 return LoopVectorPreHeader; 3674 } 3675 3676 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { 3677 /* 3678 In this function we generate a new loop. The new loop will contain 3679 the vectorized instructions while the old loop will continue to run the 3680 scalar remainder. 3681 3682 [ ] <-- loop iteration number check. 3683 / | 3684 / v 3685 | [ ] <-- vector loop bypass (may consist of multiple blocks). 3686 | / | 3687 | / v 3688 || [ ] <-- vector pre header. 3689 |/ | 3690 | v 3691 | [ ] \ 3692 | [ ]_| <-- vector loop. 3693 | | 3694 | v 3695 \ -[ ] <--- middle-block. 3696 \/ | 3697 /\ v 3698 | ->[ ] <--- new preheader. 3699 | | 3700 (opt) v <-- edge from middle to exit iff epilogue is not required. 3701 | [ ] \ 3702 | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue). 3703 \ | 3704 \ v 3705 >[ ] <-- exit block(s). 3706 ... 3707 */ 3708 3709 // Get the metadata of the original loop before it gets modified. 3710 MDNode *OrigLoopID = OrigLoop->getLoopID(); 3711 3712 // Workaround! Compute the trip count of the original loop and cache it 3713 // before we start modifying the CFG. This code has a systemic problem 3714 // wherein it tries to run analysis over partially constructed IR; this is 3715 // wrong, and not simply for SCEV. The trip count of the original loop 3716 // simply happens to be prone to hitting this in practice. In theory, we 3717 // can hit the same issue for any SCEV, or ValueTracking query done during 3718 // mutation. See PR49900. 3719 getOrCreateTripCount(OrigLoop); 3720 3721 // Create an empty vector loop, and prepare basic blocks for the runtime 3722 // checks. 3723 Loop *Lp = createVectorLoopSkeleton(""); 3724 3725 // Now, compare the new count to zero. If it is zero skip the vector loop and 3726 // jump to the scalar loop. This check also covers the case where the 3727 // backedge-taken count is uint##_max: adding one to it will overflow leading 3728 // to an incorrect trip count of zero. In this (rare) case we will also jump 3729 // to the scalar loop. 3730 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader); 3731 3732 // Generate the code to check any assumptions that we've made for SCEV 3733 // expressions. 3734 emitSCEVChecks(Lp, LoopScalarPreHeader); 3735 3736 // Generate the code that checks in runtime if arrays overlap. We put the 3737 // checks into a separate block to make the more common case of few elements 3738 // faster. 
  emitMemRuntimeChecks(Lp, LoopScalarPreHeader);

  // Some loops have a single integer induction variable, while other loops
  // don't. One example is C++ iterators, which often have multiple pointer
  // induction variables. In the code below we also support a case where we
  // don't have a single induction variable.
  //
  // We try as hard as possible to obtain an induction variable from the
  // original loop. However, if we don't find one that:
  //   - is an integer
  //   - counts from zero, stepping by one
  //   - is the size of the widest induction variable type
  // then we create a new one.
  OldInduction = Legal->getPrimaryInduction();
  Type *IdxTy = Legal->getWidestInductionType();
  Value *StartIdx = ConstantInt::get(IdxTy, 0);
  // The loop step is equal to the vectorization factor (num of SIMD elements)
  // times the unroll factor (num of SIMD instructions).
  Builder.SetInsertPoint(&*Lp->getHeader()->getFirstInsertionPt());
  Value *Step = createStepForVF(Builder, ConstantInt::get(IdxTy, UF), VF);
  Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
  Induction =
      createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
                              getDebugLocFromInstOrOperands(OldInduction));

  // Emit phis for the new starting index of the scalar loop.
  createInductionResumeValues(Lp, CountRoundDown);

  return completeLoopSkeleton(Lp, OrigLoopID);
}

// Fix up external users of the induction variable. At this point, we are
// in LCSSA form, with all external PHIs that use the IV having one input value,
// coming from the remainder loop. We need those PHIs to also have a correct
// value for the IV when arriving directly from the middle block.
void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
                                       const InductionDescriptor &II,
                                       Value *CountRoundDown, Value *EndValue,
                                       BasicBlock *MiddleBlock) {
  // There are two kinds of external IV usages: those that use the value
  // computed in the last iteration (the PHI) and those that use the penultimate
  // value (the value that feeds into the phi from the loop latch).
  // We allow both, but they obviously have different values.

  assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");

  DenseMap<Value *, Value *> MissingVals;

  // An external user of the last iteration's value should see the value that
  // the remainder loop uses to initialize its own IV.
  Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
  for (User *U : PostInc->users()) {
    Instruction *UI = cast<Instruction>(U);
    if (!OrigLoop->contains(UI)) {
      assert(isa<PHINode>(UI) && "Expected LCSSA form");
      MissingVals[UI] = EndValue;
    }
  }

  // An external user of the penultimate value needs to see EndValue - Step.
  // The simplest way to get this is to recompute it from the constituent SCEVs,
  // that is Start + (Step * (CRD - 1)).
  for (User *U : OrigPhi->users()) {
    auto *UI = cast<Instruction>(U);
    if (!OrigLoop->contains(UI)) {
      const DataLayout &DL =
          OrigLoop->getHeader()->getModule()->getDataLayout();
      assert(isa<PHINode>(UI) && "Expected LCSSA form");

      IRBuilder<> B(MiddleBlock->getTerminator());

      // Fast-math-flags propagate from the original induction instruction.
3811 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) 3812 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); 3813 3814 Value *CountMinusOne = B.CreateSub( 3815 CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1)); 3816 Value *CMO = 3817 !II.getStep()->getType()->isIntegerTy() 3818 ? B.CreateCast(Instruction::SIToFP, CountMinusOne, 3819 II.getStep()->getType()) 3820 : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType()); 3821 CMO->setName("cast.cmo"); 3822 Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II); 3823 Escape->setName("ind.escape"); 3824 MissingVals[UI] = Escape; 3825 } 3826 } 3827 3828 for (auto &I : MissingVals) { 3829 PHINode *PHI = cast<PHINode>(I.first); 3830 // One corner case we have to handle is two IVs "chasing" each-other, 3831 // that is %IV2 = phi [...], [ %IV1, %latch ] 3832 // In this case, if IV1 has an external use, we need to avoid adding both 3833 // "last value of IV1" and "penultimate value of IV2". So, verify that we 3834 // don't already have an incoming value for the middle block. 3835 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) 3836 PHI->addIncoming(I.second, MiddleBlock); 3837 } 3838 } 3839 3840 namespace { 3841 3842 struct CSEDenseMapInfo { 3843 static bool canHandle(const Instruction *I) { 3844 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || 3845 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); 3846 } 3847 3848 static inline Instruction *getEmptyKey() { 3849 return DenseMapInfo<Instruction *>::getEmptyKey(); 3850 } 3851 3852 static inline Instruction *getTombstoneKey() { 3853 return DenseMapInfo<Instruction *>::getTombstoneKey(); 3854 } 3855 3856 static unsigned getHashValue(const Instruction *I) { 3857 assert(canHandle(I) && "Unknown instruction!"); 3858 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), 3859 I->value_op_end())); 3860 } 3861 3862 static bool isEqual(const Instruction *LHS, const Instruction *RHS) { 3863 if (LHS == getEmptyKey() || RHS == getEmptyKey() || 3864 LHS == getTombstoneKey() || RHS == getTombstoneKey()) 3865 return LHS == RHS; 3866 return LHS->isIdenticalTo(RHS); 3867 } 3868 }; 3869 3870 } // end anonymous namespace 3871 3872 ///Perform cse of induction variable instructions. 3873 static void cse(BasicBlock *BB) { 3874 // Perform simple cse. 3875 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 3876 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) { 3877 Instruction *In = &*I++; 3878 3879 if (!CSEDenseMapInfo::canHandle(In)) 3880 continue; 3881 3882 // Check if we can replace this instruction with any of the 3883 // visited instructions. 3884 if (Instruction *V = CSEMap.lookup(In)) { 3885 In->replaceAllUsesWith(V); 3886 In->eraseFromParent(); 3887 continue; 3888 } 3889 3890 CSEMap[In] = In; 3891 } 3892 } 3893 3894 InstructionCost 3895 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF, 3896 bool &NeedToScalarize) const { 3897 Function *F = CI->getCalledFunction(); 3898 Type *ScalarRetTy = CI->getType(); 3899 SmallVector<Type *, 4> Tys, ScalarTys; 3900 for (auto &ArgOp : CI->arg_operands()) 3901 ScalarTys.push_back(ArgOp->getType()); 3902 3903 // Estimate cost of scalarized vector call. The source operands are assumed 3904 // to be vectors, so we need to extract individual elements from there, 3905 // execute VF scalar calls, and then gather the result into the vector return 3906 // value. 
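  // Schematically, the estimate computed below is
  //   Cost = VF * ScalarCallCost + ScalarizationOverhead(extracts + inserts)
  // and, if a suitable vector variant of the callee exists, it is compared
  // against the cost of a single call to that variant.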
3907 InstructionCost ScalarCallCost = 3908 TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput); 3909 if (VF.isScalar()) 3910 return ScalarCallCost; 3911 3912 // Compute corresponding vector type for return value and arguments. 3913 Type *RetTy = ToVectorTy(ScalarRetTy, VF); 3914 for (Type *ScalarTy : ScalarTys) 3915 Tys.push_back(ToVectorTy(ScalarTy, VF)); 3916 3917 // Compute costs of unpacking argument values for the scalar calls and 3918 // packing the return values to a vector. 3919 InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF); 3920 3921 InstructionCost Cost = 3922 ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost; 3923 3924 // If we can't emit a vector call for this function, then the currently found 3925 // cost is the cost we need to return. 3926 NeedToScalarize = true; 3927 VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 3928 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); 3929 3930 if (!TLI || CI->isNoBuiltin() || !VecFunc) 3931 return Cost; 3932 3933 // If the corresponding vector cost is cheaper, return its cost. 3934 InstructionCost VectorCallCost = 3935 TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput); 3936 if (VectorCallCost < Cost) { 3937 NeedToScalarize = false; 3938 Cost = VectorCallCost; 3939 } 3940 return Cost; 3941 } 3942 3943 static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) { 3944 if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy())) 3945 return Elt; 3946 return VectorType::get(Elt, VF); 3947 } 3948 3949 InstructionCost 3950 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, 3951 ElementCount VF) const { 3952 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 3953 assert(ID && "Expected intrinsic call!"); 3954 Type *RetTy = MaybeVectorizeType(CI->getType(), VF); 3955 FastMathFlags FMF; 3956 if (auto *FPMO = dyn_cast<FPMathOperator>(CI)) 3957 FMF = FPMO->getFastMathFlags(); 3958 3959 SmallVector<const Value *> Arguments(CI->arg_begin(), CI->arg_end()); 3960 FunctionType *FTy = CI->getCalledFunction()->getFunctionType(); 3961 SmallVector<Type *> ParamTys; 3962 std::transform(FTy->param_begin(), FTy->param_end(), 3963 std::back_inserter(ParamTys), 3964 [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); }); 3965 3966 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF, 3967 dyn_cast<IntrinsicInst>(CI)); 3968 return TTI.getIntrinsicInstrCost(CostAttrs, 3969 TargetTransformInfo::TCK_RecipThroughput); 3970 } 3971 3972 static Type *smallestIntegerVectorType(Type *T1, Type *T2) { 3973 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3974 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3975 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; 3976 } 3977 3978 static Type *largestIntegerVectorType(Type *T1, Type *T2) { 3979 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3980 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3981 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; 3982 } 3983 3984 void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) { 3985 // For every instruction `I` in MinBWs, truncate the operands, create a 3986 // truncated version of `I` and reextend its result. InstCombine runs 3987 // later and will remove any ext/trunc pairs. 
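  // For example (types are illustrative), an i32 add whose result only needs
  // 8 bits is rewritten as:
  //   %a8   = trunc <4 x i32> %a to <4 x i8>
  //   %b8   = trunc <4 x i32> %b to <4 x i8>
  //   %add8 = add <4 x i8> %a8, %b8
  //   %res  = zext <4 x i8> %add8 to <4 x i32>
  // and InstCombine is expected to clean up any redundant trunc/ext pairs.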
3988 SmallPtrSet<Value *, 4> Erased; 3989 for (const auto &KV : Cost->getMinimalBitwidths()) { 3990 // If the value wasn't vectorized, we must maintain the original scalar 3991 // type. The absence of the value from State indicates that it 3992 // wasn't vectorized. 3993 VPValue *Def = State.Plan->getVPValue(KV.first); 3994 if (!State.hasAnyVectorValue(Def)) 3995 continue; 3996 for (unsigned Part = 0; Part < UF; ++Part) { 3997 Value *I = State.get(Def, Part); 3998 if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I)) 3999 continue; 4000 Type *OriginalTy = I->getType(); 4001 Type *ScalarTruncatedTy = 4002 IntegerType::get(OriginalTy->getContext(), KV.second); 4003 auto *TruncatedTy = VectorType::get( 4004 ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getElementCount()); 4005 if (TruncatedTy == OriginalTy) 4006 continue; 4007 4008 IRBuilder<> B(cast<Instruction>(I)); 4009 auto ShrinkOperand = [&](Value *V) -> Value * { 4010 if (auto *ZI = dyn_cast<ZExtInst>(V)) 4011 if (ZI->getSrcTy() == TruncatedTy) 4012 return ZI->getOperand(0); 4013 return B.CreateZExtOrTrunc(V, TruncatedTy); 4014 }; 4015 4016 // The actual instruction modification depends on the instruction type, 4017 // unfortunately. 4018 Value *NewI = nullptr; 4019 if (auto *BO = dyn_cast<BinaryOperator>(I)) { 4020 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)), 4021 ShrinkOperand(BO->getOperand(1))); 4022 4023 // Any wrapping introduced by shrinking this operation shouldn't be 4024 // considered undefined behavior. So, we can't unconditionally copy 4025 // arithmetic wrapping flags to NewI. 4026 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false); 4027 } else if (auto *CI = dyn_cast<ICmpInst>(I)) { 4028 NewI = 4029 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)), 4030 ShrinkOperand(CI->getOperand(1))); 4031 } else if (auto *SI = dyn_cast<SelectInst>(I)) { 4032 NewI = B.CreateSelect(SI->getCondition(), 4033 ShrinkOperand(SI->getTrueValue()), 4034 ShrinkOperand(SI->getFalseValue())); 4035 } else if (auto *CI = dyn_cast<CastInst>(I)) { 4036 switch (CI->getOpcode()) { 4037 default: 4038 llvm_unreachable("Unhandled cast!"); 4039 case Instruction::Trunc: 4040 NewI = ShrinkOperand(CI->getOperand(0)); 4041 break; 4042 case Instruction::SExt: 4043 NewI = B.CreateSExtOrTrunc( 4044 CI->getOperand(0), 4045 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 4046 break; 4047 case Instruction::ZExt: 4048 NewI = B.CreateZExtOrTrunc( 4049 CI->getOperand(0), 4050 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 4051 break; 4052 } 4053 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { 4054 auto Elements0 = 4055 cast<VectorType>(SI->getOperand(0)->getType())->getElementCount(); 4056 auto *O0 = B.CreateZExtOrTrunc( 4057 SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0)); 4058 auto Elements1 = 4059 cast<VectorType>(SI->getOperand(1)->getType())->getElementCount(); 4060 auto *O1 = B.CreateZExtOrTrunc( 4061 SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1)); 4062 4063 NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask()); 4064 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) { 4065 // Don't do anything with the operands, just extend the result. 
4066 continue; 4067 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { 4068 auto Elements = 4069 cast<VectorType>(IE->getOperand(0)->getType())->getElementCount(); 4070 auto *O0 = B.CreateZExtOrTrunc( 4071 IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 4072 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); 4073 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); 4074 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { 4075 auto Elements = 4076 cast<VectorType>(EE->getOperand(0)->getType())->getElementCount(); 4077 auto *O0 = B.CreateZExtOrTrunc( 4078 EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 4079 NewI = B.CreateExtractElement(O0, EE->getOperand(2)); 4080 } else { 4081 // If we don't know what to do, be conservative and don't do anything. 4082 continue; 4083 } 4084 4085 // Lastly, extend the result. 4086 NewI->takeName(cast<Instruction>(I)); 4087 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); 4088 I->replaceAllUsesWith(Res); 4089 cast<Instruction>(I)->eraseFromParent(); 4090 Erased.insert(I); 4091 State.reset(Def, Res, Part); 4092 } 4093 } 4094 4095 // We'll have created a bunch of ZExts that are now parentless. Clean up. 4096 for (const auto &KV : Cost->getMinimalBitwidths()) { 4097 // If the value wasn't vectorized, we must maintain the original scalar 4098 // type. The absence of the value from State indicates that it 4099 // wasn't vectorized. 4100 VPValue *Def = State.Plan->getVPValue(KV.first); 4101 if (!State.hasAnyVectorValue(Def)) 4102 continue; 4103 for (unsigned Part = 0; Part < UF; ++Part) { 4104 Value *I = State.get(Def, Part); 4105 ZExtInst *Inst = dyn_cast<ZExtInst>(I); 4106 if (Inst && Inst->use_empty()) { 4107 Value *NewI = Inst->getOperand(0); 4108 Inst->eraseFromParent(); 4109 State.reset(Def, NewI, Part); 4110 } 4111 } 4112 } 4113 } 4114 4115 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) { 4116 // Insert truncates and extends for any truncated instructions as hints to 4117 // InstCombine. 4118 if (VF.isVector()) 4119 truncateToMinimalBitwidths(State); 4120 4121 // Fix widened non-induction PHIs by setting up the PHI operands. 4122 if (OrigPHIsToFix.size()) { 4123 assert(EnableVPlanNativePath && 4124 "Unexpected non-induction PHIs for fixup in non VPlan-native path"); 4125 fixNonInductionPHIs(State); 4126 } 4127 4128 // At this point every instruction in the original loop is widened to a 4129 // vector form. Now we need to fix the recurrences in the loop. These PHI 4130 // nodes are currently empty because we did not want to introduce cycles. 4131 // This is the second stage of vectorizing recurrences. 4132 fixCrossIterationPHIs(State); 4133 4134 // Forget the original basic block. 4135 PSE.getSE()->forgetLoop(OrigLoop); 4136 4137 // If we inserted an edge from the middle block to the unique exit block, 4138 // update uses outside the loop (phis) to account for the newly inserted 4139 // edge. 4140 if (!Cost->requiresScalarEpilogue(VF)) { 4141 // Fix-up external users of the induction variables. 4142 for (auto &Entry : Legal->getInductionVars()) 4143 fixupIVUsers(Entry.first, Entry.second, 4144 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)), 4145 IVEndValues[Entry.first], LoopMiddleBlock); 4146 4147 fixLCSSAPHIs(State); 4148 } 4149 4150 for (Instruction *PI : PredicatedInstructions) 4151 sinkScalarOperands(&*PI); 4152 4153 // Remove redundant induction instructions. 
4154 cse(LoopVectorBody); 4155 4156 // Set/update profile weights for the vector and remainder loops as original 4157 // loop iterations are now distributed among them. Note that original loop 4158 // represented by LoopScalarBody becomes remainder loop after vectorization. 4159 // 4160 // For cases like foldTailByMasking() and requiresScalarEpilogue() we may 4161 // end up getting a slightly roughened result but that should be OK since 4162 // profile is not inherently precise anyway. Note also that a possible bypass of 4163 // vector code caused by legality checks is ignored, assigning all the weight 4164 // to the vector loop, optimistically. 4165 // 4166 // For scalable vectorization we can't know at compile time how many iterations 4167 // of the loop are handled in one vector iteration, so instead assume a pessimistic 4168 // vscale of '1'. 4169 setProfileInfoAfterUnrolling( 4170 LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody), 4171 LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF); 4172 } 4173 4174 void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) { 4175 // In order to support recurrences we need to be able to vectorize Phi nodes. 4176 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 4177 // stage #2: We now need to fix the recurrences by adding incoming edges to 4178 // the currently empty PHI nodes. At this point every instruction in the 4179 // original loop is widened to a vector form so we can use them to construct 4180 // the incoming edges. 4181 VPBasicBlock *Header = State.Plan->getEntry()->getEntryBasicBlock(); 4182 for (VPRecipeBase &R : Header->phis()) { 4183 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) 4184 fixReduction(ReductionPhi, State); 4185 else if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R)) 4186 fixFirstOrderRecurrence(FOR, State); 4187 } 4188 } 4189 4190 void InnerLoopVectorizer::fixFirstOrderRecurrence(VPWidenPHIRecipe *PhiR, 4191 VPTransformState &State) { 4192 // This is the second phase of vectorizing first-order recurrences. An 4193 // overview of the transformation is described below. Suppose we have the 4194 // following loop. 4195 // 4196 // for (int i = 0; i < n; ++i) 4197 // b[i] = a[i] - a[i - 1]; 4198 // 4199 // There is a first-order recurrence on "a". For this loop, the shorthand 4200 // scalar IR looks like: 4201 // 4202 // scalar.ph: 4203 // s_init = a[-1] 4204 // br scalar.body 4205 // 4206 // scalar.body: 4207 // i = phi [0, scalar.ph], [i+1, scalar.body] 4208 // s1 = phi [s_init, scalar.ph], [s2, scalar.body] 4209 // s2 = a[i] 4210 // b[i] = s2 - s1 4211 // br cond, scalar.body, ... 4212 // 4213 // In this example, s1 is a recurrence because its value depends on the 4214 // previous iteration. In the first phase of vectorization, we created a 4215 // vector phi v1 for s1. We now complete the vectorization and produce the 4216 // shorthand vector IR shown below (for VF = 4, UF = 1).
4217 // 4218 // vector.ph: 4219 // v_init = vector(..., ..., ..., a[-1]) 4220 // br vector.body 4221 // 4222 // vector.body 4223 // i = phi [0, vector.ph], [i+4, vector.body] 4224 // v1 = phi [v_init, vector.ph], [v2, vector.body] 4225 // v2 = a[i, i+1, i+2, i+3]; 4226 // v3 = vector(v1(3), v2(0, 1, 2)) 4227 // b[i, i+1, i+2, i+3] = v2 - v3 4228 // br cond, vector.body, middle.block 4229 // 4230 // middle.block: 4231 // x = v2(3) 4232 // br scalar.ph 4233 // 4234 // scalar.ph: 4235 // s_init = phi [x, middle.block], [a[-1], otherwise] 4236 // br scalar.body 4237 // 4238 // After execution completes the vector loop, we extract the next value of 4239 // the recurrence (x) to use as the initial value in the scalar loop. 4240 4241 // Extract the last vector element in the middle block. This will be the 4242 // initial value for the recurrence when jumping to the scalar loop. 4243 VPValue *PreviousDef = PhiR->getBackedgeValue(); 4244 Value *Incoming = State.get(PreviousDef, UF - 1); 4245 auto *ExtractForScalar = Incoming; 4246 auto *IdxTy = Builder.getInt32Ty(); 4247 if (VF.isVector()) { 4248 auto *One = ConstantInt::get(IdxTy, 1); 4249 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4250 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF); 4251 auto *LastIdx = Builder.CreateSub(RuntimeVF, One); 4252 ExtractForScalar = Builder.CreateExtractElement(ExtractForScalar, LastIdx, 4253 "vector.recur.extract"); 4254 } 4255 // Extract the second last element in the middle block if the 4256 // Phi is used outside the loop. We need to extract the phi itself 4257 // and not the last element (the phi update in the current iteration). This 4258 // will be the value when jumping to the exit block from the LoopMiddleBlock, 4259 // when the scalar loop is not run at all. 4260 Value *ExtractForPhiUsedOutsideLoop = nullptr; 4261 if (VF.isVector()) { 4262 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF); 4263 auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2)); 4264 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement( 4265 Incoming, Idx, "vector.recur.extract.for.phi"); 4266 } else if (UF > 1) 4267 // When loop is unrolled without vectorizing, initialize 4268 // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value 4269 // of `Incoming`. This is analogous to the vectorized case above: extracting 4270 // the second last element when VF > 1. 4271 ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2); 4272 4273 // Fix the initial value of the original recurrence in the scalar loop. 4274 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin()); 4275 PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue()); 4276 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init"); 4277 auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue(); 4278 for (auto *BB : predecessors(LoopScalarPreHeader)) { 4279 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit; 4280 Start->addIncoming(Incoming, BB); 4281 } 4282 4283 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start); 4284 Phi->setName("scalar.recur"); 4285 4286 // Finally, fix users of the recurrence outside the loop. The users will need 4287 // either the last value of the scalar recurrence or the last value of the 4288 // vector recurrence we extracted in the middle block. Since the loop is in 4289 // LCSSA form, we just need to find all the phi nodes for the original scalar 4290 // recurrence in the exit block, and then add an edge for the middle block. 
4291 // Note that LCSSA does not imply single entry when the original scalar loop 4292 // had multiple exiting edges (as we always run the last iteration in the 4293 // scalar epilogue); in that case, there is no edge from middle to exit, 4294 // and thus no phis which need to be updated. 4295 if (!Cost->requiresScalarEpilogue(VF)) 4296 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) 4297 if (any_of(LCSSAPhi.incoming_values(), 4298 [Phi](Value *V) { return V == Phi; })) 4299 LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock); 4300 } 4301 4302 void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR, 4303 VPTransformState &State) { 4304 PHINode *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue()); 4305 // Get its reduction variable descriptor. 4306 assert(Legal->isReductionVariable(OrigPhi) && 4307 "Unable to find the reduction variable"); 4308 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); 4309 4310 RecurKind RK = RdxDesc.getRecurrenceKind(); 4311 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue(); 4312 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr(); 4313 setDebugLocFromInst(ReductionStartValue); 4314 4315 VPValue *LoopExitInstDef = PhiR->getBackedgeValue(); 4316 // This is the vector-clone of the value that leaves the loop. 4317 Type *VecTy = State.get(LoopExitInstDef, 0)->getType(); 4318 4319 // Wrap flags are in general invalid after vectorization, clear them. 4320 clearReductionWrapFlags(RdxDesc, State); 4321 4322 // Before each round, move the insertion point right between 4323 // the PHIs and the values we are going to write. 4324 // This allows us to write both PHINodes and the extractelement 4325 // instructions. 4326 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 4327 4328 setDebugLocFromInst(LoopExitInst); 4329 4330 Type *PhiTy = OrigPhi->getType(); 4331 // If tail is folded by masking, the vector value to leave the loop should be 4332 // a Select choosing between the vectorized LoopExitInst and vectorized Phi, 4333 // instead of the former. For an inloop reduction the reduction will already 4334 // be predicated, and does not need to be handled here. 4335 if (Cost->foldTailByMasking() && !PhiR->isInLoop()) { 4336 for (unsigned Part = 0; Part < UF; ++Part) { 4337 Value *VecLoopExitInst = State.get(LoopExitInstDef, Part); 4338 Value *Sel = nullptr; 4339 for (User *U : VecLoopExitInst->users()) { 4340 if (isa<SelectInst>(U)) { 4341 assert(!Sel && "Reduction exit feeding two selects"); 4342 Sel = U; 4343 } else 4344 assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select"); 4345 } 4346 assert(Sel && "Reduction exit feeds no select"); 4347 State.reset(LoopExitInstDef, Sel, Part); 4348 4349 // If the target can create a predicated operator for the reduction at no 4350 // extra cost in the loop (for example a predicated vadd), it can be 4351 // cheaper for the select to remain in the loop than be sunk out of it, 4352 // and so use the select value for the phi instead of the old 4353 // LoopExitValue.
4354 if (PreferPredicatedReductionSelect || 4355 TTI->preferPredicatedReductionSelect( 4356 RdxDesc.getOpcode(), PhiTy, 4357 TargetTransformInfo::ReductionFlags())) { 4358 auto *VecRdxPhi = 4359 cast<PHINode>(State.get(PhiR->getVPSingleValue(), Part)); 4360 VecRdxPhi->setIncomingValueForBlock( 4361 LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel); 4362 } 4363 } 4364 } 4365 4366 // If the vector reduction can be performed in a smaller type, we truncate 4367 // then extend the loop exit value to enable InstCombine to evaluate the 4368 // entire expression in the smaller type. 4369 if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) { 4370 assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!"); 4371 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); 4372 Builder.SetInsertPoint( 4373 LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator()); 4374 VectorParts RdxParts(UF); 4375 for (unsigned Part = 0; Part < UF; ++Part) { 4376 RdxParts[Part] = State.get(LoopExitInstDef, Part); 4377 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4378 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy) 4379 : Builder.CreateZExt(Trunc, VecTy); 4380 for (Value::user_iterator UI = RdxParts[Part]->user_begin(); 4381 UI != RdxParts[Part]->user_end();) 4382 if (*UI != Trunc) { 4383 (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd); 4384 RdxParts[Part] = Extnd; 4385 } else { 4386 ++UI; 4387 } 4388 } 4389 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 4390 for (unsigned Part = 0; Part < UF; ++Part) { 4391 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4392 State.reset(LoopExitInstDef, RdxParts[Part], Part); 4393 } 4394 } 4395 4396 // Reduce all of the unrolled parts into a single vector. 4397 Value *ReducedPartRdx = State.get(LoopExitInstDef, 0); 4398 unsigned Op = RecurrenceDescriptor::getOpcode(RK); 4399 4400 // The middle block terminator has already been assigned a DebugLoc here (the 4401 // OrigLoop's single latch terminator). We want the whole middle block to 4402 // appear to execute on this line because: (a) it is all compiler generated, 4403 // (b) these instructions are always executed after evaluating the latch 4404 // conditional branch, and (c) other passes may add new predecessors which 4405 // terminate on this line. This is the easiest way to ensure we don't 4406 // accidentally cause an extra step back into the loop while debugging. 4407 setDebugLocFromInst(LoopMiddleBlock->getTerminator()); 4408 if (PhiR->isOrdered()) 4409 ReducedPartRdx = State.get(LoopExitInstDef, UF - 1); 4410 else { 4411 // Floating-point operations should have some FMF to enable the reduction. 4412 IRBuilderBase::FastMathFlagGuard FMFG(Builder); 4413 Builder.setFastMathFlags(RdxDesc.getFastMathFlags()); 4414 for (unsigned Part = 1; Part < UF; ++Part) { 4415 Value *RdxPart = State.get(LoopExitInstDef, Part); 4416 if (Op != Instruction::ICmp && Op != Instruction::FCmp) { 4417 ReducedPartRdx = Builder.CreateBinOp( 4418 (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx"); 4419 } else { 4420 ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart); 4421 } 4422 } 4423 } 4424 4425 // Create the reduction after the loop. Note that inloop reductions create the 4426 // target reduction in the loop using a Reduction recipe. 
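// Illustrative shorthand of what the middle block typically ends up with for
// an integer add reduction at VF = 4 (the exact intrinsic and whether an
// extension follows depend on the recurrence kind and type):
//   %rdx = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %bin.rdx)
// possibly followed by a sext/zext back to the original phi type.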
4427 if (VF.isVector() && !PhiR->isInLoop()) { 4428 ReducedPartRdx = 4429 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx); 4430 // If the reduction can be performed in a smaller type, we need to extend 4431 // the reduction to the wider type before we branch to the original loop. 4432 if (PhiTy != RdxDesc.getRecurrenceType()) 4433 ReducedPartRdx = RdxDesc.isSigned() 4434 ? Builder.CreateSExt(ReducedPartRdx, PhiTy) 4435 : Builder.CreateZExt(ReducedPartRdx, PhiTy); 4436 } 4437 4438 // Create a phi node that merges control-flow from the backedge-taken check 4439 // block and the middle block. 4440 PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx", 4441 LoopScalarPreHeader->getTerminator()); 4442 for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) 4443 BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]); 4444 BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); 4445 4446 // Now, we need to fix the users of the reduction variable 4447 // inside and outside of the scalar remainder loop. 4448 4449 // We know that the loop is in LCSSA form. We need to update the PHI nodes 4450 // in the exit blocks. See comment on analogous loop in 4451 // fixFirstOrderRecurrence for a more complete explaination of the logic. 4452 if (!Cost->requiresScalarEpilogue(VF)) 4453 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) 4454 if (any_of(LCSSAPhi.incoming_values(), 4455 [LoopExitInst](Value *V) { return V == LoopExitInst; })) 4456 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock); 4457 4458 // Fix the scalar loop reduction variable with the incoming reduction sum 4459 // from the vector body and from the backedge value. 4460 int IncomingEdgeBlockIdx = 4461 OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch()); 4462 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); 4463 // Pick the other block. 4464 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); 4465 OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); 4466 OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); 4467 } 4468 4469 void InnerLoopVectorizer::clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc, 4470 VPTransformState &State) { 4471 RecurKind RK = RdxDesc.getRecurrenceKind(); 4472 if (RK != RecurKind::Add && RK != RecurKind::Mul) 4473 return; 4474 4475 Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr(); 4476 assert(LoopExitInstr && "null loop exit instruction"); 4477 SmallVector<Instruction *, 8> Worklist; 4478 SmallPtrSet<Instruction *, 8> Visited; 4479 Worklist.push_back(LoopExitInstr); 4480 Visited.insert(LoopExitInstr); 4481 4482 while (!Worklist.empty()) { 4483 Instruction *Cur = Worklist.pop_back_val(); 4484 if (isa<OverflowingBinaryOperator>(Cur)) 4485 for (unsigned Part = 0; Part < UF; ++Part) { 4486 Value *V = State.get(State.Plan->getVPValue(Cur), Part); 4487 cast<Instruction>(V)->dropPoisonGeneratingFlags(); 4488 } 4489 4490 for (User *U : Cur->users()) { 4491 Instruction *UI = cast<Instruction>(U); 4492 if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) && 4493 Visited.insert(UI).second) 4494 Worklist.push_back(UI); 4495 } 4496 } 4497 } 4498 4499 void InnerLoopVectorizer::fixLCSSAPHIs(VPTransformState &State) { 4500 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 4501 if (LCSSAPhi.getBasicBlockIndex(LoopMiddleBlock) != -1) 4502 // Some phis were already hand updated by the reduction and recurrence 4503 // code above, leave them alone. 
4504 continue; 4505 4506 auto *IncomingValue = LCSSAPhi.getIncomingValue(0); 4507 // Non-instruction incoming values will have only one value. 4508 4509 VPLane Lane = VPLane::getFirstLane(); 4510 if (isa<Instruction>(IncomingValue) && 4511 !Cost->isUniformAfterVectorization(cast<Instruction>(IncomingValue), 4512 VF)) 4513 Lane = VPLane::getLastLaneForVF(VF); 4514 4515 // Can be a loop invariant incoming value or the last scalar value to be 4516 // extracted from the vectorized loop. 4517 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4518 Value *lastIncomingValue = 4519 OrigLoop->isLoopInvariant(IncomingValue) 4520 ? IncomingValue 4521 : State.get(State.Plan->getVPValue(IncomingValue), 4522 VPIteration(UF - 1, Lane)); 4523 LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock); 4524 } 4525 } 4526 4527 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { 4528 // The basic block and loop containing the predicated instruction. 4529 auto *PredBB = PredInst->getParent(); 4530 auto *VectorLoop = LI->getLoopFor(PredBB); 4531 4532 // Initialize a worklist with the operands of the predicated instruction. 4533 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end()); 4534 4535 // Holds instructions that we need to analyze again. An instruction may be 4536 // reanalyzed if we don't yet know if we can sink it or not. 4537 SmallVector<Instruction *, 8> InstsToReanalyze; 4538 4539 // Returns true if a given use occurs in the predicated block. Phi nodes use 4540 // their operands in their corresponding predecessor blocks. 4541 auto isBlockOfUsePredicated = [&](Use &U) -> bool { 4542 auto *I = cast<Instruction>(U.getUser()); 4543 BasicBlock *BB = I->getParent(); 4544 if (auto *Phi = dyn_cast<PHINode>(I)) 4545 BB = Phi->getIncomingBlock( 4546 PHINode::getIncomingValueNumForOperand(U.getOperandNo())); 4547 return BB == PredBB; 4548 }; 4549 4550 // Iteratively sink the scalarized operands of the predicated instruction 4551 // into the block we created for it. When an instruction is sunk, it's 4552 // operands are then added to the worklist. The algorithm ends after one pass 4553 // through the worklist doesn't sink a single instruction. 4554 bool Changed; 4555 do { 4556 // Add the instructions that need to be reanalyzed to the worklist, and 4557 // reset the changed indicator. 4558 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end()); 4559 InstsToReanalyze.clear(); 4560 Changed = false; 4561 4562 while (!Worklist.empty()) { 4563 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val()); 4564 4565 // We can't sink an instruction if it is a phi node, is not in the loop, 4566 // or may have side effects. 4567 if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) || 4568 I->mayHaveSideEffects()) 4569 continue; 4570 4571 // If the instruction is already in PredBB, check if we can sink its 4572 // operands. In that case, VPlan's sinkScalarOperands() succeeded in 4573 // sinking the scalar instruction I, hence it appears in PredBB; but it 4574 // may have failed to sink I's operands (recursively), which we try 4575 // (again) here. 4576 if (I->getParent() == PredBB) { 4577 Worklist.insert(I->op_begin(), I->op_end()); 4578 continue; 4579 } 4580 4581 // It's legal to sink the instruction if all its uses occur in the 4582 // predicated block. Otherwise, there's nothing to do yet, and we may 4583 // need to reanalyze the instruction. 
4584 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) { 4585 InstsToReanalyze.push_back(I); 4586 continue; 4587 } 4588 4589 // Move the instruction to the beginning of the predicated block, and add 4590 // it's operands to the worklist. 4591 I->moveBefore(&*PredBB->getFirstInsertionPt()); 4592 Worklist.insert(I->op_begin(), I->op_end()); 4593 4594 // The sinking may have enabled other instructions to be sunk, so we will 4595 // need to iterate. 4596 Changed = true; 4597 } 4598 } while (Changed); 4599 } 4600 4601 void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) { 4602 for (PHINode *OrigPhi : OrigPHIsToFix) { 4603 VPWidenPHIRecipe *VPPhi = 4604 cast<VPWidenPHIRecipe>(State.Plan->getVPValue(OrigPhi)); 4605 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0)); 4606 // Make sure the builder has a valid insert point. 4607 Builder.SetInsertPoint(NewPhi); 4608 for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) { 4609 VPValue *Inc = VPPhi->getIncomingValue(i); 4610 VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i); 4611 NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]); 4612 } 4613 } 4614 } 4615 4616 bool InnerLoopVectorizer::useOrderedReductions(RecurrenceDescriptor &RdxDesc) { 4617 return Cost->useOrderedReductions(RdxDesc); 4618 } 4619 4620 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, 4621 VPUser &Operands, unsigned UF, 4622 ElementCount VF, bool IsPtrLoopInvariant, 4623 SmallBitVector &IsIndexLoopInvariant, 4624 VPTransformState &State) { 4625 // Construct a vector GEP by widening the operands of the scalar GEP as 4626 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP 4627 // results in a vector of pointers when at least one operand of the GEP 4628 // is vector-typed. Thus, to keep the representation compact, we only use 4629 // vector-typed operands for loop-varying values. 4630 4631 if (VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { 4632 // If we are vectorizing, but the GEP has only loop-invariant operands, 4633 // the GEP we build (by only using vector-typed operands for 4634 // loop-varying values) would be a scalar pointer. Thus, to ensure we 4635 // produce a vector of pointers, we need to either arbitrarily pick an 4636 // operand to broadcast, or broadcast a clone of the original GEP. 4637 // Here, we broadcast a clone of the original. 4638 // 4639 // TODO: If at some point we decide to scalarize instructions having 4640 // loop-invariant operands, this special case will no longer be 4641 // required. We would add the scalarization decision to 4642 // collectLoopScalars() and teach getVectorValue() to broadcast 4643 // the lane-zero scalar value. 4644 auto *Clone = Builder.Insert(GEP->clone()); 4645 for (unsigned Part = 0; Part < UF; ++Part) { 4646 Value *EntryPart = Builder.CreateVectorSplat(VF, Clone); 4647 State.set(VPDef, EntryPart, Part); 4648 addMetadata(EntryPart, GEP); 4649 } 4650 } else { 4651 // If the GEP has at least one loop-varying operand, we are sure to 4652 // produce a vector of pointers. But if we are only unrolling, we want 4653 // to produce a scalar GEP for each unroll part. Thus, the GEP we 4654 // produce with the code below will be scalar (if VF == 1) or vector 4655 // (otherwise). Note that for the unroll-only case, we still maintain 4656 // values in the vector mapping with initVector, as we do for other 4657 // instructions. 4658 for (unsigned Part = 0; Part < UF; ++Part) { 4659 // The pointer operand of the new GEP. 
If it's loop-invariant, we 4660 // won't broadcast it. 4661 auto *Ptr = IsPtrLoopInvariant 4662 ? State.get(Operands.getOperand(0), VPIteration(0, 0)) 4663 : State.get(Operands.getOperand(0), Part); 4664 4665 // Collect all the indices for the new GEP. If any index is 4666 // loop-invariant, we won't broadcast it. 4667 SmallVector<Value *, 4> Indices; 4668 for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) { 4669 VPValue *Operand = Operands.getOperand(I); 4670 if (IsIndexLoopInvariant[I - 1]) 4671 Indices.push_back(State.get(Operand, VPIteration(0, 0))); 4672 else 4673 Indices.push_back(State.get(Operand, Part)); 4674 } 4675 4676 // Create the new GEP. Note that this GEP may be a scalar if VF == 1, 4677 // but it should be a vector, otherwise. 4678 auto *NewGEP = 4679 GEP->isInBounds() 4680 ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr, 4681 Indices) 4682 : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices); 4683 assert((VF.isScalar() || NewGEP->getType()->isVectorTy()) && 4684 "NewGEP is not a pointer vector"); 4685 State.set(VPDef, NewGEP, Part); 4686 addMetadata(NewGEP, GEP); 4687 } 4688 } 4689 } 4690 4691 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, 4692 VPWidenPHIRecipe *PhiR, 4693 VPTransformState &State) { 4694 PHINode *P = cast<PHINode>(PN); 4695 if (EnableVPlanNativePath) { 4696 // Currently we enter here in the VPlan-native path for non-induction 4697 // PHIs where all control flow is uniform. We simply widen these PHIs. 4698 // Create a vector phi with no operands - the vector phi operands will be 4699 // set at the end of vector code generation. 4700 Type *VecTy = (State.VF.isScalar()) 4701 ? PN->getType() 4702 : VectorType::get(PN->getType(), State.VF); 4703 Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi"); 4704 State.set(PhiR, VecPhi, 0); 4705 OrigPHIsToFix.push_back(P); 4706 4707 return; 4708 } 4709 4710 assert(PN->getParent() == OrigLoop->getHeader() && 4711 "Non-header phis should have been handled elsewhere"); 4712 4713 // In order to support recurrences we need to be able to vectorize Phi nodes. 4714 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 4715 // stage #1: We create a new vector PHI node with no incoming edges. We'll use 4716 // this value when we vectorize all of the instructions that use the PHI. 4717 4718 assert(!Legal->isReductionVariable(P) && 4719 "reductions should be handled elsewhere"); 4720 4721 setDebugLocFromInst(P); 4722 4723 // This PHINode must be an induction variable. 4724 // Make sure that we know about it. 4725 assert(Legal->getInductionVars().count(P) && "Not an induction variable"); 4726 4727 InductionDescriptor II = Legal->getInductionVars().lookup(P); 4728 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 4729 4730 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 4731 // which can be found from the original scalar operations. 4732 switch (II.getKind()) { 4733 case InductionDescriptor::IK_NoInduction: 4734 llvm_unreachable("Unknown induction"); 4735 case InductionDescriptor::IK_IntInduction: 4736 case InductionDescriptor::IK_FpInduction: 4737 llvm_unreachable("Integer/fp induction is handled elsewhere."); 4738 case InductionDescriptor::IK_PtrInduction: { 4739 // Handle the pointer induction variable case. 
4740 assert(P->getType()->isPointerTy() && "Unexpected type."); 4741 4742 if (Cost->isScalarAfterVectorization(P, State.VF)) { 4743 // This is the normalized GEP that starts counting at zero. 4744 Value *PtrInd = 4745 Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType()); 4746 // Determine the number of scalars we need to generate for each unroll 4747 // iteration. If the instruction is uniform, we only need to generate the 4748 // first lane. Otherwise, we generate all VF values. 4749 bool IsUniform = Cost->isUniformAfterVectorization(P, State.VF); 4750 unsigned Lanes = IsUniform ? 1 : State.VF.getKnownMinValue(); 4751 4752 bool NeedsVectorIndex = !IsUniform && VF.isScalable(); 4753 Value *UnitStepVec = nullptr, *PtrIndSplat = nullptr; 4754 if (NeedsVectorIndex) { 4755 Type *VecIVTy = VectorType::get(PtrInd->getType(), VF); 4756 UnitStepVec = Builder.CreateStepVector(VecIVTy); 4757 PtrIndSplat = Builder.CreateVectorSplat(VF, PtrInd); 4758 } 4759 4760 for (unsigned Part = 0; Part < UF; ++Part) { 4761 Value *PartStart = createStepForVF( 4762 Builder, ConstantInt::get(PtrInd->getType(), Part), VF); 4763 4764 if (NeedsVectorIndex) { 4765 Value *PartStartSplat = Builder.CreateVectorSplat(VF, PartStart); 4766 Value *Indices = Builder.CreateAdd(PartStartSplat, UnitStepVec); 4767 Value *GlobalIndices = Builder.CreateAdd(PtrIndSplat, Indices); 4768 Value *SclrGep = 4769 emitTransformedIndex(Builder, GlobalIndices, PSE.getSE(), DL, II); 4770 SclrGep->setName("next.gep"); 4771 State.set(PhiR, SclrGep, Part); 4772 // We've cached the whole vector, which means we can support the 4773 // extraction of any lane. 4774 continue; 4775 } 4776 4777 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 4778 Value *Idx = Builder.CreateAdd( 4779 PartStart, ConstantInt::get(PtrInd->getType(), Lane)); 4780 Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); 4781 Value *SclrGep = 4782 emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II); 4783 SclrGep->setName("next.gep"); 4784 State.set(PhiR, SclrGep, VPIteration(Part, Lane)); 4785 } 4786 } 4787 return; 4788 } 4789 assert(isa<SCEVConstant>(II.getStep()) && 4790 "Induction step not a SCEV constant!"); 4791 Type *PhiType = II.getStep()->getType(); 4792 4793 // Build a pointer phi 4794 Value *ScalarStartValue = II.getStartValue(); 4795 Type *ScStValueType = ScalarStartValue->getType(); 4796 PHINode *NewPointerPhi = 4797 PHINode::Create(ScStValueType, 2, "pointer.phi", Induction); 4798 NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader); 4799 4800 // A pointer induction, performed by using a gep 4801 BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 4802 Instruction *InductionLoc = LoopLatch->getTerminator(); 4803 const SCEV *ScalarStep = II.getStep(); 4804 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 4805 Value *ScalarStepValue = 4806 Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc); 4807 Value *RuntimeVF = getRuntimeVF(Builder, PhiType, VF); 4808 Value *NumUnrolledElems = 4809 Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF)); 4810 Value *InductionGEP = GetElementPtrInst::Create( 4811 ScStValueType->getPointerElementType(), NewPointerPhi, 4812 Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind", 4813 InductionLoc); 4814 NewPointerPhi->addIncoming(InductionGEP, LoopLatch); 4815 4816 // Create UF many actual address geps that use the pointer 4817 // phi as base and a vectorized version of the step value 4818 // (<step*0, ..., step*N>) as offset. 
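// For example (illustrative, fixed-width VF = 4, UF = 2, scalar step S):
// part 0 addresses the pointer phi with offsets <0, 1, 2, 3> * S and part 1
// with offsets <4, 5, 6, 7> * S, each via a single vector GEP.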
4819 for (unsigned Part = 0; Part < State.UF; ++Part) { 4820 Type *VecPhiType = VectorType::get(PhiType, State.VF); 4821 Value *StartOffsetScalar = 4822 Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part)); 4823 Value *StartOffset = 4824 Builder.CreateVectorSplat(State.VF, StartOffsetScalar); 4825 // Create a vector of consecutive numbers from zero to VF. 4826 StartOffset = 4827 Builder.CreateAdd(StartOffset, Builder.CreateStepVector(VecPhiType)); 4828 4829 Value *GEP = Builder.CreateGEP( 4830 ScStValueType->getPointerElementType(), NewPointerPhi, 4831 Builder.CreateMul( 4832 StartOffset, Builder.CreateVectorSplat(State.VF, ScalarStepValue), 4833 "vector.gep")); 4834 State.set(PhiR, GEP, Part); 4835 } 4836 } 4837 } 4838 } 4839 4840 /// A helper function for checking whether an integer division-related 4841 /// instruction may divide by zero (in which case it must be predicated if 4842 /// executed conditionally in the scalar code). 4843 /// TODO: It may be worthwhile to generalize and check isKnownNonZero(). 4844 /// Non-zero divisors that are non compile-time constants will not be 4845 /// converted into multiplication, so we will still end up scalarizing 4846 /// the division, but can do so w/o predication. 4847 static bool mayDivideByZero(Instruction &I) { 4848 assert((I.getOpcode() == Instruction::UDiv || 4849 I.getOpcode() == Instruction::SDiv || 4850 I.getOpcode() == Instruction::URem || 4851 I.getOpcode() == Instruction::SRem) && 4852 "Unexpected instruction"); 4853 Value *Divisor = I.getOperand(1); 4854 auto *CInt = dyn_cast<ConstantInt>(Divisor); 4855 return !CInt || CInt->isZero(); 4856 } 4857 4858 void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def, 4859 VPUser &User, 4860 VPTransformState &State) { 4861 switch (I.getOpcode()) { 4862 case Instruction::Call: 4863 case Instruction::Br: 4864 case Instruction::PHI: 4865 case Instruction::GetElementPtr: 4866 case Instruction::Select: 4867 llvm_unreachable("This instruction is handled by a different recipe."); 4868 case Instruction::UDiv: 4869 case Instruction::SDiv: 4870 case Instruction::SRem: 4871 case Instruction::URem: 4872 case Instruction::Add: 4873 case Instruction::FAdd: 4874 case Instruction::Sub: 4875 case Instruction::FSub: 4876 case Instruction::FNeg: 4877 case Instruction::Mul: 4878 case Instruction::FMul: 4879 case Instruction::FDiv: 4880 case Instruction::FRem: 4881 case Instruction::Shl: 4882 case Instruction::LShr: 4883 case Instruction::AShr: 4884 case Instruction::And: 4885 case Instruction::Or: 4886 case Instruction::Xor: { 4887 // Just widen unops and binops. 4888 setDebugLocFromInst(&I); 4889 4890 for (unsigned Part = 0; Part < UF; ++Part) { 4891 SmallVector<Value *, 2> Ops; 4892 for (VPValue *VPOp : User.operands()) 4893 Ops.push_back(State.get(VPOp, Part)); 4894 4895 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); 4896 4897 if (auto *VecOp = dyn_cast<Instruction>(V)) 4898 VecOp->copyIRFlags(&I); 4899 4900 // Use this vector value for all users of the original instruction. 4901 State.set(Def, V, Part); 4902 addMetadata(V, &I); 4903 } 4904 4905 break; 4906 } 4907 case Instruction::ICmp: 4908 case Instruction::FCmp: { 4909 // Widen compares. Generate vector compares. 
4910 bool FCmp = (I.getOpcode() == Instruction::FCmp); 4911 auto *Cmp = cast<CmpInst>(&I); 4912 setDebugLocFromInst(Cmp); 4913 for (unsigned Part = 0; Part < UF; ++Part) { 4914 Value *A = State.get(User.getOperand(0), Part); 4915 Value *B = State.get(User.getOperand(1), Part); 4916 Value *C = nullptr; 4917 if (FCmp) { 4918 // Propagate fast math flags. 4919 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 4920 Builder.setFastMathFlags(Cmp->getFastMathFlags()); 4921 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); 4922 } else { 4923 C = Builder.CreateICmp(Cmp->getPredicate(), A, B); 4924 } 4925 State.set(Def, C, Part); 4926 addMetadata(C, &I); 4927 } 4928 4929 break; 4930 } 4931 4932 case Instruction::ZExt: 4933 case Instruction::SExt: 4934 case Instruction::FPToUI: 4935 case Instruction::FPToSI: 4936 case Instruction::FPExt: 4937 case Instruction::PtrToInt: 4938 case Instruction::IntToPtr: 4939 case Instruction::SIToFP: 4940 case Instruction::UIToFP: 4941 case Instruction::Trunc: 4942 case Instruction::FPTrunc: 4943 case Instruction::BitCast: { 4944 auto *CI = cast<CastInst>(&I); 4945 setDebugLocFromInst(CI); 4946 4947 /// Vectorize casts. 4948 Type *DestTy = 4949 (VF.isScalar()) ? CI->getType() : VectorType::get(CI->getType(), VF); 4950 4951 for (unsigned Part = 0; Part < UF; ++Part) { 4952 Value *A = State.get(User.getOperand(0), Part); 4953 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); 4954 State.set(Def, Cast, Part); 4955 addMetadata(Cast, &I); 4956 } 4957 break; 4958 } 4959 default: 4960 // This instruction is not vectorized by simple widening. 4961 LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); 4962 llvm_unreachable("Unhandled instruction!"); 4963 } // end of switch. 4964 } 4965 4966 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, 4967 VPUser &ArgOperands, 4968 VPTransformState &State) { 4969 assert(!isa<DbgInfoIntrinsic>(I) && 4970 "DbgInfoIntrinsic should have been dropped during VPlan construction"); 4971 setDebugLocFromInst(&I); 4972 4973 Module *M = I.getParent()->getParent()->getParent(); 4974 auto *CI = cast<CallInst>(&I); 4975 4976 SmallVector<Type *, 4> Tys; 4977 for (Value *ArgOperand : CI->arg_operands()) 4978 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue())); 4979 4980 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 4981 4982 // The flag shows whether we use Intrinsic or a usual Call for vectorized 4983 // version of the instruction. 4984 // Is it beneficial to perform intrinsic call compared to lib call? 4985 bool NeedToScalarize = false; 4986 InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); 4987 InstructionCost IntrinsicCost = ID ? Cost->getVectorIntrinsicCost(CI, VF) : 0; 4988 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 4989 assert((UseVectorIntrinsic || !NeedToScalarize) && 4990 "Instruction should be scalarized elsewhere."); 4991 assert((IntrinsicCost.isValid() || CallCost.isValid()) && 4992 "Either the intrinsic cost or vector call cost must be valid"); 4993 4994 for (unsigned Part = 0; Part < UF; ++Part) { 4995 SmallVector<Type *, 2> TysForDecl = {CI->getType()}; 4996 SmallVector<Value *, 4> Args; 4997 for (auto &I : enumerate(ArgOperands.operands())) { 4998 // Some intrinsics have a scalar argument - don't replace it with a 4999 // vector. 
5000 Value *Arg; 5001 if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index())) 5002 Arg = State.get(I.value(), Part); 5003 else { 5004 Arg = State.get(I.value(), VPIteration(0, 0)); 5005 if (hasVectorInstrinsicOverloadedScalarOpd(ID, I.index())) 5006 TysForDecl.push_back(Arg->getType()); 5007 } 5008 Args.push_back(Arg); 5009 } 5010 5011 Function *VectorF; 5012 if (UseVectorIntrinsic) { 5013 // Use vector version of the intrinsic. 5014 if (VF.isVector()) 5015 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); 5016 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); 5017 assert(VectorF && "Can't retrieve vector intrinsic."); 5018 } else { 5019 // Use vector version of the function call. 5020 const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 5021 #ifndef NDEBUG 5022 assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr && 5023 "Can't create vector function."); 5024 #endif 5025 VectorF = VFDatabase(*CI).getVectorizedFunction(Shape); 5026 } 5027 SmallVector<OperandBundleDef, 1> OpBundles; 5028 CI->getOperandBundlesAsDefs(OpBundles); 5029 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles); 5030 5031 if (isa<FPMathOperator>(V)) 5032 V->copyFastMathFlags(CI); 5033 5034 State.set(Def, V, Part); 5035 addMetadata(V, &I); 5036 } 5037 } 5038 5039 void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, VPValue *VPDef, 5040 VPUser &Operands, 5041 bool InvariantCond, 5042 VPTransformState &State) { 5043 setDebugLocFromInst(&I); 5044 5045 // The condition can be loop invariant but still defined inside the 5046 // loop. This means that we can't just use the original 'cond' value. 5047 // We have to take the 'vectorized' value and pick the first lane. 5048 // Instcombine will make this a no-op. 5049 auto *InvarCond = InvariantCond 5050 ? State.get(Operands.getOperand(0), VPIteration(0, 0)) 5051 : nullptr; 5052 5053 for (unsigned Part = 0; Part < UF; ++Part) { 5054 Value *Cond = 5055 InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part); 5056 Value *Op0 = State.get(Operands.getOperand(1), Part); 5057 Value *Op1 = State.get(Operands.getOperand(2), Part); 5058 Value *Sel = Builder.CreateSelect(Cond, Op0, Op1); 5059 State.set(VPDef, Sel, Part); 5060 addMetadata(Sel, &I); 5061 } 5062 } 5063 5064 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { 5065 // We should not collect Scalars more than once per VF. Right now, this 5066 // function is called from collectUniformsAndScalars(), which already does 5067 // this check. Collecting Scalars for VF=1 does not make any sense. 5068 assert(VF.isVector() && Scalars.find(VF) == Scalars.end() && 5069 "This function should not be visited twice for the same VF"); 5070 5071 SmallSetVector<Instruction *, 8> Worklist; 5072 5073 // These sets are used to seed the analysis with pointers used by memory 5074 // accesses that will remain scalar. 5075 SmallSetVector<Instruction *, 8> ScalarPtrs; 5076 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; 5077 auto *Latch = TheLoop->getLoopLatch(); 5078 5079 // A helper that returns true if the use of Ptr by MemAccess will be scalar. 5080 // The pointer operands of loads and stores will be scalar as long as the 5081 // memory access is not a gather or scatter operation. The value operand of a 5082 // store will remain scalar if the store is scalarized. 
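// For instance (illustrative): in "store i32 %x, i32* %p", %p is a scalar use
// unless the store becomes a scatter, whereas %x only remains scalar if the
// store itself is scalarized; isScalarUse below encodes this distinction.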
5083 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) { 5084 InstWidening WideningDecision = getWideningDecision(MemAccess, VF); 5085 assert(WideningDecision != CM_Unknown && 5086 "Widening decision should be ready at this moment"); 5087 if (auto *Store = dyn_cast<StoreInst>(MemAccess)) 5088 if (Ptr == Store->getValueOperand()) 5089 return WideningDecision == CM_Scalarize; 5090 assert(Ptr == getLoadStorePointerOperand(MemAccess) && 5091 "Ptr is neither a value nor a pointer operand"); 5092 return WideningDecision != CM_GatherScatter; 5093 }; 5094 5095 // A helper that returns true if the given value is a bitcast or 5096 // getelementptr instruction contained in the loop. 5097 auto isLoopVaryingBitCastOrGEP = [&](Value *V) { 5098 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) || 5099 isa<GetElementPtrInst>(V)) && 5100 !TheLoop->isLoopInvariant(V); 5101 }; 5102 5103 auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) { 5104 if (!isa<PHINode>(Ptr) || 5105 !Legal->getInductionVars().count(cast<PHINode>(Ptr))) 5106 return false; 5107 auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)]; 5108 if (Induction.getKind() != InductionDescriptor::IK_PtrInduction) 5109 return false; 5110 return isScalarUse(MemAccess, Ptr); 5111 }; 5112 5113 // A helper that evaluates a memory access's use of a pointer. If the 5114 // pointer is actually the pointer induction of a loop, it is 5115 // inserted into Worklist. If the use will be a scalar use, and the 5116 // pointer is only used by memory accesses, we place the pointer in 5117 // ScalarPtrs. Otherwise, the pointer is placed in PossibleNonScalarPtrs. 5118 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) { 5119 if (isScalarPtrInduction(MemAccess, Ptr)) { 5120 Worklist.insert(cast<Instruction>(Ptr)); 5121 LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr 5122 << "\n"); 5123 5124 Instruction *Update = cast<Instruction>( 5125 cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch)); 5126 ScalarPtrs.insert(Update); 5127 return; 5128 } 5129 // We only care about bitcast and getelementptr instructions contained in 5130 // the loop. 5131 if (!isLoopVaryingBitCastOrGEP(Ptr)) 5132 return; 5133 5134 // If the pointer has already been identified as scalar (e.g., if it was 5135 // also identified as uniform), there's nothing to do. 5136 auto *I = cast<Instruction>(Ptr); 5137 if (Worklist.count(I)) 5138 return; 5139 5140 // If the use of the pointer will be a scalar use, and all users of the 5141 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise, 5142 // place the pointer in PossibleNonScalarPtrs. 5143 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) { 5144 return isa<LoadInst>(U) || isa<StoreInst>(U); 5145 })) 5146 ScalarPtrs.insert(I); 5147 else 5148 PossibleNonScalarPtrs.insert(I); 5149 }; 5150 5151 // We seed the scalars analysis with two classes of instructions: (1) 5152 // instructions marked uniform-after-vectorization and (2) bitcast, 5153 // getelementptr and (pointer) phi instructions used by memory accesses 5154 // requiring a scalar use. 5155 // 5156 // (1) Add to the worklist all instructions that have been identified as 5157 // uniform-after-vectorization. 5158 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end()); 5159 5160 // (2) Add to the worklist all bitcast and getelementptr instructions used by 5161 // memory accesses requiring a scalar use.
The pointer operands of loads and 5162 // stores will be scalar as long as the memory accesses is not a gather or 5163 // scatter operation. The value operand of a store will remain scalar if the 5164 // store is scalarized. 5165 for (auto *BB : TheLoop->blocks()) 5166 for (auto &I : *BB) { 5167 if (auto *Load = dyn_cast<LoadInst>(&I)) { 5168 evaluatePtrUse(Load, Load->getPointerOperand()); 5169 } else if (auto *Store = dyn_cast<StoreInst>(&I)) { 5170 evaluatePtrUse(Store, Store->getPointerOperand()); 5171 evaluatePtrUse(Store, Store->getValueOperand()); 5172 } 5173 } 5174 for (auto *I : ScalarPtrs) 5175 if (!PossibleNonScalarPtrs.count(I)) { 5176 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n"); 5177 Worklist.insert(I); 5178 } 5179 5180 // Insert the forced scalars. 5181 // FIXME: Currently widenPHIInstruction() often creates a dead vector 5182 // induction variable when the PHI user is scalarized. 5183 auto ForcedScalar = ForcedScalars.find(VF); 5184 if (ForcedScalar != ForcedScalars.end()) 5185 for (auto *I : ForcedScalar->second) 5186 Worklist.insert(I); 5187 5188 // Expand the worklist by looking through any bitcasts and getelementptr 5189 // instructions we've already identified as scalar. This is similar to the 5190 // expansion step in collectLoopUniforms(); however, here we're only 5191 // expanding to include additional bitcasts and getelementptr instructions. 5192 unsigned Idx = 0; 5193 while (Idx != Worklist.size()) { 5194 Instruction *Dst = Worklist[Idx++]; 5195 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0))) 5196 continue; 5197 auto *Src = cast<Instruction>(Dst->getOperand(0)); 5198 if (llvm::all_of(Src->users(), [&](User *U) -> bool { 5199 auto *J = cast<Instruction>(U); 5200 return !TheLoop->contains(J) || Worklist.count(J) || 5201 ((isa<LoadInst>(J) || isa<StoreInst>(J)) && 5202 isScalarUse(J, Src)); 5203 })) { 5204 Worklist.insert(Src); 5205 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); 5206 } 5207 } 5208 5209 // An induction variable will remain scalar if all users of the induction 5210 // variable and induction variable update remain scalar. 5211 for (auto &Induction : Legal->getInductionVars()) { 5212 auto *Ind = Induction.first; 5213 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 5214 5215 // If tail-folding is applied, the primary induction variable will be used 5216 // to feed a vector compare. 5217 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking()) 5218 continue; 5219 5220 // Determine if all users of the induction variable are scalar after 5221 // vectorization. 5222 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 5223 auto *I = cast<Instruction>(U); 5224 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I); 5225 }); 5226 if (!ScalarInd) 5227 continue; 5228 5229 // Determine if all users of the induction variable update instruction are 5230 // scalar after vectorization. 5231 auto ScalarIndUpdate = 5232 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 5233 auto *I = cast<Instruction>(U); 5234 return I == Ind || !TheLoop->contains(I) || Worklist.count(I); 5235 }); 5236 if (!ScalarIndUpdate) 5237 continue; 5238 5239 // The induction variable and its update instruction will remain scalar. 
5240 Worklist.insert(Ind); 5241 Worklist.insert(IndUpdate); 5242 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 5243 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 5244 << "\n"); 5245 } 5246 5247 Scalars[VF].insert(Worklist.begin(), Worklist.end()); 5248 } 5249 5250 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I) const { 5251 if (!blockNeedsPredication(I->getParent())) 5252 return false; 5253 switch(I->getOpcode()) { 5254 default: 5255 break; 5256 case Instruction::Load: 5257 case Instruction::Store: { 5258 if (!Legal->isMaskRequired(I)) 5259 return false; 5260 auto *Ptr = getLoadStorePointerOperand(I); 5261 auto *Ty = getLoadStoreType(I); 5262 const Align Alignment = getLoadStoreAlignment(I); 5263 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) || 5264 TTI.isLegalMaskedGather(Ty, Alignment)) 5265 : !(isLegalMaskedStore(Ty, Ptr, Alignment) || 5266 TTI.isLegalMaskedScatter(Ty, Alignment)); 5267 } 5268 case Instruction::UDiv: 5269 case Instruction::SDiv: 5270 case Instruction::SRem: 5271 case Instruction::URem: 5272 return mayDivideByZero(*I); 5273 } 5274 return false; 5275 } 5276 5277 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( 5278 Instruction *I, ElementCount VF) { 5279 assert(isAccessInterleaved(I) && "Expecting interleaved access."); 5280 assert(getWideningDecision(I, VF) == CM_Unknown && 5281 "Decision should not be set yet."); 5282 auto *Group = getInterleavedAccessGroup(I); 5283 assert(Group && "Must have a group."); 5284 5285 // If the instruction's allocated size doesn't equal it's type size, it 5286 // requires padding and will be scalarized. 5287 auto &DL = I->getModule()->getDataLayout(); 5288 auto *ScalarTy = getLoadStoreType(I); 5289 if (hasIrregularType(ScalarTy, DL)) 5290 return false; 5291 5292 // Check if masking is required. 5293 // A Group may need masking for one of two reasons: it resides in a block that 5294 // needs predication, or it was decided to use masking to deal with gaps 5295 // (either a gap at the end of a load-access that may result in a speculative 5296 // load, or any gaps in a store-access). 5297 bool PredicatedAccessRequiresMasking = 5298 Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I); 5299 bool LoadAccessWithGapsRequiresEpilogMasking = 5300 isa<LoadInst>(I) && Group->requiresScalarEpilogue() && 5301 !isScalarEpilogueAllowed(); 5302 bool StoreAccessWithGapsRequiresMasking = 5303 isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()); 5304 if (!PredicatedAccessRequiresMasking && 5305 !LoadAccessWithGapsRequiresEpilogMasking && 5306 !StoreAccessWithGapsRequiresMasking) 5307 return true; 5308 5309 // If masked interleaving is required, we expect that the user/target had 5310 // enabled it, because otherwise it either wouldn't have been created or 5311 // it should have been invalidated by the CostModel. 5312 assert(useMaskedInterleavedAccesses(TTI) && 5313 "Masked interleave-groups for predicated accesses are not enabled."); 5314 5315 auto *Ty = getLoadStoreType(I); 5316 const Align Alignment = getLoadStoreAlignment(I); 5317 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment) 5318 : TTI.isLegalMaskedStore(Ty, Alignment); 5319 } 5320 5321 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( 5322 Instruction *I, ElementCount VF) { 5323 // Get and ensure we have a valid memory instruction. 
  LoadInst *LI = dyn_cast<LoadInst>(I);
  StoreInst *SI = dyn_cast<StoreInst>(I);
  assert((LI || SI) && "Invalid memory instruction");

  auto *Ptr = getLoadStorePointerOperand(I);

  // In order to be widened, the pointer should be consecutive, first of all.
  if (!Legal->isConsecutivePtr(Ptr))
    return false;

  // If the instruction is a store located in a predicated block, it will be
  // scalarized.
  if (isScalarWithPredication(I))
    return false;

  // If the instruction's allocated size doesn't equal its type size, it
  // requires padding and will be scalarized.
  auto &DL = I->getModule()->getDataLayout();
  auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
  if (hasIrregularType(ScalarTy, DL))
    return false;

  return true;
}

void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
  // We should not collect Uniforms more than once per VF. Right now,
  // this function is called from collectUniformsAndScalars(), which
  // already does this check. Collecting Uniforms for VF=1 does not make any
  // sense.

  assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() &&
         "This function should not be visited twice for the same VF");

  // Visit the list of Uniforms. If we'll not find any uniform value, we'll
  // not analyze again. Uniforms.count(VF) will return 1.
  Uniforms[VF].clear();

  // We now know that the loop is vectorizable!
  // Collect instructions inside the loop that will remain uniform after
  // vectorization.

  // Global values, params and instructions outside of current loop are out of
  // scope.
  auto isOutOfScope = [&](Value *V) -> bool {
    Instruction *I = dyn_cast<Instruction>(V);
    return (!I || !TheLoop->contains(I));
  };

  SetVector<Instruction *> Worklist;
  BasicBlock *Latch = TheLoop->getLoopLatch();

  // Instructions that are scalar with predication must not be considered
  // uniform after vectorization, because that would create an erroneous
  // replicating region where only a single instance out of VF should be formed.
  // TODO: optimize such seldom cases if found important, see PR40816.
  auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
    if (isOutOfScope(I)) {
      LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
                        << *I << "\n");
      return;
    }
    if (isScalarWithPredication(I)) {
      LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
                        << *I << "\n");
      return;
    }
    LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
    Worklist.insert(I);
  };

  // Start with the conditional branch. If the branch condition is an
  // instruction contained in the loop that is only used by the branch, it is
  // uniform.
  auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
  if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
    addToWorklistIfAllowed(Cmp);

  auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
    InstWidening WideningDecision = getWideningDecision(I, VF);
    assert(WideningDecision != CM_Unknown &&
           "Widening decision should be ready at this moment");

    // A uniform memory op is itself uniform. We exclude uniform stores
    // here as they demand the last lane, not the first one.
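    // For example (illustrative): a load from a loop-invariant address is
    // given a CM_Scalarize decision, yet only its lane-0 value is ever
    // needed, so it still counts as a uniform use of its pointer operand.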
5409 if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) { 5410 assert(WideningDecision == CM_Scalarize); 5411 return true; 5412 } 5413 5414 return (WideningDecision == CM_Widen || 5415 WideningDecision == CM_Widen_Reverse || 5416 WideningDecision == CM_Interleave); 5417 }; 5418 5419 5420 // Returns true if Ptr is the pointer operand of a memory access instruction 5421 // I, and I is known to not require scalarization. 5422 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { 5423 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); 5424 }; 5425 5426 // Holds a list of values which are known to have at least one uniform use. 5427 // Note that there may be other uses which aren't uniform. A "uniform use" 5428 // here is something which only demands lane 0 of the unrolled iterations; 5429 // it does not imply that all lanes produce the same value (e.g. this is not 5430 // the usual meaning of uniform) 5431 SetVector<Value *> HasUniformUse; 5432 5433 // Scan the loop for instructions which are either a) known to have only 5434 // lane 0 demanded or b) are uses which demand only lane 0 of their operand. 5435 for (auto *BB : TheLoop->blocks()) 5436 for (auto &I : *BB) { 5437 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) { 5438 switch (II->getIntrinsicID()) { 5439 case Intrinsic::sideeffect: 5440 case Intrinsic::experimental_noalias_scope_decl: 5441 case Intrinsic::assume: 5442 case Intrinsic::lifetime_start: 5443 case Intrinsic::lifetime_end: 5444 if (TheLoop->hasLoopInvariantOperands(&I)) 5445 addToWorklistIfAllowed(&I); 5446 break; 5447 default: 5448 break; 5449 } 5450 } 5451 5452 // ExtractValue instructions must be uniform, because the operands are 5453 // known to be loop-invariant. 5454 if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) { 5455 assert(isOutOfScope(EVI->getAggregateOperand()) && 5456 "Expected aggregate value to be loop invariant"); 5457 addToWorklistIfAllowed(EVI); 5458 continue; 5459 } 5460 5461 // If there's no pointer operand, there's nothing to do. 5462 auto *Ptr = getLoadStorePointerOperand(&I); 5463 if (!Ptr) 5464 continue; 5465 5466 // A uniform memory op is itself uniform. We exclude uniform stores 5467 // here as they demand the last lane, not the first one. 5468 if (isa<LoadInst>(I) && Legal->isUniformMemOp(I)) 5469 addToWorklistIfAllowed(&I); 5470 5471 if (isUniformDecision(&I, VF)) { 5472 assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check"); 5473 HasUniformUse.insert(Ptr); 5474 } 5475 } 5476 5477 // Add to the worklist any operands which have *only* uniform (e.g. lane 0 5478 // demanding) users. Since loops are assumed to be in LCSSA form, this 5479 // disallows uses outside the loop as well. 5480 for (auto *V : HasUniformUse) { 5481 if (isOutOfScope(V)) 5482 continue; 5483 auto *I = cast<Instruction>(V); 5484 auto UsersAreMemAccesses = 5485 llvm::all_of(I->users(), [&](User *U) -> bool { 5486 return isVectorizedMemAccessUse(cast<Instruction>(U), V); 5487 }); 5488 if (UsersAreMemAccesses) 5489 addToWorklistIfAllowed(I); 5490 } 5491 5492 // Expand Worklist in topological order: whenever a new instruction 5493 // is added , its users should be already inside Worklist. It ensures 5494 // a uniform instruction will only be used by uniform instructions. 5495 unsigned idx = 0; 5496 while (idx != Worklist.size()) { 5497 Instruction *I = Worklist[idx++]; 5498 5499 for (auto OV : I->operand_values()) { 5500 // isOutOfScope operands cannot be uniform instructions. 
5501 if (isOutOfScope(OV)) 5502 continue; 5503 // First order recurrence Phi's should typically be considered 5504 // non-uniform. 5505 auto *OP = dyn_cast<PHINode>(OV); 5506 if (OP && Legal->isFirstOrderRecurrence(OP)) 5507 continue; 5508 // If all the users of the operand are uniform, then add the 5509 // operand into the uniform worklist. 5510 auto *OI = cast<Instruction>(OV); 5511 if (llvm::all_of(OI->users(), [&](User *U) -> bool { 5512 auto *J = cast<Instruction>(U); 5513 return Worklist.count(J) || isVectorizedMemAccessUse(J, OI); 5514 })) 5515 addToWorklistIfAllowed(OI); 5516 } 5517 } 5518 5519 // For an instruction to be added into Worklist above, all its users inside 5520 // the loop should also be in Worklist. However, this condition cannot be 5521 // true for phi nodes that form a cyclic dependence. We must process phi 5522 // nodes separately. An induction variable will remain uniform if all users 5523 // of the induction variable and induction variable update remain uniform. 5524 // The code below handles both pointer and non-pointer induction variables. 5525 for (auto &Induction : Legal->getInductionVars()) { 5526 auto *Ind = Induction.first; 5527 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 5528 5529 // Determine if all users of the induction variable are uniform after 5530 // vectorization. 5531 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 5532 auto *I = cast<Instruction>(U); 5533 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 5534 isVectorizedMemAccessUse(I, Ind); 5535 }); 5536 if (!UniformInd) 5537 continue; 5538 5539 // Determine if all users of the induction variable update instruction are 5540 // uniform after vectorization. 5541 auto UniformIndUpdate = 5542 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 5543 auto *I = cast<Instruction>(U); 5544 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 5545 isVectorizedMemAccessUse(I, IndUpdate); 5546 }); 5547 if (!UniformIndUpdate) 5548 continue; 5549 5550 // The induction variable and its update instruction will remain uniform. 5551 addToWorklistIfAllowed(Ind); 5552 addToWorklistIfAllowed(IndUpdate); 5553 } 5554 5555 Uniforms[VF].insert(Worklist.begin(), Worklist.end()); 5556 } 5557 5558 bool LoopVectorizationCostModel::runtimeChecksRequired() { 5559 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); 5560 5561 if (Legal->getRuntimePointerChecking()->Need) { 5562 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz", 5563 "runtime pointer checks needed. Enable vectorization of this " 5564 "loop with '#pragma clang loop vectorize(enable)' when " 5565 "compiling with -Os/-Oz", 5566 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5567 return true; 5568 } 5569 5570 if (!PSE.getUnionPredicate().getPredicates().empty()) { 5571 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz", 5572 "runtime SCEV checks needed. Enable vectorization of this " 5573 "loop with '#pragma clang loop vectorize(enable)' when " 5574 "compiling with -Os/-Oz", 5575 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5576 return true; 5577 } 5578 5579 // FIXME: Avoid specializing for stride==1 instead of bailing out. 5580 if (!Legal->getLAI()->getSymbolicStrides().empty()) { 5581 reportVectorizationFailure("Runtime stride check for small trip count", 5582 "runtime stride == 1 checks needed. 
Enable vectorization of " 5583 "this loop without such check by compiling with -Os/-Oz", 5584 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5585 return true; 5586 } 5587 5588 return false; 5589 } 5590 5591 ElementCount 5592 LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) { 5593 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) 5594 return ElementCount::getScalable(0); 5595 5596 if (Hints->isScalableVectorizationDisabled()) { 5597 reportVectorizationInfo("Scalable vectorization is explicitly disabled", 5598 "ScalableVectorizationDisabled", ORE, TheLoop); 5599 return ElementCount::getScalable(0); 5600 } 5601 5602 LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n"); 5603 5604 auto MaxScalableVF = ElementCount::getScalable( 5605 std::numeric_limits<ElementCount::ScalarTy>::max()); 5606 5607 // Test that the loop-vectorizer can legalize all operations for this MaxVF. 5608 // FIXME: While for scalable vectors this is currently sufficient, this should 5609 // be replaced by a more detailed mechanism that filters out specific VFs, 5610 // instead of invalidating vectorization for a whole set of VFs based on the 5611 // MaxVF. 5612 5613 // Disable scalable vectorization if the loop contains unsupported reductions. 5614 if (!canVectorizeReductions(MaxScalableVF)) { 5615 reportVectorizationInfo( 5616 "Scalable vectorization not supported for the reduction " 5617 "operations found in this loop.", 5618 "ScalableVFUnfeasible", ORE, TheLoop); 5619 return ElementCount::getScalable(0); 5620 } 5621 5622 // Disable scalable vectorization if the loop contains any instructions 5623 // with element types not supported for scalable vectors. 5624 if (any_of(ElementTypesInLoop, [&](Type *Ty) { 5625 return !Ty->isVoidTy() && 5626 !this->TTI.isElementTypeLegalForScalableVector(Ty); 5627 })) { 5628 reportVectorizationInfo("Scalable vectorization is not supported " 5629 "for all element types found in this loop.", 5630 "ScalableVFUnfeasible", ORE, TheLoop); 5631 return ElementCount::getScalable(0); 5632 } 5633 5634 if (Legal->isSafeForAnyVectorWidth()) 5635 return MaxScalableVF; 5636 5637 // Limit MaxScalableVF by the maximum safe dependence distance. 5638 Optional<unsigned> MaxVScale = TTI.getMaxVScale(); 5639 if (!MaxVScale && TheFunction->hasFnAttribute(Attribute::VScaleRange)) { 5640 unsigned VScaleMax = TheFunction->getFnAttribute(Attribute::VScaleRange) 5641 .getVScaleRangeArgs() 5642 .second; 5643 if (VScaleMax > 0) 5644 MaxVScale = VScaleMax; 5645 } 5646 MaxScalableVF = ElementCount::getScalable( 5647 MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0); 5648 if (!MaxScalableVF) 5649 reportVectorizationInfo( 5650 "Max legal vector width too small, scalable vectorization " 5651 "unfeasible.", 5652 "ScalableVFUnfeasible", ORE, TheLoop); 5653 5654 return MaxScalableVF; 5655 } 5656 5657 FixedScalableVFPair 5658 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount, 5659 ElementCount UserVF) { 5660 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); 5661 unsigned SmallestType, WidestType; 5662 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); 5663 5664 // Get the maximum safe dependence distance in bits computed by LAA. 5665 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from 5666 // the memory accesses that is most restrictive (involved in the smallest 5667 // dependence distance). 
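  // For example (illustrative numbers): a max safe dependence distance of
  // 512 bits with a widest type of 32 bits gives
  // MaxSafeElements = PowerOf2Floor(512 / 32) = 16.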
5668 unsigned MaxSafeElements = 5669 PowerOf2Floor(Legal->getMaxSafeVectorWidthInBits() / WidestType); 5670 5671 auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements); 5672 auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements); 5673 5674 LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF 5675 << ".\n"); 5676 LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF 5677 << ".\n"); 5678 5679 // First analyze the UserVF, fall back if the UserVF should be ignored. 5680 if (UserVF) { 5681 auto MaxSafeUserVF = 5682 UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF; 5683 5684 if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) { 5685 // If `VF=vscale x N` is safe, then so is `VF=N` 5686 if (UserVF.isScalable()) 5687 return FixedScalableVFPair( 5688 ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF); 5689 else 5690 return UserVF; 5691 } 5692 5693 assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF)); 5694 5695 // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it 5696 // is better to ignore the hint and let the compiler choose a suitable VF. 5697 if (!UserVF.isScalable()) { 5698 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 5699 << " is unsafe, clamping to max safe VF=" 5700 << MaxSafeFixedVF << ".\n"); 5701 ORE->emit([&]() { 5702 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 5703 TheLoop->getStartLoc(), 5704 TheLoop->getHeader()) 5705 << "User-specified vectorization factor " 5706 << ore::NV("UserVectorizationFactor", UserVF) 5707 << " is unsafe, clamping to maximum safe vectorization factor " 5708 << ore::NV("VectorizationFactor", MaxSafeFixedVF); 5709 }); 5710 return MaxSafeFixedVF; 5711 } 5712 5713 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) { 5714 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 5715 << " is ignored because scalable vectors are not " 5716 "available.\n"); 5717 ORE->emit([&]() { 5718 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 5719 TheLoop->getStartLoc(), 5720 TheLoop->getHeader()) 5721 << "User-specified vectorization factor " 5722 << ore::NV("UserVectorizationFactor", UserVF) 5723 << " is ignored because the target does not support scalable " 5724 "vectors. The compiler will pick a more suitable value."; 5725 }); 5726 } else { 5727 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 5728 << " is unsafe. Ignoring scalable UserVF.\n"); 5729 ORE->emit([&]() { 5730 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 5731 TheLoop->getStartLoc(), 5732 TheLoop->getHeader()) 5733 << "User-specified vectorization factor " 5734 << ore::NV("UserVectorizationFactor", UserVF) 5735 << " is unsafe. 
Ignoring the hint to let the compiler pick a " 5736 "more suitable value."; 5737 }); 5738 } 5739 } 5740 5741 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType 5742 << " / " << WidestType << " bits.\n"); 5743 5744 FixedScalableVFPair Result(ElementCount::getFixed(1), 5745 ElementCount::getScalable(0)); 5746 if (auto MaxVF = getMaximizedVFForTarget(ConstTripCount, SmallestType, 5747 WidestType, MaxSafeFixedVF)) 5748 Result.FixedVF = MaxVF; 5749 5750 if (auto MaxVF = getMaximizedVFForTarget(ConstTripCount, SmallestType, 5751 WidestType, MaxSafeScalableVF)) 5752 if (MaxVF.isScalable()) { 5753 Result.ScalableVF = MaxVF; 5754 LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF 5755 << "\n"); 5756 } 5757 5758 return Result; 5759 } 5760 5761 FixedScalableVFPair 5762 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { 5763 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { 5764 // TODO: It may by useful to do since it's still likely to be dynamically 5765 // uniform if the target can skip. 5766 reportVectorizationFailure( 5767 "Not inserting runtime ptr check for divergent target", 5768 "runtime pointer checks needed. Not enabled for divergent target", 5769 "CantVersionLoopWithDivergentTarget", ORE, TheLoop); 5770 return FixedScalableVFPair::getNone(); 5771 } 5772 5773 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); 5774 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); 5775 if (TC == 1) { 5776 reportVectorizationFailure("Single iteration (non) loop", 5777 "loop trip count is one, irrelevant for vectorization", 5778 "SingleIterationLoop", ORE, TheLoop); 5779 return FixedScalableVFPair::getNone(); 5780 } 5781 5782 switch (ScalarEpilogueStatus) { 5783 case CM_ScalarEpilogueAllowed: 5784 return computeFeasibleMaxVF(TC, UserVF); 5785 case CM_ScalarEpilogueNotAllowedUsePredicate: 5786 LLVM_FALLTHROUGH; 5787 case CM_ScalarEpilogueNotNeededUsePredicate: 5788 LLVM_DEBUG( 5789 dbgs() << "LV: vector predicate hint/switch found.\n" 5790 << "LV: Not allowing scalar epilogue, creating predicated " 5791 << "vector loop.\n"); 5792 break; 5793 case CM_ScalarEpilogueNotAllowedLowTripLoop: 5794 // fallthrough as a special case of OptForSize 5795 case CM_ScalarEpilogueNotAllowedOptSize: 5796 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize) 5797 LLVM_DEBUG( 5798 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n"); 5799 else 5800 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip " 5801 << "count.\n"); 5802 5803 // Bail if runtime checks are required, which are not good when optimising 5804 // for size. 5805 if (runtimeChecksRequired()) 5806 return FixedScalableVFPair::getNone(); 5807 5808 break; 5809 } 5810 5811 // The only loops we can vectorize without a scalar epilogue, are loops with 5812 // a bottom-test and a single exiting block. We'd have to handle the fact 5813 // that not every instruction executes on the last iteration. This will 5814 // require a lane mask which varies through the vector loop body. (TODO) 5815 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) { 5816 // If there was a tail-folding hint/switch, but we can't fold the tail by 5817 // masking, fallback to a vectorization with a scalar epilogue. 
    if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
      LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
                           "scalar epilogue instead.\n");
      ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
      return computeFeasibleMaxVF(TC, UserVF);
    }
    return FixedScalableVFPair::getNone();
  }

  // Now try the tail folding

  // Invalidate interleave groups that require an epilogue if we can't mask
  // the interleave-group.
  if (!useMaskedInterleavedAccesses(TTI)) {
    assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
           "No decisions should have been taken at this point");
    // Note: There is no need to invalidate any cost modeling decisions here, as
    // none were taken so far.
    InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
  }

  FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF);
  // Avoid tail folding if the trip count is known to be a multiple of any VF
  // we chose.
  // FIXME: The condition below pessimises the case for fixed-width vectors,
  // when scalable VFs are also candidates for vectorization.
  if (MaxFactors.FixedVF.isVector() && !MaxFactors.ScalableVF) {
    ElementCount MaxFixedVF = MaxFactors.FixedVF;
    assert((UserVF.isNonZero() || isPowerOf2_32(MaxFixedVF.getFixedValue())) &&
           "MaxFixedVF must be a power of 2");
    unsigned MaxVFtimesIC = UserIC ? MaxFixedVF.getFixedValue() * UserIC
                                   : MaxFixedVF.getFixedValue();
    ScalarEvolution *SE = PSE.getSE();
    const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
    const SCEV *ExitCount = SE->getAddExpr(
        BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
    const SCEV *Rem = SE->getURemExpr(
        SE->applyLoopGuards(ExitCount, TheLoop),
        SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
    if (Rem->isZero()) {
      // Accept MaxFixedVF if we do not have a tail.
      LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
      return MaxFactors;
    }
  }

  // For scalable vectors, don't use tail folding as this is currently not yet
  // supported. The code is likely to have ended up here if the tripcount is
  // low, in which case it makes sense not to use scalable vectors.
  if (MaxFactors.ScalableVF.isVector())
    MaxFactors.ScalableVF = ElementCount::getScalable(0);

  // If we don't know the precise trip count, or if the trip count that we
  // found modulo the vectorization factor is not zero, try to fold the tail
  // by masking.
  // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
  if (Legal->prepareToFoldTailByMasking()) {
    FoldTailByMasking = true;
    return MaxFactors;
  }

  // If there was a tail-folding hint/switch, but we can't fold the tail by
  // masking, fall back to a vectorization with a scalar epilogue.
  if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
    LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
                         "scalar epilogue instead.\n");
    ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
    return MaxFactors;
  }

  if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
    LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
    return FixedScalableVFPair::getNone();
  }

  if (TC == 0) {
    reportVectorizationFailure(
        "Unable to calculate the loop count due to complex control flow",
        "unable to calculate the loop count due to complex control flow",
        "UnknownLoopCountComplexCFG", ORE, TheLoop);
    return FixedScalableVFPair::getNone();
  }

  reportVectorizationFailure(
      "Cannot optimize for size and vectorize at the same time.",
      "cannot optimize for size and vectorize at the same time. "
      "Enable vectorization of this loop with '#pragma clang loop "
      "vectorize(enable)' when compiling with -Os/-Oz",
      "NoTailLoopWithOptForSize", ORE, TheLoop);
  return FixedScalableVFPair::getNone();
}

ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
    unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType,
    const ElementCount &MaxSafeVF) {
  bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
  TypeSize WidestRegister = TTI.getRegisterBitWidth(
      ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
                           : TargetTransformInfo::RGK_FixedWidthVector);

  // Convenience function to return the minimum of two ElementCounts.
  auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
    assert((LHS.isScalable() == RHS.isScalable()) &&
           "Scalable flags must match");
    return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS;
  };

  // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
  // Note that both WidestRegister and WidestType may not be powers of 2.
  auto MaxVectorElementCount = ElementCount::get(
      PowerOf2Floor(WidestRegister.getKnownMinSize() / WidestType),
      ComputeScalableMaxVF);
  MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
  LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
                    << (MaxVectorElementCount * WidestType) << " bits.\n");

  if (!MaxVectorElementCount) {
    LLVM_DEBUG(dbgs() << "LV: The target has no "
                      << (ComputeScalableMaxVF ? "scalable" : "fixed")
                      << " vector registers.\n");
    return ElementCount::getFixed(1);
  }

  const auto TripCountEC = ElementCount::getFixed(ConstTripCount);
  if (ConstTripCount &&
      ElementCount::isKnownLE(TripCountEC, MaxVectorElementCount) &&
      isPowerOf2_32(ConstTripCount)) {
    // We need to clamp the VF to be the ConstTripCount. There is no point in
    // choosing a higher viable VF as done in the loop below. If
    // MaxVectorElementCount is scalable, we only fall back on a fixed VF when
    // the TC is less than or equal to the known number of lanes.
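    // For example (illustrative): ConstTripCount == 8 with
    // MaxVectorElementCount == 16 clamps the VF to 8, since a wider VF would
    // leave lanes permanently unused.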
5949 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: " 5950 << ConstTripCount << "\n"); 5951 return TripCountEC; 5952 } 5953 5954 ElementCount MaxVF = MaxVectorElementCount; 5955 if (TTI.shouldMaximizeVectorBandwidth() || 5956 (MaximizeBandwidth && isScalarEpilogueAllowed())) { 5957 auto MaxVectorElementCountMaxBW = ElementCount::get( 5958 PowerOf2Floor(WidestRegister.getKnownMinSize() / SmallestType), 5959 ComputeScalableMaxVF); 5960 MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF); 5961 5962 // Collect all viable vectorization factors larger than the default MaxVF 5963 // (i.e. MaxVectorElementCount). 5964 SmallVector<ElementCount, 8> VFs; 5965 for (ElementCount VS = MaxVectorElementCount * 2; 5966 ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2) 5967 VFs.push_back(VS); 5968 5969 // For each VF calculate its register usage. 5970 auto RUs = calculateRegisterUsage(VFs); 5971 5972 // Select the largest VF which doesn't require more registers than existing 5973 // ones. 5974 for (int i = RUs.size() - 1; i >= 0; --i) { 5975 bool Selected = true; 5976 for (auto &pair : RUs[i].MaxLocalUsers) { 5977 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5978 if (pair.second > TargetNumRegisters) 5979 Selected = false; 5980 } 5981 if (Selected) { 5982 MaxVF = VFs[i]; 5983 break; 5984 } 5985 } 5986 if (ElementCount MinVF = 5987 TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) { 5988 if (ElementCount::isKnownLT(MaxVF, MinVF)) { 5989 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF 5990 << ") with target's minimum: " << MinVF << '\n'); 5991 MaxVF = MinVF; 5992 } 5993 } 5994 } 5995 return MaxVF; 5996 } 5997 5998 bool LoopVectorizationCostModel::isMoreProfitable( 5999 const VectorizationFactor &A, const VectorizationFactor &B) const { 6000 InstructionCost CostA = A.Cost; 6001 InstructionCost CostB = B.Cost; 6002 6003 unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop); 6004 6005 if (!A.Width.isScalable() && !B.Width.isScalable() && FoldTailByMasking && 6006 MaxTripCount) { 6007 // If we are folding the tail and the trip count is a known (possibly small) 6008 // constant, the trip count will be rounded up to an integer number of 6009 // iterations. The total cost will be PerIterationCost*ceil(TripCount/VF), 6010 // which we compare directly. When not folding the tail, the total cost will 6011 // be PerIterationCost*floor(TC/VF) + Scalar remainder cost, and so is 6012 // approximated with the per-lane cost below instead of using the tripcount 6013 // as here. 6014 auto RTCostA = CostA * divideCeil(MaxTripCount, A.Width.getFixedValue()); 6015 auto RTCostB = CostB * divideCeil(MaxTripCount, B.Width.getFixedValue()); 6016 return RTCostA < RTCostB; 6017 } 6018 6019 // When set to preferred, for now assume vscale may be larger than 1, so 6020 // that scalable vectorization is slightly favorable over fixed-width 6021 // vectorization. 
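  // For example (illustrative): A = {vscale x 4, cost 6} vs. B = {8, cost 14}
  // compares 6 * 8 <= 14 * 4, which holds, so the scalable factor is
  // considered at least as profitable here.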
6022 if (Hints->isScalableVectorizationPreferred()) 6023 if (A.Width.isScalable() && !B.Width.isScalable()) 6024 return (CostA * B.Width.getKnownMinValue()) <= 6025 (CostB * A.Width.getKnownMinValue()); 6026 6027 // To avoid the need for FP division: 6028 // (CostA / A.Width) < (CostB / B.Width) 6029 // <=> (CostA * B.Width) < (CostB * A.Width) 6030 return (CostA * B.Width.getKnownMinValue()) < 6031 (CostB * A.Width.getKnownMinValue()); 6032 } 6033 6034 VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor( 6035 const ElementCountSet &VFCandidates) { 6036 InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first; 6037 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n"); 6038 assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop"); 6039 assert(VFCandidates.count(ElementCount::getFixed(1)) && 6040 "Expected Scalar VF to be a candidate"); 6041 6042 const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost); 6043 VectorizationFactor ChosenFactor = ScalarCost; 6044 6045 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; 6046 if (ForceVectorization && VFCandidates.size() > 1) { 6047 // Ignore scalar width, because the user explicitly wants vectorization. 6048 // Initialize cost to max so that VF = 2 is, at least, chosen during cost 6049 // evaluation. 6050 ChosenFactor.Cost = InstructionCost::getMax(); 6051 } 6052 6053 SmallVector<InstructionVFPair> InvalidCosts; 6054 for (const auto &i : VFCandidates) { 6055 // The cost for scalar VF=1 is already calculated, so ignore it. 6056 if (i.isScalar()) 6057 continue; 6058 6059 VectorizationCostTy C = expectedCost(i, &InvalidCosts); 6060 VectorizationFactor Candidate(i, C.first); 6061 LLVM_DEBUG( 6062 dbgs() << "LV: Vector loop of width " << i << " costs: " 6063 << (Candidate.Cost / Candidate.Width.getKnownMinValue()) 6064 << (i.isScalable() ? " (assuming a minimum vscale of 1)" : "") 6065 << ".\n"); 6066 6067 if (!C.second && !ForceVectorization) { 6068 LLVM_DEBUG( 6069 dbgs() << "LV: Not considering vector loop of width " << i 6070 << " because it will not generate any vector instructions.\n"); 6071 continue; 6072 } 6073 6074 // If profitable add it to ProfitableVF list. 6075 if (isMoreProfitable(Candidate, ScalarCost)) 6076 ProfitableVFs.push_back(Candidate); 6077 6078 if (isMoreProfitable(Candidate, ChosenFactor)) 6079 ChosenFactor = Candidate; 6080 } 6081 6082 // Emit a report of VFs with invalid costs in the loop. 6083 if (!InvalidCosts.empty()) { 6084 // Group the remarks per instruction, keeping the instruction order from 6085 // InvalidCosts. 6086 std::map<Instruction *, unsigned> Numbering; 6087 unsigned I = 0; 6088 for (auto &Pair : InvalidCosts) 6089 if (!Numbering.count(Pair.first)) 6090 Numbering[Pair.first] = I++; 6091 6092 // Sort the list, first on instruction(number) then on VF. 
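    // For example (illustrative): [(load, vf2), (store, vf1), (load, vf1)]
    // sorts to [(load, vf1), (load, vf2), (store, vf1)], because the load was
    // recorded first in InvalidCosts.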
6093 llvm::sort(InvalidCosts, 6094 [&Numbering](InstructionVFPair &A, InstructionVFPair &B) { 6095 if (Numbering[A.first] != Numbering[B.first]) 6096 return Numbering[A.first] < Numbering[B.first]; 6097 ElementCountComparator ECC; 6098 return ECC(A.second, B.second); 6099 }); 6100 6101 // For a list of ordered instruction-vf pairs: 6102 // [(load, vf1), (load, vf2), (store, vf1)] 6103 // Group the instructions together to emit separate remarks for: 6104 // load (vf1, vf2) 6105 // store (vf1) 6106 auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts); 6107 auto Subset = ArrayRef<InstructionVFPair>(); 6108 do { 6109 if (Subset.empty()) 6110 Subset = Tail.take_front(1); 6111 6112 Instruction *I = Subset.front().first; 6113 6114 // If the next instruction is different, or if there are no other pairs, 6115 // emit a remark for the collated subset. e.g. 6116 // [(load, vf1), (load, vf2))] 6117 // to emit: 6118 // remark: invalid costs for 'load' at VF=(vf, vf2) 6119 if (Subset == Tail || Tail[Subset.size()].first != I) { 6120 std::string OutString; 6121 raw_string_ostream OS(OutString); 6122 assert(!Subset.empty() && "Unexpected empty range"); 6123 OS << "Instruction with invalid costs prevented vectorization at VF=("; 6124 for (auto &Pair : Subset) 6125 OS << (Pair.second == Subset.front().second ? "" : ", ") 6126 << Pair.second; 6127 OS << "):"; 6128 if (auto *CI = dyn_cast<CallInst>(I)) 6129 OS << " call to " << CI->getCalledFunction()->getName(); 6130 else 6131 OS << " " << I->getOpcodeName(); 6132 OS.flush(); 6133 reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I); 6134 Tail = Tail.drop_front(Subset.size()); 6135 Subset = {}; 6136 } else 6137 // Grow the subset by one element 6138 Subset = Tail.take_front(Subset.size() + 1); 6139 } while (!Tail.empty()); 6140 } 6141 6142 if (!EnableCondStoresVectorization && NumPredStores) { 6143 reportVectorizationFailure("There are conditional stores.", 6144 "store that is conditionally executed prevents vectorization", 6145 "ConditionalStore", ORE, TheLoop); 6146 ChosenFactor = ScalarCost; 6147 } 6148 6149 LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() && 6150 ChosenFactor.Cost >= ScalarCost.Cost) dbgs() 6151 << "LV: Vectorization seems to be not beneficial, " 6152 << "but was forced by a user.\n"); 6153 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n"); 6154 return ChosenFactor; 6155 } 6156 6157 bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization( 6158 const Loop &L, ElementCount VF) const { 6159 // Cross iteration phis such as reductions need special handling and are 6160 // currently unsupported. 6161 if (any_of(L.getHeader()->phis(), [&](PHINode &Phi) { 6162 return Legal->isFirstOrderRecurrence(&Phi) || 6163 Legal->isReductionVariable(&Phi); 6164 })) 6165 return false; 6166 6167 // Phis with uses outside of the loop require special handling and are 6168 // currently unsupported. 6169 for (auto &Entry : Legal->getInductionVars()) { 6170 // Look for uses of the value of the induction at the last iteration. 6171 Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch()); 6172 for (User *U : PostInc->users()) 6173 if (!L.contains(cast<Instruction>(U))) 6174 return false; 6175 // Look for uses of penultimate value of the induction. 6176 for (User *U : Entry.first->users()) 6177 if (!L.contains(cast<Instruction>(U))) 6178 return false; 6179 } 6180 6181 // Induction variables that are widened require special handling that is 6182 // currently not supported. 
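  // For example (illustrative): an induction whose value is stored directly
  // to memory is typically widened rather than kept scalar, so such a loop is
  // rejected as an epilogue-vectorization candidate by the check below.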
  if (any_of(Legal->getInductionVars(), [&](auto &Entry) {
        return !(this->isScalarAfterVectorization(Entry.first, VF) ||
                 this->isProfitableToScalarize(Entry.first, VF));
      }))
    return false;

  // Epilogue vectorization code has not been audited to ensure it handles
  // non-latch exits properly. It may be fine, but it needs to be audited and
  // tested.
  if (L.getExitingBlock() != L.getLoopLatch())
    return false;

  return true;
}

bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
    const ElementCount VF) const {
  // FIXME: We need a much better cost-model to take different parameters such
  // as register pressure, code size increase and cost of extra branches into
  // account. For now we apply a very crude heuristic and only consider loops
  // with vectorization factors larger than a certain value.
  // We also consider epilogue vectorization unprofitable for targets that don't
  // consider interleaving beneficial (e.g. MVE).
  if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1)
    return false;
  if (VF.getFixedValue() >= EpilogueVectorizationMinVF)
    return true;
  return false;
}

VectorizationFactor
LoopVectorizationCostModel::selectEpilogueVectorizationFactor(
    const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) {
  VectorizationFactor Result = VectorizationFactor::Disabled();
  if (!EnableEpilogueVectorization) {
    LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";);
    return Result;
  }

  if (!isScalarEpilogueAllowed()) {
    LLVM_DEBUG(
        dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is "
                  "allowed.\n";);
    return Result;
  }

  // FIXME: This can be fixed for scalable vectors later, because at this stage
  // the LoopVectorizer will only consider vectorizing a loop with scalable
  // vectors when the loop has a hint to enable vectorization for a given VF.
  if (MainLoopVF.isScalable()) {
    LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization for scalable vectors not "
                         "yet supported.\n");
    return Result;
  }

  // Not really a cost consideration, but check for unsupported cases here to
  // simplify the logic.
6240 if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) { 6241 LLVM_DEBUG( 6242 dbgs() << "LEV: Unable to vectorize epilogue because the loop is " 6243 "not a supported candidate.\n";); 6244 return Result; 6245 } 6246 6247 if (EpilogueVectorizationForceVF > 1) { 6248 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";); 6249 if (LVP.hasPlanWithVFs( 6250 {MainLoopVF, ElementCount::getFixed(EpilogueVectorizationForceVF)})) 6251 return {ElementCount::getFixed(EpilogueVectorizationForceVF), 0}; 6252 else { 6253 LLVM_DEBUG( 6254 dbgs() 6255 << "LEV: Epilogue vectorization forced factor is not viable.\n";); 6256 return Result; 6257 } 6258 } 6259 6260 if (TheLoop->getHeader()->getParent()->hasOptSize() || 6261 TheLoop->getHeader()->getParent()->hasMinSize()) { 6262 LLVM_DEBUG( 6263 dbgs() 6264 << "LEV: Epilogue vectorization skipped due to opt for size.\n";); 6265 return Result; 6266 } 6267 6268 if (!isEpilogueVectorizationProfitable(MainLoopVF)) 6269 return Result; 6270 6271 for (auto &NextVF : ProfitableVFs) 6272 if (ElementCount::isKnownLT(NextVF.Width, MainLoopVF) && 6273 (Result.Width.getFixedValue() == 1 || 6274 isMoreProfitable(NextVF, Result)) && 6275 LVP.hasPlanWithVFs({MainLoopVF, NextVF.Width})) 6276 Result = NextVF; 6277 6278 if (Result != VectorizationFactor::Disabled()) 6279 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = " 6280 << Result.Width.getFixedValue() << "\n";); 6281 return Result; 6282 } 6283 6284 std::pair<unsigned, unsigned> 6285 LoopVectorizationCostModel::getSmallestAndWidestTypes() { 6286 unsigned MinWidth = -1U; 6287 unsigned MaxWidth = 8; 6288 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 6289 for (Type *T : ElementTypesInLoop) { 6290 MinWidth = std::min<unsigned>( 6291 MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize()); 6292 MaxWidth = std::max<unsigned>( 6293 MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize()); 6294 } 6295 return {MinWidth, MaxWidth}; 6296 } 6297 6298 void LoopVectorizationCostModel::collectElementTypesForWidening() { 6299 ElementTypesInLoop.clear(); 6300 // For each block. 6301 for (BasicBlock *BB : TheLoop->blocks()) { 6302 // For each instruction in the loop. 6303 for (Instruction &I : BB->instructionsWithoutDebug()) { 6304 Type *T = I.getType(); 6305 6306 // Skip ignored values. 6307 if (ValuesToIgnore.count(&I)) 6308 continue; 6309 6310 // Only examine Loads, Stores and PHINodes. 6311 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I)) 6312 continue; 6313 6314 // Examine PHI nodes that are reduction variables. Update the type to 6315 // account for the recurrence type. 6316 if (auto *PN = dyn_cast<PHINode>(&I)) { 6317 if (!Legal->isReductionVariable(PN)) 6318 continue; 6319 const RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[PN]; 6320 if (PreferInLoopReductions || useOrderedReductions(RdxDesc) || 6321 TTI.preferInLoopReduction(RdxDesc.getOpcode(), 6322 RdxDesc.getRecurrenceType(), 6323 TargetTransformInfo::ReductionFlags())) 6324 continue; 6325 T = RdxDesc.getRecurrenceType(); 6326 } 6327 6328 // Examine the stored values. 6329 if (auto *ST = dyn_cast<StoreInst>(&I)) 6330 T = ST->getValueOperand()->getType(); 6331 6332 // Ignore loaded pointer types and stored pointer types that are not 6333 // vectorizable. 6334 // 6335 // FIXME: The check here attempts to predict whether a load or store will 6336 // be vectorized. We only know this for certain after a VF has 6337 // been selected. 
Here, we assume that if an access can be 6338 // vectorized, it will be. We should also look at extending this 6339 // optimization to non-pointer types. 6340 // 6341 if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) && 6342 !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I)) 6343 continue; 6344 6345 ElementTypesInLoop.insert(T); 6346 } 6347 } 6348 } 6349 6350 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, 6351 unsigned LoopCost) { 6352 // -- The interleave heuristics -- 6353 // We interleave the loop in order to expose ILP and reduce the loop overhead. 6354 // There are many micro-architectural considerations that we can't predict 6355 // at this level. For example, frontend pressure (on decode or fetch) due to 6356 // code size, or the number and capabilities of the execution ports. 6357 // 6358 // We use the following heuristics to select the interleave count: 6359 // 1. If the code has reductions, then we interleave to break the cross 6360 // iteration dependency. 6361 // 2. If the loop is really small, then we interleave to reduce the loop 6362 // overhead. 6363 // 3. We don't interleave if we think that we will spill registers to memory 6364 // due to the increased register pressure. 6365 6366 if (!isScalarEpilogueAllowed()) 6367 return 1; 6368 6369 // We used the distance for the interleave count. 6370 if (Legal->getMaxSafeDepDistBytes() != -1U) 6371 return 1; 6372 6373 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); 6374 const bool HasReductions = !Legal->getReductionVars().empty(); 6375 // Do not interleave loops with a relatively small known or estimated trip 6376 // count. But we will interleave when InterleaveSmallLoopScalarReduction is 6377 // enabled, and the code has scalar reductions(HasReductions && VF = 1), 6378 // because with the above conditions interleaving can expose ILP and break 6379 // cross iteration dependences for reductions. 6380 if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) && 6381 !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar())) 6382 return 1; 6383 6384 RegisterUsage R = calculateRegisterUsage({VF})[0]; 6385 // We divide by these constants so assume that we have at least one 6386 // instruction that uses at least one register. 6387 for (auto& pair : R.MaxLocalUsers) { 6388 pair.second = std::max(pair.second, 1U); 6389 } 6390 6391 // We calculate the interleave count using the following formula. 6392 // Subtract the number of loop invariants from the number of available 6393 // registers. These registers are used by all of the interleaved instances. 6394 // Next, divide the remaining registers by the number of registers that is 6395 // required by the loop, in order to estimate how many parallel instances 6396 // fit without causing spills. All of this is rounded down if necessary to be 6397 // a power of two. We want power of two interleave count to simplify any 6398 // addressing operations or alignment considerations. 6399 // We also want power of two interleave counts to ensure that the induction 6400 // variable of the vector loop wraps to zero, when tail is folded by masking; 6401 // this currently happens when OptForSize, in which case IC is set to 1 above. 
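  // For example (illustrative numbers): with 32 registers in a class, 2 of
  // them consumed by loop-invariant values and a max local usage of 6, the
  // formula below gives IC = PowerOf2Floor((32 - 2) / 6) = 4.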
6402 unsigned IC = UINT_MAX; 6403 6404 for (auto& pair : R.MaxLocalUsers) { 6405 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 6406 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters 6407 << " registers of " 6408 << TTI.getRegisterClassName(pair.first) << " register class\n"); 6409 if (VF.isScalar()) { 6410 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) 6411 TargetNumRegisters = ForceTargetNumScalarRegs; 6412 } else { 6413 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) 6414 TargetNumRegisters = ForceTargetNumVectorRegs; 6415 } 6416 unsigned MaxLocalUsers = pair.second; 6417 unsigned LoopInvariantRegs = 0; 6418 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) 6419 LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; 6420 6421 unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers); 6422 // Don't count the induction variable as interleaved. 6423 if (EnableIndVarRegisterHeur) { 6424 TmpIC = 6425 PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) / 6426 std::max(1U, (MaxLocalUsers - 1))); 6427 } 6428 6429 IC = std::min(IC, TmpIC); 6430 } 6431 6432 // Clamp the interleave ranges to reasonable counts. 6433 unsigned MaxInterleaveCount = 6434 TTI.getMaxInterleaveFactor(VF.getKnownMinValue()); 6435 6436 // Check if the user has overridden the max. 6437 if (VF.isScalar()) { 6438 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) 6439 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; 6440 } else { 6441 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) 6442 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; 6443 } 6444 6445 // If trip count is known or estimated compile time constant, limit the 6446 // interleave count to be less than the trip count divided by VF, provided it 6447 // is at least 1. 6448 // 6449 // For scalable vectors we can't know if interleaving is beneficial. It may 6450 // not be beneficial for small loops if none of the lanes in the second vector 6451 // iterations is enabled. However, for larger loops, there is likely to be a 6452 // similar benefit as for fixed-width vectors. For now, we choose to leave 6453 // the InterleaveCount as if vscale is '1', although if some information about 6454 // the vector is known (e.g. min vector size), we can make a better decision. 6455 if (BestKnownTC) { 6456 MaxInterleaveCount = 6457 std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount); 6458 // Make sure MaxInterleaveCount is greater than 0. 6459 MaxInterleaveCount = std::max(1u, MaxInterleaveCount); 6460 } 6461 6462 assert(MaxInterleaveCount > 0 && 6463 "Maximum interleave count must be greater than 0"); 6464 6465 // Clamp the calculated IC to be between the 1 and the max interleave count 6466 // that the target and trip count allows. 6467 if (IC > MaxInterleaveCount) 6468 IC = MaxInterleaveCount; 6469 else 6470 // Make sure IC is greater than 0. 6471 IC = std::max(1u, IC); 6472 6473 assert(IC > 0 && "Interleave count must be greater than 0."); 6474 6475 // If we did not calculate the cost for VF (because the user selected the VF) 6476 // then we calculate the cost of VF here. 
6477 if (LoopCost == 0) { 6478 InstructionCost C = expectedCost(VF).first; 6479 assert(C.isValid() && "Expected to have chosen a VF with valid cost"); 6480 LoopCost = *C.getValue(); 6481 } 6482 6483 assert(LoopCost && "Non-zero loop cost expected"); 6484 6485 // Interleave if we vectorized this loop and there is a reduction that could 6486 // benefit from interleaving. 6487 if (VF.isVector() && HasReductions) { 6488 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); 6489 return IC; 6490 } 6491 6492 // Note that if we've already vectorized the loop we will have done the 6493 // runtime check and so interleaving won't require further checks. 6494 bool InterleavingRequiresRuntimePointerCheck = 6495 (VF.isScalar() && Legal->getRuntimePointerChecking()->Need); 6496 6497 // We want to interleave small loops in order to reduce the loop overhead and 6498 // potentially expose ILP opportunities. 6499 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n' 6500 << "LV: IC is " << IC << '\n' 6501 << "LV: VF is " << VF << '\n'); 6502 const bool AggressivelyInterleaveReductions = 6503 TTI.enableAggressiveInterleaving(HasReductions); 6504 if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) { 6505 // We assume that the cost overhead is 1 and we use the cost model 6506 // to estimate the cost of the loop and interleave until the cost of the 6507 // loop overhead is about 5% of the cost of the loop. 6508 unsigned SmallIC = 6509 std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost)); 6510 6511 // Interleave until store/load ports (estimated by max interleave count) are 6512 // saturated. 6513 unsigned NumStores = Legal->getNumStores(); 6514 unsigned NumLoads = Legal->getNumLoads(); 6515 unsigned StoresIC = IC / (NumStores ? NumStores : 1); 6516 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1); 6517 6518 // If we have a scalar reduction (vector reductions are already dealt with 6519 // by this point), we can increase the critical path length if the loop 6520 // we're interleaving is inside another loop. For tree-wise reductions 6521 // set the limit to 2, and for ordered reductions it's best to disable 6522 // interleaving entirely. 6523 if (HasReductions && TheLoop->getLoopDepth() > 1) { 6524 bool HasOrderedReductions = 6525 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 6526 const RecurrenceDescriptor &RdxDesc = Reduction.second; 6527 return RdxDesc.isOrdered(); 6528 }); 6529 if (HasOrderedReductions) { 6530 LLVM_DEBUG( 6531 dbgs() << "LV: Not interleaving scalar ordered reductions.\n"); 6532 return 1; 6533 } 6534 6535 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC); 6536 SmallIC = std::min(SmallIC, F); 6537 StoresIC = std::min(StoresIC, F); 6538 LoadsIC = std::min(LoadsIC, F); 6539 } 6540 6541 if (EnableLoadStoreRuntimeInterleave && 6542 std::max(StoresIC, LoadsIC) > SmallIC) { 6543 LLVM_DEBUG( 6544 dbgs() << "LV: Interleaving to saturate store or load ports.\n"); 6545 return std::max(StoresIC, LoadsIC); 6546 } 6547 6548 // If there are scalar reductions and TTI has enabled aggressive 6549 // interleaving for reductions, we will interleave to expose ILP. 6550 if (InterleaveSmallLoopScalarReduction && VF.isScalar() && 6551 AggressivelyInterleaveReductions) { 6552 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 6553 // Interleave no less than SmallIC but not as aggressive as the normal IC 6554 // to satisfy the rare situation when resources are too limited. 
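      // For example (illustrative): IC == 8 with SmallIC == 2 yields an
      // interleave count of max(8 / 2, 2) == 4.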
6555 return std::max(IC / 2, SmallIC); 6556 } else { 6557 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n"); 6558 return SmallIC; 6559 } 6560 } 6561 6562 // Interleave if this is a large loop (small loops are already dealt with by 6563 // this point) that could benefit from interleaving. 6564 if (AggressivelyInterleaveReductions) { 6565 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 6566 return IC; 6567 } 6568 6569 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n"); 6570 return 1; 6571 } 6572 6573 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8> 6574 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) { 6575 // This function calculates the register usage by measuring the highest number 6576 // of values that are alive at a single location. Obviously, this is a very 6577 // rough estimation. We scan the loop in a topological order in order and 6578 // assign a number to each instruction. We use RPO to ensure that defs are 6579 // met before their users. We assume that each instruction that has in-loop 6580 // users starts an interval. We record every time that an in-loop value is 6581 // used, so we have a list of the first and last occurrences of each 6582 // instruction. Next, we transpose this data structure into a multi map that 6583 // holds the list of intervals that *end* at a specific location. This multi 6584 // map allows us to perform a linear search. We scan the instructions linearly 6585 // and record each time that a new interval starts, by placing it in a set. 6586 // If we find this value in the multi-map then we remove it from the set. 6587 // The max register usage is the maximum size of the set. 6588 // We also search for instructions that are defined outside the loop, but are 6589 // used inside the loop. We need this number separately from the max-interval 6590 // usage number because when we unroll, loop-invariant values do not take 6591 // more register. 6592 LoopBlocksDFS DFS(TheLoop); 6593 DFS.perform(LI); 6594 6595 RegisterUsage RU; 6596 6597 // Each 'key' in the map opens a new interval. The values 6598 // of the map are the index of the 'last seen' usage of the 6599 // instruction that is the key. 6600 using IntervalMap = DenseMap<Instruction *, unsigned>; 6601 6602 // Maps instruction to its index. 6603 SmallVector<Instruction *, 64> IdxToInstr; 6604 // Marks the end of each interval. 6605 IntervalMap EndPoint; 6606 // Saves the list of instruction indices that are used in the loop. 6607 SmallPtrSet<Instruction *, 8> Ends; 6608 // Saves the list of values that are used in the loop but are 6609 // defined outside the loop, such as arguments and constants. 6610 SmallPtrSet<Value *, 8> LoopInvariants; 6611 6612 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 6613 for (Instruction &I : BB->instructionsWithoutDebug()) { 6614 IdxToInstr.push_back(&I); 6615 6616 // Save the end location of each USE. 6617 for (Value *U : I.operands()) { 6618 auto *Instr = dyn_cast<Instruction>(U); 6619 6620 // Ignore non-instruction values such as arguments, constants, etc. 6621 if (!Instr) 6622 continue; 6623 6624 // If this instruction is outside the loop then record it and continue. 6625 if (!TheLoop->contains(Instr)) { 6626 LoopInvariants.insert(Instr); 6627 continue; 6628 } 6629 6630 // Overwrite previous end points. 6631 EndPoint[Instr] = IdxToInstr.size(); 6632 Ends.insert(Instr); 6633 } 6634 } 6635 } 6636 6637 // Saves the list of intervals that end with the index in 'key'. 
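  // For example (illustrative): EndPoint == {A -> 3, B -> 3, C -> 5}
  // transposes to {3 -> [A, B], 5 -> [C]}, so both A and B are closed when
  // the linear scan reaches index 3.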
6638 using InstrList = SmallVector<Instruction *, 2>; 6639 DenseMap<unsigned, InstrList> TransposeEnds; 6640 6641 // Transpose the EndPoints to a list of values that end at each index. 6642 for (auto &Interval : EndPoint) 6643 TransposeEnds[Interval.second].push_back(Interval.first); 6644 6645 SmallPtrSet<Instruction *, 8> OpenIntervals; 6646 SmallVector<RegisterUsage, 8> RUs(VFs.size()); 6647 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size()); 6648 6649 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); 6650 6651 // A lambda that gets the register usage for the given type and VF. 6652 const auto &TTICapture = TTI; 6653 auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned { 6654 if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty)) 6655 return 0; 6656 InstructionCost::CostType RegUsage = 6657 *TTICapture.getRegUsageForType(VectorType::get(Ty, VF)).getValue(); 6658 assert(RegUsage >= 0 && RegUsage <= std::numeric_limits<unsigned>::max() && 6659 "Nonsensical values for register usage."); 6660 return RegUsage; 6661 }; 6662 6663 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { 6664 Instruction *I = IdxToInstr[i]; 6665 6666 // Remove all of the instructions that end at this location. 6667 InstrList &List = TransposeEnds[i]; 6668 for (Instruction *ToRemove : List) 6669 OpenIntervals.erase(ToRemove); 6670 6671 // Ignore instructions that are never used within the loop. 6672 if (!Ends.count(I)) 6673 continue; 6674 6675 // Skip ignored values. 6676 if (ValuesToIgnore.count(I)) 6677 continue; 6678 6679 // For each VF find the maximum usage of registers. 6680 for (unsigned j = 0, e = VFs.size(); j < e; ++j) { 6681 // Count the number of live intervals. 6682 SmallMapVector<unsigned, unsigned, 4> RegUsage; 6683 6684 if (VFs[j].isScalar()) { 6685 for (auto Inst : OpenIntervals) { 6686 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6687 if (RegUsage.find(ClassID) == RegUsage.end()) 6688 RegUsage[ClassID] = 1; 6689 else 6690 RegUsage[ClassID] += 1; 6691 } 6692 } else { 6693 collectUniformsAndScalars(VFs[j]); 6694 for (auto Inst : OpenIntervals) { 6695 // Skip ignored values for VF > 1. 6696 if (VecValuesToIgnore.count(Inst)) 6697 continue; 6698 if (isScalarAfterVectorization(Inst, VFs[j])) { 6699 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6700 if (RegUsage.find(ClassID) == RegUsage.end()) 6701 RegUsage[ClassID] = 1; 6702 else 6703 RegUsage[ClassID] += 1; 6704 } else { 6705 unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType()); 6706 if (RegUsage.find(ClassID) == RegUsage.end()) 6707 RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]); 6708 else 6709 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); 6710 } 6711 } 6712 } 6713 6714 for (auto& pair : RegUsage) { 6715 if (MaxUsages[j].find(pair.first) != MaxUsages[j].end()) 6716 MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second); 6717 else 6718 MaxUsages[j][pair.first] = pair.second; 6719 } 6720 } 6721 6722 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " 6723 << OpenIntervals.size() << '\n'); 6724 6725 // Add the current instruction to the list of open intervals. 6726 OpenIntervals.insert(I); 6727 } 6728 6729 for (unsigned i = 0, e = VFs.size(); i < e; ++i) { 6730 SmallMapVector<unsigned, unsigned, 4> Invariant; 6731 6732 for (auto Inst : LoopInvariants) { 6733 unsigned Usage = 6734 VFs[i].isScalar() ? 
1 : GetRegUsage(Inst->getType(), VFs[i]); 6735 unsigned ClassID = 6736 TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType()); 6737 if (Invariant.find(ClassID) == Invariant.end()) 6738 Invariant[ClassID] = Usage; 6739 else 6740 Invariant[ClassID] += Usage; 6741 } 6742 6743 LLVM_DEBUG({ 6744 dbgs() << "LV(REG): VF = " << VFs[i] << '\n'; 6745 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size() 6746 << " item\n"; 6747 for (const auto &pair : MaxUsages[i]) { 6748 dbgs() << "LV(REG): RegisterClass: " 6749 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6750 << " registers\n"; 6751 } 6752 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size() 6753 << " item\n"; 6754 for (const auto &pair : Invariant) { 6755 dbgs() << "LV(REG): RegisterClass: " 6756 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6757 << " registers\n"; 6758 } 6759 }); 6760 6761 RU.LoopInvariantRegs = Invariant; 6762 RU.MaxLocalUsers = MaxUsages[i]; 6763 RUs[i] = RU; 6764 } 6765 6766 return RUs; 6767 } 6768 6769 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){ 6770 // TODO: Cost model for emulated masked load/store is completely 6771 // broken. This hack guides the cost model to use an artificially 6772 // high enough value to practically disable vectorization with such 6773 // operations, except where previously deployed legality hack allowed 6774 // using very low cost values. This is to avoid regressions coming simply 6775 // from moving "masked load/store" check from legality to cost model. 6776 // Masked Load/Gather emulation was previously never allowed. 6777 // Limited number of Masked Store/Scatter emulation was allowed. 6778 assert(isPredicatedInst(I) && 6779 "Expecting a scalar emulated instruction"); 6780 return isa<LoadInst>(I) || 6781 (isa<StoreInst>(I) && 6782 NumPredStores > NumberOfStoresToPredicate); 6783 } 6784 6785 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) { 6786 // If we aren't vectorizing the loop, or if we've already collected the 6787 // instructions to scalarize, there's nothing to do. Collection may already 6788 // have occurred if we have a user-selected VF and are now computing the 6789 // expected cost for interleaving. 6790 if (VF.isScalar() || VF.isZero() || 6791 InstsToScalarize.find(VF) != InstsToScalarize.end()) 6792 return; 6793 6794 // Initialize a mapping for VF in InstsToScalalarize. If we find that it's 6795 // not profitable to scalarize any instructions, the presence of VF in the 6796 // map will indicate that we've analyzed it already. 6797 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF]; 6798 6799 // Find all the instructions that are scalar with predication in the loop and 6800 // determine if it would be better to not if-convert the blocks they are in. 6801 // If so, we also record the instructions to scalarize. 6802 for (BasicBlock *BB : TheLoop->blocks()) { 6803 if (!blockNeedsPredication(BB)) 6804 continue; 6805 for (Instruction &I : *BB) 6806 if (isScalarWithPredication(&I)) { 6807 ScalarCostsTy ScalarCosts; 6808 // Do not apply discount if scalable, because that would lead to 6809 // invalid scalarization costs. 6810 // Do not apply discount logic if hacked cost is needed 6811 // for emulated masked memrefs. 6812 if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I) && 6813 computePredInstDiscount(&I, ScalarCosts, VF) >= 0) 6814 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end()); 6815 // Remember that BB will remain after vectorization. 
6816 PredicatedBBsAfterVectorization.insert(BB); 6817 } 6818 } 6819 } 6820 6821 int LoopVectorizationCostModel::computePredInstDiscount( 6822 Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) { 6823 assert(!isUniformAfterVectorization(PredInst, VF) && 6824 "Instruction marked uniform-after-vectorization will be predicated"); 6825 6826 // Initialize the discount to zero, meaning that the scalar version and the 6827 // vector version cost the same. 6828 InstructionCost Discount = 0; 6829 6830 // Holds instructions to analyze. The instructions we visit are mapped in 6831 // ScalarCosts. Those instructions are the ones that would be scalarized if 6832 // we find that the scalar version costs less. 6833 SmallVector<Instruction *, 8> Worklist; 6834 6835 // Returns true if the given instruction can be scalarized. 6836 auto canBeScalarized = [&](Instruction *I) -> bool { 6837 // We only attempt to scalarize instructions forming a single-use chain 6838 // from the original predicated block that would otherwise be vectorized. 6839 // Although not strictly necessary, we give up on instructions we know will 6840 // already be scalar to avoid traversing chains that are unlikely to be 6841 // beneficial. 6842 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() || 6843 isScalarAfterVectorization(I, VF)) 6844 return false; 6845 6846 // If the instruction is scalar with predication, it will be analyzed 6847 // separately. We ignore it within the context of PredInst. 6848 if (isScalarWithPredication(I)) 6849 return false; 6850 6851 // If any of the instruction's operands are uniform after vectorization, 6852 // the instruction cannot be scalarized. This prevents, for example, a 6853 // masked load from being scalarized. 6854 // 6855 // We assume we will only emit a value for lane zero of an instruction 6856 // marked uniform after vectorization, rather than VF identical values. 6857 // Thus, if we scalarize an instruction that uses a uniform, we would 6858 // create uses of values corresponding to the lanes we aren't emitting code 6859 // for. This behavior can be changed by allowing getScalarValue to clone 6860 // the lane zero values for uniforms rather than asserting. 6861 for (Use &U : I->operands()) 6862 if (auto *J = dyn_cast<Instruction>(U.get())) 6863 if (isUniformAfterVectorization(J, VF)) 6864 return false; 6865 6866 // Otherwise, we can scalarize the instruction. 6867 return true; 6868 }; 6869 6870 // Compute the expected cost discount from scalarizing the entire expression 6871 // feeding the predicated instruction. We currently only consider expressions 6872 // that are single-use instruction chains. 6873 Worklist.push_back(PredInst); 6874 while (!Worklist.empty()) { 6875 Instruction *I = Worklist.pop_back_val(); 6876 6877 // If we've already analyzed the instruction, there's nothing to do. 6878 if (ScalarCosts.find(I) != ScalarCosts.end()) 6879 continue; 6880 6881 // Compute the cost of the vector instruction. Note that this cost already 6882 // includes the scalarization overhead of the predicated instruction. 6883 InstructionCost VectorCost = getInstructionCost(I, VF).first; 6884 6885 // Compute the cost of the scalarized instruction. This cost is the cost of 6886 // the instruction as if it wasn't if-converted and instead remained in the 6887 // predicated block. We will scale this cost by block probability after 6888 // computing the scalarization overhead. 
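// Rough illustration (hypothetical unit costs): with VF = 4 and a per-lane
// scalar cost of 1, the base scalar estimate computed below is 4 * 1 = 4;
// after the insert/extract and phi overheads are added, the total is divided
// by the reciprocal block probability (assumed here to be 2) to reflect that
// the predicated block only executes part of the time.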
6889 InstructionCost ScalarCost = 6890 VF.getFixedValue() * 6891 getInstructionCost(I, ElementCount::getFixed(1)).first; 6892 6893 // Compute the scalarization overhead of needed insertelement instructions 6894 // and phi nodes. 6895 if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) { 6896 ScalarCost += TTI.getScalarizationOverhead( 6897 cast<VectorType>(ToVectorTy(I->getType(), VF)), 6898 APInt::getAllOnesValue(VF.getFixedValue()), true, false); 6899 ScalarCost += 6900 VF.getFixedValue() * 6901 TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput); 6902 } 6903 6904 // Compute the scalarization overhead of needed extractelement 6905 // instructions. For each of the instruction's operands, if the operand can 6906 // be scalarized, add it to the worklist; otherwise, account for the 6907 // overhead. 6908 for (Use &U : I->operands()) 6909 if (auto *J = dyn_cast<Instruction>(U.get())) { 6910 assert(VectorType::isValidElementType(J->getType()) && 6911 "Instruction has non-scalar type"); 6912 if (canBeScalarized(J)) 6913 Worklist.push_back(J); 6914 else if (needsExtract(J, VF)) { 6915 ScalarCost += TTI.getScalarizationOverhead( 6916 cast<VectorType>(ToVectorTy(J->getType(), VF)), 6917 APInt::getAllOnesValue(VF.getFixedValue()), false, true); 6918 } 6919 } 6920 6921 // Scale the total scalar cost by block probability. 6922 ScalarCost /= getReciprocalPredBlockProb(); 6923 6924 // Compute the discount. A non-negative discount means the vector version 6925 // of the instruction costs more, and scalarizing would be beneficial. 6926 Discount += VectorCost - ScalarCost; 6927 ScalarCosts[I] = ScalarCost; 6928 } 6929 6930 return *Discount.getValue(); 6931 } 6932 6933 LoopVectorizationCostModel::VectorizationCostTy 6934 LoopVectorizationCostModel::expectedCost( 6935 ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) { 6936 VectorizationCostTy Cost; 6937 6938 // For each block. 6939 for (BasicBlock *BB : TheLoop->blocks()) { 6940 VectorizationCostTy BlockCost; 6941 6942 // For each instruction in the old loop. 6943 for (Instruction &I : BB->instructionsWithoutDebug()) { 6944 // Skip ignored values. 6945 if (ValuesToIgnore.count(&I) || 6946 (VF.isVector() && VecValuesToIgnore.count(&I))) 6947 continue; 6948 6949 VectorizationCostTy C = getInstructionCost(&I, VF); 6950 6951 // Check if we should override the cost. 6952 if (C.first.isValid() && 6953 ForceTargetInstructionCost.getNumOccurrences() > 0) 6954 C.first = InstructionCost(ForceTargetInstructionCost); 6955 6956 // Keep a list of instructions with invalid costs. 6957 if (Invalid && !C.first.isValid()) 6958 Invalid->emplace_back(&I, VF); 6959 6960 BlockCost.first += C.first; 6961 BlockCost.second |= C.second; 6962 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first 6963 << " for VF " << VF << " For instruction: " << I 6964 << '\n'); 6965 } 6966 6967 // If we are vectorizing a predicated block, it will have been 6968 // if-converted. This means that the block's instructions (aside from 6969 // stores and instructions that may divide by zero) will now be 6970 // unconditionally executed. For the scalar case, we may not always execute 6971 // the predicated block, if it is an if-else block. Thus, scale the block's 6972 // cost by the probability of executing it. blockNeedsPredication from 6973 // Legal is used so as to not include all blocks in tail folded loops. 
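// For example (hypothetical numbers): a predicated block whose instructions
// sum to a scalar cost of 8 is credited as 8 / 2 = 4 by the division below,
// reflecting the assumption that such a block runs on roughly half of the
// iterations.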
6974 if (VF.isScalar() && Legal->blockNeedsPredication(BB)) 6975 BlockCost.first /= getReciprocalPredBlockProb(); 6976 6977 Cost.first += BlockCost.first; 6978 Cost.second |= BlockCost.second; 6979 } 6980 6981 return Cost; 6982 } 6983 6984 /// Gets Address Access SCEV after verifying that the access pattern 6985 /// is loop invariant except the induction variable dependence. 6986 /// 6987 /// This SCEV can be sent to the Target in order to estimate the address 6988 /// calculation cost. 6989 static const SCEV *getAddressAccessSCEV( 6990 Value *Ptr, 6991 LoopVectorizationLegality *Legal, 6992 PredicatedScalarEvolution &PSE, 6993 const Loop *TheLoop) { 6994 6995 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); 6996 if (!Gep) 6997 return nullptr; 6998 6999 // We are looking for a gep with all loop invariant indices except for one 7000 // which should be an induction variable. 7001 auto SE = PSE.getSE(); 7002 unsigned NumOperands = Gep->getNumOperands(); 7003 for (unsigned i = 1; i < NumOperands; ++i) { 7004 Value *Opd = Gep->getOperand(i); 7005 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 7006 !Legal->isInductionVariable(Opd)) 7007 return nullptr; 7008 } 7009 7010 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. 7011 return PSE.getSCEV(Ptr); 7012 } 7013 7014 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { 7015 return Legal->hasStride(I->getOperand(0)) || 7016 Legal->hasStride(I->getOperand(1)); 7017 } 7018 7019 InstructionCost 7020 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, 7021 ElementCount VF) { 7022 assert(VF.isVector() && 7023 "Scalarization cost of instruction implies vectorization."); 7024 if (VF.isScalable()) 7025 return InstructionCost::getInvalid(); 7026 7027 Type *ValTy = getLoadStoreType(I); 7028 auto SE = PSE.getSE(); 7029 7030 unsigned AS = getLoadStoreAddressSpace(I); 7031 Value *Ptr = getLoadStorePointerOperand(I); 7032 Type *PtrTy = ToVectorTy(Ptr->getType(), VF); 7033 7034 // Figure out whether the access is strided and get the stride value 7035 // if it's known in compile time 7036 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); 7037 7038 // Get the cost of the scalar memory instruction and address computation. 7039 InstructionCost Cost = 7040 VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); 7041 7042 // Don't pass *I here, since it is scalar but will actually be part of a 7043 // vectorized loop where the user of it is a vectorized instruction. 7044 const Align Alignment = getLoadStoreAlignment(I); 7045 Cost += VF.getKnownMinValue() * 7046 TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, 7047 AS, TTI::TCK_RecipThroughput); 7048 7049 // Get the overhead of the extractelement and insertelement instructions 7050 // we might create due to scalarization. 7051 Cost += getScalarizationOverhead(I, VF); 7052 7053 // If we have a predicated load/store, it will need extra i1 extracts and 7054 // conditional branches, but may not be executed for each vector lane. Scale 7055 // the cost by the probability of executing the predicated block. 
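// Illustrative shape of the estimate below (hypothetical unit costs): with
// VF = 4, an address cost of 1, a scalar load cost of 1 and an insert/extract
// overhead of 4, the unpredicated total is 4 + 4 + 4 = 12; the predicated
// path then halves that and adds the i1 extract and branch costs on top.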
7056 if (isPredicatedInst(I)) { 7057 Cost /= getReciprocalPredBlockProb(); 7058 7059 // Add the cost of an i1 extract and a branch 7060 auto *Vec_i1Ty = 7061 VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF); 7062 Cost += TTI.getScalarizationOverhead( 7063 Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()), 7064 /*Insert=*/false, /*Extract=*/true); 7065 Cost += TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput); 7066 7067 if (useEmulatedMaskMemRefHack(I)) 7068 // Artificially setting to a high enough value to practically disable 7069 // vectorization with such operations. 7070 Cost = 3000000; 7071 } 7072 7073 return Cost; 7074 } 7075 7076 InstructionCost 7077 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, 7078 ElementCount VF) { 7079 Type *ValTy = getLoadStoreType(I); 7080 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 7081 Value *Ptr = getLoadStorePointerOperand(I); 7082 unsigned AS = getLoadStoreAddressSpace(I); 7083 int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); 7084 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 7085 7086 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 7087 "Stride should be 1 or -1 for consecutive memory access"); 7088 const Align Alignment = getLoadStoreAlignment(I); 7089 InstructionCost Cost = 0; 7090 if (Legal->isMaskRequired(I)) 7091 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 7092 CostKind); 7093 else 7094 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 7095 CostKind, I); 7096 7097 bool Reverse = ConsecutiveStride < 0; 7098 if (Reverse) 7099 Cost += 7100 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0); 7101 return Cost; 7102 } 7103 7104 InstructionCost 7105 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, 7106 ElementCount VF) { 7107 assert(Legal->isUniformMemOp(*I)); 7108 7109 Type *ValTy = getLoadStoreType(I); 7110 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 7111 const Align Alignment = getLoadStoreAlignment(I); 7112 unsigned AS = getLoadStoreAddressSpace(I); 7113 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 7114 if (isa<LoadInst>(I)) { 7115 return TTI.getAddressComputationCost(ValTy) + 7116 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS, 7117 CostKind) + 7118 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); 7119 } 7120 StoreInst *SI = cast<StoreInst>(I); 7121 7122 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); 7123 return TTI.getAddressComputationCost(ValTy) + 7124 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, 7125 CostKind) + 7126 (isLoopInvariantStoreValue 7127 ? 
0 7128 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, 7129 VF.getKnownMinValue() - 1)); 7130 } 7131 7132 InstructionCost 7133 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, 7134 ElementCount VF) { 7135 Type *ValTy = getLoadStoreType(I); 7136 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 7137 const Align Alignment = getLoadStoreAlignment(I); 7138 const Value *Ptr = getLoadStorePointerOperand(I); 7139 7140 return TTI.getAddressComputationCost(VectorTy) + 7141 TTI.getGatherScatterOpCost( 7142 I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment, 7143 TargetTransformInfo::TCK_RecipThroughput, I); 7144 } 7145 7146 InstructionCost 7147 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, 7148 ElementCount VF) { 7149 // TODO: Once we have support for interleaving with scalable vectors 7150 // we can calculate the cost properly here. 7151 if (VF.isScalable()) 7152 return InstructionCost::getInvalid(); 7153 7154 Type *ValTy = getLoadStoreType(I); 7155 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 7156 unsigned AS = getLoadStoreAddressSpace(I); 7157 7158 auto Group = getInterleavedAccessGroup(I); 7159 assert(Group && "Fail to get an interleaved access group."); 7160 7161 unsigned InterleaveFactor = Group->getFactor(); 7162 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); 7163 7164 // Holds the indices of existing members in the interleaved group. 7165 SmallVector<unsigned, 4> Indices; 7166 for (unsigned IF = 0; IF < InterleaveFactor; IF++) 7167 if (Group->getMember(IF)) 7168 Indices.push_back(IF); 7169 7170 // Calculate the cost of the whole interleaved group. 7171 bool UseMaskForGaps = 7172 (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) || 7173 (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor())); 7174 InstructionCost Cost = TTI.getInterleavedMemoryOpCost( 7175 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(), 7176 AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps); 7177 7178 if (Group->isReverse()) { 7179 // TODO: Add support for reversed masked interleaved access. 7180 assert(!Legal->isMaskRequired(I) && 7181 "Reverse masked interleaved access not supported."); 7182 Cost += 7183 Group->getNumMembers() * 7184 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0); 7185 } 7186 return Cost; 7187 } 7188 7189 Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost( 7190 Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) { 7191 using namespace llvm::PatternMatch; 7192 // Early exit for no inloop reductions 7193 if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty)) 7194 return None; 7195 auto *VectorTy = cast<VectorType>(Ty); 7196 7197 // We are looking for a pattern of, and finding the minimal acceptable cost: 7198 // reduce(mul(ext(A), ext(B))) or 7199 // reduce(mul(A, B)) or 7200 // reduce(ext(A)) or 7201 // reduce(A). 7202 // The basic idea is that we walk down the tree to do that, finding the root 7203 // reduction instruction in InLoopReductionImmediateChains. From there we find 7204 // the pattern of mul/ext and test the cost of the entire pattern vs the cost 7205 // of the components. If the reduction cost is lower then we return it for the 7206 // reduction instruction and 0 for the other instructions in the pattern. If 7207 // it is not we return an invalid cost specifying the orignal cost method 7208 // should be used. 
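// For illustration only, a source loop that typically produces the
// reduce(mul(ext(A), ext(B))) shape is a dot product over narrow elements:
//   short *a, *b; int sum = 0;
//   for (int i = 0; i < n; ++i)
//     sum += (int)a[i] * (int)b[i];
// On targets with an extending multiply-accumulate reduction, the combined
// cost returned by getExtendedAddReductionCost can beat the sum of the
// separate ext, mul and add costs, which is what the matching below tests.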
7209 Instruction *RetI = I; 7210 if (match(RetI, m_ZExtOrSExt(m_Value()))) { 7211 if (!RetI->hasOneUser()) 7212 return None; 7213 RetI = RetI->user_back(); 7214 } 7215 if (match(RetI, m_Mul(m_Value(), m_Value())) && 7216 RetI->user_back()->getOpcode() == Instruction::Add) { 7217 if (!RetI->hasOneUser()) 7218 return None; 7219 RetI = RetI->user_back(); 7220 } 7221 7222 // Test if the found instruction is a reduction, and if not return an invalid 7223 // cost specifying the parent to use the original cost modelling. 7224 if (!InLoopReductionImmediateChains.count(RetI)) 7225 return None; 7226 7227 // Find the reduction this chain is a part of and calculate the basic cost of 7228 // the reduction on its own. 7229 Instruction *LastChain = InLoopReductionImmediateChains[RetI]; 7230 Instruction *ReductionPhi = LastChain; 7231 while (!isa<PHINode>(ReductionPhi)) 7232 ReductionPhi = InLoopReductionImmediateChains[ReductionPhi]; 7233 7234 const RecurrenceDescriptor &RdxDesc = 7235 Legal->getReductionVars()[cast<PHINode>(ReductionPhi)]; 7236 7237 InstructionCost BaseCost = TTI.getArithmeticReductionCost( 7238 RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind); 7239 7240 // If we're using ordered reductions then we can just return the base cost 7241 // here, since getArithmeticReductionCost calculates the full ordered 7242 // reduction cost when FP reassociation is not allowed. 7243 if (useOrderedReductions(RdxDesc)) 7244 return BaseCost; 7245 7246 // Get the operand that was not the reduction chain and match it to one of the 7247 // patterns, returning the better cost if it is found. 7248 Instruction *RedOp = RetI->getOperand(1) == LastChain 7249 ? dyn_cast<Instruction>(RetI->getOperand(0)) 7250 : dyn_cast<Instruction>(RetI->getOperand(1)); 7251 7252 VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy); 7253 7254 Instruction *Op0, *Op1; 7255 if (RedOp && 7256 match(RedOp, 7257 m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) && 7258 match(Op0, m_ZExtOrSExt(m_Value())) && 7259 Op0->getOpcode() == Op1->getOpcode() && 7260 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() && 7261 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) && 7262 (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) { 7263 7264 // Matched reduce(ext(mul(ext(A), ext(B))) 7265 // Note that the extend opcodes need to all match, or if A==B they will have 7266 // been converted to zext(mul(sext(A), sext(A))) as it is known positive, 7267 // which is equally fine. 7268 bool IsUnsigned = isa<ZExtInst>(Op0); 7269 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy); 7270 auto *MulType = VectorType::get(Op0->getType(), VectorTy); 7271 7272 InstructionCost ExtCost = 7273 TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType, 7274 TTI::CastContextHint::None, CostKind, Op0); 7275 InstructionCost MulCost = 7276 TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind); 7277 InstructionCost Ext2Cost = 7278 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType, 7279 TTI::CastContextHint::None, CostKind, RedOp); 7280 7281 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 7282 /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 7283 CostKind); 7284 7285 if (RedCost.isValid() && 7286 RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost) 7287 return I == RetI ? 
RedCost : 0; 7288 } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) && 7289 !TheLoop->isLoopInvariant(RedOp)) { 7290 // Matched reduce(ext(A)) 7291 bool IsUnsigned = isa<ZExtInst>(RedOp); 7292 auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy); 7293 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 7294 /*IsMLA=*/false, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 7295 CostKind); 7296 7297 InstructionCost ExtCost = 7298 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType, 7299 TTI::CastContextHint::None, CostKind, RedOp); 7300 if (RedCost.isValid() && RedCost < BaseCost + ExtCost) 7301 return I == RetI ? RedCost : 0; 7302 } else if (RedOp && 7303 match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) { 7304 if (match(Op0, m_ZExtOrSExt(m_Value())) && 7305 Op0->getOpcode() == Op1->getOpcode() && 7306 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() && 7307 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) { 7308 bool IsUnsigned = isa<ZExtInst>(Op0); 7309 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy); 7310 // Matched reduce(mul(ext, ext)) 7311 InstructionCost ExtCost = 7312 TTI.getCastInstrCost(Op0->getOpcode(), VectorTy, ExtType, 7313 TTI::CastContextHint::None, CostKind, Op0); 7314 InstructionCost MulCost = 7315 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 7316 7317 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 7318 /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 7319 CostKind); 7320 7321 if (RedCost.isValid() && RedCost < ExtCost * 2 + MulCost + BaseCost) 7322 return I == RetI ? RedCost : 0; 7323 } else if (!match(I, m_ZExtOrSExt(m_Value()))) { 7324 // Matched reduce(mul()) 7325 InstructionCost MulCost = 7326 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 7327 7328 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 7329 /*IsMLA=*/true, true, RdxDesc.getRecurrenceType(), VectorTy, 7330 CostKind); 7331 7332 if (RedCost.isValid() && RedCost < MulCost + BaseCost) 7333 return I == RetI ? RedCost : 0; 7334 } 7335 } 7336 7337 return I == RetI ? Optional<InstructionCost>(BaseCost) : None; 7338 } 7339 7340 InstructionCost 7341 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, 7342 ElementCount VF) { 7343 // Calculate scalar cost only. Vectorization cost should be ready at this 7344 // moment. 7345 if (VF.isScalar()) { 7346 Type *ValTy = getLoadStoreType(I); 7347 const Align Alignment = getLoadStoreAlignment(I); 7348 unsigned AS = getLoadStoreAddressSpace(I); 7349 7350 return TTI.getAddressComputationCost(ValTy) + 7351 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, 7352 TTI::TCK_RecipThroughput, I); 7353 } 7354 return getWideningCost(I, VF); 7355 } 7356 7357 LoopVectorizationCostModel::VectorizationCostTy 7358 LoopVectorizationCostModel::getInstructionCost(Instruction *I, 7359 ElementCount VF) { 7360 // If we know that this instruction will remain uniform, check the cost of 7361 // the scalar version. 7362 if (isUniformAfterVectorization(I, VF)) 7363 VF = ElementCount::getFixed(1); 7364 7365 if (VF.isVector() && isProfitableToScalarize(I, VF)) 7366 return VectorizationCostTy(InstsToScalarize[VF][I], false); 7367 7368 // Forced scalars do not have any scalarization overhead. 
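// Their cost is modelled below as the scalar cost replicated once per lane;
// e.g. (a hypothetical figure) a scalar cost of 1 at VF = 4 is reported as 4,
// with no insert/extract overhead added on top.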
7369 auto ForcedScalar = ForcedScalars.find(VF); 7370 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) { 7371 auto InstSet = ForcedScalar->second; 7372 if (InstSet.count(I)) 7373 return VectorizationCostTy( 7374 (getInstructionCost(I, ElementCount::getFixed(1)).first * 7375 VF.getKnownMinValue()), 7376 false); 7377 } 7378 7379 Type *VectorTy; 7380 InstructionCost C = getInstructionCost(I, VF, VectorTy); 7381 7382 bool TypeNotScalarized = 7383 VF.isVector() && VectorTy->isVectorTy() && 7384 TTI.getNumberOfParts(VectorTy) < VF.getKnownMinValue(); 7385 return VectorizationCostTy(C, TypeNotScalarized); 7386 } 7387 7388 InstructionCost 7389 LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, 7390 ElementCount VF) const { 7391 7392 // There is no mechanism yet to create a scalable scalarization loop, 7393 // so this is currently Invalid. 7394 if (VF.isScalable()) 7395 return InstructionCost::getInvalid(); 7396 7397 if (VF.isScalar()) 7398 return 0; 7399 7400 InstructionCost Cost = 0; 7401 Type *RetTy = ToVectorTy(I->getType(), VF); 7402 if (!RetTy->isVoidTy() && 7403 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) 7404 Cost += TTI.getScalarizationOverhead( 7405 cast<VectorType>(RetTy), APInt::getAllOnesValue(VF.getKnownMinValue()), 7406 true, false); 7407 7408 // Some targets keep addresses scalar. 7409 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) 7410 return Cost; 7411 7412 // Some targets support efficient element stores. 7413 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore()) 7414 return Cost; 7415 7416 // Collect operands to consider. 7417 CallInst *CI = dyn_cast<CallInst>(I); 7418 Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands(); 7419 7420 // Skip operands that do not require extraction/scalarization and do not incur 7421 // any overhead. 7422 SmallVector<Type *> Tys; 7423 for (auto *V : filterExtractingOperands(Ops, VF)) 7424 Tys.push_back(MaybeVectorizeType(V->getType(), VF)); 7425 return Cost + TTI.getOperandsScalarizationOverhead( 7426 filterExtractingOperands(Ops, VF), Tys); 7427 } 7428 7429 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { 7430 if (VF.isScalar()) 7431 return; 7432 NumPredStores = 0; 7433 for (BasicBlock *BB : TheLoop->blocks()) { 7434 // For each instruction in the old loop. 7435 for (Instruction &I : *BB) { 7436 Value *Ptr = getLoadStorePointerOperand(&I); 7437 if (!Ptr) 7438 continue; 7439 7440 // TODO: We should generate better code and update the cost model for 7441 // predicated uniform stores. Today they are treated as any other 7442 // predicated store (see added test cases in 7443 // invariant-store-vectorization.ll). 7444 if (isa<StoreInst>(&I) && isScalarWithPredication(&I)) 7445 NumPredStores++; 7446 7447 if (Legal->isUniformMemOp(I)) { 7448 // TODO: Avoid replicating loads and stores instead of 7449 // relying on instcombine to remove them. 7450 // Load: Scalar load + broadcast 7451 // Store: Scalar store + isLoopInvariantStoreValue ? 
0 : extract 7452 InstructionCost Cost; 7453 if (isa<StoreInst>(&I) && VF.isScalable() && 7454 isLegalGatherOrScatter(&I)) { 7455 Cost = getGatherScatterCost(&I, VF); 7456 setWideningDecision(&I, VF, CM_GatherScatter, Cost); 7457 } else { 7458 assert((isa<LoadInst>(&I) || !VF.isScalable()) && 7459 "Cannot yet scalarize uniform stores"); 7460 Cost = getUniformMemOpCost(&I, VF); 7461 setWideningDecision(&I, VF, CM_Scalarize, Cost); 7462 } 7463 continue; 7464 } 7465 7466 // We assume that widening is the best solution when possible. 7467 if (memoryInstructionCanBeWidened(&I, VF)) { 7468 InstructionCost Cost = getConsecutiveMemOpCost(&I, VF); 7469 int ConsecutiveStride = 7470 Legal->isConsecutivePtr(getLoadStorePointerOperand(&I)); 7471 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 7472 "Expected consecutive stride."); 7473 InstWidening Decision = 7474 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; 7475 setWideningDecision(&I, VF, Decision, Cost); 7476 continue; 7477 } 7478 7479 // Choose between Interleaving, Gather/Scatter or Scalarization. 7480 InstructionCost InterleaveCost = InstructionCost::getInvalid(); 7481 unsigned NumAccesses = 1; 7482 if (isAccessInterleaved(&I)) { 7483 auto Group = getInterleavedAccessGroup(&I); 7484 assert(Group && "Fail to get an interleaved access group."); 7485 7486 // Make one decision for the whole group. 7487 if (getWideningDecision(&I, VF) != CM_Unknown) 7488 continue; 7489 7490 NumAccesses = Group->getNumMembers(); 7491 if (interleavedAccessCanBeWidened(&I, VF)) 7492 InterleaveCost = getInterleaveGroupCost(&I, VF); 7493 } 7494 7495 InstructionCost GatherScatterCost = 7496 isLegalGatherOrScatter(&I) 7497 ? getGatherScatterCost(&I, VF) * NumAccesses 7498 : InstructionCost::getInvalid(); 7499 7500 InstructionCost ScalarizationCost = 7501 getMemInstScalarizationCost(&I, VF) * NumAccesses; 7502 7503 // Choose better solution for the current VF, 7504 // write down this decision and use it during vectorization. 7505 InstructionCost Cost; 7506 InstWidening Decision; 7507 if (InterleaveCost <= GatherScatterCost && 7508 InterleaveCost < ScalarizationCost) { 7509 Decision = CM_Interleave; 7510 Cost = InterleaveCost; 7511 } else if (GatherScatterCost < ScalarizationCost) { 7512 Decision = CM_GatherScatter; 7513 Cost = GatherScatterCost; 7514 } else { 7515 Decision = CM_Scalarize; 7516 Cost = ScalarizationCost; 7517 } 7518 // If the instructions belongs to an interleave group, the whole group 7519 // receives the same decision. The whole group receives the cost, but 7520 // the cost will actually be assigned to one instruction. 7521 if (auto Group = getInterleavedAccessGroup(&I)) 7522 setWideningDecision(Group, VF, Decision, Cost); 7523 else 7524 setWideningDecision(&I, VF, Decision, Cost); 7525 } 7526 } 7527 7528 // Make sure that any load of address and any other address computation 7529 // remains scalar unless there is gather/scatter support. This avoids 7530 // inevitable extracts into address registers, and also has the benefit of 7531 // activating LSR more, since that pass can't optimize vectorized 7532 // addresses. 7533 if (TTI.prefersVectorizedAddressing()) 7534 return; 7535 7536 // Start with all scalar pointer uses. 
7537 SmallPtrSet<Instruction *, 8> AddrDefs; 7538 for (BasicBlock *BB : TheLoop->blocks()) 7539 for (Instruction &I : *BB) { 7540 Instruction *PtrDef = 7541 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 7542 if (PtrDef && TheLoop->contains(PtrDef) && 7543 getWideningDecision(&I, VF) != CM_GatherScatter) 7544 AddrDefs.insert(PtrDef); 7545 } 7546 7547 // Add all instructions used to generate the addresses. 7548 SmallVector<Instruction *, 4> Worklist; 7549 append_range(Worklist, AddrDefs); 7550 while (!Worklist.empty()) { 7551 Instruction *I = Worklist.pop_back_val(); 7552 for (auto &Op : I->operands()) 7553 if (auto *InstOp = dyn_cast<Instruction>(Op)) 7554 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) && 7555 AddrDefs.insert(InstOp).second) 7556 Worklist.push_back(InstOp); 7557 } 7558 7559 for (auto *I : AddrDefs) { 7560 if (isa<LoadInst>(I)) { 7561 // Setting the desired widening decision should ideally be handled 7562 // by the cost functions, but since this involves the task of finding out 7563 // if the loaded register is involved in an address computation, it is 7564 // instead changed here when we know this is the case. 7565 InstWidening Decision = getWideningDecision(I, VF); 7566 if (Decision == CM_Widen || Decision == CM_Widen_Reverse) 7567 // Scalarize a widened load of address. 7568 setWideningDecision( 7569 I, VF, CM_Scalarize, 7570 (VF.getKnownMinValue() * 7571 getMemoryInstructionCost(I, ElementCount::getFixed(1)))); 7572 else if (auto Group = getInterleavedAccessGroup(I)) { 7573 // Scalarize an interleave group of address loads. 7574 for (unsigned I = 0; I < Group->getFactor(); ++I) { 7575 if (Instruction *Member = Group->getMember(I)) 7576 setWideningDecision( 7577 Member, VF, CM_Scalarize, 7578 (VF.getKnownMinValue() * 7579 getMemoryInstructionCost(Member, ElementCount::getFixed(1)))); 7580 } 7581 } 7582 } else 7583 // Make sure I gets scalarized and is given a cost estimate without 7584 // scalarization overhead. 7585 ForcedScalars[VF].insert(I); 7586 } 7587 } 7588 7589 InstructionCost 7590 LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, 7591 Type *&VectorTy) { 7592 Type *RetTy = I->getType(); 7593 if (canTruncateToMinimalBitwidth(I, VF)) 7594 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); 7595 auto SE = PSE.getSE(); 7596 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 7597 7598 auto hasSingleCopyAfterVectorization = [this](Instruction *I, 7599 ElementCount VF) -> bool { 7600 if (VF.isScalar()) 7601 return true; 7602 7603 auto Scalarized = InstsToScalarize.find(VF); 7604 assert(Scalarized != InstsToScalarize.end() && 7605 "VF not yet analyzed for scalarization profitability"); 7606 return !Scalarized->second.count(I) && 7607 llvm::all_of(I->users(), [&](User *U) { 7608 auto *UI = cast<Instruction>(U); 7609 return !Scalarized->second.count(UI); 7610 }); 7611 }; 7612 (void) hasSingleCopyAfterVectorization; 7613 7614 if (isScalarAfterVectorization(I, VF)) { 7615 // With the exception of GEPs and PHIs, after scalarization there should 7616 // only be one copy of the instruction generated in the loop. This is 7617 // because the VF is either 1, or any instructions that need scalarizing 7618 // have already been dealt with by the time we get here. As a result, 7619 // we don't have to multiply the instruction cost by VF.
7620 assert(I->getOpcode() == Instruction::GetElementPtr || 7621 I->getOpcode() == Instruction::PHI || 7622 (I->getOpcode() == Instruction::BitCast && 7623 I->getType()->isPointerTy()) || 7624 hasSingleCopyAfterVectorization(I, VF)); 7625 VectorTy = RetTy; 7626 } else 7627 VectorTy = ToVectorTy(RetTy, VF); 7628 7629 // TODO: We need to estimate the cost of intrinsic calls. 7630 switch (I->getOpcode()) { 7631 case Instruction::GetElementPtr: 7632 // We mark this instruction as zero-cost because the cost of GEPs in 7633 // vectorized code depends on whether the corresponding memory instruction 7634 // is scalarized or not. Therefore, we handle GEPs with the memory 7635 // instruction cost. 7636 return 0; 7637 case Instruction::Br: { 7638 // In cases of scalarized and predicated instructions, there will be VF 7639 // predicated blocks in the vectorized loop. Each branch around these 7640 // blocks requires also an extract of its vector compare i1 element. 7641 bool ScalarPredicatedBB = false; 7642 BranchInst *BI = cast<BranchInst>(I); 7643 if (VF.isVector() && BI->isConditional() && 7644 (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) || 7645 PredicatedBBsAfterVectorization.count(BI->getSuccessor(1)))) 7646 ScalarPredicatedBB = true; 7647 7648 if (ScalarPredicatedBB) { 7649 // Not possible to scalarize scalable vector with predicated instructions. 7650 if (VF.isScalable()) 7651 return InstructionCost::getInvalid(); 7652 // Return cost for branches around scalarized and predicated blocks. 7653 auto *Vec_i1Ty = 7654 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); 7655 return ( 7656 TTI.getScalarizationOverhead( 7657 Vec_i1Ty, APInt::getAllOnesValue(VF.getFixedValue()), false, 7658 true) + 7659 (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue())); 7660 } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar()) 7661 // The back-edge branch will remain, as will all scalar branches. 7662 return TTI.getCFInstrCost(Instruction::Br, CostKind); 7663 else 7664 // This branch will be eliminated by if-conversion. 7665 return 0; 7666 // Note: We currently assume zero cost for an unconditional branch inside 7667 // a predicated block since it will become a fall-through, although we 7668 // may decide in the future to call TTI for all branches. 7669 } 7670 case Instruction::PHI: { 7671 auto *Phi = cast<PHINode>(I); 7672 7673 // First-order recurrences are replaced by vector shuffles inside the loop. 7674 // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. 7675 if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi)) 7676 return TTI.getShuffleCost( 7677 TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy), 7678 None, VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1)); 7679 7680 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 7681 // converted into select instructions. We require N - 1 selects per phi 7682 // node, where N is the number of incoming values. 
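// For example (illustrative IR), a three-way merge such as
//   %r = phi i32 [ %a, %bb1 ], [ %b, %bb2 ], [ %c, %bb3 ]
// has three incoming values and is therefore costed as two vector selects
// once the block has been if-converted.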
7683 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) 7684 return (Phi->getNumIncomingValues() - 1) * 7685 TTI.getCmpSelInstrCost( 7686 Instruction::Select, ToVectorTy(Phi->getType(), VF), 7687 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF), 7688 CmpInst::BAD_ICMP_PREDICATE, CostKind); 7689 7690 return TTI.getCFInstrCost(Instruction::PHI, CostKind); 7691 } 7692 case Instruction::UDiv: 7693 case Instruction::SDiv: 7694 case Instruction::URem: 7695 case Instruction::SRem: 7696 // If we have a predicated instruction, it may not be executed for each 7697 // vector lane. Get the scalarization cost and scale this amount by the 7698 // probability of executing the predicated block. If the instruction is not 7699 // predicated, we fall through to the next case. 7700 if (VF.isVector() && isScalarWithPredication(I)) { 7701 InstructionCost Cost = 0; 7702 7703 // These instructions have a non-void type, so account for the phi nodes 7704 // that we will create. This cost is likely to be zero. The phi node 7705 // cost, if any, should be scaled by the block probability because it 7706 // models a copy at the end of each predicated block. 7707 Cost += VF.getKnownMinValue() * 7708 TTI.getCFInstrCost(Instruction::PHI, CostKind); 7709 7710 // The cost of the non-predicated instruction. 7711 Cost += VF.getKnownMinValue() * 7712 TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind); 7713 7714 // The cost of insertelement and extractelement instructions needed for 7715 // scalarization. 7716 Cost += getScalarizationOverhead(I, VF); 7717 7718 // Scale the cost by the probability of executing the predicated blocks. 7719 // This assumes the predicated block for each vector lane is equally 7720 // likely. 7721 return Cost / getReciprocalPredBlockProb(); 7722 } 7723 LLVM_FALLTHROUGH; 7724 case Instruction::Add: 7725 case Instruction::FAdd: 7726 case Instruction::Sub: 7727 case Instruction::FSub: 7728 case Instruction::Mul: 7729 case Instruction::FMul: 7730 case Instruction::FDiv: 7731 case Instruction::FRem: 7732 case Instruction::Shl: 7733 case Instruction::LShr: 7734 case Instruction::AShr: 7735 case Instruction::And: 7736 case Instruction::Or: 7737 case Instruction::Xor: { 7738 // Since we will replace the stride by 1 the multiplication should go away. 7739 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) 7740 return 0; 7741 7742 // Detect reduction patterns 7743 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7744 return *RedCost; 7745 7746 // Certain instructions can be cheaper to vectorize if they have a constant 7747 // second vector operand. One example of this are shifts on x86. 
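// Illustrative case: in a loop body such as "out[i] = in[i] << amt", where
// amt does not change inside the loop, the operand-kind query below can
// classify the shift amount as uniform (or as a constant, for a literal like
// 3), and targets such as x86 may then report a cheaper vector shift than for
// a fully variable per-lane amount.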
7748 Value *Op2 = I->getOperand(1); 7749 TargetTransformInfo::OperandValueProperties Op2VP; 7750 TargetTransformInfo::OperandValueKind Op2VK = 7751 TTI.getOperandInfo(Op2, Op2VP); 7752 if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) 7753 Op2VK = TargetTransformInfo::OK_UniformValue; 7754 7755 SmallVector<const Value *, 4> Operands(I->operand_values()); 7756 return TTI.getArithmeticInstrCost( 7757 I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, 7758 Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); 7759 } 7760 case Instruction::FNeg: { 7761 return TTI.getArithmeticInstrCost( 7762 I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, 7763 TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None, 7764 TargetTransformInfo::OP_None, I->getOperand(0), I); 7765 } 7766 case Instruction::Select: { 7767 SelectInst *SI = cast<SelectInst>(I); 7768 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 7769 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 7770 7771 const Value *Op0, *Op1; 7772 using namespace llvm::PatternMatch; 7773 if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) || 7774 match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) { 7775 // select x, y, false --> x & y 7776 // select x, true, y --> x | y 7777 TTI::OperandValueProperties Op1VP = TTI::OP_None; 7778 TTI::OperandValueProperties Op2VP = TTI::OP_None; 7779 TTI::OperandValueKind Op1VK = TTI::getOperandInfo(Op0, Op1VP); 7780 TTI::OperandValueKind Op2VK = TTI::getOperandInfo(Op1, Op2VP); 7781 assert(Op0->getType()->getScalarSizeInBits() == 1 && 7782 Op1->getType()->getScalarSizeInBits() == 1); 7783 7784 SmallVector<const Value *, 2> Operands{Op0, Op1}; 7785 return TTI.getArithmeticInstrCost( 7786 match(I, m_LogicalOr()) ? 
Instruction::Or : Instruction::And, VectorTy, 7787 CostKind, Op1VK, Op2VK, Op1VP, Op2VP, Operands, I); 7788 } 7789 7790 Type *CondTy = SI->getCondition()->getType(); 7791 if (!ScalarCond) 7792 CondTy = VectorType::get(CondTy, VF); 7793 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, 7794 CmpInst::BAD_ICMP_PREDICATE, CostKind, I); 7795 } 7796 case Instruction::ICmp: 7797 case Instruction::FCmp: { 7798 Type *ValTy = I->getOperand(0)->getType(); 7799 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 7800 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 7801 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); 7802 VectorTy = ToVectorTy(ValTy, VF); 7803 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, 7804 CmpInst::BAD_ICMP_PREDICATE, CostKind, I); 7805 } 7806 case Instruction::Store: 7807 case Instruction::Load: { 7808 ElementCount Width = VF; 7809 if (Width.isVector()) { 7810 InstWidening Decision = getWideningDecision(I, Width); 7811 assert(Decision != CM_Unknown && 7812 "CM decision should be taken at this point"); 7813 if (Decision == CM_Scalarize) 7814 Width = ElementCount::getFixed(1); 7815 } 7816 VectorTy = ToVectorTy(getLoadStoreType(I), Width); 7817 return getMemoryInstructionCost(I, VF); 7818 } 7819 case Instruction::BitCast: 7820 if (I->getType()->isPointerTy()) 7821 return 0; 7822 LLVM_FALLTHROUGH; 7823 case Instruction::ZExt: 7824 case Instruction::SExt: 7825 case Instruction::FPToUI: 7826 case Instruction::FPToSI: 7827 case Instruction::FPExt: 7828 case Instruction::PtrToInt: 7829 case Instruction::IntToPtr: 7830 case Instruction::SIToFP: 7831 case Instruction::UIToFP: 7832 case Instruction::Trunc: 7833 case Instruction::FPTrunc: { 7834 // Computes the CastContextHint from a Load/Store instruction. 7835 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint { 7836 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 7837 "Expected a load or a store!"); 7838 7839 if (VF.isScalar() || !TheLoop->contains(I)) 7840 return TTI::CastContextHint::Normal; 7841 7842 switch (getWideningDecision(I, VF)) { 7843 case LoopVectorizationCostModel::CM_GatherScatter: 7844 return TTI::CastContextHint::GatherScatter; 7845 case LoopVectorizationCostModel::CM_Interleave: 7846 return TTI::CastContextHint::Interleave; 7847 case LoopVectorizationCostModel::CM_Scalarize: 7848 case LoopVectorizationCostModel::CM_Widen: 7849 return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked 7850 : TTI::CastContextHint::Normal; 7851 case LoopVectorizationCostModel::CM_Widen_Reverse: 7852 return TTI::CastContextHint::Reversed; 7853 case LoopVectorizationCostModel::CM_Unknown: 7854 llvm_unreachable("Instr did not go through cost modelling?"); 7855 } 7856 7857 llvm_unreachable("Unhandled case!"); 7858 }; 7859 7860 unsigned Opcode = I->getOpcode(); 7861 TTI::CastContextHint CCH = TTI::CastContextHint::None; 7862 // For Trunc, the context is the only user, which must be a StoreInst. 7863 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) { 7864 if (I->hasOneUse()) 7865 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin())) 7866 CCH = ComputeCCH(Store); 7867 } 7868 // For Z/Sext, the context is the operand, which must be a LoadInst. 
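// Illustrative example: for "%w = zext i8 %v to i32" where %v is produced by
// a load whose widening decision was CM_Widen_Reverse, the hint computed
// below is CastContextHint::Reversed, so the target can price the extend as
// part of a reversed widening load.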
7869 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || 7870 Opcode == Instruction::FPExt) { 7871 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0))) 7872 CCH = ComputeCCH(Load); 7873 } 7874 7875 // We optimize the truncation of induction variables having constant 7876 // integer steps. The cost of these truncations is the same as the scalar 7877 // operation. 7878 if (isOptimizableIVTruncate(I, VF)) { 7879 auto *Trunc = cast<TruncInst>(I); 7880 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 7881 Trunc->getSrcTy(), CCH, CostKind, Trunc); 7882 } 7883 7884 // Detect reduction patterns 7885 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7886 return *RedCost; 7887 7888 Type *SrcScalarTy = I->getOperand(0)->getType(); 7889 Type *SrcVecTy = 7890 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; 7891 if (canTruncateToMinimalBitwidth(I, VF)) { 7892 // This cast is going to be shrunk. This may remove the cast or it might 7893 // turn it into slightly different cast. For example, if MinBW == 16, 7894 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". 7895 // 7896 // Calculate the modified src and dest types. 7897 Type *MinVecTy = VectorTy; 7898 if (Opcode == Instruction::Trunc) { 7899 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); 7900 VectorTy = 7901 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7902 } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) { 7903 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); 7904 VectorTy = 7905 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7906 } 7907 } 7908 7909 return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); 7910 } 7911 case Instruction::Call: { 7912 bool NeedToScalarize; 7913 CallInst *CI = cast<CallInst>(I); 7914 InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize); 7915 if (getVectorIntrinsicIDForCall(CI, TLI)) { 7916 InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF); 7917 return std::min(CallCost, IntrinsicCost); 7918 } 7919 return CallCost; 7920 } 7921 case Instruction::ExtractValue: 7922 return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput); 7923 case Instruction::Alloca: 7924 // We cannot easily widen alloca to a scalable alloca, as 7925 // the result would need to be a vector of pointers. 7926 if (VF.isScalable()) 7927 return InstructionCost::getInvalid(); 7928 LLVM_FALLTHROUGH; 7929 default: 7930 // This opcode is unknown. Assume that it is the same as 'mul'. 7931 return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 7932 } // end of switch. 
7933 } 7934 7935 char LoopVectorize::ID = 0; 7936 7937 static const char lv_name[] = "Loop Vectorization"; 7938 7939 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 7940 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 7941 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) 7942 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 7943 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) 7944 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 7945 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 7946 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 7947 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 7948 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 7949 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) 7950 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) 7951 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 7952 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 7953 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy) 7954 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 7955 7956 namespace llvm { 7957 7958 Pass *createLoopVectorizePass() { return new LoopVectorize(); } 7959 7960 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced, 7961 bool VectorizeOnlyWhenForced) { 7962 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced); 7963 } 7964 7965 } // end namespace llvm 7966 7967 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { 7968 // Check if the pointer operand of a load or store instruction is 7969 // consecutive. 7970 if (auto *Ptr = getLoadStorePointerOperand(Inst)) 7971 return Legal->isConsecutivePtr(Ptr); 7972 return false; 7973 } 7974 7975 void LoopVectorizationCostModel::collectValuesToIgnore() { 7976 // Ignore ephemeral values. 7977 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); 7978 7979 // Ignore type-promoting instructions we identified during reduction 7980 // detection. 7981 for (auto &Reduction : Legal->getReductionVars()) { 7982 RecurrenceDescriptor &RedDes = Reduction.second; 7983 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); 7984 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7985 } 7986 // Ignore type-casting instructions we identified during induction 7987 // detection. 7988 for (auto &Induction : Legal->getInductionVars()) { 7989 InductionDescriptor &IndDes = Induction.second; 7990 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 7991 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7992 } 7993 } 7994 7995 void LoopVectorizationCostModel::collectInLoopReductions() { 7996 for (auto &Reduction : Legal->getReductionVars()) { 7997 PHINode *Phi = Reduction.first; 7998 RecurrenceDescriptor &RdxDesc = Reduction.second; 7999 8000 // We don't collect reductions that are type promoted (yet). 8001 if (RdxDesc.getRecurrenceType() != Phi->getType()) 8002 continue; 8003 8004 // If the target would prefer this reduction to happen "in-loop", then we 8005 // want to record it as such. 8006 unsigned Opcode = RdxDesc.getOpcode(); 8007 if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) && 8008 !TTI.preferInLoopReduction(Opcode, Phi->getType(), 8009 TargetTransformInfo::ReductionFlags())) 8010 continue; 8011 8012 // Check that we can correctly put the reductions into the loop, by 8013 // finding the chain of operations that leads from the phi to the loop 8014 // exit value. 
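// For a plain integer sum the chain is typically just the single add feeding
// the loop-carried phi, e.g. (illustrative IR):
//   %sum = phi i32 [ 0, %preheader ], [ %sum.next, %latch ]
//   %sum.next = add i32 %sum, %x
// getReductionOpChain returns that add; an empty result means the chain could
// not be followed and the reduction stays out-of-loop.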
8015 SmallVector<Instruction *, 4> ReductionOperations = 8016 RdxDesc.getReductionOpChain(Phi, TheLoop); 8017 bool InLoop = !ReductionOperations.empty(); 8018 if (InLoop) { 8019 InLoopReductionChains[Phi] = ReductionOperations; 8020 // Add the elements to InLoopReductionImmediateChains for cost modelling. 8021 Instruction *LastChain = Phi; 8022 for (auto *I : ReductionOperations) { 8023 InLoopReductionImmediateChains[I] = LastChain; 8024 LastChain = I; 8025 } 8026 } 8027 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop") 8028 << " reduction for phi: " << *Phi << "\n"); 8029 } 8030 } 8031 8032 // TODO: we could return a pair of values that specify the max VF and 8033 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of 8034 // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment 8035 // doesn't have a cost model that can choose which plan to execute if 8036 // more than one is generated. 8037 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits, 8038 LoopVectorizationCostModel &CM) { 8039 unsigned WidestType; 8040 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes(); 8041 return WidestVectorRegBits / WidestType; 8042 } 8043 8044 VectorizationFactor 8045 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { 8046 assert(!UserVF.isScalable() && "scalable vectors not yet supported"); 8047 ElementCount VF = UserVF; 8048 // Outer loop handling: They may require CFG and instruction level 8049 // transformations before even evaluating whether vectorization is profitable. 8050 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 8051 // the vectorization pipeline. 8052 if (!OrigLoop->isInnermost()) { 8053 // If the user doesn't provide a vectorization factor, determine a 8054 // reasonable one. 8055 if (UserVF.isZero()) { 8056 VF = ElementCount::getFixed(determineVPlanVF( 8057 TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector) 8058 .getFixedSize(), 8059 CM)); 8060 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n"); 8061 8062 // Make sure we have a VF > 1 for stress testing. 8063 if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) { 8064 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: " 8065 << "overriding computed VF.\n"); 8066 VF = ElementCount::getFixed(4); 8067 } 8068 } 8069 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 8070 assert(isPowerOf2_32(VF.getKnownMinValue()) && 8071 "VF needs to be a power of two"); 8072 LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "") 8073 << "VF " << VF << " to build VPlans.\n"); 8074 buildVPlans(VF, VF); 8075 8076 // For VPlan build stress testing, we bail out after VPlan construction. 8077 if (VPlanBuildStressTest) 8078 return VectorizationFactor::Disabled(); 8079 8080 return {VF, 0 /*Cost*/}; 8081 } 8082 8083 LLVM_DEBUG( 8084 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the " 8085 "VPlan-native path.\n"); 8086 return VectorizationFactor::Disabled(); 8087 } 8088 8089 Optional<VectorizationFactor> 8090 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { 8091 assert(OrigLoop->isInnermost() && "Inner loop expected."); 8092 FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC); 8093 if (!MaxFactors) // Cases that should not to be vectorized nor interleaved. 8094 return None; 8095 8096 // Invalidate interleave groups if all blocks of loop will be predicated. 
8097 if (CM.blockNeedsPredication(OrigLoop->getHeader()) && 8098 !useMaskedInterleavedAccesses(*TTI)) { 8099 LLVM_DEBUG( 8100 dbgs() 8101 << "LV: Invalidate all interleaved groups due to fold-tail by masking " 8102 "which requires masked-interleaved support.\n"); 8103 if (CM.InterleaveInfo.invalidateGroups()) 8104 // Invalidating interleave groups also requires invalidating all decisions 8105 // based on them, which includes widening decisions and uniform and scalar 8106 // values. 8107 CM.invalidateCostModelingDecisions(); 8108 } 8109 8110 ElementCount MaxUserVF = 8111 UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF; 8112 bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF); 8113 if (!UserVF.isZero() && UserVFIsLegal) { 8114 assert(isPowerOf2_32(UserVF.getKnownMinValue()) && 8115 "VF needs to be a power of two"); 8116 // Collect the instructions (and their associated costs) that will be more 8117 // profitable to scalarize. 8118 if (CM.selectUserVectorizationFactor(UserVF)) { 8119 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); 8120 CM.collectInLoopReductions(); 8121 buildVPlansWithVPRecipes(UserVF, UserVF); 8122 LLVM_DEBUG(printPlans(dbgs())); 8123 return {{UserVF, 0}}; 8124 } else 8125 reportVectorizationInfo("UserVF ignored because of invalid costs.", 8126 "InvalidCost", ORE, OrigLoop); 8127 } 8128 8129 // Populate the set of Vectorization Factor Candidates. 8130 ElementCountSet VFCandidates; 8131 for (auto VF = ElementCount::getFixed(1); 8132 ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2) 8133 VFCandidates.insert(VF); 8134 for (auto VF = ElementCount::getScalable(1); 8135 ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2) 8136 VFCandidates.insert(VF); 8137 8138 for (const auto &VF : VFCandidates) { 8139 // Collect Uniform and Scalar instructions after vectorization with VF. 8140 CM.collectUniformsAndScalars(VF); 8141 8142 // Collect the instructions (and their associated costs) that will be more 8143 // profitable to scalarize. 8144 if (VF.isVector()) 8145 CM.collectInstsToScalarize(VF); 8146 } 8147 8148 CM.collectInLoopReductions(); 8149 buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF); 8150 buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF); 8151 8152 LLVM_DEBUG(printPlans(dbgs())); 8153 if (!MaxFactors.hasVector()) 8154 return VectorizationFactor::Disabled(); 8155 8156 // Select the optimal vectorization factor. 8157 auto SelectedVF = CM.selectVectorizationFactor(VFCandidates); 8158 8159 // Check if it is profitable to vectorize with runtime checks. 
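// For illustration (hypothetical numbers): a loop that would need dozens of
// runtime pointer-aliasing checks can easily spend more in the check block
// than the vector body saves, so if the count below exceeds the configured
// threshold (and reordering was not explicitly allowed), vectorization is
// abandoned with a remark.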
8160 unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks(); 8161 if (SelectedVF.Width.getKnownMinValue() > 1 && NumRuntimePointerChecks) { 8162 bool PragmaThresholdReached = 8163 NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold; 8164 bool ThresholdReached = 8165 NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold; 8166 if ((ThresholdReached && !Hints.allowReordering()) || 8167 PragmaThresholdReached) { 8168 ORE->emit([&]() { 8169 return OptimizationRemarkAnalysisAliasing( 8170 DEBUG_TYPE, "CantReorderMemOps", OrigLoop->getStartLoc(), 8171 OrigLoop->getHeader()) 8172 << "loop not vectorized: cannot prove it is safe to reorder " 8173 "memory operations"; 8174 }); 8175 LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n"); 8176 Hints.emitRemarkWithHints(); 8177 return VectorizationFactor::Disabled(); 8178 } 8179 } 8180 return SelectedVF; 8181 } 8182 8183 void LoopVectorizationPlanner::setBestPlan(ElementCount VF, unsigned UF) { 8184 LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF 8185 << '\n'); 8186 BestVF = VF; 8187 BestUF = UF; 8188 8189 erase_if(VPlans, [VF](const VPlanPtr &Plan) { 8190 return !Plan->hasVF(VF); 8191 }); 8192 assert(VPlans.size() == 1 && "Best VF has not a single VPlan."); 8193 } 8194 8195 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV, 8196 DominatorTree *DT) { 8197 // Perform the actual loop transformation. 8198 8199 // 1. Create a new empty loop. Unlink the old loop and connect the new one. 8200 assert(BestVF.hasValue() && "Vectorization Factor is missing"); 8201 assert(VPlans.size() == 1 && "Not a single VPlan to execute."); 8202 8203 VPTransformState State{ 8204 *BestVF, BestUF, LI, DT, ILV.Builder, &ILV, VPlans.front().get()}; 8205 State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(); 8206 State.TripCount = ILV.getOrCreateTripCount(nullptr); 8207 State.CanonicalIV = ILV.Induction; 8208 8209 ILV.printDebugTracesAtStart(); 8210 8211 //===------------------------------------------------===// 8212 // 8213 // Notice: any optimization or new instruction that go 8214 // into the code below should also be implemented in 8215 // the cost-model. 8216 // 8217 //===------------------------------------------------===// 8218 8219 // 2. Copy and widen instructions from the old loop into the new loop. 8220 VPlans.front()->execute(&State); 8221 8222 // 3. Fix the vectorized code: take care of header phi's, live-outs, 8223 // predication, updating analyses. 
  ILV.fixVectorizedLoop(State);

  ILV.printDebugTracesAtEnd();
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void LoopVectorizationPlanner::printPlans(raw_ostream &O) {
  for (const auto &Plan : VPlans)
    if (PrintVPlansInDotFormat)
      Plan->printDOT(O);
    else
      Plan->print(O);
}
#endif

void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
    SmallPtrSetImpl<Instruction *> &DeadInstructions) {

  // We create new control-flow for the vectorized loop, so the original exit
  // condition will be dead after vectorization if it is only used by the
  // terminator.
  SmallVector<BasicBlock *> ExitingBlocks;
  OrigLoop->getExitingBlocks(ExitingBlocks);
  for (auto *BB : ExitingBlocks) {
    auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0));
    if (!Cmp || !Cmp->hasOneUse())
      continue;

    // TODO: we should introduce a getUniqueExitingBlocks on Loop
    if (!DeadInstructions.insert(Cmp).second)
      continue;

    // An operand of the icmp is often a dead trunc, used by IndUpdate.
    // TODO: can recurse through operands in general
    for (Value *Op : Cmp->operands()) {
      if (isa<TruncInst>(Op) && Op->hasOneUse())
        DeadInstructions.insert(cast<Instruction>(Op));
    }
  }

  // We create new "steps" for induction variable updates to which the original
  // induction variables map. An original update instruction will be dead if
  // all its users except the induction variable are dead.
  auto *Latch = OrigLoop->getLoopLatch();
  for (auto &Induction : Legal->getInductionVars()) {
    PHINode *Ind = Induction.first;
    auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));

    // If the tail is to be folded by masking, the primary induction variable,
    // if it exists, isn't dead: it will be used for masking. Don't kill it.
    if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction())
      continue;

    if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
          return U == Ind || DeadInstructions.count(cast<Instruction>(U));
        }))
      DeadInstructions.insert(IndUpdate);

    // We record as "Dead" also the type-casting instructions we had identified
    // during induction analysis. We don't need any handling for them in the
    // vectorized loop because we have proven that, under a proper runtime
    // test guarding the vectorized loop, the value of the phi, and the casted
    // value of the phi, are the same. The last instruction in this casting
    // chain will get its scalar/vector/widened def from the scalar/vector/
    // widened def of the respective phi node. Any other casts in the induction
    // def-use chain have no other uses outside the phi update chain, and will
    // be ignored.
    InductionDescriptor &IndDes = Induction.second;
    const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
    DeadInstructions.insert(Casts.begin(), Casts.end());
  }
}

Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }

Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }

Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step,
                                        Instruction::BinaryOps BinOp) {
  // When unrolling and the VF is 1, we only need to add a simple scalar.
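  // For example, with StartIdx == 2 an integer induction receives
  // Val + 2 * Step; a floating-point induction combines Val with the scaled
  // step via BinOp (e.g. fadd or fsub) instead.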
8303 Type *Ty = Val->getType(); 8304 assert(!Ty->isVectorTy() && "Val must be a scalar"); 8305 8306 if (Ty->isFloatingPointTy()) { 8307 Constant *C = ConstantFP::get(Ty, (double)StartIdx); 8308 8309 // Floating-point operations inherit FMF via the builder's flags. 8310 Value *MulOp = Builder.CreateFMul(C, Step); 8311 return Builder.CreateBinOp(BinOp, Val, MulOp); 8312 } 8313 Constant *C = ConstantInt::get(Ty, StartIdx); 8314 return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction"); 8315 } 8316 8317 static void AddRuntimeUnrollDisableMetaData(Loop *L) { 8318 SmallVector<Metadata *, 4> MDs; 8319 // Reserve first location for self reference to the LoopID metadata node. 8320 MDs.push_back(nullptr); 8321 bool IsUnrollMetadata = false; 8322 MDNode *LoopID = L->getLoopID(); 8323 if (LoopID) { 8324 // First find existing loop unrolling disable metadata. 8325 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { 8326 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); 8327 if (MD) { 8328 const auto *S = dyn_cast<MDString>(MD->getOperand(0)); 8329 IsUnrollMetadata = 8330 S && S->getString().startswith("llvm.loop.unroll.disable"); 8331 } 8332 MDs.push_back(LoopID->getOperand(i)); 8333 } 8334 } 8335 8336 if (!IsUnrollMetadata) { 8337 // Add runtime unroll disable metadata. 8338 LLVMContext &Context = L->getHeader()->getContext(); 8339 SmallVector<Metadata *, 1> DisableOperands; 8340 DisableOperands.push_back( 8341 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 8342 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 8343 MDs.push_back(DisableNode); 8344 MDNode *NewLoopID = MDNode::get(Context, MDs); 8345 // Set operand 0 to refer to the loop id itself. 8346 NewLoopID->replaceOperandWith(0, NewLoopID); 8347 L->setLoopID(NewLoopID); 8348 } 8349 } 8350 8351 //===--------------------------------------------------------------------===// 8352 // EpilogueVectorizerMainLoop 8353 //===--------------------------------------------------------------------===// 8354 8355 /// This function is partially responsible for generating the control flow 8356 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 8357 BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { 8358 MDNode *OrigLoopID = OrigLoop->getLoopID(); 8359 Loop *Lp = createVectorLoopSkeleton(""); 8360 8361 // Generate the code to check the minimum iteration count of the vector 8362 // epilogue (see below). 8363 EPI.EpilogueIterationCountCheck = 8364 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, true); 8365 EPI.EpilogueIterationCountCheck->setName("iter.check"); 8366 8367 // Generate the code to check any assumptions that we've made for SCEV 8368 // expressions. 8369 EPI.SCEVSafetyCheck = emitSCEVChecks(Lp, LoopScalarPreHeader); 8370 8371 // Generate the code that checks at runtime if arrays overlap. We put the 8372 // checks into a separate block to make the more common case of few elements 8373 // faster. 8374 EPI.MemSafetyCheck = emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 8375 8376 // Generate the iteration count check for the main loop, *after* the check 8377 // for the epilogue loop, so that the path-length is shorter for the case 8378 // that goes directly through the vector epilogue. The longer-path length for 8379 // the main loop is compensated for, by the gain from vectorizing the larger 8380 // trip count. Note: the branch will get updated later on when we vectorize 8381 // the epilogue. 
8382 EPI.MainLoopIterationCountCheck = 8383 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false); 8384 8385 // Generate the induction variable. 8386 OldInduction = Legal->getPrimaryInduction(); 8387 Type *IdxTy = Legal->getWidestInductionType(); 8388 Value *StartIdx = ConstantInt::get(IdxTy, 0); 8389 Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF); 8390 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 8391 EPI.VectorTripCount = CountRoundDown; 8392 Induction = 8393 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 8394 getDebugLocFromInstOrOperands(OldInduction)); 8395 8396 // Skip induction resume value creation here because they will be created in 8397 // the second pass. If we created them here, they wouldn't be used anyway, 8398 // because the vplan in the second pass still contains the inductions from the 8399 // original loop. 8400 8401 return completeLoopSkeleton(Lp, OrigLoopID); 8402 } 8403 8404 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { 8405 LLVM_DEBUG({ 8406 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n" 8407 << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue() 8408 << ", Main Loop UF:" << EPI.MainLoopUF 8409 << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue() 8410 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 8411 }); 8412 } 8413 8414 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() { 8415 DEBUG_WITH_TYPE(VerboseDebug, { 8416 dbgs() << "intermediate fn:\n" << *Induction->getFunction() << "\n"; 8417 }); 8418 } 8419 8420 BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck( 8421 Loop *L, BasicBlock *Bypass, bool ForEpilogue) { 8422 assert(L && "Expected valid Loop."); 8423 assert(Bypass && "Expected valid bypass basic block."); 8424 unsigned VFactor = 8425 ForEpilogue ? EPI.EpilogueVF.getKnownMinValue() : VF.getKnownMinValue(); 8426 unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF; 8427 Value *Count = getOrCreateTripCount(L); 8428 // Reuse existing vector loop preheader for TC checks. 8429 // Note that new preheader block is generated for vector loop. 8430 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 8431 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 8432 8433 // Generate code to check if the loop's trip count is less than VF * UF of the 8434 // main vector loop. 8435 auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF : VF) ? 8436 ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 8437 8438 Value *CheckMinIters = Builder.CreateICmp( 8439 P, Count, ConstantInt::get(Count->getType(), VFactor * UFactor), 8440 "min.iters.check"); 8441 8442 if (!ForEpilogue) 8443 TCCheckBlock->setName("vector.main.loop.iter.check"); 8444 8445 // Create new preheader for vector loop. 8446 LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), 8447 DT, LI, nullptr, "vector.ph"); 8448 8449 if (ForEpilogue) { 8450 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 8451 DT->getNode(Bypass)->getIDom()) && 8452 "TC check is expected to dominate Bypass"); 8453 8454 // Update dominator for Bypass & LoopExit. 8455 DT->changeImmediateDominator(Bypass, TCCheckBlock); 8456 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF)) 8457 // For loops with multiple exits, there's no edge from the middle block 8458 // to exit blocks (as the epilogue must run) and thus no need to update 8459 // the immediate dominator of the exit blocks. 
8460 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 8461 8462 LoopBypassBlocks.push_back(TCCheckBlock); 8463 8464 // Save the trip count so we don't have to regenerate it in the 8465 // vec.epilog.iter.check. This is safe to do because the trip count 8466 // generated here dominates the vector epilog iter check. 8467 EPI.TripCount = Count; 8468 } 8469 8470 ReplaceInstWithInst( 8471 TCCheckBlock->getTerminator(), 8472 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 8473 8474 return TCCheckBlock; 8475 } 8476 8477 //===--------------------------------------------------------------------===// 8478 // EpilogueVectorizerEpilogueLoop 8479 //===--------------------------------------------------------------------===// 8480 8481 /// This function is partially responsible for generating the control flow 8482 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 8483 BasicBlock * 8484 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { 8485 MDNode *OrigLoopID = OrigLoop->getLoopID(); 8486 Loop *Lp = createVectorLoopSkeleton("vec.epilog."); 8487 8488 // Now, compare the remaining count and if there aren't enough iterations to 8489 // execute the vectorized epilogue skip to the scalar part. 8490 BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader; 8491 VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check"); 8492 LoopVectorPreHeader = 8493 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 8494 LI, nullptr, "vec.epilog.ph"); 8495 emitMinimumVectorEpilogueIterCountCheck(Lp, LoopScalarPreHeader, 8496 VecEpilogueIterationCountCheck); 8497 8498 // Adjust the control flow taking the state info from the main loop 8499 // vectorization into account. 8500 assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck && 8501 "expected this to be saved from the previous pass."); 8502 EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith( 8503 VecEpilogueIterationCountCheck, LoopVectorPreHeader); 8504 8505 DT->changeImmediateDominator(LoopVectorPreHeader, 8506 EPI.MainLoopIterationCountCheck); 8507 8508 EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith( 8509 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8510 8511 if (EPI.SCEVSafetyCheck) 8512 EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith( 8513 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8514 if (EPI.MemSafetyCheck) 8515 EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith( 8516 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8517 8518 DT->changeImmediateDominator( 8519 VecEpilogueIterationCountCheck, 8520 VecEpilogueIterationCountCheck->getSinglePredecessor()); 8521 8522 DT->changeImmediateDominator(LoopScalarPreHeader, 8523 EPI.EpilogueIterationCountCheck); 8524 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF)) 8525 // If there is an epilogue which must run, there's no edge from the 8526 // middle block to exit blocks and thus no need to update the immediate 8527 // dominator of the exit blocks. 8528 DT->changeImmediateDominator(LoopExitBlock, 8529 EPI.EpilogueIterationCountCheck); 8530 8531 // Keep track of bypass blocks, as they feed start values to the induction 8532 // phis in the scalar loop preheader. 
  if (EPI.SCEVSafetyCheck)
    LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck);
  if (EPI.MemSafetyCheck)
    LoopBypassBlocks.push_back(EPI.MemSafetyCheck);
  LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck);

  // Generate a resume induction for the vector epilogue and put it in the
  // vector epilogue preheader.
  Type *IdxTy = Legal->getWidestInductionType();
  PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val",
                                         LoopVectorPreHeader->getFirstNonPHI());
  EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
  EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
                           EPI.MainLoopIterationCountCheck);

  // Generate the induction variable.
  OldInduction = Legal->getPrimaryInduction();
  Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
  Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF);
  Value *StartIdx = EPResumeVal;
  Induction =
      createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
                              getDebugLocFromInstOrOperands(OldInduction));

  // Generate induction resume values. These variables save the new starting
  // indexes for the scalar loop. They are used to test if there are any tail
  // iterations left once the vector loop has completed.
  // Note that when the vectorized epilogue is skipped due to the iteration
  // count check, the resume value for the induction variable comes from the
  // trip count of the main vector loop, hence passing the AdditionalBypass
  // argument.
  createInductionResumeValues(Lp, CountRoundDown,
                              {VecEpilogueIterationCountCheck,
                               EPI.VectorTripCount} /* AdditionalBypass */);

  AddRuntimeUnrollDisableMetaData(Lp);
  return completeLoopSkeleton(Lp, OrigLoopID);
}

BasicBlock *
EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
    Loop *L, BasicBlock *Bypass, BasicBlock *Insert) {

  assert(EPI.TripCount &&
         "Expected trip count to have been saved in the first pass.");
  assert(
      (!isa<Instruction>(EPI.TripCount) ||
       DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
      "saved trip count does not dominate insertion point.");
  Value *TC = EPI.TripCount;
  IRBuilder<> Builder(Insert->getTerminator());
  Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");

  // Generate code to check if the loop's trip count is less than VF * UF of
  // the vector epilogue loop.
  auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF) ?
8589 ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 8590 8591 Value *CheckMinIters = Builder.CreateICmp( 8592 P, Count, 8593 ConstantInt::get(Count->getType(), 8594 EPI.EpilogueVF.getKnownMinValue() * EPI.EpilogueUF), 8595 "min.epilog.iters.check"); 8596 8597 ReplaceInstWithInst( 8598 Insert->getTerminator(), 8599 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 8600 8601 LoopBypassBlocks.push_back(Insert); 8602 return Insert; 8603 } 8604 8605 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() { 8606 LLVM_DEBUG({ 8607 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n" 8608 << "Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue() 8609 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 8610 }); 8611 } 8612 8613 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() { 8614 DEBUG_WITH_TYPE(VerboseDebug, { 8615 dbgs() << "final fn:\n" << *Induction->getFunction() << "\n"; 8616 }); 8617 } 8618 8619 bool LoopVectorizationPlanner::getDecisionAndClampRange( 8620 const std::function<bool(ElementCount)> &Predicate, VFRange &Range) { 8621 assert(!Range.isEmpty() && "Trying to test an empty VF range."); 8622 bool PredicateAtRangeStart = Predicate(Range.Start); 8623 8624 for (ElementCount TmpVF = Range.Start * 2; 8625 ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2) 8626 if (Predicate(TmpVF) != PredicateAtRangeStart) { 8627 Range.End = TmpVF; 8628 break; 8629 } 8630 8631 return PredicateAtRangeStart; 8632 } 8633 8634 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, 8635 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range 8636 /// of VF's starting at a given VF and extending it as much as possible. Each 8637 /// vectorization decision can potentially shorten this sub-range during 8638 /// buildVPlan(). 8639 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF, 8640 ElementCount MaxVF) { 8641 auto MaxVFPlusOne = MaxVF.getWithIncrement(1); 8642 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { 8643 VFRange SubRange = {VF, MaxVFPlusOne}; 8644 VPlans.push_back(buildVPlan(SubRange)); 8645 VF = SubRange.End; 8646 } 8647 } 8648 8649 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, 8650 VPlanPtr &Plan) { 8651 assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); 8652 8653 // Look for cached value. 8654 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 8655 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); 8656 if (ECEntryIt != EdgeMaskCache.end()) 8657 return ECEntryIt->second; 8658 8659 VPValue *SrcMask = createBlockInMask(Src, Plan); 8660 8661 // The terminator has to be a branch inst! 8662 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 8663 assert(BI && "Unexpected terminator found"); 8664 8665 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) 8666 return EdgeMaskCache[Edge] = SrcMask; 8667 8668 // If source is an exiting block, we know the exit edge is dynamically dead 8669 // in the vector loop, and thus we don't need to restrict the mask. Avoid 8670 // adding uses of an otherwise potentially dead instruction. 
8671 if (OrigLoop->isLoopExiting(Src)) 8672 return EdgeMaskCache[Edge] = SrcMask; 8673 8674 VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition()); 8675 assert(EdgeMask && "No Edge Mask found for condition"); 8676 8677 if (BI->getSuccessor(0) != Dst) 8678 EdgeMask = Builder.createNot(EdgeMask); 8679 8680 if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND. 8681 // The condition is 'SrcMask && EdgeMask', which is equivalent to 8682 // 'select i1 SrcMask, i1 EdgeMask, i1 false'. 8683 // The select version does not introduce new UB if SrcMask is false and 8684 // EdgeMask is poison. Using 'and' here introduces undefined behavior. 8685 VPValue *False = Plan->getOrAddVPValue( 8686 ConstantInt::getFalse(BI->getCondition()->getType())); 8687 EdgeMask = Builder.createSelect(SrcMask, EdgeMask, False); 8688 } 8689 8690 return EdgeMaskCache[Edge] = EdgeMask; 8691 } 8692 8693 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { 8694 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 8695 8696 // Look for cached value. 8697 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); 8698 if (BCEntryIt != BlockMaskCache.end()) 8699 return BCEntryIt->second; 8700 8701 // All-one mask is modelled as no-mask following the convention for masked 8702 // load/store/gather/scatter. Initialize BlockMask to no-mask. 8703 VPValue *BlockMask = nullptr; 8704 8705 if (OrigLoop->getHeader() == BB) { 8706 if (!CM.blockNeedsPredication(BB)) 8707 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. 8708 8709 // Create the block in mask as the first non-phi instruction in the block. 8710 VPBuilder::InsertPointGuard Guard(Builder); 8711 auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi(); 8712 Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint); 8713 8714 // Introduce the early-exit compare IV <= BTC to form header block mask. 8715 // This is used instead of IV < TC because TC may wrap, unlike BTC. 8716 // Start by constructing the desired canonical IV. 8717 VPValue *IV = nullptr; 8718 if (Legal->getPrimaryInduction()) 8719 IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction()); 8720 else { 8721 auto IVRecipe = new VPWidenCanonicalIVRecipe(); 8722 Builder.getInsertBlock()->insert(IVRecipe, NewInsertionPoint); 8723 IV = IVRecipe->getVPSingleValue(); 8724 } 8725 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); 8726 bool TailFolded = !CM.isScalarEpilogueAllowed(); 8727 8728 if (TailFolded && CM.TTI.emitGetActiveLaneMask()) { 8729 // While ActiveLaneMask is a binary op that consumes the loop tripcount 8730 // as a second argument, we only pass the IV here and extract the 8731 // tripcount from the transform state where codegen of the VP instructions 8732 // happen. 8733 BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV}); 8734 } else { 8735 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); 8736 } 8737 return BlockMaskCache[BB] = BlockMask; 8738 } 8739 8740 // This is the block mask. We OR all incoming edges. 8741 for (auto *Predecessor : predecessors(BB)) { 8742 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); 8743 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too. 8744 return BlockMaskCache[BB] = EdgeMask; 8745 8746 if (!BlockMask) { // BlockMask has its initialized nullptr value. 
8747 BlockMask = EdgeMask; 8748 continue; 8749 } 8750 8751 BlockMask = Builder.createOr(BlockMask, EdgeMask); 8752 } 8753 8754 return BlockMaskCache[BB] = BlockMask; 8755 } 8756 8757 VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, 8758 ArrayRef<VPValue *> Operands, 8759 VFRange &Range, 8760 VPlanPtr &Plan) { 8761 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 8762 "Must be called with either a load or store"); 8763 8764 auto willWiden = [&](ElementCount VF) -> bool { 8765 if (VF.isScalar()) 8766 return false; 8767 LoopVectorizationCostModel::InstWidening Decision = 8768 CM.getWideningDecision(I, VF); 8769 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 8770 "CM decision should be taken at this point."); 8771 if (Decision == LoopVectorizationCostModel::CM_Interleave) 8772 return true; 8773 if (CM.isScalarAfterVectorization(I, VF) || 8774 CM.isProfitableToScalarize(I, VF)) 8775 return false; 8776 return Decision != LoopVectorizationCostModel::CM_Scalarize; 8777 }; 8778 8779 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8780 return nullptr; 8781 8782 VPValue *Mask = nullptr; 8783 if (Legal->isMaskRequired(I)) 8784 Mask = createBlockInMask(I->getParent(), Plan); 8785 8786 if (LoadInst *Load = dyn_cast<LoadInst>(I)) 8787 return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask); 8788 8789 StoreInst *Store = cast<StoreInst>(I); 8790 return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0], 8791 Mask); 8792 } 8793 8794 VPWidenIntOrFpInductionRecipe * 8795 VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi, 8796 ArrayRef<VPValue *> Operands) const { 8797 // Check if this is an integer or fp induction. If so, build the recipe that 8798 // produces its scalar and vector values. 8799 InductionDescriptor II = Legal->getInductionVars().lookup(Phi); 8800 if (II.getKind() == InductionDescriptor::IK_IntInduction || 8801 II.getKind() == InductionDescriptor::IK_FpInduction) { 8802 assert(II.getStartValue() == 8803 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); 8804 const SmallVectorImpl<Instruction *> &Casts = II.getCastInsts(); 8805 return new VPWidenIntOrFpInductionRecipe( 8806 Phi, Operands[0], Casts.empty() ? nullptr : Casts.front()); 8807 } 8808 8809 return nullptr; 8810 } 8811 8812 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate( 8813 TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range, 8814 VPlan &Plan) const { 8815 // Optimize the special case where the source is a constant integer 8816 // induction variable. Notice that we can only optimize the 'trunc' case 8817 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and 8818 // (c) other casts depend on pointer size. 8819 8820 // Determine whether \p K is a truncation based on an induction variable that 8821 // can be optimized. 
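  // For example, for
  //   %iv    = phi i64 [ 0, %preheader ], [ %iv.next, %latch ]
  //   %trunc = trunc i64 %iv to i32
  // the truncation can be folded into the induction itself, i.e. the widened
  // induction is generated directly in i32 rather than widening %iv to i64
  // vectors and truncating each element.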
8822 auto isOptimizableIVTruncate = 8823 [&](Instruction *K) -> std::function<bool(ElementCount)> { 8824 return [=](ElementCount VF) -> bool { 8825 return CM.isOptimizableIVTruncate(K, VF); 8826 }; 8827 }; 8828 8829 if (LoopVectorizationPlanner::getDecisionAndClampRange( 8830 isOptimizableIVTruncate(I), Range)) { 8831 8832 InductionDescriptor II = 8833 Legal->getInductionVars().lookup(cast<PHINode>(I->getOperand(0))); 8834 VPValue *Start = Plan.getOrAddVPValue(II.getStartValue()); 8835 return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)), 8836 Start, nullptr, I); 8837 } 8838 return nullptr; 8839 } 8840 8841 VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi, 8842 ArrayRef<VPValue *> Operands, 8843 VPlanPtr &Plan) { 8844 // If all incoming values are equal, the incoming VPValue can be used directly 8845 // instead of creating a new VPBlendRecipe. 8846 VPValue *FirstIncoming = Operands[0]; 8847 if (all_of(Operands, [FirstIncoming](const VPValue *Inc) { 8848 return FirstIncoming == Inc; 8849 })) { 8850 return Operands[0]; 8851 } 8852 8853 // We know that all PHIs in non-header blocks are converted into selects, so 8854 // we don't have to worry about the insertion order and we can just use the 8855 // builder. At this point we generate the predication tree. There may be 8856 // duplications since this is a simple recursive scan, but future 8857 // optimizations will clean it up. 8858 SmallVector<VPValue *, 2> OperandsWithMask; 8859 unsigned NumIncoming = Phi->getNumIncomingValues(); 8860 8861 for (unsigned In = 0; In < NumIncoming; In++) { 8862 VPValue *EdgeMask = 8863 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan); 8864 assert((EdgeMask || NumIncoming == 1) && 8865 "Multiple predecessors with one having a full mask"); 8866 OperandsWithMask.push_back(Operands[In]); 8867 if (EdgeMask) 8868 OperandsWithMask.push_back(EdgeMask); 8869 } 8870 return toVPRecipeResult(new VPBlendRecipe(Phi, OperandsWithMask)); 8871 } 8872 8873 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, 8874 ArrayRef<VPValue *> Operands, 8875 VFRange &Range) const { 8876 8877 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8878 [this, CI](ElementCount VF) { return CM.isScalarWithPredication(CI); }, 8879 Range); 8880 8881 if (IsPredicated) 8882 return nullptr; 8883 8884 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8885 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || 8886 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect || 8887 ID == Intrinsic::pseudoprobe || 8888 ID == Intrinsic::experimental_noalias_scope_decl)) 8889 return nullptr; 8890 8891 auto willWiden = [&](ElementCount VF) -> bool { 8892 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8893 // The following case may be scalarized depending on the VF. 8894 // The flag shows whether we use Intrinsic or a usual Call for vectorized 8895 // version of the instruction. 8896 // Is it beneficial to perform intrinsic call compared to lib call? 8897 bool NeedToScalarize = false; 8898 InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); 8899 InstructionCost IntrinsicCost = ID ? 
CM.getVectorIntrinsicCost(CI, VF) : 0; 8900 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 8901 return UseVectorIntrinsic || !NeedToScalarize; 8902 }; 8903 8904 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8905 return nullptr; 8906 8907 ArrayRef<VPValue *> Ops = Operands.take_front(CI->getNumArgOperands()); 8908 return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end())); 8909 } 8910 8911 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { 8912 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) && 8913 !isa<StoreInst>(I) && "Instruction should have been handled earlier"); 8914 // Instruction should be widened, unless it is scalar after vectorization, 8915 // scalarization is profitable or it is predicated. 8916 auto WillScalarize = [this, I](ElementCount VF) -> bool { 8917 return CM.isScalarAfterVectorization(I, VF) || 8918 CM.isProfitableToScalarize(I, VF) || CM.isScalarWithPredication(I); 8919 }; 8920 return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize, 8921 Range); 8922 } 8923 8924 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, 8925 ArrayRef<VPValue *> Operands) const { 8926 auto IsVectorizableOpcode = [](unsigned Opcode) { 8927 switch (Opcode) { 8928 case Instruction::Add: 8929 case Instruction::And: 8930 case Instruction::AShr: 8931 case Instruction::BitCast: 8932 case Instruction::FAdd: 8933 case Instruction::FCmp: 8934 case Instruction::FDiv: 8935 case Instruction::FMul: 8936 case Instruction::FNeg: 8937 case Instruction::FPExt: 8938 case Instruction::FPToSI: 8939 case Instruction::FPToUI: 8940 case Instruction::FPTrunc: 8941 case Instruction::FRem: 8942 case Instruction::FSub: 8943 case Instruction::ICmp: 8944 case Instruction::IntToPtr: 8945 case Instruction::LShr: 8946 case Instruction::Mul: 8947 case Instruction::Or: 8948 case Instruction::PtrToInt: 8949 case Instruction::SDiv: 8950 case Instruction::Select: 8951 case Instruction::SExt: 8952 case Instruction::Shl: 8953 case Instruction::SIToFP: 8954 case Instruction::SRem: 8955 case Instruction::Sub: 8956 case Instruction::Trunc: 8957 case Instruction::UDiv: 8958 case Instruction::UIToFP: 8959 case Instruction::URem: 8960 case Instruction::Xor: 8961 case Instruction::ZExt: 8962 return true; 8963 } 8964 return false; 8965 }; 8966 8967 if (!IsVectorizableOpcode(I->getOpcode())) 8968 return nullptr; 8969 8970 // Success: widen this instruction. 
8971 return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end())); 8972 } 8973 8974 void VPRecipeBuilder::fixHeaderPhis() { 8975 BasicBlock *OrigLatch = OrigLoop->getLoopLatch(); 8976 for (VPWidenPHIRecipe *R : PhisToFix) { 8977 auto *PN = cast<PHINode>(R->getUnderlyingValue()); 8978 VPRecipeBase *IncR = 8979 getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch))); 8980 R->addOperand(IncR->getVPSingleValue()); 8981 } 8982 } 8983 8984 VPBasicBlock *VPRecipeBuilder::handleReplication( 8985 Instruction *I, VFRange &Range, VPBasicBlock *VPBB, 8986 VPlanPtr &Plan) { 8987 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( 8988 [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); }, 8989 Range); 8990 8991 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8992 [&](ElementCount VF) { return CM.isPredicatedInst(I); }, Range); 8993 8994 // Even if the instruction is not marked as uniform, there are certain 8995 // intrinsic calls that can be effectively treated as such, so we check for 8996 // them here. Conservatively, we only do this for scalable vectors, since 8997 // for fixed-width VFs we can always fall back on full scalarization. 8998 if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) { 8999 switch (cast<IntrinsicInst>(I)->getIntrinsicID()) { 9000 case Intrinsic::assume: 9001 case Intrinsic::lifetime_start: 9002 case Intrinsic::lifetime_end: 9003 // For scalable vectors if one of the operands is variant then we still 9004 // want to mark as uniform, which will generate one instruction for just 9005 // the first lane of the vector. We can't scalarize the call in the same 9006 // way as for fixed-width vectors because we don't know how many lanes 9007 // there are. 9008 // 9009 // The reasons for doing it this way for scalable vectors are: 9010 // 1. For the assume intrinsic generating the instruction for the first 9011 // lane is still be better than not generating any at all. For 9012 // example, the input may be a splat across all lanes. 9013 // 2. For the lifetime start/end intrinsics the pointer operand only 9014 // does anything useful when the input comes from a stack object, 9015 // which suggests it should always be uniform. For non-stack objects 9016 // the effect is to poison the object, which still allows us to 9017 // remove the call. 9018 IsUniform = true; 9019 break; 9020 default: 9021 break; 9022 } 9023 } 9024 9025 auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()), 9026 IsUniform, IsPredicated); 9027 setRecipe(I, Recipe); 9028 Plan->addVPValue(I, Recipe); 9029 9030 // Find if I uses a predicated instruction. If so, it will use its scalar 9031 // value. Avoid hoisting the insert-element which packs the scalar value into 9032 // a vector value, as that happens iff all users use the vector value. 9033 for (VPValue *Op : Recipe->operands()) { 9034 auto *PredR = dyn_cast_or_null<VPPredInstPHIRecipe>(Op->getDef()); 9035 if (!PredR) 9036 continue; 9037 auto *RepR = 9038 cast_or_null<VPReplicateRecipe>(PredR->getOperand(0)->getDef()); 9039 assert(RepR->isPredicated() && 9040 "expected Replicate recipe to be predicated"); 9041 RepR->setAlsoPack(false); 9042 } 9043 9044 // Finalize the recipe for Instr, first if it is not predicated. 
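  // A predicated recipe is instead placed in its own triangular if-then
  // region (built by createReplicateRegion below), and the remaining recipes
  // for this basic block continue in a fresh successor VPBasicBlock.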
9045 if (!IsPredicated) { 9046 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n"); 9047 VPBB->appendRecipe(Recipe); 9048 return VPBB; 9049 } 9050 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); 9051 assert(VPBB->getSuccessors().empty() && 9052 "VPBB has successors when handling predicated replication."); 9053 // Record predicated instructions for above packing optimizations. 9054 VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan); 9055 VPBlockUtils::insertBlockAfter(Region, VPBB); 9056 auto *RegSucc = new VPBasicBlock(); 9057 VPBlockUtils::insertBlockAfter(RegSucc, Region); 9058 return RegSucc; 9059 } 9060 9061 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr, 9062 VPRecipeBase *PredRecipe, 9063 VPlanPtr &Plan) { 9064 // Instructions marked for predication are replicated and placed under an 9065 // if-then construct to prevent side-effects. 9066 9067 // Generate recipes to compute the block mask for this region. 9068 VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan); 9069 9070 // Build the triangular if-then region. 9071 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str(); 9072 assert(Instr->getParent() && "Predicated instruction not in any basic block"); 9073 auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask); 9074 auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); 9075 auto *PHIRecipe = Instr->getType()->isVoidTy() 9076 ? nullptr 9077 : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr)); 9078 if (PHIRecipe) { 9079 Plan->removeVPValueFor(Instr); 9080 Plan->addVPValue(Instr, PHIRecipe); 9081 } 9082 auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); 9083 auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); 9084 VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true); 9085 9086 // Note: first set Entry as region entry and then connect successors starting 9087 // from it in order, to propagate the "parent" of each VPBasicBlock. 9088 VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry); 9089 VPBlockUtils::connectBlocks(Pred, Exit); 9090 9091 return Region; 9092 } 9093 9094 VPRecipeOrVPValueTy 9095 VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, 9096 ArrayRef<VPValue *> Operands, 9097 VFRange &Range, VPlanPtr &Plan) { 9098 // First, check for specific widening recipes that deal with calls, memory 9099 // operations, inductions and Phi nodes. 
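  // Instructions for which no widening recipe can be built are returned as
  // nullptr and fall back to handleReplication() in the caller.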
9100 if (auto *CI = dyn_cast<CallInst>(Instr)) 9101 return toVPRecipeResult(tryToWidenCall(CI, Operands, Range)); 9102 9103 if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr)) 9104 return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan)); 9105 9106 VPRecipeBase *Recipe; 9107 if (auto Phi = dyn_cast<PHINode>(Instr)) { 9108 if (Phi->getParent() != OrigLoop->getHeader()) 9109 return tryToBlend(Phi, Operands, Plan); 9110 if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands))) 9111 return toVPRecipeResult(Recipe); 9112 9113 VPWidenPHIRecipe *PhiRecipe = nullptr; 9114 if (Legal->isReductionVariable(Phi) || Legal->isFirstOrderRecurrence(Phi)) { 9115 VPValue *StartV = Operands[0]; 9116 if (Legal->isReductionVariable(Phi)) { 9117 RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi]; 9118 assert(RdxDesc.getRecurrenceStartValue() == 9119 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); 9120 PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV, 9121 CM.isInLoopReduction(Phi), 9122 CM.useOrderedReductions(RdxDesc)); 9123 } else { 9124 PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV); 9125 } 9126 9127 // Record the incoming value from the backedge, so we can add the incoming 9128 // value from the backedge after all recipes have been created. 9129 recordRecipeOf(cast<Instruction>( 9130 Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch()))); 9131 PhisToFix.push_back(PhiRecipe); 9132 } else { 9133 // TODO: record start and backedge value for remaining pointer induction 9134 // phis. 9135 assert(Phi->getType()->isPointerTy() && 9136 "only pointer phis should be handled here"); 9137 PhiRecipe = new VPWidenPHIRecipe(Phi); 9138 } 9139 9140 return toVPRecipeResult(PhiRecipe); 9141 } 9142 9143 if (isa<TruncInst>(Instr) && 9144 (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Operands, 9145 Range, *Plan))) 9146 return toVPRecipeResult(Recipe); 9147 9148 if (!shouldWiden(Instr, Range)) 9149 return nullptr; 9150 9151 if (auto GEP = dyn_cast<GetElementPtrInst>(Instr)) 9152 return toVPRecipeResult(new VPWidenGEPRecipe( 9153 GEP, make_range(Operands.begin(), Operands.end()), OrigLoop)); 9154 9155 if (auto *SI = dyn_cast<SelectInst>(Instr)) { 9156 bool InvariantCond = 9157 PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop); 9158 return toVPRecipeResult(new VPWidenSelectRecipe( 9159 *SI, make_range(Operands.begin(), Operands.end()), InvariantCond)); 9160 } 9161 9162 return toVPRecipeResult(tryToWiden(Instr, Operands)); 9163 } 9164 9165 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, 9166 ElementCount MaxVF) { 9167 assert(OrigLoop->isInnermost() && "Inner loop expected."); 9168 9169 // Collect instructions from the original loop that will become trivially dead 9170 // in the vectorized loop. We don't need to vectorize these instructions. For 9171 // example, original induction update instructions can become dead because we 9172 // separately emit induction "steps" when generating code for the new loop. 9173 // Similarly, we create a new latch condition when setting up the structure 9174 // of the new loop, so the old one can become dead. 9175 SmallPtrSet<Instruction *, 4> DeadInstructions; 9176 collectTriviallyDeadInstructions(DeadInstructions); 9177 9178 // Add assume instructions we need to drop to DeadInstructions, to prevent 9179 // them from being added to the VPlan. 9180 // TODO: We only need to drop assumes in blocks that get flattend. 
If the 9181 // control flow is preserved, we should keep them. 9182 auto &ConditionalAssumes = Legal->getConditionalAssumes(); 9183 DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end()); 9184 9185 MapVector<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter(); 9186 // Dead instructions do not need sinking. Remove them from SinkAfter. 9187 for (Instruction *I : DeadInstructions) 9188 SinkAfter.erase(I); 9189 9190 // Cannot sink instructions after dead instructions (there won't be any 9191 // recipes for them). Instead, find the first non-dead previous instruction. 9192 for (auto &P : Legal->getSinkAfter()) { 9193 Instruction *SinkTarget = P.second; 9194 Instruction *FirstInst = &*SinkTarget->getParent()->begin(); 9195 (void)FirstInst; 9196 while (DeadInstructions.contains(SinkTarget)) { 9197 assert( 9198 SinkTarget != FirstInst && 9199 "Must find a live instruction (at least the one feeding the " 9200 "first-order recurrence PHI) before reaching beginning of the block"); 9201 SinkTarget = SinkTarget->getPrevNode(); 9202 assert(SinkTarget != P.first && 9203 "sink source equals target, no sinking required"); 9204 } 9205 P.second = SinkTarget; 9206 } 9207 9208 auto MaxVFPlusOne = MaxVF.getWithIncrement(1); 9209 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { 9210 VFRange SubRange = {VF, MaxVFPlusOne}; 9211 VPlans.push_back( 9212 buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter)); 9213 VF = SubRange.End; 9214 } 9215 } 9216 9217 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( 9218 VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions, 9219 const MapVector<Instruction *, Instruction *> &SinkAfter) { 9220 9221 SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups; 9222 9223 VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder); 9224 9225 // --------------------------------------------------------------------------- 9226 // Pre-construction: record ingredients whose recipes we'll need to further 9227 // process after constructing the initial VPlan. 9228 // --------------------------------------------------------------------------- 9229 9230 // Mark instructions we'll need to sink later and their targets as 9231 // ingredients whose recipe we'll need to record. 9232 for (auto &Entry : SinkAfter) { 9233 RecipeBuilder.recordRecipeOf(Entry.first); 9234 RecipeBuilder.recordRecipeOf(Entry.second); 9235 } 9236 for (auto &Reduction : CM.getInLoopReductionChains()) { 9237 PHINode *Phi = Reduction.first; 9238 RecurKind Kind = Legal->getReductionVars()[Phi].getRecurrenceKind(); 9239 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second; 9240 9241 RecipeBuilder.recordRecipeOf(Phi); 9242 for (auto &R : ReductionOperations) { 9243 RecipeBuilder.recordRecipeOf(R); 9244 // For min/max reducitons, where we have a pair of icmp/select, we also 9245 // need to record the ICmp recipe, so it can be removed later. 9246 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) 9247 RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0))); 9248 } 9249 } 9250 9251 // For each interleave group which is relevant for this (possibly trimmed) 9252 // Range, add it to the set of groups to be later applied to the VPlan and add 9253 // placeholders for its members' Recipes which we'll be replacing with a 9254 // single VPInterleaveRecipe. 
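  // For example, loads of A[2*i] and A[2*i+1] may form a single interleave
  // group with factor 2; recording its members here allows their individual
  // recipes to be replaced by one VPInterleaveRecipe once the initial VPlan
  // has been built.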
9255 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) { 9256 auto applyIG = [IG, this](ElementCount VF) -> bool { 9257 return (VF.isVector() && // Query is illegal for VF == 1 9258 CM.getWideningDecision(IG->getInsertPos(), VF) == 9259 LoopVectorizationCostModel::CM_Interleave); 9260 }; 9261 if (!getDecisionAndClampRange(applyIG, Range)) 9262 continue; 9263 InterleaveGroups.insert(IG); 9264 for (unsigned i = 0; i < IG->getFactor(); i++) 9265 if (Instruction *Member = IG->getMember(i)) 9266 RecipeBuilder.recordRecipeOf(Member); 9267 }; 9268 9269 // --------------------------------------------------------------------------- 9270 // Build initial VPlan: Scan the body of the loop in a topological order to 9271 // visit each basic block after having visited its predecessor basic blocks. 9272 // --------------------------------------------------------------------------- 9273 9274 // Create a dummy pre-entry VPBasicBlock to start building the VPlan. 9275 auto Plan = std::make_unique<VPlan>(); 9276 VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry"); 9277 Plan->setEntry(VPBB); 9278 9279 // Scan the body of the loop in a topological order to visit each basic block 9280 // after having visited its predecessor basic blocks. 9281 LoopBlocksDFS DFS(OrigLoop); 9282 DFS.perform(LI); 9283 9284 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 9285 // Relevant instructions from basic block BB will be grouped into VPRecipe 9286 // ingredients and fill a new VPBasicBlock. 9287 unsigned VPBBsForBB = 0; 9288 auto *FirstVPBBForBB = new VPBasicBlock(BB->getName()); 9289 VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB); 9290 VPBB = FirstVPBBForBB; 9291 Builder.setInsertPoint(VPBB); 9292 9293 // Introduce each ingredient into VPlan. 9294 // TODO: Model and preserve debug instrinsics in VPlan. 9295 for (Instruction &I : BB->instructionsWithoutDebug()) { 9296 Instruction *Instr = &I; 9297 9298 // First filter out irrelevant instructions, to ensure no recipes are 9299 // built for them. 9300 if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr)) 9301 continue; 9302 9303 SmallVector<VPValue *, 4> Operands; 9304 auto *Phi = dyn_cast<PHINode>(Instr); 9305 if (Phi && Phi->getParent() == OrigLoop->getHeader()) { 9306 Operands.push_back(Plan->getOrAddVPValue( 9307 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()))); 9308 } else { 9309 auto OpRange = Plan->mapToVPValues(Instr->operands()); 9310 Operands = {OpRange.begin(), OpRange.end()}; 9311 } 9312 if (auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe( 9313 Instr, Operands, Range, Plan)) { 9314 // If Instr can be simplified to an existing VPValue, use it. 9315 if (RecipeOrValue.is<VPValue *>()) { 9316 auto *VPV = RecipeOrValue.get<VPValue *>(); 9317 Plan->addVPValue(Instr, VPV); 9318 // If the re-used value is a recipe, register the recipe for the 9319 // instruction, in case the recipe for Instr needs to be recorded. 9320 if (auto *R = dyn_cast_or_null<VPRecipeBase>(VPV->getDef())) 9321 RecipeBuilder.setRecipe(Instr, R); 9322 continue; 9323 } 9324 // Otherwise, add the new recipe. 9325 VPRecipeBase *Recipe = RecipeOrValue.get<VPRecipeBase *>(); 9326 for (auto *Def : Recipe->definedValues()) { 9327 auto *UV = Def->getUnderlyingValue(); 9328 Plan->addVPValue(UV, Def); 9329 } 9330 9331 RecipeBuilder.setRecipe(Instr, Recipe); 9332 VPBB->appendRecipe(Recipe); 9333 continue; 9334 } 9335 9336 // Otherwise, if all widening options failed, Instruction is to be 9337 // replicated. This may create a successor for VPBB. 
9338 VPBasicBlock *NextVPBB = 9339 RecipeBuilder.handleReplication(Instr, Range, VPBB, Plan); 9340 if (NextVPBB != VPBB) { 9341 VPBB = NextVPBB; 9342 VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++) 9343 : ""); 9344 } 9345 } 9346 } 9347 9348 RecipeBuilder.fixHeaderPhis(); 9349 9350 // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks 9351 // may also be empty, such as the last one VPBB, reflecting original 9352 // basic-blocks with no recipes. 9353 VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry()); 9354 assert(PreEntry->empty() && "Expecting empty pre-entry block."); 9355 VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor()); 9356 VPBlockUtils::disconnectBlocks(PreEntry, Entry); 9357 delete PreEntry; 9358 9359 // --------------------------------------------------------------------------- 9360 // Transform initial VPlan: Apply previously taken decisions, in order, to 9361 // bring the VPlan to its final state. 9362 // --------------------------------------------------------------------------- 9363 9364 // Apply Sink-After legal constraints. 9365 auto GetReplicateRegion = [](VPRecipeBase *R) -> VPRegionBlock * { 9366 auto *Region = dyn_cast_or_null<VPRegionBlock>(R->getParent()->getParent()); 9367 if (Region && Region->isReplicator()) { 9368 assert(Region->getNumSuccessors() == 1 && 9369 Region->getNumPredecessors() == 1 && "Expected SESE region!"); 9370 assert(R->getParent()->size() == 1 && 9371 "A recipe in an original replicator region must be the only " 9372 "recipe in its block"); 9373 return Region; 9374 } 9375 return nullptr; 9376 }; 9377 for (auto &Entry : SinkAfter) { 9378 VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first); 9379 VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second); 9380 9381 auto *TargetRegion = GetReplicateRegion(Target); 9382 auto *SinkRegion = GetReplicateRegion(Sink); 9383 if (!SinkRegion) { 9384 // If the sink source is not a replicate region, sink the recipe directly. 9385 if (TargetRegion) { 9386 // The target is in a replication region, make sure to move Sink to 9387 // the block after it, not into the replication region itself. 9388 VPBasicBlock *NextBlock = 9389 cast<VPBasicBlock>(TargetRegion->getSuccessors().front()); 9390 Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi()); 9391 } else 9392 Sink->moveAfter(Target); 9393 continue; 9394 } 9395 9396 // The sink source is in a replicate region. Unhook the region from the CFG. 9397 auto *SinkPred = SinkRegion->getSinglePredecessor(); 9398 auto *SinkSucc = SinkRegion->getSingleSuccessor(); 9399 VPBlockUtils::disconnectBlocks(SinkPred, SinkRegion); 9400 VPBlockUtils::disconnectBlocks(SinkRegion, SinkSucc); 9401 VPBlockUtils::connectBlocks(SinkPred, SinkSucc); 9402 9403 if (TargetRegion) { 9404 // The target recipe is also in a replicate region, move the sink region 9405 // after the target region. 9406 auto *TargetSucc = TargetRegion->getSingleSuccessor(); 9407 VPBlockUtils::disconnectBlocks(TargetRegion, TargetSucc); 9408 VPBlockUtils::connectBlocks(TargetRegion, SinkRegion); 9409 VPBlockUtils::connectBlocks(SinkRegion, TargetSucc); 9410 } else { 9411 // The sink source is in a replicate region, we need to move the whole 9412 // replicate region, which should only contain a single recipe in the 9413 // main block. 
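      // Split the target's block right after the target recipe, so that the
      // unhooked sink region can be re-linked there:
      // SplitPred -> SinkRegion -> SplitBlock.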
9414 auto *SplitBlock = 9415 Target->getParent()->splitAt(std::next(Target->getIterator())); 9416 9417 auto *SplitPred = SplitBlock->getSinglePredecessor(); 9418 9419 VPBlockUtils::disconnectBlocks(SplitPred, SplitBlock); 9420 VPBlockUtils::connectBlocks(SplitPred, SinkRegion); 9421 VPBlockUtils::connectBlocks(SinkRegion, SplitBlock); 9422 if (VPBB == SplitPred) 9423 VPBB = SplitBlock; 9424 } 9425 } 9426 9427 // Introduce a recipe to combine the incoming and previous values of a 9428 // first-order recurrence. 9429 for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) { 9430 auto *RecurPhi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R); 9431 if (!RecurPhi) 9432 continue; 9433 9434 auto *RecurSplice = cast<VPInstruction>( 9435 Builder.createNaryOp(VPInstruction::FirstOrderRecurrenceSplice, 9436 {RecurPhi, RecurPhi->getBackedgeValue()})); 9437 9438 VPRecipeBase *PrevRecipe = RecurPhi->getBackedgeRecipe(); 9439 if (auto *Region = GetReplicateRegion(PrevRecipe)) { 9440 VPBasicBlock *Succ = cast<VPBasicBlock>(Region->getSingleSuccessor()); 9441 RecurSplice->moveBefore(*Succ, Succ->getFirstNonPhi()); 9442 } else 9443 RecurSplice->moveAfter(PrevRecipe); 9444 RecurPhi->replaceAllUsesWith(RecurSplice); 9445 // Set the first operand of RecurSplice to RecurPhi again, after replacing 9446 // all users. 9447 RecurSplice->setOperand(0, RecurPhi); 9448 } 9449 9450 // Interleave memory: for each Interleave Group we marked earlier as relevant 9451 // for this VPlan, replace the Recipes widening its memory instructions with a 9452 // single VPInterleaveRecipe at its insertion point. 9453 for (auto IG : InterleaveGroups) { 9454 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( 9455 RecipeBuilder.getRecipe(IG->getInsertPos())); 9456 SmallVector<VPValue *, 4> StoredValues; 9457 for (unsigned i = 0; i < IG->getFactor(); ++i) 9458 if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) { 9459 auto *StoreR = 9460 cast<VPWidenMemoryInstructionRecipe>(RecipeBuilder.getRecipe(SI)); 9461 StoredValues.push_back(StoreR->getStoredValue()); 9462 } 9463 9464 auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues, 9465 Recipe->getMask()); 9466 VPIG->insertBefore(Recipe); 9467 unsigned J = 0; 9468 for (unsigned i = 0; i < IG->getFactor(); ++i) 9469 if (Instruction *Member = IG->getMember(i)) { 9470 if (!Member->getType()->isVoidTy()) { 9471 VPValue *OriginalV = Plan->getVPValue(Member); 9472 Plan->removeVPValueFor(Member); 9473 Plan->addVPValue(Member, VPIG->getVPValue(J)); 9474 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J)); 9475 J++; 9476 } 9477 RecipeBuilder.getRecipe(Member)->eraseFromParent(); 9478 } 9479 } 9480 9481 // Adjust the recipes for any inloop reductions. 9482 adjustRecipesForReductions(VPBB, Plan, RecipeBuilder, Range.Start); 9483 9484 VPlanTransforms::sinkScalarOperands(*Plan); 9485 VPlanTransforms::mergeReplicateRegions(*Plan); 9486 9487 std::string PlanName; 9488 raw_string_ostream RSO(PlanName); 9489 ElementCount VF = Range.Start; 9490 Plan->addVF(VF); 9491 RSO << "Initial VPlan for VF={" << VF; 9492 for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) { 9493 Plan->addVF(VF); 9494 RSO << "," << VF; 9495 } 9496 RSO << "},UF>=1"; 9497 RSO.flush(); 9498 Plan->setName(PlanName); 9499 9500 return Plan; 9501 } 9502 9503 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { 9504 // Outer loop handling: They may require CFG and instruction level 9505 // transformations before even evaluating whether vectorization is profitable. 
  // Since we cannot modify the incoming IR, we need to build VPlan upfront in
  // the vectorization pipeline.
  assert(!OrigLoop->isInnermost());
  assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");

  // Create new empty VPlan.
  auto Plan = std::make_unique<VPlan>();

  // Build hierarchical CFG.
  VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
  HCFGBuilder.buildHierarchicalCFG();

  for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End);
       VF *= 2)
    Plan->addVF(VF);

  if (EnableVPlanPredication) {
    VPlanPredicator VPP(*Plan);
    VPP.predicate();

    // Avoid running the transformation to recipes until masked code generation
    // in the VPlan-native path is in place.
    return Plan;
  }

  SmallPtrSet<Instruction *, 1> DeadInstructions;
  VPlanTransforms::VPInstructionsToVPRecipes(OrigLoop, Plan,
                                             Legal->getInductionVars(),
                                             DeadInstructions, *PSE.getSE());
  return Plan;
}

// Adjust the recipes for reductions. For in-loop reductions the chain of
// instructions leading from the loop exit instr to the phi needs to be
// converted to reductions, with one operand being vector and the other being
// the scalar reduction chain. For other reductions, a select is introduced
// between the phi and live-out recipes when folding the tail.
void LoopVectorizationPlanner::adjustRecipesForReductions(
    VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder,
    ElementCount MinVF) {
  for (auto &Reduction : CM.getInLoopReductionChains()) {
    PHINode *Phi = Reduction.first;
    RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
    const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;

    if (MinVF.isScalar() && !CM.useOrderedReductions(RdxDesc))
      continue;

    // ReductionOperations are ordered top-down from the phi's use to the
    // LoopExitValue. We keep track of the previous item (the Chain) to tell
    // which of the two operands will remain scalar and which will be reduced.
    // For minmax the chain will be the select instructions.
    Instruction *Chain = Phi;
    for (Instruction *R : ReductionOperations) {
      VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
      RecurKind Kind = RdxDesc.getRecurrenceKind();

      VPValue *ChainOp = Plan->getVPValue(Chain);
      unsigned FirstOpId;
      if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
        assert(isa<VPWidenSelectRecipe>(WidenRecipe) &&
               "Expected to replace a VPWidenSelectSC");
        FirstOpId = 1;
      } else {
        assert((MinVF.isScalar() || isa<VPWidenRecipe>(WidenRecipe)) &&
               "Expected to replace a VPWidenSC");
        FirstOpId = 0;
      }
      unsigned VecOpId =
          R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
      VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));

      auto *CondOp = CM.foldTailByMasking()
                         ?
RecipeBuilder.createBlockInMask(R->getParent(), Plan) 9580 : nullptr; 9581 VPReductionRecipe *RedRecipe = new VPReductionRecipe( 9582 &RdxDesc, R, ChainOp, VecOp, CondOp, TTI); 9583 WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe); 9584 Plan->removeVPValueFor(R); 9585 Plan->addVPValue(R, RedRecipe); 9586 WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator()); 9587 WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe); 9588 WidenRecipe->eraseFromParent(); 9589 9590 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9591 VPRecipeBase *CompareRecipe = 9592 RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0))); 9593 assert(isa<VPWidenRecipe>(CompareRecipe) && 9594 "Expected to replace a VPWidenSC"); 9595 assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 && 9596 "Expected no remaining users"); 9597 CompareRecipe->eraseFromParent(); 9598 } 9599 Chain = R; 9600 } 9601 } 9602 9603 // If tail is folded by masking, introduce selects between the phi 9604 // and the live-out instruction of each reduction, at the end of the latch. 9605 if (CM.foldTailByMasking()) { 9606 for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) { 9607 VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R); 9608 if (!PhiR || PhiR->isInLoop()) 9609 continue; 9610 Builder.setInsertPoint(LatchVPBB); 9611 VPValue *Cond = 9612 RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan); 9613 VPValue *Red = PhiR->getBackedgeValue(); 9614 Builder.createNaryOp(Instruction::Select, {Cond, Red, PhiR}); 9615 } 9616 } 9617 } 9618 9619 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 9620 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, 9621 VPSlotTracker &SlotTracker) const { 9622 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; 9623 IG->getInsertPos()->printAsOperand(O, false); 9624 O << ", "; 9625 getAddr()->printAsOperand(O, SlotTracker); 9626 VPValue *Mask = getMask(); 9627 if (Mask) { 9628 O << ", "; 9629 Mask->printAsOperand(O, SlotTracker); 9630 } 9631 9632 unsigned OpIdx = 0; 9633 for (unsigned i = 0; i < IG->getFactor(); ++i) { 9634 if (!IG->getMember(i)) 9635 continue; 9636 if (getNumStoreOperands() > 0) { 9637 O << "\n" << Indent << " store "; 9638 getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker); 9639 O << " to index " << i; 9640 } else { 9641 O << "\n" << Indent << " "; 9642 getVPValue(OpIdx)->printAsOperand(O, SlotTracker); 9643 O << " = load from index " << i; 9644 } 9645 ++OpIdx; 9646 } 9647 } 9648 #endif 9649 9650 void VPWidenCallRecipe::execute(VPTransformState &State) { 9651 State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this, 9652 *this, State); 9653 } 9654 9655 void VPWidenSelectRecipe::execute(VPTransformState &State) { 9656 State.ILV->widenSelectInstruction(*cast<SelectInst>(getUnderlyingInstr()), 9657 this, *this, InvariantCond, State); 9658 } 9659 9660 void VPWidenRecipe::execute(VPTransformState &State) { 9661 State.ILV->widenInstruction(*getUnderlyingInstr(), this, *this, State); 9662 } 9663 9664 void VPWidenGEPRecipe::execute(VPTransformState &State) { 9665 State.ILV->widenGEP(cast<GetElementPtrInst>(getUnderlyingInstr()), this, 9666 *this, State.UF, State.VF, IsPtrLoopInvariant, 9667 IsIndexLoopInvariant, State); 9668 } 9669 9670 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { 9671 assert(!State.Instance && "Int or FP induction being replicated."); 9672 State.ILV->widenIntOrFpInduction(IV, 
getStartValue()->getLiveInIRValue(), 9673 getTruncInst(), getVPValue(0), 9674 getCastValue(), State); 9675 } 9676 9677 void VPWidenPHIRecipe::execute(VPTransformState &State) { 9678 State.ILV->widenPHIInstruction(cast<PHINode>(getUnderlyingValue()), this, 9679 State); 9680 } 9681 9682 void VPBlendRecipe::execute(VPTransformState &State) { 9683 State.ILV->setDebugLocFromInst(Phi, &State.Builder); 9684 // We know that all PHIs in non-header blocks are converted into 9685 // selects, so we don't have to worry about the insertion order and we 9686 // can just use the builder. 9687 // At this point we generate the predication tree. There may be 9688 // duplications since this is a simple recursive scan, but future 9689 // optimizations will clean it up. 9690 9691 unsigned NumIncoming = getNumIncomingValues(); 9692 9693 // Generate a sequence of selects of the form: 9694 // SELECT(Mask3, In3, 9695 // SELECT(Mask2, In2, 9696 // SELECT(Mask1, In1, 9697 // In0))) 9698 // Note that Mask0 is never used: lanes for which no path reaches this phi and 9699 // are essentially undef are taken from In0. 9700 InnerLoopVectorizer::VectorParts Entry(State.UF); 9701 for (unsigned In = 0; In < NumIncoming; ++In) { 9702 for (unsigned Part = 0; Part < State.UF; ++Part) { 9703 // We might have single edge PHIs (blocks) - use an identity 9704 // 'select' for the first PHI operand. 9705 Value *In0 = State.get(getIncomingValue(In), Part); 9706 if (In == 0) 9707 Entry[Part] = In0; // Initialize with the first incoming value. 9708 else { 9709 // Select between the current value and the previous incoming edge 9710 // based on the incoming mask. 9711 Value *Cond = State.get(getMask(In), Part); 9712 Entry[Part] = 9713 State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi"); 9714 } 9715 } 9716 } 9717 for (unsigned Part = 0; Part < State.UF; ++Part) 9718 State.set(this, Entry[Part], Part); 9719 } 9720 9721 void VPInterleaveRecipe::execute(VPTransformState &State) { 9722 assert(!State.Instance && "Interleave group being replicated."); 9723 State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(), 9724 getStoredValues(), getMask()); 9725 } 9726 9727 void VPReductionRecipe::execute(VPTransformState &State) { 9728 assert(!State.Instance && "Reduction being replicated."); 9729 Value *PrevInChain = State.get(getChainOp(), 0); 9730 for (unsigned Part = 0; Part < State.UF; ++Part) { 9731 RecurKind Kind = RdxDesc->getRecurrenceKind(); 9732 bool IsOrdered = State.ILV->useOrderedReductions(*RdxDesc); 9733 Value *NewVecOp = State.get(getVecOp(), Part); 9734 if (VPValue *Cond = getCondOp()) { 9735 Value *NewCond = State.get(Cond, Part); 9736 VectorType *VecTy = cast<VectorType>(NewVecOp->getType()); 9737 Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity( 9738 Kind, VecTy->getElementType(), RdxDesc->getFastMathFlags()); 9739 Constant *IdenVec = 9740 ConstantVector::getSplat(VecTy->getElementCount(), Iden); 9741 Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec); 9742 NewVecOp = Select; 9743 } 9744 Value *NewRed; 9745 Value *NextInChain; 9746 if (IsOrdered) { 9747 if (State.VF.isVector()) 9748 NewRed = createOrderedReduction(State.Builder, *RdxDesc, NewVecOp, 9749 PrevInChain); 9750 else 9751 NewRed = State.Builder.CreateBinOp( 9752 (Instruction::BinaryOps)getUnderlyingInstr()->getOpcode(), 9753 PrevInChain, NewVecOp); 9754 PrevInChain = NewRed; 9755 } else { 9756 PrevInChain = State.get(getChainOp(), Part); 9757 NewRed = createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp); 
    }
    if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
      NextInChain =
          createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(),
                         NewRed, PrevInChain);
    } else if (IsOrdered)
      NextInChain = NewRed;
    else {
      NextInChain = State.Builder.CreateBinOp(
          (Instruction::BinaryOps)getUnderlyingInstr()->getOpcode(), NewRed,
          PrevInChain);
    }
    State.set(this, NextInChain, Part);
  }
}

void VPReplicateRecipe::execute(VPTransformState &State) {
  if (State.Instance) { // Generate a single instance.
    assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
    State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *this,
                                    *State.Instance, IsPredicated, State);
    // Insert scalar instance packing it into a vector.
    if (AlsoPack && State.VF.isVector()) {
      // If we're constructing lane 0, initialize to start from poison.
      if (State.Instance->Lane.isFirstLane()) {
        assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
        Value *Poison = PoisonValue::get(
            VectorType::get(getUnderlyingValue()->getType(), State.VF));
        State.set(this, Poison, State.Instance->Part);
      }
      State.ILV->packScalarIntoVectorValue(this, *State.Instance, State);
    }
    return;
  }

  // Generate scalar instances for all VF lanes of all UF parts, unless the
  // instruction is uniform, in which case generate only the first lane for
  // each of the UF parts.
  unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();
  assert((!State.VF.isScalable() || IsUniform) &&
         "Can't scalarize a scalable vector");
  for (unsigned Part = 0; Part < State.UF; ++Part)
    for (unsigned Lane = 0; Lane < EndLane; ++Lane)
      State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *this,
                                      VPIteration(Part, Lane), IsPredicated,
                                      State);
}

void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Branch on Mask works only on single instance.");

  unsigned Part = State.Instance->Part;
  unsigned Lane = State.Instance->Lane.getKnownLane();

  Value *ConditionBit = nullptr;
  VPValue *BlockInMask = getMask();
  if (BlockInMask) {
    ConditionBit = State.get(BlockInMask, Part);
    if (ConditionBit->getType()->isVectorTy())
      ConditionBit = State.Builder.CreateExtractElement(
          ConditionBit, State.Builder.getInt32(Lane));
  } else // Block in mask is all-one.
    ConditionBit = State.Builder.getTrue();

  // Replace the temporary unreachable terminator with a new conditional branch,
  // whose two destinations will be set later when they are created.
9824 auto *CurrentTerminator = State.CFG.PrevBB->getTerminator(); 9825 assert(isa<UnreachableInst>(CurrentTerminator) && 9826 "Expected to replace unreachable terminator with conditional branch."); 9827 auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit); 9828 CondBr->setSuccessor(0, nullptr); 9829 ReplaceInstWithInst(CurrentTerminator, CondBr); 9830 } 9831 9832 void VPPredInstPHIRecipe::execute(VPTransformState &State) { 9833 assert(State.Instance && "Predicated instruction PHI works per instance."); 9834 Instruction *ScalarPredInst = 9835 cast<Instruction>(State.get(getOperand(0), *State.Instance)); 9836 BasicBlock *PredicatedBB = ScalarPredInst->getParent(); 9837 BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor(); 9838 assert(PredicatingBB && "Predicated block has no single predecessor."); 9839 assert(isa<VPReplicateRecipe>(getOperand(0)) && 9840 "operand must be VPReplicateRecipe"); 9841 9842 // By current pack/unpack logic we need to generate only a single phi node: if 9843 // a vector value for the predicated instruction exists at this point it means 9844 // the instruction has vector users only, and a phi for the vector value is 9845 // needed. In this case the recipe of the predicated instruction is marked to 9846 // also do that packing, thereby "hoisting" the insert-element sequence. 9847 // Otherwise, a phi node for the scalar value is needed. 9848 unsigned Part = State.Instance->Part; 9849 if (State.hasVectorValue(getOperand(0), Part)) { 9850 Value *VectorValue = State.get(getOperand(0), Part); 9851 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); 9852 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); 9853 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. 9854 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. 9855 if (State.hasVectorValue(this, Part)) 9856 State.reset(this, VPhi, Part); 9857 else 9858 State.set(this, VPhi, Part); 9859 // NOTE: Currently we need to update the value of the operand, so the next 9860 // predicated iteration inserts its generated value in the correct vector. 9861 State.reset(getOperand(0), VPhi, Part); 9862 } else { 9863 Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType(); 9864 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); 9865 Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()), 9866 PredicatingBB); 9867 Phi->addIncoming(ScalarPredInst, PredicatedBB); 9868 if (State.hasScalarValue(this, *State.Instance)) 9869 State.reset(this, Phi, *State.Instance); 9870 else 9871 State.set(this, Phi, *State.Instance); 9872 // NOTE: Currently we need to update the value of the operand, so the next 9873 // predicated iteration inserts its generated value in the correct vector. 9874 State.reset(getOperand(0), Phi, *State.Instance); 9875 } 9876 } 9877 9878 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { 9879 VPValue *StoredValue = isStore() ? getStoredValue() : nullptr; 9880 State.ILV->vectorizeMemoryInstruction( 9881 &Ingredient, State, StoredValue ? nullptr : getVPSingleValue(), getAddr(), 9882 StoredValue, getMask()); 9883 } 9884 9885 // Determine how to lower the scalar epilogue, which depends on 1) optimising 9886 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing 9887 // predication, and 4) a TTI hook that analyses whether the loop is suitable 9888 // for predication. 
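// The checks below are ordered by precedence: the first one that applies
// determines the result, so e.g. a function optimized for size never reaches
// the PreferPredicateOverEpilogue option or the TTI hook.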
9889 static ScalarEpilogueLowering getScalarEpilogueLowering( 9890 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, 9891 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, 9892 AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, 9893 LoopVectorizationLegality &LVL) { 9894 // 1) OptSize takes precedence over all other options, i.e. if this is set, 9895 // don't look at hints or options, and don't request a scalar epilogue. 9896 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from 9897 // LoopAccessInfo (due to code dependency and not being able to reliably get 9898 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection 9899 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without 9900 // versioning when the vectorization is forced, unlike hasOptSize. So revert 9901 // back to the old way and vectorize with versioning when forced. See D81345.) 9902 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, 9903 PGSOQueryType::IRPass) && 9904 Hints.getForce() != LoopVectorizeHints::FK_Enabled)) 9905 return CM_ScalarEpilogueNotAllowedOptSize; 9906 9907 // 2) If set, obey the directives 9908 if (PreferPredicateOverEpilogue.getNumOccurrences()) { 9909 switch (PreferPredicateOverEpilogue) { 9910 case PreferPredicateTy::ScalarEpilogue: 9911 return CM_ScalarEpilogueAllowed; 9912 case PreferPredicateTy::PredicateElseScalarEpilogue: 9913 return CM_ScalarEpilogueNotNeededUsePredicate; 9914 case PreferPredicateTy::PredicateOrDontVectorize: 9915 return CM_ScalarEpilogueNotAllowedUsePredicate; 9916 }; 9917 } 9918 9919 // 3) If set, obey the hints 9920 switch (Hints.getPredicate()) { 9921 case LoopVectorizeHints::FK_Enabled: 9922 return CM_ScalarEpilogueNotNeededUsePredicate; 9923 case LoopVectorizeHints::FK_Disabled: 9924 return CM_ScalarEpilogueAllowed; 9925 }; 9926 9927 // 4) if the TTI hook indicates this is profitable, request predication. 9928 if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, 9929 LVL.getLAI())) 9930 return CM_ScalarEpilogueNotNeededUsePredicate; 9931 9932 return CM_ScalarEpilogueAllowed; 9933 } 9934 9935 Value *VPTransformState::get(VPValue *Def, unsigned Part) { 9936 // If Values have been set for this Def return the one relevant for \p Part. 9937 if (hasVectorValue(Def, Part)) 9938 return Data.PerPartOutput[Def][Part]; 9939 9940 if (!hasScalarValue(Def, {Part, 0})) { 9941 Value *IRV = Def->getLiveInIRValue(); 9942 Value *B = ILV->getBroadcastInstrs(IRV); 9943 set(Def, B, Part); 9944 return B; 9945 } 9946 9947 Value *ScalarValue = get(Def, {Part, 0}); 9948 // If we aren't vectorizing, we can just copy the scalar map values over 9949 // to the vector map. 9950 if (VF.isScalar()) { 9951 set(Def, ScalarValue, Part); 9952 return ScalarValue; 9953 } 9954 9955 auto *RepR = dyn_cast<VPReplicateRecipe>(Def); 9956 bool IsUniform = RepR && RepR->isUniform(); 9957 9958 unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1; 9959 // Check if there is a scalar value for the selected lane. 9960 if (!hasScalarValue(Def, {Part, LastLane})) { 9961 // At the moment, VPWidenIntOrFpInductionRecipes can also be uniform. 
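    // Treat such a definition as uniform and fall back to broadcasting lane 0.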
9962 assert(isa<VPWidenIntOrFpInductionRecipe>(Def->getDef()) && 9963 "unexpected recipe found to be invariant"); 9964 IsUniform = true; 9965 LastLane = 0; 9966 } 9967 9968 auto *LastInst = cast<Instruction>(get(Def, {Part, LastLane})); 9969 // Set the insert point after the last scalarized instruction or after the 9970 // last PHI, if LastInst is a PHI. This ensures the insertelement sequence 9971 // will directly follow the scalar definitions. 9972 auto OldIP = Builder.saveIP(); 9973 auto NewIP = 9974 isa<PHINode>(LastInst) 9975 ? BasicBlock::iterator(LastInst->getParent()->getFirstNonPHI()) 9976 : std::next(BasicBlock::iterator(LastInst)); 9977 Builder.SetInsertPoint(&*NewIP); 9978 9979 // However, if we are vectorizing, we need to construct the vector values. 9980 // If the value is known to be uniform after vectorization, we can just 9981 // broadcast the scalar value corresponding to lane zero for each unroll 9982 // iteration. Otherwise, we construct the vector values using 9983 // insertelement instructions. Since the resulting vectors are stored in 9984 // State, we will only generate the insertelements once. 9985 Value *VectorValue = nullptr; 9986 if (IsUniform) { 9987 VectorValue = ILV->getBroadcastInstrs(ScalarValue); 9988 set(Def, VectorValue, Part); 9989 } else { 9990 // Initialize packing with insertelements to start from undef. 9991 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 9992 Value *Undef = PoisonValue::get(VectorType::get(LastInst->getType(), VF)); 9993 set(Def, Undef, Part); 9994 for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane) 9995 ILV->packScalarIntoVectorValue(Def, {Part, Lane}, *this); 9996 VectorValue = get(Def, Part); 9997 } 9998 Builder.restoreIP(OldIP); 9999 return VectorValue; 10000 } 10001 10002 // Process the loop in the VPlan-native vectorization path. This path builds 10003 // VPlan upfront in the vectorization pipeline, which allows to apply 10004 // VPlan-to-VPlan transformations from the very beginning without modifying the 10005 // input LLVM IR. 10006 static bool processLoopInVPlanNativePath( 10007 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, 10008 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, 10009 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, 10010 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, 10011 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints, 10012 LoopVectorizationRequirements &Requirements) { 10013 10014 if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) { 10015 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n"); 10016 return false; 10017 } 10018 assert(EnableVPlanNativePath && "VPlan-native path is disabled."); 10019 Function *F = L->getHeader()->getParent(); 10020 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); 10021 10022 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 10023 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL); 10024 10025 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, 10026 &Hints, IAI); 10027 // Use the planner for outer loop vectorization. 10028 // TODO: CM is not used at this point inside the planner. Turn CM into an 10029 // optional argument if we don't need it in the future. 10030 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints, 10031 Requirements, ORE); 10032 10033 // Get user vectorization factor. 
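  // A width of zero means the user did not request a specific factor.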
10034 ElementCount UserVF = Hints.getWidth(); 10035 10036 CM.collectElementTypesForWidening(); 10037 10038 // Plan how to best vectorize, return the best VF and its cost. 10039 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF); 10040 10041 // If we are stress testing VPlan builds, do not attempt to generate vector 10042 // code. Masked vector code generation support will follow soon. 10043 // Also, do not attempt to vectorize if no vector code will be produced. 10044 if (VPlanBuildStressTest || EnableVPlanPredication || 10045 VectorizationFactor::Disabled() == VF) 10046 return false; 10047 10048 LVP.setBestPlan(VF.Width, 1); 10049 10050 { 10051 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, 10052 F->getParent()->getDataLayout()); 10053 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL, 10054 &CM, BFI, PSI, Checks); 10055 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" 10056 << L->getHeader()->getParent()->getName() << "\"\n"); 10057 LVP.executePlan(LB, DT); 10058 } 10059 10060 // Mark the loop as already vectorized to avoid vectorizing again. 10061 Hints.setAlreadyVectorized(); 10062 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 10063 return true; 10064 } 10065 10066 // Emit a remark if there are stores to floats that required a floating point 10067 // extension. If the vectorized loop was generated with floating point there 10068 // will be a performance penalty from the conversion overhead and the change in 10069 // the vector width. 10070 static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) { 10071 SmallVector<Instruction *, 4> Worklist; 10072 for (BasicBlock *BB : L->getBlocks()) { 10073 for (Instruction &Inst : *BB) { 10074 if (auto *S = dyn_cast<StoreInst>(&Inst)) { 10075 if (S->getValueOperand()->getType()->isFloatTy()) 10076 Worklist.push_back(S); 10077 } 10078 } 10079 } 10080 10081 // Traverse the floating point stores upwards searching, for floating point 10082 // conversions. 10083 SmallPtrSet<const Instruction *, 4> Visited; 10084 SmallPtrSet<const Instruction *, 4> EmittedRemark; 10085 while (!Worklist.empty()) { 10086 auto *I = Worklist.pop_back_val(); 10087 if (!L->contains(I)) 10088 continue; 10089 if (!Visited.insert(I).second) 10090 continue; 10091 10092 // Emit a remark if the floating point store required a floating 10093 // point conversion. 10094 // TODO: More work could be done to identify the root cause such as a 10095 // constant or a function return type and point the user to it. 10096 if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second) 10097 ORE->emit([&]() { 10098 return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision", 10099 I->getDebugLoc(), L->getHeader()) 10100 << "floating point conversion changes vector width. " 10101 << "Mixed floating point precision requires an up/down " 10102 << "cast that will negatively impact performance."; 10103 }); 10104 10105 for (Use &Op : I->operands()) 10106 if (auto *OpI = dyn_cast<Instruction>(Op)) 10107 Worklist.push_back(OpI); 10108 } 10109 } 10110 10111 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts) 10112 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced || 10113 !EnableLoopInterleaving), 10114 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced || 10115 !EnableLoopVectorization) {} 10116 10117 bool LoopVectorizePass::processLoop(Loop *L) { 10118 assert((EnableVPlanNativePath || L->isInnermost()) && 10119 "VPlan-native path is not enabled. 
Only process inner loops."); 10120 10121 #ifndef NDEBUG 10122 const std::string DebugLocStr = getDebugLocString(L); 10123 #endif /* NDEBUG */ 10124 10125 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \"" 10126 << L->getHeader()->getParent()->getName() << "\" from " 10127 << DebugLocStr << "\n"); 10128 10129 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE); 10130 10131 LLVM_DEBUG( 10132 dbgs() << "LV: Loop hints:" 10133 << " force=" 10134 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled 10135 ? "disabled" 10136 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled 10137 ? "enabled" 10138 : "?")) 10139 << " width=" << Hints.getWidth() 10140 << " interleave=" << Hints.getInterleave() << "\n"); 10141 10142 // Function containing loop 10143 Function *F = L->getHeader()->getParent(); 10144 10145 // Looking at the diagnostic output is the only way to determine if a loop 10146 // was vectorized (other than looking at the IR or machine code), so it 10147 // is important to generate an optimization remark for each loop. Most of 10148 // these messages are generated as OptimizationRemarkAnalysis. Remarks 10149 // generated as OptimizationRemark and OptimizationRemarkMissed are 10150 // less verbose reporting vectorized loops and unvectorized loops that may 10151 // benefit from vectorization, respectively. 10152 10153 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) { 10154 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); 10155 return false; 10156 } 10157 10158 PredicatedScalarEvolution PSE(*SE, *L); 10159 10160 // Check if it is legal to vectorize the loop. 10161 LoopVectorizationRequirements Requirements; 10162 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE, 10163 &Requirements, &Hints, DB, AC, BFI, PSI); 10164 if (!LVL.canVectorize(EnableVPlanNativePath)) { 10165 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); 10166 Hints.emitRemarkWithHints(); 10167 return false; 10168 } 10169 10170 // Check the function attributes and profiles to find out if this function 10171 // should be optimized for size. 10172 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 10173 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL); 10174 10175 // Entrance to the VPlan-native vectorization path. Outer loops are processed 10176 // here. They may require CFG and instruction level transformations before 10177 // even evaluating whether vectorization is profitable. Since we cannot modify 10178 // the incoming IR, we need to build VPlan upfront in the vectorization 10179 // pipeline. 10180 if (!L->isInnermost()) 10181 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, 10182 ORE, BFI, PSI, Hints, Requirements); 10183 10184 assert(L->isInnermost() && "Inner loop expected."); 10185 10186 // Check the loop for a trip count threshold: vectorize loops with a tiny trip 10187 // count by optimizing for size, to minimize overheads. 10188 auto ExpectedTC = getSmallBestKnownTC(*SE, L); 10189 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) { 10190 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. 
" 10191 << "This loop is worth vectorizing only if no scalar " 10192 << "iteration overheads are incurred."); 10193 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) 10194 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); 10195 else { 10196 LLVM_DEBUG(dbgs() << "\n"); 10197 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; 10198 } 10199 } 10200 10201 // Check the function attributes to see if implicit floats are allowed. 10202 // FIXME: This check doesn't seem possibly correct -- what if the loop is 10203 // an integer loop and the vector instructions selected are purely integer 10204 // vector instructions? 10205 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { 10206 reportVectorizationFailure( 10207 "Can't vectorize when the NoImplicitFloat attribute is used", 10208 "loop not vectorized due to NoImplicitFloat attribute", 10209 "NoImplicitFloat", ORE, L); 10210 Hints.emitRemarkWithHints(); 10211 return false; 10212 } 10213 10214 // Check if the target supports potentially unsafe FP vectorization. 10215 // FIXME: Add a check for the type of safety issue (denormal, signaling) 10216 // for the target we're vectorizing for, to make sure none of the 10217 // additional fp-math flags can help. 10218 if (Hints.isPotentiallyUnsafe() && 10219 TTI->isFPVectorizationPotentiallyUnsafe()) { 10220 reportVectorizationFailure( 10221 "Potentially unsafe FP op prevents vectorization", 10222 "loop not vectorized due to unsafe FP support.", 10223 "UnsafeFP", ORE, L); 10224 Hints.emitRemarkWithHints(); 10225 return false; 10226 } 10227 10228 if (!LVL.canVectorizeFPMath(ForceOrderedReductions)) { 10229 ORE->emit([&]() { 10230 auto *ExactFPMathInst = Requirements.getExactFPInst(); 10231 return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps", 10232 ExactFPMathInst->getDebugLoc(), 10233 ExactFPMathInst->getParent()) 10234 << "loop not vectorized: cannot prove it is safe to reorder " 10235 "floating-point operations"; 10236 }); 10237 LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to " 10238 "reorder floating-point operations\n"); 10239 Hints.emitRemarkWithHints(); 10240 return false; 10241 } 10242 10243 bool UseInterleaved = TTI->enableInterleavedAccessVectorization(); 10244 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI()); 10245 10246 // If an override option has been passed in for interleaved accesses, use it. 10247 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0) 10248 UseInterleaved = EnableInterleavedMemAccesses; 10249 10250 // Analyze interleaved memory accesses. 10251 if (UseInterleaved) { 10252 IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI)); 10253 } 10254 10255 // Use the cost model. 10256 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, 10257 F, &Hints, IAI); 10258 CM.collectValuesToIgnore(); 10259 CM.collectElementTypesForWidening(); 10260 10261 // Use the planner for vectorization. 10262 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints, 10263 Requirements, ORE); 10264 10265 // Get user vectorization factor and interleave count. 10266 ElementCount UserVF = Hints.getWidth(); 10267 unsigned UserIC = Hints.getInterleave(); 10268 10269 // Plan how to best vectorize, return the best VF and its cost. 10270 Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC); 10271 10272 VectorizationFactor VF = VectorizationFactor::Disabled(); 10273 unsigned IC = 1; 10274 10275 if (MaybeVF) { 10276 VF = *MaybeVF; 10277 // Select the interleave count. 
    IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue());
  }

  // Identify the diagnostic messages that should be produced.
  std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
  bool VectorizeLoop = true, InterleaveLoop = true;
  if (VF.Width.isScalar()) {
    LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
    VecDiagMsg = std::make_pair(
        "VectorizationNotBeneficial",
        "the cost-model indicates that vectorization is not beneficial");
    VectorizeLoop = false;
  }

  if (!MaybeVF && UserIC > 1) {
    // Tell the user interleaving was avoided up-front, despite being explicitly
    // requested.
    LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
                         "interleaving should be avoided up front\n");
    IntDiagMsg = std::make_pair(
        "InterleavingAvoided",
        "Ignoring UserIC, because interleaving was avoided up front");
    InterleaveLoop = false;
  } else if (IC == 1 && UserIC <= 1) {
    // Tell the user interleaving is not beneficial.
    LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
    IntDiagMsg = std::make_pair(
        "InterleavingNotBeneficial",
        "the cost-model indicates that interleaving is not beneficial");
    InterleaveLoop = false;
    if (UserIC == 1) {
      IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
      IntDiagMsg.second +=
          " and is explicitly disabled or interleave count is set to 1";
    }
  } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it is explicitly disabled.
    LLVM_DEBUG(
        dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
    IntDiagMsg = std::make_pair(
        "InterleavingBeneficialButDisabled",
        "the cost-model indicates that interleaving is beneficial "
        "but is explicitly disabled or interleave count is set to 1");
    InterleaveLoop = false;
  }

  // Override IC if user provided an interleave count.
  IC = UserIC > 0 ? UserIC : IC;

  // Emit diagnostic messages, if any.
  const char *VAPassName = Hints.vectorizeAnalysisPassName();
  if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
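    // Emit both missed-optimization remarks so the user can see why each
    // transformation was rejected.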
10331 ORE->emit([&]() { 10332 return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first, 10333 L->getStartLoc(), L->getHeader()) 10334 << VecDiagMsg.second; 10335 }); 10336 ORE->emit([&]() { 10337 return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first, 10338 L->getStartLoc(), L->getHeader()) 10339 << IntDiagMsg.second; 10340 }); 10341 return false; 10342 } else if (!VectorizeLoop && InterleaveLoop) { 10343 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 10344 ORE->emit([&]() { 10345 return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first, 10346 L->getStartLoc(), L->getHeader()) 10347 << VecDiagMsg.second; 10348 }); 10349 } else if (VectorizeLoop && !InterleaveLoop) { 10350 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width 10351 << ") in " << DebugLocStr << '\n'); 10352 ORE->emit([&]() { 10353 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first, 10354 L->getStartLoc(), L->getHeader()) 10355 << IntDiagMsg.second; 10356 }); 10357 } else if (VectorizeLoop && InterleaveLoop) { 10358 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width 10359 << ") in " << DebugLocStr << '\n'); 10360 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 10361 } 10362 10363 bool DisableRuntimeUnroll = false; 10364 MDNode *OrigLoopID = L->getLoopID(); 10365 { 10366 // Optimistically generate runtime checks. Drop them if they turn out to not 10367 // be profitable. Limit the scope of Checks, so the cleanup happens 10368 // immediately after vector codegeneration is done. 10369 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, 10370 F->getParent()->getDataLayout()); 10371 if (!VF.Width.isScalar() || IC > 1) 10372 Checks.Create(L, *LVL.getLAI(), PSE.getUnionPredicate()); 10373 LVP.setBestPlan(VF.Width, IC); 10374 10375 using namespace ore; 10376 if (!VectorizeLoop) { 10377 assert(IC > 1 && "interleave count should not be 1 or 0"); 10378 // If we decided that it is not legal to vectorize the loop, then 10379 // interleave it. 10380 InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, 10381 &CM, BFI, PSI, Checks); 10382 LVP.executePlan(Unroller, DT); 10383 10384 ORE->emit([&]() { 10385 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(), 10386 L->getHeader()) 10387 << "interleaved loop (interleaved count: " 10388 << NV("InterleaveCount", IC) << ")"; 10389 }); 10390 } else { 10391 // If we decided that it is *legal* to vectorize the loop, then do it. 10392 10393 // Consider vectorizing the epilogue too if it's profitable. 10394 VectorizationFactor EpilogueVF = 10395 CM.selectEpilogueVectorizationFactor(VF.Width, LVP); 10396 if (EpilogueVF.Width.isVector()) { 10397 10398 // The first pass vectorizes the main loop and creates a scalar epilogue 10399 // to be vectorized by executing the plan (potentially with a different 10400 // factor) again shortly afterwards. 10401 EpilogueLoopVectorizationInfo EPI(VF.Width.getKnownMinValue(), IC, 10402 EpilogueVF.Width.getKnownMinValue(), 10403 1); 10404 EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, 10405 EPI, &LVL, &CM, BFI, PSI, Checks); 10406 10407 LVP.setBestPlan(EPI.MainLoopVF, EPI.MainLoopUF); 10408 LVP.executePlan(MainILV, DT); 10409 ++LoopsVectorized; 10410 10411 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */); 10412 formLCSSARecursively(*L, *DT, LI, SE); 10413 10414 // Second pass vectorizes the epilogue and adjusts the control flow 10415 // edges from the first pass. 
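        // From here on the epilogue VF/UF act as the "main" factors for this
        // second code-generation pass.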
10416 LVP.setBestPlan(EPI.EpilogueVF, EPI.EpilogueUF); 10417 EPI.MainLoopVF = EPI.EpilogueVF; 10418 EPI.MainLoopUF = EPI.EpilogueUF; 10419 EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC, 10420 ORE, EPI, &LVL, &CM, BFI, PSI, 10421 Checks); 10422 LVP.executePlan(EpilogILV, DT); 10423 ++LoopsEpilogueVectorized; 10424 10425 if (!MainILV.areSafetyChecksAdded()) 10426 DisableRuntimeUnroll = true; 10427 } else { 10428 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC, 10429 &LVL, &CM, BFI, PSI, Checks); 10430 LVP.executePlan(LB, DT); 10431 ++LoopsVectorized; 10432 10433 // Add metadata to disable runtime unrolling a scalar loop when there 10434 // are no runtime checks about strides and memory. A scalar loop that is 10435 // rarely used is not worth unrolling. 10436 if (!LB.areSafetyChecksAdded()) 10437 DisableRuntimeUnroll = true; 10438 } 10439 // Report the vectorization decision. 10440 ORE->emit([&]() { 10441 return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(), 10442 L->getHeader()) 10443 << "vectorized loop (vectorization width: " 10444 << NV("VectorizationFactor", VF.Width) 10445 << ", interleaved count: " << NV("InterleaveCount", IC) << ")"; 10446 }); 10447 } 10448 10449 if (ORE->allowExtraAnalysis(LV_NAME)) 10450 checkMixedPrecision(L, ORE); 10451 } 10452 10453 Optional<MDNode *> RemainderLoopID = 10454 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 10455 LLVMLoopVectorizeFollowupEpilogue}); 10456 if (RemainderLoopID.hasValue()) { 10457 L->setLoopID(RemainderLoopID.getValue()); 10458 } else { 10459 if (DisableRuntimeUnroll) 10460 AddRuntimeUnrollDisableMetaData(L); 10461 10462 // Mark the loop as already vectorized to avoid vectorizing again. 10463 Hints.setAlreadyVectorized(); 10464 } 10465 10466 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 10467 return true; 10468 } 10469 10470 LoopVectorizeResult LoopVectorizePass::runImpl( 10471 Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_, 10472 DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_, 10473 DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_, 10474 std::function<const LoopAccessInfo &(Loop &)> &GetLAA_, 10475 OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) { 10476 SE = &SE_; 10477 LI = &LI_; 10478 TTI = &TTI_; 10479 DT = &DT_; 10480 BFI = &BFI_; 10481 TLI = TLI_; 10482 AA = &AA_; 10483 AC = &AC_; 10484 GetLAA = &GetLAA_; 10485 DB = &DB_; 10486 ORE = &ORE_; 10487 PSI = PSI_; 10488 10489 // Don't attempt if 10490 // 1. the target claims to have no vector registers, and 10491 // 2. interleaving won't help ILP. 10492 // 10493 // The second condition is necessary because, even if the target has no 10494 // vector registers, loop vectorization may still enable scalar 10495 // interleaving. 10496 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) && 10497 TTI->getMaxInterleaveFactor(1) < 2) 10498 return LoopVectorizeResult(false, false); 10499 10500 bool Changed = false, CFGChanged = false; 10501 10502 // The vectorizer requires loops to be in simplified form. 10503 // Since simplification may add new inner loops, it has to run before the 10504 // legality and profitability checks. This means running the loop vectorizer 10505 // will simplify all loops, regardless of whether anything end up being 10506 // vectorized. 
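  // Any simplification is also recorded as a CFG change so that the right
  // analyses are invalidated afterwards.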
10507 for (auto &L : *LI) 10508 Changed |= CFGChanged |= 10509 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */); 10510 10511 // Build up a worklist of inner-loops to vectorize. This is necessary as 10512 // the act of vectorizing or partially unrolling a loop creates new loops 10513 // and can invalidate iterators across the loops. 10514 SmallVector<Loop *, 8> Worklist; 10515 10516 for (Loop *L : *LI) 10517 collectSupportedLoops(*L, LI, ORE, Worklist); 10518 10519 LoopsAnalyzed += Worklist.size(); 10520 10521 // Now walk the identified inner loops. 10522 while (!Worklist.empty()) { 10523 Loop *L = Worklist.pop_back_val(); 10524 10525 // For the inner loops we actually process, form LCSSA to simplify the 10526 // transform. 10527 Changed |= formLCSSARecursively(*L, *DT, LI, SE); 10528 10529 Changed |= CFGChanged |= processLoop(L); 10530 } 10531 10532 // Process each loop nest in the function. 10533 return LoopVectorizeResult(Changed, CFGChanged); 10534 } 10535 10536 PreservedAnalyses LoopVectorizePass::run(Function &F, 10537 FunctionAnalysisManager &AM) { 10538 auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F); 10539 auto &LI = AM.getResult<LoopAnalysis>(F); 10540 auto &TTI = AM.getResult<TargetIRAnalysis>(F); 10541 auto &DT = AM.getResult<DominatorTreeAnalysis>(F); 10542 auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F); 10543 auto &TLI = AM.getResult<TargetLibraryAnalysis>(F); 10544 auto &AA = AM.getResult<AAManager>(F); 10545 auto &AC = AM.getResult<AssumptionAnalysis>(F); 10546 auto &DB = AM.getResult<DemandedBitsAnalysis>(F); 10547 auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F); 10548 10549 auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager(); 10550 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 10551 [&](Loop &L) -> const LoopAccessInfo & { 10552 LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, 10553 TLI, TTI, nullptr, nullptr}; 10554 return LAM.getResult<LoopAccessAnalysis>(L, AR); 10555 }; 10556 auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F); 10557 ProfileSummaryInfo *PSI = 10558 MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent()); 10559 LoopVectorizeResult Result = 10560 runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI); 10561 if (!Result.MadeAnyChange) 10562 return PreservedAnalyses::all(); 10563 PreservedAnalyses PA; 10564 10565 // We currently do not preserve loopinfo/dominator analyses with outer loop 10566 // vectorization. Until this is addressed, mark these analyses as preserved 10567 // only for non-VPlan-native path. 10568 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 10569 if (!EnableVPlanNativePath) { 10570 PA.preserve<LoopAnalysis>(); 10571 PA.preserve<DominatorTreeAnalysis>(); 10572 } 10573 if (!Result.MadeCFGChange) 10574 PA.preserveSet<CFGAnalyses>(); 10575 return PA; 10576 } 10577