//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one. (See the illustrative sketch at
// the end of this header comment.)
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
//  Data for SIMD
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
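//
// For illustration only, a simplified sketch of the transformation (not the
// exact IR this pass emits): with a vectorization factor (VF) of 4, a loop
// such as
//
//   for (int i = 0; i < n; ++i)
//     a[i] = b[i] + 1;
//
// is conceptually rewritten so that each vector iteration processes four
// consecutive elements with a single wide operation, followed by a scalar
// remainder (epilogue) loop for the leftover iterations:
//
//   int i = 0;
//   for (; i + 4 <= n; i += 4)
//     a[i..i+3] = b[i..i+3] + 1;  // one <4 x i32> wide load/add/store
//   for (; i < n; ++i)            // scalar epilogue
//     a[i] = b[i] + 1;
//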
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "LoopVectorizationPlanner.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanHCFGBuilder.h"
#include "VPlanPredicator.h"
#include "VPlanTransforms.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/InstructionCost.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
"llvm/Transforms/Vectorize/LoopVectorizationLegality.h" 144 #include <algorithm> 145 #include <cassert> 146 #include <cstdint> 147 #include <cstdlib> 148 #include <functional> 149 #include <iterator> 150 #include <limits> 151 #include <memory> 152 #include <string> 153 #include <tuple> 154 #include <utility> 155 156 using namespace llvm; 157 158 #define LV_NAME "loop-vectorize" 159 #define DEBUG_TYPE LV_NAME 160 161 #ifndef NDEBUG 162 const char VerboseDebug[] = DEBUG_TYPE "-verbose"; 163 #endif 164 165 /// @{ 166 /// Metadata attribute names 167 const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all"; 168 const char LLVMLoopVectorizeFollowupVectorized[] = 169 "llvm.loop.vectorize.followup_vectorized"; 170 const char LLVMLoopVectorizeFollowupEpilogue[] = 171 "llvm.loop.vectorize.followup_epilogue"; 172 /// @} 173 174 STATISTIC(LoopsVectorized, "Number of loops vectorized"); 175 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization"); 176 STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized"); 177 178 static cl::opt<bool> EnableEpilogueVectorization( 179 "enable-epilogue-vectorization", cl::init(true), cl::Hidden, 180 cl::desc("Enable vectorization of epilogue loops.")); 181 182 static cl::opt<unsigned> EpilogueVectorizationForceVF( 183 "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden, 184 cl::desc("When epilogue vectorization is enabled, and a value greater than " 185 "1 is specified, forces the given VF for all applicable epilogue " 186 "loops.")); 187 188 static cl::opt<unsigned> EpilogueVectorizationMinVF( 189 "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden, 190 cl::desc("Only loops with vectorization factor equal to or larger than " 191 "the specified value are considered for epilogue vectorization.")); 192 193 /// Loops with a known constant trip count below this number are vectorized only 194 /// if no scalar iteration overheads are incurred. 195 static cl::opt<unsigned> TinyTripCountVectorThreshold( 196 "vectorizer-min-trip-count", cl::init(16), cl::Hidden, 197 cl::desc("Loops with a constant trip count that is smaller than this " 198 "value are vectorized only if no scalar iteration overheads " 199 "are incurred.")); 200 201 static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold( 202 "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden, 203 cl::desc("The maximum allowed number of runtime memory checks with a " 204 "vectorize(enable) pragma.")); 205 206 // Option prefer-predicate-over-epilogue indicates that an epilogue is undesired, 207 // that predication is preferred, and this lists all options. I.e., the 208 // vectorizer will try to fold the tail-loop (epilogue) into the vector body 209 // and predicate the instructions accordingly. 
// Option prefer-predicate-over-epilogue indicates that an epilogue is
// undesired, that predication is preferred, and this lists all options. I.e.,
// the vectorizer will try to fold the tail-loop (epilogue) into the vector
// body and predicate the instructions accordingly. If tail-folding fails,
// there are different fallback strategies depending on these values:
namespace PreferPredicateTy {
enum Option {
  ScalarEpilogue = 0,
  PredicateElseScalarEpilogue,
  PredicateOrDontVectorize
};
} // namespace PreferPredicateTy

static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
    "prefer-predicate-over-epilogue",
    cl::init(PreferPredicateTy::ScalarEpilogue),
    cl::Hidden,
    cl::desc("Tail-folding and predication preferences over creating a scalar "
             "epilogue loop."),
    cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
                          "scalar-epilogue",
                          "Don't tail-predicate loops, create scalar epilogue"),
               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
                          "predicate-else-scalar-epilogue",
                          "prefer tail-folding, create scalar epilogue if tail "
                          "folding fails."),
               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
                          "predicate-dont-vectorize",
                          "prefer tail-folding, don't attempt vectorization if "
                          "tail-folding fails.")));
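
// For illustration (a simplified example, not tied to any particular target):
// with tail folding, a loop with trip count 10 and VF 4 executes three vector
// iterations; the first two run with an all-true lane mask and the last runs
// with only the first two lanes enabled, so no scalar epilogue loop is needed.
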
static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));

static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
             "below this number"));

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

static cl::opt<bool> ForceTargetSupportsScalableVectors(
    "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
    cl::desc(
        "Pretend that scalable vectors are supported, even if the target does "
        "not support them. This flag should only be used for testing."));

static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// Interleave small loops with scalar reductions.
static cl::opt<bool> InterleaveSmallLoopScalarReduction(
    "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
    cl::desc("Enable interleaving for loops with small iteration counts that "
             "contain scalar reductions to expose ILP."));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if-predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

static cl::opt<bool>
    PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
                           cl::Hidden,
                           cl::desc("Prefer in-loop vector reductions, "
                                    "overriding the target's preference."));

static cl::opt<bool> ForceOrderedReductions(
    "force-ordered-reductions", cl::init(false), cl::Hidden,
    cl::desc("Enable the vectorisation of loops with in-order (strict) "
             "FP reductions"));

static cl::opt<bool> PreferPredicatedReductionSelect(
    "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
    cl::desc(
        "Prefer predicating a reduction operation over an after-loop select."));

cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));

// FIXME: Remove this switch once we have divergence analysis. Currently we
// assume divergent non-backedge branches when this switch is true.
cl::opt<bool> EnableVPlanPredication(
    "enable-vplan-predication", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path predicator with "
             "support for outer loop vectorization."));

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

cl::opt<bool> PrintVPlansInDotFormat(
    "vplan-print-in-dot-format", cl::init(false), cl::Hidden,
    cl::desc("Use dot format instead of plain text when dumping VPlans"));

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type.
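/// For example (illustrative; the exact sizes depend on the DataLayout): under
/// a typical x86-64 data layout, x86_fp80 has a type size of 80 bits but an
/// allocation size of 128 bits, so it is irregular, whereas i32 and double
/// (type size equal to alloc size) are not.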
static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
  // Determine if an array of N elements of type Ty is "bitcast compatible"
  // with a <N x Ty> vector.
  // This is only true if there is no padding between the array elements.
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}

/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

/// Returns "best known" trip count for the specified loop \p L as defined by
/// the following procedure:
///   1) Returns exact trip count if it is known.
///   2) Returns expected trip count according to profile data if any.
///   3) Returns upper bound estimate if it is known.
///   4) Returns None if all of the above failed.
static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
  // Check if exact trip count is known.
  if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
    return ExpectedTC;

  // Check if there is an expected trip count available from profile data.
  if (LoopVectorizeWithBlockFrequency)
    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
      return EstimatedTC;

  // Check if upper bound estimate is known.
  if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
    return ExpectedTC;

  return None;
}

// Forward declare GeneratedRTChecks.
class GeneratedRTChecks;

namespace llvm {

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found, for a given vectorization factor.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                      ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
        PSI(PSI), RTChecks(RTChecks) {
    // Query this against the original loop and save it here because the
    // profile of the original loop header may change as the transformation
    // happens.
    OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
        OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
  }

  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop that will contain vectorized instructions later
  /// on, while the old loop will be used as the scalar remainder. Control flow
  /// is generated around the vectorized (and scalar epilogue) loops consisting
  /// of various checks and bypasses. Return the pre-header block of the new
  /// loop.
  /// In the case of epilogue vectorization, this function is overridden to
  /// handle the more complex control flow around the loops.
  virtual BasicBlock *createVectorizedLoopSkeleton();

  /// Widen a single instruction within the innermost loop.
  void widenInstruction(Instruction &I, VPValue *Def, VPUser &Operands,
                        VPTransformState &State);

  /// Widen a single call instruction within the innermost loop.
  void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
                            VPTransformState &State);

  /// Widen a single select instruction within the innermost loop.
  void widenSelectInstruction(SelectInst &I, VPValue *VPDef, VPUser &Operands,
                              bool InvariantCond, VPTransformState &State);

  /// Fix the vectorized code, taking care of header phis, live-outs, and more.
  void fixVectorizedLoop(VPTransformState &State);

  // Return true if any runtime check is added.
  bool areSafetyChecksAdded() { return AddedSafetyChecks; }

  /// A type for vectorized values in the new loop. Each value from the
  /// original loop, when vectorized, is represented by UF vector values in the
  /// new unrolled loop, where UF is the unroll factor.
  using VectorParts = SmallVector<Value *, 2>;

  /// Vectorize a single GetElementPtrInst based on information gathered and
  /// decisions taken during planning.
  void widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, VPUser &Indices,
                unsigned UF, ElementCount VF, bool IsPtrLoopInvariant,
                SmallBitVector &IsIndexLoopInvariant, VPTransformState &State);

  /// Vectorize a single first-order recurrence or pointer induction PHINode in
  /// a block. This method handles the induction variable canonicalization. It
  /// supports both VF = 1 for unrolled loops and arbitrary length vectors.
  void widenPHIInstruction(Instruction *PN, VPWidenPHIRecipe *PhiR,
                           VPTransformState &State);

  /// A helper function to scalarize a single Instruction in the innermost
  /// loop. Generates the scalar instance for the part and lane given by \p
  /// Instance. Uses the VPValue operands from \p Operands instead of \p
  /// Instr's operands.
  void scalarizeInstruction(Instruction *Instr, VPValue *Def, VPUser &Operands,
                            const VPIteration &Instance, bool IfPredicateInstr,
                            VPTransformState &State);

  /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
  /// is provided, the integer induction variable will first be truncated to
  /// the corresponding type.
  void widenIntOrFpInduction(PHINode *IV, Value *Start, TruncInst *Trunc,
                             VPValue *Def, VPValue *CastDef,
                             VPTransformState &State);

  /// Construct the vector value of a scalarized value \p V one lane at a time.
  void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance,
                                 VPTransformState &State);

  /// Try to vectorize interleaved access group \p Group with the base address
  /// given in \p Addr, optionally masking the vector operations if \p
  /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
  /// values in the vectorized loop.
  void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
                                ArrayRef<VPValue *> VPDefs,
                                VPTransformState &State, VPValue *Addr,
                                ArrayRef<VPValue *> StoredValues,
                                VPValue *BlockInMask = nullptr);

  /// Vectorize Load and Store instructions with the base address given in \p
  /// Addr, optionally masking the vector operations if \p BlockInMask is
  /// non-null. Use \p State to translate given VPValues to IR values in the
  /// vectorized loop.
  void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
                                  VPValue *Def, VPValue *Addr,
                                  VPValue *StoredValue, VPValue *BlockInMask,
                                  bool ConsecutiveStride, bool Reverse);

  /// Set the debug location in the builder \p CustomBuilder using the debug
  /// location of \p V. If \p CustomBuilder is None, the class member's Builder
  /// is used.
  void setDebugLocFromInst(const Value *V,
                           Optional<IRBuilder<> *> CustomBuilder = None);

  /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
  void fixNonInductionPHIs(VPTransformState &State);

  /// Returns true if the reordering of FP operations is not allowed, but we
  /// are able to vectorize with strict in-order reductions for the given
  /// RdxDesc.
  bool useOrderedReductions(RecurrenceDescriptor &RdxDesc);

  /// Create a broadcast instruction. This method generates a broadcast
  /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, ...
  /// This is needed because each iteration in the loop corresponds to a SIMD
  /// element.
  virtual Value *getBroadcastInstrs(Value *V);

protected:
  friend class LoopVectorizationPlanner;

  /// A small list of PHINodes.
  using PhiVector = SmallVector<PHINode *, 4>;

  /// A type for scalarized values in the new loop. Each value from the
  /// original loop, when scalarized, is represented by UF x VF scalar values
  /// in the new unrolled loop, where UF is the unroll factor and VF is the
  /// vectorization factor.
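  /// For example (illustrative): with UF = 2 and VF = 4, a scalarized value is
  /// held as 2 inner vectors of 4 scalar Values each, addressed as
  /// [Part][Lane].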
  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;

  /// Set up the values of the IVs correctly when exiting the vector loop.
  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                    Value *CountRoundDown, Value *EndValue,
                    BasicBlock *MiddleBlock);

  /// Create a new induction variable inside L.
  PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
                                   Value *Step, Instruction *DL);

  /// Handle all cross-iteration phis in the header.
  void fixCrossIterationPHIs(VPTransformState &State);

  /// Create the exit value of first order recurrences in the middle block and
  /// update their users.
  void fixFirstOrderRecurrence(VPWidenPHIRecipe *PhiR, VPTransformState &State);

  /// Create code for the loop exit value of the reduction.
  void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State);

  /// Clear NSW/NUW flags from reduction instructions if necessary.
  void clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc,
                               VPTransformState &State);

  /// Fixup the LCSSA phi nodes in the unique exit block. This simply
  /// means we need to add the appropriate incoming value from the middle
  /// block as exiting edges from the scalar epilogue loop (if present) are
  /// already in place, and we exit the vector loop exclusively to the middle
  /// block.
  void fixLCSSAPHIs(VPTransformState &State);

  /// Iteratively sink the scalarized operands of a predicated instruction into
  /// the block that was created for it.
  void sinkScalarOperands(Instruction *PredInst);

  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
  /// represented as.
  void truncateToMinimalBitwidths(VPTransformState &State);

  /// This function adds
  /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
  /// to each vector element of Val. The sequence starts at \p StartIdx.
  /// \p Opcode is relevant for FP induction variables.
  virtual Value *
  getStepVector(Value *Val, Value *StartIdx, Value *Step,
                Instruction::BinaryOps Opcode = Instruction::BinaryOpsEnd);

  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
  /// variable on which to base the steps, \p Step is the size of the step, and
  /// \p EntryVal is the value from the original loop that maps to the steps.
  /// Note that \p EntryVal doesn't have to be an induction variable - it
  /// can also be a truncate instruction.
  void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
                        const InductionDescriptor &ID, VPValue *Def,
                        VPValue *CastDef, VPTransformState &State);

  /// Create a vector induction phi node based on an existing scalar one. \p
  /// EntryVal is the value from the original loop that maps to the vector phi
  /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
  /// truncate instruction, instead of widening the original IV, we widen a
  /// version of the IV truncated to \p EntryVal's type.
  void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
                                       Value *Step, Value *Start,
                                       Instruction *EntryVal, VPValue *Def,
                                       VPValue *CastDef,
                                       VPTransformState &State);

  /// Returns true if an instruction \p I should be scalarized instead of
  /// vectorized for the chosen vectorization factor.
  bool shouldScalarizeInstruction(Instruction *I) const;

  /// Returns true if we should generate a scalar version of \p IV.
  bool needsScalarInduction(Instruction *IV) const;

  /// If there is a cast involved in the induction variable \p ID, which should
  /// be ignored in the vectorized loop body, this function records the
  /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
  /// cast. We had already proved that the casted Phi is equal to the uncasted
  /// Phi in the vectorized loop (under a runtime guard), and therefore
  /// there is no need to vectorize the cast - the same value can be used in the
  /// vector loop for both the Phi and the cast.
  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
  /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
  ///
  /// \p EntryVal is the value from the original loop that maps to the vector
  /// phi node and is used to distinguish what is the IV currently being
  /// processed - original one (if \p EntryVal is a phi corresponding to the
  /// original IV) or the "newly-created" one based on the proof mentioned above
  /// (see also buildScalarSteps() and createVectorIntOrFpInductionPHI()). In
  /// the latter case \p EntryVal is a TruncInst and we must not record anything
  /// for that IV, but it's error-prone to expect callers of this routine to
  /// care about that, hence this explicit parameter.
  void recordVectorLoopValueForInductionCast(
      const InductionDescriptor &ID, const Instruction *EntryVal,
      Value *VectorLoopValue, VPValue *CastDef, VPTransformState &State,
      unsigned Part, unsigned Lane = UINT_MAX);

  /// Generate a shuffle sequence that will reverse the vector Vec.
  virtual Value *reverseVector(Value *Vec);

  /// Returns (and creates if needed) the original loop trip count.
  Value *getOrCreateTripCount(Loop *NewLoop);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(Loop *NewLoop);

  /// Returns a bitcasted value to the requested vector type.
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                const DataLayout &DL);

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct. Returns the block containing the checks or
  /// nullptr if no checks have been added.
  BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  /// Returns the block containing the checks or nullptr if no checks have been
  /// added.
  BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);

  /// Compute the transformed value of Index at offset StartValue using step
  /// StepValue.
  /// For integer induction, returns StartValue + Index * StepValue.
  /// For pointer induction, returns StartValue[Index * StepValue].
  /// FIXME: The newly created binary instructions should contain nsw/nuw
  /// flags, which can be found from the original scalar operations.
  Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
                              const DataLayout &DL,
                              const InductionDescriptor &ID) const;

  /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
  /// vector loop preheader, middle block and scalar preheader. Also
  /// allocate a loop object for the new vector loop and return it.
  Loop *createVectorLoopSkeleton(StringRef Prefix);

  /// Create new phi nodes for the induction variables to resume iteration
  /// count in the scalar epilogue, from where the vectorized loop left off
  /// (given by \p VectorTripCount).
  /// In cases where the loop skeleton is more complicated (e.g. epilogue
  /// vectorization) and the resume values can come from an additional bypass
  /// block, the \p AdditionalBypass pair provides information about the bypass
  /// block and the end value on the edge from bypass to this loop.
  void createInductionResumeValues(
      Loop *L, Value *VectorTripCount,
      std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});

  /// Complete the loop skeleton by adding debug MDs, creating appropriate
  /// conditional branches in the middle block, preparing the builder and
  /// running the verifier. Take in the vector loop \p L as argument, and return
  /// the preheader of the completed vector loop.
  BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);

  /// Add additional metadata to \p To that was not present on \p Orig.
  ///
  /// Currently this is used to add the noalias annotations based on the
  /// inserted memchecks. Use this for instructions that are *cloned* into the
  /// vector loop.
  void addNewMetadata(Instruction *To, const Instruction *Orig);

  /// Add metadata from one instruction to another.
  ///
  /// This includes both the original MDs from \p From and additional ones (\see
  /// addNewMetadata). Use this for *newly created* instructions in the vector
  /// loop.
  void addMetadata(Instruction *To, Instruction *From);

  /// Similar to the previous function but it adds the metadata to a
  /// vector of instructions.
  void addMetadata(ArrayRef<Value *> To, Instruction *From);

  /// Allow subclasses to override and print debug traces before/after vplan
  /// execution, when trace information is requested.
  virtual void printDebugTracesAtStart(){};
  virtual void printDebugTracesAtEnd(){};

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Alias Analysis.
  AAResults *AA;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// LoopVersioning. It's only set up (non-null) if memchecks were
  /// used.
  ///
  /// This is currently only used to add no-alias metadata based on the
  /// memchecks. The actual versioning is performed manually.
  std::unique_ptr<LoopVersioning> LVer;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  ElementCount VF;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use.
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;

  /// The unique ExitBlock of the scalar loop if one exists. Note that
  /// there can be multiple exiting edges reaching this block.
  BasicBlock *LoopExitBlock;

  /// The vector loop body.
  BasicBlock *LoopVectorBody;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// The new Induction variable which was added to the new block.
  PHINode *Induction = nullptr;

  /// The induction variable of the old basic block.
  PHINode *OldInduction = nullptr;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
  // fixed up at the end of vector code generation.
  SmallVector<PHINode *, 8> OrigPHIsToFix;

  /// BFI and PSI are used to check for profile guided size optimizations.
  BlockFrequencyInfo *BFI;
  ProfileSummaryInfo *PSI;

  // Whether this loop should be optimized for size based on profile guided
  // size optimizations.
  bool OptForSizeBasedOnProfile;

  /// Structure to hold information about generated runtime checks, responsible
  /// for cleaning up the checks if vectorization turns out to be unprofitable.
  GeneratedRTChecks &RTChecks;
};

class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                    ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            ElementCount::getFixed(1), UnrollFactor, LVL, CM,
                            BFI, PSI, Check) {}

private:
  Value *getBroadcastInstrs(Value *V) override;
  Value *getStepVector(
      Value *Val, Value *StartIdx, Value *Step,
      Instruction::BinaryOps Opcode = Instruction::BinaryOpsEnd) override;
  Value *reverseVector(Value *Vec) override;
};

/// Encapsulate information regarding vectorization of a loop and its epilogue.
/// This information is meant to be updated and used across two stages of
/// epilogue vectorization.
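/// For example (an illustrative configuration, not a fixed policy): the main
/// loop might be vectorized with MainLoopVF = 8 and MainLoopUF = 2, with the
/// remaining iterations handled by an epilogue loop with EpilogueVF = 4 and
/// EpilogueUF = 1, and any further remainder by the scalar loop.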
struct EpilogueLoopVectorizationInfo {
  ElementCount MainLoopVF = ElementCount::getFixed(0);
  unsigned MainLoopUF = 0;
  ElementCount EpilogueVF = ElementCount::getFixed(0);
  unsigned EpilogueUF = 0;
  BasicBlock *MainLoopIterationCountCheck = nullptr;
  BasicBlock *EpilogueIterationCountCheck = nullptr;
  BasicBlock *SCEVSafetyCheck = nullptr;
  BasicBlock *MemSafetyCheck = nullptr;
  Value *TripCount = nullptr;
  Value *VectorTripCount = nullptr;

  EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF,
                                ElementCount EVF, unsigned EUF)
      : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) {
    assert(EUF == 1 &&
           "A high UF for the epilogue loop is likely not beneficial.");
  }
};

/// An extension of the inner loop vectorizer that creates a skeleton for a
/// vectorized loop that has its epilogue (residual) also vectorized.
/// The idea is to run the vplan on a given loop twice, first to set up the
/// skeleton and vectorize the main loop, and second to complete the skeleton
/// from the first step and vectorize the epilogue. This is achieved by
/// deriving two concrete strategy classes from this base class and invoking
/// them in succession from the loop vectorizer planner.
class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
public:
  InnerLoopAndEpilogueVectorizer(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI,
                            Checks),
        EPI(EPI) {}

  // Override this function to handle the more complex control flow around the
  // three loops.
  BasicBlock *createVectorizedLoopSkeleton() final override {
    return createEpilogueVectorizedLoopSkeleton();
  }

  /// The interface for creating a vectorized skeleton using one of two
  /// different strategies, each corresponding to one execution of the vplan
  /// as described above.
  virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0;

  /// Holds and updates state information required to vectorize the main loop
  /// and its epilogue in two separate passes. This setup helps us avoid
  /// regenerating and recomputing runtime safety checks. It also helps us to
  /// shorten the iteration-count-check path length for the cases where the
  /// iteration count of the loop is so small that the main vector loop is
  /// completely skipped.
  EpilogueLoopVectorizationInfo &EPI;
};

/// A specialized derived class of inner loop vectorizer that performs
/// vectorization of *main* loops in the process of vectorizing loops and their
/// epilogues.
class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerMainLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Check)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Check) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *main loop* strategy (i.e., the first pass of vplan execution).
  BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check once for the main loop (when \p
  /// ForEpilogue is false) and once for the epilogue loop (when \p
  /// ForEpilogue is true).
  BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass,
                                             bool ForEpilogue);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};

// A specialized derived class of inner loop vectorizer that performs
// vectorization of *epilogue* loops in the process of vectorizing loops and
// their epilogues.
class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerEpilogueLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Checks) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *epilogue loop* strategy (i.e., the second pass of vplan execution).
  BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check after the main vector loop has
  /// finished to see if there are any iterations left to execute by either
  /// the vector epilogue or the scalar epilogue.
  BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L,
                                                      BasicBlock *Bypass,
                                                      BasicBlock *Insert);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};
} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its
/// operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return I;

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I;

  for (Use &Op : I->operands()) {
    if (Instruction *OpInst = dyn_cast<Instruction>(Op))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }

  return I;
}

void InnerLoopVectorizer::setDebugLocFromInst(
    const Value *V, Optional<IRBuilder<> *> CustomBuilder) {
  IRBuilder<> *B = (CustomBuilder == None) ? &Builder : *CustomBuilder;
  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(V)) {
    const DILocation *DIL = Inst->getDebugLoc();

    // When an FSDiscriminator is enabled, we don't need to add the multiply
    // factors to the discriminators.
    if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
        !isa<DbgInfoIntrinsic>(Inst) && !EnableFSDiscriminator) {
      // FIXME: For scalable vectors, assume vscale=1.
      auto NewDIL =
          DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
      if (NewDIL)
        B->SetCurrentDebugLocation(NewDIL.getValue());
      else
        LLVM_DEBUG(dbgs()
                   << "Failed to create new discriminator: "
                   << DIL->getFilename() << " Line: " << DIL->getLine());
    } else
      B->SetCurrentDebugLocation(DIL);
  } else
    B->SetCurrentDebugLocation(DebugLoc());
}

/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
/// is passed, the message relates to that particular instruction.
#ifndef NDEBUG
static void debugVectorizationMessage(const StringRef Prefix,
                                      const StringRef DebugMsg,
                                      Instruction *I) {
  dbgs() << "LV: " << Prefix << DebugMsg;
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}
#endif

/// Create an analysis remark that explains why vectorization failed
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
/// RemarkName is the identifier for the remark. If \p I is passed it is an
/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
/// the location of the remark. \return the remark object that can be
/// streamed to.
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
                                                   StringRef RemarkName,
                                                   Loop *TheLoop,
                                                   Instruction *I) {
  Value *CodeRegion = TheLoop->getHeader();
  DebugLoc DL = TheLoop->getStartLoc();

  if (I) {
    CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, fall back to
    // using the loop's.
    if (I->getDebugLoc())
      DL = I->getDebugLoc();
  }

  return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
}

/// Return a value for Step multiplied by VF.
static Value *createStepForVF(IRBuilder<> &B, Type *Ty, ElementCount VF,
                              int64_t Step) {
  assert(Ty->isIntegerTy() && "Expected an integer step");
  Constant *StepVal = ConstantInt::get(Ty, Step * VF.getKnownMinValue());
  return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
}

namespace llvm {

/// Return the runtime value for VF.
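/// For example (illustrative): for a scalable VF of <vscale x 4> this emits
/// vscale * 4 (using the llvm.vscale intrinsic), while for a fixed VF of 4 it
/// simply returns the constant 4.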
Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF) {
  Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue());
  return VF.isScalable() ? B.CreateVScale(EC) : EC;
}

static Value *getRuntimeVFAsFloat(IRBuilder<> &B, Type *FTy, ElementCount VF) {
  assert(FTy->isFloatingPointTy() && "Expected floating point type!");
  Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits());
  Value *RuntimeVF = getRuntimeVF(B, IntTy, VF);
  return B.CreateUIToFP(RuntimeVF, FTy);
}

void reportVectorizationFailure(const StringRef DebugMsg,
                                const StringRef OREMsg, const StringRef ORETag,
                                OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                                Instruction *I) {
  LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(
      createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
      << "loop not vectorized: " << OREMsg);
}

void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
                             OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                             Instruction *I) {
  LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(
      createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
      << Msg);
}

} // end namespace llvm

#ifndef NDEBUG
/// \return string containing a file name and a line # for the given loop.
static std::string getDebugLocString(const Loop *L) {
  std::string Result;
  if (L) {
    raw_string_ostream OS(Result);
    if (const DebugLoc LoopDbgLoc = L->getStartLoc())
      LoopDbgLoc.print(OS);
    else
      // Just print the module name.
      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
    OS.flush();
  }
  return Result;
}
#endif

void InnerLoopVectorizer::addNewMetadata(Instruction *To,
                                         const Instruction *Orig) {
  // If the loop was versioned with memchecks, add the corresponding no-alias
  // metadata.
  if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
    LVer->annotateInstWithNoAlias(To, Orig);
}

void InnerLoopVectorizer::addMetadata(Instruction *To,
                                      Instruction *From) {
  propagateMetadata(To, From);
  addNewMetadata(To, From);
}

void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
                                      Instruction *From) {
  for (Value *V : To) {
    if (Instruction *I = dyn_cast<Instruction>(V))
      addMetadata(I, From);
  }
}

namespace llvm {

// Loop vectorization cost-model hints how the scalar epilogue loop should be
// lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorisation with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate,

  // Directive indicating we must either tail fold or not vectorize
  CM_ScalarEpilogueNotAllowedUsePredicate
};

/// ElementCountComparator creates a total ordering for ElementCount
/// for the purposes of using it in a set structure.
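/// For example (illustrative), the induced order is:
///   1 < 2 < 8 < vscale x 1 < vscale x 8
/// i.e. all fixed counts precede all scalable counts, and each group is
/// ordered by its known minimum value.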
struct ElementCountComparator {
  bool operator()(const ElementCount &LHS, const ElementCount &RHS) const {
    return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) <
           std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue());
  }
};
using ElementCountSet = SmallSet<ElementCount, 16, ElementCountComparator>;

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because of
/// a number of reasons. In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
class LoopVectorizationCostModel {
public:
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {}

  /// \return An upper bound for the vectorization factors (both fixed and
  /// scalable). If the factors are 0, vectorization and interleaving should be
  /// avoided up front.
  FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
  bool runtimeChecksRequired();

  /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every VF in \p CandidateVFs. If UserVF is not ZERO
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
  VectorizationFactor
  selectVectorizationFactor(const ElementCountSet &CandidateVFs);

  VectorizationFactor
  selectEpilogueVectorizationFactor(const ElementCount MaxVF,
                                    const LoopVectorizationPlanner &LVP);

  /// Setup cost-based decisions for user vectorization factor.
  /// \return true if the UserVF is a feasible VF to be chosen.
  bool selectUserVectorizationFactor(ElementCount UserVF) {
    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);
    return expectedCost(UserVF).first.isValid();
  }

  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// \return The desired interleave count.
  /// If interleave count has been specified by metadata it will be returned.
  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
  /// are the selected vectorization factor and the cost of the selected VF.
  unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);

  /// A memory access instruction may be vectorized in more than one way. The
  /// form of the instruction after vectorization depends on cost. This
  /// function takes cost-based decisions for Load/Store instructions and
  /// collects them in a map. The resulting decision map is used for building
  /// the lists of loop-uniform and loop-scalar instructions. The calculated
  /// cost is saved with the widening decision in order to avoid redundant
  /// calculations.
  void setCostBasedWideningDecision(ElementCount VF);

  /// A struct that represents some properties of the register usage
  /// of a loop.
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
    /// Holds the maximum number of concurrent live intervals in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
  };

  /// \return Returns information about the register usages of the loop for the
  /// given vectorization factors.
  SmallVector<RegisterUsage, 8>
  calculateRegisterUsage(ArrayRef<ElementCount> VFs);

  /// Collect values we want to ignore in the cost model.
  void collectValuesToIgnore();

  /// Collect all element types in the loop for which widening is needed.
  void collectElementTypesForWidening();

  /// Split reductions into those that happen in the loop, and those that happen
  /// outside. In-loop reductions are collected into InLoopReductionChains.
  void collectInLoopReductions();

  /// Returns true if we should use strict in-order reductions for the given
  /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
  /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
  /// of FP operations.
  bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) {
    return !Hints->allowReordering() && RdxDesc.isOrdered();
  }

  /// \returns The smallest bitwidth each instruction can be represented with.
  /// The vector equivalents of these instructions should be truncated to this
  /// type.
  const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
    return MinBWs;
  }

  /// \returns True if it is more profitable to scalarize instruction \p I for
  /// vectorization factor \p VF.
  bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
    assert(VF.isVector() &&
           "Profitable to scalarize relevant only for VF > 1.");

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto Scalars = InstsToScalarize.find(VF);
    assert(Scalars != InstsToScalarize.end() &&
           "VF not yet analyzed for scalarization profitability");
    return Scalars->second.find(I) != Scalars->second.end();
  }

  /// Returns true if \p I is known to be uniform after vectorization.
  bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
    if (VF.isScalar())
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto UniformsPerVF = Uniforms.find(VF);
    assert(UniformsPerVF != Uniforms.end() &&
           "VF not yet analyzed for uniformity");
    return UniformsPerVF->second.count(I);
  }

  /// Returns true if \p I is known to be scalar after vectorization.
  bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
    if (VF.isScalar())
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto ScalarsPerVF = Scalars.find(VF);
    assert(ScalarsPerVF != Scalars.end() &&
           "Scalar values are not calculated for VF");
    return ScalarsPerVF->second.count(I);
  }

  /// \returns True if instruction \p I can be truncated to a smaller bitwidth
  /// for vectorization factor \p VF.
  bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
    return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
           !isProfitableToScalarize(I, VF) &&
           !isScalarAfterVectorization(I, VF);
  }

  /// Decision that was taken during cost calculation for memory instruction.
  enum InstWidening {
    CM_Unknown,
    CM_Widen,         // For consecutive accesses with stride +1.
    CM_Widen_Reverse, // For consecutive accesses with stride -1.
    CM_Interleave,
    CM_GatherScatter,
    CM_Scalarize
  };

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// instruction \p I and vector width \p VF.
  void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
                           InstructionCost Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
  }

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// interleaving group \p Grp and vector width \p VF.
  void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
                           ElementCount VF, InstWidening W,
                           InstructionCost Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    // Broadcast this decision to all instructions inside the group.
    // But the cost will be assigned to one instruction only.
1419 for (unsigned i = 0; i < Grp->getFactor(); ++i) { 1420 if (auto *I = Grp->getMember(i)) { 1421 if (Grp->getInsertPos() == I) 1422 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost); 1423 else 1424 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0); 1425 } 1426 } 1427 } 1428 1429 /// Return the cost model decision for the given instruction \p I and vector 1430 /// width \p VF. Return CM_Unknown if this instruction did not pass 1431 /// through the cost modeling. 1432 InstWidening getWideningDecision(Instruction *I, ElementCount VF) const { 1433 assert(VF.isVector() && "Expected VF to be a vector VF"); 1434 // Cost model is not run in the VPlan-native path - return conservative 1435 // result until this changes. 1436 if (EnableVPlanNativePath) 1437 return CM_GatherScatter; 1438 1439 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF); 1440 auto Itr = WideningDecisions.find(InstOnVF); 1441 if (Itr == WideningDecisions.end()) 1442 return CM_Unknown; 1443 return Itr->second.first; 1444 } 1445 1446 /// Return the vectorization cost for the given instruction \p I and vector 1447 /// width \p VF. 1448 InstructionCost getWideningCost(Instruction *I, ElementCount VF) { 1449 assert(VF.isVector() && "Expected VF >=2"); 1450 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF); 1451 assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() && 1452 "The cost is not calculated"); 1453 return WideningDecisions[InstOnVF].second; 1454 } 1455 1456 /// Return True if instruction \p I is an optimizable truncate whose operand 1457 /// is an induction variable. Such a truncate will be removed by adding a new 1458 /// induction variable with the destination type. 1459 bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) { 1460 // If the instruction is not a truncate, return false. 1461 auto *Trunc = dyn_cast<TruncInst>(I); 1462 if (!Trunc) 1463 return false; 1464 1465 // Get the source and destination types of the truncate. 1466 Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF); 1467 Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF); 1468 1469 // If the truncate is free for the given types, return false. Replacing a 1470 // free truncate with an induction variable would add an induction variable 1471 // update instruction to each iteration of the loop. We exclude from this 1472 // check the primary induction variable since it will need an update 1473 // instruction regardless. 1474 Value *Op = Trunc->getOperand(0); 1475 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy)) 1476 return false; 1477 1478 // If the truncated value is not an induction variable, return false. 1479 return Legal->isInductionPhi(Op); 1480 } 1481 1482 /// Collects the instructions to scalarize for each predicated instruction in 1483 /// the loop. 1484 void collectInstsToScalarize(ElementCount VF); 1485 1486 /// Collect Uniform and Scalar values for the given \p VF. 1487 /// The sets depend on CM decision for Load/Store instructions 1488 /// that may be vectorized as interleave, gather-scatter or scalarized. 1489 void collectUniformsAndScalars(ElementCount VF) { 1490 // Do the analysis once. 
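    // A scalar VF never needs these decisions, and an existing Uniforms entry
    // for this VF means the decisions below have already been computed.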
    if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end())
      return;
    setCostBasedWideningDecision(VF);
    collectLoopUniforms(VF);
    collectLoopScalars(VF);
  }

  /// Returns true if the target machine supports masked store operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const {
    return Legal->isConsecutivePtr(DataType, Ptr) &&
           TTI.isLegalMaskedStore(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked load operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const {
    return Legal->isConsecutivePtr(DataType, Ptr) &&
           TTI.isLegalMaskedLoad(DataType, Alignment);
  }

  /// Returns true if the target machine can represent \p V as a masked gather
  /// or scatter operation.
  bool isLegalGatherOrScatter(Value *V) {
    bool LI = isa<LoadInst>(V);
    bool SI = isa<StoreInst>(V);
    if (!LI && !SI)
      return false;
    auto *Ty = getLoadStoreType(V);
    Align Align = getLoadStoreAlignment(V);
    return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
           (SI && TTI.isLegalMaskedScatter(Ty, Align));
  }

  /// Returns true if the target machine supports all of the reduction
  /// variables found for the given VF.
  bool canVectorizeReductions(ElementCount VF) const {
    return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
      const RecurrenceDescriptor &RdxDesc = Reduction.second;
      return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
    }));
  }

  /// Returns true if \p I is an instruction that will be scalarized with
  /// predication. Such instructions include conditional stores and
  /// instructions that may divide by zero.
  /// If a non-zero VF has been calculated, we check if \p I will be
  /// scalarized with predication for that VF.
  bool isScalarWithPredication(Instruction *I) const;

  // Returns true if \p I is an instruction that will be predicated either
  // through scalar predication or masked load/store or masked gather/scatter.
  // Superset of instructions that return true for isScalarWithPredication.
  bool isPredicatedInst(Instruction *I) {
    if (!blockNeedsPredication(I->getParent()))
      return false;
    // Loads and stores that need some form of masked operation are predicated
    // instructions.
    if (isa<LoadInst>(I) || isa<StoreInst>(I))
      return Legal->isMaskRequired(I);
    return isScalarWithPredication(I);
  }

  /// Returns true if \p I is a memory instruction with consecutive memory
  /// access that can be widened.
  bool
  memoryInstructionCanBeWidened(Instruction *I,
                                ElementCount VF = ElementCount::getFixed(1));

  /// Returns true if \p I is a memory instruction in an interleaved-group
  /// of memory accesses that can be vectorized with wide vector loads/stores
  /// and shuffles.
  bool
  interleavedAccessCanBeWidened(Instruction *I,
                                ElementCount VF = ElementCount::getFixed(1));

  /// Check if \p Instr belongs to any interleaved access group.
  bool isAccessInterleaved(Instruction *Instr) {
    return InterleaveInfo.isInterleaved(Instr);
  }

  /// Get the interleaved access group that \p Instr belongs to.
  const InterleaveGroup<Instruction> *
  getInterleavedAccessGroup(Instruction *Instr) {
    return InterleaveInfo.getInterleaveGroup(Instr);
  }

  /// Returns true if we're required to use a scalar epilogue for at least
  /// the final iteration of the original loop.
  bool requiresScalarEpilogue(ElementCount VF) const {
    if (!isScalarEpilogueAllowed())
      return false;
    // If we might exit from anywhere but the latch, we must run the exiting
    // iteration in scalar form.
    if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
      return true;
    return VF.isVector() && InterleaveInfo.requiresScalarEpilogue();
  }

  /// Returns true if a scalar epilogue is not allowed due to optsize or a
  /// loop hint annotation.
  bool isScalarEpilogueAllowed() const {
    return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
  }

  /// Returns true if all loop blocks should be masked in order to fold the
  /// tail of the loop.
  bool foldTailByMasking() const { return FoldTailByMasking; }

  bool blockNeedsPredication(BasicBlock *BB) const {
    return foldTailByMasking() || Legal->blockNeedsPredication(BB);
  }

  /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
  /// nodes to the chain of instructions representing the reductions. Uses a
  /// MapVector to ensure deterministic iteration order.
  using ReductionChainMap =
      SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;

  /// Return the chain of instructions representing an inloop reduction.
  const ReductionChainMap &getInLoopReductionChains() const {
    return InLoopReductionChains;
  }

  /// Returns true if the Phi is part of an inloop reduction.
  bool isInLoopReduction(PHINode *Phi) const {
    return InLoopReductionChains.count(Phi);
  }

  /// Estimate cost of an intrinsic call instruction CI if it were vectorized
  /// with factor VF. Return the cost of the instruction, including
  /// scalarization overhead if it's needed.
  InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;

  /// Estimate cost of a call instruction CI if it were vectorized with factor
  /// VF. Return the cost of the instruction, including scalarization overhead
  /// if it's needed. The flag NeedToScalarize shows if the call needs to be
  /// scalarized, i.e. either a vector version isn't available, or it is too
  /// expensive.
  InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF,
                                    bool &NeedToScalarize) const;

  /// Returns true if the per-lane cost of VectorizationFactor A is lower than
  /// that of B.
  bool isMoreProfitable(const VectorizationFactor &A,
                        const VectorizationFactor &B) const;

  /// Invalidates decisions already taken by the cost model.
  void invalidateCostModelingDecisions() {
    WideningDecisions.clear();
    Uniforms.clear();
    Scalars.clear();
  }

private:
  unsigned NumPredStores = 0;

  /// \return An upper bound for the vectorization factors for both
  /// fixed and scalable vectorization, where the minimum-known number of
  /// elements is a power-of-2 larger than zero. If scalable vectorization is
  /// disabled or unsupported, then the scalable part will be equal to
  /// ElementCount::getScalable(0).
  FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount,
                                           ElementCount UserVF);

  /// \return the maximized element count based on the target's vector
  /// registers and the loop trip-count, but limited to a maximum safe VF.
  /// This is a helper function of computeFeasibleMaxVF.
  /// FIXME: MaxSafeVF is currently passed by reference to avoid some obscure
  /// issue that occurred on one of the buildbots which cannot be reproduced
  /// without having access to the proprietary compiler (see comments on
  /// D98509). The issue is currently under investigation and this workaround
  /// will be removed as soon as possible.
  ElementCount getMaximizedVFForTarget(unsigned ConstTripCount,
                                       unsigned SmallestType,
                                       unsigned WidestType,
                                       const ElementCount &MaxSafeVF);

  /// \return the maximum legal scalable VF, based on the safe max number
  /// of elements.
  ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);

  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
  using VectorizationCostTy = std::pair<InstructionCost, bool>;

  /// Returns the expected execution cost. The unit of the cost does
  /// not matter because we use the 'cost' units to compare different
  /// vector widths. The cost that is returned is *not* normalized by
  /// the factor width. If \p Invalid is not nullptr, this function
  /// will add a pair(Instruction*, ElementCount) to \p Invalid for
  /// each instruction that has an Invalid cost for the given VF.
  using InstructionVFPair = std::pair<Instruction *, ElementCount>;
  VectorizationCostTy
  expectedCost(ElementCount VF,
               SmallVectorImpl<InstructionVFPair> *Invalid = nullptr);

  /// Returns the execution time cost of an instruction for a given vector
  /// width. Vector width of one means scalar.
  VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);

  /// The cost-computation logic from getInstructionCost which provides
  /// the vector type as an output parameter.
  InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
                                     Type *&VectorTy);

  /// Return the cost of instructions in an inloop reduction pattern, if I is
  /// part of that pattern.
  Optional<InstructionCost>
  getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
                          TTI::TargetCostKind CostKind);

  /// Calculate vectorization cost of memory instruction \p I.
  InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);

  /// The cost computation for scalarized memory instruction.
  InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);

  /// The cost computation for interleaving group of memory instructions.
  InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);

  /// The cost computation for Gather/Scatter instruction.
  InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);

  /// The cost computation for widening instruction \p I with consecutive
  /// memory access.
  InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);

  /// The cost calculation for Load/Store instruction \p I with uniform
  /// pointer -
  /// Load: scalar load + broadcast.
  /// Store: scalar store + (loop invariant value stored? 0 : extract of last
  /// element)
  InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);

  /// Estimate the overhead of scalarizing an instruction. This is a
  /// convenience wrapper for the type-based getScalarizationOverhead API.
  InstructionCost getScalarizationOverhead(Instruction *I,
                                           ElementCount VF) const;

  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
  bool isConsecutiveLoadOrStore(Instruction *I);

  /// Returns true if an artificially high cost for emulated masked memrefs
  /// should be used.
  bool useEmulatedMaskMemRefHack(Instruction *I);

  /// Map of scalar integer values to the smallest bitwidth they can be legally
  /// represented as. The vector equivalents of these values should be
  /// truncated to this type.
  MapVector<Instruction *, uint64_t> MinBWs;

  /// A type representing the costs for instructions if they were to be
  /// scalarized rather than vectorized. The entries are Instruction-Cost
  /// pairs.
  using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;

  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as a predicated block.
  SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;

  /// Records whether it is allowed to have the original scalar loop execute at
  /// least once. This may be needed as a fallback loop in case runtime
  /// aliasing/dependence checks fail, or to handle the tail/remainder
  /// iterations when the trip count is unknown or doesn't divide by the VF,
  /// or as a peel-loop to handle gaps in interleave-groups.
  /// Under optsize and when the trip count is very small we don't allow any
  /// iterations to execute in the scalar loop.
  ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;

  /// All blocks of the loop are to be masked in order to fold the tail of the
  /// scalar iterations.
  bool FoldTailByMasking = false;

  /// A map holding scalar costs for different vectorization factors. The
  /// presence of a cost for an instruction in the mapping indicates that the
  /// instruction will be scalarized when vectorizing with the associated
  /// vectorization factor. The entries are VF-ScalarCostTy pairs.
  DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;

  /// Holds the instructions known to be uniform after vectorization.
  /// The data is collected per VF.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;

  /// Holds the instructions known to be scalar after vectorization.
  /// The data is collected per VF.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;

  /// Holds the instructions (address computations) that are forced to be
  /// scalarized.
1782 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars; 1783 1784 /// PHINodes of the reductions that should be expanded in-loop along with 1785 /// their associated chains of reduction operations, in program order from top 1786 /// (PHI) to bottom 1787 ReductionChainMap InLoopReductionChains; 1788 1789 /// A Map of inloop reduction operations and their immediate chain operand. 1790 /// FIXME: This can be removed once reductions can be costed correctly in 1791 /// vplan. This was added to allow quick lookup to the inloop operations, 1792 /// without having to loop through InLoopReductionChains. 1793 DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains; 1794 1795 /// Returns the expected difference in cost from scalarizing the expression 1796 /// feeding a predicated instruction \p PredInst. The instructions to 1797 /// scalarize and their scalar costs are collected in \p ScalarCosts. A 1798 /// non-negative return value implies the expression will be scalarized. 1799 /// Currently, only single-use chains are considered for scalarization. 1800 int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts, 1801 ElementCount VF); 1802 1803 /// Collect the instructions that are uniform after vectorization. An 1804 /// instruction is uniform if we represent it with a single scalar value in 1805 /// the vectorized loop corresponding to each vector iteration. Examples of 1806 /// uniform instructions include pointer operands of consecutive or 1807 /// interleaved memory accesses. Note that although uniformity implies an 1808 /// instruction will be scalar, the reverse is not true. In general, a 1809 /// scalarized instruction will be represented by VF scalar values in the 1810 /// vectorized loop, each corresponding to an iteration of the original 1811 /// scalar loop. 1812 void collectLoopUniforms(ElementCount VF); 1813 1814 /// Collect the instructions that are scalar after vectorization. An 1815 /// instruction is scalar if it is known to be uniform or will be scalarized 1816 /// during vectorization. Non-uniform scalarized instructions will be 1817 /// represented by VF values in the vectorized loop, each corresponding to an 1818 /// iteration of the original scalar loop. 1819 void collectLoopScalars(ElementCount VF); 1820 1821 /// Keeps cost model vectorization decision and cost for instructions. 1822 /// Right now it is used for memory instructions only. 1823 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>, 1824 std::pair<InstWidening, InstructionCost>>; 1825 1826 DecisionList WideningDecisions; 1827 1828 /// Returns true if \p V is expected to be vectorized and it needs to be 1829 /// extracted. 1830 bool needsExtract(Value *V, ElementCount VF) const { 1831 Instruction *I = dyn_cast<Instruction>(V); 1832 if (VF.isScalar() || !I || !TheLoop->contains(I) || 1833 TheLoop->isLoopInvariant(I)) 1834 return false; 1835 1836 // Assume we can vectorize V (and hence we need extraction) if the 1837 // scalars are not computed yet. This can happen, because it is called 1838 // via getScalarizationOverhead from setCostBasedWideningDecision, before 1839 // the scalars are collected. That should be a safe assumption in most 1840 // cases, because we check if the operands have vectorizable types 1841 // beforehand in LoopVectorizationLegality. 1842 return Scalars.find(VF) == Scalars.end() || 1843 !isScalarAfterVectorization(I, VF); 1844 }; 1845 1846 /// Returns a range containing only operands needing to be extracted. 
1847 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, 1848 ElementCount VF) const { 1849 return SmallVector<Value *, 4>(make_filter_range( 1850 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); 1851 } 1852 1853 /// Determines if we have the infrastructure to vectorize loop \p L and its 1854 /// epilogue, assuming the main loop is vectorized by \p VF. 1855 bool isCandidateForEpilogueVectorization(const Loop &L, 1856 const ElementCount VF) const; 1857 1858 /// Returns true if epilogue vectorization is considered profitable, and 1859 /// false otherwise. 1860 /// \p VF is the vectorization factor chosen for the original loop. 1861 bool isEpilogueVectorizationProfitable(const ElementCount VF) const; 1862 1863 public: 1864 /// The loop that we evaluate. 1865 Loop *TheLoop; 1866 1867 /// Predicated scalar evolution analysis. 1868 PredicatedScalarEvolution &PSE; 1869 1870 /// Loop Info analysis. 1871 LoopInfo *LI; 1872 1873 /// Vectorization legality. 1874 LoopVectorizationLegality *Legal; 1875 1876 /// Vector target information. 1877 const TargetTransformInfo &TTI; 1878 1879 /// Target Library Info. 1880 const TargetLibraryInfo *TLI; 1881 1882 /// Demanded bits analysis. 1883 DemandedBits *DB; 1884 1885 /// Assumption cache. 1886 AssumptionCache *AC; 1887 1888 /// Interface to emit optimization remarks. 1889 OptimizationRemarkEmitter *ORE; 1890 1891 const Function *TheFunction; 1892 1893 /// Loop Vectorize Hint. 1894 const LoopVectorizeHints *Hints; 1895 1896 /// The interleave access information contains groups of interleaved accesses 1897 /// with the same stride and close to each other. 1898 InterleavedAccessInfo &InterleaveInfo; 1899 1900 /// Values to ignore in the cost model. 1901 SmallPtrSet<const Value *, 16> ValuesToIgnore; 1902 1903 /// Values to ignore in the cost model when VF > 1. 1904 SmallPtrSet<const Value *, 16> VecValuesToIgnore; 1905 1906 /// All element types found in the loop. 1907 SmallPtrSet<Type *, 16> ElementTypesInLoop; 1908 1909 /// Profitable vector factors. 1910 SmallVector<VectorizationFactor, 8> ProfitableVFs; 1911 }; 1912 } // end namespace llvm 1913 1914 /// Helper struct to manage generating runtime checks for vectorization. 1915 /// 1916 /// The runtime checks are created up-front in temporary blocks to allow better 1917 /// estimating the cost and un-linked from the existing IR. After deciding to 1918 /// vectorize, the checks are moved back. If deciding not to vectorize, the 1919 /// temporary blocks are completely removed. 1920 class GeneratedRTChecks { 1921 /// Basic block which contains the generated SCEV checks, if any. 1922 BasicBlock *SCEVCheckBlock = nullptr; 1923 1924 /// The value representing the result of the generated SCEV checks. If it is 1925 /// nullptr, either no SCEV checks have been generated or they have been used. 1926 Value *SCEVCheckCond = nullptr; 1927 1928 /// Basic block which contains the generated memory runtime checks, if any. 1929 BasicBlock *MemCheckBlock = nullptr; 1930 1931 /// The value representing the result of the generated memory runtime checks. 1932 /// If it is nullptr, either no memory runtime checks have been generated or 1933 /// they have been used. 
  Value *MemRuntimeCheckCond = nullptr;

  DominatorTree *DT;
  LoopInfo *LI;

  SCEVExpander SCEVExp;
  SCEVExpander MemCheckExp;

public:
  GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
                    const DataLayout &DL)
      : DT(DT), LI(LI), SCEVExp(SE, DL, "scev.check"),
        MemCheckExp(SE, DL, "scev.check") {}

  /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
  /// accurately estimate the cost of the runtime checks. The blocks are
  /// un-linked from the IR and are added back during vector code generation.
  /// If there is no vector code generation, the check blocks are removed
  /// completely.
  void Create(Loop *L, const LoopAccessInfo &LAI,
              const SCEVUnionPredicate &UnionPred) {

    BasicBlock *LoopHeader = L->getHeader();
    BasicBlock *Preheader = L->getLoopPreheader();

    // Use SplitBlock to create blocks for SCEV & memory runtime checks to
    // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
    // may be used by SCEVExpander. The blocks will be un-linked from their
    // predecessors and removed from LI & DT at the end of the function.
    if (!UnionPred.isAlwaysTrue()) {
      SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
                                  nullptr, "vector.scevcheck");

      SCEVCheckCond = SCEVExp.expandCodeForPredicate(
          &UnionPred, SCEVCheckBlock->getTerminator());
    }

    const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
    if (RtPtrChecking.Need) {
      auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
      MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
                                 "vector.memcheck");

      MemRuntimeCheckCond =
          addRuntimeChecks(MemCheckBlock->getTerminator(), L,
                           RtPtrChecking.getChecks(), MemCheckExp);
      assert(MemRuntimeCheckCond &&
             "no RT checks generated although RtPtrChecking "
             "claimed checks are required");
    }

    if (!MemCheckBlock && !SCEVCheckBlock)
      return;

    // Unhook the temporary block with the checks, update various places
    // accordingly.
    if (SCEVCheckBlock)
      SCEVCheckBlock->replaceAllUsesWith(Preheader);
    if (MemCheckBlock)
      MemCheckBlock->replaceAllUsesWith(Preheader);

    if (SCEVCheckBlock) {
      SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
      new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
      Preheader->getTerminator()->eraseFromParent();
    }
    if (MemCheckBlock) {
      MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
      new UnreachableInst(Preheader->getContext(), MemCheckBlock);
      Preheader->getTerminator()->eraseFromParent();
    }

    DT->changeImmediateDominator(LoopHeader, Preheader);
    if (MemCheckBlock) {
      DT->eraseNode(MemCheckBlock);
      LI->removeBlock(MemCheckBlock);
    }
    if (SCEVCheckBlock) {
      DT->eraseNode(SCEVCheckBlock);
      LI->removeBlock(SCEVCheckBlock);
    }
  }

  /// Remove the created SCEV & memory runtime check blocks & instructions, if
  /// unused.
2019 ~GeneratedRTChecks() { 2020 SCEVExpanderCleaner SCEVCleaner(SCEVExp, *DT); 2021 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp, *DT); 2022 if (!SCEVCheckCond) 2023 SCEVCleaner.markResultUsed(); 2024 2025 if (!MemRuntimeCheckCond) 2026 MemCheckCleaner.markResultUsed(); 2027 2028 if (MemRuntimeCheckCond) { 2029 auto &SE = *MemCheckExp.getSE(); 2030 // Memory runtime check generation creates compares that use expanded 2031 // values. Remove them before running the SCEVExpanderCleaners. 2032 for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) { 2033 if (MemCheckExp.isInsertedInstruction(&I)) 2034 continue; 2035 SE.forgetValue(&I); 2036 I.eraseFromParent(); 2037 } 2038 } 2039 MemCheckCleaner.cleanup(); 2040 SCEVCleaner.cleanup(); 2041 2042 if (SCEVCheckCond) 2043 SCEVCheckBlock->eraseFromParent(); 2044 if (MemRuntimeCheckCond) 2045 MemCheckBlock->eraseFromParent(); 2046 } 2047 2048 /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and 2049 /// adjusts the branches to branch to the vector preheader or \p Bypass, 2050 /// depending on the generated condition. 2051 BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass, 2052 BasicBlock *LoopVectorPreHeader, 2053 BasicBlock *LoopExitBlock) { 2054 if (!SCEVCheckCond) 2055 return nullptr; 2056 if (auto *C = dyn_cast<ConstantInt>(SCEVCheckCond)) 2057 if (C->isZero()) 2058 return nullptr; 2059 2060 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2061 2062 BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock); 2063 // Create new preheader for vector loop. 2064 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 2065 PL->addBasicBlockToLoop(SCEVCheckBlock, *LI); 2066 2067 SCEVCheckBlock->getTerminator()->eraseFromParent(); 2068 SCEVCheckBlock->moveBefore(LoopVectorPreHeader); 2069 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2070 SCEVCheckBlock); 2071 2072 DT->addNewBlock(SCEVCheckBlock, Pred); 2073 DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock); 2074 2075 ReplaceInstWithInst( 2076 SCEVCheckBlock->getTerminator(), 2077 BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheckCond)); 2078 // Mark the check as used, to prevent it from being removed during cleanup. 2079 SCEVCheckCond = nullptr; 2080 return SCEVCheckBlock; 2081 } 2082 2083 /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts 2084 /// the branches to branch to the vector preheader or \p Bypass, depending on 2085 /// the generated condition. 2086 BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass, 2087 BasicBlock *LoopVectorPreHeader) { 2088 // Check if we generated code that checks in runtime if arrays overlap. 2089 if (!MemRuntimeCheckCond) 2090 return nullptr; 2091 2092 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2093 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2094 MemCheckBlock); 2095 2096 DT->addNewBlock(MemCheckBlock, Pred); 2097 DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock); 2098 MemCheckBlock->moveBefore(LoopVectorPreHeader); 2099 2100 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 2101 PL->addBasicBlockToLoop(MemCheckBlock, *LI); 2102 2103 ReplaceInstWithInst( 2104 MemCheckBlock->getTerminator(), 2105 BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond)); 2106 MemCheckBlock->getTerminator()->setDebugLoc( 2107 Pred->getTerminator()->getDebugLoc()); 2108 2109 // Mark the check as used, to prevent it from being removed during cleanup. 
2110 MemRuntimeCheckCond = nullptr; 2111 return MemCheckBlock; 2112 } 2113 }; 2114 2115 // Return true if \p OuterLp is an outer loop annotated with hints for explicit 2116 // vectorization. The loop needs to be annotated with #pragma omp simd 2117 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the 2118 // vector length information is not provided, vectorization is not considered 2119 // explicit. Interleave hints are not allowed either. These limitations will be 2120 // relaxed in the future. 2121 // Please, note that we are currently forced to abuse the pragma 'clang 2122 // vectorize' semantics. This pragma provides *auto-vectorization hints* 2123 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' 2124 // provides *explicit vectorization hints* (LV can bypass legal checks and 2125 // assume that vectorization is legal). However, both hints are implemented 2126 // using the same metadata (llvm.loop.vectorize, processed by 2127 // LoopVectorizeHints). This will be fixed in the future when the native IR 2128 // representation for pragma 'omp simd' is introduced. 2129 static bool isExplicitVecOuterLoop(Loop *OuterLp, 2130 OptimizationRemarkEmitter *ORE) { 2131 assert(!OuterLp->isInnermost() && "This is not an outer loop"); 2132 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); 2133 2134 // Only outer loops with an explicit vectorization hint are supported. 2135 // Unannotated outer loops are ignored. 2136 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) 2137 return false; 2138 2139 Function *Fn = OuterLp->getHeader()->getParent(); 2140 if (!Hints.allowVectorization(Fn, OuterLp, 2141 true /*VectorizeOnlyWhenForced*/)) { 2142 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); 2143 return false; 2144 } 2145 2146 if (Hints.getInterleave() > 1) { 2147 // TODO: Interleave support is future work. 2148 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " 2149 "outer loops.\n"); 2150 Hints.emitRemarkWithHints(); 2151 return false; 2152 } 2153 2154 return true; 2155 } 2156 2157 static void collectSupportedLoops(Loop &L, LoopInfo *LI, 2158 OptimizationRemarkEmitter *ORE, 2159 SmallVectorImpl<Loop *> &V) { 2160 // Collect inner loops and outer loops without irreducible control flow. For 2161 // now, only collect outer loops that have explicit vectorization hints. If we 2162 // are stress testing the VPlan H-CFG construction, we collect the outermost 2163 // loop of every loop nest. 2164 if (L.isInnermost() || VPlanBuildStressTest || 2165 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { 2166 LoopBlocksRPO RPOT(&L); 2167 RPOT.perform(LI); 2168 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { 2169 V.push_back(&L); 2170 // TODO: Collect inner loops inside marked outer loops in case 2171 // vectorization fails for the outer loop. Do not invoke 2172 // 'containsIrreducibleCFG' again for inner loops when the outer loop is 2173 // already known to be reducible. We can use an inherited attribute for 2174 // that. 2175 return; 2176 } 2177 } 2178 for (Loop *InnerL : L) 2179 collectSupportedLoops(*InnerL, LI, ORE, V); 2180 } 2181 2182 namespace { 2183 2184 /// The LoopVectorize Pass. 
2185 struct LoopVectorize : public FunctionPass { 2186 /// Pass identification, replacement for typeid 2187 static char ID; 2188 2189 LoopVectorizePass Impl; 2190 2191 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false, 2192 bool VectorizeOnlyWhenForced = false) 2193 : FunctionPass(ID), 2194 Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) { 2195 initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); 2196 } 2197 2198 bool runOnFunction(Function &F) override { 2199 if (skipFunction(F)) 2200 return false; 2201 2202 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 2203 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 2204 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 2205 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 2206 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); 2207 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); 2208 auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr; 2209 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); 2210 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 2211 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); 2212 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); 2213 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 2214 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 2215 2216 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 2217 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; 2218 2219 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC, 2220 GetLAA, *ORE, PSI).MadeAnyChange; 2221 } 2222 2223 void getAnalysisUsage(AnalysisUsage &AU) const override { 2224 AU.addRequired<AssumptionCacheTracker>(); 2225 AU.addRequired<BlockFrequencyInfoWrapperPass>(); 2226 AU.addRequired<DominatorTreeWrapperPass>(); 2227 AU.addRequired<LoopInfoWrapperPass>(); 2228 AU.addRequired<ScalarEvolutionWrapperPass>(); 2229 AU.addRequired<TargetTransformInfoWrapperPass>(); 2230 AU.addRequired<AAResultsWrapperPass>(); 2231 AU.addRequired<LoopAccessLegacyAnalysis>(); 2232 AU.addRequired<DemandedBitsWrapperPass>(); 2233 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 2234 AU.addRequired<InjectTLIMappingsLegacy>(); 2235 2236 // We currently do not preserve loopinfo/dominator analyses with outer loop 2237 // vectorization. Until this is addressed, mark these analyses as preserved 2238 // only for non-VPlan-native path. 2239 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 2240 if (!EnableVPlanNativePath) { 2241 AU.addPreserved<LoopInfoWrapperPass>(); 2242 AU.addPreserved<DominatorTreeWrapperPass>(); 2243 } 2244 2245 AU.addPreserved<BasicAAWrapperPass>(); 2246 AU.addPreserved<GlobalsAAWrapperPass>(); 2247 AU.addRequired<ProfileSummaryInfoWrapperPass>(); 2248 } 2249 }; 2250 2251 } // end anonymous namespace 2252 2253 //===----------------------------------------------------------------------===// 2254 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 2255 // LoopVectorizationCostModel and LoopVectorizationPlanner. 2256 //===----------------------------------------------------------------------===// 2257 2258 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { 2259 // We need to place the broadcast of invariant variables outside the loop, 2260 // but only if it's proven safe to do so. Else, broadcast will be inside 2261 // vector loop body. 
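  // Hoisting is safe only when the value is invariant in the original loop
  // and, if it is defined by an instruction, that definition dominates the
  // new vector preheader.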
2262 Instruction *Instr = dyn_cast<Instruction>(V); 2263 bool SafeToHoist = OrigLoop->isLoopInvariant(V) && 2264 (!Instr || 2265 DT->dominates(Instr->getParent(), LoopVectorPreHeader)); 2266 // Place the code for broadcasting invariant variables in the new preheader. 2267 IRBuilder<>::InsertPointGuard Guard(Builder); 2268 if (SafeToHoist) 2269 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2270 2271 // Broadcast the scalar into all locations in the vector. 2272 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); 2273 2274 return Shuf; 2275 } 2276 2277 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( 2278 const InductionDescriptor &II, Value *Step, Value *Start, 2279 Instruction *EntryVal, VPValue *Def, VPValue *CastDef, 2280 VPTransformState &State) { 2281 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 2282 "Expected either an induction phi-node or a truncate of it!"); 2283 2284 // Construct the initial value of the vector IV in the vector loop preheader 2285 auto CurrIP = Builder.saveIP(); 2286 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2287 if (isa<TruncInst>(EntryVal)) { 2288 assert(Start->getType()->isIntegerTy() && 2289 "Truncation requires an integer type"); 2290 auto *TruncType = cast<IntegerType>(EntryVal->getType()); 2291 Step = Builder.CreateTrunc(Step, TruncType); 2292 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); 2293 } 2294 2295 Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0); 2296 Value *SplatStart = Builder.CreateVectorSplat(VF, Start); 2297 Value *SteppedStart = 2298 getStepVector(SplatStart, Zero, Step, II.getInductionOpcode()); 2299 2300 // We create vector phi nodes for both integer and floating-point induction 2301 // variables. Here, we determine the kind of arithmetic we will perform. 2302 Instruction::BinaryOps AddOp; 2303 Instruction::BinaryOps MulOp; 2304 if (Step->getType()->isIntegerTy()) { 2305 AddOp = Instruction::Add; 2306 MulOp = Instruction::Mul; 2307 } else { 2308 AddOp = II.getInductionOpcode(); 2309 MulOp = Instruction::FMul; 2310 } 2311 2312 // Multiply the vectorization factor by the step using integer or 2313 // floating-point arithmetic as appropriate. 2314 Type *StepType = Step->getType(); 2315 Value *RuntimeVF; 2316 if (Step->getType()->isFloatingPointTy()) 2317 RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, VF); 2318 else 2319 RuntimeVF = getRuntimeVF(Builder, StepType, VF); 2320 Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF); 2321 2322 // Create a vector splat to use in the induction update. 2323 // 2324 // FIXME: If the step is non-constant, we create the vector splat with 2325 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't 2326 // handle a constant vector splat. 2327 Value *SplatVF = isa<Constant>(Mul) 2328 ? ConstantVector::getSplat(VF, cast<Constant>(Mul)) 2329 : Builder.CreateVectorSplat(VF, Mul); 2330 Builder.restoreIP(CurrIP); 2331 2332 // We may need to add the step a number of times, depending on the unroll 2333 // factor. The last of those goes into the PHI. 
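  // Part 0 maps to the vector phi itself; each later part adds SplatVF (the
  // step multiplied by the runtime VF) to the previous part, and the final
  // add becomes the phi's backedge value.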
2334 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", 2335 &*LoopVectorBody->getFirstInsertionPt()); 2336 VecInd->setDebugLoc(EntryVal->getDebugLoc()); 2337 Instruction *LastInduction = VecInd; 2338 for (unsigned Part = 0; Part < UF; ++Part) { 2339 State.set(Def, LastInduction, Part); 2340 2341 if (isa<TruncInst>(EntryVal)) 2342 addMetadata(LastInduction, EntryVal); 2343 recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, CastDef, 2344 State, Part); 2345 2346 LastInduction = cast<Instruction>( 2347 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")); 2348 LastInduction->setDebugLoc(EntryVal->getDebugLoc()); 2349 } 2350 2351 // Move the last step to the end of the latch block. This ensures consistent 2352 // placement of all induction updates. 2353 auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 2354 auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator()); 2355 auto *ICmp = cast<Instruction>(Br->getCondition()); 2356 LastInduction->moveBefore(ICmp); 2357 LastInduction->setName("vec.ind.next"); 2358 2359 VecInd->addIncoming(SteppedStart, LoopVectorPreHeader); 2360 VecInd->addIncoming(LastInduction, LoopVectorLatch); 2361 } 2362 2363 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const { 2364 return Cost->isScalarAfterVectorization(I, VF) || 2365 Cost->isProfitableToScalarize(I, VF); 2366 } 2367 2368 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const { 2369 if (shouldScalarizeInstruction(IV)) 2370 return true; 2371 auto isScalarInst = [&](User *U) -> bool { 2372 auto *I = cast<Instruction>(U); 2373 return (OrigLoop->contains(I) && shouldScalarizeInstruction(I)); 2374 }; 2375 return llvm::any_of(IV->users(), isScalarInst); 2376 } 2377 2378 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast( 2379 const InductionDescriptor &ID, const Instruction *EntryVal, 2380 Value *VectorLoopVal, VPValue *CastDef, VPTransformState &State, 2381 unsigned Part, unsigned Lane) { 2382 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 2383 "Expected either an induction phi-node or a truncate of it!"); 2384 2385 // This induction variable is not the phi from the original loop but the 2386 // newly-created IV based on the proof that casted Phi is equal to the 2387 // uncasted Phi in the vectorized loop (under a runtime guard possibly). It 2388 // re-uses the same InductionDescriptor that original IV uses but we don't 2389 // have to do any recording in this case - that is done when original IV is 2390 // processed. 2391 if (isa<TruncInst>(EntryVal)) 2392 return; 2393 2394 if (!CastDef) { 2395 assert(ID.getCastInsts().empty() && 2396 "there are casts for ID, but no CastDef"); 2397 return; 2398 } 2399 assert(!ID.getCastInsts().empty() && 2400 "there is a CastDef, but no casts for ID"); 2401 // Only the first Cast instruction in the Casts vector is of interest. 2402 // The rest of the Casts (if exist) have no uses outside the 2403 // induction update chain itself. 
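  // Record the value for the requested lane when one is given (Lane !=
  // UINT_MAX); otherwise record it for the whole part.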
2404 if (Lane < UINT_MAX) 2405 State.set(CastDef, VectorLoopVal, VPIteration(Part, Lane)); 2406 else 2407 State.set(CastDef, VectorLoopVal, Part); 2408 } 2409 2410 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start, 2411 TruncInst *Trunc, VPValue *Def, 2412 VPValue *CastDef, 2413 VPTransformState &State) { 2414 assert((IV->getType()->isIntegerTy() || IV != OldInduction) && 2415 "Primary induction variable must have an integer type"); 2416 2417 auto II = Legal->getInductionVars().find(IV); 2418 assert(II != Legal->getInductionVars().end() && "IV is not an induction"); 2419 2420 auto ID = II->second; 2421 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); 2422 2423 // The value from the original loop to which we are mapping the new induction 2424 // variable. 2425 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; 2426 2427 auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 2428 2429 // Generate code for the induction step. Note that induction steps are 2430 // required to be loop-invariant 2431 auto CreateStepValue = [&](const SCEV *Step) -> Value * { 2432 assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) && 2433 "Induction step should be loop invariant"); 2434 if (PSE.getSE()->isSCEVable(IV->getType())) { 2435 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 2436 return Exp.expandCodeFor(Step, Step->getType(), 2437 LoopVectorPreHeader->getTerminator()); 2438 } 2439 return cast<SCEVUnknown>(Step)->getValue(); 2440 }; 2441 2442 // The scalar value to broadcast. This is derived from the canonical 2443 // induction variable. If a truncation type is given, truncate the canonical 2444 // induction variable and step. Otherwise, derive these values from the 2445 // induction descriptor. 2446 auto CreateScalarIV = [&](Value *&Step) -> Value * { 2447 Value *ScalarIV = Induction; 2448 if (IV != OldInduction) { 2449 ScalarIV = IV->getType()->isIntegerTy() 2450 ? Builder.CreateSExtOrTrunc(Induction, IV->getType()) 2451 : Builder.CreateCast(Instruction::SIToFP, Induction, 2452 IV->getType()); 2453 ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID); 2454 ScalarIV->setName("offset.idx"); 2455 } 2456 if (Trunc) { 2457 auto *TruncType = cast<IntegerType>(Trunc->getType()); 2458 assert(Step->getType()->isIntegerTy() && 2459 "Truncation requires an integer step"); 2460 ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType); 2461 Step = Builder.CreateTrunc(Step, TruncType); 2462 } 2463 return ScalarIV; 2464 }; 2465 2466 // Create the vector values from the scalar IV, in the absence of creating a 2467 // vector IV. 2468 auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) { 2469 Value *Broadcasted = getBroadcastInstrs(ScalarIV); 2470 for (unsigned Part = 0; Part < UF; ++Part) { 2471 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2472 Value *StartIdx; 2473 if (Step->getType()->isFloatingPointTy()) 2474 StartIdx = getRuntimeVFAsFloat(Builder, Step->getType(), VF * Part); 2475 else 2476 StartIdx = getRuntimeVF(Builder, Step->getType(), VF * Part); 2477 2478 Value *EntryPart = 2479 getStepVector(Broadcasted, StartIdx, Step, ID.getInductionOpcode()); 2480 State.set(Def, EntryPart, Part); 2481 if (Trunc) 2482 addMetadata(EntryPart, Trunc); 2483 recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, CastDef, 2484 State, Part); 2485 } 2486 }; 2487 2488 // Fast-math-flags propagate from the original induction instruction. 
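  // The guard restores the builder's previous fast-math flags when it goes
  // out of scope.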
2489 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 2490 if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp())) 2491 Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags()); 2492 2493 // Now do the actual transformations, and start with creating the step value. 2494 Value *Step = CreateStepValue(ID.getStep()); 2495 if (VF.isZero() || VF.isScalar()) { 2496 Value *ScalarIV = CreateScalarIV(Step); 2497 CreateSplatIV(ScalarIV, Step); 2498 return; 2499 } 2500 2501 // Determine if we want a scalar version of the induction variable. This is 2502 // true if the induction variable itself is not widened, or if it has at 2503 // least one user in the loop that is not widened. 2504 auto NeedsScalarIV = needsScalarInduction(EntryVal); 2505 if (!NeedsScalarIV) { 2506 createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef, 2507 State); 2508 return; 2509 } 2510 2511 // Try to create a new independent vector induction variable. If we can't 2512 // create the phi node, we will splat the scalar induction variable in each 2513 // loop iteration. 2514 if (!shouldScalarizeInstruction(EntryVal)) { 2515 createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef, 2516 State); 2517 Value *ScalarIV = CreateScalarIV(Step); 2518 // Create scalar steps that can be used by instructions we will later 2519 // scalarize. Note that the addition of the scalar steps will not increase 2520 // the number of instructions in the loop in the common case prior to 2521 // InstCombine. We will be trading one vector extract for each scalar step. 2522 buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State); 2523 return; 2524 } 2525 2526 // All IV users are scalar instructions, so only emit a scalar IV, not a 2527 // vectorised IV. Except when we tail-fold, then the splat IV feeds the 2528 // predicate used by the masked loads/stores. 2529 Value *ScalarIV = CreateScalarIV(Step); 2530 if (!Cost->isScalarEpilogueAllowed()) 2531 CreateSplatIV(ScalarIV, Step); 2532 buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State); 2533 } 2534 2535 Value *InnerLoopVectorizer::getStepVector(Value *Val, Value *StartIdx, 2536 Value *Step, 2537 Instruction::BinaryOps BinOp) { 2538 // Create and check the types. 2539 auto *ValVTy = cast<VectorType>(Val->getType()); 2540 ElementCount VLen = ValVTy->getElementCount(); 2541 2542 Type *STy = Val->getType()->getScalarType(); 2543 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && 2544 "Induction Step must be an integer or FP"); 2545 assert(Step->getType() == STy && "Step has wrong type"); 2546 2547 SmallVector<Constant *, 8> Indices; 2548 2549 // Create a vector of consecutive numbers from zero to VF. 2550 VectorType *InitVecValVTy = ValVTy; 2551 Type *InitVecValSTy = STy; 2552 if (STy->isFloatingPointTy()) { 2553 InitVecValSTy = 2554 IntegerType::get(STy->getContext(), STy->getScalarSizeInBits()); 2555 InitVecValVTy = VectorType::get(InitVecValSTy, VLen); 2556 } 2557 Value *InitVec = Builder.CreateStepVector(InitVecValVTy); 2558 2559 // Splat the StartIdx 2560 Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx); 2561 2562 if (STy->isIntegerTy()) { 2563 InitVec = Builder.CreateAdd(InitVec, StartIdxSplat); 2564 Step = Builder.CreateVectorSplat(VLen, Step); 2565 assert(Step->getType() == Val->getType() && "Invalid step vec"); 2566 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 2567 // which can be found from the original scalar operations. 
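    // The integer induction result is Val + (StartIdx + <0, 1, ...>) * Step,
    // computed elementwise on the vector.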
2568 Step = Builder.CreateMul(InitVec, Step); 2569 return Builder.CreateAdd(Val, Step, "induction"); 2570 } 2571 2572 // Floating point induction. 2573 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && 2574 "Binary Opcode should be specified for FP induction"); 2575 InitVec = Builder.CreateUIToFP(InitVec, ValVTy); 2576 InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat); 2577 2578 Step = Builder.CreateVectorSplat(VLen, Step); 2579 Value *MulOp = Builder.CreateFMul(InitVec, Step); 2580 return Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); 2581 } 2582 2583 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, 2584 Instruction *EntryVal, 2585 const InductionDescriptor &ID, 2586 VPValue *Def, VPValue *CastDef, 2587 VPTransformState &State) { 2588 // We shouldn't have to build scalar steps if we aren't vectorizing. 2589 assert(VF.isVector() && "VF should be greater than one"); 2590 // Get the value type and ensure it and the step have the same integer type. 2591 Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); 2592 assert(ScalarIVTy == Step->getType() && 2593 "Val and Step should have the same type"); 2594 2595 // We build scalar steps for both integer and floating-point induction 2596 // variables. Here, we determine the kind of arithmetic we will perform. 2597 Instruction::BinaryOps AddOp; 2598 Instruction::BinaryOps MulOp; 2599 if (ScalarIVTy->isIntegerTy()) { 2600 AddOp = Instruction::Add; 2601 MulOp = Instruction::Mul; 2602 } else { 2603 AddOp = ID.getInductionOpcode(); 2604 MulOp = Instruction::FMul; 2605 } 2606 2607 // Determine the number of scalars we need to generate for each unroll 2608 // iteration. If EntryVal is uniform, we only need to generate the first 2609 // lane. Otherwise, we generate all VF values. 2610 bool IsUniform = 2611 Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF); 2612 unsigned Lanes = IsUniform ? 1 : VF.getKnownMinValue(); 2613 // Compute the scalar steps and save the results in State. 2614 Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(), 2615 ScalarIVTy->getScalarSizeInBits()); 2616 Type *VecIVTy = nullptr; 2617 Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr; 2618 if (!IsUniform && VF.isScalable()) { 2619 VecIVTy = VectorType::get(ScalarIVTy, VF); 2620 UnitStepVec = Builder.CreateStepVector(VectorType::get(IntStepTy, VF)); 2621 SplatStep = Builder.CreateVectorSplat(VF, Step); 2622 SplatIV = Builder.CreateVectorSplat(VF, ScalarIV); 2623 } 2624 2625 for (unsigned Part = 0; Part < UF; ++Part) { 2626 Value *StartIdx0 = createStepForVF(Builder, IntStepTy, VF, Part); 2627 2628 if (!IsUniform && VF.isScalable()) { 2629 auto *SplatStartIdx = Builder.CreateVectorSplat(VF, StartIdx0); 2630 auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec); 2631 if (ScalarIVTy->isFloatingPointTy()) 2632 InitVec = Builder.CreateSIToFP(InitVec, VecIVTy); 2633 auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep); 2634 auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul); 2635 State.set(Def, Add, Part); 2636 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State, 2637 Part); 2638 // It's useful to record the lane values too for the known minimum number 2639 // of elements so we do those below. This improves the code quality when 2640 // trying to extract the first element, for example. 
2641 } 2642 2643 if (ScalarIVTy->isFloatingPointTy()) 2644 StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy); 2645 2646 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 2647 Value *StartIdx = Builder.CreateBinOp( 2648 AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane)); 2649 // The step returned by `createStepForVF` is a runtime-evaluated value 2650 // when VF is scalable. Otherwise, it should be folded into a Constant. 2651 assert((VF.isScalable() || isa<Constant>(StartIdx)) && 2652 "Expected StartIdx to be folded to a constant when VF is not " 2653 "scalable"); 2654 auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step); 2655 auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul); 2656 State.set(Def, Add, VPIteration(Part, Lane)); 2657 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State, 2658 Part, Lane); 2659 } 2660 } 2661 } 2662 2663 void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def, 2664 const VPIteration &Instance, 2665 VPTransformState &State) { 2666 Value *ScalarInst = State.get(Def, Instance); 2667 Value *VectorValue = State.get(Def, Instance.Part); 2668 VectorValue = Builder.CreateInsertElement( 2669 VectorValue, ScalarInst, 2670 Instance.Lane.getAsRuntimeExpr(State.Builder, VF)); 2671 State.set(Def, VectorValue, Instance.Part); 2672 } 2673 2674 Value *InnerLoopVectorizer::reverseVector(Value *Vec) { 2675 assert(Vec->getType()->isVectorTy() && "Invalid type"); 2676 return Builder.CreateVectorReverse(Vec, "reverse"); 2677 } 2678 2679 // Return whether we allow using masked interleave-groups (for dealing with 2680 // strided loads/stores that reside in predicated blocks, or for dealing 2681 // with gaps). 2682 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { 2683 // If an override option has been passed in for interleaved accesses, use it. 2684 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) 2685 return EnableMaskedInterleavedMemAccesses; 2686 2687 return TTI.enableMaskedInterleavedAccessVectorization(); 2688 } 2689 2690 // Try to vectorize the interleave group that \p Instr belongs to. 2691 // 2692 // E.g. Translate following interleaved load group (factor = 3): 2693 // for (i = 0; i < N; i+=3) { 2694 // R = Pic[i]; // Member of index 0 2695 // G = Pic[i+1]; // Member of index 1 2696 // B = Pic[i+2]; // Member of index 2 2697 // ... // do something to R, G, B 2698 // } 2699 // To: 2700 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2701 // %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements 2702 // %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements 2703 // %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements 2704 // 2705 // Or translate following interleaved store group (factor = 3): 2706 // for (i = 0; i < N; i+=3) { 2707 // ... 
do something to R, G, B 2708 // Pic[i] = R; // Member of index 0 2709 // Pic[i+1] = G; // Member of index 1 2710 // Pic[i+2] = B; // Member of index 2 2711 // } 2712 // To: 2713 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> 2714 // %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u> 2715 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, 2716 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements 2717 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B 2718 void InnerLoopVectorizer::vectorizeInterleaveGroup( 2719 const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs, 2720 VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues, 2721 VPValue *BlockInMask) { 2722 Instruction *Instr = Group->getInsertPos(); 2723 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2724 2725 // Prepare for the vector type of the interleaved load/store. 2726 Type *ScalarTy = getLoadStoreType(Instr); 2727 unsigned InterleaveFactor = Group->getFactor(); 2728 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2729 auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor); 2730 2731 // Prepare for the new pointers. 2732 SmallVector<Value *, 2> AddrParts; 2733 unsigned Index = Group->getIndex(Instr); 2734 2735 // TODO: extend the masked interleaved-group support to reversed access. 2736 assert((!BlockInMask || !Group->isReverse()) && 2737 "Reversed masked interleave-group not supported."); 2738 2739 // If the group is reverse, adjust the index to refer to the last vector lane 2740 // instead of the first. We adjust the index from the first vector lane, 2741 // rather than directly getting the pointer for lane VF - 1, because the 2742 // pointer operand of the interleaved access is supposed to be uniform. For 2743 // uniform instructions, we're only required to generate a value for the 2744 // first vector lane in each unroll iteration. 2745 if (Group->isReverse()) 2746 Index += (VF.getKnownMinValue() - 1) * Group->getFactor(); 2747 2748 for (unsigned Part = 0; Part < UF; Part++) { 2749 Value *AddrPart = State.get(Addr, VPIteration(Part, 0)); 2750 setDebugLocFromInst(AddrPart); 2751 2752 // Notice current instruction could be any index. Need to adjust the address 2753 // to the member of index 0. 2754 // 2755 // E.g. a = A[i+1]; // Member of index 1 (Current instruction) 2756 // b = A[i]; // Member of index 0 2757 // Current pointer is pointed to A[i+1], adjust it to A[i]. 2758 // 2759 // E.g. A[i+1] = a; // Member of index 1 2760 // A[i] = b; // Member of index 0 2761 // A[i+2] = c; // Member of index 2 (Current instruction) 2762 // Current pointer is pointed to A[i+2], adjust it to A[i]. 2763 2764 bool InBounds = false; 2765 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts())) 2766 InBounds = gep->isInBounds(); 2767 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index)); 2768 cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds); 2769 2770 // Cast to the vector pointer type. 
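    // For instance (a sketch following the factor-3 example above, with
    // VF == 4): the group is accessed as a single <12 x i32> value, so the
    // member-0 pointer computed above is bitcast from i32* to <12 x i32>*
    // before the wide load or store is emitted.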
2771 unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace(); 2772 Type *PtrTy = VecTy->getPointerTo(AddressSpace); 2773 AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy)); 2774 } 2775 2776 setDebugLocFromInst(Instr); 2777 Value *PoisonVec = PoisonValue::get(VecTy); 2778 2779 Value *MaskForGaps = nullptr; 2780 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) { 2781 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); 2782 assert(MaskForGaps && "Mask for Gaps is required but it is null"); 2783 } 2784 2785 // Vectorize the interleaved load group. 2786 if (isa<LoadInst>(Instr)) { 2787 // For each unroll part, create a wide load for the group. 2788 SmallVector<Value *, 2> NewLoads; 2789 for (unsigned Part = 0; Part < UF; Part++) { 2790 Instruction *NewLoad; 2791 if (BlockInMask || MaskForGaps) { 2792 assert(useMaskedInterleavedAccesses(*TTI) && 2793 "masked interleaved groups are not allowed."); 2794 Value *GroupMask = MaskForGaps; 2795 if (BlockInMask) { 2796 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2797 Value *ShuffledMask = Builder.CreateShuffleVector( 2798 BlockInMaskPart, 2799 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2800 "interleaved.mask"); 2801 GroupMask = MaskForGaps 2802 ? Builder.CreateBinOp(Instruction::And, ShuffledMask, 2803 MaskForGaps) 2804 : ShuffledMask; 2805 } 2806 NewLoad = 2807 Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(), 2808 GroupMask, PoisonVec, "wide.masked.vec"); 2809 } 2810 else 2811 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], 2812 Group->getAlign(), "wide.vec"); 2813 Group->addMetadata(NewLoad); 2814 NewLoads.push_back(NewLoad); 2815 } 2816 2817 // For each member in the group, shuffle out the appropriate data from the 2818 // wide loads. 2819 unsigned J = 0; 2820 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2821 Instruction *Member = Group->getMember(I); 2822 2823 // Skip the gaps in the group. 2824 if (!Member) 2825 continue; 2826 2827 auto StrideMask = 2828 createStrideMask(I, InterleaveFactor, VF.getKnownMinValue()); 2829 for (unsigned Part = 0; Part < UF; Part++) { 2830 Value *StridedVec = Builder.CreateShuffleVector( 2831 NewLoads[Part], StrideMask, "strided.vec"); 2832 2833 // If this member has different type, cast the result type. 2834 if (Member->getType() != ScalarTy) { 2835 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2836 VectorType *OtherVTy = VectorType::get(Member->getType(), VF); 2837 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); 2838 } 2839 2840 if (Group->isReverse()) 2841 StridedVec = reverseVector(StridedVec); 2842 2843 State.set(VPDefs[J], StridedVec, Part); 2844 } 2845 ++J; 2846 } 2847 return; 2848 } 2849 2850 // The sub vector type for current instruction. 2851 auto *SubVT = VectorType::get(ScalarTy, VF); 2852 2853 // Vectorize the interleaved store group. 2854 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); 2855 assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) && 2856 "masked interleaved groups are not allowed."); 2857 assert((!MaskForGaps || !VF.isScalable()) && 2858 "masking gaps for scalable vectors is not yet supported."); 2859 for (unsigned Part = 0; Part < UF; Part++) { 2860 // Collect the stored vector from each member. 
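    // For the R/G/B store example above with VF == 4 (a sketch), this loop
    // gathers %R.vec, %G.vec and %B.vec (or a poison vector for a gap in the
    // group) so they can be concatenated and interleaved into one wide store
    // further below.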
2861 SmallVector<Value *, 4> StoredVecs; 2862 for (unsigned i = 0; i < InterleaveFactor; i++) { 2863 assert((Group->getMember(i) || MaskForGaps) && 2864 "Fail to get a member from an interleaved store group"); 2865 Instruction *Member = Group->getMember(i); 2866 2867 // Skip the gaps in the group. 2868 if (!Member) { 2869 Value *Undef = PoisonValue::get(SubVT); 2870 StoredVecs.push_back(Undef); 2871 continue; 2872 } 2873 2874 Value *StoredVec = State.get(StoredValues[i], Part); 2875 2876 if (Group->isReverse()) 2877 StoredVec = reverseVector(StoredVec); 2878 2879 // If this member has different type, cast it to a unified type. 2880 2881 if (StoredVec->getType() != SubVT) 2882 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL); 2883 2884 StoredVecs.push_back(StoredVec); 2885 } 2886 2887 // Concatenate all vectors into a wide vector. 2888 Value *WideVec = concatenateVectors(Builder, StoredVecs); 2889 2890 // Interleave the elements in the wide vector. 2891 Value *IVec = Builder.CreateShuffleVector( 2892 WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor), 2893 "interleaved.vec"); 2894 2895 Instruction *NewStoreInstr; 2896 if (BlockInMask || MaskForGaps) { 2897 Value *GroupMask = MaskForGaps; 2898 if (BlockInMask) { 2899 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2900 Value *ShuffledMask = Builder.CreateShuffleVector( 2901 BlockInMaskPart, 2902 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2903 "interleaved.mask"); 2904 GroupMask = MaskForGaps ? Builder.CreateBinOp(Instruction::And, 2905 ShuffledMask, MaskForGaps) 2906 : ShuffledMask; 2907 } 2908 NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part], 2909 Group->getAlign(), GroupMask); 2910 } else 2911 NewStoreInstr = 2912 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign()); 2913 2914 Group->addMetadata(NewStoreInstr); 2915 } 2916 } 2917 2918 void InnerLoopVectorizer::vectorizeMemoryInstruction( 2919 Instruction *Instr, VPTransformState &State, VPValue *Def, VPValue *Addr, 2920 VPValue *StoredValue, VPValue *BlockInMask, bool ConsecutiveStride, 2921 bool Reverse) { 2922 // Attempt to issue a wide load. 2923 LoadInst *LI = dyn_cast<LoadInst>(Instr); 2924 StoreInst *SI = dyn_cast<StoreInst>(Instr); 2925 2926 assert((LI || SI) && "Invalid Load/Store instruction"); 2927 assert((!SI || StoredValue) && "No stored value provided for widened store"); 2928 assert((!LI || !StoredValue) && "Stored value provided for widened load"); 2929 2930 Type *ScalarDataTy = getLoadStoreType(Instr); 2931 2932 auto *DataTy = VectorType::get(ScalarDataTy, VF); 2933 const Align Alignment = getLoadStoreAlignment(Instr); 2934 bool CreateGatherScatter = !ConsecutiveStride; 2935 2936 VectorParts BlockInMaskParts(UF); 2937 bool isMaskRequired = BlockInMask; 2938 if (isMaskRequired) 2939 for (unsigned Part = 0; Part < UF; ++Part) 2940 BlockInMaskParts[Part] = State.get(BlockInMask, Part); 2941 2942 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { 2943 // Calculate the pointer for the specific unroll-part. 2944 GetElementPtrInst *PartPtr = nullptr; 2945 2946 bool InBounds = false; 2947 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) 2948 InBounds = gep->isInBounds(); 2949 if (Reverse) { 2950 // If the address is consecutive but reversed, then the 2951 // wide store needs to start at the last vector element. 
2952 // RunTimeVF = VScale * VF.getKnownMinValue() 2953 // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue() 2954 Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), VF); 2955 // NumElt = -Part * RunTimeVF 2956 Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF); 2957 // LastLane = 1 - RunTimeVF 2958 Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF); 2959 PartPtr = 2960 cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt)); 2961 PartPtr->setIsInBounds(InBounds); 2962 PartPtr = cast<GetElementPtrInst>( 2963 Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane)); 2964 PartPtr->setIsInBounds(InBounds); 2965 if (isMaskRequired) // Reverse of a null all-one mask is a null mask. 2966 BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]); 2967 } else { 2968 Value *Increment = 2969 createStepForVF(Builder, Builder.getInt32Ty(), VF, Part); 2970 PartPtr = cast<GetElementPtrInst>( 2971 Builder.CreateGEP(ScalarDataTy, Ptr, Increment)); 2972 PartPtr->setIsInBounds(InBounds); 2973 } 2974 2975 unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); 2976 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); 2977 }; 2978 2979 // Handle Stores: 2980 if (SI) { 2981 setDebugLocFromInst(SI); 2982 2983 for (unsigned Part = 0; Part < UF; ++Part) { 2984 Instruction *NewSI = nullptr; 2985 Value *StoredVal = State.get(StoredValue, Part); 2986 if (CreateGatherScatter) { 2987 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 2988 Value *VectorGep = State.get(Addr, Part); 2989 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, 2990 MaskPart); 2991 } else { 2992 if (Reverse) { 2993 // If we store to reverse consecutive memory locations, then we need 2994 // to reverse the order of elements in the stored value. 2995 StoredVal = reverseVector(StoredVal); 2996 // We don't want to update the value in the map as it might be used in 2997 // another expression. So don't call resetVectorValue(StoredVal). 2998 } 2999 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0))); 3000 if (isMaskRequired) 3001 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, 3002 BlockInMaskParts[Part]); 3003 else 3004 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); 3005 } 3006 addMetadata(NewSI, SI); 3007 } 3008 return; 3009 } 3010 3011 // Handle loads. 3012 assert(LI && "Must have a load instruction"); 3013 setDebugLocFromInst(LI); 3014 for (unsigned Part = 0; Part < UF; ++Part) { 3015 Value *NewLI; 3016 if (CreateGatherScatter) { 3017 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 3018 Value *VectorGep = State.get(Addr, Part); 3019 NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart, 3020 nullptr, "wide.masked.gather"); 3021 addMetadata(NewLI, LI); 3022 } else { 3023 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0))); 3024 if (isMaskRequired) 3025 NewLI = Builder.CreateMaskedLoad( 3026 DataTy, VecPtr, Alignment, BlockInMaskParts[Part], 3027 PoisonValue::get(DataTy), "wide.masked.load"); 3028 else 3029 NewLI = 3030 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); 3031 3032 // Add metadata to the load, but setVectorValue to the reverse shuffle. 
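      // That is, the load metadata stays on the real memory instruction,
      // while the value recorded in State for consumers is the reversed
      // shuffle created just below when Reverse is set.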
3033 addMetadata(NewLI, LI); 3034 if (Reverse) 3035 NewLI = reverseVector(NewLI); 3036 } 3037 3038 State.set(Def, NewLI, Part); 3039 } 3040 } 3041 3042 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPValue *Def, 3043 VPUser &User, 3044 const VPIteration &Instance, 3045 bool IfPredicateInstr, 3046 VPTransformState &State) { 3047 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 3048 3049 // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for 3050 // the first lane and part. 3051 if (isa<NoAliasScopeDeclInst>(Instr)) 3052 if (!Instance.isFirstIteration()) 3053 return; 3054 3055 setDebugLocFromInst(Instr); 3056 3057 // Does this instruction return a value ? 3058 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 3059 3060 Instruction *Cloned = Instr->clone(); 3061 if (!IsVoidRetTy) 3062 Cloned->setName(Instr->getName() + ".cloned"); 3063 3064 State.Builder.SetInsertPoint(Builder.GetInsertBlock(), 3065 Builder.GetInsertPoint()); 3066 // Replace the operands of the cloned instructions with their scalar 3067 // equivalents in the new loop. 3068 for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) { 3069 auto *Operand = dyn_cast<Instruction>(Instr->getOperand(op)); 3070 auto InputInstance = Instance; 3071 if (!Operand || !OrigLoop->contains(Operand) || 3072 (Cost->isUniformAfterVectorization(Operand, State.VF))) 3073 InputInstance.Lane = VPLane::getFirstLane(); 3074 auto *NewOp = State.get(User.getOperand(op), InputInstance); 3075 Cloned->setOperand(op, NewOp); 3076 } 3077 addNewMetadata(Cloned, Instr); 3078 3079 // Place the cloned scalar in the new loop. 3080 Builder.Insert(Cloned); 3081 3082 State.set(Def, Cloned, Instance); 3083 3084 // If we just cloned a new assumption, add it the assumption cache. 3085 if (auto *II = dyn_cast<AssumeInst>(Cloned)) 3086 AC->registerAssumption(II); 3087 3088 // End if-block. 3089 if (IfPredicateInstr) 3090 PredicatedInstructions.push_back(Cloned); 3091 } 3092 3093 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start, 3094 Value *End, Value *Step, 3095 Instruction *DL) { 3096 BasicBlock *Header = L->getHeader(); 3097 BasicBlock *Latch = L->getLoopLatch(); 3098 // As we're just creating this loop, it's possible no latch exists 3099 // yet. If so, use the header as this will be a single block loop. 3100 if (!Latch) 3101 Latch = Header; 3102 3103 IRBuilder<> B(&*Header->getFirstInsertionPt()); 3104 Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction); 3105 setDebugLocFromInst(OldInst, &B); 3106 auto *Induction = B.CreatePHI(Start->getType(), 2, "index"); 3107 3108 B.SetInsertPoint(Latch->getTerminator()); 3109 setDebugLocFromInst(OldInst, &B); 3110 3111 // Create i+1 and fill the PHINode. 3112 // 3113 // If the tail is not folded, we know that End - Start >= Step (either 3114 // statically or through the minimum iteration checks). We also know that both 3115 // Start % Step == 0 and End % Step == 0. We exit the vector loop if %IV + 3116 // %Step == %End. Hence we must exit the loop before %IV + %Step unsigned 3117 // overflows and we can mark the induction increment as NUW. 3118 Value *Next = B.CreateAdd(Induction, Step, "index.next", 3119 /*NUW=*/!Cost->foldTailByMasking(), /*NSW=*/false); 3120 Induction->addIncoming(Start, L->getLoopPreheader()); 3121 Induction->addIncoming(Next, Latch); 3122 // Create the compare. 
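  // That is, we take the exit edge once index.next == End (the vector trip
  // count when called from the main skeleton) and branch back to the header
  // otherwise.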
3123 Value *ICmp = B.CreateICmpEQ(Next, End); 3124 B.CreateCondBr(ICmp, L->getUniqueExitBlock(), Header); 3125 3126 // Now we have two terminators. Remove the old one from the block. 3127 Latch->getTerminator()->eraseFromParent(); 3128 3129 return Induction; 3130 } 3131 3132 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { 3133 if (TripCount) 3134 return TripCount; 3135 3136 assert(L && "Create Trip Count for null loop."); 3137 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 3138 // Find the loop boundaries. 3139 ScalarEvolution *SE = PSE.getSE(); 3140 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 3141 assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && 3142 "Invalid loop count"); 3143 3144 Type *IdxTy = Legal->getWidestInductionType(); 3145 assert(IdxTy && "No type for induction"); 3146 3147 // The exit count might have the type of i64 while the phi is i32. This can 3148 // happen if we have an induction variable that is sign extended before the 3149 // compare. The only way that we get a backedge taken count is that the 3150 // induction variable was signed and as such will not overflow. In such a case 3151 // truncation is legal. 3152 if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) > 3153 IdxTy->getPrimitiveSizeInBits()) 3154 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy); 3155 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); 3156 3157 // Get the total trip count from the count by adding 1. 3158 const SCEV *ExitCount = SE->getAddExpr( 3159 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 3160 3161 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); 3162 3163 // Expand the trip count and place the new instructions in the preheader. 3164 // Notice that the pre-header does not change, only the loop body. 3165 SCEVExpander Exp(*SE, DL, "induction"); 3166 3167 // Count holds the overall loop count (N). 3168 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), 3169 L->getLoopPreheader()->getTerminator()); 3170 3171 if (TripCount->getType()->isPointerTy()) 3172 TripCount = 3173 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", 3174 L->getLoopPreheader()->getTerminator()); 3175 3176 return TripCount; 3177 } 3178 3179 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { 3180 if (VectorTripCount) 3181 return VectorTripCount; 3182 3183 Value *TC = getOrCreateTripCount(L); 3184 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 3185 3186 Type *Ty = TC->getType(); 3187 // This is where we can make the step a runtime constant. 3188 Value *Step = createStepForVF(Builder, Ty, VF, UF); 3189 3190 // If the tail is to be folded by masking, round the number of iterations N 3191 // up to a multiple of Step instead of rounding down. This is done by first 3192 // adding Step-1 and then rounding down. Note that it's ok if this addition 3193 // overflows: the vector induction variable will eventually wrap to zero given 3194 // that it starts at zero and its Step is a power of two; the loop will then 3195 // exit, with the last early-exit vector comparison also producing all-true. 
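  // Worked example (an added sketch, not from the original source): with
  // VF * UF == 4 and trip count N == 10, tail folding rounds N up to 13
  // below, so n.mod.vf == 13 % 4 == 1 and n.vec == 12, i.e. three masked
  // vector iterations with the last two lanes of the final iteration
  // masked off.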
3196 if (Cost->foldTailByMasking()) { 3197 assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && 3198 "VF*UF must be a power of 2 when folding tail by masking"); 3199 assert(!VF.isScalable() && 3200 "Tail folding not yet supported for scalable vectors"); 3201 TC = Builder.CreateAdd( 3202 TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up"); 3203 } 3204 3205 // Now we need to generate the expression for the part of the loop that the 3206 // vectorized body will execute. This is equal to N - (N % Step) if scalar 3207 // iterations are not required for correctness, or N - Step, otherwise. Step 3208 // is equal to the vectorization factor (number of SIMD elements) times the 3209 // unroll factor (number of SIMD instructions). 3210 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); 3211 3212 // There are cases where we *must* run at least one iteration in the remainder 3213 // loop. See the cost model for when this can happen. If the step evenly 3214 // divides the trip count, we set the remainder to be equal to the step. If 3215 // the step does not evenly divide the trip count, no adjustment is necessary 3216 // since there will already be scalar iterations. Note that the minimum 3217 // iterations check ensures that N >= Step. 3218 if (Cost->requiresScalarEpilogue(VF)) { 3219 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); 3220 R = Builder.CreateSelect(IsZero, Step, R); 3221 } 3222 3223 VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); 3224 3225 return VectorTripCount; 3226 } 3227 3228 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, 3229 const DataLayout &DL) { 3230 // Verify that V is a vector type with same number of elements as DstVTy. 3231 auto *DstFVTy = cast<FixedVectorType>(DstVTy); 3232 unsigned VF = DstFVTy->getNumElements(); 3233 auto *SrcVecTy = cast<FixedVectorType>(V->getType()); 3234 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); 3235 Type *SrcElemTy = SrcVecTy->getElementType(); 3236 Type *DstElemTy = DstFVTy->getElementType(); 3237 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 3238 "Vector elements must have same size"); 3239 3240 // Do a direct cast if element types are castable. 3241 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 3242 return Builder.CreateBitOrPointerCast(V, DstFVTy); 3243 } 3244 // V cannot be directly casted to desired vector type. 3245 // May happen when V is a floating point vector but DstVTy is a vector of 3246 // pointers or vice-versa. Handle this using a two-step bitcast using an 3247 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 3248 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 3249 "Only one type should be a pointer type"); 3250 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 3251 "Only one type should be a floating point type"); 3252 Type *IntTy = 3253 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 3254 auto *VecIntTy = FixedVectorType::get(IntTy, VF); 3255 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 3256 return Builder.CreateBitOrPointerCast(CastVal, DstFVTy); 3257 } 3258 3259 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, 3260 BasicBlock *Bypass) { 3261 Value *Count = getOrCreateTripCount(L); 3262 // Reuse existing vector loop preheader for TC checks. 3263 // Note that new preheader block is generated for vector loop. 
3264 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 3265 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 3266 3267 // Generate code to check if the loop's trip count is less than VF * UF, or 3268 // equal to it in case a scalar epilogue is required; this implies that the 3269 // vector trip count is zero. This check also covers the case where adding one 3270 // to the backedge-taken count overflowed leading to an incorrect trip count 3271 // of zero. In this case we will also jump to the scalar loop. 3272 auto P = Cost->requiresScalarEpilogue(VF) ? ICmpInst::ICMP_ULE 3273 : ICmpInst::ICMP_ULT; 3274 3275 // If tail is to be folded, vector loop takes care of all iterations. 3276 Value *CheckMinIters = Builder.getFalse(); 3277 if (!Cost->foldTailByMasking()) { 3278 Value *Step = createStepForVF(Builder, Count->getType(), VF, UF); 3279 CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check"); 3280 } 3281 // Create new preheader for vector loop. 3282 LoopVectorPreHeader = 3283 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, 3284 "vector.ph"); 3285 3286 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 3287 DT->getNode(Bypass)->getIDom()) && 3288 "TC check is expected to dominate Bypass"); 3289 3290 // Update dominator for Bypass & LoopExit (if needed). 3291 DT->changeImmediateDominator(Bypass, TCCheckBlock); 3292 if (!Cost->requiresScalarEpilogue(VF)) 3293 // If there is an epilogue which must run, there's no edge from the 3294 // middle block to exit blocks and thus no need to update the immediate 3295 // dominator of the exit blocks. 3296 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 3297 3298 ReplaceInstWithInst( 3299 TCCheckBlock->getTerminator(), 3300 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 3301 LoopBypassBlocks.push_back(TCCheckBlock); 3302 } 3303 3304 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { 3305 3306 BasicBlock *const SCEVCheckBlock = 3307 RTChecks.emitSCEVChecks(L, Bypass, LoopVectorPreHeader, LoopExitBlock); 3308 if (!SCEVCheckBlock) 3309 return nullptr; 3310 3311 assert(!(SCEVCheckBlock->getParent()->hasOptSize() || 3312 (OptForSizeBasedOnProfile && 3313 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) && 3314 "Cannot SCEV check stride or overflow when optimizing for size"); 3315 3316 3317 // Update dominator only if this is first RT check. 3318 if (LoopBypassBlocks.empty()) { 3319 DT->changeImmediateDominator(Bypass, SCEVCheckBlock); 3320 if (!Cost->requiresScalarEpilogue(VF)) 3321 // If there is an epilogue which must run, there's no edge from the 3322 // middle block to exit blocks and thus no need to update the immediate 3323 // dominator of the exit blocks. 3324 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock); 3325 } 3326 3327 LoopBypassBlocks.push_back(SCEVCheckBlock); 3328 AddedSafetyChecks = true; 3329 return SCEVCheckBlock; 3330 } 3331 3332 BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, 3333 BasicBlock *Bypass) { 3334 // VPlan-native path does not do any analysis for runtime checks currently. 3335 if (EnableVPlanNativePath) 3336 return nullptr; 3337 3338 BasicBlock *const MemCheckBlock = 3339 RTChecks.emitMemRuntimeChecks(L, Bypass, LoopVectorPreHeader); 3340 3341 // Check if we generated code that checks in runtime if arrays overlap. We put 3342 // the checks into a separate block to make the more common case of few 3343 // elements faster. 
3344 if (!MemCheckBlock) 3345 return nullptr; 3346 3347 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) { 3348 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && 3349 "Cannot emit memory checks when optimizing for size, unless forced " 3350 "to vectorize."); 3351 ORE->emit([&]() { 3352 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize", 3353 L->getStartLoc(), L->getHeader()) 3354 << "Code-size may be reduced by not forcing " 3355 "vectorization, or by source-code modifications " 3356 "eliminating the need for runtime checks " 3357 "(e.g., adding 'restrict')."; 3358 }); 3359 } 3360 3361 LoopBypassBlocks.push_back(MemCheckBlock); 3362 3363 AddedSafetyChecks = true; 3364 3365 // We currently don't use LoopVersioning for the actual loop cloning but we 3366 // still use it to add the noalias metadata. 3367 LVer = std::make_unique<LoopVersioning>( 3368 *Legal->getLAI(), 3369 Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, 3370 DT, PSE.getSE()); 3371 LVer->prepareNoAliasMetadata(); 3372 return MemCheckBlock; 3373 } 3374 3375 Value *InnerLoopVectorizer::emitTransformedIndex( 3376 IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL, 3377 const InductionDescriptor &ID) const { 3378 3379 SCEVExpander Exp(*SE, DL, "induction"); 3380 auto Step = ID.getStep(); 3381 auto StartValue = ID.getStartValue(); 3382 assert(Index->getType()->getScalarType() == Step->getType() && 3383 "Index scalar type does not match StepValue type"); 3384 3385 // Note: the IR at this point is broken. We cannot use SE to create any new 3386 // SCEV and then expand it, hoping that SCEV's simplification will give us 3387 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may 3388 // lead to various SCEV crashes. So all we can do is to use builder and rely 3389 // on InstCombine for future simplifications. Here we handle some trivial 3390 // cases only. 3391 auto CreateAdd = [&B](Value *X, Value *Y) { 3392 assert(X->getType() == Y->getType() && "Types don't match!"); 3393 if (auto *CX = dyn_cast<ConstantInt>(X)) 3394 if (CX->isZero()) 3395 return Y; 3396 if (auto *CY = dyn_cast<ConstantInt>(Y)) 3397 if (CY->isZero()) 3398 return X; 3399 return B.CreateAdd(X, Y); 3400 }; 3401 3402 // We allow X to be a vector type, in which case Y will potentially be 3403 // splatted into a vector with the same element count. 3404 auto CreateMul = [&B](Value *X, Value *Y) { 3405 assert(X->getType()->getScalarType() == Y->getType() && 3406 "Types don't match!"); 3407 if (auto *CX = dyn_cast<ConstantInt>(X)) 3408 if (CX->isOne()) 3409 return Y; 3410 if (auto *CY = dyn_cast<ConstantInt>(Y)) 3411 if (CY->isOne()) 3412 return X; 3413 VectorType *XVTy = dyn_cast<VectorType>(X->getType()); 3414 if (XVTy && !isa<VectorType>(Y->getType())) 3415 Y = B.CreateVectorSplat(XVTy->getElementCount(), Y); 3416 return B.CreateMul(X, Y); 3417 }; 3418 3419 // Get a suitable insert point for SCEV expansion. For blocks in the vector 3420 // loop, choose the end of the vector loop header (=LoopVectorBody), because 3421 // the DomTree is not kept up-to-date for additional blocks generated in the 3422 // vector loop. By using the header as insertion point, we guarantee that the 3423 // expanded instructions dominate all their uses. 
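  // For example (a sketch): if the builder is currently positioned in a
  // vector-loop block other than the header, the expansion requested below
  // is emitted before the header's terminator instead, which is guaranteed
  // to dominate its uses.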
3424 auto GetInsertPoint = [this, &B]() { 3425 BasicBlock *InsertBB = B.GetInsertPoint()->getParent(); 3426 if (InsertBB != LoopVectorBody && 3427 LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB)) 3428 return LoopVectorBody->getTerminator(); 3429 return &*B.GetInsertPoint(); 3430 }; 3431 3432 switch (ID.getKind()) { 3433 case InductionDescriptor::IK_IntInduction: { 3434 assert(!isa<VectorType>(Index->getType()) && 3435 "Vector indices not supported for integer inductions yet"); 3436 assert(Index->getType() == StartValue->getType() && 3437 "Index type does not match StartValue type"); 3438 if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne()) 3439 return B.CreateSub(StartValue, Index); 3440 auto *Offset = CreateMul( 3441 Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint())); 3442 return CreateAdd(StartValue, Offset); 3443 } 3444 case InductionDescriptor::IK_PtrInduction: { 3445 assert(isa<SCEVConstant>(Step) && 3446 "Expected constant step for pointer induction"); 3447 return B.CreateGEP( 3448 ID.getElementType(), StartValue, 3449 CreateMul(Index, 3450 Exp.expandCodeFor(Step, Index->getType()->getScalarType(), 3451 GetInsertPoint()))); 3452 } 3453 case InductionDescriptor::IK_FpInduction: { 3454 assert(!isa<VectorType>(Index->getType()) && 3455 "Vector indices not supported for FP inductions yet"); 3456 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); 3457 auto InductionBinOp = ID.getInductionBinOp(); 3458 assert(InductionBinOp && 3459 (InductionBinOp->getOpcode() == Instruction::FAdd || 3460 InductionBinOp->getOpcode() == Instruction::FSub) && 3461 "Original bin op should be defined for FP induction"); 3462 3463 Value *StepValue = cast<SCEVUnknown>(Step)->getValue(); 3464 Value *MulExp = B.CreateFMul(StepValue, Index); 3465 return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, 3466 "induction"); 3467 } 3468 case InductionDescriptor::IK_NoInduction: 3469 return nullptr; 3470 } 3471 llvm_unreachable("invalid enum"); 3472 } 3473 3474 Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { 3475 LoopScalarBody = OrigLoop->getHeader(); 3476 LoopVectorPreHeader = OrigLoop->getLoopPreheader(); 3477 assert(LoopVectorPreHeader && "Invalid loop structure"); 3478 LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr 3479 assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF)) && 3480 "multiple exit loop without required epilogue?"); 3481 3482 LoopMiddleBlock = 3483 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 3484 LI, nullptr, Twine(Prefix) + "middle.block"); 3485 LoopScalarPreHeader = 3486 SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI, 3487 nullptr, Twine(Prefix) + "scalar.ph"); 3488 3489 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3490 3491 // Set up the middle block terminator. Two cases: 3492 // 1) If we know that we must execute the scalar epilogue, emit an 3493 // unconditional branch. 3494 // 2) Otherwise, we must have a single unique exit block (due to how we 3495 // implement the multiple exit case). In this case, set up a conditonal 3496 // branch from the middle block to the loop scalar preheader, and the 3497 // exit block. completeLoopSkeleton will update the condition to use an 3498 // iteration check, if required to decide whether to execute the remainder. 3499 BranchInst *BrInst = Cost->requiresScalarEpilogue(VF) ? 
3500 BranchInst::Create(LoopScalarPreHeader) : 3501 BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, 3502 Builder.getTrue()); 3503 BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3504 ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst); 3505 3506 // We intentionally don't let SplitBlock to update LoopInfo since 3507 // LoopVectorBody should belong to another loop than LoopVectorPreHeader. 3508 // LoopVectorBody is explicitly added to the correct place few lines later. 3509 LoopVectorBody = 3510 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 3511 nullptr, nullptr, Twine(Prefix) + "vector.body"); 3512 3513 // Update dominator for loop exit. 3514 if (!Cost->requiresScalarEpilogue(VF)) 3515 // If there is an epilogue which must run, there's no edge from the 3516 // middle block to exit blocks and thus no need to update the immediate 3517 // dominator of the exit blocks. 3518 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); 3519 3520 // Create and register the new vector loop. 3521 Loop *Lp = LI->AllocateLoop(); 3522 Loop *ParentLoop = OrigLoop->getParentLoop(); 3523 3524 // Insert the new loop into the loop nest and register the new basic blocks 3525 // before calling any utilities such as SCEV that require valid LoopInfo. 3526 if (ParentLoop) { 3527 ParentLoop->addChildLoop(Lp); 3528 } else { 3529 LI->addTopLevelLoop(Lp); 3530 } 3531 Lp->addBasicBlockToLoop(LoopVectorBody, *LI); 3532 return Lp; 3533 } 3534 3535 void InnerLoopVectorizer::createInductionResumeValues( 3536 Loop *L, Value *VectorTripCount, 3537 std::pair<BasicBlock *, Value *> AdditionalBypass) { 3538 assert(VectorTripCount && L && "Expected valid arguments"); 3539 assert(((AdditionalBypass.first && AdditionalBypass.second) || 3540 (!AdditionalBypass.first && !AdditionalBypass.second)) && 3541 "Inconsistent information about additional bypass."); 3542 // We are going to resume the execution of the scalar loop. 3543 // Go over all of the induction variables that we found and fix the 3544 // PHIs that are left in the scalar version of the loop. 3545 // The starting values of PHI nodes depend on the counter of the last 3546 // iteration in the vectorized loop. 3547 // If we come from a bypass edge then we need to start from the original 3548 // start value. 3549 for (auto &InductionEntry : Legal->getInductionVars()) { 3550 PHINode *OrigPhi = InductionEntry.first; 3551 InductionDescriptor II = InductionEntry.second; 3552 3553 // Create phi nodes to merge from the backedge-taken check block. 3554 PHINode *BCResumeVal = 3555 PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", 3556 LoopScalarPreHeader->getTerminator()); 3557 // Copy original phi DL over to the new one. 3558 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); 3559 Value *&EndValue = IVEndValues[OrigPhi]; 3560 Value *EndValueFromAdditionalBypass = AdditionalBypass.second; 3561 if (OrigPhi == OldInduction) { 3562 // We know what the end value is. 3563 EndValue = VectorTripCount; 3564 } else { 3565 IRBuilder<> B(L->getLoopPreheader()->getTerminator()); 3566 3567 // Fast-math-flags propagate from the original induction instruction. 
3568 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) 3569 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); 3570 3571 Type *StepType = II.getStep()->getType(); 3572 Instruction::CastOps CastOp = 3573 CastInst::getCastOpcode(VectorTripCount, true, StepType, true); 3574 Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd"); 3575 const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout(); 3576 EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3577 EndValue->setName("ind.end"); 3578 3579 // Compute the end value for the additional bypass (if applicable). 3580 if (AdditionalBypass.first) { 3581 B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt())); 3582 CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true, 3583 StepType, true); 3584 CRD = 3585 B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd"); 3586 EndValueFromAdditionalBypass = 3587 emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3588 EndValueFromAdditionalBypass->setName("ind.end"); 3589 } 3590 } 3591 // The new PHI merges the original incoming value, in case of a bypass, 3592 // or the value at the end of the vectorized loop. 3593 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); 3594 3595 // Fix the scalar body counter (PHI node). 3596 // The old induction's phi node in the scalar body needs the truncated 3597 // value. 3598 for (BasicBlock *BB : LoopBypassBlocks) 3599 BCResumeVal->addIncoming(II.getStartValue(), BB); 3600 3601 if (AdditionalBypass.first) 3602 BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first, 3603 EndValueFromAdditionalBypass); 3604 3605 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); 3606 } 3607 } 3608 3609 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L, 3610 MDNode *OrigLoopID) { 3611 assert(L && "Expected valid loop."); 3612 3613 // The trip counts should be cached by now. 3614 Value *Count = getOrCreateTripCount(L); 3615 Value *VectorTripCount = getOrCreateVectorTripCount(L); 3616 3617 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3618 3619 // Add a check in the middle block to see if we have completed 3620 // all of the iterations in the first vector loop. Three cases: 3621 // 1) If we require a scalar epilogue, there is no conditional branch as 3622 // we unconditionally branch to the scalar preheader. Do nothing. 3623 // 2) If (N - N%VF) == N, then we *don't* need to run the remainder. 3624 // Thus if tail is to be folded, we know we don't need to run the 3625 // remainder and we can use the previous value for the condition (true). 3626 // 3) Otherwise, construct a runtime check. 3627 if (!Cost->requiresScalarEpilogue(VF) && !Cost->foldTailByMasking()) { 3628 Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, 3629 Count, VectorTripCount, "cmp.n", 3630 LoopMiddleBlock->getTerminator()); 3631 3632 // Here we use the same DebugLoc as the scalar loop latch terminator instead 3633 // of the corresponding compare because they may have ended up with 3634 // different line numbers and we want to avoid awkward line stepping while 3635 // debugging. Eg. if the compare has got a line number inside the loop. 3636 CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3637 cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN); 3638 } 3639 3640 // Get ready to start creating new instructions into the vectorized body. 
3641 assert(LoopVectorPreHeader == L->getLoopPreheader() && 3642 "Inconsistent vector loop preheader"); 3643 Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); 3644 3645 Optional<MDNode *> VectorizedLoopID = 3646 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 3647 LLVMLoopVectorizeFollowupVectorized}); 3648 if (VectorizedLoopID.hasValue()) { 3649 L->setLoopID(VectorizedLoopID.getValue()); 3650 3651 // Do not setAlreadyVectorized if loop attributes have been defined 3652 // explicitly. 3653 return LoopVectorPreHeader; 3654 } 3655 3656 // Keep all loop hints from the original loop on the vector loop (we'll 3657 // replace the vectorizer-specific hints below). 3658 if (MDNode *LID = OrigLoop->getLoopID()) 3659 L->setLoopID(LID); 3660 3661 LoopVectorizeHints Hints(L, true, *ORE); 3662 Hints.setAlreadyVectorized(); 3663 3664 #ifdef EXPENSIVE_CHECKS 3665 assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 3666 LI->verify(*DT); 3667 #endif 3668 3669 return LoopVectorPreHeader; 3670 } 3671 3672 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { 3673 /* 3674 In this function we generate a new loop. The new loop will contain 3675 the vectorized instructions while the old loop will continue to run the 3676 scalar remainder. 3677 3678 [ ] <-- loop iteration number check. 3679 / | 3680 / v 3681 | [ ] <-- vector loop bypass (may consist of multiple blocks). 3682 | / | 3683 | / v 3684 || [ ] <-- vector pre header. 3685 |/ | 3686 | v 3687 | [ ] \ 3688 | [ ]_| <-- vector loop. 3689 | | 3690 | v 3691 \ -[ ] <--- middle-block. 3692 \/ | 3693 /\ v 3694 | ->[ ] <--- new preheader. 3695 | | 3696 (opt) v <-- edge from middle to exit iff epilogue is not required. 3697 | [ ] \ 3698 | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue). 3699 \ | 3700 \ v 3701 >[ ] <-- exit block(s). 3702 ... 3703 */ 3704 3705 // Get the metadata of the original loop before it gets modified. 3706 MDNode *OrigLoopID = OrigLoop->getLoopID(); 3707 3708 // Workaround! Compute the trip count of the original loop and cache it 3709 // before we start modifying the CFG. This code has a systemic problem 3710 // wherein it tries to run analysis over partially constructed IR; this is 3711 // wrong, and not simply for SCEV. The trip count of the original loop 3712 // simply happens to be prone to hitting this in practice. In theory, we 3713 // can hit the same issue for any SCEV, or ValueTracking query done during 3714 // mutation. See PR49900. 3715 getOrCreateTripCount(OrigLoop); 3716 3717 // Create an empty vector loop, and prepare basic blocks for the runtime 3718 // checks. 3719 Loop *Lp = createVectorLoopSkeleton(""); 3720 3721 // Now, compare the new count to zero. If it is zero skip the vector loop and 3722 // jump to the scalar loop. This check also covers the case where the 3723 // backedge-taken count is uint##_max: adding one to it will overflow leading 3724 // to an incorrect trip count of zero. In this (rare) case we will also jump 3725 // to the scalar loop. 3726 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader); 3727 3728 // Generate the code to check any assumptions that we've made for SCEV 3729 // expressions. 3730 emitSCEVChecks(Lp, LoopScalarPreHeader); 3731 3732 // Generate the code that checks in runtime if arrays overlap. We put the 3733 // checks into a separate block to make the more common case of few elements 3734 // faster. 
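  // For instance (a sketch): for a loop such as 'a[i] = b[i] + c[i]' where
  // aliasing cannot be ruled out statically, the emitted block compares the
  // accessed address ranges at runtime and branches to the scalar loop if
  // any of them overlap.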
3735 emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 3736 3737 // Some loops have a single integer induction variable, while other loops 3738 // don't. One example is c++ iterators that often have multiple pointer 3739 // induction variables. In the code below we also support a case where we 3740 // don't have a single induction variable. 3741 // 3742 // We try to obtain an induction variable from the original loop as hard 3743 // as possible. However if we don't find one that: 3744 // - is an integer 3745 // - counts from zero, stepping by one 3746 // - is the size of the widest induction variable type 3747 // then we create a new one. 3748 OldInduction = Legal->getPrimaryInduction(); 3749 Type *IdxTy = Legal->getWidestInductionType(); 3750 Value *StartIdx = ConstantInt::get(IdxTy, 0); 3751 // The loop step is equal to the vectorization factor (num of SIMD elements) 3752 // times the unroll factor (num of SIMD instructions). 3753 Builder.SetInsertPoint(&*Lp->getHeader()->getFirstInsertionPt()); 3754 Value *Step = createStepForVF(Builder, IdxTy, VF, UF); 3755 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 3756 Induction = 3757 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 3758 getDebugLocFromInstOrOperands(OldInduction)); 3759 3760 // Emit phis for the new starting index of the scalar loop. 3761 createInductionResumeValues(Lp, CountRoundDown); 3762 3763 return completeLoopSkeleton(Lp, OrigLoopID); 3764 } 3765 3766 // Fix up external users of the induction variable. At this point, we are 3767 // in LCSSA form, with all external PHIs that use the IV having one input value, 3768 // coming from the remainder loop. We need those PHIs to also have a correct 3769 // value for the IV when arriving directly from the middle block. 3770 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, 3771 const InductionDescriptor &II, 3772 Value *CountRoundDown, Value *EndValue, 3773 BasicBlock *MiddleBlock) { 3774 // There are two kinds of external IV usages - those that use the value 3775 // computed in the last iteration (the PHI) and those that use the penultimate 3776 // value (the value that feeds into the phi from the loop latch). 3777 // We allow both, but they, obviously, have different values. 3778 3779 assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block"); 3780 3781 DenseMap<Value *, Value *> MissingVals; 3782 3783 // An external user of the last iteration's value should see the value that 3784 // the remainder loop uses to initialize its own IV. 3785 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch()); 3786 for (User *U : PostInc->users()) { 3787 Instruction *UI = cast<Instruction>(U); 3788 if (!OrigLoop->contains(UI)) { 3789 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3790 MissingVals[UI] = EndValue; 3791 } 3792 } 3793 3794 // An external user of the penultimate value need to see EndValue - Step. 3795 // The simplest way to get this is to recompute it from the constituent SCEVs, 3796 // that is Start + (Step * (CRD - 1)). 3797 for (User *U : OrigPhi->users()) { 3798 auto *UI = cast<Instruction>(U); 3799 if (!OrigLoop->contains(UI)) { 3800 const DataLayout &DL = 3801 OrigLoop->getHeader()->getModule()->getDataLayout(); 3802 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3803 3804 IRBuilder<> B(MiddleBlock->getTerminator()); 3805 3806 // Fast-math-flags propagate from the original induction instruction. 
3807 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) 3808 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); 3809 3810 Value *CountMinusOne = B.CreateSub( 3811 CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1)); 3812 Value *CMO = 3813 !II.getStep()->getType()->isIntegerTy() 3814 ? B.CreateCast(Instruction::SIToFP, CountMinusOne, 3815 II.getStep()->getType()) 3816 : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType()); 3817 CMO->setName("cast.cmo"); 3818 Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II); 3819 Escape->setName("ind.escape"); 3820 MissingVals[UI] = Escape; 3821 } 3822 } 3823 3824 for (auto &I : MissingVals) { 3825 PHINode *PHI = cast<PHINode>(I.first); 3826 // One corner case we have to handle is two IVs "chasing" each-other, 3827 // that is %IV2 = phi [...], [ %IV1, %latch ] 3828 // In this case, if IV1 has an external use, we need to avoid adding both 3829 // "last value of IV1" and "penultimate value of IV2". So, verify that we 3830 // don't already have an incoming value for the middle block. 3831 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) 3832 PHI->addIncoming(I.second, MiddleBlock); 3833 } 3834 } 3835 3836 namespace { 3837 3838 struct CSEDenseMapInfo { 3839 static bool canHandle(const Instruction *I) { 3840 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || 3841 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); 3842 } 3843 3844 static inline Instruction *getEmptyKey() { 3845 return DenseMapInfo<Instruction *>::getEmptyKey(); 3846 } 3847 3848 static inline Instruction *getTombstoneKey() { 3849 return DenseMapInfo<Instruction *>::getTombstoneKey(); 3850 } 3851 3852 static unsigned getHashValue(const Instruction *I) { 3853 assert(canHandle(I) && "Unknown instruction!"); 3854 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), 3855 I->value_op_end())); 3856 } 3857 3858 static bool isEqual(const Instruction *LHS, const Instruction *RHS) { 3859 if (LHS == getEmptyKey() || RHS == getEmptyKey() || 3860 LHS == getTombstoneKey() || RHS == getTombstoneKey()) 3861 return LHS == RHS; 3862 return LHS->isIdenticalTo(RHS); 3863 } 3864 }; 3865 3866 } // end anonymous namespace 3867 3868 ///Perform cse of induction variable instructions. 3869 static void cse(BasicBlock *BB) { 3870 // Perform simple cse. 3871 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 3872 for (Instruction &In : llvm::make_early_inc_range(*BB)) { 3873 if (!CSEDenseMapInfo::canHandle(&In)) 3874 continue; 3875 3876 // Check if we can replace this instruction with any of the 3877 // visited instructions. 3878 if (Instruction *V = CSEMap.lookup(&In)) { 3879 In.replaceAllUsesWith(V); 3880 In.eraseFromParent(); 3881 continue; 3882 } 3883 3884 CSEMap[&In] = &In; 3885 } 3886 } 3887 3888 InstructionCost 3889 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF, 3890 bool &NeedToScalarize) const { 3891 Function *F = CI->getCalledFunction(); 3892 Type *ScalarRetTy = CI->getType(); 3893 SmallVector<Type *, 4> Tys, ScalarTys; 3894 for (auto &ArgOp : CI->args()) 3895 ScalarTys.push_back(ArgOp->getType()); 3896 3897 // Estimate cost of scalarized vector call. The source operands are assumed 3898 // to be vectors, so we need to extract individual elements from there, 3899 // execute VF scalar calls, and then gather the result into the vector return 3900 // value. 
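  // Worked example (an added sketch with made-up costs): for VF == 4, a
  // scalar call cost of 10 and a scalarization overhead of 6, the estimate
  // formed below is 4 * 10 + 6 == 46; it is only kept if no cheaper vector
  // variant of the callee is found in the VFDatabase further down.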
3901 InstructionCost ScalarCallCost = 3902 TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput); 3903 if (VF.isScalar()) 3904 return ScalarCallCost; 3905 3906 // Compute corresponding vector type for return value and arguments. 3907 Type *RetTy = ToVectorTy(ScalarRetTy, VF); 3908 for (Type *ScalarTy : ScalarTys) 3909 Tys.push_back(ToVectorTy(ScalarTy, VF)); 3910 3911 // Compute costs of unpacking argument values for the scalar calls and 3912 // packing the return values to a vector. 3913 InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF); 3914 3915 InstructionCost Cost = 3916 ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost; 3917 3918 // If we can't emit a vector call for this function, then the currently found 3919 // cost is the cost we need to return. 3920 NeedToScalarize = true; 3921 VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 3922 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); 3923 3924 if (!TLI || CI->isNoBuiltin() || !VecFunc) 3925 return Cost; 3926 3927 // If the corresponding vector cost is cheaper, return its cost. 3928 InstructionCost VectorCallCost = 3929 TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput); 3930 if (VectorCallCost < Cost) { 3931 NeedToScalarize = false; 3932 Cost = VectorCallCost; 3933 } 3934 return Cost; 3935 } 3936 3937 static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) { 3938 if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy())) 3939 return Elt; 3940 return VectorType::get(Elt, VF); 3941 } 3942 3943 InstructionCost 3944 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, 3945 ElementCount VF) const { 3946 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 3947 assert(ID && "Expected intrinsic call!"); 3948 Type *RetTy = MaybeVectorizeType(CI->getType(), VF); 3949 FastMathFlags FMF; 3950 if (auto *FPMO = dyn_cast<FPMathOperator>(CI)) 3951 FMF = FPMO->getFastMathFlags(); 3952 3953 SmallVector<const Value *> Arguments(CI->args()); 3954 FunctionType *FTy = CI->getCalledFunction()->getFunctionType(); 3955 SmallVector<Type *> ParamTys; 3956 std::transform(FTy->param_begin(), FTy->param_end(), 3957 std::back_inserter(ParamTys), 3958 [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); }); 3959 3960 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF, 3961 dyn_cast<IntrinsicInst>(CI)); 3962 return TTI.getIntrinsicInstrCost(CostAttrs, 3963 TargetTransformInfo::TCK_RecipThroughput); 3964 } 3965 3966 static Type *smallestIntegerVectorType(Type *T1, Type *T2) { 3967 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3968 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3969 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; 3970 } 3971 3972 static Type *largestIntegerVectorType(Type *T1, Type *T2) { 3973 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3974 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3975 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; 3976 } 3977 3978 void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) { 3979 // For every instruction `I` in MinBWs, truncate the operands, create a 3980 // truncated version of `I` and reextend its result. InstCombine runs 3981 // later and will remove any ext/trunc pairs. 
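  // Illustrative sketch (an added example, not from the original source):
  // if the cost model proved an i32 add only needs 8 bits, a widened
  //   %a = add <4 x i32> %x, %y
  // is rewritten below roughly as
  //   %x.tr = trunc <4 x i32> %x to <4 x i8>
  //   %y.tr = trunc <4 x i32> %y to <4 x i8>
  //   %a.tr = add <4 x i8> %x.tr, %y.tr
  //   %a    = zext <4 x i8> %a.tr to <4 x i32>
  // with InstCombine later removing any redundant ext/trunc pairs.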
3982 SmallPtrSet<Value *, 4> Erased; 3983 for (const auto &KV : Cost->getMinimalBitwidths()) { 3984 // If the value wasn't vectorized, we must maintain the original scalar 3985 // type. The absence of the value from State indicates that it 3986 // wasn't vectorized. 3987 // FIXME: Should not rely on getVPValue at this point. 3988 VPValue *Def = State.Plan->getVPValue(KV.first, true); 3989 if (!State.hasAnyVectorValue(Def)) 3990 continue; 3991 for (unsigned Part = 0; Part < UF; ++Part) { 3992 Value *I = State.get(Def, Part); 3993 if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I)) 3994 continue; 3995 Type *OriginalTy = I->getType(); 3996 Type *ScalarTruncatedTy = 3997 IntegerType::get(OriginalTy->getContext(), KV.second); 3998 auto *TruncatedTy = VectorType::get( 3999 ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getElementCount()); 4000 if (TruncatedTy == OriginalTy) 4001 continue; 4002 4003 IRBuilder<> B(cast<Instruction>(I)); 4004 auto ShrinkOperand = [&](Value *V) -> Value * { 4005 if (auto *ZI = dyn_cast<ZExtInst>(V)) 4006 if (ZI->getSrcTy() == TruncatedTy) 4007 return ZI->getOperand(0); 4008 return B.CreateZExtOrTrunc(V, TruncatedTy); 4009 }; 4010 4011 // The actual instruction modification depends on the instruction type, 4012 // unfortunately. 4013 Value *NewI = nullptr; 4014 if (auto *BO = dyn_cast<BinaryOperator>(I)) { 4015 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)), 4016 ShrinkOperand(BO->getOperand(1))); 4017 4018 // Any wrapping introduced by shrinking this operation shouldn't be 4019 // considered undefined behavior. So, we can't unconditionally copy 4020 // arithmetic wrapping flags to NewI. 4021 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false); 4022 } else if (auto *CI = dyn_cast<ICmpInst>(I)) { 4023 NewI = 4024 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)), 4025 ShrinkOperand(CI->getOperand(1))); 4026 } else if (auto *SI = dyn_cast<SelectInst>(I)) { 4027 NewI = B.CreateSelect(SI->getCondition(), 4028 ShrinkOperand(SI->getTrueValue()), 4029 ShrinkOperand(SI->getFalseValue())); 4030 } else if (auto *CI = dyn_cast<CastInst>(I)) { 4031 switch (CI->getOpcode()) { 4032 default: 4033 llvm_unreachable("Unhandled cast!"); 4034 case Instruction::Trunc: 4035 NewI = ShrinkOperand(CI->getOperand(0)); 4036 break; 4037 case Instruction::SExt: 4038 NewI = B.CreateSExtOrTrunc( 4039 CI->getOperand(0), 4040 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 4041 break; 4042 case Instruction::ZExt: 4043 NewI = B.CreateZExtOrTrunc( 4044 CI->getOperand(0), 4045 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 4046 break; 4047 } 4048 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { 4049 auto Elements0 = 4050 cast<VectorType>(SI->getOperand(0)->getType())->getElementCount(); 4051 auto *O0 = B.CreateZExtOrTrunc( 4052 SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0)); 4053 auto Elements1 = 4054 cast<VectorType>(SI->getOperand(1)->getType())->getElementCount(); 4055 auto *O1 = B.CreateZExtOrTrunc( 4056 SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1)); 4057 4058 NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask()); 4059 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) { 4060 // Don't do anything with the operands, just extend the result. 
4061 continue; 4062 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { 4063 auto Elements = 4064 cast<VectorType>(IE->getOperand(0)->getType())->getElementCount(); 4065 auto *O0 = B.CreateZExtOrTrunc( 4066 IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 4067 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); 4068 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); 4069 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { 4070 auto Elements = 4071 cast<VectorType>(EE->getOperand(0)->getType())->getElementCount(); 4072 auto *O0 = B.CreateZExtOrTrunc( 4073 EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 4074 NewI = B.CreateExtractElement(O0, EE->getOperand(2)); 4075 } else { 4076 // If we don't know what to do, be conservative and don't do anything. 4077 continue; 4078 } 4079 4080 // Lastly, extend the result. 4081 NewI->takeName(cast<Instruction>(I)); 4082 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); 4083 I->replaceAllUsesWith(Res); 4084 cast<Instruction>(I)->eraseFromParent(); 4085 Erased.insert(I); 4086 State.reset(Def, Res, Part); 4087 } 4088 } 4089 4090 // We'll have created a bunch of ZExts that are now parentless. Clean up. 4091 for (const auto &KV : Cost->getMinimalBitwidths()) { 4092 // If the value wasn't vectorized, we must maintain the original scalar 4093 // type. The absence of the value from State indicates that it 4094 // wasn't vectorized. 4095 // FIXME: Should not rely on getVPValue at this point. 4096 VPValue *Def = State.Plan->getVPValue(KV.first, true); 4097 if (!State.hasAnyVectorValue(Def)) 4098 continue; 4099 for (unsigned Part = 0; Part < UF; ++Part) { 4100 Value *I = State.get(Def, Part); 4101 ZExtInst *Inst = dyn_cast<ZExtInst>(I); 4102 if (Inst && Inst->use_empty()) { 4103 Value *NewI = Inst->getOperand(0); 4104 Inst->eraseFromParent(); 4105 State.reset(Def, NewI, Part); 4106 } 4107 } 4108 } 4109 } 4110 4111 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) { 4112 // Insert truncates and extends for any truncated instructions as hints to 4113 // InstCombine. 4114 if (VF.isVector()) 4115 truncateToMinimalBitwidths(State); 4116 4117 // Fix widened non-induction PHIs by setting up the PHI operands. 4118 if (OrigPHIsToFix.size()) { 4119 assert(EnableVPlanNativePath && 4120 "Unexpected non-induction PHIs for fixup in non VPlan-native path"); 4121 fixNonInductionPHIs(State); 4122 } 4123 4124 // At this point every instruction in the original loop is widened to a 4125 // vector form. Now we need to fix the recurrences in the loop. These PHI 4126 // nodes are currently empty because we did not want to introduce cycles. 4127 // This is the second stage of vectorizing recurrences. 4128 fixCrossIterationPHIs(State); 4129 4130 // Forget the original basic block. 4131 PSE.getSE()->forgetLoop(OrigLoop); 4132 4133 // If we inserted an edge from the middle block to the unique exit block, 4134 // update uses outside the loop (phis) to account for the newly inserted 4135 // edge. 4136 if (!Cost->requiresScalarEpilogue(VF)) { 4137 // Fix-up external users of the induction variables. 4138 for (auto &Entry : Legal->getInductionVars()) 4139 fixupIVUsers(Entry.first, Entry.second, 4140 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)), 4141 IVEndValues[Entry.first], LoopMiddleBlock); 4142 4143 fixLCSSAPHIs(State); 4144 } 4145 4146 for (Instruction *PI : PredicatedInstructions) 4147 sinkScalarOperands(&*PI); 4148 4149 // Remove redundant induction instructions. 
4150   cse(LoopVectorBody);
4151
4152   // Set/update profile weights for the vector and remainder loops as original
4153   // loop iterations are now distributed among them. Note that original loop
4154   // represented by LoopScalarBody becomes remainder loop after vectorization.
4155   //
4156   // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
4157   // end up with a slightly inaccurate result, but that should be OK since the
4158   // profile is not inherently precise anyway. Note also that a possible
4159   // bypass of the vector code caused by legality checks is ignored,
4160   // optimistically assigning all the weight to the vector loop.
4161   //
4162   // For scalable vectorization we can't know at compile time how many
4163   // iterations of the loop are handled in one vector iteration, so instead
4164   // assume a pessimistic vscale of '1'.
4165   setProfileInfoAfterUnrolling(
4166       LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody),
4167       LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF);
4168 }
4169
4170 void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) {
4171   // In order to support recurrences we need to be able to vectorize Phi nodes.
4172   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4173   // stage #2: We now need to fix the recurrences by adding incoming edges to
4174   // the currently empty PHI nodes. At this point every instruction in the
4175   // original loop is widened to a vector form so we can use them to construct
4176   // the incoming edges.
4177   VPBasicBlock *Header = State.Plan->getEntry()->getEntryBasicBlock();
4178   for (VPRecipeBase &R : Header->phis()) {
4179     if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R))
4180       fixReduction(ReductionPhi, State);
4181     else if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
4182       fixFirstOrderRecurrence(FOR, State);
4183   }
4184 }
4185
4186 void InnerLoopVectorizer::fixFirstOrderRecurrence(VPWidenPHIRecipe *PhiR,
4187                                                   VPTransformState &State) {
4188   // This is the second phase of vectorizing first-order recurrences. An
4189   // overview of the transformation is described below. Suppose we have the
4190   // following loop.
4191   //
4192   //   for (int i = 0; i < n; ++i)
4193   //     b[i] = a[i] - a[i - 1];
4194   //
4195   // There is a first-order recurrence on "a". For this loop, the shorthand
4196   // scalar IR looks like:
4197   //
4198   //   scalar.ph:
4199   //     s_init = a[-1]
4200   //     br scalar.body
4201   //
4202   //   scalar.body:
4203   //     i = phi [0, scalar.ph], [i+1, scalar.body]
4204   //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
4205   //     s2 = a[i]
4206   //     b[i] = s2 - s1
4207   //     br cond, scalar.body, ...
4208   //
4209   // In this example, s1 is a recurrence because its value depends on the
4210   // previous iteration. In the first phase of vectorization, we created a
4211   // vector phi v1 for s1. We now complete the vectorization and produce the
4212   // shorthand vector IR shown below (for VF = 4, UF = 1).
4213 // 4214 // vector.ph: 4215 // v_init = vector(..., ..., ..., a[-1]) 4216 // br vector.body 4217 // 4218 // vector.body 4219 // i = phi [0, vector.ph], [i+4, vector.body] 4220 // v1 = phi [v_init, vector.ph], [v2, vector.body] 4221 // v2 = a[i, i+1, i+2, i+3]; 4222 // v3 = vector(v1(3), v2(0, 1, 2)) 4223 // b[i, i+1, i+2, i+3] = v2 - v3 4224 // br cond, vector.body, middle.block 4225 // 4226 // middle.block: 4227 // x = v2(3) 4228 // br scalar.ph 4229 // 4230 // scalar.ph: 4231 // s_init = phi [x, middle.block], [a[-1], otherwise] 4232 // br scalar.body 4233 // 4234 // After execution completes the vector loop, we extract the next value of 4235 // the recurrence (x) to use as the initial value in the scalar loop. 4236 4237 // Extract the last vector element in the middle block. This will be the 4238 // initial value for the recurrence when jumping to the scalar loop. 4239 VPValue *PreviousDef = PhiR->getBackedgeValue(); 4240 Value *Incoming = State.get(PreviousDef, UF - 1); 4241 auto *ExtractForScalar = Incoming; 4242 auto *IdxTy = Builder.getInt32Ty(); 4243 if (VF.isVector()) { 4244 auto *One = ConstantInt::get(IdxTy, 1); 4245 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4246 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF); 4247 auto *LastIdx = Builder.CreateSub(RuntimeVF, One); 4248 ExtractForScalar = Builder.CreateExtractElement(ExtractForScalar, LastIdx, 4249 "vector.recur.extract"); 4250 } 4251 // Extract the second last element in the middle block if the 4252 // Phi is used outside the loop. We need to extract the phi itself 4253 // and not the last element (the phi update in the current iteration). This 4254 // will be the value when jumping to the exit block from the LoopMiddleBlock, 4255 // when the scalar loop is not run at all. 4256 Value *ExtractForPhiUsedOutsideLoop = nullptr; 4257 if (VF.isVector()) { 4258 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF); 4259 auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2)); 4260 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement( 4261 Incoming, Idx, "vector.recur.extract.for.phi"); 4262 } else if (UF > 1) 4263 // When loop is unrolled without vectorizing, initialize 4264 // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value 4265 // of `Incoming`. This is analogous to the vectorized case above: extracting 4266 // the second last element when VF > 1. 4267 ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2); 4268 4269 // Fix the initial value of the original recurrence in the scalar loop. 4270 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin()); 4271 PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue()); 4272 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init"); 4273 auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue(); 4274 for (auto *BB : predecessors(LoopScalarPreHeader)) { 4275 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit; 4276 Start->addIncoming(Incoming, BB); 4277 } 4278 4279 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start); 4280 Phi->setName("scalar.recur"); 4281 4282 // Finally, fix users of the recurrence outside the loop. The users will need 4283 // either the last value of the scalar recurrence or the last value of the 4284 // vector recurrence we extracted in the middle block. Since the loop is in 4285 // LCSSA form, we just need to find all the phi nodes for the original scalar 4286 // recurrence in the exit block, and then add an edge for the middle block. 
4287   // Note that LCSSA does not imply single entry when the original scalar loop
4288   // had multiple exiting edges (as we always run the last iteration in the
4289   // scalar epilogue); in that case, there is no edge from middle to exit and
4290   // thus no phis which need to be updated.
4291   if (!Cost->requiresScalarEpilogue(VF))
4292     for (PHINode &LCSSAPhi : LoopExitBlock->phis())
4293       if (llvm::is_contained(LCSSAPhi.incoming_values(), Phi))
4294         LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
4295 }
4296
4297 void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
4298                                        VPTransformState &State) {
4299   PHINode *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
4300   // Get its reduction variable descriptor.
4301   assert(Legal->isReductionVariable(OrigPhi) &&
4302          "Unable to find the reduction variable");
4303   const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
4304
4305   RecurKind RK = RdxDesc.getRecurrenceKind();
4306   TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
4307   Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
4308   setDebugLocFromInst(ReductionStartValue);
4309
4310   VPValue *LoopExitInstDef = PhiR->getBackedgeValue();
4311   // This is the vector-clone of the value that leaves the loop.
4312   Type *VecTy = State.get(LoopExitInstDef, 0)->getType();
4313
4314   // Wrap flags are in general invalid after vectorization, clear them.
4315   clearReductionWrapFlags(RdxDesc, State);
4316
4317   // Before each round, move the insertion point right between
4318   // the PHIs and the values we are going to write.
4319   // This allows us to write both PHINodes and the extractelement
4320   // instructions.
4321   Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
4322
4323   setDebugLocFromInst(LoopExitInst);
4324
4325   Type *PhiTy = OrigPhi->getType();
4326   // If tail is folded by masking, the vector value to leave the loop should be
4327   // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
4328   // instead of the former. For an inloop reduction the reduction will already
4329   // be predicated, and does not need to be handled here.
4330   if (Cost->foldTailByMasking() && !PhiR->isInLoop()) {
4331     for (unsigned Part = 0; Part < UF; ++Part) {
4332       Value *VecLoopExitInst = State.get(LoopExitInstDef, Part);
4333       Value *Sel = nullptr;
4334       for (User *U : VecLoopExitInst->users()) {
4335         if (isa<SelectInst>(U)) {
4336           assert(!Sel && "Reduction exit feeding two selects");
4337           Sel = U;
4338         } else
4339           assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
4340       }
4341       assert(Sel && "Reduction exit feeds no select");
4342       State.reset(LoopExitInstDef, Sel, Part);
4343
4344       // If the target can create a predicated operator for the reduction at no
4345       // extra cost in the loop (for example a predicated vadd), it can be
4346       // cheaper for the select to remain in the loop than be sunk out of it,
4347       // and so use the select value for the phi instead of the old
4348       // LoopExitValue.
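      // Illustrative shorthand for a tail-folded integer add reduction
      // (an assumed example, not exact IR):
      //   %rdx = phi <4 x i32> [ %init, %ph ], [ %sel, %latch ]
      //   %add = add <4 x i32> %rdx, %val
      //   %sel = select <4 x i1> %mask, <4 x i32> %add, <4 x i32> %rdx
      // Feeding %sel (rather than %add) back into the phi keeps the select
      // inside the loop, where such targets can fold it into a predicated add.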
4349 if (PreferPredicatedReductionSelect || 4350 TTI->preferPredicatedReductionSelect( 4351 RdxDesc.getOpcode(), PhiTy, 4352 TargetTransformInfo::ReductionFlags())) { 4353 auto *VecRdxPhi = 4354 cast<PHINode>(State.get(PhiR, Part)); 4355 VecRdxPhi->setIncomingValueForBlock( 4356 LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel); 4357 } 4358 } 4359 } 4360 4361 // If the vector reduction can be performed in a smaller type, we truncate 4362 // then extend the loop exit value to enable InstCombine to evaluate the 4363 // entire expression in the smaller type. 4364 if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) { 4365 assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!"); 4366 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); 4367 Builder.SetInsertPoint( 4368 LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator()); 4369 VectorParts RdxParts(UF); 4370 for (unsigned Part = 0; Part < UF; ++Part) { 4371 RdxParts[Part] = State.get(LoopExitInstDef, Part); 4372 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4373 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy) 4374 : Builder.CreateZExt(Trunc, VecTy); 4375 for (User *U : llvm::make_early_inc_range(RdxParts[Part]->users())) 4376 if (U != Trunc) { 4377 U->replaceUsesOfWith(RdxParts[Part], Extnd); 4378 RdxParts[Part] = Extnd; 4379 } 4380 } 4381 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 4382 for (unsigned Part = 0; Part < UF; ++Part) { 4383 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4384 State.reset(LoopExitInstDef, RdxParts[Part], Part); 4385 } 4386 } 4387 4388 // Reduce all of the unrolled parts into a single vector. 4389 Value *ReducedPartRdx = State.get(LoopExitInstDef, 0); 4390 unsigned Op = RecurrenceDescriptor::getOpcode(RK); 4391 4392 // The middle block terminator has already been assigned a DebugLoc here (the 4393 // OrigLoop's single latch terminator). We want the whole middle block to 4394 // appear to execute on this line because: (a) it is all compiler generated, 4395 // (b) these instructions are always executed after evaluating the latch 4396 // conditional branch, and (c) other passes may add new predecessors which 4397 // terminate on this line. This is the easiest way to ensure we don't 4398 // accidentally cause an extra step back into the loop while debugging. 4399 setDebugLocFromInst(LoopMiddleBlock->getTerminator()); 4400 if (PhiR->isOrdered()) 4401 ReducedPartRdx = State.get(LoopExitInstDef, UF - 1); 4402 else { 4403 // Floating-point operations should have some FMF to enable the reduction. 4404 IRBuilderBase::FastMathFlagGuard FMFG(Builder); 4405 Builder.setFastMathFlags(RdxDesc.getFastMathFlags()); 4406 for (unsigned Part = 1; Part < UF; ++Part) { 4407 Value *RdxPart = State.get(LoopExitInstDef, Part); 4408 if (Op != Instruction::ICmp && Op != Instruction::FCmp) { 4409 ReducedPartRdx = Builder.CreateBinOp( 4410 (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx"); 4411 } else if (RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK)) 4412 ReducedPartRdx = createSelectCmpOp(Builder, ReductionStartValue, RK, 4413 ReducedPartRdx, RdxPart); 4414 else 4415 ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart); 4416 } 4417 } 4418 4419 // Create the reduction after the loop. Note that inloop reductions create the 4420 // target reduction in the loop using a Reduction recipe. 
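  // Illustrative shorthand for an add reduction with VF = 4 and UF = 2 (an
  // assumed example, not exact IR): the unrolled parts are combined first and
  // then reduced to a scalar in the middle block:
  //   %bin.rdx = add <4 x i32> %part.1, %part.0
  //   %rdx     = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %bin.rdx)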
4421   if (VF.isVector() && !PhiR->isInLoop()) {
4422     ReducedPartRdx =
4423         createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, OrigPhi);
4424     // If the reduction can be performed in a smaller type, we need to extend
4425     // the reduction to the wider type before we branch to the original loop.
4426     if (PhiTy != RdxDesc.getRecurrenceType())
4427       ReducedPartRdx = RdxDesc.isSigned()
4428                            ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
4429                            : Builder.CreateZExt(ReducedPartRdx, PhiTy);
4430   }
4431
4432   // Create a phi node that merges control-flow from the backedge-taken check
4433   // block and the middle block.
4434   PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx",
4435                                         LoopScalarPreHeader->getTerminator());
4436   for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
4437     BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
4438   BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
4439
4440   // Now, we need to fix the users of the reduction variable
4441   // inside and outside of the scalar remainder loop.
4442
4443   // We know that the loop is in LCSSA form. We need to update the PHI nodes
4444   // in the exit blocks. See comment on the analogous loop in
4445   // fixFirstOrderRecurrence for a more complete explanation of the logic.
4446   if (!Cost->requiresScalarEpilogue(VF))
4447     for (PHINode &LCSSAPhi : LoopExitBlock->phis())
4448       if (llvm::is_contained(LCSSAPhi.incoming_values(), LoopExitInst))
4449         LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
4450
4451   // Fix the scalar loop reduction variable with the incoming reduction sum
4452   // from the vector body and from the backedge value.
4453   int IncomingEdgeBlockIdx =
4454       OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch());
4455   assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
4456   // Pick the other block.
4457   int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
4458   OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
4459   OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
4460 }
4461
4462 void InnerLoopVectorizer::clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc,
4463                                                   VPTransformState &State) {
4464   RecurKind RK = RdxDesc.getRecurrenceKind();
4465   if (RK != RecurKind::Add && RK != RecurKind::Mul)
4466     return;
4467
4468   Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr();
4469   assert(LoopExitInstr && "null loop exit instruction");
4470   SmallVector<Instruction *, 8> Worklist;
4471   SmallPtrSet<Instruction *, 8> Visited;
4472   Worklist.push_back(LoopExitInstr);
4473   Visited.insert(LoopExitInstr);
4474
4475   while (!Worklist.empty()) {
4476     Instruction *Cur = Worklist.pop_back_val();
4477     if (isa<OverflowingBinaryOperator>(Cur))
4478       for (unsigned Part = 0; Part < UF; ++Part) {
4479         // FIXME: Should not rely on getVPValue at this point.
4480         Value *V = State.get(State.Plan->getVPValue(Cur, true), Part);
4481         cast<Instruction>(V)->dropPoisonGeneratingFlags();
4482       }
4483
4484     for (User *U : Cur->users()) {
4485       Instruction *UI = cast<Instruction>(U);
4486       if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) &&
4487           Visited.insert(UI).second)
4488         Worklist.push_back(UI);
4489     }
4490   }
4491 }
4492
4493 void InnerLoopVectorizer::fixLCSSAPHIs(VPTransformState &State) {
4494   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
4495     if (LCSSAPhi.getBasicBlockIndex(LoopMiddleBlock) != -1)
4496       // Some phis were already updated by hand by the reduction and recurrence
4497       // code above; leave them alone.
4498 continue; 4499 4500 auto *IncomingValue = LCSSAPhi.getIncomingValue(0); 4501 // Non-instruction incoming values will have only one value. 4502 4503 VPLane Lane = VPLane::getFirstLane(); 4504 if (isa<Instruction>(IncomingValue) && 4505 !Cost->isUniformAfterVectorization(cast<Instruction>(IncomingValue), 4506 VF)) 4507 Lane = VPLane::getLastLaneForVF(VF); 4508 4509 // Can be a loop invariant incoming value or the last scalar value to be 4510 // extracted from the vectorized loop. 4511 // FIXME: Should not rely on getVPValue at this point. 4512 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4513 Value *lastIncomingValue = 4514 OrigLoop->isLoopInvariant(IncomingValue) 4515 ? IncomingValue 4516 : State.get(State.Plan->getVPValue(IncomingValue, true), 4517 VPIteration(UF - 1, Lane)); 4518 LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock); 4519 } 4520 } 4521 4522 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { 4523 // The basic block and loop containing the predicated instruction. 4524 auto *PredBB = PredInst->getParent(); 4525 auto *VectorLoop = LI->getLoopFor(PredBB); 4526 4527 // Initialize a worklist with the operands of the predicated instruction. 4528 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end()); 4529 4530 // Holds instructions that we need to analyze again. An instruction may be 4531 // reanalyzed if we don't yet know if we can sink it or not. 4532 SmallVector<Instruction *, 8> InstsToReanalyze; 4533 4534 // Returns true if a given use occurs in the predicated block. Phi nodes use 4535 // their operands in their corresponding predecessor blocks. 4536 auto isBlockOfUsePredicated = [&](Use &U) -> bool { 4537 auto *I = cast<Instruction>(U.getUser()); 4538 BasicBlock *BB = I->getParent(); 4539 if (auto *Phi = dyn_cast<PHINode>(I)) 4540 BB = Phi->getIncomingBlock( 4541 PHINode::getIncomingValueNumForOperand(U.getOperandNo())); 4542 return BB == PredBB; 4543 }; 4544 4545 // Iteratively sink the scalarized operands of the predicated instruction 4546 // into the block we created for it. When an instruction is sunk, it's 4547 // operands are then added to the worklist. The algorithm ends after one pass 4548 // through the worklist doesn't sink a single instruction. 4549 bool Changed; 4550 do { 4551 // Add the instructions that need to be reanalyzed to the worklist, and 4552 // reset the changed indicator. 4553 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end()); 4554 InstsToReanalyze.clear(); 4555 Changed = false; 4556 4557 while (!Worklist.empty()) { 4558 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val()); 4559 4560 // We can't sink an instruction if it is a phi node, is not in the loop, 4561 // or may have side effects. 4562 if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) || 4563 I->mayHaveSideEffects()) 4564 continue; 4565 4566 // If the instruction is already in PredBB, check if we can sink its 4567 // operands. In that case, VPlan's sinkScalarOperands() succeeded in 4568 // sinking the scalar instruction I, hence it appears in PredBB; but it 4569 // may have failed to sink I's operands (recursively), which we try 4570 // (again) here. 4571 if (I->getParent() == PredBB) { 4572 Worklist.insert(I->op_begin(), I->op_end()); 4573 continue; 4574 } 4575 4576 // It's legal to sink the instruction if all its uses occur in the 4577 // predicated block. Otherwise, there's nothing to do yet, and we may 4578 // need to reanalyze the instruction. 
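      // (For example, an address computation feeding only a scalarized,
      // predicated store can be sunk into the predicated block, whereas one
      // that also has an unpredicated user cannot.)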
4579 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) { 4580 InstsToReanalyze.push_back(I); 4581 continue; 4582 } 4583 4584 // Move the instruction to the beginning of the predicated block, and add 4585 // it's operands to the worklist. 4586 I->moveBefore(&*PredBB->getFirstInsertionPt()); 4587 Worklist.insert(I->op_begin(), I->op_end()); 4588 4589 // The sinking may have enabled other instructions to be sunk, so we will 4590 // need to iterate. 4591 Changed = true; 4592 } 4593 } while (Changed); 4594 } 4595 4596 void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) { 4597 for (PHINode *OrigPhi : OrigPHIsToFix) { 4598 VPWidenPHIRecipe *VPPhi = 4599 cast<VPWidenPHIRecipe>(State.Plan->getVPValue(OrigPhi)); 4600 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0)); 4601 // Make sure the builder has a valid insert point. 4602 Builder.SetInsertPoint(NewPhi); 4603 for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) { 4604 VPValue *Inc = VPPhi->getIncomingValue(i); 4605 VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i); 4606 NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]); 4607 } 4608 } 4609 } 4610 4611 bool InnerLoopVectorizer::useOrderedReductions(RecurrenceDescriptor &RdxDesc) { 4612 return Cost->useOrderedReductions(RdxDesc); 4613 } 4614 4615 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, 4616 VPUser &Operands, unsigned UF, 4617 ElementCount VF, bool IsPtrLoopInvariant, 4618 SmallBitVector &IsIndexLoopInvariant, 4619 VPTransformState &State) { 4620 // Construct a vector GEP by widening the operands of the scalar GEP as 4621 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP 4622 // results in a vector of pointers when at least one operand of the GEP 4623 // is vector-typed. Thus, to keep the representation compact, we only use 4624 // vector-typed operands for loop-varying values. 4625 4626 if (VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { 4627 // If we are vectorizing, but the GEP has only loop-invariant operands, 4628 // the GEP we build (by only using vector-typed operands for 4629 // loop-varying values) would be a scalar pointer. Thus, to ensure we 4630 // produce a vector of pointers, we need to either arbitrarily pick an 4631 // operand to broadcast, or broadcast a clone of the original GEP. 4632 // Here, we broadcast a clone of the original. 4633 // 4634 // TODO: If at some point we decide to scalarize instructions having 4635 // loop-invariant operands, this special case will no longer be 4636 // required. We would add the scalarization decision to 4637 // collectLoopScalars() and teach getVectorValue() to broadcast 4638 // the lane-zero scalar value. 4639 auto *Clone = Builder.Insert(GEP->clone()); 4640 for (unsigned Part = 0; Part < UF; ++Part) { 4641 Value *EntryPart = Builder.CreateVectorSplat(VF, Clone); 4642 State.set(VPDef, EntryPart, Part); 4643 addMetadata(EntryPart, GEP); 4644 } 4645 } else { 4646 // If the GEP has at least one loop-varying operand, we are sure to 4647 // produce a vector of pointers. But if we are only unrolling, we want 4648 // to produce a scalar GEP for each unroll part. Thus, the GEP we 4649 // produce with the code below will be scalar (if VF == 1) or vector 4650 // (otherwise). Note that for the unroll-only case, we still maintain 4651 // values in the vector mapping with initVector, as we do for other 4652 // instructions. 4653 for (unsigned Part = 0; Part < UF; ++Part) { 4654 // The pointer operand of the new GEP. 
If it's loop-invariant, we 4655 // won't broadcast it. 4656 auto *Ptr = IsPtrLoopInvariant 4657 ? State.get(Operands.getOperand(0), VPIteration(0, 0)) 4658 : State.get(Operands.getOperand(0), Part); 4659 4660 // Collect all the indices for the new GEP. If any index is 4661 // loop-invariant, we won't broadcast it. 4662 SmallVector<Value *, 4> Indices; 4663 for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) { 4664 VPValue *Operand = Operands.getOperand(I); 4665 if (IsIndexLoopInvariant[I - 1]) 4666 Indices.push_back(State.get(Operand, VPIteration(0, 0))); 4667 else 4668 Indices.push_back(State.get(Operand, Part)); 4669 } 4670 4671 // Create the new GEP. Note that this GEP may be a scalar if VF == 1, 4672 // but it should be a vector, otherwise. 4673 auto *NewGEP = 4674 GEP->isInBounds() 4675 ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr, 4676 Indices) 4677 : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices); 4678 assert((VF.isScalar() || NewGEP->getType()->isVectorTy()) && 4679 "NewGEP is not a pointer vector"); 4680 State.set(VPDef, NewGEP, Part); 4681 addMetadata(NewGEP, GEP); 4682 } 4683 } 4684 } 4685 4686 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, 4687 VPWidenPHIRecipe *PhiR, 4688 VPTransformState &State) { 4689 PHINode *P = cast<PHINode>(PN); 4690 if (EnableVPlanNativePath) { 4691 // Currently we enter here in the VPlan-native path for non-induction 4692 // PHIs where all control flow is uniform. We simply widen these PHIs. 4693 // Create a vector phi with no operands - the vector phi operands will be 4694 // set at the end of vector code generation. 4695 Type *VecTy = (State.VF.isScalar()) 4696 ? PN->getType() 4697 : VectorType::get(PN->getType(), State.VF); 4698 Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi"); 4699 State.set(PhiR, VecPhi, 0); 4700 OrigPHIsToFix.push_back(P); 4701 4702 return; 4703 } 4704 4705 assert(PN->getParent() == OrigLoop->getHeader() && 4706 "Non-header phis should have been handled elsewhere"); 4707 4708 // In order to support recurrences we need to be able to vectorize Phi nodes. 4709 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 4710 // stage #1: We create a new vector PHI node with no incoming edges. We'll use 4711 // this value when we vectorize all of the instructions that use the PHI. 4712 4713 assert(!Legal->isReductionVariable(P) && 4714 "reductions should be handled elsewhere"); 4715 4716 setDebugLocFromInst(P); 4717 4718 // This PHINode must be an induction variable. 4719 // Make sure that we know about it. 4720 assert(Legal->getInductionVars().count(P) && "Not an induction variable"); 4721 4722 InductionDescriptor II = Legal->getInductionVars().lookup(P); 4723 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 4724 4725 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 4726 // which can be found from the original scalar operations. 4727 switch (II.getKind()) { 4728 case InductionDescriptor::IK_NoInduction: 4729 llvm_unreachable("Unknown induction"); 4730 case InductionDescriptor::IK_IntInduction: 4731 case InductionDescriptor::IK_FpInduction: 4732 llvm_unreachable("Integer/fp induction is handled elsewhere."); 4733 case InductionDescriptor::IK_PtrInduction: { 4734 // Handle the pointer induction variable case. 
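    // Illustrative source-level shape (an assumed example):
    //   for (int i = 0; i < n; ++i, ++p)
    //     ... = *p;
    // Here "p" is a pointer induction. If it remains scalar after
    // vectorization, per-lane GEPs are emitted below; otherwise a pointer
    // phi is built and a vector of pointers is formed using a step vector.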
4735 assert(P->getType()->isPointerTy() && "Unexpected type."); 4736 4737 if (Cost->isScalarAfterVectorization(P, State.VF)) { 4738 // This is the normalized GEP that starts counting at zero. 4739 Value *PtrInd = 4740 Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType()); 4741 // Determine the number of scalars we need to generate for each unroll 4742 // iteration. If the instruction is uniform, we only need to generate the 4743 // first lane. Otherwise, we generate all VF values. 4744 bool IsUniform = Cost->isUniformAfterVectorization(P, State.VF); 4745 unsigned Lanes = IsUniform ? 1 : State.VF.getKnownMinValue(); 4746 4747 bool NeedsVectorIndex = !IsUniform && VF.isScalable(); 4748 Value *UnitStepVec = nullptr, *PtrIndSplat = nullptr; 4749 if (NeedsVectorIndex) { 4750 Type *VecIVTy = VectorType::get(PtrInd->getType(), VF); 4751 UnitStepVec = Builder.CreateStepVector(VecIVTy); 4752 PtrIndSplat = Builder.CreateVectorSplat(VF, PtrInd); 4753 } 4754 4755 for (unsigned Part = 0; Part < UF; ++Part) { 4756 Value *PartStart = 4757 createStepForVF(Builder, PtrInd->getType(), VF, Part); 4758 4759 if (NeedsVectorIndex) { 4760 // Here we cache the whole vector, which means we can support the 4761 // extraction of any lane. However, in some cases the extractelement 4762 // instruction that is generated for scalar uses of this vector (e.g. 4763 // a load instruction) is not folded away. Therefore we still 4764 // calculate values for the first n lanes to avoid redundant moves 4765 // (when extracting the 0th element) and to produce scalar code (i.e. 4766 // additional add/gep instructions instead of expensive extractelement 4767 // instructions) when extracting higher-order elements. 4768 Value *PartStartSplat = Builder.CreateVectorSplat(VF, PartStart); 4769 Value *Indices = Builder.CreateAdd(PartStartSplat, UnitStepVec); 4770 Value *GlobalIndices = Builder.CreateAdd(PtrIndSplat, Indices); 4771 Value *SclrGep = 4772 emitTransformedIndex(Builder, GlobalIndices, PSE.getSE(), DL, II); 4773 SclrGep->setName("next.gep"); 4774 State.set(PhiR, SclrGep, Part); 4775 } 4776 4777 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 4778 Value *Idx = Builder.CreateAdd( 4779 PartStart, ConstantInt::get(PtrInd->getType(), Lane)); 4780 Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); 4781 Value *SclrGep = 4782 emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II); 4783 SclrGep->setName("next.gep"); 4784 State.set(PhiR, SclrGep, VPIteration(Part, Lane)); 4785 } 4786 } 4787 return; 4788 } 4789 assert(isa<SCEVConstant>(II.getStep()) && 4790 "Induction step not a SCEV constant!"); 4791 Type *PhiType = II.getStep()->getType(); 4792 4793 // Build a pointer phi 4794 Value *ScalarStartValue = II.getStartValue(); 4795 Type *ScStValueType = ScalarStartValue->getType(); 4796 PHINode *NewPointerPhi = 4797 PHINode::Create(ScStValueType, 2, "pointer.phi", Induction); 4798 NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader); 4799 4800 // A pointer induction, performed by using a gep 4801 BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 4802 Instruction *InductionLoc = LoopLatch->getTerminator(); 4803 const SCEV *ScalarStep = II.getStep(); 4804 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 4805 Value *ScalarStepValue = 4806 Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc); 4807 Value *RuntimeVF = getRuntimeVF(Builder, PhiType, VF); 4808 Value *NumUnrolledElems = 4809 Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF)); 4810 Value *InductionGEP = 
GetElementPtrInst::Create( 4811 II.getElementType(), NewPointerPhi, 4812 Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind", 4813 InductionLoc); 4814 NewPointerPhi->addIncoming(InductionGEP, LoopLatch); 4815 4816 // Create UF many actual address geps that use the pointer 4817 // phi as base and a vectorized version of the step value 4818 // (<step*0, ..., step*N>) as offset. 4819 for (unsigned Part = 0; Part < State.UF; ++Part) { 4820 Type *VecPhiType = VectorType::get(PhiType, State.VF); 4821 Value *StartOffsetScalar = 4822 Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part)); 4823 Value *StartOffset = 4824 Builder.CreateVectorSplat(State.VF, StartOffsetScalar); 4825 // Create a vector of consecutive numbers from zero to VF. 4826 StartOffset = 4827 Builder.CreateAdd(StartOffset, Builder.CreateStepVector(VecPhiType)); 4828 4829 Value *GEP = Builder.CreateGEP( 4830 II.getElementType(), NewPointerPhi, 4831 Builder.CreateMul( 4832 StartOffset, Builder.CreateVectorSplat(State.VF, ScalarStepValue), 4833 "vector.gep")); 4834 State.set(PhiR, GEP, Part); 4835 } 4836 } 4837 } 4838 } 4839 4840 /// A helper function for checking whether an integer division-related 4841 /// instruction may divide by zero (in which case it must be predicated if 4842 /// executed conditionally in the scalar code). 4843 /// TODO: It may be worthwhile to generalize and check isKnownNonZero(). 4844 /// Non-zero divisors that are non compile-time constants will not be 4845 /// converted into multiplication, so we will still end up scalarizing 4846 /// the division, but can do so w/o predication. 4847 static bool mayDivideByZero(Instruction &I) { 4848 assert((I.getOpcode() == Instruction::UDiv || 4849 I.getOpcode() == Instruction::SDiv || 4850 I.getOpcode() == Instruction::URem || 4851 I.getOpcode() == Instruction::SRem) && 4852 "Unexpected instruction"); 4853 Value *Divisor = I.getOperand(1); 4854 auto *CInt = dyn_cast<ConstantInt>(Divisor); 4855 return !CInt || CInt->isZero(); 4856 } 4857 4858 void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def, 4859 VPUser &User, 4860 VPTransformState &State) { 4861 switch (I.getOpcode()) { 4862 case Instruction::Call: 4863 case Instruction::Br: 4864 case Instruction::PHI: 4865 case Instruction::GetElementPtr: 4866 case Instruction::Select: 4867 llvm_unreachable("This instruction is handled by a different recipe."); 4868 case Instruction::UDiv: 4869 case Instruction::SDiv: 4870 case Instruction::SRem: 4871 case Instruction::URem: 4872 case Instruction::Add: 4873 case Instruction::FAdd: 4874 case Instruction::Sub: 4875 case Instruction::FSub: 4876 case Instruction::FNeg: 4877 case Instruction::Mul: 4878 case Instruction::FMul: 4879 case Instruction::FDiv: 4880 case Instruction::FRem: 4881 case Instruction::Shl: 4882 case Instruction::LShr: 4883 case Instruction::AShr: 4884 case Instruction::And: 4885 case Instruction::Or: 4886 case Instruction::Xor: { 4887 // Just widen unops and binops. 4888 setDebugLocFromInst(&I); 4889 4890 for (unsigned Part = 0; Part < UF; ++Part) { 4891 SmallVector<Value *, 2> Ops; 4892 for (VPValue *VPOp : User.operands()) 4893 Ops.push_back(State.get(VPOp, Part)); 4894 4895 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); 4896 4897 if (auto *VecOp = dyn_cast<Instruction>(V)) 4898 VecOp->copyIRFlags(&I); 4899 4900 // Use this vector value for all users of the original instruction. 
4901 State.set(Def, V, Part); 4902 addMetadata(V, &I); 4903 } 4904 4905 break; 4906 } 4907 case Instruction::ICmp: 4908 case Instruction::FCmp: { 4909 // Widen compares. Generate vector compares. 4910 bool FCmp = (I.getOpcode() == Instruction::FCmp); 4911 auto *Cmp = cast<CmpInst>(&I); 4912 setDebugLocFromInst(Cmp); 4913 for (unsigned Part = 0; Part < UF; ++Part) { 4914 Value *A = State.get(User.getOperand(0), Part); 4915 Value *B = State.get(User.getOperand(1), Part); 4916 Value *C = nullptr; 4917 if (FCmp) { 4918 // Propagate fast math flags. 4919 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 4920 Builder.setFastMathFlags(Cmp->getFastMathFlags()); 4921 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); 4922 } else { 4923 C = Builder.CreateICmp(Cmp->getPredicate(), A, B); 4924 } 4925 State.set(Def, C, Part); 4926 addMetadata(C, &I); 4927 } 4928 4929 break; 4930 } 4931 4932 case Instruction::ZExt: 4933 case Instruction::SExt: 4934 case Instruction::FPToUI: 4935 case Instruction::FPToSI: 4936 case Instruction::FPExt: 4937 case Instruction::PtrToInt: 4938 case Instruction::IntToPtr: 4939 case Instruction::SIToFP: 4940 case Instruction::UIToFP: 4941 case Instruction::Trunc: 4942 case Instruction::FPTrunc: 4943 case Instruction::BitCast: { 4944 auto *CI = cast<CastInst>(&I); 4945 setDebugLocFromInst(CI); 4946 4947 /// Vectorize casts. 4948 Type *DestTy = 4949 (VF.isScalar()) ? CI->getType() : VectorType::get(CI->getType(), VF); 4950 4951 for (unsigned Part = 0; Part < UF; ++Part) { 4952 Value *A = State.get(User.getOperand(0), Part); 4953 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); 4954 State.set(Def, Cast, Part); 4955 addMetadata(Cast, &I); 4956 } 4957 break; 4958 } 4959 default: 4960 // This instruction is not vectorized by simple widening. 4961 LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); 4962 llvm_unreachable("Unhandled instruction!"); 4963 } // end of switch. 4964 } 4965 4966 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, 4967 VPUser &ArgOperands, 4968 VPTransformState &State) { 4969 assert(!isa<DbgInfoIntrinsic>(I) && 4970 "DbgInfoIntrinsic should have been dropped during VPlan construction"); 4971 setDebugLocFromInst(&I); 4972 4973 Module *M = I.getParent()->getParent()->getParent(); 4974 auto *CI = cast<CallInst>(&I); 4975 4976 SmallVector<Type *, 4> Tys; 4977 for (Value *ArgOperand : CI->args()) 4978 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue())); 4979 4980 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 4981 4982 // The flag shows whether we use Intrinsic or a usual Call for vectorized 4983 // version of the instruction. 4984 // Is it beneficial to perform intrinsic call compared to lib call? 4985 bool NeedToScalarize = false; 4986 InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); 4987 InstructionCost IntrinsicCost = ID ? Cost->getVectorIntrinsicCost(CI, VF) : 0; 4988 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 4989 assert((UseVectorIntrinsic || !NeedToScalarize) && 4990 "Instruction should be scalarized elsewhere."); 4991 assert((IntrinsicCost.isValid() || CallCost.isValid()) && 4992 "Either the intrinsic cost or vector call cost must be valid"); 4993 4994 for (unsigned Part = 0; Part < UF; ++Part) { 4995 SmallVector<Type *, 2> TysForDecl = {CI->getType()}; 4996 SmallVector<Value *, 4> Args; 4997 for (auto &I : enumerate(ArgOperands.operands())) { 4998 // Some intrinsics have a scalar argument - don't replace it with a 4999 // vector. 
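      // (For instance, the integer exponent operand of llvm.powi is expected
      // to stay scalar.)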
5000 Value *Arg; 5001 if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index())) 5002 Arg = State.get(I.value(), Part); 5003 else { 5004 Arg = State.get(I.value(), VPIteration(0, 0)); 5005 if (hasVectorInstrinsicOverloadedScalarOpd(ID, I.index())) 5006 TysForDecl.push_back(Arg->getType()); 5007 } 5008 Args.push_back(Arg); 5009 } 5010 5011 Function *VectorF; 5012 if (UseVectorIntrinsic) { 5013 // Use vector version of the intrinsic. 5014 if (VF.isVector()) 5015 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); 5016 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); 5017 assert(VectorF && "Can't retrieve vector intrinsic."); 5018 } else { 5019 // Use vector version of the function call. 5020 const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 5021 #ifndef NDEBUG 5022 assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr && 5023 "Can't create vector function."); 5024 #endif 5025 VectorF = VFDatabase(*CI).getVectorizedFunction(Shape); 5026 } 5027 SmallVector<OperandBundleDef, 1> OpBundles; 5028 CI->getOperandBundlesAsDefs(OpBundles); 5029 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles); 5030 5031 if (isa<FPMathOperator>(V)) 5032 V->copyFastMathFlags(CI); 5033 5034 State.set(Def, V, Part); 5035 addMetadata(V, &I); 5036 } 5037 } 5038 5039 void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, VPValue *VPDef, 5040 VPUser &Operands, 5041 bool InvariantCond, 5042 VPTransformState &State) { 5043 setDebugLocFromInst(&I); 5044 5045 // The condition can be loop invariant but still defined inside the 5046 // loop. This means that we can't just use the original 'cond' value. 5047 // We have to take the 'vectorized' value and pick the first lane. 5048 // Instcombine will make this a no-op. 5049 auto *InvarCond = InvariantCond 5050 ? State.get(Operands.getOperand(0), VPIteration(0, 0)) 5051 : nullptr; 5052 5053 for (unsigned Part = 0; Part < UF; ++Part) { 5054 Value *Cond = 5055 InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part); 5056 Value *Op0 = State.get(Operands.getOperand(1), Part); 5057 Value *Op1 = State.get(Operands.getOperand(2), Part); 5058 Value *Sel = Builder.CreateSelect(Cond, Op0, Op1); 5059 State.set(VPDef, Sel, Part); 5060 addMetadata(Sel, &I); 5061 } 5062 } 5063 5064 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { 5065 // We should not collect Scalars more than once per VF. Right now, this 5066 // function is called from collectUniformsAndScalars(), which already does 5067 // this check. Collecting Scalars for VF=1 does not make any sense. 5068 assert(VF.isVector() && Scalars.find(VF) == Scalars.end() && 5069 "This function should not be visited twice for the same VF"); 5070 5071 SmallSetVector<Instruction *, 8> Worklist; 5072 5073 // These sets are used to seed the analysis with pointers used by memory 5074 // accesses that will remain scalar. 5075 SmallSetVector<Instruction *, 8> ScalarPtrs; 5076 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; 5077 auto *Latch = TheLoop->getLoopLatch(); 5078 5079 // A helper that returns true if the use of Ptr by MemAccess will be scalar. 5080 // The pointer operands of loads and stores will be scalar as long as the 5081 // memory access is not a gather or scatter operation. The value operand of a 5082 // store will remain scalar if the store is scalarized. 
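  // (For example, for "store i32 %x, i32* %p", the stored value %x only stays
  // scalar if the store itself is scalarized, while %p stays scalar for any
  // access that is not a gather/scatter.)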
5083   auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
5084     InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
5085     assert(WideningDecision != CM_Unknown &&
5086            "Widening decision should be ready at this moment");
5087     if (auto *Store = dyn_cast<StoreInst>(MemAccess))
5088       if (Ptr == Store->getValueOperand())
5089         return WideningDecision == CM_Scalarize;
5090     assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
5091            "Ptr is neither a value nor a pointer operand");
5092     return WideningDecision != CM_GatherScatter;
5093   };
5094
5095   // A helper that returns true if the given value is a bitcast or
5096   // getelementptr instruction contained in the loop.
5097   auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
5098     return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
5099             isa<GetElementPtrInst>(V)) &&
5100            !TheLoop->isLoopInvariant(V);
5101   };
5102
5103   auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) {
5104     if (!isa<PHINode>(Ptr) ||
5105         !Legal->getInductionVars().count(cast<PHINode>(Ptr)))
5106       return false;
5107     auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)];
5108     if (Induction.getKind() != InductionDescriptor::IK_PtrInduction)
5109       return false;
5110     return isScalarUse(MemAccess, Ptr);
5111   };
5112
5113   // A helper that evaluates a memory access's use of a pointer. If the
5114   // pointer is the pointer induction of the loop, it is inserted into
5115   // Worklist. If the use will be a scalar use, and the pointer is only used
5116   // by memory accesses, we place the pointer in ScalarPtrs. Otherwise, the
5117   // pointer is placed in PossibleNonScalarPtrs.
5118   auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
5119     if (isScalarPtrInduction(MemAccess, Ptr)) {
5120       Worklist.insert(cast<Instruction>(Ptr));
5121       LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr
5122                         << "\n");
5123
5124       Instruction *Update = cast<Instruction>(
5125           cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch));
5126
5127       // If there is more than one user of Update (Ptr), we shouldn't assume it
5128       // will be scalar after vectorization as other users of the instruction
5129       // may require widening. Otherwise, add it to ScalarPtrs.
5130       if (Update->hasOneUse() && cast<Value>(*Update->user_begin()) == Ptr) {
5131         ScalarPtrs.insert(Update);
5132         return;
5133       }
5134     }
5135     // We only care about bitcast and getelementptr instructions contained in
5136     // the loop.
5137     if (!isLoopVaryingBitCastOrGEP(Ptr))
5138       return;
5139
5140     // If the pointer has already been identified as scalar (e.g., if it was
5141     // also identified as uniform), there's nothing to do.
5142     auto *I = cast<Instruction>(Ptr);
5143     if (Worklist.count(I))
5144       return;
5145
5146     // If the use of the pointer will be a scalar use, and all users of the
5147     // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
5148     // place the pointer in PossibleNonScalarPtrs.
5149     if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
5150           return isa<LoadInst>(U) || isa<StoreInst>(U);
5151         }))
5152       ScalarPtrs.insert(I);
5153     else
5154       PossibleNonScalarPtrs.insert(I);
5155   };
5156
5157   // We seed the scalars analysis with two classes of instructions: (1)
5158   // instructions marked uniform-after-vectorization and (2) bitcast,
5159   // getelementptr and (pointer) phi instructions used by memory accesses
5160   // requiring a scalar use.
5161 // 5162 // (1) Add to the worklist all instructions that have been identified as 5163 // uniform-after-vectorization. 5164 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end()); 5165 5166 // (2) Add to the worklist all bitcast and getelementptr instructions used by 5167 // memory accesses requiring a scalar use. The pointer operands of loads and 5168 // stores will be scalar as long as the memory accesses is not a gather or 5169 // scatter operation. The value operand of a store will remain scalar if the 5170 // store is scalarized. 5171 for (auto *BB : TheLoop->blocks()) 5172 for (auto &I : *BB) { 5173 if (auto *Load = dyn_cast<LoadInst>(&I)) { 5174 evaluatePtrUse(Load, Load->getPointerOperand()); 5175 } else if (auto *Store = dyn_cast<StoreInst>(&I)) { 5176 evaluatePtrUse(Store, Store->getPointerOperand()); 5177 evaluatePtrUse(Store, Store->getValueOperand()); 5178 } 5179 } 5180 for (auto *I : ScalarPtrs) 5181 if (!PossibleNonScalarPtrs.count(I)) { 5182 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n"); 5183 Worklist.insert(I); 5184 } 5185 5186 // Insert the forced scalars. 5187 // FIXME: Currently widenPHIInstruction() often creates a dead vector 5188 // induction variable when the PHI user is scalarized. 5189 auto ForcedScalar = ForcedScalars.find(VF); 5190 if (ForcedScalar != ForcedScalars.end()) 5191 for (auto *I : ForcedScalar->second) 5192 Worklist.insert(I); 5193 5194 // Expand the worklist by looking through any bitcasts and getelementptr 5195 // instructions we've already identified as scalar. This is similar to the 5196 // expansion step in collectLoopUniforms(); however, here we're only 5197 // expanding to include additional bitcasts and getelementptr instructions. 5198 unsigned Idx = 0; 5199 while (Idx != Worklist.size()) { 5200 Instruction *Dst = Worklist[Idx++]; 5201 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0))) 5202 continue; 5203 auto *Src = cast<Instruction>(Dst->getOperand(0)); 5204 if (llvm::all_of(Src->users(), [&](User *U) -> bool { 5205 auto *J = cast<Instruction>(U); 5206 return !TheLoop->contains(J) || Worklist.count(J) || 5207 ((isa<LoadInst>(J) || isa<StoreInst>(J)) && 5208 isScalarUse(J, Src)); 5209 })) { 5210 Worklist.insert(Src); 5211 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); 5212 } 5213 } 5214 5215 // An induction variable will remain scalar if all users of the induction 5216 // variable and induction variable update remain scalar. 5217 for (auto &Induction : Legal->getInductionVars()) { 5218 auto *Ind = Induction.first; 5219 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 5220 5221 // If tail-folding is applied, the primary induction variable will be used 5222 // to feed a vector compare. 5223 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking()) 5224 continue; 5225 5226 // Determine if all users of the induction variable are scalar after 5227 // vectorization. 5228 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 5229 auto *I = cast<Instruction>(U); 5230 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I); 5231 }); 5232 if (!ScalarInd) 5233 continue; 5234 5235 // Determine if all users of the induction variable update instruction are 5236 // scalar after vectorization. 
5237 auto ScalarIndUpdate = 5238 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 5239 auto *I = cast<Instruction>(U); 5240 return I == Ind || !TheLoop->contains(I) || Worklist.count(I); 5241 }); 5242 if (!ScalarIndUpdate) 5243 continue; 5244 5245 // The induction variable and its update instruction will remain scalar. 5246 Worklist.insert(Ind); 5247 Worklist.insert(IndUpdate); 5248 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 5249 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 5250 << "\n"); 5251 } 5252 5253 Scalars[VF].insert(Worklist.begin(), Worklist.end()); 5254 } 5255 5256 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I) const { 5257 if (!blockNeedsPredication(I->getParent())) 5258 return false; 5259 switch(I->getOpcode()) { 5260 default: 5261 break; 5262 case Instruction::Load: 5263 case Instruction::Store: { 5264 if (!Legal->isMaskRequired(I)) 5265 return false; 5266 auto *Ptr = getLoadStorePointerOperand(I); 5267 auto *Ty = getLoadStoreType(I); 5268 const Align Alignment = getLoadStoreAlignment(I); 5269 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) || 5270 TTI.isLegalMaskedGather(Ty, Alignment)) 5271 : !(isLegalMaskedStore(Ty, Ptr, Alignment) || 5272 TTI.isLegalMaskedScatter(Ty, Alignment)); 5273 } 5274 case Instruction::UDiv: 5275 case Instruction::SDiv: 5276 case Instruction::SRem: 5277 case Instruction::URem: 5278 return mayDivideByZero(*I); 5279 } 5280 return false; 5281 } 5282 5283 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( 5284 Instruction *I, ElementCount VF) { 5285 assert(isAccessInterleaved(I) && "Expecting interleaved access."); 5286 assert(getWideningDecision(I, VF) == CM_Unknown && 5287 "Decision should not be set yet."); 5288 auto *Group = getInterleavedAccessGroup(I); 5289 assert(Group && "Must have a group."); 5290 5291 // If the instruction's allocated size doesn't equal it's type size, it 5292 // requires padding and will be scalarized. 5293 auto &DL = I->getModule()->getDataLayout(); 5294 auto *ScalarTy = getLoadStoreType(I); 5295 if (hasIrregularType(ScalarTy, DL)) 5296 return false; 5297 5298 // Check if masking is required. 5299 // A Group may need masking for one of two reasons: it resides in a block that 5300 // needs predication, or it was decided to use masking to deal with gaps 5301 // (either a gap at the end of a load-access that may result in a speculative 5302 // load, or any gaps in a store-access). 5303 bool PredicatedAccessRequiresMasking = 5304 blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I); 5305 bool LoadAccessWithGapsRequiresEpilogMasking = 5306 isa<LoadInst>(I) && Group->requiresScalarEpilogue() && 5307 !isScalarEpilogueAllowed(); 5308 bool StoreAccessWithGapsRequiresMasking = 5309 isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()); 5310 if (!PredicatedAccessRequiresMasking && 5311 !LoadAccessWithGapsRequiresEpilogMasking && 5312 !StoreAccessWithGapsRequiresMasking) 5313 return true; 5314 5315 // If masked interleaving is required, we expect that the user/target had 5316 // enabled it, because otherwise it either wouldn't have been created or 5317 // it should have been invalidated by the CostModel. 
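  // (Illustrative example: a group accessing only members {0, 2} of an
  // interleave factor of 3 has gaps, so a store group needs masking, and a
  // load group may read past the last element unless it is masked or a
  // scalar epilogue is kept.)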
5318   assert(useMaskedInterleavedAccesses(TTI) &&
5319          "Masked interleave-groups for predicated accesses are not enabled.");
5320
5321   if (Group->isReverse())
5322     return false;
5323
5324   auto *Ty = getLoadStoreType(I);
5325   const Align Alignment = getLoadStoreAlignment(I);
5326   return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
5327                           : TTI.isLegalMaskedStore(Ty, Alignment);
5328 }
5329
5330 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
5331     Instruction *I, ElementCount VF) {
5332   // Get and ensure we have a valid memory instruction.
5333   assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
5334
5335   auto *Ptr = getLoadStorePointerOperand(I);
5336   auto *ScalarTy = getLoadStoreType(I);
5337
5338   // In order to be widened, the pointer should be consecutive, first of all.
5339   if (!Legal->isConsecutivePtr(ScalarTy, Ptr))
5340     return false;
5341
5342   // If the instruction is a store located in a predicated block, it will be
5343   // scalarized.
5344   if (isScalarWithPredication(I))
5345     return false;
5346
5347   // If the instruction's allocated size doesn't equal its type size, it
5348   // requires padding and will be scalarized.
5349   auto &DL = I->getModule()->getDataLayout();
5350   if (hasIrregularType(ScalarTy, DL))
5351     return false;
5352
5353   return true;
5354 }
5355
5356 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
5357   // We should not collect Uniforms more than once per VF. Right now,
5358   // this function is called from collectUniformsAndScalars(), which
5359   // already does this check. Collecting Uniforms for VF=1 does not make any
5360   // sense.
5361
5362   assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() &&
5363          "This function should not be visited twice for the same VF");
5364
5365   // Create an entry for this VF up front. Even if we find no uniform values,
5366   // we will not analyze this VF again; Uniforms.count(VF) will return 1.
5367   Uniforms[VF].clear();
5368
5369   // We now know that the loop is vectorizable!
5370   // Collect instructions inside the loop that will remain uniform after
5371   // vectorization.
5372
5373   // Global values, params and instructions outside of the current loop are
5374   // out of scope.
5375   auto isOutOfScope = [&](Value *V) -> bool {
5376     Instruction *I = dyn_cast<Instruction>(V);
5377     return (!I || !TheLoop->contains(I));
5378   };
5379
5380   // Worklist containing uniform instructions demanding lane 0.
5381   SetVector<Instruction *> Worklist;
5382   BasicBlock *Latch = TheLoop->getLoopLatch();
5383
5384   // Add uniform instructions demanding lane 0 to the worklist. Instructions
5385   // that are scalar with predication must not be considered uniform after
5386   // vectorization, because that would create an erroneous replicating region
5387   // where only a single instance out of VF should be formed.
5388   // TODO: optimize such seldom cases if found important, see PR40816.
5389   auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
5390     if (isOutOfScope(I)) {
5391       LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
5392                         << *I << "\n");
5393       return;
5394     }
5395     if (isScalarWithPredication(I)) {
5396       LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
5397                         << *I << "\n");
5398       return;
5399     }
5400     LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
5401     Worklist.insert(I);
5402   };
5403
5404   // Start with the conditional branch. If the branch condition is an
5405   // instruction contained in the loop that is only used by the branch, it is
5406   // uniform.
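  // (Typically this is the latch exit compare, e.g. "icmp eq i64 %iv.next,
  // %n", whose only user is the branch.)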
5407 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 5408 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) 5409 addToWorklistIfAllowed(Cmp); 5410 5411 auto isUniformDecision = [&](Instruction *I, ElementCount VF) { 5412 InstWidening WideningDecision = getWideningDecision(I, VF); 5413 assert(WideningDecision != CM_Unknown && 5414 "Widening decision should be ready at this moment"); 5415 5416 // A uniform memory op is itself uniform. We exclude uniform stores 5417 // here as they demand the last lane, not the first one. 5418 if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) { 5419 assert(WideningDecision == CM_Scalarize); 5420 return true; 5421 } 5422 5423 return (WideningDecision == CM_Widen || 5424 WideningDecision == CM_Widen_Reverse || 5425 WideningDecision == CM_Interleave); 5426 }; 5427 5428 5429 // Returns true if Ptr is the pointer operand of a memory access instruction 5430 // I, and I is known to not require scalarization. 5431 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { 5432 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); 5433 }; 5434 5435 // Holds a list of values which are known to have at least one uniform use. 5436 // Note that there may be other uses which aren't uniform. A "uniform use" 5437 // here is something which only demands lane 0 of the unrolled iterations; 5438 // it does not imply that all lanes produce the same value (e.g. this is not 5439 // the usual meaning of uniform) 5440 SetVector<Value *> HasUniformUse; 5441 5442 // Scan the loop for instructions which are either a) known to have only 5443 // lane 0 demanded or b) are uses which demand only lane 0 of their operand. 5444 for (auto *BB : TheLoop->blocks()) 5445 for (auto &I : *BB) { 5446 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) { 5447 switch (II->getIntrinsicID()) { 5448 case Intrinsic::sideeffect: 5449 case Intrinsic::experimental_noalias_scope_decl: 5450 case Intrinsic::assume: 5451 case Intrinsic::lifetime_start: 5452 case Intrinsic::lifetime_end: 5453 if (TheLoop->hasLoopInvariantOperands(&I)) 5454 addToWorklistIfAllowed(&I); 5455 break; 5456 default: 5457 break; 5458 } 5459 } 5460 5461 // ExtractValue instructions must be uniform, because the operands are 5462 // known to be loop-invariant. 5463 if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) { 5464 assert(isOutOfScope(EVI->getAggregateOperand()) && 5465 "Expected aggregate value to be loop invariant"); 5466 addToWorklistIfAllowed(EVI); 5467 continue; 5468 } 5469 5470 // If there's no pointer operand, there's nothing to do. 5471 auto *Ptr = getLoadStorePointerOperand(&I); 5472 if (!Ptr) 5473 continue; 5474 5475 // A uniform memory op is itself uniform. We exclude uniform stores 5476 // here as they demand the last lane, not the first one. 5477 if (isa<LoadInst>(I) && Legal->isUniformMemOp(I)) 5478 addToWorklistIfAllowed(&I); 5479 5480 if (isUniformDecision(&I, VF)) { 5481 assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check"); 5482 HasUniformUse.insert(Ptr); 5483 } 5484 } 5485 5486 // Add to the worklist any operands which have *only* uniform (e.g. lane 0 5487 // demanding) users. Since loops are assumed to be in LCSSA form, this 5488 // disallows uses outside the loop as well. 
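  // For example, a getelementptr computed inside the loop whose only users
  // are the address operands of consecutively widened (or interleaved) loads
  // and stores only ever needs its lane-0 value; the loop below promotes such
  // pointers to the uniform worklist once all of their users are known to be
  // vectorized memory accesses.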
5489 for (auto *V : HasUniformUse) { 5490 if (isOutOfScope(V)) 5491 continue; 5492 auto *I = cast<Instruction>(V); 5493 auto UsersAreMemAccesses = 5494 llvm::all_of(I->users(), [&](User *U) -> bool { 5495 return isVectorizedMemAccessUse(cast<Instruction>(U), V); 5496 }); 5497 if (UsersAreMemAccesses) 5498 addToWorklistIfAllowed(I); 5499 } 5500 5501 // Expand Worklist in topological order: whenever a new instruction 5502 // is added , its users should be already inside Worklist. It ensures 5503 // a uniform instruction will only be used by uniform instructions. 5504 unsigned idx = 0; 5505 while (idx != Worklist.size()) { 5506 Instruction *I = Worklist[idx++]; 5507 5508 for (auto OV : I->operand_values()) { 5509 // isOutOfScope operands cannot be uniform instructions. 5510 if (isOutOfScope(OV)) 5511 continue; 5512 // First order recurrence Phi's should typically be considered 5513 // non-uniform. 5514 auto *OP = dyn_cast<PHINode>(OV); 5515 if (OP && Legal->isFirstOrderRecurrence(OP)) 5516 continue; 5517 // If all the users of the operand are uniform, then add the 5518 // operand into the uniform worklist. 5519 auto *OI = cast<Instruction>(OV); 5520 if (llvm::all_of(OI->users(), [&](User *U) -> bool { 5521 auto *J = cast<Instruction>(U); 5522 return Worklist.count(J) || isVectorizedMemAccessUse(J, OI); 5523 })) 5524 addToWorklistIfAllowed(OI); 5525 } 5526 } 5527 5528 // For an instruction to be added into Worklist above, all its users inside 5529 // the loop should also be in Worklist. However, this condition cannot be 5530 // true for phi nodes that form a cyclic dependence. We must process phi 5531 // nodes separately. An induction variable will remain uniform if all users 5532 // of the induction variable and induction variable update remain uniform. 5533 // The code below handles both pointer and non-pointer induction variables. 5534 for (auto &Induction : Legal->getInductionVars()) { 5535 auto *Ind = Induction.first; 5536 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 5537 5538 // Determine if all users of the induction variable are uniform after 5539 // vectorization. 5540 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 5541 auto *I = cast<Instruction>(U); 5542 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 5543 isVectorizedMemAccessUse(I, Ind); 5544 }); 5545 if (!UniformInd) 5546 continue; 5547 5548 // Determine if all users of the induction variable update instruction are 5549 // uniform after vectorization. 5550 auto UniformIndUpdate = 5551 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 5552 auto *I = cast<Instruction>(U); 5553 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 5554 isVectorizedMemAccessUse(I, IndUpdate); 5555 }); 5556 if (!UniformIndUpdate) 5557 continue; 5558 5559 // The induction variable and its update instruction will remain uniform. 5560 addToWorklistIfAllowed(Ind); 5561 addToWorklistIfAllowed(IndUpdate); 5562 } 5563 5564 Uniforms[VF].insert(Worklist.begin(), Worklist.end()); 5565 } 5566 5567 bool LoopVectorizationCostModel::runtimeChecksRequired() { 5568 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); 5569 5570 if (Legal->getRuntimePointerChecking()->Need) { 5571 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz", 5572 "runtime pointer checks needed. 
Enable vectorization of this " 5573 "loop with '#pragma clang loop vectorize(enable)' when " 5574 "compiling with -Os/-Oz", 5575 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5576 return true; 5577 } 5578 5579 if (!PSE.getUnionPredicate().getPredicates().empty()) { 5580 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz", 5581 "runtime SCEV checks needed. Enable vectorization of this " 5582 "loop with '#pragma clang loop vectorize(enable)' when " 5583 "compiling with -Os/-Oz", 5584 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5585 return true; 5586 } 5587 5588 // FIXME: Avoid specializing for stride==1 instead of bailing out. 5589 if (!Legal->getLAI()->getSymbolicStrides().empty()) { 5590 reportVectorizationFailure("Runtime stride check for small trip count", 5591 "runtime stride == 1 checks needed. Enable vectorization of " 5592 "this loop without such check by compiling with -Os/-Oz", 5593 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5594 return true; 5595 } 5596 5597 return false; 5598 } 5599 5600 ElementCount 5601 LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) { 5602 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) 5603 return ElementCount::getScalable(0); 5604 5605 if (Hints->isScalableVectorizationDisabled()) { 5606 reportVectorizationInfo("Scalable vectorization is explicitly disabled", 5607 "ScalableVectorizationDisabled", ORE, TheLoop); 5608 return ElementCount::getScalable(0); 5609 } 5610 5611 LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n"); 5612 5613 auto MaxScalableVF = ElementCount::getScalable( 5614 std::numeric_limits<ElementCount::ScalarTy>::max()); 5615 5616 // Test that the loop-vectorizer can legalize all operations for this MaxVF. 5617 // FIXME: While for scalable vectors this is currently sufficient, this should 5618 // be replaced by a more detailed mechanism that filters out specific VFs, 5619 // instead of invalidating vectorization for a whole set of VFs based on the 5620 // MaxVF. 5621 5622 // Disable scalable vectorization if the loop contains unsupported reductions. 5623 if (!canVectorizeReductions(MaxScalableVF)) { 5624 reportVectorizationInfo( 5625 "Scalable vectorization not supported for the reduction " 5626 "operations found in this loop.", 5627 "ScalableVFUnfeasible", ORE, TheLoop); 5628 return ElementCount::getScalable(0); 5629 } 5630 5631 // Disable scalable vectorization if the loop contains any instructions 5632 // with element types not supported for scalable vectors. 5633 if (any_of(ElementTypesInLoop, [&](Type *Ty) { 5634 return !Ty->isVoidTy() && 5635 !this->TTI.isElementTypeLegalForScalableVector(Ty); 5636 })) { 5637 reportVectorizationInfo("Scalable vectorization is not supported " 5638 "for all element types found in this loop.", 5639 "ScalableVFUnfeasible", ORE, TheLoop); 5640 return ElementCount::getScalable(0); 5641 } 5642 5643 if (Legal->isSafeForAnyVectorWidth()) 5644 return MaxScalableVF; 5645 5646 // Limit MaxScalableVF by the maximum safe dependence distance. 5647 Optional<unsigned> MaxVScale = TTI.getMaxVScale(); 5648 if (!MaxVScale && TheFunction->hasFnAttribute(Attribute::VScaleRange)) { 5649 unsigned VScaleMax = TheFunction->getFnAttribute(Attribute::VScaleRange) 5650 .getVScaleRangeArgs() 5651 .second; 5652 if (VScaleMax > 0) 5653 MaxVScale = VScaleMax; 5654 } 5655 MaxScalableVF = ElementCount::getScalable( 5656 MaxVScale ? 
(MaxSafeElements / MaxVScale.getValue()) : 0); 5657 if (!MaxScalableVF) 5658 reportVectorizationInfo( 5659 "Max legal vector width too small, scalable vectorization " 5660 "unfeasible.", 5661 "ScalableVFUnfeasible", ORE, TheLoop); 5662 5663 return MaxScalableVF; 5664 } 5665 5666 FixedScalableVFPair 5667 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount, 5668 ElementCount UserVF) { 5669 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); 5670 unsigned SmallestType, WidestType; 5671 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); 5672 5673 // Get the maximum safe dependence distance in bits computed by LAA. 5674 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from 5675 // the memory accesses that is most restrictive (involved in the smallest 5676 // dependence distance). 5677 unsigned MaxSafeElements = 5678 PowerOf2Floor(Legal->getMaxSafeVectorWidthInBits() / WidestType); 5679 5680 auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements); 5681 auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements); 5682 5683 LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF 5684 << ".\n"); 5685 LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF 5686 << ".\n"); 5687 5688 // First analyze the UserVF, fall back if the UserVF should be ignored. 5689 if (UserVF) { 5690 auto MaxSafeUserVF = 5691 UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF; 5692 5693 if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) { 5694 // If `VF=vscale x N` is safe, then so is `VF=N` 5695 if (UserVF.isScalable()) 5696 return FixedScalableVFPair( 5697 ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF); 5698 else 5699 return UserVF; 5700 } 5701 5702 assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF)); 5703 5704 // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it 5705 // is better to ignore the hint and let the compiler choose a suitable VF. 5706 if (!UserVF.isScalable()) { 5707 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 5708 << " is unsafe, clamping to max safe VF=" 5709 << MaxSafeFixedVF << ".\n"); 5710 ORE->emit([&]() { 5711 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 5712 TheLoop->getStartLoc(), 5713 TheLoop->getHeader()) 5714 << "User-specified vectorization factor " 5715 << ore::NV("UserVectorizationFactor", UserVF) 5716 << " is unsafe, clamping to maximum safe vectorization factor " 5717 << ore::NV("VectorizationFactor", MaxSafeFixedVF); 5718 }); 5719 return MaxSafeFixedVF; 5720 } 5721 5722 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) { 5723 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 5724 << " is ignored because scalable vectors are not " 5725 "available.\n"); 5726 ORE->emit([&]() { 5727 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 5728 TheLoop->getStartLoc(), 5729 TheLoop->getHeader()) 5730 << "User-specified vectorization factor " 5731 << ore::NV("UserVectorizationFactor", UserVF) 5732 << " is ignored because the target does not support scalable " 5733 "vectors. The compiler will pick a more suitable value."; 5734 }); 5735 } else { 5736 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 5737 << " is unsafe. 
Ignoring scalable UserVF.\n"); 5738 ORE->emit([&]() { 5739 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 5740 TheLoop->getStartLoc(), 5741 TheLoop->getHeader()) 5742 << "User-specified vectorization factor " 5743 << ore::NV("UserVectorizationFactor", UserVF) 5744 << " is unsafe. Ignoring the hint to let the compiler pick a " 5745 "more suitable value."; 5746 }); 5747 } 5748 } 5749 5750 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType 5751 << " / " << WidestType << " bits.\n"); 5752 5753 FixedScalableVFPair Result(ElementCount::getFixed(1), 5754 ElementCount::getScalable(0)); 5755 if (auto MaxVF = getMaximizedVFForTarget(ConstTripCount, SmallestType, 5756 WidestType, MaxSafeFixedVF)) 5757 Result.FixedVF = MaxVF; 5758 5759 if (auto MaxVF = getMaximizedVFForTarget(ConstTripCount, SmallestType, 5760 WidestType, MaxSafeScalableVF)) 5761 if (MaxVF.isScalable()) { 5762 Result.ScalableVF = MaxVF; 5763 LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF 5764 << "\n"); 5765 } 5766 5767 return Result; 5768 } 5769 5770 FixedScalableVFPair 5771 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { 5772 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { 5773 // TODO: It may by useful to do since it's still likely to be dynamically 5774 // uniform if the target can skip. 5775 reportVectorizationFailure( 5776 "Not inserting runtime ptr check for divergent target", 5777 "runtime pointer checks needed. Not enabled for divergent target", 5778 "CantVersionLoopWithDivergentTarget", ORE, TheLoop); 5779 return FixedScalableVFPair::getNone(); 5780 } 5781 5782 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); 5783 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); 5784 if (TC == 1) { 5785 reportVectorizationFailure("Single iteration (non) loop", 5786 "loop trip count is one, irrelevant for vectorization", 5787 "SingleIterationLoop", ORE, TheLoop); 5788 return FixedScalableVFPair::getNone(); 5789 } 5790 5791 switch (ScalarEpilogueStatus) { 5792 case CM_ScalarEpilogueAllowed: 5793 return computeFeasibleMaxVF(TC, UserVF); 5794 case CM_ScalarEpilogueNotAllowedUsePredicate: 5795 LLVM_FALLTHROUGH; 5796 case CM_ScalarEpilogueNotNeededUsePredicate: 5797 LLVM_DEBUG( 5798 dbgs() << "LV: vector predicate hint/switch found.\n" 5799 << "LV: Not allowing scalar epilogue, creating predicated " 5800 << "vector loop.\n"); 5801 break; 5802 case CM_ScalarEpilogueNotAllowedLowTripLoop: 5803 // fallthrough as a special case of OptForSize 5804 case CM_ScalarEpilogueNotAllowedOptSize: 5805 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize) 5806 LLVM_DEBUG( 5807 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n"); 5808 else 5809 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip " 5810 << "count.\n"); 5811 5812 // Bail if runtime checks are required, which are not good when optimising 5813 // for size. 5814 if (runtimeChecksRequired()) 5815 return FixedScalableVFPair::getNone(); 5816 5817 break; 5818 } 5819 5820 // The only loops we can vectorize without a scalar epilogue, are loops with 5821 // a bottom-test and a single exiting block. We'd have to handle the fact 5822 // that not every instruction executes on the last iteration. This will 5823 // require a lane mask which varies through the vector loop body. 
(TODO)
  if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
    // If there was a tail-folding hint/switch, but we can't fold the tail by
    // masking, fall back to a vectorization with a scalar epilogue.
    if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
      LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
                           "scalar epilogue instead.\n");
      ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
      return computeFeasibleMaxVF(TC, UserVF);
    }
    return FixedScalableVFPair::getNone();
  }

  // Now try the tail folding.

  // Invalidate interleave groups that require an epilogue if we can't mask
  // the interleave-group.
  if (!useMaskedInterleavedAccesses(TTI)) {
    assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
           "No decisions should have been taken at this point");
    // Note: There is no need to invalidate any cost modeling decisions here,
    // as none were taken so far.
    InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
  }

  FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF);
  // Avoid tail folding if the trip count is known to be a multiple of any VF
  // we chose.
  // FIXME: The condition below pessimises the case for fixed-width vectors,
  // when scalable VFs are also candidates for vectorization.
  if (MaxFactors.FixedVF.isVector() && !MaxFactors.ScalableVF) {
    ElementCount MaxFixedVF = MaxFactors.FixedVF;
    assert((UserVF.isNonZero() || isPowerOf2_32(MaxFixedVF.getFixedValue())) &&
           "MaxFixedVF must be a power of 2");
    unsigned MaxVFtimesIC = UserIC ? MaxFixedVF.getFixedValue() * UserIC
                                   : MaxFixedVF.getFixedValue();
    ScalarEvolution *SE = PSE.getSE();
    const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
    const SCEV *ExitCount = SE->getAddExpr(
        BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
    const SCEV *Rem = SE->getURemExpr(
        SE->applyLoopGuards(ExitCount, TheLoop),
        SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
    if (Rem->isZero()) {
      // Accept MaxFixedVF if we do not have a tail.
      LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
      return MaxFactors;
    }
  }

  // For scalable vectors, don't use tail folding as this is currently not yet
  // supported. The code is likely to have ended up here if the tripcount is
  // low, in which case it makes sense not to use scalable vectors.
  if (MaxFactors.ScalableVF.isVector())
    MaxFactors.ScalableVF = ElementCount::getScalable(0);

  // If we don't know the precise trip count, or if the trip count that we
  // found modulo the vectorization factor is not zero, try to fold the tail
  // by masking.
  // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
  if (Legal->prepareToFoldTailByMasking()) {
    FoldTailByMasking = true;
    return MaxFactors;
  }

  // If there was a tail-folding hint/switch, but we can't fold the tail by
  // masking, fall back to a vectorization with a scalar epilogue.
5890 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { 5891 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " 5892 "scalar epilogue instead.\n"); 5893 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 5894 return MaxFactors; 5895 } 5896 5897 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) { 5898 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n"); 5899 return FixedScalableVFPair::getNone(); 5900 } 5901 5902 if (TC == 0) { 5903 reportVectorizationFailure( 5904 "Unable to calculate the loop count due to complex control flow", 5905 "unable to calculate the loop count due to complex control flow", 5906 "UnknownLoopCountComplexCFG", ORE, TheLoop); 5907 return FixedScalableVFPair::getNone(); 5908 } 5909 5910 reportVectorizationFailure( 5911 "Cannot optimize for size and vectorize at the same time.", 5912 "cannot optimize for size and vectorize at the same time. " 5913 "Enable vectorization of this loop with '#pragma clang loop " 5914 "vectorize(enable)' when compiling with -Os/-Oz", 5915 "NoTailLoopWithOptForSize", ORE, TheLoop); 5916 return FixedScalableVFPair::getNone(); 5917 } 5918 5919 ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( 5920 unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType, 5921 const ElementCount &MaxSafeVF) { 5922 bool ComputeScalableMaxVF = MaxSafeVF.isScalable(); 5923 TypeSize WidestRegister = TTI.getRegisterBitWidth( 5924 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector 5925 : TargetTransformInfo::RGK_FixedWidthVector); 5926 5927 // Convenience function to return the minimum of two ElementCounts. 5928 auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) { 5929 assert((LHS.isScalable() == RHS.isScalable()) && 5930 "Scalable flags must match"); 5931 return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS; 5932 }; 5933 5934 // Ensure MaxVF is a power of 2; the dependence distance bound may not be. 5935 // Note that both WidestRegister and WidestType may not be a powers of 2. 5936 auto MaxVectorElementCount = ElementCount::get( 5937 PowerOf2Floor(WidestRegister.getKnownMinSize() / WidestType), 5938 ComputeScalableMaxVF); 5939 MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF); 5940 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " 5941 << (MaxVectorElementCount * WidestType) << " bits.\n"); 5942 5943 if (!MaxVectorElementCount) { 5944 LLVM_DEBUG(dbgs() << "LV: The target has no " 5945 << (ComputeScalableMaxVF ? "scalable" : "fixed") 5946 << " vector registers.\n"); 5947 return ElementCount::getFixed(1); 5948 } 5949 5950 const auto TripCountEC = ElementCount::getFixed(ConstTripCount); 5951 if (ConstTripCount && 5952 ElementCount::isKnownLE(TripCountEC, MaxVectorElementCount) && 5953 isPowerOf2_32(ConstTripCount)) { 5954 // We need to clamp the VF to be the ConstTripCount. There is no point in 5955 // choosing a higher viable VF as done in the loop below. If 5956 // MaxVectorElementCount is scalable, we only fall back on a fixed VF when 5957 // the TC is less than or equal to the known number of lanes. 
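    // For example (illustrative numbers), with a constant trip count of 8 and
    // a target whose widest register holds 16 elements of the widest type,
    // the VF is clamped from 16 down to 8: the vector loop then executes a
    // single iteration and leaves no scalar remainder.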
5958 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: " 5959 << ConstTripCount << "\n"); 5960 return TripCountEC; 5961 } 5962 5963 ElementCount MaxVF = MaxVectorElementCount; 5964 if (TTI.shouldMaximizeVectorBandwidth() || 5965 (MaximizeBandwidth && isScalarEpilogueAllowed())) { 5966 auto MaxVectorElementCountMaxBW = ElementCount::get( 5967 PowerOf2Floor(WidestRegister.getKnownMinSize() / SmallestType), 5968 ComputeScalableMaxVF); 5969 MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF); 5970 5971 // Collect all viable vectorization factors larger than the default MaxVF 5972 // (i.e. MaxVectorElementCount). 5973 SmallVector<ElementCount, 8> VFs; 5974 for (ElementCount VS = MaxVectorElementCount * 2; 5975 ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2) 5976 VFs.push_back(VS); 5977 5978 // For each VF calculate its register usage. 5979 auto RUs = calculateRegisterUsage(VFs); 5980 5981 // Select the largest VF which doesn't require more registers than existing 5982 // ones. 5983 for (int i = RUs.size() - 1; i >= 0; --i) { 5984 bool Selected = true; 5985 for (auto &pair : RUs[i].MaxLocalUsers) { 5986 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5987 if (pair.second > TargetNumRegisters) 5988 Selected = false; 5989 } 5990 if (Selected) { 5991 MaxVF = VFs[i]; 5992 break; 5993 } 5994 } 5995 if (ElementCount MinVF = 5996 TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) { 5997 if (ElementCount::isKnownLT(MaxVF, MinVF)) { 5998 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF 5999 << ") with target's minimum: " << MinVF << '\n'); 6000 MaxVF = MinVF; 6001 } 6002 } 6003 } 6004 return MaxVF; 6005 } 6006 6007 bool LoopVectorizationCostModel::isMoreProfitable( 6008 const VectorizationFactor &A, const VectorizationFactor &B) const { 6009 InstructionCost CostA = A.Cost; 6010 InstructionCost CostB = B.Cost; 6011 6012 unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop); 6013 6014 if (!A.Width.isScalable() && !B.Width.isScalable() && FoldTailByMasking && 6015 MaxTripCount) { 6016 // If we are folding the tail and the trip count is a known (possibly small) 6017 // constant, the trip count will be rounded up to an integer number of 6018 // iterations. The total cost will be PerIterationCost*ceil(TripCount/VF), 6019 // which we compare directly. When not folding the tail, the total cost will 6020 // be PerIterationCost*floor(TC/VF) + Scalar remainder cost, and so is 6021 // approximated with the per-lane cost below instead of using the tripcount 6022 // as here. 6023 auto RTCostA = CostA * divideCeil(MaxTripCount, A.Width.getFixedValue()); 6024 auto RTCostB = CostB * divideCeil(MaxTripCount, B.Width.getFixedValue()); 6025 return RTCostA < RTCostB; 6026 } 6027 6028 // Improve estimate for the vector width if it is scalable. 6029 unsigned EstimatedWidthA = A.Width.getKnownMinValue(); 6030 unsigned EstimatedWidthB = B.Width.getKnownMinValue(); 6031 if (Optional<unsigned> VScale = TTI.getVScaleForTuning()) { 6032 if (A.Width.isScalable()) 6033 EstimatedWidthA *= VScale.getValue(); 6034 if (B.Width.isScalable()) 6035 EstimatedWidthB *= VScale.getValue(); 6036 } 6037 6038 // When set to preferred, for now assume vscale may be larger than 1 (or the 6039 // one being tuned for), so that scalable vectorization is slightly favorable 6040 // over fixed-width vectorization. 
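  // Illustrative example (assumed numbers): comparing A = vscale x 4 lanes at
  // cost 10 against B = 8 fixed lanes at cost 10 with a tuning vscale of 2
  // gives 10 * 8 <= 10 * 8, so the tie is resolved in favour of the scalable
  // factor by the non-strict comparison below.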
6041 if (Hints->isScalableVectorizationPreferred()) 6042 if (A.Width.isScalable() && !B.Width.isScalable()) 6043 return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA); 6044 6045 // To avoid the need for FP division: 6046 // (CostA / A.Width) < (CostB / B.Width) 6047 // <=> (CostA * B.Width) < (CostB * A.Width) 6048 return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA); 6049 } 6050 6051 VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor( 6052 const ElementCountSet &VFCandidates) { 6053 InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first; 6054 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n"); 6055 assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop"); 6056 assert(VFCandidates.count(ElementCount::getFixed(1)) && 6057 "Expected Scalar VF to be a candidate"); 6058 6059 const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost); 6060 VectorizationFactor ChosenFactor = ScalarCost; 6061 6062 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; 6063 if (ForceVectorization && VFCandidates.size() > 1) { 6064 // Ignore scalar width, because the user explicitly wants vectorization. 6065 // Initialize cost to max so that VF = 2 is, at least, chosen during cost 6066 // evaluation. 6067 ChosenFactor.Cost = InstructionCost::getMax(); 6068 } 6069 6070 SmallVector<InstructionVFPair> InvalidCosts; 6071 for (const auto &i : VFCandidates) { 6072 // The cost for scalar VF=1 is already calculated, so ignore it. 6073 if (i.isScalar()) 6074 continue; 6075 6076 VectorizationCostTy C = expectedCost(i, &InvalidCosts); 6077 VectorizationFactor Candidate(i, C.first); 6078 6079 #ifndef NDEBUG 6080 unsigned AssumedMinimumVscale = 1; 6081 if (Optional<unsigned> VScale = TTI.getVScaleForTuning()) 6082 AssumedMinimumVscale = VScale.getValue(); 6083 unsigned Width = 6084 Candidate.Width.isScalable() 6085 ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale 6086 : Candidate.Width.getFixedValue(); 6087 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i 6088 << " costs: " << (Candidate.Cost / Width)); 6089 if (i.isScalable()) 6090 LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of " 6091 << AssumedMinimumVscale << ")"); 6092 LLVM_DEBUG(dbgs() << ".\n"); 6093 #endif 6094 6095 if (!C.second && !ForceVectorization) { 6096 LLVM_DEBUG( 6097 dbgs() << "LV: Not considering vector loop of width " << i 6098 << " because it will not generate any vector instructions.\n"); 6099 continue; 6100 } 6101 6102 // If profitable add it to ProfitableVF list. 6103 if (isMoreProfitable(Candidate, ScalarCost)) 6104 ProfitableVFs.push_back(Candidate); 6105 6106 if (isMoreProfitable(Candidate, ChosenFactor)) 6107 ChosenFactor = Candidate; 6108 } 6109 6110 // Emit a report of VFs with invalid costs in the loop. 6111 if (!InvalidCosts.empty()) { 6112 // Group the remarks per instruction, keeping the instruction order from 6113 // InvalidCosts. 6114 std::map<Instruction *, unsigned> Numbering; 6115 unsigned I = 0; 6116 for (auto &Pair : InvalidCosts) 6117 if (!Numbering.count(Pair.first)) 6118 Numbering[Pair.first] = I++; 6119 6120 // Sort the list, first on instruction(number) then on VF. 
6121 llvm::sort(InvalidCosts, 6122 [&Numbering](InstructionVFPair &A, InstructionVFPair &B) { 6123 if (Numbering[A.first] != Numbering[B.first]) 6124 return Numbering[A.first] < Numbering[B.first]; 6125 ElementCountComparator ECC; 6126 return ECC(A.second, B.second); 6127 }); 6128 6129 // For a list of ordered instruction-vf pairs: 6130 // [(load, vf1), (load, vf2), (store, vf1)] 6131 // Group the instructions together to emit separate remarks for: 6132 // load (vf1, vf2) 6133 // store (vf1) 6134 auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts); 6135 auto Subset = ArrayRef<InstructionVFPair>(); 6136 do { 6137 if (Subset.empty()) 6138 Subset = Tail.take_front(1); 6139 6140 Instruction *I = Subset.front().first; 6141 6142 // If the next instruction is different, or if there are no other pairs, 6143 // emit a remark for the collated subset. e.g. 6144 // [(load, vf1), (load, vf2))] 6145 // to emit: 6146 // remark: invalid costs for 'load' at VF=(vf, vf2) 6147 if (Subset == Tail || Tail[Subset.size()].first != I) { 6148 std::string OutString; 6149 raw_string_ostream OS(OutString); 6150 assert(!Subset.empty() && "Unexpected empty range"); 6151 OS << "Instruction with invalid costs prevented vectorization at VF=("; 6152 for (auto &Pair : Subset) 6153 OS << (Pair.second == Subset.front().second ? "" : ", ") 6154 << Pair.second; 6155 OS << "):"; 6156 if (auto *CI = dyn_cast<CallInst>(I)) 6157 OS << " call to " << CI->getCalledFunction()->getName(); 6158 else 6159 OS << " " << I->getOpcodeName(); 6160 OS.flush(); 6161 reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I); 6162 Tail = Tail.drop_front(Subset.size()); 6163 Subset = {}; 6164 } else 6165 // Grow the subset by one element 6166 Subset = Tail.take_front(Subset.size() + 1); 6167 } while (!Tail.empty()); 6168 } 6169 6170 if (!EnableCondStoresVectorization && NumPredStores) { 6171 reportVectorizationFailure("There are conditional stores.", 6172 "store that is conditionally executed prevents vectorization", 6173 "ConditionalStore", ORE, TheLoop); 6174 ChosenFactor = ScalarCost; 6175 } 6176 6177 LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() && 6178 ChosenFactor.Cost >= ScalarCost.Cost) dbgs() 6179 << "LV: Vectorization seems to be not beneficial, " 6180 << "but was forced by a user.\n"); 6181 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n"); 6182 return ChosenFactor; 6183 } 6184 6185 bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization( 6186 const Loop &L, ElementCount VF) const { 6187 // Cross iteration phis such as reductions need special handling and are 6188 // currently unsupported. 6189 if (any_of(L.getHeader()->phis(), [&](PHINode &Phi) { 6190 return Legal->isFirstOrderRecurrence(&Phi) || 6191 Legal->isReductionVariable(&Phi); 6192 })) 6193 return false; 6194 6195 // Phis with uses outside of the loop require special handling and are 6196 // currently unsupported. 6197 for (auto &Entry : Legal->getInductionVars()) { 6198 // Look for uses of the value of the induction at the last iteration. 6199 Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch()); 6200 for (User *U : PostInc->users()) 6201 if (!L.contains(cast<Instruction>(U))) 6202 return false; 6203 // Look for uses of penultimate value of the induction. 6204 for (User *U : Entry.first->users()) 6205 if (!L.contains(cast<Instruction>(U))) 6206 return false; 6207 } 6208 6209 // Induction variables that are widened require special handling that is 6210 // currently not supported. 
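  // For example (a sketch, not an exhaustive rule): an induction value that is
  // itself stored to memory on every iteration generally has to be widened
  // into a vector and is rejected here, whereas an induction that is only used
  // to form addresses of consecutive accesses remains scalar and is accepted.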
  if (any_of(Legal->getInductionVars(), [&](auto &Entry) {
        return !(this->isScalarAfterVectorization(Entry.first, VF) ||
                 this->isProfitableToScalarize(Entry.first, VF));
      }))
    return false;

  // Epilogue vectorization code has not been audited to ensure it handles
  // non-latch exits properly. It may be fine, but it needs to be audited and
  // tested.
  if (L.getExitingBlock() != L.getLoopLatch())
    return false;

  return true;
}

bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
    const ElementCount VF) const {
  // FIXME: We need a much better cost-model to take different parameters such
  // as register pressure, code size increase and cost of extra branches into
  // account. For now we apply a very crude heuristic and only consider loops
  // with vectorization factors larger than a certain value.
  // We also consider epilogue vectorization unprofitable for targets that
  // don't consider interleaving beneficial (e.g. MVE).
  if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1)
    return false;
  if (VF.getFixedValue() >= EpilogueVectorizationMinVF)
    return true;
  return false;
}

VectorizationFactor
LoopVectorizationCostModel::selectEpilogueVectorizationFactor(
    const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) {
  VectorizationFactor Result = VectorizationFactor::Disabled();
  if (!EnableEpilogueVectorization) {
    LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";);
    return Result;
  }

  if (!isScalarEpilogueAllowed()) {
    LLVM_DEBUG(
        dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is "
                  "allowed.\n";);
    return Result;
  }

  // Not really a cost consideration, but check for unsupported cases here to
  // simplify the logic.
  if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) {
    LLVM_DEBUG(
        dbgs() << "LEV: Unable to vectorize epilogue because the loop is "
                  "not a supported candidate.\n";);
    return Result;
  }

  if (EpilogueVectorizationForceVF > 1) {
    LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";);
    ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF);
    if (LVP.hasPlanWithVF(ForcedEC))
      return {ForcedEC, 0};
    else {
      LLVM_DEBUG(
          dbgs()
          << "LEV: Epilogue vectorization forced factor is not viable.\n";);
      return Result;
    }
  }

  if (TheLoop->getHeader()->getParent()->hasOptSize() ||
      TheLoop->getHeader()->getParent()->hasMinSize()) {
    LLVM_DEBUG(
        dbgs()
        << "LEV: Epilogue vectorization skipped due to opt for size.\n";);
    return Result;
  }

  auto FixedMainLoopVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue());
  if (MainLoopVF.isScalable())
    LLVM_DEBUG(
        dbgs() << "LEV: Epilogue vectorization using scalable vectors not "
                  "yet supported.
Converting to fixed-width (VF=" 6292 << FixedMainLoopVF << ") instead\n"); 6293 6294 if (!isEpilogueVectorizationProfitable(FixedMainLoopVF)) { 6295 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for " 6296 "this loop\n"); 6297 return Result; 6298 } 6299 6300 for (auto &NextVF : ProfitableVFs) 6301 if (ElementCount::isKnownLT(NextVF.Width, FixedMainLoopVF) && 6302 (Result.Width.getFixedValue() == 1 || 6303 isMoreProfitable(NextVF, Result)) && 6304 LVP.hasPlanWithVF(NextVF.Width)) 6305 Result = NextVF; 6306 6307 if (Result != VectorizationFactor::Disabled()) 6308 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = " 6309 << Result.Width.getFixedValue() << "\n";); 6310 return Result; 6311 } 6312 6313 std::pair<unsigned, unsigned> 6314 LoopVectorizationCostModel::getSmallestAndWidestTypes() { 6315 unsigned MinWidth = -1U; 6316 unsigned MaxWidth = 8; 6317 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 6318 for (Type *T : ElementTypesInLoop) { 6319 MinWidth = std::min<unsigned>( 6320 MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize()); 6321 MaxWidth = std::max<unsigned>( 6322 MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize()); 6323 } 6324 return {MinWidth, MaxWidth}; 6325 } 6326 6327 void LoopVectorizationCostModel::collectElementTypesForWidening() { 6328 ElementTypesInLoop.clear(); 6329 // For each block. 6330 for (BasicBlock *BB : TheLoop->blocks()) { 6331 // For each instruction in the loop. 6332 for (Instruction &I : BB->instructionsWithoutDebug()) { 6333 Type *T = I.getType(); 6334 6335 // Skip ignored values. 6336 if (ValuesToIgnore.count(&I)) 6337 continue; 6338 6339 // Only examine Loads, Stores and PHINodes. 6340 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I)) 6341 continue; 6342 6343 // Examine PHI nodes that are reduction variables. Update the type to 6344 // account for the recurrence type. 6345 if (auto *PN = dyn_cast<PHINode>(&I)) { 6346 if (!Legal->isReductionVariable(PN)) 6347 continue; 6348 const RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[PN]; 6349 if (PreferInLoopReductions || useOrderedReductions(RdxDesc) || 6350 TTI.preferInLoopReduction(RdxDesc.getOpcode(), 6351 RdxDesc.getRecurrenceType(), 6352 TargetTransformInfo::ReductionFlags())) 6353 continue; 6354 T = RdxDesc.getRecurrenceType(); 6355 } 6356 6357 // Examine the stored values. 6358 if (auto *ST = dyn_cast<StoreInst>(&I)) 6359 T = ST->getValueOperand()->getType(); 6360 6361 // Ignore loaded pointer types and stored pointer types that are not 6362 // vectorizable. 6363 // 6364 // FIXME: The check here attempts to predict whether a load or store will 6365 // be vectorized. We only know this for certain after a VF has 6366 // been selected. Here, we assume that if an access can be 6367 // vectorized, it will be. We should also look at extending this 6368 // optimization to non-pointer types. 6369 // 6370 if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) && 6371 !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I)) 6372 continue; 6373 6374 ElementTypesInLoop.insert(T); 6375 } 6376 } 6377 } 6378 6379 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, 6380 unsigned LoopCost) { 6381 // -- The interleave heuristics -- 6382 // We interleave the loop in order to expose ILP and reduce the loop overhead. 6383 // There are many micro-architectural considerations that we can't predict 6384 // at this level. 
For example, frontend pressure (on decode or fetch) due to 6385 // code size, or the number and capabilities of the execution ports. 6386 // 6387 // We use the following heuristics to select the interleave count: 6388 // 1. If the code has reductions, then we interleave to break the cross 6389 // iteration dependency. 6390 // 2. If the loop is really small, then we interleave to reduce the loop 6391 // overhead. 6392 // 3. We don't interleave if we think that we will spill registers to memory 6393 // due to the increased register pressure. 6394 6395 if (!isScalarEpilogueAllowed()) 6396 return 1; 6397 6398 // We used the distance for the interleave count. 6399 if (Legal->getMaxSafeDepDistBytes() != -1U) 6400 return 1; 6401 6402 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); 6403 const bool HasReductions = !Legal->getReductionVars().empty(); 6404 // Do not interleave loops with a relatively small known or estimated trip 6405 // count. But we will interleave when InterleaveSmallLoopScalarReduction is 6406 // enabled, and the code has scalar reductions(HasReductions && VF = 1), 6407 // because with the above conditions interleaving can expose ILP and break 6408 // cross iteration dependences for reductions. 6409 if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) && 6410 !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar())) 6411 return 1; 6412 6413 RegisterUsage R = calculateRegisterUsage({VF})[0]; 6414 // We divide by these constants so assume that we have at least one 6415 // instruction that uses at least one register. 6416 for (auto& pair : R.MaxLocalUsers) { 6417 pair.second = std::max(pair.second, 1U); 6418 } 6419 6420 // We calculate the interleave count using the following formula. 6421 // Subtract the number of loop invariants from the number of available 6422 // registers. These registers are used by all of the interleaved instances. 6423 // Next, divide the remaining registers by the number of registers that is 6424 // required by the loop, in order to estimate how many parallel instances 6425 // fit without causing spills. All of this is rounded down if necessary to be 6426 // a power of two. We want power of two interleave count to simplify any 6427 // addressing operations or alignment considerations. 6428 // We also want power of two interleave counts to ensure that the induction 6429 // variable of the vector loop wraps to zero, when tail is folded by masking; 6430 // this currently happens when OptForSize, in which case IC is set to 1 above. 6431 unsigned IC = UINT_MAX; 6432 6433 for (auto& pair : R.MaxLocalUsers) { 6434 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 6435 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters 6436 << " registers of " 6437 << TTI.getRegisterClassName(pair.first) << " register class\n"); 6438 if (VF.isScalar()) { 6439 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) 6440 TargetNumRegisters = ForceTargetNumScalarRegs; 6441 } else { 6442 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) 6443 TargetNumRegisters = ForceTargetNumVectorRegs; 6444 } 6445 unsigned MaxLocalUsers = pair.second; 6446 unsigned LoopInvariantRegs = 0; 6447 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) 6448 LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; 6449 6450 unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers); 6451 // Don't count the induction variable as interleaved. 
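    // Worked example with illustrative numbers: 32 registers in the class,
    // 2 loop-invariant values and a peak of 5 simultaneously live values give
    // (32 - 2 - 1) / (5 - 1) = 7, which PowerOf2Floor rounds down to an
    // interleave count of 4.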
6452 if (EnableIndVarRegisterHeur) { 6453 TmpIC = 6454 PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) / 6455 std::max(1U, (MaxLocalUsers - 1))); 6456 } 6457 6458 IC = std::min(IC, TmpIC); 6459 } 6460 6461 // Clamp the interleave ranges to reasonable counts. 6462 unsigned MaxInterleaveCount = 6463 TTI.getMaxInterleaveFactor(VF.getKnownMinValue()); 6464 6465 // Check if the user has overridden the max. 6466 if (VF.isScalar()) { 6467 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) 6468 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; 6469 } else { 6470 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) 6471 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; 6472 } 6473 6474 // If trip count is known or estimated compile time constant, limit the 6475 // interleave count to be less than the trip count divided by VF, provided it 6476 // is at least 1. 6477 // 6478 // For scalable vectors we can't know if interleaving is beneficial. It may 6479 // not be beneficial for small loops if none of the lanes in the second vector 6480 // iterations is enabled. However, for larger loops, there is likely to be a 6481 // similar benefit as for fixed-width vectors. For now, we choose to leave 6482 // the InterleaveCount as if vscale is '1', although if some information about 6483 // the vector is known (e.g. min vector size), we can make a better decision. 6484 if (BestKnownTC) { 6485 MaxInterleaveCount = 6486 std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount); 6487 // Make sure MaxInterleaveCount is greater than 0. 6488 MaxInterleaveCount = std::max(1u, MaxInterleaveCount); 6489 } 6490 6491 assert(MaxInterleaveCount > 0 && 6492 "Maximum interleave count must be greater than 0"); 6493 6494 // Clamp the calculated IC to be between the 1 and the max interleave count 6495 // that the target and trip count allows. 6496 if (IC > MaxInterleaveCount) 6497 IC = MaxInterleaveCount; 6498 else 6499 // Make sure IC is greater than 0. 6500 IC = std::max(1u, IC); 6501 6502 assert(IC > 0 && "Interleave count must be greater than 0."); 6503 6504 // If we did not calculate the cost for VF (because the user selected the VF) 6505 // then we calculate the cost of VF here. 6506 if (LoopCost == 0) { 6507 InstructionCost C = expectedCost(VF).first; 6508 assert(C.isValid() && "Expected to have chosen a VF with valid cost"); 6509 LoopCost = *C.getValue(); 6510 } 6511 6512 assert(LoopCost && "Non-zero loop cost expected"); 6513 6514 // Interleave if we vectorized this loop and there is a reduction that could 6515 // benefit from interleaving. 6516 if (VF.isVector() && HasReductions) { 6517 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); 6518 return IC; 6519 } 6520 6521 // Note that if we've already vectorized the loop we will have done the 6522 // runtime check and so interleaving won't require further checks. 6523 bool InterleavingRequiresRuntimePointerCheck = 6524 (VF.isScalar() && Legal->getRuntimePointerChecking()->Need); 6525 6526 // We want to interleave small loops in order to reduce the loop overhead and 6527 // potentially expose ILP opportunities. 
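  // For example (illustrative numbers): with a small-loop cost threshold of 20
  // and a vectorized body costing 3, the count computed below is clamped to
  // PowerOf2Floor(20 / 3) = 4, i.e. enough copies to keep the per-iteration
  // branch overhead around the 5% target mentioned below.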
6528 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n' 6529 << "LV: IC is " << IC << '\n' 6530 << "LV: VF is " << VF << '\n'); 6531 const bool AggressivelyInterleaveReductions = 6532 TTI.enableAggressiveInterleaving(HasReductions); 6533 if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) { 6534 // We assume that the cost overhead is 1 and we use the cost model 6535 // to estimate the cost of the loop and interleave until the cost of the 6536 // loop overhead is about 5% of the cost of the loop. 6537 unsigned SmallIC = 6538 std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost)); 6539 6540 // Interleave until store/load ports (estimated by max interleave count) are 6541 // saturated. 6542 unsigned NumStores = Legal->getNumStores(); 6543 unsigned NumLoads = Legal->getNumLoads(); 6544 unsigned StoresIC = IC / (NumStores ? NumStores : 1); 6545 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1); 6546 6547 // There is little point in interleaving for reductions containing selects 6548 // and compares when VF=1 since it may just create more overhead than it's 6549 // worth for loops with small trip counts. This is because we still have to 6550 // do the final reduction after the loop. 6551 bool HasSelectCmpReductions = 6552 HasReductions && 6553 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 6554 const RecurrenceDescriptor &RdxDesc = Reduction.second; 6555 return RecurrenceDescriptor::isSelectCmpRecurrenceKind( 6556 RdxDesc.getRecurrenceKind()); 6557 }); 6558 if (HasSelectCmpReductions) { 6559 LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n"); 6560 return 1; 6561 } 6562 6563 // If we have a scalar reduction (vector reductions are already dealt with 6564 // by this point), we can increase the critical path length if the loop 6565 // we're interleaving is inside another loop. For tree-wise reductions 6566 // set the limit to 2, and for ordered reductions it's best to disable 6567 // interleaving entirely. 6568 if (HasReductions && TheLoop->getLoopDepth() > 1) { 6569 bool HasOrderedReductions = 6570 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 6571 const RecurrenceDescriptor &RdxDesc = Reduction.second; 6572 return RdxDesc.isOrdered(); 6573 }); 6574 if (HasOrderedReductions) { 6575 LLVM_DEBUG( 6576 dbgs() << "LV: Not interleaving scalar ordered reductions.\n"); 6577 return 1; 6578 } 6579 6580 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC); 6581 SmallIC = std::min(SmallIC, F); 6582 StoresIC = std::min(StoresIC, F); 6583 LoadsIC = std::min(LoadsIC, F); 6584 } 6585 6586 if (EnableLoadStoreRuntimeInterleave && 6587 std::max(StoresIC, LoadsIC) > SmallIC) { 6588 LLVM_DEBUG( 6589 dbgs() << "LV: Interleaving to saturate store or load ports.\n"); 6590 return std::max(StoresIC, LoadsIC); 6591 } 6592 6593 // If there are scalar reductions and TTI has enabled aggressive 6594 // interleaving for reductions, we will interleave to expose ILP. 6595 if (InterleaveSmallLoopScalarReduction && VF.isScalar() && 6596 AggressivelyInterleaveReductions) { 6597 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 6598 // Interleave no less than SmallIC but not as aggressive as the normal IC 6599 // to satisfy the rare situation when resources are too limited. 
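      // For example, with IC = 8 and SmallIC = 2 this returns max(4, 2) = 4,
      // i.e. half of the full interleave count rather than the small-loop one.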
      return std::max(IC / 2, SmallIC);
    } else {
      LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
      return SmallIC;
    }
  }

  // Interleave if this is a large loop (small loops are already dealt with by
  // this point) that could benefit from interleaving.
  if (AggressivelyInterleaveReductions) {
    LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
    return IC;
  }

  LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
  return 1;
}

SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
  // This function calculates the register usage by measuring the highest
  // number of values that are alive at a single location. Obviously, this is
  // a very rough estimation. We scan the loop in topological order to assign
  // a number to each instruction. We use RPO to ensure that defs are
  // met before their users. We assume that each instruction that has in-loop
  // users starts an interval. We record every time that an in-loop value is
  // used, so we have a list of the first and last occurrences of each
  // instruction. Next, we transpose this data structure into a multi-map that
  // holds the list of intervals that *end* at a specific location. This
  // multi-map allows us to perform a linear search. We scan the instructions
  // linearly and record each time that a new interval starts, by placing it
  // in a set. If we find this value in the multi-map then we remove it from
  // the set. The max register usage is the maximum size of the set.
  // We also search for instructions that are defined outside the loop, but are
  // used inside the loop. We need this number separately from the max-interval
  // usage number because when we unroll, loop-invariant values do not take
  // more registers.
  LoopBlocksDFS DFS(TheLoop);
  DFS.perform(LI);

  RegisterUsage RU;

  // Each 'key' in the map opens a new interval. The values
  // of the map are the index of the 'last seen' usage of the
  // instruction that is the key.
  using IntervalMap = DenseMap<Instruction *, unsigned>;

  // Maps instruction to its index.
  SmallVector<Instruction *, 64> IdxToInstr;
  // Marks the end of each interval.
  IntervalMap EndPoint;
  // Saves the list of instruction indices that are used in the loop.
  SmallPtrSet<Instruction *, 8> Ends;
  // Saves the list of values that are used in the loop but are
  // defined outside the loop, such as arguments and constants.
  SmallPtrSet<Value *, 8> LoopInvariants;

  for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
    for (Instruction &I : BB->instructionsWithoutDebug()) {
      IdxToInstr.push_back(&I);

      // Save the end location of each USE.
      for (Value *U : I.operands()) {
        auto *Instr = dyn_cast<Instruction>(U);

        // Ignore non-instruction values such as arguments, constants, etc.
        if (!Instr)
          continue;

        // If this instruction is outside the loop then record it and continue.
        if (!TheLoop->contains(Instr)) {
          LoopInvariants.insert(Instr);
          continue;
        }

        // Overwrite previous end points.
        EndPoint[Instr] = IdxToInstr.size();
        Ends.insert(Instr);
      }
    }
  }

  // Saves the list of intervals that end with the index in 'key'.
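  // For example, if the last in-loop uses of %a and %b are both recorded at
  // scan index 7, then TransposeEnds[7] contains {%a, %b} and both intervals
  // are removed from OpenIntervals once the linear scan below reaches that
  // index. (%a and %b are illustrative names.)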
6683 using InstrList = SmallVector<Instruction *, 2>; 6684 DenseMap<unsigned, InstrList> TransposeEnds; 6685 6686 // Transpose the EndPoints to a list of values that end at each index. 6687 for (auto &Interval : EndPoint) 6688 TransposeEnds[Interval.second].push_back(Interval.first); 6689 6690 SmallPtrSet<Instruction *, 8> OpenIntervals; 6691 SmallVector<RegisterUsage, 8> RUs(VFs.size()); 6692 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size()); 6693 6694 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); 6695 6696 // A lambda that gets the register usage for the given type and VF. 6697 const auto &TTICapture = TTI; 6698 auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned { 6699 if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty)) 6700 return 0; 6701 InstructionCost::CostType RegUsage = 6702 *TTICapture.getRegUsageForType(VectorType::get(Ty, VF)).getValue(); 6703 assert(RegUsage >= 0 && RegUsage <= std::numeric_limits<unsigned>::max() && 6704 "Nonsensical values for register usage."); 6705 return RegUsage; 6706 }; 6707 6708 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { 6709 Instruction *I = IdxToInstr[i]; 6710 6711 // Remove all of the instructions that end at this location. 6712 InstrList &List = TransposeEnds[i]; 6713 for (Instruction *ToRemove : List) 6714 OpenIntervals.erase(ToRemove); 6715 6716 // Ignore instructions that are never used within the loop. 6717 if (!Ends.count(I)) 6718 continue; 6719 6720 // Skip ignored values. 6721 if (ValuesToIgnore.count(I)) 6722 continue; 6723 6724 // For each VF find the maximum usage of registers. 6725 for (unsigned j = 0, e = VFs.size(); j < e; ++j) { 6726 // Count the number of live intervals. 6727 SmallMapVector<unsigned, unsigned, 4> RegUsage; 6728 6729 if (VFs[j].isScalar()) { 6730 for (auto Inst : OpenIntervals) { 6731 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6732 if (RegUsage.find(ClassID) == RegUsage.end()) 6733 RegUsage[ClassID] = 1; 6734 else 6735 RegUsage[ClassID] += 1; 6736 } 6737 } else { 6738 collectUniformsAndScalars(VFs[j]); 6739 for (auto Inst : OpenIntervals) { 6740 // Skip ignored values for VF > 1. 6741 if (VecValuesToIgnore.count(Inst)) 6742 continue; 6743 if (isScalarAfterVectorization(Inst, VFs[j])) { 6744 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6745 if (RegUsage.find(ClassID) == RegUsage.end()) 6746 RegUsage[ClassID] = 1; 6747 else 6748 RegUsage[ClassID] += 1; 6749 } else { 6750 unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType()); 6751 if (RegUsage.find(ClassID) == RegUsage.end()) 6752 RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]); 6753 else 6754 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); 6755 } 6756 } 6757 } 6758 6759 for (auto& pair : RegUsage) { 6760 if (MaxUsages[j].find(pair.first) != MaxUsages[j].end()) 6761 MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second); 6762 else 6763 MaxUsages[j][pair.first] = pair.second; 6764 } 6765 } 6766 6767 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " 6768 << OpenIntervals.size() << '\n'); 6769 6770 // Add the current instruction to the list of open intervals. 6771 OpenIntervals.insert(I); 6772 } 6773 6774 for (unsigned i = 0, e = VFs.size(); i < e; ++i) { 6775 SmallMapVector<unsigned, unsigned, 4> Invariant; 6776 6777 for (auto Inst : LoopInvariants) { 6778 unsigned Usage = 6779 VFs[i].isScalar() ? 
1 : GetRegUsage(Inst->getType(), VFs[i]); 6780 unsigned ClassID = 6781 TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType()); 6782 if (Invariant.find(ClassID) == Invariant.end()) 6783 Invariant[ClassID] = Usage; 6784 else 6785 Invariant[ClassID] += Usage; 6786 } 6787 6788 LLVM_DEBUG({ 6789 dbgs() << "LV(REG): VF = " << VFs[i] << '\n'; 6790 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size() 6791 << " item\n"; 6792 for (const auto &pair : MaxUsages[i]) { 6793 dbgs() << "LV(REG): RegisterClass: " 6794 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6795 << " registers\n"; 6796 } 6797 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size() 6798 << " item\n"; 6799 for (const auto &pair : Invariant) { 6800 dbgs() << "LV(REG): RegisterClass: " 6801 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6802 << " registers\n"; 6803 } 6804 }); 6805 6806 RU.LoopInvariantRegs = Invariant; 6807 RU.MaxLocalUsers = MaxUsages[i]; 6808 RUs[i] = RU; 6809 } 6810 6811 return RUs; 6812 } 6813 6814 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){ 6815 // TODO: Cost model for emulated masked load/store is completely 6816 // broken. This hack guides the cost model to use an artificially 6817 // high enough value to practically disable vectorization with such 6818 // operations, except where previously deployed legality hack allowed 6819 // using very low cost values. This is to avoid regressions coming simply 6820 // from moving "masked load/store" check from legality to cost model. 6821 // Masked Load/Gather emulation was previously never allowed. 6822 // Limited number of Masked Store/Scatter emulation was allowed. 6823 assert(isPredicatedInst(I) && 6824 "Expecting a scalar emulated instruction"); 6825 return isa<LoadInst>(I) || 6826 (isa<StoreInst>(I) && 6827 NumPredStores > NumberOfStoresToPredicate); 6828 } 6829 6830 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) { 6831 // If we aren't vectorizing the loop, or if we've already collected the 6832 // instructions to scalarize, there's nothing to do. Collection may already 6833 // have occurred if we have a user-selected VF and are now computing the 6834 // expected cost for interleaving. 6835 if (VF.isScalar() || VF.isZero() || 6836 InstsToScalarize.find(VF) != InstsToScalarize.end()) 6837 return; 6838 6839 // Initialize a mapping for VF in InstsToScalalarize. If we find that it's 6840 // not profitable to scalarize any instructions, the presence of VF in the 6841 // map will indicate that we've analyzed it already. 6842 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF]; 6843 6844 // Find all the instructions that are scalar with predication in the loop and 6845 // determine if it would be better to not if-convert the blocks they are in. 6846 // If so, we also record the instructions to scalarize. 6847 for (BasicBlock *BB : TheLoop->blocks()) { 6848 if (!blockNeedsPredication(BB)) 6849 continue; 6850 for (Instruction &I : *BB) 6851 if (isScalarWithPredication(&I)) { 6852 ScalarCostsTy ScalarCosts; 6853 // Do not apply discount if scalable, because that would lead to 6854 // invalid scalarization costs. 6855 // Do not apply discount logic if hacked cost is needed 6856 // for emulated masked memrefs. 6857 if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I) && 6858 computePredInstDiscount(&I, ScalarCosts, VF) >= 0) 6859 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end()); 6860 // Remember that BB will remain after vectorization. 
6861 PredicatedBBsAfterVectorization.insert(BB); 6862 } 6863 } 6864 } 6865 6866 int LoopVectorizationCostModel::computePredInstDiscount( 6867 Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) { 6868 assert(!isUniformAfterVectorization(PredInst, VF) && 6869 "Instruction marked uniform-after-vectorization will be predicated"); 6870 6871 // Initialize the discount to zero, meaning that the scalar version and the 6872 // vector version cost the same. 6873 InstructionCost Discount = 0; 6874 6875 // Holds instructions to analyze. The instructions we visit are mapped in 6876 // ScalarCosts. Those instructions are the ones that would be scalarized if 6877 // we find that the scalar version costs less. 6878 SmallVector<Instruction *, 8> Worklist; 6879 6880 // Returns true if the given instruction can be scalarized. 6881 auto canBeScalarized = [&](Instruction *I) -> bool { 6882 // We only attempt to scalarize instructions forming a single-use chain 6883 // from the original predicated block that would otherwise be vectorized. 6884 // Although not strictly necessary, we give up on instructions we know will 6885 // already be scalar to avoid traversing chains that are unlikely to be 6886 // beneficial. 6887 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() || 6888 isScalarAfterVectorization(I, VF)) 6889 return false; 6890 6891 // If the instruction is scalar with predication, it will be analyzed 6892 // separately. We ignore it within the context of PredInst. 6893 if (isScalarWithPredication(I)) 6894 return false; 6895 6896 // If any of the instruction's operands are uniform after vectorization, 6897 // the instruction cannot be scalarized. This prevents, for example, a 6898 // masked load from being scalarized. 6899 // 6900 // We assume we will only emit a value for lane zero of an instruction 6901 // marked uniform after vectorization, rather than VF identical values. 6902 // Thus, if we scalarize an instruction that uses a uniform, we would 6903 // create uses of values corresponding to the lanes we aren't emitting code 6904 // for. This behavior can be changed by allowing getScalarValue to clone 6905 // the lane zero values for uniforms rather than asserting. 6906 for (Use &U : I->operands()) 6907 if (auto *J = dyn_cast<Instruction>(U.get())) 6908 if (isUniformAfterVectorization(J, VF)) 6909 return false; 6910 6911 // Otherwise, we can scalarize the instruction. 6912 return true; 6913 }; 6914 6915 // Compute the expected cost discount from scalarizing the entire expression 6916 // feeding the predicated instruction. We currently only consider expressions 6917 // that are single-use instruction chains. 6918 Worklist.push_back(PredInst); 6919 while (!Worklist.empty()) { 6920 Instruction *I = Worklist.pop_back_val(); 6921 6922 // If we've already analyzed the instruction, there's nothing to do. 6923 if (ScalarCosts.find(I) != ScalarCosts.end()) 6924 continue; 6925 6926 // Compute the cost of the vector instruction. Note that this cost already 6927 // includes the scalarization overhead of the predicated instruction. 6928 InstructionCost VectorCost = getInstructionCost(I, VF).first; 6929 6930 // Compute the cost of the scalarized instruction. This cost is the cost of 6931 // the instruction as if it wasn't if-converted and instead remained in the 6932 // predicated block. We will scale this cost by block probability after 6933 // computing the scalarization overhead. 
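    // Illustrative note (hypothetical numbers): with VF = 4 and a predicated
    // instruction whose scalar cost is 20, ScalarCost starts at 4 * 20 = 80;
    // after the insert/extract and phi overhead is added, it is divided by
    // getReciprocalPredBlockProb() (assumed here to be 2, i.e. a 50% block
    // probability) and then compared against VectorCost to form the discount.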
6934 InstructionCost ScalarCost = 6935 VF.getFixedValue() * 6936 getInstructionCost(I, ElementCount::getFixed(1)).first; 6937 6938 // Compute the scalarization overhead of needed insertelement instructions 6939 // and phi nodes. 6940 if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) { 6941 ScalarCost += TTI.getScalarizationOverhead( 6942 cast<VectorType>(ToVectorTy(I->getType(), VF)), 6943 APInt::getAllOnes(VF.getFixedValue()), true, false); 6944 ScalarCost += 6945 VF.getFixedValue() * 6946 TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput); 6947 } 6948 6949 // Compute the scalarization overhead of needed extractelement 6950 // instructions. For each of the instruction's operands, if the operand can 6951 // be scalarized, add it to the worklist; otherwise, account for the 6952 // overhead. 6953 for (Use &U : I->operands()) 6954 if (auto *J = dyn_cast<Instruction>(U.get())) { 6955 assert(VectorType::isValidElementType(J->getType()) && 6956 "Instruction has non-scalar type"); 6957 if (canBeScalarized(J)) 6958 Worklist.push_back(J); 6959 else if (needsExtract(J, VF)) { 6960 ScalarCost += TTI.getScalarizationOverhead( 6961 cast<VectorType>(ToVectorTy(J->getType(), VF)), 6962 APInt::getAllOnes(VF.getFixedValue()), false, true); 6963 } 6964 } 6965 6966 // Scale the total scalar cost by block probability. 6967 ScalarCost /= getReciprocalPredBlockProb(); 6968 6969 // Compute the discount. A non-negative discount means the vector version 6970 // of the instruction costs more, and scalarizing would be beneficial. 6971 Discount += VectorCost - ScalarCost; 6972 ScalarCosts[I] = ScalarCost; 6973 } 6974 6975 return *Discount.getValue(); 6976 } 6977 6978 LoopVectorizationCostModel::VectorizationCostTy 6979 LoopVectorizationCostModel::expectedCost( 6980 ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) { 6981 VectorizationCostTy Cost; 6982 6983 // For each block. 6984 for (BasicBlock *BB : TheLoop->blocks()) { 6985 VectorizationCostTy BlockCost; 6986 6987 // For each instruction in the old loop. 6988 for (Instruction &I : BB->instructionsWithoutDebug()) { 6989 // Skip ignored values. 6990 if (ValuesToIgnore.count(&I) || 6991 (VF.isVector() && VecValuesToIgnore.count(&I))) 6992 continue; 6993 6994 VectorizationCostTy C = getInstructionCost(&I, VF); 6995 6996 // Check if we should override the cost. 6997 if (C.first.isValid() && 6998 ForceTargetInstructionCost.getNumOccurrences() > 0) 6999 C.first = InstructionCost(ForceTargetInstructionCost); 7000 7001 // Keep a list of instructions with invalid costs. 7002 if (Invalid && !C.first.isValid()) 7003 Invalid->emplace_back(&I, VF); 7004 7005 BlockCost.first += C.first; 7006 BlockCost.second |= C.second; 7007 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first 7008 << " for VF " << VF << " For instruction: " << I 7009 << '\n'); 7010 } 7011 7012 // If we are vectorizing a predicated block, it will have been 7013 // if-converted. This means that the block's instructions (aside from 7014 // stores and instructions that may divide by zero) will now be 7015 // unconditionally executed. For the scalar case, we may not always execute 7016 // the predicated block, if it is an if-else block. Thus, scale the block's 7017 // cost by the probability of executing it. blockNeedsPredication from 7018 // Legal is used so as to not include all blocks in tail folded loops. 
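    // For example (hypothetical numbers): if the instructions of a predicated
    // block sum to a cost of 10 and getReciprocalPredBlockProb() is 2 (a 50%
    // execution probability), the block contributes 10 / 2 = 5 to the scalar
    // loop cost below.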
7019 if (VF.isScalar() && Legal->blockNeedsPredication(BB)) 7020 BlockCost.first /= getReciprocalPredBlockProb(); 7021 7022 Cost.first += BlockCost.first; 7023 Cost.second |= BlockCost.second; 7024 } 7025 7026 return Cost; 7027 } 7028 7029 /// Gets Address Access SCEV after verifying that the access pattern 7030 /// is loop invariant except the induction variable dependence. 7031 /// 7032 /// This SCEV can be sent to the Target in order to estimate the address 7033 /// calculation cost. 7034 static const SCEV *getAddressAccessSCEV( 7035 Value *Ptr, 7036 LoopVectorizationLegality *Legal, 7037 PredicatedScalarEvolution &PSE, 7038 const Loop *TheLoop) { 7039 7040 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); 7041 if (!Gep) 7042 return nullptr; 7043 7044 // We are looking for a gep with all loop invariant indices except for one 7045 // which should be an induction variable. 7046 auto SE = PSE.getSE(); 7047 unsigned NumOperands = Gep->getNumOperands(); 7048 for (unsigned i = 1; i < NumOperands; ++i) { 7049 Value *Opd = Gep->getOperand(i); 7050 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 7051 !Legal->isInductionVariable(Opd)) 7052 return nullptr; 7053 } 7054 7055 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. 7056 return PSE.getSCEV(Ptr); 7057 } 7058 7059 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { 7060 return Legal->hasStride(I->getOperand(0)) || 7061 Legal->hasStride(I->getOperand(1)); 7062 } 7063 7064 InstructionCost 7065 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, 7066 ElementCount VF) { 7067 assert(VF.isVector() && 7068 "Scalarization cost of instruction implies vectorization."); 7069 if (VF.isScalable()) 7070 return InstructionCost::getInvalid(); 7071 7072 Type *ValTy = getLoadStoreType(I); 7073 auto SE = PSE.getSE(); 7074 7075 unsigned AS = getLoadStoreAddressSpace(I); 7076 Value *Ptr = getLoadStorePointerOperand(I); 7077 Type *PtrTy = ToVectorTy(Ptr->getType(), VF); 7078 7079 // Figure out whether the access is strided and get the stride value 7080 // if it's known in compile time 7081 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); 7082 7083 // Get the cost of the scalar memory instruction and address computation. 7084 InstructionCost Cost = 7085 VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); 7086 7087 // Don't pass *I here, since it is scalar but will actually be part of a 7088 // vectorized loop where the user of it is a vectorized instruction. 7089 const Align Alignment = getLoadStoreAlignment(I); 7090 Cost += VF.getKnownMinValue() * 7091 TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, 7092 AS, TTI::TCK_RecipThroughput); 7093 7094 // Get the overhead of the extractelement and insertelement instructions 7095 // we might create due to scalarization. 7096 Cost += getScalarizationOverhead(I, VF); 7097 7098 // If we have a predicated load/store, it will need extra i1 extracts and 7099 // conditional branches, but may not be executed for each vector lane. Scale 7100 // the cost by the probability of executing the predicated block. 
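  // In other words, the scalarized cost computed above is roughly
  //   VF * (address computation + scalar load/store) + insert/extract overhead,
  // and for predicated accesses it is further divided by
  // getReciprocalPredBlockProb() below, with the extra i1 extracts and
  // branches added on top (or the whole cost pinned to an artificially huge
  // value by useEmulatedMaskMemRefHack).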
7101 if (isPredicatedInst(I)) { 7102 Cost /= getReciprocalPredBlockProb(); 7103 7104 // Add the cost of an i1 extract and a branch 7105 auto *Vec_i1Ty = 7106 VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF); 7107 Cost += TTI.getScalarizationOverhead( 7108 Vec_i1Ty, APInt::getAllOnes(VF.getKnownMinValue()), 7109 /*Insert=*/false, /*Extract=*/true); 7110 Cost += TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput); 7111 7112 if (useEmulatedMaskMemRefHack(I)) 7113 // Artificially setting to a high enough value to practically disable 7114 // vectorization with such operations. 7115 Cost = 3000000; 7116 } 7117 7118 return Cost; 7119 } 7120 7121 InstructionCost 7122 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, 7123 ElementCount VF) { 7124 Type *ValTy = getLoadStoreType(I); 7125 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 7126 Value *Ptr = getLoadStorePointerOperand(I); 7127 unsigned AS = getLoadStoreAddressSpace(I); 7128 int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr); 7129 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 7130 7131 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 7132 "Stride should be 1 or -1 for consecutive memory access"); 7133 const Align Alignment = getLoadStoreAlignment(I); 7134 InstructionCost Cost = 0; 7135 if (Legal->isMaskRequired(I)) 7136 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 7137 CostKind); 7138 else 7139 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 7140 CostKind, I); 7141 7142 bool Reverse = ConsecutiveStride < 0; 7143 if (Reverse) 7144 Cost += 7145 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0); 7146 return Cost; 7147 } 7148 7149 InstructionCost 7150 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, 7151 ElementCount VF) { 7152 assert(Legal->isUniformMemOp(*I)); 7153 7154 Type *ValTy = getLoadStoreType(I); 7155 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 7156 const Align Alignment = getLoadStoreAlignment(I); 7157 unsigned AS = getLoadStoreAddressSpace(I); 7158 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 7159 if (isa<LoadInst>(I)) { 7160 return TTI.getAddressComputationCost(ValTy) + 7161 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS, 7162 CostKind) + 7163 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); 7164 } 7165 StoreInst *SI = cast<StoreInst>(I); 7166 7167 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); 7168 return TTI.getAddressComputationCost(ValTy) + 7169 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, 7170 CostKind) + 7171 (isLoopInvariantStoreValue 7172 ? 
0 7173 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, 7174 VF.getKnownMinValue() - 1)); 7175 } 7176 7177 InstructionCost 7178 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, 7179 ElementCount VF) { 7180 Type *ValTy = getLoadStoreType(I); 7181 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 7182 const Align Alignment = getLoadStoreAlignment(I); 7183 const Value *Ptr = getLoadStorePointerOperand(I); 7184 7185 return TTI.getAddressComputationCost(VectorTy) + 7186 TTI.getGatherScatterOpCost( 7187 I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment, 7188 TargetTransformInfo::TCK_RecipThroughput, I); 7189 } 7190 7191 InstructionCost 7192 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, 7193 ElementCount VF) { 7194 // TODO: Once we have support for interleaving with scalable vectors 7195 // we can calculate the cost properly here. 7196 if (VF.isScalable()) 7197 return InstructionCost::getInvalid(); 7198 7199 Type *ValTy = getLoadStoreType(I); 7200 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 7201 unsigned AS = getLoadStoreAddressSpace(I); 7202 7203 auto Group = getInterleavedAccessGroup(I); 7204 assert(Group && "Fail to get an interleaved access group."); 7205 7206 unsigned InterleaveFactor = Group->getFactor(); 7207 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); 7208 7209 // Holds the indices of existing members in the interleaved group. 7210 SmallVector<unsigned, 4> Indices; 7211 for (unsigned IF = 0; IF < InterleaveFactor; IF++) 7212 if (Group->getMember(IF)) 7213 Indices.push_back(IF); 7214 7215 // Calculate the cost of the whole interleaved group. 7216 bool UseMaskForGaps = 7217 (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) || 7218 (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor())); 7219 InstructionCost Cost = TTI.getInterleavedMemoryOpCost( 7220 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(), 7221 AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps); 7222 7223 if (Group->isReverse()) { 7224 // TODO: Add support for reversed masked interleaved access. 7225 assert(!Legal->isMaskRequired(I) && 7226 "Reverse masked interleaved access not supported."); 7227 Cost += 7228 Group->getNumMembers() * 7229 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0); 7230 } 7231 return Cost; 7232 } 7233 7234 Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost( 7235 Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) { 7236 using namespace llvm::PatternMatch; 7237 // Early exit for no inloop reductions 7238 if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty)) 7239 return None; 7240 auto *VectorTy = cast<VectorType>(Ty); 7241 7242 // We are looking for a pattern of, and finding the minimal acceptable cost: 7243 // reduce(mul(ext(A), ext(B))) or 7244 // reduce(mul(A, B)) or 7245 // reduce(ext(A)) or 7246 // reduce(A). 7247 // The basic idea is that we walk down the tree to do that, finding the root 7248 // reduction instruction in InLoopReductionImmediateChains. From there we find 7249 // the pattern of mul/ext and test the cost of the entire pattern vs the cost 7250 // of the components. If the reduction cost is lower then we return it for the 7251 // reduction instruction and 0 for the other instructions in the pattern. If 7252 // it is not we return an invalid cost specifying the orignal cost method 7253 // should be used. 
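  // Illustrative example (hypothetical IR): for an in-loop add reduction
  //   %a.ext = sext i8 %a to i32
  //   %b.ext = sext i8 %b to i32
  //   %mul   = mul i32 %a.ext, %b.ext
  //   %red   = add i32 %sum, %mul
  // the walk below recognises reduce(mul(ext(A), ext(B))) and compares the
  // cost of a single extended multiply-accumulate reduction against the
  // combined cost of the separate extends, the multiply and the plain
  // reduction.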
7254 Instruction *RetI = I; 7255 if (match(RetI, m_ZExtOrSExt(m_Value()))) { 7256 if (!RetI->hasOneUser()) 7257 return None; 7258 RetI = RetI->user_back(); 7259 } 7260 if (match(RetI, m_Mul(m_Value(), m_Value())) && 7261 RetI->user_back()->getOpcode() == Instruction::Add) { 7262 if (!RetI->hasOneUser()) 7263 return None; 7264 RetI = RetI->user_back(); 7265 } 7266 7267 // Test if the found instruction is a reduction, and if not return an invalid 7268 // cost specifying the parent to use the original cost modelling. 7269 if (!InLoopReductionImmediateChains.count(RetI)) 7270 return None; 7271 7272 // Find the reduction this chain is a part of and calculate the basic cost of 7273 // the reduction on its own. 7274 Instruction *LastChain = InLoopReductionImmediateChains[RetI]; 7275 Instruction *ReductionPhi = LastChain; 7276 while (!isa<PHINode>(ReductionPhi)) 7277 ReductionPhi = InLoopReductionImmediateChains[ReductionPhi]; 7278 7279 const RecurrenceDescriptor &RdxDesc = 7280 Legal->getReductionVars()[cast<PHINode>(ReductionPhi)]; 7281 7282 InstructionCost BaseCost = TTI.getArithmeticReductionCost( 7283 RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind); 7284 7285 // If we're using ordered reductions then we can just return the base cost 7286 // here, since getArithmeticReductionCost calculates the full ordered 7287 // reduction cost when FP reassociation is not allowed. 7288 if (useOrderedReductions(RdxDesc)) 7289 return BaseCost; 7290 7291 // Get the operand that was not the reduction chain and match it to one of the 7292 // patterns, returning the better cost if it is found. 7293 Instruction *RedOp = RetI->getOperand(1) == LastChain 7294 ? dyn_cast<Instruction>(RetI->getOperand(0)) 7295 : dyn_cast<Instruction>(RetI->getOperand(1)); 7296 7297 VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy); 7298 7299 Instruction *Op0, *Op1; 7300 if (RedOp && 7301 match(RedOp, 7302 m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) && 7303 match(Op0, m_ZExtOrSExt(m_Value())) && 7304 Op0->getOpcode() == Op1->getOpcode() && 7305 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() && 7306 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) && 7307 (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) { 7308 7309 // Matched reduce(ext(mul(ext(A), ext(B))) 7310 // Note that the extend opcodes need to all match, or if A==B they will have 7311 // been converted to zext(mul(sext(A), sext(A))) as it is known positive, 7312 // which is equally fine. 7313 bool IsUnsigned = isa<ZExtInst>(Op0); 7314 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy); 7315 auto *MulType = VectorType::get(Op0->getType(), VectorTy); 7316 7317 InstructionCost ExtCost = 7318 TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType, 7319 TTI::CastContextHint::None, CostKind, Op0); 7320 InstructionCost MulCost = 7321 TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind); 7322 InstructionCost Ext2Cost = 7323 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType, 7324 TTI::CastContextHint::None, CostKind, RedOp); 7325 7326 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 7327 /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 7328 CostKind); 7329 7330 if (RedCost.isValid() && 7331 RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost) 7332 return I == RetI ? 
RedCost : 0; 7333 } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) && 7334 !TheLoop->isLoopInvariant(RedOp)) { 7335 // Matched reduce(ext(A)) 7336 bool IsUnsigned = isa<ZExtInst>(RedOp); 7337 auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy); 7338 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 7339 /*IsMLA=*/false, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 7340 CostKind); 7341 7342 InstructionCost ExtCost = 7343 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType, 7344 TTI::CastContextHint::None, CostKind, RedOp); 7345 if (RedCost.isValid() && RedCost < BaseCost + ExtCost) 7346 return I == RetI ? RedCost : 0; 7347 } else if (RedOp && 7348 match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) { 7349 if (match(Op0, m_ZExtOrSExt(m_Value())) && 7350 Op0->getOpcode() == Op1->getOpcode() && 7351 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() && 7352 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) { 7353 bool IsUnsigned = isa<ZExtInst>(Op0); 7354 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy); 7355 // Matched reduce(mul(ext, ext)) 7356 InstructionCost ExtCost = 7357 TTI.getCastInstrCost(Op0->getOpcode(), VectorTy, ExtType, 7358 TTI::CastContextHint::None, CostKind, Op0); 7359 InstructionCost MulCost = 7360 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 7361 7362 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 7363 /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 7364 CostKind); 7365 7366 if (RedCost.isValid() && RedCost < ExtCost * 2 + MulCost + BaseCost) 7367 return I == RetI ? RedCost : 0; 7368 } else if (!match(I, m_ZExtOrSExt(m_Value()))) { 7369 // Matched reduce(mul()) 7370 InstructionCost MulCost = 7371 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 7372 7373 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 7374 /*IsMLA=*/true, true, RdxDesc.getRecurrenceType(), VectorTy, 7375 CostKind); 7376 7377 if (RedCost.isValid() && RedCost < MulCost + BaseCost) 7378 return I == RetI ? RedCost : 0; 7379 } 7380 } 7381 7382 return I == RetI ? Optional<InstructionCost>(BaseCost) : None; 7383 } 7384 7385 InstructionCost 7386 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, 7387 ElementCount VF) { 7388 // Calculate scalar cost only. Vectorization cost should be ready at this 7389 // moment. 7390 if (VF.isScalar()) { 7391 Type *ValTy = getLoadStoreType(I); 7392 const Align Alignment = getLoadStoreAlignment(I); 7393 unsigned AS = getLoadStoreAddressSpace(I); 7394 7395 return TTI.getAddressComputationCost(ValTy) + 7396 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, 7397 TTI::TCK_RecipThroughput, I); 7398 } 7399 return getWideningCost(I, VF); 7400 } 7401 7402 LoopVectorizationCostModel::VectorizationCostTy 7403 LoopVectorizationCostModel::getInstructionCost(Instruction *I, 7404 ElementCount VF) { 7405 // If we know that this instruction will remain uniform, check the cost of 7406 // the scalar version. 7407 if (isUniformAfterVectorization(I, VF)) 7408 VF = ElementCount::getFixed(1); 7409 7410 if (VF.isVector() && isProfitableToScalarize(I, VF)) 7411 return VectorizationCostTy(InstsToScalarize[VF][I], false); 7412 7413 // Forced scalars do not have any scalarization overhead. 
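  // Their cost is simply the scalar cost replicated VF times; e.g. a forced
  // scalar with a scalar cost of 2 at VF = 4 is charged 2 * 4 = 8
  // (hypothetical numbers), with no insert/extract cost added.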
7414 auto ForcedScalar = ForcedScalars.find(VF); 7415 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) { 7416 auto InstSet = ForcedScalar->second; 7417 if (InstSet.count(I)) 7418 return VectorizationCostTy( 7419 (getInstructionCost(I, ElementCount::getFixed(1)).first * 7420 VF.getKnownMinValue()), 7421 false); 7422 } 7423 7424 Type *VectorTy; 7425 InstructionCost C = getInstructionCost(I, VF, VectorTy); 7426 7427 bool TypeNotScalarized = 7428 VF.isVector() && VectorTy->isVectorTy() && 7429 TTI.getNumberOfParts(VectorTy) < VF.getKnownMinValue(); 7430 return VectorizationCostTy(C, TypeNotScalarized); 7431 } 7432 7433 InstructionCost 7434 LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, 7435 ElementCount VF) const { 7436 7437 // There is no mechanism yet to create a scalable scalarization loop, 7438 // so this is currently Invalid. 7439 if (VF.isScalable()) 7440 return InstructionCost::getInvalid(); 7441 7442 if (VF.isScalar()) 7443 return 0; 7444 7445 InstructionCost Cost = 0; 7446 Type *RetTy = ToVectorTy(I->getType(), VF); 7447 if (!RetTy->isVoidTy() && 7448 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) 7449 Cost += TTI.getScalarizationOverhead( 7450 cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()), true, 7451 false); 7452 7453 // Some targets keep addresses scalar. 7454 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) 7455 return Cost; 7456 7457 // Some targets support efficient element stores. 7458 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore()) 7459 return Cost; 7460 7461 // Collect operands to consider. 7462 CallInst *CI = dyn_cast<CallInst>(I); 7463 Instruction::op_range Ops = CI ? CI->args() : I->operands(); 7464 7465 // Skip operands that do not require extraction/scalarization and do not incur 7466 // any overhead. 7467 SmallVector<Type *> Tys; 7468 for (auto *V : filterExtractingOperands(Ops, VF)) 7469 Tys.push_back(MaybeVectorizeType(V->getType(), VF)); 7470 return Cost + TTI.getOperandsScalarizationOverhead( 7471 filterExtractingOperands(Ops, VF), Tys); 7472 } 7473 7474 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { 7475 if (VF.isScalar()) 7476 return; 7477 NumPredStores = 0; 7478 for (BasicBlock *BB : TheLoop->blocks()) { 7479 // For each instruction in the old loop. 7480 for (Instruction &I : *BB) { 7481 Value *Ptr = getLoadStorePointerOperand(&I); 7482 if (!Ptr) 7483 continue; 7484 7485 // TODO: We should generate better code and update the cost model for 7486 // predicated uniform stores. Today they are treated as any other 7487 // predicated store (see added test cases in 7488 // invariant-store-vectorization.ll). 7489 if (isa<StoreInst>(&I) && isScalarWithPredication(&I)) 7490 NumPredStores++; 7491 7492 if (Legal->isUniformMemOp(I)) { 7493 // TODO: Avoid replicating loads and stores instead of 7494 // relying on instcombine to remove them. 7495 // Load: Scalar load + broadcast 7496 // Store: Scalar store + isLoopInvariantStoreValue ? 
0 : extract 7497 InstructionCost Cost; 7498 if (isa<StoreInst>(&I) && VF.isScalable() && 7499 isLegalGatherOrScatter(&I)) { 7500 Cost = getGatherScatterCost(&I, VF); 7501 setWideningDecision(&I, VF, CM_GatherScatter, Cost); 7502 } else { 7503 assert((isa<LoadInst>(&I) || !VF.isScalable()) && 7504 "Cannot yet scalarize uniform stores"); 7505 Cost = getUniformMemOpCost(&I, VF); 7506 setWideningDecision(&I, VF, CM_Scalarize, Cost); 7507 } 7508 continue; 7509 } 7510 7511 // We assume that widening is the best solution when possible. 7512 if (memoryInstructionCanBeWidened(&I, VF)) { 7513 InstructionCost Cost = getConsecutiveMemOpCost(&I, VF); 7514 int ConsecutiveStride = Legal->isConsecutivePtr( 7515 getLoadStoreType(&I), getLoadStorePointerOperand(&I)); 7516 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 7517 "Expected consecutive stride."); 7518 InstWidening Decision = 7519 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; 7520 setWideningDecision(&I, VF, Decision, Cost); 7521 continue; 7522 } 7523 7524 // Choose between Interleaving, Gather/Scatter or Scalarization. 7525 InstructionCost InterleaveCost = InstructionCost::getInvalid(); 7526 unsigned NumAccesses = 1; 7527 if (isAccessInterleaved(&I)) { 7528 auto Group = getInterleavedAccessGroup(&I); 7529 assert(Group && "Fail to get an interleaved access group."); 7530 7531 // Make one decision for the whole group. 7532 if (getWideningDecision(&I, VF) != CM_Unknown) 7533 continue; 7534 7535 NumAccesses = Group->getNumMembers(); 7536 if (interleavedAccessCanBeWidened(&I, VF)) 7537 InterleaveCost = getInterleaveGroupCost(&I, VF); 7538 } 7539 7540 InstructionCost GatherScatterCost = 7541 isLegalGatherOrScatter(&I) 7542 ? getGatherScatterCost(&I, VF) * NumAccesses 7543 : InstructionCost::getInvalid(); 7544 7545 InstructionCost ScalarizationCost = 7546 getMemInstScalarizationCost(&I, VF) * NumAccesses; 7547 7548 // Choose better solution for the current VF, 7549 // write down this decision and use it during vectorization. 7550 InstructionCost Cost; 7551 InstWidening Decision; 7552 if (InterleaveCost <= GatherScatterCost && 7553 InterleaveCost < ScalarizationCost) { 7554 Decision = CM_Interleave; 7555 Cost = InterleaveCost; 7556 } else if (GatherScatterCost < ScalarizationCost) { 7557 Decision = CM_GatherScatter; 7558 Cost = GatherScatterCost; 7559 } else { 7560 Decision = CM_Scalarize; 7561 Cost = ScalarizationCost; 7562 } 7563 // If the instructions belongs to an interleave group, the whole group 7564 // receives the same decision. The whole group receives the cost, but 7565 // the cost will actually be assigned to one instruction. 7566 if (auto Group = getInterleavedAccessGroup(&I)) 7567 setWideningDecision(Group, VF, Decision, Cost); 7568 else 7569 setWideningDecision(&I, VF, Decision, Cost); 7570 } 7571 } 7572 7573 // Make sure that any load of address and any other address computation 7574 // remains scalar unless there is gather/scatter support. This avoids 7575 // inevitable extracts into address registers, and also has the benefit of 7576 // activating LSR more, since that pass can't optimize vectorized 7577 // addresses. 7578 if (TTI.prefersVectorizedAddressing()) 7579 return; 7580 7581 // Start with all scalar pointer uses. 
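  // The set is grown transitively through same-block operands below, loads
  // that feed an address are re-marked CM_Scalarize (costed as VF scalar
  // loads), and the remaining address computations are added to
  // ForcedScalars. For a hypothetical chain in a single block
  //   %p = load i32*, i32** %pp
  //   %q = getelementptr i32, i32* %p, i64 %i
  //   store i32 %v, i32* %q
  // both %p (a load feeding an address) and %q (its user gep) stay scalar.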
7582   SmallPtrSet<Instruction *, 8> AddrDefs;
7583   for (BasicBlock *BB : TheLoop->blocks())
7584     for (Instruction &I : *BB) {
7585       Instruction *PtrDef =
7586           dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
7587       if (PtrDef && TheLoop->contains(PtrDef) &&
7588           getWideningDecision(&I, VF) != CM_GatherScatter)
7589         AddrDefs.insert(PtrDef);
7590     }
7591
7592   // Add all instructions used to generate the addresses.
7593   SmallVector<Instruction *, 4> Worklist;
7594   append_range(Worklist, AddrDefs);
7595   while (!Worklist.empty()) {
7596     Instruction *I = Worklist.pop_back_val();
7597     for (auto &Op : I->operands())
7598       if (auto *InstOp = dyn_cast<Instruction>(Op))
7599         if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
7600             AddrDefs.insert(InstOp).second)
7601           Worklist.push_back(InstOp);
7602   }
7603
7604   for (auto *I : AddrDefs) {
7605     if (isa<LoadInst>(I)) {
7606       // Setting the desired widening decision should ideally be handled
7607       // by cost functions, but since this involves the task of finding out
7608       // if the loaded register is involved in an address computation, it is
7609       // instead changed here when we know this is the case.
7610       InstWidening Decision = getWideningDecision(I, VF);
7611       if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
7612         // Scalarize a widened load of address.
7613         setWideningDecision(
7614             I, VF, CM_Scalarize,
7615             (VF.getKnownMinValue() *
7616              getMemoryInstructionCost(I, ElementCount::getFixed(1))));
7617       else if (auto Group = getInterleavedAccessGroup(I)) {
7618         // Scalarize an interleave group of address loads.
7619         for (unsigned I = 0; I < Group->getFactor(); ++I) {
7620           if (Instruction *Member = Group->getMember(I))
7621             setWideningDecision(
7622                 Member, VF, CM_Scalarize,
7623                 (VF.getKnownMinValue() *
7624                  getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
7625         }
7626       }
7627     } else
7628       // Make sure I gets scalarized and a cost estimate without
7629       // scalarization overhead.
7630       ForcedScalars[VF].insert(I);
7631   }
7632 }
7633
7634 InstructionCost
7635 LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
7636                                                Type *&VectorTy) {
7637   Type *RetTy = I->getType();
7638   if (canTruncateToMinimalBitwidth(I, VF))
7639     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
7640   auto SE = PSE.getSE();
7641   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
7642
7643   auto hasSingleCopyAfterVectorization = [this](Instruction *I,
7644                                                 ElementCount VF) -> bool {
7645     if (VF.isScalar())
7646       return true;
7647
7648     auto Scalarized = InstsToScalarize.find(VF);
7649     assert(Scalarized != InstsToScalarize.end() &&
7650            "VF not yet analyzed for scalarization profitability");
7651     return !Scalarized->second.count(I) &&
7652            llvm::all_of(I->users(), [&](User *U) {
7653              auto *UI = cast<Instruction>(U);
7654              return !Scalarized->second.count(UI);
7655            });
7656   };
7657   (void) hasSingleCopyAfterVectorization;
7658
7659   if (isScalarAfterVectorization(I, VF)) {
7660     // With the exception of GEPs and PHIs, after scalarization there should
7661     // only be one copy of the instruction generated in the loop. This is
7662     // because the VF is either 1, or any instructions that need scalarizing
7663     // have already been dealt with by the time we get here. As a result,
7664     // we don't have to multiply the instruction cost by VF.
7665 assert(I->getOpcode() == Instruction::GetElementPtr || 7666 I->getOpcode() == Instruction::PHI || 7667 (I->getOpcode() == Instruction::BitCast && 7668 I->getType()->isPointerTy()) || 7669 hasSingleCopyAfterVectorization(I, VF)); 7670 VectorTy = RetTy; 7671 } else 7672 VectorTy = ToVectorTy(RetTy, VF); 7673 7674 // TODO: We need to estimate the cost of intrinsic calls. 7675 switch (I->getOpcode()) { 7676 case Instruction::GetElementPtr: 7677 // We mark this instruction as zero-cost because the cost of GEPs in 7678 // vectorized code depends on whether the corresponding memory instruction 7679 // is scalarized or not. Therefore, we handle GEPs with the memory 7680 // instruction cost. 7681 return 0; 7682 case Instruction::Br: { 7683 // In cases of scalarized and predicated instructions, there will be VF 7684 // predicated blocks in the vectorized loop. Each branch around these 7685 // blocks requires also an extract of its vector compare i1 element. 7686 bool ScalarPredicatedBB = false; 7687 BranchInst *BI = cast<BranchInst>(I); 7688 if (VF.isVector() && BI->isConditional() && 7689 (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) || 7690 PredicatedBBsAfterVectorization.count(BI->getSuccessor(1)))) 7691 ScalarPredicatedBB = true; 7692 7693 if (ScalarPredicatedBB) { 7694 // Not possible to scalarize scalable vector with predicated instructions. 7695 if (VF.isScalable()) 7696 return InstructionCost::getInvalid(); 7697 // Return cost for branches around scalarized and predicated blocks. 7698 auto *Vec_i1Ty = 7699 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); 7700 return ( 7701 TTI.getScalarizationOverhead( 7702 Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()), false, true) + 7703 (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue())); 7704 } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar()) 7705 // The back-edge branch will remain, as will all scalar branches. 7706 return TTI.getCFInstrCost(Instruction::Br, CostKind); 7707 else 7708 // This branch will be eliminated by if-conversion. 7709 return 0; 7710 // Note: We currently assume zero cost for an unconditional branch inside 7711 // a predicated block since it will become a fall-through, although we 7712 // may decide in the future to call TTI for all branches. 7713 } 7714 case Instruction::PHI: { 7715 auto *Phi = cast<PHINode>(I); 7716 7717 // First-order recurrences are replaced by vector shuffles inside the loop. 7718 // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. 7719 if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi)) 7720 return TTI.getShuffleCost( 7721 TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy), 7722 None, VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1)); 7723 7724 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 7725 // converted into select instructions. We require N - 1 selects per phi 7726 // node, where N is the number of incoming values. 
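    // For example, a phi with three incoming values is lowered to two
    // selects, so its cost is 2 * (the cost of one vector select at this VF).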
7727 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) 7728 return (Phi->getNumIncomingValues() - 1) * 7729 TTI.getCmpSelInstrCost( 7730 Instruction::Select, ToVectorTy(Phi->getType(), VF), 7731 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF), 7732 CmpInst::BAD_ICMP_PREDICATE, CostKind); 7733 7734 return TTI.getCFInstrCost(Instruction::PHI, CostKind); 7735 } 7736 case Instruction::UDiv: 7737 case Instruction::SDiv: 7738 case Instruction::URem: 7739 case Instruction::SRem: 7740 // If we have a predicated instruction, it may not be executed for each 7741 // vector lane. Get the scalarization cost and scale this amount by the 7742 // probability of executing the predicated block. If the instruction is not 7743 // predicated, we fall through to the next case. 7744 if (VF.isVector() && isScalarWithPredication(I)) { 7745 InstructionCost Cost = 0; 7746 7747 // These instructions have a non-void type, so account for the phi nodes 7748 // that we will create. This cost is likely to be zero. The phi node 7749 // cost, if any, should be scaled by the block probability because it 7750 // models a copy at the end of each predicated block. 7751 Cost += VF.getKnownMinValue() * 7752 TTI.getCFInstrCost(Instruction::PHI, CostKind); 7753 7754 // The cost of the non-predicated instruction. 7755 Cost += VF.getKnownMinValue() * 7756 TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind); 7757 7758 // The cost of insertelement and extractelement instructions needed for 7759 // scalarization. 7760 Cost += getScalarizationOverhead(I, VF); 7761 7762 // Scale the cost by the probability of executing the predicated blocks. 7763 // This assumes the predicated block for each vector lane is equally 7764 // likely. 7765 return Cost / getReciprocalPredBlockProb(); 7766 } 7767 LLVM_FALLTHROUGH; 7768 case Instruction::Add: 7769 case Instruction::FAdd: 7770 case Instruction::Sub: 7771 case Instruction::FSub: 7772 case Instruction::Mul: 7773 case Instruction::FMul: 7774 case Instruction::FDiv: 7775 case Instruction::FRem: 7776 case Instruction::Shl: 7777 case Instruction::LShr: 7778 case Instruction::AShr: 7779 case Instruction::And: 7780 case Instruction::Or: 7781 case Instruction::Xor: { 7782 // Since we will replace the stride by 1 the multiplication should go away. 7783 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) 7784 return 0; 7785 7786 // Detect reduction patterns 7787 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7788 return *RedCost; 7789 7790 // Certain instructions can be cheaper to vectorize if they have a constant 7791 // second vector operand. One example of this are shifts on x86. 
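    // E.g. a shift whose amount is an immediate or a loop-invariant value is
    // reported to the target through the operand-kind query below (a constant
    // kind or OK_UniformValue), which typically maps to a much cheaper vector
    // shift on x86 than a fully variable per-lane shift amount.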
7792 Value *Op2 = I->getOperand(1); 7793 TargetTransformInfo::OperandValueProperties Op2VP; 7794 TargetTransformInfo::OperandValueKind Op2VK = 7795 TTI.getOperandInfo(Op2, Op2VP); 7796 if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) 7797 Op2VK = TargetTransformInfo::OK_UniformValue; 7798 7799 SmallVector<const Value *, 4> Operands(I->operand_values()); 7800 return TTI.getArithmeticInstrCost( 7801 I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, 7802 Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); 7803 } 7804 case Instruction::FNeg: { 7805 return TTI.getArithmeticInstrCost( 7806 I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, 7807 TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None, 7808 TargetTransformInfo::OP_None, I->getOperand(0), I); 7809 } 7810 case Instruction::Select: { 7811 SelectInst *SI = cast<SelectInst>(I); 7812 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 7813 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 7814 7815 const Value *Op0, *Op1; 7816 using namespace llvm::PatternMatch; 7817 if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) || 7818 match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) { 7819 // select x, y, false --> x & y 7820 // select x, true, y --> x | y 7821 TTI::OperandValueProperties Op1VP = TTI::OP_None; 7822 TTI::OperandValueProperties Op2VP = TTI::OP_None; 7823 TTI::OperandValueKind Op1VK = TTI::getOperandInfo(Op0, Op1VP); 7824 TTI::OperandValueKind Op2VK = TTI::getOperandInfo(Op1, Op2VP); 7825 assert(Op0->getType()->getScalarSizeInBits() == 1 && 7826 Op1->getType()->getScalarSizeInBits() == 1); 7827 7828 SmallVector<const Value *, 2> Operands{Op0, Op1}; 7829 return TTI.getArithmeticInstrCost( 7830 match(I, m_LogicalOr()) ? 
Instruction::Or : Instruction::And, VectorTy, 7831 CostKind, Op1VK, Op2VK, Op1VP, Op2VP, Operands, I); 7832 } 7833 7834 Type *CondTy = SI->getCondition()->getType(); 7835 if (!ScalarCond) 7836 CondTy = VectorType::get(CondTy, VF); 7837 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, 7838 CmpInst::BAD_ICMP_PREDICATE, CostKind, I); 7839 } 7840 case Instruction::ICmp: 7841 case Instruction::FCmp: { 7842 Type *ValTy = I->getOperand(0)->getType(); 7843 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 7844 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 7845 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); 7846 VectorTy = ToVectorTy(ValTy, VF); 7847 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, 7848 CmpInst::BAD_ICMP_PREDICATE, CostKind, I); 7849 } 7850 case Instruction::Store: 7851 case Instruction::Load: { 7852 ElementCount Width = VF; 7853 if (Width.isVector()) { 7854 InstWidening Decision = getWideningDecision(I, Width); 7855 assert(Decision != CM_Unknown && 7856 "CM decision should be taken at this point"); 7857 if (Decision == CM_Scalarize) 7858 Width = ElementCount::getFixed(1); 7859 } 7860 VectorTy = ToVectorTy(getLoadStoreType(I), Width); 7861 return getMemoryInstructionCost(I, VF); 7862 } 7863 case Instruction::BitCast: 7864 if (I->getType()->isPointerTy()) 7865 return 0; 7866 LLVM_FALLTHROUGH; 7867 case Instruction::ZExt: 7868 case Instruction::SExt: 7869 case Instruction::FPToUI: 7870 case Instruction::FPToSI: 7871 case Instruction::FPExt: 7872 case Instruction::PtrToInt: 7873 case Instruction::IntToPtr: 7874 case Instruction::SIToFP: 7875 case Instruction::UIToFP: 7876 case Instruction::Trunc: 7877 case Instruction::FPTrunc: { 7878 // Computes the CastContextHint from a Load/Store instruction. 7879 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint { 7880 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 7881 "Expected a load or a store!"); 7882 7883 if (VF.isScalar() || !TheLoop->contains(I)) 7884 return TTI::CastContextHint::Normal; 7885 7886 switch (getWideningDecision(I, VF)) { 7887 case LoopVectorizationCostModel::CM_GatherScatter: 7888 return TTI::CastContextHint::GatherScatter; 7889 case LoopVectorizationCostModel::CM_Interleave: 7890 return TTI::CastContextHint::Interleave; 7891 case LoopVectorizationCostModel::CM_Scalarize: 7892 case LoopVectorizationCostModel::CM_Widen: 7893 return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked 7894 : TTI::CastContextHint::Normal; 7895 case LoopVectorizationCostModel::CM_Widen_Reverse: 7896 return TTI::CastContextHint::Reversed; 7897 case LoopVectorizationCostModel::CM_Unknown: 7898 llvm_unreachable("Instr did not go through cost modelling?"); 7899 } 7900 7901 llvm_unreachable("Unhandled case!"); 7902 }; 7903 7904 unsigned Opcode = I->getOpcode(); 7905 TTI::CastContextHint CCH = TTI::CastContextHint::None; 7906 // For Trunc, the context is the only user, which must be a StoreInst. 7907 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) { 7908 if (I->hasOneUse()) 7909 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin())) 7910 CCH = ComputeCCH(Store); 7911 } 7912 // For Z/Sext, the context is the operand, which must be a LoadInst. 
7913 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || 7914 Opcode == Instruction::FPExt) { 7915 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0))) 7916 CCH = ComputeCCH(Load); 7917 } 7918 7919 // We optimize the truncation of induction variables having constant 7920 // integer steps. The cost of these truncations is the same as the scalar 7921 // operation. 7922 if (isOptimizableIVTruncate(I, VF)) { 7923 auto *Trunc = cast<TruncInst>(I); 7924 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 7925 Trunc->getSrcTy(), CCH, CostKind, Trunc); 7926 } 7927 7928 // Detect reduction patterns 7929 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7930 return *RedCost; 7931 7932 Type *SrcScalarTy = I->getOperand(0)->getType(); 7933 Type *SrcVecTy = 7934 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; 7935 if (canTruncateToMinimalBitwidth(I, VF)) { 7936 // This cast is going to be shrunk. This may remove the cast or it might 7937 // turn it into slightly different cast. For example, if MinBW == 16, 7938 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". 7939 // 7940 // Calculate the modified src and dest types. 7941 Type *MinVecTy = VectorTy; 7942 if (Opcode == Instruction::Trunc) { 7943 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); 7944 VectorTy = 7945 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7946 } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) { 7947 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); 7948 VectorTy = 7949 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7950 } 7951 } 7952 7953 return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); 7954 } 7955 case Instruction::Call: { 7956 bool NeedToScalarize; 7957 CallInst *CI = cast<CallInst>(I); 7958 InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize); 7959 if (getVectorIntrinsicIDForCall(CI, TLI)) { 7960 InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF); 7961 return std::min(CallCost, IntrinsicCost); 7962 } 7963 return CallCost; 7964 } 7965 case Instruction::ExtractValue: 7966 return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput); 7967 case Instruction::Alloca: 7968 // We cannot easily widen alloca to a scalable alloca, as 7969 // the result would need to be a vector of pointers. 7970 if (VF.isScalable()) 7971 return InstructionCost::getInvalid(); 7972 LLVM_FALLTHROUGH; 7973 default: 7974 // This opcode is unknown. Assume that it is the same as 'mul'. 7975 return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 7976 } // end of switch. 
7977 } 7978 7979 char LoopVectorize::ID = 0; 7980 7981 static const char lv_name[] = "Loop Vectorization"; 7982 7983 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 7984 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 7985 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) 7986 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 7987 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) 7988 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 7989 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 7990 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 7991 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 7992 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 7993 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) 7994 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) 7995 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 7996 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 7997 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy) 7998 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 7999 8000 namespace llvm { 8001 8002 Pass *createLoopVectorizePass() { return new LoopVectorize(); } 8003 8004 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced, 8005 bool VectorizeOnlyWhenForced) { 8006 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced); 8007 } 8008 8009 } // end namespace llvm 8010 8011 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { 8012 // Check if the pointer operand of a load or store instruction is 8013 // consecutive. 8014 if (auto *Ptr = getLoadStorePointerOperand(Inst)) 8015 return Legal->isConsecutivePtr(getLoadStoreType(Inst), Ptr); 8016 return false; 8017 } 8018 8019 void LoopVectorizationCostModel::collectValuesToIgnore() { 8020 // Ignore ephemeral values. 8021 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); 8022 8023 // Ignore type-promoting instructions we identified during reduction 8024 // detection. 8025 for (auto &Reduction : Legal->getReductionVars()) { 8026 RecurrenceDescriptor &RedDes = Reduction.second; 8027 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); 8028 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 8029 } 8030 // Ignore type-casting instructions we identified during induction 8031 // detection. 8032 for (auto &Induction : Legal->getInductionVars()) { 8033 InductionDescriptor &IndDes = Induction.second; 8034 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 8035 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 8036 } 8037 } 8038 8039 void LoopVectorizationCostModel::collectInLoopReductions() { 8040 for (auto &Reduction : Legal->getReductionVars()) { 8041 PHINode *Phi = Reduction.first; 8042 RecurrenceDescriptor &RdxDesc = Reduction.second; 8043 8044 // We don't collect reductions that are type promoted (yet). 8045 if (RdxDesc.getRecurrenceType() != Phi->getType()) 8046 continue; 8047 8048 // If the target would prefer this reduction to happen "in-loop", then we 8049 // want to record it as such. 8050 unsigned Opcode = RdxDesc.getOpcode(); 8051 if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) && 8052 !TTI.preferInLoopReduction(Opcode, Phi->getType(), 8053 TargetTransformInfo::ReductionFlags())) 8054 continue; 8055 8056 // Check that we can correctly put the reductions into the loop, by 8057 // finding the chain of operations that leads from the phi to the loop 8058 // exit value. 
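    // For a plain integer add reduction the chain is typically just
    //   %phi = phi i32 [ 0, %preheader ], [ %sum, %loop ]
    //   %sum = add i32 %phi, %val
    // (illustrative IR); when such a chain is found, each link is recorded in
    // InLoopReductionImmediateChains so the cost model can attribute the
    // reduction cost to the right instruction later.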
8059 SmallVector<Instruction *, 4> ReductionOperations = 8060 RdxDesc.getReductionOpChain(Phi, TheLoop); 8061 bool InLoop = !ReductionOperations.empty(); 8062 if (InLoop) { 8063 InLoopReductionChains[Phi] = ReductionOperations; 8064 // Add the elements to InLoopReductionImmediateChains for cost modelling. 8065 Instruction *LastChain = Phi; 8066 for (auto *I : ReductionOperations) { 8067 InLoopReductionImmediateChains[I] = LastChain; 8068 LastChain = I; 8069 } 8070 } 8071 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop") 8072 << " reduction for phi: " << *Phi << "\n"); 8073 } 8074 } 8075 8076 // TODO: we could return a pair of values that specify the max VF and 8077 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of 8078 // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment 8079 // doesn't have a cost model that can choose which plan to execute if 8080 // more than one is generated. 8081 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits, 8082 LoopVectorizationCostModel &CM) { 8083 unsigned WidestType; 8084 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes(); 8085 return WidestVectorRegBits / WidestType; 8086 } 8087 8088 VectorizationFactor 8089 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { 8090 assert(!UserVF.isScalable() && "scalable vectors not yet supported"); 8091 ElementCount VF = UserVF; 8092 // Outer loop handling: They may require CFG and instruction level 8093 // transformations before even evaluating whether vectorization is profitable. 8094 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 8095 // the vectorization pipeline. 8096 if (!OrigLoop->isInnermost()) { 8097 // If the user doesn't provide a vectorization factor, determine a 8098 // reasonable one. 8099 if (UserVF.isZero()) { 8100 VF = ElementCount::getFixed(determineVPlanVF( 8101 TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector) 8102 .getFixedSize(), 8103 CM)); 8104 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n"); 8105 8106 // Make sure we have a VF > 1 for stress testing. 8107 if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) { 8108 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: " 8109 << "overriding computed VF.\n"); 8110 VF = ElementCount::getFixed(4); 8111 } 8112 } 8113 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 8114 assert(isPowerOf2_32(VF.getKnownMinValue()) && 8115 "VF needs to be a power of two"); 8116 LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "") 8117 << "VF " << VF << " to build VPlans.\n"); 8118 buildVPlans(VF, VF); 8119 8120 // For VPlan build stress testing, we bail out after VPlan construction. 8121 if (VPlanBuildStressTest) 8122 return VectorizationFactor::Disabled(); 8123 8124 return {VF, 0 /*Cost*/}; 8125 } 8126 8127 LLVM_DEBUG( 8128 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the " 8129 "VPlan-native path.\n"); 8130 return VectorizationFactor::Disabled(); 8131 } 8132 8133 Optional<VectorizationFactor> 8134 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { 8135 assert(OrigLoop->isInnermost() && "Inner loop expected."); 8136 FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC); 8137 if (!MaxFactors) // Cases that should not to be vectorized nor interleaved. 8138 return None; 8139 8140 // Invalidate interleave groups if all blocks of loop will be predicated. 
8141 if (CM.blockNeedsPredication(OrigLoop->getHeader()) && 8142 !useMaskedInterleavedAccesses(*TTI)) { 8143 LLVM_DEBUG( 8144 dbgs() 8145 << "LV: Invalidate all interleaved groups due to fold-tail by masking " 8146 "which requires masked-interleaved support.\n"); 8147 if (CM.InterleaveInfo.invalidateGroups()) 8148 // Invalidating interleave groups also requires invalidating all decisions 8149 // based on them, which includes widening decisions and uniform and scalar 8150 // values. 8151 CM.invalidateCostModelingDecisions(); 8152 } 8153 8154 ElementCount MaxUserVF = 8155 UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF; 8156 bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF); 8157 if (!UserVF.isZero() && UserVFIsLegal) { 8158 assert(isPowerOf2_32(UserVF.getKnownMinValue()) && 8159 "VF needs to be a power of two"); 8160 // Collect the instructions (and their associated costs) that will be more 8161 // profitable to scalarize. 8162 if (CM.selectUserVectorizationFactor(UserVF)) { 8163 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); 8164 CM.collectInLoopReductions(); 8165 buildVPlansWithVPRecipes(UserVF, UserVF); 8166 LLVM_DEBUG(printPlans(dbgs())); 8167 return {{UserVF, 0}}; 8168 } else 8169 reportVectorizationInfo("UserVF ignored because of invalid costs.", 8170 "InvalidCost", ORE, OrigLoop); 8171 } 8172 8173 // Populate the set of Vectorization Factor Candidates. 8174 ElementCountSet VFCandidates; 8175 for (auto VF = ElementCount::getFixed(1); 8176 ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2) 8177 VFCandidates.insert(VF); 8178 for (auto VF = ElementCount::getScalable(1); 8179 ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2) 8180 VFCandidates.insert(VF); 8181 8182 for (const auto &VF : VFCandidates) { 8183 // Collect Uniform and Scalar instructions after vectorization with VF. 8184 CM.collectUniformsAndScalars(VF); 8185 8186 // Collect the instructions (and their associated costs) that will be more 8187 // profitable to scalarize. 8188 if (VF.isVector()) 8189 CM.collectInstsToScalarize(VF); 8190 } 8191 8192 CM.collectInLoopReductions(); 8193 buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF); 8194 buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF); 8195 8196 LLVM_DEBUG(printPlans(dbgs())); 8197 if (!MaxFactors.hasVector()) 8198 return VectorizationFactor::Disabled(); 8199 8200 // Select the optimal vectorization factor. 8201 auto SelectedVF = CM.selectVectorizationFactor(VFCandidates); 8202 8203 // Check if it is profitable to vectorize with runtime checks. 
8204   unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks();
8205   if (SelectedVF.Width.getKnownMinValue() > 1 && NumRuntimePointerChecks) {
8206     bool PragmaThresholdReached =
8207         NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold;
8208     bool ThresholdReached =
8209         NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold;
8210     if ((ThresholdReached && !Hints.allowReordering()) ||
8211         PragmaThresholdReached) {
8212       ORE->emit([&]() {
8213         return OptimizationRemarkAnalysisAliasing(
8214                    DEBUG_TYPE, "CantReorderMemOps", OrigLoop->getStartLoc(),
8215                    OrigLoop->getHeader())
8216                << "loop not vectorized: cannot prove it is safe to reorder "
8217                   "memory operations";
8218       });
8219       LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
8220       Hints.emitRemarkWithHints();
8221       return VectorizationFactor::Disabled();
8222     }
8223   }
8224   return SelectedVF;
8225 }
8226
8227 VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const {
8228   assert(count_if(VPlans,
8229                   [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) ==
8230              1 &&
8231          "Best VF does not have a single VPlan.");
8232
8233   for (const VPlanPtr &Plan : VPlans) {
8234     if (Plan->hasVF(VF))
8235       return *Plan.get();
8236   }
8237   llvm_unreachable("No plan found!");
8238 }
8239
8240 void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF,
8241                                            VPlan &BestVPlan,
8242                                            InnerLoopVectorizer &ILV,
8243                                            DominatorTree *DT) {
8244   LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF << ", UF=" << BestUF
8245                     << '\n');
8246
8247   // Perform the actual loop transformation.
8248
8249   // 1. Create a new empty loop. Unlink the old loop and connect the new one.
8250   VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan};
8251   State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
8252   State.TripCount = ILV.getOrCreateTripCount(nullptr);
8253   State.CanonicalIV = ILV.Induction;
8254
8255   ILV.printDebugTracesAtStart();
8256
8257   //===------------------------------------------------===//
8258   //
8259   // Notice: any optimization or new instruction that goes
8260   // into the code below should also be implemented in
8261   // the cost model.
8262   //
8263   //===------------------------------------------------===//
8264
8265   // 2. Copy and widen instructions from the old loop into the new loop.
8266   BestVPlan.execute(&State);
8267
8268   // 3. Fix the vectorized code: take care of header phis, live-outs,
8269   // predication, updating analyses.
8270 ILV.fixVectorizedLoop(State); 8271 8272 ILV.printDebugTracesAtEnd(); 8273 } 8274 8275 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 8276 void LoopVectorizationPlanner::printPlans(raw_ostream &O) { 8277 for (const auto &Plan : VPlans) 8278 if (PrintVPlansInDotFormat) 8279 Plan->printDOT(O); 8280 else 8281 Plan->print(O); 8282 } 8283 #endif 8284 8285 void LoopVectorizationPlanner::collectTriviallyDeadInstructions( 8286 SmallPtrSetImpl<Instruction *> &DeadInstructions) { 8287 8288 // We create new control-flow for the vectorized loop, so the original exit 8289 // conditions will be dead after vectorization if it's only used by the 8290 // terminator 8291 SmallVector<BasicBlock*> ExitingBlocks; 8292 OrigLoop->getExitingBlocks(ExitingBlocks); 8293 for (auto *BB : ExitingBlocks) { 8294 auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0)); 8295 if (!Cmp || !Cmp->hasOneUse()) 8296 continue; 8297 8298 // TODO: we should introduce a getUniqueExitingBlocks on Loop 8299 if (!DeadInstructions.insert(Cmp).second) 8300 continue; 8301 8302 // The operands of the icmp is often a dead trunc, used by IndUpdate. 8303 // TODO: can recurse through operands in general 8304 for (Value *Op : Cmp->operands()) { 8305 if (isa<TruncInst>(Op) && Op->hasOneUse()) 8306 DeadInstructions.insert(cast<Instruction>(Op)); 8307 } 8308 } 8309 8310 // We create new "steps" for induction variable updates to which the original 8311 // induction variables map. An original update instruction will be dead if 8312 // all its users except the induction variable are dead. 8313 auto *Latch = OrigLoop->getLoopLatch(); 8314 for (auto &Induction : Legal->getInductionVars()) { 8315 PHINode *Ind = Induction.first; 8316 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 8317 8318 // If the tail is to be folded by masking, the primary induction variable, 8319 // if exists, isn't dead: it will be used for masking. Don't kill it. 8320 if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction()) 8321 continue; 8322 8323 if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 8324 return U == Ind || DeadInstructions.count(cast<Instruction>(U)); 8325 })) 8326 DeadInstructions.insert(IndUpdate); 8327 8328 // We record as "Dead" also the type-casting instructions we had identified 8329 // during induction analysis. We don't need any handling for them in the 8330 // vectorized loop because we have proven that, under a proper runtime 8331 // test guarding the vectorized loop, the value of the phi, and the casted 8332 // value of the phi, are the same. The last instruction in this casting chain 8333 // will get its scalar/vector/widened def from the scalar/vector/widened def 8334 // of the respective phi node. Any other casts in the induction def-use chain 8335 // have no other uses outside the phi update chain, and will be ignored. 8336 InductionDescriptor &IndDes = Induction.second; 8337 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 8338 DeadInstructions.insert(Casts.begin(), Casts.end()); 8339 } 8340 } 8341 8342 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; } 8343 8344 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } 8345 8346 Value *InnerLoopUnroller::getStepVector(Value *Val, Value *StartIdx, 8347 Value *Step, 8348 Instruction::BinaryOps BinOp) { 8349 // When unrolling and the VF is 1, we only need to add a simple scalar. 
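// In other words, no vector values are materialized here: each unrolled
// part simply receives the scalar Val + StartIdx * Step (or the matching
// floating-point operation when Val is an FP value), as the code below shows.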
8350 Type *Ty = Val->getType(); 8351 assert(!Ty->isVectorTy() && "Val must be a scalar"); 8352 8353 if (Ty->isFloatingPointTy()) { 8354 // Floating-point operations inherit FMF via the builder's flags. 8355 Value *MulOp = Builder.CreateFMul(StartIdx, Step); 8356 return Builder.CreateBinOp(BinOp, Val, MulOp); 8357 } 8358 return Builder.CreateAdd(Val, Builder.CreateMul(StartIdx, Step), "induction"); 8359 } 8360 8361 static void AddRuntimeUnrollDisableMetaData(Loop *L) { 8362 SmallVector<Metadata *, 4> MDs; 8363 // Reserve first location for self reference to the LoopID metadata node. 8364 MDs.push_back(nullptr); 8365 bool IsUnrollMetadata = false; 8366 MDNode *LoopID = L->getLoopID(); 8367 if (LoopID) { 8368 // First find existing loop unrolling disable metadata. 8369 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { 8370 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); 8371 if (MD) { 8372 const auto *S = dyn_cast<MDString>(MD->getOperand(0)); 8373 IsUnrollMetadata = 8374 S && S->getString().startswith("llvm.loop.unroll.disable"); 8375 } 8376 MDs.push_back(LoopID->getOperand(i)); 8377 } 8378 } 8379 8380 if (!IsUnrollMetadata) { 8381 // Add runtime unroll disable metadata. 8382 LLVMContext &Context = L->getHeader()->getContext(); 8383 SmallVector<Metadata *, 1> DisableOperands; 8384 DisableOperands.push_back( 8385 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 8386 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 8387 MDs.push_back(DisableNode); 8388 MDNode *NewLoopID = MDNode::get(Context, MDs); 8389 // Set operand 0 to refer to the loop id itself. 8390 NewLoopID->replaceOperandWith(0, NewLoopID); 8391 L->setLoopID(NewLoopID); 8392 } 8393 } 8394 8395 //===--------------------------------------------------------------------===// 8396 // EpilogueVectorizerMainLoop 8397 //===--------------------------------------------------------------------===// 8398 8399 /// This function is partially responsible for generating the control flow 8400 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 8401 BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { 8402 MDNode *OrigLoopID = OrigLoop->getLoopID(); 8403 Loop *Lp = createVectorLoopSkeleton(""); 8404 8405 // Generate the code to check the minimum iteration count of the vector 8406 // epilogue (see below). 8407 EPI.EpilogueIterationCountCheck = 8408 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, true); 8409 EPI.EpilogueIterationCountCheck->setName("iter.check"); 8410 8411 // Generate the code to check any assumptions that we've made for SCEV 8412 // expressions. 8413 EPI.SCEVSafetyCheck = emitSCEVChecks(Lp, LoopScalarPreHeader); 8414 8415 // Generate the code that checks at runtime if arrays overlap. We put the 8416 // checks into a separate block to make the more common case of few elements 8417 // faster. 8418 EPI.MemSafetyCheck = emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 8419 8420 // Generate the iteration count check for the main loop, *after* the check 8421 // for the epilogue loop, so that the path-length is shorter for the case 8422 // that goes directly through the vector epilogue. The longer-path length for 8423 // the main loop is compensated for, by the gain from vectorizing the larger 8424 // trip count. Note: the branch will get updated later on when we vectorize 8425 // the epilogue. 
8426 EPI.MainLoopIterationCountCheck = 8427 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false); 8428 8429 // Generate the induction variable. 8430 OldInduction = Legal->getPrimaryInduction(); 8431 Type *IdxTy = Legal->getWidestInductionType(); 8432 Value *StartIdx = ConstantInt::get(IdxTy, 0); 8433 8434 IRBuilder<> B(&*Lp->getLoopPreheader()->getFirstInsertionPt()); 8435 Value *Step = getRuntimeVF(B, IdxTy, VF * UF); 8436 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 8437 EPI.VectorTripCount = CountRoundDown; 8438 Induction = 8439 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 8440 getDebugLocFromInstOrOperands(OldInduction)); 8441 8442 // Skip induction resume value creation here because they will be created in 8443 // the second pass. If we created them here, they wouldn't be used anyway, 8444 // because the vplan in the second pass still contains the inductions from the 8445 // original loop. 8446 8447 return completeLoopSkeleton(Lp, OrigLoopID); 8448 } 8449 8450 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { 8451 LLVM_DEBUG({ 8452 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n" 8453 << "Main Loop VF:" << EPI.MainLoopVF 8454 << ", Main Loop UF:" << EPI.MainLoopUF 8455 << ", Epilogue Loop VF:" << EPI.EpilogueVF 8456 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 8457 }); 8458 } 8459 8460 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() { 8461 DEBUG_WITH_TYPE(VerboseDebug, { 8462 dbgs() << "intermediate fn:\n" << *Induction->getFunction() << "\n"; 8463 }); 8464 } 8465 8466 BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck( 8467 Loop *L, BasicBlock *Bypass, bool ForEpilogue) { 8468 assert(L && "Expected valid Loop."); 8469 assert(Bypass && "Expected valid bypass basic block."); 8470 ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF; 8471 unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF; 8472 Value *Count = getOrCreateTripCount(L); 8473 // Reuse existing vector loop preheader for TC checks. 8474 // Note that new preheader block is generated for vector loop. 8475 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 8476 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 8477 8478 // Generate code to check if the loop's trip count is less than VF * UF of the 8479 // main vector loop. 8480 auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF : VF) ? 8481 ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 8482 8483 Value *CheckMinIters = Builder.CreateICmp( 8484 P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor), 8485 "min.iters.check"); 8486 8487 if (!ForEpilogue) 8488 TCCheckBlock->setName("vector.main.loop.iter.check"); 8489 8490 // Create new preheader for vector loop. 8491 LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), 8492 DT, LI, nullptr, "vector.ph"); 8493 8494 if (ForEpilogue) { 8495 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 8496 DT->getNode(Bypass)->getIDom()) && 8497 "TC check is expected to dominate Bypass"); 8498 8499 // Update dominator for Bypass & LoopExit. 8500 DT->changeImmediateDominator(Bypass, TCCheckBlock); 8501 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF)) 8502 // For loops with multiple exits, there's no edge from the middle block 8503 // to exit blocks (as the epilogue must run) and thus no need to update 8504 // the immediate dominator of the exit blocks. 
8505 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 8506 8507 LoopBypassBlocks.push_back(TCCheckBlock); 8508 8509 // Save the trip count so we don't have to regenerate it in the 8510 // vec.epilog.iter.check. This is safe to do because the trip count 8511 // generated here dominates the vector epilog iter check. 8512 EPI.TripCount = Count; 8513 } 8514 8515 ReplaceInstWithInst( 8516 TCCheckBlock->getTerminator(), 8517 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 8518 8519 return TCCheckBlock; 8520 } 8521 8522 //===--------------------------------------------------------------------===// 8523 // EpilogueVectorizerEpilogueLoop 8524 //===--------------------------------------------------------------------===// 8525 8526 /// This function is partially responsible for generating the control flow 8527 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 8528 BasicBlock * 8529 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { 8530 MDNode *OrigLoopID = OrigLoop->getLoopID(); 8531 Loop *Lp = createVectorLoopSkeleton("vec.epilog."); 8532 8533 // Now, compare the remaining count and if there aren't enough iterations to 8534 // execute the vectorized epilogue skip to the scalar part. 8535 BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader; 8536 VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check"); 8537 LoopVectorPreHeader = 8538 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 8539 LI, nullptr, "vec.epilog.ph"); 8540 emitMinimumVectorEpilogueIterCountCheck(Lp, LoopScalarPreHeader, 8541 VecEpilogueIterationCountCheck); 8542 8543 // Adjust the control flow taking the state info from the main loop 8544 // vectorization into account. 8545 assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck && 8546 "expected this to be saved from the previous pass."); 8547 EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith( 8548 VecEpilogueIterationCountCheck, LoopVectorPreHeader); 8549 8550 DT->changeImmediateDominator(LoopVectorPreHeader, 8551 EPI.MainLoopIterationCountCheck); 8552 8553 EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith( 8554 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8555 8556 if (EPI.SCEVSafetyCheck) 8557 EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith( 8558 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8559 if (EPI.MemSafetyCheck) 8560 EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith( 8561 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8562 8563 DT->changeImmediateDominator( 8564 VecEpilogueIterationCountCheck, 8565 VecEpilogueIterationCountCheck->getSinglePredecessor()); 8566 8567 DT->changeImmediateDominator(LoopScalarPreHeader, 8568 EPI.EpilogueIterationCountCheck); 8569 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF)) 8570 // If there is an epilogue which must run, there's no edge from the 8571 // middle block to exit blocks and thus no need to update the immediate 8572 // dominator of the exit blocks. 8573 DT->changeImmediateDominator(LoopExitBlock, 8574 EPI.EpilogueIterationCountCheck); 8575 8576 // Keep track of bypass blocks, as they feed start values to the induction 8577 // phis in the scalar loop preheader. 
8578 if (EPI.SCEVSafetyCheck) 8579 LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck); 8580 if (EPI.MemSafetyCheck) 8581 LoopBypassBlocks.push_back(EPI.MemSafetyCheck); 8582 LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck); 8583 8584 // Generate a resume induction for the vector epilogue and put it in the 8585 // vector epilogue preheader. 8586 Type *IdxTy = Legal->getWidestInductionType(); 8587 PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val", 8588 LoopVectorPreHeader->getFirstNonPHI()); 8589 EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck); 8590 EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0), 8591 EPI.MainLoopIterationCountCheck); 8592 8593 // Generate the induction variable. 8594 OldInduction = Legal->getPrimaryInduction(); 8595 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 8596 Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF); 8597 Value *StartIdx = EPResumeVal; 8598 Induction = 8599 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 8600 getDebugLocFromInstOrOperands(OldInduction)); 8601 8602 // Generate induction resume values. These variables save the new starting 8603 // indexes for the scalar loop. They are used to test if there are any tail 8604 // iterations left once the vector loop has completed. 8605 // Note that when the vectorized epilogue is skipped due to iteration count 8606 // check, then the resume value for the induction variable comes from 8607 // the trip count of the main vector loop, hence passing the AdditionalBypass 8608 // argument. 8609 createInductionResumeValues(Lp, CountRoundDown, 8610 {VecEpilogueIterationCountCheck, 8611 EPI.VectorTripCount} /* AdditionalBypass */); 8612 8613 AddRuntimeUnrollDisableMetaData(Lp); 8614 return completeLoopSkeleton(Lp, OrigLoopID); 8615 } 8616 8617 BasicBlock * 8618 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck( 8619 Loop *L, BasicBlock *Bypass, BasicBlock *Insert) { 8620 8621 assert(EPI.TripCount && 8622 "Expected trip count to have been saved in the first pass."); 8623 assert( 8624 (!isa<Instruction>(EPI.TripCount) || 8625 DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) && 8626 "saved trip count does not dominate insertion point."); 8627 Value *TC = EPI.TripCount; 8628 IRBuilder<> Builder(Insert->getTerminator()); 8629 Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining"); 8630 8631 // Generate code to check if the loop's trip count is less than VF * UF of the 8632 // vector epilogue loop. 8633 auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF) ?
8634 ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 8635 8636 Value *CheckMinIters = 8637 Builder.CreateICmp(P, Count, 8638 createStepForVF(Builder, Count->getType(), 8639 EPI.EpilogueVF, EPI.EpilogueUF), 8640 "min.epilog.iters.check"); 8641 8642 ReplaceInstWithInst( 8643 Insert->getTerminator(), 8644 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 8645 8646 LoopBypassBlocks.push_back(Insert); 8647 return Insert; 8648 } 8649 8650 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() { 8651 LLVM_DEBUG({ 8652 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n" 8653 << "Epilogue Loop VF:" << EPI.EpilogueVF 8654 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 8655 }); 8656 } 8657 8658 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() { 8659 DEBUG_WITH_TYPE(VerboseDebug, { 8660 dbgs() << "final fn:\n" << *Induction->getFunction() << "\n"; 8661 }); 8662 } 8663 8664 bool LoopVectorizationPlanner::getDecisionAndClampRange( 8665 const std::function<bool(ElementCount)> &Predicate, VFRange &Range) { 8666 assert(!Range.isEmpty() && "Trying to test an empty VF range."); 8667 bool PredicateAtRangeStart = Predicate(Range.Start); 8668 8669 for (ElementCount TmpVF = Range.Start * 2; 8670 ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2) 8671 if (Predicate(TmpVF) != PredicateAtRangeStart) { 8672 Range.End = TmpVF; 8673 break; 8674 } 8675 8676 return PredicateAtRangeStart; 8677 } 8678 8679 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, 8680 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range 8681 /// of VF's starting at a given VF and extending it as much as possible. Each 8682 /// vectorization decision can potentially shorten this sub-range during 8683 /// buildVPlan(). 8684 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF, 8685 ElementCount MaxVF) { 8686 auto MaxVFPlusOne = MaxVF.getWithIncrement(1); 8687 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { 8688 VFRange SubRange = {VF, MaxVFPlusOne}; 8689 VPlans.push_back(buildVPlan(SubRange)); 8690 VF = SubRange.End; 8691 } 8692 } 8693 8694 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, 8695 VPlanPtr &Plan) { 8696 assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); 8697 8698 // Look for cached value. 8699 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 8700 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); 8701 if (ECEntryIt != EdgeMaskCache.end()) 8702 return ECEntryIt->second; 8703 8704 VPValue *SrcMask = createBlockInMask(Src, Plan); 8705 8706 // The terminator has to be a branch inst! 8707 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 8708 assert(BI && "Unexpected terminator found"); 8709 8710 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) 8711 return EdgeMaskCache[Edge] = SrcMask; 8712 8713 // If source is an exiting block, we know the exit edge is dynamically dead 8714 // in the vector loop, and thus we don't need to restrict the mask. Avoid 8715 // adding uses of an otherwise potentially dead instruction. 8716 if (OrigLoop->isLoopExiting(Src)) 8717 return EdgeMaskCache[Edge] = SrcMask; 8718 8719 VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition()); 8720 assert(EdgeMask && "No Edge Mask found for condition"); 8721 8722 if (BI->getSuccessor(0) != Dst) 8723 EdgeMask = Builder.createNot(EdgeMask); 8724 8725 if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND. 
8726 // The condition is 'SrcMask && EdgeMask', which is equivalent to 8727 // 'select i1 SrcMask, i1 EdgeMask, i1 false'. 8728 // The select version does not introduce new UB if SrcMask is false and 8729 // EdgeMask is poison. Using 'and' here introduces undefined behavior. 8730 VPValue *False = Plan->getOrAddVPValue( 8731 ConstantInt::getFalse(BI->getCondition()->getType())); 8732 EdgeMask = Builder.createSelect(SrcMask, EdgeMask, False); 8733 } 8734 8735 return EdgeMaskCache[Edge] = EdgeMask; 8736 } 8737 8738 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { 8739 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 8740 8741 // Look for cached value. 8742 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); 8743 if (BCEntryIt != BlockMaskCache.end()) 8744 return BCEntryIt->second; 8745 8746 // All-one mask is modelled as no-mask following the convention for masked 8747 // load/store/gather/scatter. Initialize BlockMask to no-mask. 8748 VPValue *BlockMask = nullptr; 8749 8750 if (OrigLoop->getHeader() == BB) { 8751 if (!CM.blockNeedsPredication(BB)) 8752 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. 8753 8754 // Create the block in mask as the first non-phi instruction in the block. 8755 VPBuilder::InsertPointGuard Guard(Builder); 8756 auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi(); 8757 Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint); 8758 8759 // Introduce the early-exit compare IV <= BTC to form header block mask. 8760 // This is used instead of IV < TC because TC may wrap, unlike BTC. 8761 // Start by constructing the desired canonical IV. 8762 VPValue *IV = nullptr; 8763 if (Legal->getPrimaryInduction()) 8764 IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction()); 8765 else { 8766 auto *IVRecipe = new VPWidenCanonicalIVRecipe(); 8767 Builder.getInsertBlock()->insert(IVRecipe, NewInsertionPoint); 8768 IV = IVRecipe; 8769 } 8770 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); 8771 bool TailFolded = !CM.isScalarEpilogueAllowed(); 8772 8773 if (TailFolded && CM.TTI.emitGetActiveLaneMask()) { 8774 // While ActiveLaneMask is a binary op that consumes the loop tripcount 8775 // as a second argument, we only pass the IV here and extract the 8776 // tripcount from the transform state where codegen of the VP instructions 8777 // happen. 8778 BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV}); 8779 } else { 8780 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); 8781 } 8782 return BlockMaskCache[BB] = BlockMask; 8783 } 8784 8785 // This is the block mask. We OR all incoming edges. 8786 for (auto *Predecessor : predecessors(BB)) { 8787 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); 8788 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too. 8789 return BlockMaskCache[BB] = EdgeMask; 8790 8791 if (!BlockMask) { // BlockMask has its initialized nullptr value. 
8792 BlockMask = EdgeMask; 8793 continue; 8794 } 8795 8796 BlockMask = Builder.createOr(BlockMask, EdgeMask); 8797 } 8798 8799 return BlockMaskCache[BB] = BlockMask; 8800 } 8801 8802 VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, 8803 ArrayRef<VPValue *> Operands, 8804 VFRange &Range, 8805 VPlanPtr &Plan) { 8806 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 8807 "Must be called with either a load or store"); 8808 8809 auto willWiden = [&](ElementCount VF) -> bool { 8810 if (VF.isScalar()) 8811 return false; 8812 LoopVectorizationCostModel::InstWidening Decision = 8813 CM.getWideningDecision(I, VF); 8814 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 8815 "CM decision should be taken at this point."); 8816 if (Decision == LoopVectorizationCostModel::CM_Interleave) 8817 return true; 8818 if (CM.isScalarAfterVectorization(I, VF) || 8819 CM.isProfitableToScalarize(I, VF)) 8820 return false; 8821 return Decision != LoopVectorizationCostModel::CM_Scalarize; 8822 }; 8823 8824 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8825 return nullptr; 8826 8827 VPValue *Mask = nullptr; 8828 if (Legal->isMaskRequired(I)) 8829 Mask = createBlockInMask(I->getParent(), Plan); 8830 8831 // Determine if the pointer operand of the access is either consecutive or 8832 // reverse consecutive. 8833 LoopVectorizationCostModel::InstWidening Decision = 8834 CM.getWideningDecision(I, Range.Start); 8835 bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse; 8836 bool Consecutive = 8837 Reverse || Decision == LoopVectorizationCostModel::CM_Widen; 8838 8839 if (LoadInst *Load = dyn_cast<LoadInst>(I)) 8840 return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask, 8841 Consecutive, Reverse); 8842 8843 StoreInst *Store = cast<StoreInst>(I); 8844 return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0], 8845 Mask, Consecutive, Reverse); 8846 } 8847 8848 VPWidenIntOrFpInductionRecipe * 8849 VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi, 8850 ArrayRef<VPValue *> Operands) const { 8851 // Check if this is an integer or fp induction. If so, build the recipe that 8852 // produces its scalar and vector values. 8853 InductionDescriptor II = Legal->getInductionVars().lookup(Phi); 8854 if (II.getKind() == InductionDescriptor::IK_IntInduction || 8855 II.getKind() == InductionDescriptor::IK_FpInduction) { 8856 assert(II.getStartValue() == 8857 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); 8858 const SmallVectorImpl<Instruction *> &Casts = II.getCastInsts(); 8859 return new VPWidenIntOrFpInductionRecipe( 8860 Phi, Operands[0], Casts.empty() ? nullptr : Casts.front()); 8861 } 8862 8863 return nullptr; 8864 } 8865 8866 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate( 8867 TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range, 8868 VPlan &Plan) const { 8869 // Optimize the special case where the source is a constant integer 8870 // induction variable. Notice that we can only optimize the 'trunc' case 8871 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and 8872 // (c) other casts depend on pointer size. 8873 8874 // Determine whether \p K is a truncation based on an induction variable that 8875 // can be optimized. 
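// For example, a 'trunc i64 %iv to i32' of the primary induction can be
// modelled directly as a narrower i32 induction instead of widening the
// i64 induction and truncating every element. (Illustrative; whether this
// holds for a given VF is what isOptimizableIVTruncate answers below.)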
8876 auto isOptimizableIVTruncate = 8877 [&](Instruction *K) -> std::function<bool(ElementCount)> { 8878 return [=](ElementCount VF) -> bool { 8879 return CM.isOptimizableIVTruncate(K, VF); 8880 }; 8881 }; 8882 8883 if (LoopVectorizationPlanner::getDecisionAndClampRange( 8884 isOptimizableIVTruncate(I), Range)) { 8885 8886 InductionDescriptor II = 8887 Legal->getInductionVars().lookup(cast<PHINode>(I->getOperand(0))); 8888 VPValue *Start = Plan.getOrAddVPValue(II.getStartValue()); 8889 return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)), 8890 Start, nullptr, I); 8891 } 8892 return nullptr; 8893 } 8894 8895 VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi, 8896 ArrayRef<VPValue *> Operands, 8897 VPlanPtr &Plan) { 8898 // If all incoming values are equal, the incoming VPValue can be used directly 8899 // instead of creating a new VPBlendRecipe. 8900 VPValue *FirstIncoming = Operands[0]; 8901 if (all_of(Operands, [FirstIncoming](const VPValue *Inc) { 8902 return FirstIncoming == Inc; 8903 })) { 8904 return Operands[0]; 8905 } 8906 8907 // We know that all PHIs in non-header blocks are converted into selects, so 8908 // we don't have to worry about the insertion order and we can just use the 8909 // builder. At this point we generate the predication tree. There may be 8910 // duplications since this is a simple recursive scan, but future 8911 // optimizations will clean it up. 8912 SmallVector<VPValue *, 2> OperandsWithMask; 8913 unsigned NumIncoming = Phi->getNumIncomingValues(); 8914 8915 for (unsigned In = 0; In < NumIncoming; In++) { 8916 VPValue *EdgeMask = 8917 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan); 8918 assert((EdgeMask || NumIncoming == 1) && 8919 "Multiple predecessors with one having a full mask"); 8920 OperandsWithMask.push_back(Operands[In]); 8921 if (EdgeMask) 8922 OperandsWithMask.push_back(EdgeMask); 8923 } 8924 return toVPRecipeResult(new VPBlendRecipe(Phi, OperandsWithMask)); 8925 } 8926 8927 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, 8928 ArrayRef<VPValue *> Operands, 8929 VFRange &Range) const { 8930 8931 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8932 [this, CI](ElementCount VF) { return CM.isScalarWithPredication(CI); }, 8933 Range); 8934 8935 if (IsPredicated) 8936 return nullptr; 8937 8938 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8939 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || 8940 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect || 8941 ID == Intrinsic::pseudoprobe || 8942 ID == Intrinsic::experimental_noalias_scope_decl)) 8943 return nullptr; 8944 8945 auto willWiden = [&](ElementCount VF) -> bool { 8946 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8947 // The following case may be scalarized depending on the VF. 8948 // The flag shows whether we use Intrinsic or a usual Call for vectorized 8949 // version of the instruction. 8950 // Is it beneficial to perform intrinsic call compared to lib call? 8951 bool NeedToScalarize = false; 8952 InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); 8953 InstructionCost IntrinsicCost = ID ? 
CM.getVectorIntrinsicCost(CI, VF) : 0; 8954 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 8955 return UseVectorIntrinsic || !NeedToScalarize; 8956 }; 8957 8958 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8959 return nullptr; 8960 8961 ArrayRef<VPValue *> Ops = Operands.take_front(CI->arg_size()); 8962 return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end())); 8963 } 8964 8965 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { 8966 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) && 8967 !isa<StoreInst>(I) && "Instruction should have been handled earlier"); 8968 // Instruction should be widened, unless it is scalar after vectorization, 8969 // scalarization is profitable or it is predicated. 8970 auto WillScalarize = [this, I](ElementCount VF) -> bool { 8971 return CM.isScalarAfterVectorization(I, VF) || 8972 CM.isProfitableToScalarize(I, VF) || CM.isScalarWithPredication(I); 8973 }; 8974 return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize, 8975 Range); 8976 } 8977 8978 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, 8979 ArrayRef<VPValue *> Operands) const { 8980 auto IsVectorizableOpcode = [](unsigned Opcode) { 8981 switch (Opcode) { 8982 case Instruction::Add: 8983 case Instruction::And: 8984 case Instruction::AShr: 8985 case Instruction::BitCast: 8986 case Instruction::FAdd: 8987 case Instruction::FCmp: 8988 case Instruction::FDiv: 8989 case Instruction::FMul: 8990 case Instruction::FNeg: 8991 case Instruction::FPExt: 8992 case Instruction::FPToSI: 8993 case Instruction::FPToUI: 8994 case Instruction::FPTrunc: 8995 case Instruction::FRem: 8996 case Instruction::FSub: 8997 case Instruction::ICmp: 8998 case Instruction::IntToPtr: 8999 case Instruction::LShr: 9000 case Instruction::Mul: 9001 case Instruction::Or: 9002 case Instruction::PtrToInt: 9003 case Instruction::SDiv: 9004 case Instruction::Select: 9005 case Instruction::SExt: 9006 case Instruction::Shl: 9007 case Instruction::SIToFP: 9008 case Instruction::SRem: 9009 case Instruction::Sub: 9010 case Instruction::Trunc: 9011 case Instruction::UDiv: 9012 case Instruction::UIToFP: 9013 case Instruction::URem: 9014 case Instruction::Xor: 9015 case Instruction::ZExt: 9016 return true; 9017 } 9018 return false; 9019 }; 9020 9021 if (!IsVectorizableOpcode(I->getOpcode())) 9022 return nullptr; 9023 9024 // Success: widen this instruction. 9025 return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end())); 9026 } 9027 9028 void VPRecipeBuilder::fixHeaderPhis() { 9029 BasicBlock *OrigLatch = OrigLoop->getLoopLatch(); 9030 for (VPWidenPHIRecipe *R : PhisToFix) { 9031 auto *PN = cast<PHINode>(R->getUnderlyingValue()); 9032 VPRecipeBase *IncR = 9033 getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch))); 9034 R->addOperand(IncR->getVPSingleValue()); 9035 } 9036 } 9037 9038 VPBasicBlock *VPRecipeBuilder::handleReplication( 9039 Instruction *I, VFRange &Range, VPBasicBlock *VPBB, 9040 VPlanPtr &Plan) { 9041 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( 9042 [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); }, 9043 Range); 9044 9045 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 9046 [&](ElementCount VF) { return CM.isPredicatedInst(I); }, Range); 9047 9048 // Even if the instruction is not marked as uniform, there are certain 9049 // intrinsic calls that can be effectively treated as such, so we check for 9050 // them here. 
Conservatively, we only do this for scalable vectors, since 9051 // for fixed-width VFs we can always fall back on full scalarization. 9052 if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) { 9053 switch (cast<IntrinsicInst>(I)->getIntrinsicID()) { 9054 case Intrinsic::assume: 9055 case Intrinsic::lifetime_start: 9056 case Intrinsic::lifetime_end: 9057 // For scalable vectors if one of the operands is variant then we still 9058 // want to mark as uniform, which will generate one instruction for just 9059 // the first lane of the vector. We can't scalarize the call in the same 9060 // way as for fixed-width vectors because we don't know how many lanes 9061 // there are. 9062 // 9063 // The reasons for doing it this way for scalable vectors are: 9064 // 1. For the assume intrinsic generating the instruction for the first 9065 // lane is still better than not generating any at all. For 9066 // example, the input may be a splat across all lanes. 9067 // 2. For the lifetime start/end intrinsics the pointer operand only 9068 // does anything useful when the input comes from a stack object, 9069 // which suggests it should always be uniform. For non-stack objects 9070 // the effect is to poison the object, which still allows us to 9071 // remove the call. 9072 IsUniform = true; 9073 break; 9074 default: 9075 break; 9076 } 9077 } 9078 9079 auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()), 9080 IsUniform, IsPredicated); 9081 setRecipe(I, Recipe); 9082 Plan->addVPValue(I, Recipe); 9083 9084 // Find if I uses a predicated instruction. If so, it will use its scalar 9085 // value. Avoid hoisting the insert-element which packs the scalar value into 9086 // a vector value, as that happens iff all users use the vector value. 9087 for (VPValue *Op : Recipe->operands()) { 9088 auto *PredR = dyn_cast_or_null<VPPredInstPHIRecipe>(Op->getDef()); 9089 if (!PredR) 9090 continue; 9091 auto *RepR = 9092 cast_or_null<VPReplicateRecipe>(PredR->getOperand(0)->getDef()); 9093 assert(RepR->isPredicated() && 9094 "expected Replicate recipe to be predicated"); 9095 RepR->setAlsoPack(false); 9096 } 9097 9098 // Finalize the recipe for Instr, first if it is not predicated. 9099 if (!IsPredicated) { 9100 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n"); 9101 VPBB->appendRecipe(Recipe); 9102 return VPBB; 9103 } 9104 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); 9105 assert(VPBB->getSuccessors().empty() && 9106 "VPBB has successors when handling predicated replication."); 9107 // Record predicated instructions for above packing optimizations. 9108 VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan); 9109 VPBlockUtils::insertBlockAfter(Region, VPBB); 9110 auto *RegSucc = new VPBasicBlock(); 9111 VPBlockUtils::insertBlockAfter(RegSucc, Region); 9112 return RegSucc; 9113 } 9114 9115 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr, 9116 VPRecipeBase *PredRecipe, 9117 VPlanPtr &Plan) { 9118 // Instructions marked for predication are replicated and placed under an 9119 // if-then construct to prevent side-effects. 9120 9121 // Generate recipes to compute the block mask for this region. 9122 VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan); 9123 9124 // Build the triangular if-then region.
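// Roughly, the region built below has the following shape:
//
//    pred.<opcode>.entry      (branch-on-mask BlockInMask)
//       |              \
//    pred.<opcode>.if    \
//    (replicated Instr)   |
//       |                /
//    pred.<opcode>.continue  (predicated-instruction phi, only when Instr
//                             produces a value)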
9125 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str(); 9126 assert(Instr->getParent() && "Predicated instruction not in any basic block"); 9127 auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask); 9128 auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); 9129 auto *PHIRecipe = Instr->getType()->isVoidTy() 9130 ? nullptr 9131 : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr)); 9132 if (PHIRecipe) { 9133 Plan->removeVPValueFor(Instr); 9134 Plan->addVPValue(Instr, PHIRecipe); 9135 } 9136 auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); 9137 auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); 9138 VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true); 9139 9140 // Note: first set Entry as region entry and then connect successors starting 9141 // from it in order, to propagate the "parent" of each VPBasicBlock. 9142 VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry); 9143 VPBlockUtils::connectBlocks(Pred, Exit); 9144 9145 return Region; 9146 } 9147 9148 VPRecipeOrVPValueTy 9149 VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, 9150 ArrayRef<VPValue *> Operands, 9151 VFRange &Range, VPlanPtr &Plan) { 9152 // First, check for specific widening recipes that deal with calls, memory 9153 // operations, inductions and Phi nodes. 9154 if (auto *CI = dyn_cast<CallInst>(Instr)) 9155 return toVPRecipeResult(tryToWidenCall(CI, Operands, Range)); 9156 9157 if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr)) 9158 return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan)); 9159 9160 VPRecipeBase *Recipe; 9161 if (auto Phi = dyn_cast<PHINode>(Instr)) { 9162 if (Phi->getParent() != OrigLoop->getHeader()) 9163 return tryToBlend(Phi, Operands, Plan); 9164 if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands))) 9165 return toVPRecipeResult(Recipe); 9166 9167 VPWidenPHIRecipe *PhiRecipe = nullptr; 9168 if (Legal->isReductionVariable(Phi) || Legal->isFirstOrderRecurrence(Phi)) { 9169 VPValue *StartV = Operands[0]; 9170 if (Legal->isReductionVariable(Phi)) { 9171 RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi]; 9172 assert(RdxDesc.getRecurrenceStartValue() == 9173 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); 9174 PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV, 9175 CM.isInLoopReduction(Phi), 9176 CM.useOrderedReductions(RdxDesc)); 9177 } else { 9178 PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV); 9179 } 9180 9181 // Record the incoming value from the backedge, so we can add the incoming 9182 // value from the backedge after all recipes have been created. 9183 recordRecipeOf(cast<Instruction>( 9184 Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch()))); 9185 PhisToFix.push_back(PhiRecipe); 9186 } else { 9187 // TODO: record start and backedge value for remaining pointer induction 9188 // phis. 
9189 assert(Phi->getType()->isPointerTy() && 9190 "only pointer phis should be handled here"); 9191 PhiRecipe = new VPWidenPHIRecipe(Phi); 9192 } 9193 9194 return toVPRecipeResult(PhiRecipe); 9195 } 9196 9197 if (isa<TruncInst>(Instr) && 9198 (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Operands, 9199 Range, *Plan))) 9200 return toVPRecipeResult(Recipe); 9201 9202 if (!shouldWiden(Instr, Range)) 9203 return nullptr; 9204 9205 if (auto GEP = dyn_cast<GetElementPtrInst>(Instr)) 9206 return toVPRecipeResult(new VPWidenGEPRecipe( 9207 GEP, make_range(Operands.begin(), Operands.end()), OrigLoop)); 9208 9209 if (auto *SI = dyn_cast<SelectInst>(Instr)) { 9210 bool InvariantCond = 9211 PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop); 9212 return toVPRecipeResult(new VPWidenSelectRecipe( 9213 *SI, make_range(Operands.begin(), Operands.end()), InvariantCond)); 9214 } 9215 9216 return toVPRecipeResult(tryToWiden(Instr, Operands)); 9217 } 9218 9219 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, 9220 ElementCount MaxVF) { 9221 assert(OrigLoop->isInnermost() && "Inner loop expected."); 9222 9223 // Collect instructions from the original loop that will become trivially dead 9224 // in the vectorized loop. We don't need to vectorize these instructions. For 9225 // example, original induction update instructions can become dead because we 9226 // separately emit induction "steps" when generating code for the new loop. 9227 // Similarly, we create a new latch condition when setting up the structure 9228 // of the new loop, so the old one can become dead. 9229 SmallPtrSet<Instruction *, 4> DeadInstructions; 9230 collectTriviallyDeadInstructions(DeadInstructions); 9231 9232 // Add assume instructions we need to drop to DeadInstructions, to prevent 9233 // them from being added to the VPlan. 9234 // TODO: We only need to drop assumes in blocks that get flattened. If the 9235 // control flow is preserved, we should keep them. 9236 auto &ConditionalAssumes = Legal->getConditionalAssumes(); 9237 DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end()); 9238 9239 MapVector<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter(); 9240 // Dead instructions do not need sinking. Remove them from SinkAfter. 9241 for (Instruction *I : DeadInstructions) 9242 SinkAfter.erase(I); 9243 9244 // Cannot sink instructions after dead instructions (there won't be any 9245 // recipes for them). Instead, find the first non-dead previous instruction.
9246 for (auto &P : Legal->getSinkAfter()) { 9247 Instruction *SinkTarget = P.second; 9248 Instruction *FirstInst = &*SinkTarget->getParent()->begin(); 9249 (void)FirstInst; 9250 while (DeadInstructions.contains(SinkTarget)) { 9251 assert( 9252 SinkTarget != FirstInst && 9253 "Must find a live instruction (at least the one feeding the " 9254 "first-order recurrence PHI) before reaching beginning of the block"); 9255 SinkTarget = SinkTarget->getPrevNode(); 9256 assert(SinkTarget != P.first && 9257 "sink source equals target, no sinking required"); 9258 } 9259 P.second = SinkTarget; 9260 } 9261 9262 auto MaxVFPlusOne = MaxVF.getWithIncrement(1); 9263 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { 9264 VFRange SubRange = {VF, MaxVFPlusOne}; 9265 VPlans.push_back( 9266 buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter)); 9267 VF = SubRange.End; 9268 } 9269 } 9270 9271 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( 9272 VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions, 9273 const MapVector<Instruction *, Instruction *> &SinkAfter) { 9274 9275 SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups; 9276 9277 VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder); 9278 9279 // --------------------------------------------------------------------------- 9280 // Pre-construction: record ingredients whose recipes we'll need to further 9281 // process after constructing the initial VPlan. 9282 // --------------------------------------------------------------------------- 9283 9284 // Mark instructions we'll need to sink later and their targets as 9285 // ingredients whose recipe we'll need to record. 9286 for (auto &Entry : SinkAfter) { 9287 RecipeBuilder.recordRecipeOf(Entry.first); 9288 RecipeBuilder.recordRecipeOf(Entry.second); 9289 } 9290 for (auto &Reduction : CM.getInLoopReductionChains()) { 9291 PHINode *Phi = Reduction.first; 9292 RecurKind Kind = Legal->getReductionVars()[Phi].getRecurrenceKind(); 9293 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second; 9294 9295 RecipeBuilder.recordRecipeOf(Phi); 9296 for (auto &R : ReductionOperations) { 9297 RecipeBuilder.recordRecipeOf(R); 9298 // For min/max reductions, where we have a pair of icmp/select, we also 9299 // need to record the ICmp recipe, so it can be removed later. 9300 assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) && 9301 "Only min/max recurrences allowed for inloop reductions"); 9302 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) 9303 RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0))); 9304 } 9305 } 9306 9307 // For each interleave group which is relevant for this (possibly trimmed) 9308 // Range, add it to the set of groups to be later applied to the VPlan and add 9309 // placeholders for its members' Recipes which we'll be replacing with a 9310 // single VPInterleaveRecipe.
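// Illustrative example: the accesses A[2*i] and A[2*i+1] form an interleave
// group of factor 2. Instead of widening each member separately, the group
// is later emitted as a single wide access plus shuffles, which is what the
// single VPInterleaveRecipe models.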
9311 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) { 9312 auto applyIG = [IG, this](ElementCount VF) -> bool { 9313 return (VF.isVector() && // Query is illegal for VF == 1 9314 CM.getWideningDecision(IG->getInsertPos(), VF) == 9315 LoopVectorizationCostModel::CM_Interleave); 9316 }; 9317 if (!getDecisionAndClampRange(applyIG, Range)) 9318 continue; 9319 InterleaveGroups.insert(IG); 9320 for (unsigned i = 0; i < IG->getFactor(); i++) 9321 if (Instruction *Member = IG->getMember(i)) 9322 RecipeBuilder.recordRecipeOf(Member); 9323 }; 9324 9325 // --------------------------------------------------------------------------- 9326 // Build initial VPlan: Scan the body of the loop in a topological order to 9327 // visit each basic block after having visited its predecessor basic blocks. 9328 // --------------------------------------------------------------------------- 9329 9330 // Create a dummy pre-entry VPBasicBlock to start building the VPlan. 9331 auto Plan = std::make_unique<VPlan>(); 9332 9333 // Scan the body of the loop in a topological order to visit each basic block 9334 // after having visited its predecessor basic blocks. 9335 LoopBlocksDFS DFS(OrigLoop); 9336 DFS.perform(LI); 9337 9338 VPBasicBlock *VPBB = nullptr; 9339 VPBasicBlock *HeaderVPBB = nullptr; 9340 SmallVector<VPWidenIntOrFpInductionRecipe *> InductionsToMove; 9341 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 9342 // Relevant instructions from basic block BB will be grouped into VPRecipe 9343 // ingredients and fill a new VPBasicBlock. 9344 unsigned VPBBsForBB = 0; 9345 auto *FirstVPBBForBB = new VPBasicBlock(BB->getName()); 9346 if (VPBB) 9347 VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB); 9348 else { 9349 Plan->setEntry(FirstVPBBForBB); 9350 HeaderVPBB = FirstVPBBForBB; 9351 } 9352 VPBB = FirstVPBBForBB; 9353 Builder.setInsertPoint(VPBB); 9354 9355 // Introduce each ingredient into VPlan. 9356 // TODO: Model and preserve debug intrinsics in VPlan. 9357 for (Instruction &I : BB->instructionsWithoutDebug()) { 9358 Instruction *Instr = &I; 9359 9360 // First filter out irrelevant instructions, to ensure no recipes are 9361 // built for them. 9362 if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr)) 9363 continue; 9364 9365 SmallVector<VPValue *, 4> Operands; 9366 auto *Phi = dyn_cast<PHINode>(Instr); 9367 if (Phi && Phi->getParent() == OrigLoop->getHeader()) { 9368 Operands.push_back(Plan->getOrAddVPValue( 9369 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()))); 9370 } else { 9371 auto OpRange = Plan->mapToVPValues(Instr->operands()); 9372 Operands = {OpRange.begin(), OpRange.end()}; 9373 } 9374 if (auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe( 9375 Instr, Operands, Range, Plan)) { 9376 // If Instr can be simplified to an existing VPValue, use it. 9377 if (RecipeOrValue.is<VPValue *>()) { 9378 auto *VPV = RecipeOrValue.get<VPValue *>(); 9379 Plan->addVPValue(Instr, VPV); 9380 // If the re-used value is a recipe, register the recipe for the 9381 // instruction, in case the recipe for Instr needs to be recorded. 9382 if (auto *R = dyn_cast_or_null<VPRecipeBase>(VPV->getDef())) 9383 RecipeBuilder.setRecipe(Instr, R); 9384 continue; 9385 } 9386 // Otherwise, add the new recipe.
9387 VPRecipeBase *Recipe = RecipeOrValue.get<VPRecipeBase *>(); 9388 for (auto *Def : Recipe->definedValues()) { 9389 auto *UV = Def->getUnderlyingValue(); 9390 Plan->addVPValue(UV, Def); 9391 } 9392 9393 if (isa<VPWidenIntOrFpInductionRecipe>(Recipe) && 9394 HeaderVPBB->getFirstNonPhi() != VPBB->end()) { 9395 // Keep track of VPWidenIntOrFpInductionRecipes not in the phi section 9396 // of the header block. That can happen for truncates of induction 9397 // variables. Those recipes are moved to the phi section of the header 9398 // block after applying SinkAfter, which relies on the original 9399 // position of the trunc. 9400 assert(isa<TruncInst>(Instr)); 9401 InductionsToMove.push_back( 9402 cast<VPWidenIntOrFpInductionRecipe>(Recipe)); 9403 } 9404 RecipeBuilder.setRecipe(Instr, Recipe); 9405 VPBB->appendRecipe(Recipe); 9406 continue; 9407 } 9408 9409 // Otherwise, if all widening options failed, Instruction is to be 9410 // replicated. This may create a successor for VPBB. 9411 VPBasicBlock *NextVPBB = 9412 RecipeBuilder.handleReplication(Instr, Range, VPBB, Plan); 9413 if (NextVPBB != VPBB) { 9414 VPBB = NextVPBB; 9415 VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++) 9416 : ""); 9417 } 9418 } 9419 } 9420 9421 assert(isa<VPBasicBlock>(Plan->getEntry()) && 9422 !Plan->getEntry()->getEntryBasicBlock()->empty() && 9423 "entry block must be set to a non-empty VPBasicBlock"); 9424 RecipeBuilder.fixHeaderPhis(); 9425 9426 // --------------------------------------------------------------------------- 9427 // Transform initial VPlan: Apply previously taken decisions, in order, to 9428 // bring the VPlan to its final state. 9429 // --------------------------------------------------------------------------- 9430 9431 // Apply Sink-After legal constraints. 9432 auto GetReplicateRegion = [](VPRecipeBase *R) -> VPRegionBlock * { 9433 auto *Region = dyn_cast_or_null<VPRegionBlock>(R->getParent()->getParent()); 9434 if (Region && Region->isReplicator()) { 9435 assert(Region->getNumSuccessors() == 1 && 9436 Region->getNumPredecessors() == 1 && "Expected SESE region!"); 9437 assert(R->getParent()->size() == 1 && 9438 "A recipe in an original replicator region must be the only " 9439 "recipe in its block"); 9440 return Region; 9441 } 9442 return nullptr; 9443 }; 9444 for (auto &Entry : SinkAfter) { 9445 VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first); 9446 VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second); 9447 9448 auto *TargetRegion = GetReplicateRegion(Target); 9449 auto *SinkRegion = GetReplicateRegion(Sink); 9450 if (!SinkRegion) { 9451 // If the sink source is not a replicate region, sink the recipe directly. 9452 if (TargetRegion) { 9453 // The target is in a replication region, make sure to move Sink to 9454 // the block after it, not into the replication region itself. 9455 VPBasicBlock *NextBlock = 9456 cast<VPBasicBlock>(TargetRegion->getSuccessors().front()); 9457 Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi()); 9458 } else 9459 Sink->moveAfter(Target); 9460 continue; 9461 } 9462 9463 // The sink source is in a replicate region. Unhook the region from the CFG. 
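// That is, turn SinkPred -> SinkRegion -> SinkSucc into SinkPred -> SinkSucc
// here, and re-insert the region after the target below.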
9464 auto *SinkPred = SinkRegion->getSinglePredecessor(); 9465 auto *SinkSucc = SinkRegion->getSingleSuccessor(); 9466 VPBlockUtils::disconnectBlocks(SinkPred, SinkRegion); 9467 VPBlockUtils::disconnectBlocks(SinkRegion, SinkSucc); 9468 VPBlockUtils::connectBlocks(SinkPred, SinkSucc); 9469 9470 if (TargetRegion) { 9471 // The target recipe is also in a replicate region, move the sink region 9472 // after the target region. 9473 auto *TargetSucc = TargetRegion->getSingleSuccessor(); 9474 VPBlockUtils::disconnectBlocks(TargetRegion, TargetSucc); 9475 VPBlockUtils::connectBlocks(TargetRegion, SinkRegion); 9476 VPBlockUtils::connectBlocks(SinkRegion, TargetSucc); 9477 } else { 9478 // The sink source is in a replicate region, we need to move the whole 9479 // replicate region, which should only contain a single recipe in the 9480 // main block. 9481 auto *SplitBlock = 9482 Target->getParent()->splitAt(std::next(Target->getIterator())); 9483 9484 auto *SplitPred = SplitBlock->getSinglePredecessor(); 9485 9486 VPBlockUtils::disconnectBlocks(SplitPred, SplitBlock); 9487 VPBlockUtils::connectBlocks(SplitPred, SinkRegion); 9488 VPBlockUtils::connectBlocks(SinkRegion, SplitBlock); 9489 if (VPBB == SplitPred) 9490 VPBB = SplitBlock; 9491 } 9492 } 9493 9494 // Now that sink-after is done, move induction recipes for optimized truncates 9495 // to the phi section of the header block. 9496 for (VPWidenIntOrFpInductionRecipe *Ind : InductionsToMove) 9497 Ind->moveBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi()); 9498 9499 // Adjust the recipes for any inloop reductions. 9500 adjustRecipesForReductions(VPBB, Plan, RecipeBuilder, Range.Start); 9501 9502 // Introduce a recipe to combine the incoming and previous values of a 9503 // first-order recurrence. 9504 for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) { 9505 auto *RecurPhi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R); 9506 if (!RecurPhi) 9507 continue; 9508 9509 VPRecipeBase *PrevRecipe = RecurPhi->getBackedgeRecipe(); 9510 VPBasicBlock *InsertBlock = PrevRecipe->getParent(); 9511 auto *Region = GetReplicateRegion(PrevRecipe); 9512 if (Region) 9513 InsertBlock = cast<VPBasicBlock>(Region->getSingleSuccessor()); 9514 if (Region || PrevRecipe->isPhi()) 9515 Builder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi()); 9516 else 9517 Builder.setInsertPoint(InsertBlock, std::next(PrevRecipe->getIterator())); 9518 9519 auto *RecurSplice = cast<VPInstruction>( 9520 Builder.createNaryOp(VPInstruction::FirstOrderRecurrenceSplice, 9521 {RecurPhi, RecurPhi->getBackedgeValue()})); 9522 9523 RecurPhi->replaceAllUsesWith(RecurSplice); 9524 // Set the first operand of RecurSplice to RecurPhi again, after replacing 9525 // all users. 9526 RecurSplice->setOperand(0, RecurPhi); 9527 } 9528 9529 // Interleave memory: for each Interleave Group we marked earlier as relevant 9530 // for this VPlan, replace the Recipes widening its memory instructions with a 9531 // single VPInterleaveRecipe at its insertion point. 
9532 for (auto IG : InterleaveGroups) { 9533 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( 9534 RecipeBuilder.getRecipe(IG->getInsertPos())); 9535 SmallVector<VPValue *, 4> StoredValues; 9536 for (unsigned i = 0; i < IG->getFactor(); ++i) 9537 if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) { 9538 auto *StoreR = 9539 cast<VPWidenMemoryInstructionRecipe>(RecipeBuilder.getRecipe(SI)); 9540 StoredValues.push_back(StoreR->getStoredValue()); 9541 } 9542 9543 auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues, 9544 Recipe->getMask()); 9545 VPIG->insertBefore(Recipe); 9546 unsigned J = 0; 9547 for (unsigned i = 0; i < IG->getFactor(); ++i) 9548 if (Instruction *Member = IG->getMember(i)) { 9549 if (!Member->getType()->isVoidTy()) { 9550 VPValue *OriginalV = Plan->getVPValue(Member); 9551 Plan->removeVPValueFor(Member); 9552 Plan->addVPValue(Member, VPIG->getVPValue(J)); 9553 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J)); 9554 J++; 9555 } 9556 RecipeBuilder.getRecipe(Member)->eraseFromParent(); 9557 } 9558 } 9559 9560 // From this point onwards, VPlan-to-VPlan transformations may change the plan 9561 // in ways that accessing values using original IR values is incorrect. 9562 Plan->disableValue2VPValue(); 9563 9564 VPlanTransforms::sinkScalarOperands(*Plan); 9565 VPlanTransforms::mergeReplicateRegions(*Plan); 9566 9567 std::string PlanName; 9568 raw_string_ostream RSO(PlanName); 9569 ElementCount VF = Range.Start; 9570 Plan->addVF(VF); 9571 RSO << "Initial VPlan for VF={" << VF; 9572 for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) { 9573 Plan->addVF(VF); 9574 RSO << "," << VF; 9575 } 9576 RSO << "},UF>=1"; 9577 RSO.flush(); 9578 Plan->setName(PlanName); 9579 9580 assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid"); 9581 return Plan; 9582 } 9583 9584 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { 9585 // Outer loop handling: They may require CFG and instruction level 9586 // transformations before even evaluating whether vectorization is profitable. 9587 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 9588 // the vectorization pipeline. 9589 assert(!OrigLoop->isInnermost()); 9590 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 9591 9592 // Create new empty VPlan 9593 auto Plan = std::make_unique<VPlan>(); 9594 9595 // Build hierarchical CFG 9596 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan); 9597 HCFGBuilder.buildHierarchicalCFG(); 9598 9599 for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End); 9600 VF *= 2) 9601 Plan->addVF(VF); 9602 9603 if (EnableVPlanPredication) { 9604 VPlanPredicator VPP(*Plan); 9605 VPP.predicate(); 9606 9607 // Avoid running transformation to recipes until masked code generation in 9608 // VPlan-native path is in place. 9609 return Plan; 9610 } 9611 9612 SmallPtrSet<Instruction *, 1> DeadInstructions; 9613 VPlanTransforms::VPInstructionsToVPRecipes(OrigLoop, Plan, 9614 Legal->getInductionVars(), 9615 DeadInstructions, *PSE.getSE()); 9616 return Plan; 9617 } 9618 9619 // Adjust the recipes for reductions. For in-loop reductions the chain of 9620 // instructions leading from the loop exit instr to the phi need to be converted 9621 // to reductions, with one operand being vector and the other being the scalar 9622 // reduction chain. For other reductions, a select is introduced between the phi 9623 // and live-out recipes when folding the tail. 
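// Illustrative sketch for an in-loop integer add reduction:
//   %phi = phi i32 [ %start, %preheader ], [ %sum, %latch ]
//   %sum = add i32 %phi, %val
// The widened recipe for %sum is replaced by a VPReductionRecipe whose chain
// operand is the scalar running value and whose vector operand is the
// widened %val. For min/max reductions the select is replaced instead and
// its feeding compare recipe is erased.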
9624 void LoopVectorizationPlanner::adjustRecipesForReductions(
9625     VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder,
9626     ElementCount MinVF) {
9627   for (auto &Reduction : CM.getInLoopReductionChains()) {
9628     PHINode *Phi = Reduction.first;
9629     RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
9630     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
9631 
9632     if (MinVF.isScalar() && !CM.useOrderedReductions(RdxDesc))
9633       continue;
9634 
9635     // ReductionOperations are ordered top-down from the phi's use to the
9636     // LoopExitValue. We keep track of the previous item (the Chain) to tell
9637     // which of the two operands will remain scalar and which will be reduced.
9638     // For minmax the chain will be the select instructions.
9639     Instruction *Chain = Phi;
9640     for (Instruction *R : ReductionOperations) {
9641       VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
9642       RecurKind Kind = RdxDesc.getRecurrenceKind();
9643 
9644       VPValue *ChainOp = Plan->getVPValue(Chain);
9645       unsigned FirstOpId;
9646       assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
9647              "Only min/max recurrences allowed for inloop reductions");
9648       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9649         assert(isa<VPWidenSelectRecipe>(WidenRecipe) &&
9650                "Expected to replace a VPWidenSelectSC");
9651         FirstOpId = 1;
9652       } else {
9653         assert((MinVF.isScalar() || isa<VPWidenRecipe>(WidenRecipe)) &&
9654                "Expected to replace a VPWidenSC");
9655         FirstOpId = 0;
9656       }
9657       unsigned VecOpId =
9658           R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
9659       VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));
9660 
9661       auto *CondOp = CM.foldTailByMasking()
9662                          ? RecipeBuilder.createBlockInMask(R->getParent(), Plan)
9663                          : nullptr;
9664       VPReductionRecipe *RedRecipe = new VPReductionRecipe(
9665           &RdxDesc, R, ChainOp, VecOp, CondOp, TTI);
9666       WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe);
9667       Plan->removeVPValueFor(R);
9668       Plan->addVPValue(R, RedRecipe);
9669       WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator());
9670       WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe);
9671       WidenRecipe->eraseFromParent();
9672 
9673       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9674         VPRecipeBase *CompareRecipe =
9675             RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0)));
9676         assert(isa<VPWidenRecipe>(CompareRecipe) &&
9677                "Expected to replace a VPWidenSC");
9678         assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 &&
9679                "Expected no remaining users");
9680         CompareRecipe->eraseFromParent();
9681       }
9682       Chain = R;
9683     }
9684   }
9685 
9686   // If tail is folded by masking, introduce selects between the phi
9687   // and the live-out instruction of each reduction, at the end of the latch.
9688 if (CM.foldTailByMasking()) { 9689 for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) { 9690 VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R); 9691 if (!PhiR || PhiR->isInLoop()) 9692 continue; 9693 Builder.setInsertPoint(LatchVPBB); 9694 VPValue *Cond = 9695 RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan); 9696 VPValue *Red = PhiR->getBackedgeValue(); 9697 Builder.createNaryOp(Instruction::Select, {Cond, Red, PhiR}); 9698 } 9699 } 9700 } 9701 9702 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 9703 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, 9704 VPSlotTracker &SlotTracker) const { 9705 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; 9706 IG->getInsertPos()->printAsOperand(O, false); 9707 O << ", "; 9708 getAddr()->printAsOperand(O, SlotTracker); 9709 VPValue *Mask = getMask(); 9710 if (Mask) { 9711 O << ", "; 9712 Mask->printAsOperand(O, SlotTracker); 9713 } 9714 9715 unsigned OpIdx = 0; 9716 for (unsigned i = 0; i < IG->getFactor(); ++i) { 9717 if (!IG->getMember(i)) 9718 continue; 9719 if (getNumStoreOperands() > 0) { 9720 O << "\n" << Indent << " store "; 9721 getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker); 9722 O << " to index " << i; 9723 } else { 9724 O << "\n" << Indent << " "; 9725 getVPValue(OpIdx)->printAsOperand(O, SlotTracker); 9726 O << " = load from index " << i; 9727 } 9728 ++OpIdx; 9729 } 9730 } 9731 #endif 9732 9733 void VPWidenCallRecipe::execute(VPTransformState &State) { 9734 State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this, 9735 *this, State); 9736 } 9737 9738 void VPWidenSelectRecipe::execute(VPTransformState &State) { 9739 State.ILV->widenSelectInstruction(*cast<SelectInst>(getUnderlyingInstr()), 9740 this, *this, InvariantCond, State); 9741 } 9742 9743 void VPWidenRecipe::execute(VPTransformState &State) { 9744 State.ILV->widenInstruction(*getUnderlyingInstr(), this, *this, State); 9745 } 9746 9747 void VPWidenGEPRecipe::execute(VPTransformState &State) { 9748 State.ILV->widenGEP(cast<GetElementPtrInst>(getUnderlyingInstr()), this, 9749 *this, State.UF, State.VF, IsPtrLoopInvariant, 9750 IsIndexLoopInvariant, State); 9751 } 9752 9753 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { 9754 assert(!State.Instance && "Int or FP induction being replicated."); 9755 State.ILV->widenIntOrFpInduction(IV, getStartValue()->getLiveInIRValue(), 9756 getTruncInst(), getVPValue(0), 9757 getCastValue(), State); 9758 } 9759 9760 void VPWidenPHIRecipe::execute(VPTransformState &State) { 9761 State.ILV->widenPHIInstruction(cast<PHINode>(getUnderlyingValue()), this, 9762 State); 9763 } 9764 9765 void VPBlendRecipe::execute(VPTransformState &State) { 9766 State.ILV->setDebugLocFromInst(Phi, &State.Builder); 9767 // We know that all PHIs in non-header blocks are converted into 9768 // selects, so we don't have to worry about the insertion order and we 9769 // can just use the builder. 9770 // At this point we generate the predication tree. There may be 9771 // duplications since this is a simple recursive scan, but future 9772 // optimizations will clean it up. 9773 9774 unsigned NumIncoming = getNumIncomingValues(); 9775 9776 // Generate a sequence of selects of the form: 9777 // SELECT(Mask3, In3, 9778 // SELECT(Mask2, In2, 9779 // SELECT(Mask1, In1, 9780 // In0))) 9781 // Note that Mask0 is never used: lanes for which no path reaches this phi and 9782 // are essentially undef are taken from In0. 
9783 InnerLoopVectorizer::VectorParts Entry(State.UF); 9784 for (unsigned In = 0; In < NumIncoming; ++In) { 9785 for (unsigned Part = 0; Part < State.UF; ++Part) { 9786 // We might have single edge PHIs (blocks) - use an identity 9787 // 'select' for the first PHI operand. 9788 Value *In0 = State.get(getIncomingValue(In), Part); 9789 if (In == 0) 9790 Entry[Part] = In0; // Initialize with the first incoming value. 9791 else { 9792 // Select between the current value and the previous incoming edge 9793 // based on the incoming mask. 9794 Value *Cond = State.get(getMask(In), Part); 9795 Entry[Part] = 9796 State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi"); 9797 } 9798 } 9799 } 9800 for (unsigned Part = 0; Part < State.UF; ++Part) 9801 State.set(this, Entry[Part], Part); 9802 } 9803 9804 void VPInterleaveRecipe::execute(VPTransformState &State) { 9805 assert(!State.Instance && "Interleave group being replicated."); 9806 State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(), 9807 getStoredValues(), getMask()); 9808 } 9809 9810 void VPReductionRecipe::execute(VPTransformState &State) { 9811 assert(!State.Instance && "Reduction being replicated."); 9812 Value *PrevInChain = State.get(getChainOp(), 0); 9813 RecurKind Kind = RdxDesc->getRecurrenceKind(); 9814 bool IsOrdered = State.ILV->useOrderedReductions(*RdxDesc); 9815 // Propagate the fast-math flags carried by the underlying instruction. 9816 IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder); 9817 State.Builder.setFastMathFlags(RdxDesc->getFastMathFlags()); 9818 for (unsigned Part = 0; Part < State.UF; ++Part) { 9819 Value *NewVecOp = State.get(getVecOp(), Part); 9820 if (VPValue *Cond = getCondOp()) { 9821 Value *NewCond = State.get(Cond, Part); 9822 VectorType *VecTy = cast<VectorType>(NewVecOp->getType()); 9823 Value *Iden = RdxDesc->getRecurrenceIdentity( 9824 Kind, VecTy->getElementType(), RdxDesc->getFastMathFlags()); 9825 Value *IdenVec = 9826 State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden); 9827 Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec); 9828 NewVecOp = Select; 9829 } 9830 Value *NewRed; 9831 Value *NextInChain; 9832 if (IsOrdered) { 9833 if (State.VF.isVector()) 9834 NewRed = createOrderedReduction(State.Builder, *RdxDesc, NewVecOp, 9835 PrevInChain); 9836 else 9837 NewRed = State.Builder.CreateBinOp( 9838 (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), PrevInChain, 9839 NewVecOp); 9840 PrevInChain = NewRed; 9841 } else { 9842 PrevInChain = State.get(getChainOp(), Part); 9843 NewRed = createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp); 9844 } 9845 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9846 NextInChain = 9847 createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(), 9848 NewRed, PrevInChain); 9849 } else if (IsOrdered) 9850 NextInChain = NewRed; 9851 else 9852 NextInChain = State.Builder.CreateBinOp( 9853 (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), NewRed, 9854 PrevInChain); 9855 State.set(this, NextInChain, Part); 9856 } 9857 } 9858 9859 void VPReplicateRecipe::execute(VPTransformState &State) { 9860 if (State.Instance) { // Generate a single instance. 9861 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector"); 9862 State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *this, 9863 *State.Instance, IsPredicated, State); 9864 // Insert scalar instance packing it into a vector. 9865 if (AlsoPack && State.VF.isVector()) { 9866 // If we're constructing lane 0, initialize to start from poison. 
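      // (Packing sketch, illustrative: with VF=4 the per-lane scalar
      // instances %s0..%s3 end up in a chain of insertelements,
      //   %v.0 = insertelement <4 x ty> poison, %s0, i32 0
      //   %v.1 = insertelement %v.0, %s1, i32 1
      //   ...
      // and the initial poison vector is created exactly once, on lane 0.)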
9867       if (State.Instance->Lane.isFirstLane()) {
9868         assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
9869         Value *Poison = PoisonValue::get(
9870             VectorType::get(getUnderlyingValue()->getType(), State.VF));
9871         State.set(this, Poison, State.Instance->Part);
9872       }
9873       State.ILV->packScalarIntoVectorValue(this, *State.Instance, State);
9874     }
9875     return;
9876   }
9877 
9878   // Generate scalar instances for all VF lanes of all UF parts, unless the
9879   // instruction is uniform, in which case generate only the first lane for
9880   // each of the UF parts.
9881   unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();
9882   assert((!State.VF.isScalable() || IsUniform) &&
9883          "Can't scalarize a scalable vector");
9884   for (unsigned Part = 0; Part < State.UF; ++Part)
9885     for (unsigned Lane = 0; Lane < EndLane; ++Lane)
9886       State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *this,
9887                                       VPIteration(Part, Lane), IsPredicated,
9888                                       State);
9889 }
9890 
9891 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
9892   assert(State.Instance && "Branch on Mask works only on single instance.");
9893 
9894   unsigned Part = State.Instance->Part;
9895   unsigned Lane = State.Instance->Lane.getKnownLane();
9896 
9897   Value *ConditionBit = nullptr;
9898   VPValue *BlockInMask = getMask();
9899   if (BlockInMask) {
9900     ConditionBit = State.get(BlockInMask, Part);
9901     if (ConditionBit->getType()->isVectorTy())
9902       ConditionBit = State.Builder.CreateExtractElement(
9903           ConditionBit, State.Builder.getInt32(Lane));
9904   } else // Block in mask is all-one.
9905     ConditionBit = State.Builder.getTrue();
9906 
9907   // Replace the temporary unreachable terminator with a new conditional branch,
9908   // whose two destinations will be set later when they are created.
9909   auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
9910   assert(isa<UnreachableInst>(CurrentTerminator) &&
9911          "Expected to replace unreachable terminator with conditional branch.");
9912   auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
9913   CondBr->setSuccessor(0, nullptr);
9914   ReplaceInstWithInst(CurrentTerminator, CondBr);
9915 }
9916 
9917 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
9918   assert(State.Instance && "Predicated instruction PHI works per instance.");
9919   Instruction *ScalarPredInst =
9920       cast<Instruction>(State.get(getOperand(0), *State.Instance));
9921   BasicBlock *PredicatedBB = ScalarPredInst->getParent();
9922   BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
9923   assert(PredicatingBB && "Predicated block has no single predecessor.");
9924   assert(isa<VPReplicateRecipe>(getOperand(0)) &&
9925          "operand must be VPReplicateRecipe");
9926 
9927   // By current pack/unpack logic we need to generate only a single phi node: if
9928   // a vector value for the predicated instruction exists at this point it means
9929   // the instruction has vector users only, and a phi for the vector value is
9930   // needed. In this case the recipe of the predicated instruction is marked to
9931   // also do that packing, thereby "hoisting" the insert-element sequence.
9932   // Otherwise, a phi node for the scalar value is needed.
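  // A sketch of the two shapes, with invented names:
  //   vector case: %vphi = phi [ %vec.before.insert, %predicating.bb ],
  //                            [ %vec.with.insert,   %predicated.bb ]
  //   scalar case: %phi  = phi [ poison,             %predicating.bb ],
  //                            [ %scalar.pred.inst,  %predicated.bb ]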
9933 unsigned Part = State.Instance->Part; 9934 if (State.hasVectorValue(getOperand(0), Part)) { 9935 Value *VectorValue = State.get(getOperand(0), Part); 9936 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); 9937 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); 9938 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. 9939 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. 9940 if (State.hasVectorValue(this, Part)) 9941 State.reset(this, VPhi, Part); 9942 else 9943 State.set(this, VPhi, Part); 9944 // NOTE: Currently we need to update the value of the operand, so the next 9945 // predicated iteration inserts its generated value in the correct vector. 9946 State.reset(getOperand(0), VPhi, Part); 9947 } else { 9948 Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType(); 9949 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); 9950 Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()), 9951 PredicatingBB); 9952 Phi->addIncoming(ScalarPredInst, PredicatedBB); 9953 if (State.hasScalarValue(this, *State.Instance)) 9954 State.reset(this, Phi, *State.Instance); 9955 else 9956 State.set(this, Phi, *State.Instance); 9957 // NOTE: Currently we need to update the value of the operand, so the next 9958 // predicated iteration inserts its generated value in the correct vector. 9959 State.reset(getOperand(0), Phi, *State.Instance); 9960 } 9961 } 9962 9963 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { 9964 VPValue *StoredValue = isStore() ? getStoredValue() : nullptr; 9965 State.ILV->vectorizeMemoryInstruction( 9966 &Ingredient, State, StoredValue ? nullptr : getVPSingleValue(), getAddr(), 9967 StoredValue, getMask(), Consecutive, Reverse); 9968 } 9969 9970 // Determine how to lower the scalar epilogue, which depends on 1) optimising 9971 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing 9972 // predication, and 4) a TTI hook that analyses whether the loop is suitable 9973 // for predication. 9974 static ScalarEpilogueLowering getScalarEpilogueLowering( 9975 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, 9976 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, 9977 AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, 9978 LoopVectorizationLegality &LVL) { 9979 // 1) OptSize takes precedence over all other options, i.e. if this is set, 9980 // don't look at hints or options, and don't request a scalar epilogue. 9981 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from 9982 // LoopAccessInfo (due to code dependency and not being able to reliably get 9983 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection 9984 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without 9985 // versioning when the vectorization is forced, unlike hasOptSize. So revert 9986 // back to the old way and vectorize with versioning when forced. See D81345.) 
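  // For example, a function carrying the optsize attribute is answered with
  // CM_ScalarEpilogueNotAllowedOptSize right away, before any predication
  // option, hint or TTI preference is consulted; the checks below run in the
  // numbered order above.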
9987 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, 9988 PGSOQueryType::IRPass) && 9989 Hints.getForce() != LoopVectorizeHints::FK_Enabled)) 9990 return CM_ScalarEpilogueNotAllowedOptSize; 9991 9992 // 2) If set, obey the directives 9993 if (PreferPredicateOverEpilogue.getNumOccurrences()) { 9994 switch (PreferPredicateOverEpilogue) { 9995 case PreferPredicateTy::ScalarEpilogue: 9996 return CM_ScalarEpilogueAllowed; 9997 case PreferPredicateTy::PredicateElseScalarEpilogue: 9998 return CM_ScalarEpilogueNotNeededUsePredicate; 9999 case PreferPredicateTy::PredicateOrDontVectorize: 10000 return CM_ScalarEpilogueNotAllowedUsePredicate; 10001 }; 10002 } 10003 10004 // 3) If set, obey the hints 10005 switch (Hints.getPredicate()) { 10006 case LoopVectorizeHints::FK_Enabled: 10007 return CM_ScalarEpilogueNotNeededUsePredicate; 10008 case LoopVectorizeHints::FK_Disabled: 10009 return CM_ScalarEpilogueAllowed; 10010 }; 10011 10012 // 4) if the TTI hook indicates this is profitable, request predication. 10013 if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, 10014 LVL.getLAI())) 10015 return CM_ScalarEpilogueNotNeededUsePredicate; 10016 10017 return CM_ScalarEpilogueAllowed; 10018 } 10019 10020 Value *VPTransformState::get(VPValue *Def, unsigned Part) { 10021 // If Values have been set for this Def return the one relevant for \p Part. 10022 if (hasVectorValue(Def, Part)) 10023 return Data.PerPartOutput[Def][Part]; 10024 10025 if (!hasScalarValue(Def, {Part, 0})) { 10026 Value *IRV = Def->getLiveInIRValue(); 10027 Value *B = ILV->getBroadcastInstrs(IRV); 10028 set(Def, B, Part); 10029 return B; 10030 } 10031 10032 Value *ScalarValue = get(Def, {Part, 0}); 10033 // If we aren't vectorizing, we can just copy the scalar map values over 10034 // to the vector map. 10035 if (VF.isScalar()) { 10036 set(Def, ScalarValue, Part); 10037 return ScalarValue; 10038 } 10039 10040 auto *RepR = dyn_cast<VPReplicateRecipe>(Def); 10041 bool IsUniform = RepR && RepR->isUniform(); 10042 10043 unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1; 10044 // Check if there is a scalar value for the selected lane. 10045 if (!hasScalarValue(Def, {Part, LastLane})) { 10046 // At the moment, VPWidenIntOrFpInductionRecipes can also be uniform. 10047 assert(isa<VPWidenIntOrFpInductionRecipe>(Def->getDef()) && 10048 "unexpected recipe found to be invariant"); 10049 IsUniform = true; 10050 LastLane = 0; 10051 } 10052 10053 auto *LastInst = cast<Instruction>(get(Def, {Part, LastLane})); 10054 // Set the insert point after the last scalarized instruction or after the 10055 // last PHI, if LastInst is a PHI. This ensures the insertelement sequence 10056 // will directly follow the scalar definitions. 10057 auto OldIP = Builder.saveIP(); 10058 auto NewIP = 10059 isa<PHINode>(LastInst) 10060 ? BasicBlock::iterator(LastInst->getParent()->getFirstNonPHI()) 10061 : std::next(BasicBlock::iterator(LastInst)); 10062 Builder.SetInsertPoint(&*NewIP); 10063 10064 // However, if we are vectorizing, we need to construct the vector values. 10065 // If the value is known to be uniform after vectorization, we can just 10066 // broadcast the scalar value corresponding to lane zero for each unroll 10067 // iteration. Otherwise, we construct the vector values using 10068 // insertelement instructions. Since the resulting vectors are stored in 10069 // State, we will only generate the insertelements once. 
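  // Illustrative shapes of the two cases (names invented): a uniform value is
  // splatted once per part, roughly
  //   %broadcast = shufflevector (insertelement poison, %scalar, 0),
  //                              poison, zeroinitializer
  // whereas a non-uniform value is rebuilt lane by lane with the same
  // insertelement packing used by packScalarIntoVectorValue().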
10070 Value *VectorValue = nullptr; 10071 if (IsUniform) { 10072 VectorValue = ILV->getBroadcastInstrs(ScalarValue); 10073 set(Def, VectorValue, Part); 10074 } else { 10075 // Initialize packing with insertelements to start from undef. 10076 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 10077 Value *Undef = PoisonValue::get(VectorType::get(LastInst->getType(), VF)); 10078 set(Def, Undef, Part); 10079 for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane) 10080 ILV->packScalarIntoVectorValue(Def, {Part, Lane}, *this); 10081 VectorValue = get(Def, Part); 10082 } 10083 Builder.restoreIP(OldIP); 10084 return VectorValue; 10085 } 10086 10087 // Process the loop in the VPlan-native vectorization path. This path builds 10088 // VPlan upfront in the vectorization pipeline, which allows to apply 10089 // VPlan-to-VPlan transformations from the very beginning without modifying the 10090 // input LLVM IR. 10091 static bool processLoopInVPlanNativePath( 10092 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, 10093 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, 10094 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, 10095 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, 10096 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints, 10097 LoopVectorizationRequirements &Requirements) { 10098 10099 if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) { 10100 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n"); 10101 return false; 10102 } 10103 assert(EnableVPlanNativePath && "VPlan-native path is disabled."); 10104 Function *F = L->getHeader()->getParent(); 10105 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); 10106 10107 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 10108 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL); 10109 10110 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, 10111 &Hints, IAI); 10112 // Use the planner for outer loop vectorization. 10113 // TODO: CM is not used at this point inside the planner. Turn CM into an 10114 // optional argument if we don't need it in the future. 10115 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints, 10116 Requirements, ORE); 10117 10118 // Get user vectorization factor. 10119 ElementCount UserVF = Hints.getWidth(); 10120 10121 CM.collectElementTypesForWidening(); 10122 10123 // Plan how to best vectorize, return the best VF and its cost. 10124 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF); 10125 10126 // If we are stress testing VPlan builds, do not attempt to generate vector 10127 // code. Masked vector code generation support will follow soon. 10128 // Also, do not attempt to vectorize if no vector code will be produced. 10129 if (VPlanBuildStressTest || EnableVPlanPredication || 10130 VectorizationFactor::Disabled() == VF) 10131 return false; 10132 10133 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); 10134 10135 { 10136 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, 10137 F->getParent()->getDataLayout()); 10138 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL, 10139 &CM, BFI, PSI, Checks); 10140 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" 10141 << L->getHeader()->getParent()->getName() << "\"\n"); 10142 LVP.executePlan(VF.Width, 1, BestPlan, LB, DT); 10143 } 10144 10145 // Mark the loop as already vectorized to avoid vectorizing again. 
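  // (setAlreadyVectorized() is expected to tag the loop with metadata along
  // the lines of !{"llvm.loop.isvectorized", i32 1}, so later runs of the
  // vectorizer skip this loop.)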
10146 Hints.setAlreadyVectorized(); 10147 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 10148 return true; 10149 } 10150 10151 // Emit a remark if there are stores to floats that required a floating point 10152 // extension. If the vectorized loop was generated with floating point there 10153 // will be a performance penalty from the conversion overhead and the change in 10154 // the vector width. 10155 static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) { 10156 SmallVector<Instruction *, 4> Worklist; 10157 for (BasicBlock *BB : L->getBlocks()) { 10158 for (Instruction &Inst : *BB) { 10159 if (auto *S = dyn_cast<StoreInst>(&Inst)) { 10160 if (S->getValueOperand()->getType()->isFloatTy()) 10161 Worklist.push_back(S); 10162 } 10163 } 10164 } 10165 10166 // Traverse the floating point stores upwards searching, for floating point 10167 // conversions. 10168 SmallPtrSet<const Instruction *, 4> Visited; 10169 SmallPtrSet<const Instruction *, 4> EmittedRemark; 10170 while (!Worklist.empty()) { 10171 auto *I = Worklist.pop_back_val(); 10172 if (!L->contains(I)) 10173 continue; 10174 if (!Visited.insert(I).second) 10175 continue; 10176 10177 // Emit a remark if the floating point store required a floating 10178 // point conversion. 10179 // TODO: More work could be done to identify the root cause such as a 10180 // constant or a function return type and point the user to it. 10181 if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second) 10182 ORE->emit([&]() { 10183 return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision", 10184 I->getDebugLoc(), L->getHeader()) 10185 << "floating point conversion changes vector width. " 10186 << "Mixed floating point precision requires an up/down " 10187 << "cast that will negatively impact performance."; 10188 }); 10189 10190 for (Use &Op : I->operands()) 10191 if (auto *OpI = dyn_cast<Instruction>(Op)) 10192 Worklist.push_back(OpI); 10193 } 10194 } 10195 10196 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts) 10197 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced || 10198 !EnableLoopInterleaving), 10199 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced || 10200 !EnableLoopVectorization) {} 10201 10202 bool LoopVectorizePass::processLoop(Loop *L) { 10203 assert((EnableVPlanNativePath || L->isInnermost()) && 10204 "VPlan-native path is not enabled. Only process inner loops."); 10205 10206 #ifndef NDEBUG 10207 const std::string DebugLocStr = getDebugLocString(L); 10208 #endif /* NDEBUG */ 10209 10210 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \"" 10211 << L->getHeader()->getParent()->getName() << "\" from " 10212 << DebugLocStr << "\n"); 10213 10214 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE); 10215 10216 LLVM_DEBUG( 10217 dbgs() << "LV: Loop hints:" 10218 << " force=" 10219 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled 10220 ? "disabled" 10221 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled 10222 ? "enabled" 10223 : "?")) 10224 << " width=" << Hints.getWidth() 10225 << " interleave=" << Hints.getInterleave() << "\n"); 10226 10227 // Function containing loop 10228 Function *F = L->getHeader()->getParent(); 10229 10230 // Looking at the diagnostic output is the only way to determine if a loop 10231 // was vectorized (other than looking at the IR or machine code), so it 10232 // is important to generate an optimization remark for each loop. Most of 10233 // these messages are generated as OptimizationRemarkAnalysis. 
Remarks 10234 // generated as OptimizationRemark and OptimizationRemarkMissed are 10235 // less verbose reporting vectorized loops and unvectorized loops that may 10236 // benefit from vectorization, respectively. 10237 10238 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) { 10239 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); 10240 return false; 10241 } 10242 10243 PredicatedScalarEvolution PSE(*SE, *L); 10244 10245 // Check if it is legal to vectorize the loop. 10246 LoopVectorizationRequirements Requirements; 10247 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE, 10248 &Requirements, &Hints, DB, AC, BFI, PSI); 10249 if (!LVL.canVectorize(EnableVPlanNativePath)) { 10250 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); 10251 Hints.emitRemarkWithHints(); 10252 return false; 10253 } 10254 10255 // Check the function attributes and profiles to find out if this function 10256 // should be optimized for size. 10257 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 10258 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL); 10259 10260 // Entrance to the VPlan-native vectorization path. Outer loops are processed 10261 // here. They may require CFG and instruction level transformations before 10262 // even evaluating whether vectorization is profitable. Since we cannot modify 10263 // the incoming IR, we need to build VPlan upfront in the vectorization 10264 // pipeline. 10265 if (!L->isInnermost()) 10266 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, 10267 ORE, BFI, PSI, Hints, Requirements); 10268 10269 assert(L->isInnermost() && "Inner loop expected."); 10270 10271 // Check the loop for a trip count threshold: vectorize loops with a tiny trip 10272 // count by optimizing for size, to minimize overheads. 10273 auto ExpectedTC = getSmallBestKnownTC(*SE, L); 10274 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) { 10275 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " 10276 << "This loop is worth vectorizing only if no scalar " 10277 << "iteration overheads are incurred."); 10278 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) 10279 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); 10280 else { 10281 LLVM_DEBUG(dbgs() << "\n"); 10282 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; 10283 } 10284 } 10285 10286 // Check the function attributes to see if implicit floats are allowed. 10287 // FIXME: This check doesn't seem possibly correct -- what if the loop is 10288 // an integer loop and the vector instructions selected are purely integer 10289 // vector instructions? 10290 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { 10291 reportVectorizationFailure( 10292 "Can't vectorize when the NoImplicitFloat attribute is used", 10293 "loop not vectorized due to NoImplicitFloat attribute", 10294 "NoImplicitFloat", ORE, L); 10295 Hints.emitRemarkWithHints(); 10296 return false; 10297 } 10298 10299 // Check if the target supports potentially unsafe FP vectorization. 10300 // FIXME: Add a check for the type of safety issue (denormal, signaling) 10301 // for the target we're vectorizing for, to make sure none of the 10302 // additional fp-math flags can help. 
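  // (A typical instance, purely illustrative: a target whose vector FP unit
  // flushes denormals or treats signaling NaNs differently from its scalar
  // unit; the authoritative answer is whatever the TTI query below reports.)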
10303 if (Hints.isPotentiallyUnsafe() && 10304 TTI->isFPVectorizationPotentiallyUnsafe()) { 10305 reportVectorizationFailure( 10306 "Potentially unsafe FP op prevents vectorization", 10307 "loop not vectorized due to unsafe FP support.", 10308 "UnsafeFP", ORE, L); 10309 Hints.emitRemarkWithHints(); 10310 return false; 10311 } 10312 10313 bool AllowOrderedReductions; 10314 // If the flag is set, use that instead and override the TTI behaviour. 10315 if (ForceOrderedReductions.getNumOccurrences() > 0) 10316 AllowOrderedReductions = ForceOrderedReductions; 10317 else 10318 AllowOrderedReductions = TTI->enableOrderedReductions(); 10319 if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) { 10320 ORE->emit([&]() { 10321 auto *ExactFPMathInst = Requirements.getExactFPInst(); 10322 return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps", 10323 ExactFPMathInst->getDebugLoc(), 10324 ExactFPMathInst->getParent()) 10325 << "loop not vectorized: cannot prove it is safe to reorder " 10326 "floating-point operations"; 10327 }); 10328 LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to " 10329 "reorder floating-point operations\n"); 10330 Hints.emitRemarkWithHints(); 10331 return false; 10332 } 10333 10334 bool UseInterleaved = TTI->enableInterleavedAccessVectorization(); 10335 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI()); 10336 10337 // If an override option has been passed in for interleaved accesses, use it. 10338 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0) 10339 UseInterleaved = EnableInterleavedMemAccesses; 10340 10341 // Analyze interleaved memory accesses. 10342 if (UseInterleaved) { 10343 IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI)); 10344 } 10345 10346 // Use the cost model. 10347 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, 10348 F, &Hints, IAI); 10349 CM.collectValuesToIgnore(); 10350 CM.collectElementTypesForWidening(); 10351 10352 // Use the planner for vectorization. 10353 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints, 10354 Requirements, ORE); 10355 10356 // Get user vectorization factor and interleave count. 10357 ElementCount UserVF = Hints.getWidth(); 10358 unsigned UserIC = Hints.getInterleave(); 10359 10360 // Plan how to best vectorize, return the best VF and its cost. 10361 Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC); 10362 10363 VectorizationFactor VF = VectorizationFactor::Disabled(); 10364 unsigned IC = 1; 10365 10366 if (MaybeVF) { 10367 VF = *MaybeVF; 10368 // Select the interleave count. 10369 IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue()); 10370 } 10371 10372 // Identify the diagnostic messages that should be produced. 10373 std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg; 10374 bool VectorizeLoop = true, InterleaveLoop = true; 10375 if (VF.Width.isScalar()) { 10376 LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n"); 10377 VecDiagMsg = std::make_pair( 10378 "VectorizationNotBeneficial", 10379 "the cost-model indicates that vectorization is not beneficial"); 10380 VectorizeLoop = false; 10381 } 10382 10383 if (!MaybeVF && UserIC > 1) { 10384 // Tell the user interleaving was avoided up-front, despite being explicitly 10385 // requested. 
10386     LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
10387                          "interleaving should be avoided up front\n");
10388     IntDiagMsg = std::make_pair(
10389         "InterleavingAvoided",
10390         "Ignoring UserIC, because interleaving was avoided up front");
10391     InterleaveLoop = false;
10392   } else if (IC == 1 && UserIC <= 1) {
10393     // Tell the user interleaving is not beneficial.
10394     LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
10395     IntDiagMsg = std::make_pair(
10396         "InterleavingNotBeneficial",
10397         "the cost-model indicates that interleaving is not beneficial");
10398     InterleaveLoop = false;
10399     if (UserIC == 1) {
10400       IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
10401       IntDiagMsg.second +=
10402           " and is explicitly disabled or interleave count is set to 1";
10403     }
10404   } else if (IC > 1 && UserIC == 1) {
10405     // Tell the user interleaving is beneficial, but it is explicitly disabled.
10406     LLVM_DEBUG(
10407         dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
10408     IntDiagMsg = std::make_pair(
10409         "InterleavingBeneficialButDisabled",
10410         "the cost-model indicates that interleaving is beneficial "
10411         "but is explicitly disabled or interleave count is set to 1");
10412     InterleaveLoop = false;
10413   }
10414 
10415   // Override IC if user provided an interleave count.
10416   IC = UserIC > 0 ? UserIC : IC;
10417 
10418   // Emit diagnostic messages, if any.
10419   const char *VAPassName = Hints.vectorizeAnalysisPassName();
10420   if (!VectorizeLoop && !InterleaveLoop) {
10421     // Do not vectorize or interleave the loop.
10422     ORE->emit([&]() {
10423       return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
10424                                       L->getStartLoc(), L->getHeader())
10425              << VecDiagMsg.second;
10426     });
10427     ORE->emit([&]() {
10428       return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
10429                                       L->getStartLoc(), L->getHeader())
10430              << IntDiagMsg.second;
10431     });
10432     return false;
10433   } else if (!VectorizeLoop && InterleaveLoop) {
10434     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10435     ORE->emit([&]() {
10436       return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
10437                                         L->getStartLoc(), L->getHeader())
10438              << VecDiagMsg.second;
10439     });
10440   } else if (VectorizeLoop && !InterleaveLoop) {
10441     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10442                       << ") in " << DebugLocStr << '\n');
10443     ORE->emit([&]() {
10444       return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
10445                                         L->getStartLoc(), L->getHeader())
10446              << IntDiagMsg.second;
10447     });
10448   } else if (VectorizeLoop && InterleaveLoop) {
10449     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10450                       << ") in " << DebugLocStr << '\n');
10451     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10452   }
10453 
10454   bool DisableRuntimeUnroll = false;
10455   MDNode *OrigLoopID = L->getLoopID();
10456   {
10457     // Optimistically generate runtime checks. Drop them if they turn out not
10458     // to be profitable. Limit the scope of Checks, so the cleanup happens
10459     // immediately after vector code generation is done.
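    // (Sketch of what these checks amount to, with invented names: the SCEV
    // predicate checks from PSE plus pairwise memory overlap checks roughly of
    // the form
    //   if (%a.end <= %b.start || %b.end <= %a.start) br %vector.ph
    //   else br %scalar.ph
    // for pointer ranges the analysis could not disambiguate statically.)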
10460 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, 10461 F->getParent()->getDataLayout()); 10462 if (!VF.Width.isScalar() || IC > 1) 10463 Checks.Create(L, *LVL.getLAI(), PSE.getUnionPredicate()); 10464 10465 using namespace ore; 10466 if (!VectorizeLoop) { 10467 assert(IC > 1 && "interleave count should not be 1 or 0"); 10468 // If we decided that it is not legal to vectorize the loop, then 10469 // interleave it. 10470 InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, 10471 &CM, BFI, PSI, Checks); 10472 10473 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); 10474 LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT); 10475 10476 ORE->emit([&]() { 10477 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(), 10478 L->getHeader()) 10479 << "interleaved loop (interleaved count: " 10480 << NV("InterleaveCount", IC) << ")"; 10481 }); 10482 } else { 10483 // If we decided that it is *legal* to vectorize the loop, then do it. 10484 10485 // Consider vectorizing the epilogue too if it's profitable. 10486 VectorizationFactor EpilogueVF = 10487 CM.selectEpilogueVectorizationFactor(VF.Width, LVP); 10488 if (EpilogueVF.Width.isVector()) { 10489 10490 // The first pass vectorizes the main loop and creates a scalar epilogue 10491 // to be vectorized by executing the plan (potentially with a different 10492 // factor) again shortly afterwards. 10493 EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1); 10494 EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, 10495 EPI, &LVL, &CM, BFI, PSI, Checks); 10496 10497 VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF); 10498 LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV, 10499 DT); 10500 ++LoopsVectorized; 10501 10502 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */); 10503 formLCSSARecursively(*L, *DT, LI, SE); 10504 10505 // Second pass vectorizes the epilogue and adjusts the control flow 10506 // edges from the first pass. 10507 EPI.MainLoopVF = EPI.EpilogueVF; 10508 EPI.MainLoopUF = EPI.EpilogueUF; 10509 EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC, 10510 ORE, EPI, &LVL, &CM, BFI, PSI, 10511 Checks); 10512 10513 VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF); 10514 LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV, 10515 DT); 10516 ++LoopsEpilogueVectorized; 10517 10518 if (!MainILV.areSafetyChecksAdded()) 10519 DisableRuntimeUnroll = true; 10520 } else { 10521 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC, 10522 &LVL, &CM, BFI, PSI, Checks); 10523 10524 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); 10525 LVP.executePlan(VF.Width, IC, BestPlan, LB, DT); 10526 ++LoopsVectorized; 10527 10528 // Add metadata to disable runtime unrolling a scalar loop when there 10529 // are no runtime checks about strides and memory. A scalar loop that is 10530 // rarely used is not worth unrolling. 10531 if (!LB.areSafetyChecksAdded()) 10532 DisableRuntimeUnroll = true; 10533 } 10534 // Report the vectorization decision. 
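    // The emitted remark reads along the lines of (values illustrative):
    //   "vectorized loop (vectorization width: 4, interleaved count: 2)"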
10535 ORE->emit([&]() { 10536 return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(), 10537 L->getHeader()) 10538 << "vectorized loop (vectorization width: " 10539 << NV("VectorizationFactor", VF.Width) 10540 << ", interleaved count: " << NV("InterleaveCount", IC) << ")"; 10541 }); 10542 } 10543 10544 if (ORE->allowExtraAnalysis(LV_NAME)) 10545 checkMixedPrecision(L, ORE); 10546 } 10547 10548 Optional<MDNode *> RemainderLoopID = 10549 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 10550 LLVMLoopVectorizeFollowupEpilogue}); 10551 if (RemainderLoopID.hasValue()) { 10552 L->setLoopID(RemainderLoopID.getValue()); 10553 } else { 10554 if (DisableRuntimeUnroll) 10555 AddRuntimeUnrollDisableMetaData(L); 10556 10557 // Mark the loop as already vectorized to avoid vectorizing again. 10558 Hints.setAlreadyVectorized(); 10559 } 10560 10561 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 10562 return true; 10563 } 10564 10565 LoopVectorizeResult LoopVectorizePass::runImpl( 10566 Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_, 10567 DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_, 10568 DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_, 10569 std::function<const LoopAccessInfo &(Loop &)> &GetLAA_, 10570 OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) { 10571 SE = &SE_; 10572 LI = &LI_; 10573 TTI = &TTI_; 10574 DT = &DT_; 10575 BFI = &BFI_; 10576 TLI = TLI_; 10577 AA = &AA_; 10578 AC = &AC_; 10579 GetLAA = &GetLAA_; 10580 DB = &DB_; 10581 ORE = &ORE_; 10582 PSI = PSI_; 10583 10584 // Don't attempt if 10585 // 1. the target claims to have no vector registers, and 10586 // 2. interleaving won't help ILP. 10587 // 10588 // The second condition is necessary because, even if the target has no 10589 // vector registers, loop vectorization may still enable scalar 10590 // interleaving. 10591 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) && 10592 TTI->getMaxInterleaveFactor(1) < 2) 10593 return LoopVectorizeResult(false, false); 10594 10595 bool Changed = false, CFGChanged = false; 10596 10597 // The vectorizer requires loops to be in simplified form. 10598 // Since simplification may add new inner loops, it has to run before the 10599 // legality and profitability checks. This means running the loop vectorizer 10600 // will simplify all loops, regardless of whether anything end up being 10601 // vectorized. 10602 for (auto &L : *LI) 10603 Changed |= CFGChanged |= 10604 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */); 10605 10606 // Build up a worklist of inner-loops to vectorize. This is necessary as 10607 // the act of vectorizing or partially unrolling a loop creates new loops 10608 // and can invalidate iterators across the loops. 10609 SmallVector<Loop *, 8> Worklist; 10610 10611 for (Loop *L : *LI) 10612 collectSupportedLoops(*L, LI, ORE, Worklist); 10613 10614 LoopsAnalyzed += Worklist.size(); 10615 10616 // Now walk the identified inner loops. 10617 while (!Worklist.empty()) { 10618 Loop *L = Worklist.pop_back_val(); 10619 10620 // For the inner loops we actually process, form LCSSA to simplify the 10621 // transform. 10622 Changed |= formLCSSARecursively(*L, *DT, LI, SE); 10623 10624 Changed |= CFGChanged |= processLoop(L); 10625 } 10626 10627 // Process each loop nest in the function. 
10628 return LoopVectorizeResult(Changed, CFGChanged); 10629 } 10630 10631 PreservedAnalyses LoopVectorizePass::run(Function &F, 10632 FunctionAnalysisManager &AM) { 10633 auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F); 10634 auto &LI = AM.getResult<LoopAnalysis>(F); 10635 auto &TTI = AM.getResult<TargetIRAnalysis>(F); 10636 auto &DT = AM.getResult<DominatorTreeAnalysis>(F); 10637 auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F); 10638 auto &TLI = AM.getResult<TargetLibraryAnalysis>(F); 10639 auto &AA = AM.getResult<AAManager>(F); 10640 auto &AC = AM.getResult<AssumptionAnalysis>(F); 10641 auto &DB = AM.getResult<DemandedBitsAnalysis>(F); 10642 auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F); 10643 10644 auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager(); 10645 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 10646 [&](Loop &L) -> const LoopAccessInfo & { 10647 LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, 10648 TLI, TTI, nullptr, nullptr, nullptr}; 10649 return LAM.getResult<LoopAccessAnalysis>(L, AR); 10650 }; 10651 auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F); 10652 ProfileSummaryInfo *PSI = 10653 MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent()); 10654 LoopVectorizeResult Result = 10655 runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI); 10656 if (!Result.MadeAnyChange) 10657 return PreservedAnalyses::all(); 10658 PreservedAnalyses PA; 10659 10660 // We currently do not preserve loopinfo/dominator analyses with outer loop 10661 // vectorization. Until this is addressed, mark these analyses as preserved 10662 // only for non-VPlan-native path. 10663 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 10664 if (!EnableVPlanNativePath) { 10665 PA.preserve<LoopAnalysis>(); 10666 PA.preserve<DominatorTreeAnalysis>(); 10667 } 10668 if (!Result.MadeCFGChange) 10669 PA.preserveSet<CFGAnalyses>(); 10670 return PA; 10671 } 10672 10673 void LoopVectorizePass::printPipeline( 10674 raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) { 10675 static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline( 10676 OS, MapClassName2PassName); 10677 10678 OS << "<"; 10679 OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;"; 10680 OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;"; 10681 OS << ">"; 10682 } 10683
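// With both options at their defaults, the printed fragment is expected to
// look like (illustrative):
//   loop-vectorize<no-interleave-forced-only;no-vectorize-forced-only;>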