//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
//  Data for SIMD
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "LoopVectorizationPlanner.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanHCFGBuilder.h"
#include "VPlanPredicator.h"
#include "VPlanTransforms.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/InstructionCost.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <functional>
#include <iterator>
#include <limits>
#include <memory>
#include <string>
#include <tuple>
#include <utility>

using namespace llvm;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

#ifndef NDEBUG
const char VerboseDebug[] = DEBUG_TYPE "-verbose";
#endif

/// @{
/// Metadata attribute names
const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
const char LLVMLoopVectorizeFollowupVectorized[] =
    "llvm.loop.vectorize.followup_vectorized";
const char LLVMLoopVectorizeFollowupEpilogue[] =
    "llvm.loop.vectorize.followup_epilogue";
/// @}

STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");

static cl::opt<bool> EnableEpilogueVectorization(
    "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
    cl::desc("Enable vectorization of epilogue loops."));

static cl::opt<unsigned> EpilogueVectorizationForceVF(
    "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
    cl::desc("When epilogue vectorization is enabled, and a value greater than "
             "1 is specified, forces the given VF for all applicable epilogue "
             "loops."));

static cl::opt<unsigned> EpilogueVectorizationMinVF(
    "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
    cl::desc("Only loops with vectorization factor equal to or larger than "
             "the specified value are considered for epilogue vectorization."));

/// Loops with a known constant trip count below this number are vectorized
/// only if no scalar iteration overheads are incurred.
static cl::opt<unsigned> TinyTripCountVectorThreshold(
    "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
    cl::desc("Loops with a constant trip count that is smaller than this "
             "value are vectorized only if no scalar iteration overheads "
             "are incurred."));

static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold(
    "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
    cl::desc("The maximum allowed number of runtime memory checks with a "
             "vectorize(enable) pragma."));

// Option prefer-predicate-over-epilogue indicates that an epilogue is
// undesired, that predication is preferred, and this lists all options. I.e.,
// the vectorizer will try to fold the tail-loop (epilogue) into the vector
// body and predicate the instructions accordingly.
// If tail-folding fails, there are different fallback strategies depending on
// these values:
namespace PreferPredicateTy {
enum Option {
  ScalarEpilogue = 0,
  PredicateElseScalarEpilogue,
  PredicateOrDontVectorize
};
} // namespace PreferPredicateTy

static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
    "prefer-predicate-over-epilogue",
    cl::init(PreferPredicateTy::ScalarEpilogue),
    cl::Hidden,
    cl::desc("Tail-folding and predication preferences over creating a scalar "
             "epilogue loop."),
    cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
                          "scalar-epilogue",
                          "Don't tail-predicate loops, create scalar epilogue"),
               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
                          "predicate-else-scalar-epilogue",
                          "prefer tail-folding, create scalar epilogue if tail "
                          "folding fails."),
               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
                          "predicate-dont-vectorize",
                          "prefer tail-folding, don't attempt vectorization if "
                          "tail-folding fails.")));

static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a "
             "loop"));

static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
             "below this number"));

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));
static cl::opt<bool> ForceTargetSupportsScalableVectors(
    "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
    cl::desc(
        "Pretend that scalable vectors are supported, even if the target does "
        "not support them. This flag should only be used for testing."));

static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// Interleave small loops with scalar reductions.
static cl::opt<bool> InterleaveSmallLoopScalarReduction(
    "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
    cl::desc("Enable interleaving for loops with small iteration counts that "
             "contain scalar reductions to expose ILP."));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if-predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

static cl::opt<bool>
    PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
                           cl::Hidden,
                           cl::desc("Prefer in-loop vector reductions, "
                                    "overriding the target's preference."));

static cl::opt<bool> ForceOrderedReductions(
    "force-ordered-reductions", cl::init(false), cl::Hidden,
    cl::desc("Enable the vectorisation of loops with in-order (strict) "
             "FP reductions"));

static cl::opt<bool> PreferPredicatedReductionSelect(
    "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
    cl::desc(
        "Prefer predicating a reduction operation over an after loop select."));

cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));

// FIXME: Remove this switch once we have divergence analysis. Currently we
// assume divergent non-backedge branches when this switch is true.
cl::opt<bool> EnableVPlanPredication(
    "enable-vplan-predication", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path predicator with "
             "support for outer loop vectorization."));

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

cl::opt<bool> PrintVPlansInDotFormat(
    "vplan-print-in-dot-format", cl::init(false), cl::Hidden,
    cl::desc("Use dot format instead of plain text when dumping VPlans"));

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type.
static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
  // Determine if an array of N elements of type Ty is "bitcast compatible"
  // with a <N x Ty> vector.
  // This is only true if there is no padding between the array elements.
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}
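// For example, with a typical x86-64 data layout, x86_fp80 has a type size of
// 80 bits but an alloc size of 128 bits, so consecutive array elements contain
// padding and hasIrregularType() returns true for it.
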
/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

/// Returns "best known" trip count for the specified loop \p L as defined by
/// the following procedure:
///   1) Returns exact trip count if it is known.
///   2) Returns expected trip count according to profile data if any.
///   3) Returns upper bound estimate if it is known.
///   4) Returns None if all of the above failed.
static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
  // Check if exact trip count is known.
  if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
    return ExpectedTC;

  // Check if there is an expected trip count available from profile data.
  if (LoopVectorizeWithBlockFrequency)
    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
      return EstimatedTC;

  // Check if upper bound estimate is known.
  if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
    return ExpectedTC;

  return None;
}

// Forward declare GeneratedRTChecks.
class GeneratedRTChecks;

namespace llvm {

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found to a given vectorization factor.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                      ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
        PSI(PSI), RTChecks(RTChecks) {
    // Query this against the original loop and save it here because the
    // profile of the original loop header may change as the transformation
    // happens.
    OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
        OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
  }

  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop that will contain vectorized instructions later
  /// on, while the old loop will be used as the scalar remainder. Control flow
  /// is generated around the vectorized (and scalar epilogue) loops consisting
  /// of various checks and bypasses. Return the pre-header block of the new
  /// loop.
  /// In the case of epilogue vectorization, this function is overridden to
  /// handle the more complex control flow around the loops.
  virtual BasicBlock *createVectorizedLoopSkeleton();

  /// Widen a single instruction within the innermost loop.
  void widenInstruction(Instruction &I, VPValue *Def, VPUser &Operands,
                        VPTransformState &State);

  /// Widen a single call instruction within the innermost loop.
  void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
                            VPTransformState &State);

  /// Widen a single select instruction within the innermost loop.
  void widenSelectInstruction(SelectInst &I, VPValue *VPDef, VPUser &Operands,
                              bool InvariantCond, VPTransformState &State);

  /// Fix the vectorized code, taking care of header phi's, live-outs, and
  /// more.
  void fixVectorizedLoop(VPTransformState &State);

  // Return true if any runtime check is added.
  bool areSafetyChecksAdded() { return AddedSafetyChecks; }

  /// A type for vectorized values in the new loop. Each value from the
  /// original loop, when vectorized, is represented by UF vector values in the
  /// new unrolled loop, where UF is the unroll factor.
  using VectorParts = SmallVector<Value *, 2>;

  /// Vectorize a single GetElementPtrInst based on information gathered and
  /// decisions taken during planning.
  void widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, VPUser &Indices,
                unsigned UF, ElementCount VF, bool IsPtrLoopInvariant,
                SmallBitVector &IsIndexLoopInvariant, VPTransformState &State);

  /// Vectorize a single first-order recurrence or pointer induction PHINode in
  /// a block. This method handles the induction variable canonicalization. It
  /// supports both VF = 1 for unrolled loops and arbitrary length vectors.
  void widenPHIInstruction(Instruction *PN, VPWidenPHIRecipe *PhiR,
                           VPTransformState &State);

  /// A helper function to scalarize a single Instruction in the innermost
  /// loop. Generates a single scalar instance for the part and lane given by
  /// \p Instance. Uses the VPValue operands from \p Operands instead of \p
  /// Instr's operands.
  void scalarizeInstruction(Instruction *Instr, VPValue *Def, VPUser &Operands,
                            const VPIteration &Instance, bool IfPredicateInstr,
                            VPTransformState &State);

  /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
  /// is provided, the integer induction variable will first be truncated to
  /// the corresponding type.
  void widenIntOrFpInduction(PHINode *IV, Value *Start, TruncInst *Trunc,
                             VPValue *Def, VPValue *CastDef,
                             VPTransformState &State);

  /// Construct the vector value of a scalarized value \p V one lane at a time.
  void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance,
                                 VPTransformState &State);

  /// Try to vectorize interleaved access group \p Group with the base address
  /// given in \p Addr, optionally masking the vector operations if \p
  /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
  /// values in the vectorized loop.
  void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
                                ArrayRef<VPValue *> VPDefs,
                                VPTransformState &State, VPValue *Addr,
                                ArrayRef<VPValue *> StoredValues,
                                VPValue *BlockInMask = nullptr);

  /// Vectorize Load and Store instructions with the base address given in \p
  /// Addr, optionally masking the vector operations if \p BlockInMask is
  /// non-null. Use \p State to translate given VPValues to IR values in the
  /// vectorized loop.
  void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
                                  VPValue *Def, VPValue *Addr,
                                  VPValue *StoredValue, VPValue *BlockInMask,
                                  bool ConsecutiveStride, bool Reverse);

  /// Set the debug location in the builder \p CustomBuilder using the debug
  /// location in \p V. If \p CustomBuilder is None then it uses the class
  /// member's Builder.
  void setDebugLocFromInst(const Value *V,
                           Optional<IRBuilder<> *> CustomBuilder = None);

  /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
  void fixNonInductionPHIs(VPTransformState &State);

  /// Returns true if the reordering of FP operations is not allowed, but we
  /// are able to vectorize with strict in-order reductions for the given
  /// RdxDesc.
  bool useOrderedReductions(RecurrenceDescriptor &RdxDesc);

  /// Create a broadcast instruction. This method generates a broadcast
  /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, ...
  /// this is needed because each iteration in the loop corresponds to a SIMD
  /// element.
  virtual Value *getBroadcastInstrs(Value *V);

protected:
  friend class LoopVectorizationPlanner;

  /// A small list of PHINodes.
  using PhiVector = SmallVector<PHINode *, 4>;

  /// A type for scalarized values in the new loop. Each value from the
  /// original loop, when scalarized, is represented by UF x VF scalar values
  /// in the new unrolled loop, where UF is the unroll factor and VF is the
  /// vectorization factor.
  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;

  /// Set up the values of the IVs correctly when exiting the vector loop.
  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                    Value *CountRoundDown, Value *EndValue,
                    BasicBlock *MiddleBlock);

  /// Create a new induction variable inside L.
  PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
                                   Value *Step, Instruction *DL);

  /// Handle all cross-iteration phis in the header.
  void fixCrossIterationPHIs(VPTransformState &State);

  /// Create the exit value of first order recurrences in the middle block and
  /// update their users.
  void fixFirstOrderRecurrence(VPWidenPHIRecipe *PhiR, VPTransformState &State);

  /// Create code for the loop exit value of the reduction.
  void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State);

  /// Clear NSW/NUW flags from reduction instructions if necessary.
  void clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc,
                               VPTransformState &State);

  /// Fixup the LCSSA phi nodes in the unique exit block. This simply
  /// means we need to add the appropriate incoming value from the middle
  /// block as exiting edges from the scalar epilogue loop (if present) are
  /// already in place, and we exit the vector loop exclusively to the middle
  /// block.
  void fixLCSSAPHIs(VPTransformState &State);

  /// Iteratively sink the scalarized operands of a predicated instruction into
  /// the block that was created for it.
  void sinkScalarOperands(Instruction *PredInst);

  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
  /// represented as.
  void truncateToMinimalBitwidths(VPTransformState &State);

  /// This function adds
  ///   (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
  /// to each vector element of Val. The sequence starts at \p StartIdx.
  /// \p Opcode is relevant for FP induction variable.
  virtual Value *
  getStepVector(Value *Val, Value *StartIdx, Value *Step,
                Instruction::BinaryOps Opcode = Instruction::BinaryOpsEnd);

  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
  /// variable on which to base the steps, \p Step is the size of the step, and
  /// \p EntryVal is the value from the original loop that maps to the steps.
  /// Note that \p EntryVal doesn't have to be an induction variable - it
  /// can also be a truncate instruction.
  void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
                        const InductionDescriptor &ID, VPValue *Def,
                        VPValue *CastDef, VPTransformState &State);

  /// Create a vector induction phi node based on an existing scalar one. \p
  /// EntryVal is the value from the original loop that maps to the vector phi
  /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
  /// truncate instruction, instead of widening the original IV, we widen a
  /// version of the IV truncated to \p EntryVal's type.
  void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
                                       Value *Step, Value *Start,
                                       Instruction *EntryVal, VPValue *Def,
                                       VPValue *CastDef,
                                       VPTransformState &State);

  /// Returns true if an instruction \p I should be scalarized instead of
  /// vectorized for the chosen vectorization factor.
  bool shouldScalarizeInstruction(Instruction *I) const;

  /// Returns true if we should generate a scalar version of \p IV.
  bool needsScalarInduction(Instruction *IV) const;

  /// If there is a cast involved in the induction variable \p ID, which should
  /// be ignored in the vectorized loop body, this function records the
  /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
  /// cast. We had already proved that the casted Phi is equal to the uncasted
  /// Phi in the vectorized loop (under a runtime guard), and therefore
  /// there is no need to vectorize the cast - the same value can be used in the
  /// vector loop for both the Phi and the cast.
  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified;
  /// otherwise, \p VectorLoopValue is a widened/vectorized value.
  ///
  /// \p EntryVal is the value from the original loop that maps to the vector
  /// phi node and is used to distinguish what is the IV currently being
  /// processed - original one (if \p EntryVal is a phi corresponding to the
  /// original IV) or the "newly-created" one based on the proof mentioned above
  /// (see also buildScalarSteps() and createVectorIntOrFpInductionPHI()). In
  /// the latter case \p EntryVal is a TruncInst and we must not record anything
  /// for that IV, but it's error-prone to expect callers of this routine to
  /// care about that, hence this explicit parameter.
  void recordVectorLoopValueForInductionCast(
      const InductionDescriptor &ID, const Instruction *EntryVal,
      Value *VectorLoopValue, VPValue *CastDef, VPTransformState &State,
      unsigned Part, unsigned Lane = UINT_MAX);

  /// Generate a shuffle sequence that will reverse the vector Vec.
  virtual Value *reverseVector(Value *Vec);

  /// Returns (and creates if needed) the original loop trip count.
  Value *getOrCreateTripCount(Loop *NewLoop);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(Loop *NewLoop);

  /// Returns a bitcasted value to the requested vector type.
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                const DataLayout &DL);

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct. Returns the block containing the checks or
  /// nullptr if no checks have been added.
  BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  /// Returns the block containing the checks or nullptr if no checks have been
  /// added.
  BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);

  /// Compute the transformed value of Index at offset StartValue using step
  /// StepValue.
  /// For integer induction, returns StartValue + Index * StepValue.
  /// For pointer induction, returns StartValue[Index * StepValue].
  /// FIXME: The newly created binary instructions should contain nsw/nuw
  /// flags, which can be found from the original scalar operations.
  Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
                              const DataLayout &DL,
                              const InductionDescriptor &ID) const;

  /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
  /// vector loop preheader, middle block and scalar preheader. Also
  /// allocate a loop object for the new vector loop and return it.
  Loop *createVectorLoopSkeleton(StringRef Prefix);

  /// Create new phi nodes for the induction variables to resume iteration
  /// count in the scalar epilogue, from where the vectorized loop left off
  /// (given by \p VectorTripCount).
  /// In cases where the loop skeleton is more complicated (eg. epilogue
  /// vectorization) and the resume values can come from an additional bypass
  /// block, the \p AdditionalBypass pair provides information about the bypass
  /// block and the end value on the edge from bypass to this loop.
  void createInductionResumeValues(
      Loop *L, Value *VectorTripCount,
      std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});

  /// Complete the loop skeleton by adding debug MDs, creating appropriate
  /// conditional branches in the middle block, preparing the builder and
  /// running the verifier. Take in the vector loop \p L as argument, and return
  /// the preheader of the completed vector loop.
  BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);

  /// Add additional metadata to \p To that was not present on \p Orig.
  ///
  /// Currently this is used to add the noalias annotations based on the
  /// inserted memchecks. Use this for instructions that are *cloned* into the
  /// vector loop.
  void addNewMetadata(Instruction *To, const Instruction *Orig);

  /// Add metadata from one instruction to another.
  ///
  /// This includes both the original MDs from \p From and additional ones (\see
  /// addNewMetadata). Use this for *newly created* instructions in the vector
  /// loop.
  void addMetadata(Instruction *To, Instruction *From);

  /// Similar to the previous function but it adds the metadata to a
  /// vector of instructions.
  void addMetadata(ArrayRef<Value *> To, Instruction *From);

  /// Allow subclasses to override and print debug traces before/after vplan
  /// execution, when trace information is requested.
  virtual void printDebugTracesAtStart(){};
  virtual void printDebugTracesAtEnd(){};

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Alias Analysis.
  AAResults *AA;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// LoopVersioning. It's only set up (non-null) if memchecks were
  /// used.
  ///
  /// This is currently only used to add no-alias metadata based on the
  /// memchecks. The actual versioning is performed manually.
  std::unique_ptr<LoopVersioning> LVer;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  ElementCount VF;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use.
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;

  /// The unique ExitBlock of the scalar loop if one exists. Note that
  /// there can be multiple exiting edges reaching this block.
  BasicBlock *LoopExitBlock;

  /// The vector loop body.
  BasicBlock *LoopVectorBody;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// The new Induction variable which was added to the new block.
  PHINode *Induction = nullptr;

  /// The induction variable of the old basic block.
  PHINode *OldInduction = nullptr;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
  // fixed up at the end of vector code generation.
  SmallVector<PHINode *, 8> OrigPHIsToFix;

  /// BFI and PSI are used to check for profile guided size optimizations.
  BlockFrequencyInfo *BFI;
  ProfileSummaryInfo *PSI;

  // Whether this loop should be optimized for size based on profile guided
  // size optimizations.
  bool OptForSizeBasedOnProfile;

  /// Structure to hold information about generated runtime checks, responsible
  /// for cleaning the checks, if vectorization turns out unprofitable.
  GeneratedRTChecks &RTChecks;
};

class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                    ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            ElementCount::getFixed(1), UnrollFactor, LVL, CM,
                            BFI, PSI, Check) {}

private:
  Value *getBroadcastInstrs(Value *V) override;
  Value *getStepVector(
      Value *Val, Value *StartIdx, Value *Step,
      Instruction::BinaryOps Opcode = Instruction::BinaryOpsEnd) override;
  Value *reverseVector(Value *Vec) override;
};

/// Encapsulate information regarding vectorization of a loop and its epilogue.
/// This information is meant to be updated and used across two stages of
/// epilogue vectorization.
struct EpilogueLoopVectorizationInfo {
  ElementCount MainLoopVF = ElementCount::getFixed(0);
  unsigned MainLoopUF = 0;
  ElementCount EpilogueVF = ElementCount::getFixed(0);
  unsigned EpilogueUF = 0;
  BasicBlock *MainLoopIterationCountCheck = nullptr;
  BasicBlock *EpilogueIterationCountCheck = nullptr;
  BasicBlock *SCEVSafetyCheck = nullptr;
  BasicBlock *MemSafetyCheck = nullptr;
  Value *TripCount = nullptr;
  Value *VectorTripCount = nullptr;

  EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF,
                                ElementCount EVF, unsigned EUF)
      : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) {
    assert(EUF == 1 &&
           "A high UF for the epilogue loop is likely not beneficial.");
  }
};

/// An extension of the inner loop vectorizer that creates a skeleton for a
/// vectorized loop that has its epilogue (residual) also vectorized.
/// The idea is to run the vplan on a given loop twice, first to set up the
/// skeleton and vectorize the main loop, and second to complete the skeleton
/// from the first step and vectorize the epilogue. This is achieved by
/// deriving two concrete strategy classes from this base class and invoking
/// them in succession from the loop vectorizer planner.
class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
public:
  InnerLoopAndEpilogueVectorizer(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI,
                            Checks),
        EPI(EPI) {}

  // Override this function to handle the more complex control flow around the
  // three loops.
  BasicBlock *createVectorizedLoopSkeleton() final override {
    return createEpilogueVectorizedLoopSkeleton();
  }

  /// The interface for creating a vectorized skeleton using one of two
  /// different strategies, each corresponding to one execution of the vplan
  /// as described above.
  virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0;

  /// Holds and updates state information required to vectorize the main loop
  /// and its epilogue in two separate passes. This setup helps us avoid
  /// regenerating and recomputing runtime safety checks. It also helps us to
  /// shorten the iteration-count-check path length for the cases where the
  /// iteration count of the loop is so small that the main vector loop is
  /// completely skipped.
  EpilogueLoopVectorizationInfo &EPI;
};

/// A specialized derived class of inner loop vectorizer that performs
/// vectorization of *main* loops in the process of vectorizing loops and their
/// epilogues.
class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerMainLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Check)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Check) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *main loop* strategy (i.e. the first pass of vplan execution).
  BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check once for the main loop (when \p
  /// ForEpilogue is false) and once for the epilogue loop (when \p
  /// ForEpilogue is true).
  BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass,
                                             bool ForEpilogue);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};

// A specialized derived class of inner loop vectorizer that performs
// vectorization of *epilogue* loops in the process of vectorizing loops and
// their epilogues.
class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerEpilogueLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Checks) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *epilogue loop* strategy (i.e. the second pass of vplan execution).
  BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check after the main vector loop has
  /// finished to see if there are any iterations left to execute by either
  /// the vector epilogue or the scalar epilogue.
  BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L,
                                                      BasicBlock *Bypass,
                                                      BasicBlock *Insert);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};
} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its
/// operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return I;

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I;

  for (Use &Op : I->operands()) {
    if (Instruction *OpInst = dyn_cast<Instruction>(Op))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }

  return I;
}

void InnerLoopVectorizer::setDebugLocFromInst(
    const Value *V, Optional<IRBuilder<> *> CustomBuilder) {
  IRBuilder<> *B = (CustomBuilder == None) ? &Builder : *CustomBuilder;
  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(V)) {
    const DILocation *DIL = Inst->getDebugLoc();

    // When a FSDiscriminator is enabled, we don't need to add the multiply
    // factors to the discriminators.
    if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
        !isa<DbgInfoIntrinsic>(Inst) && !EnableFSDiscriminator) {
      // FIXME: For scalable vectors, assume vscale=1.
      auto NewDIL =
          DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
      if (NewDIL)
        B->SetCurrentDebugLocation(NewDIL.getValue());
      else
        LLVM_DEBUG(dbgs()
                   << "Failed to create new discriminator: "
                   << DIL->getFilename() << " Line: " << DIL->getLine());
    } else
      B->SetCurrentDebugLocation(DIL);
  } else
    B->SetCurrentDebugLocation(DebugLoc());
}

/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
/// is passed, the message relates to that particular instruction.
#ifndef NDEBUG
static void debugVectorizationMessage(const StringRef Prefix,
                                      const StringRef DebugMsg,
                                      Instruction *I) {
  dbgs() << "LV: " << Prefix << DebugMsg;
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}
#endif

/// Create an analysis remark that explains why vectorization failed
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
/// RemarkName is the identifier for the remark. If \p I is passed it is an
/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
/// the location of the remark. \return the remark object that can be
/// streamed to.
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
                                                   StringRef RemarkName,
                                                   Loop *TheLoop,
                                                   Instruction *I) {
  Value *CodeRegion = TheLoop->getHeader();
  DebugLoc DL = TheLoop->getStartLoc();

  if (I) {
    CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, fall back to
    // using the loop's.
    if (I->getDebugLoc())
      DL = I->getDebugLoc();
  }

  return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
}

/// Return a value for Step multiplied by VF.
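/// For example, with \p Step == 2 and a scalable VF of <vscale x 4> this
/// returns the runtime value 8 * vscale, while a fixed VF of 4 folds to the
/// constant 8.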
static Value *createStepForVF(IRBuilder<> &B, Type *Ty, ElementCount VF,
                              int64_t Step) {
  assert(Ty->isIntegerTy() && "Expected an integer step");
  Constant *StepVal = ConstantInt::get(Ty, Step * VF.getKnownMinValue());
  return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
}

namespace llvm {

/// Return the runtime value for VF.
Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF) {
  Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue());
  return VF.isScalable() ? B.CreateVScale(EC) : EC;
}

static Value *getRuntimeVFAsFloat(IRBuilder<> &B, Type *FTy, ElementCount VF) {
  assert(FTy->isFloatingPointTy() && "Expected floating point type!");
  Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits());
  Value *RuntimeVF = getRuntimeVF(B, IntTy, VF);
  return B.CreateUIToFP(RuntimeVF, FTy);
}

void reportVectorizationFailure(const StringRef DebugMsg,
                                const StringRef OREMsg, const StringRef ORETag,
                                OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                                Instruction *I) {
  LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(
      createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
      << "loop not vectorized: " << OREMsg);
}

void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
                             OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                             Instruction *I) {
  LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(
      createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
      << Msg);
}

} // end namespace llvm

#ifndef NDEBUG
/// \return string containing a file name and a line # for the given loop.
static std::string getDebugLocString(const Loop *L) {
  std::string Result;
  if (L) {
    raw_string_ostream OS(Result);
    if (const DebugLoc LoopDbgLoc = L->getStartLoc())
      LoopDbgLoc.print(OS);
    else
      // Just print the module name.
      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
    OS.flush();
  }
  return Result;
}
#endif

void InnerLoopVectorizer::addNewMetadata(Instruction *To,
                                         const Instruction *Orig) {
  // If the loop was versioned with memchecks, add the corresponding no-alias
  // metadata.
  if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
    LVer->annotateInstWithNoAlias(To, Orig);
}

void InnerLoopVectorizer::addMetadata(Instruction *To,
                                      Instruction *From) {
  propagateMetadata(To, From);
  addNewMetadata(To, From);
}

void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
                                      Instruction *From) {
  for (Value *V : To) {
    if (Instruction *I = dyn_cast<Instruction>(V))
      addMetadata(I, From);
  }
}

namespace llvm {

// Loop vectorization cost-model hints for how the scalar epilogue loop should
// be lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorisation with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate,

  // Directive indicating we must either tail fold or not vectorize
  CM_ScalarEpilogueNotAllowedUsePredicate
};

/// ElementCountComparator creates a total ordering for ElementCount
/// for the purposes of using it in a set structure.
struct ElementCountComparator {
  bool operator()(const ElementCount &LHS, const ElementCount &RHS) const {
    return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) <
           std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue());
  }
};
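// The resulting order puts all fixed element counts before all scalable ones
// and breaks ties by the known minimum value, e.g. 4 < 8 < (vscale x 1) <
// (vscale x 4).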
using ElementCountSet = SmallSet<ElementCount, 16, ElementCountComparator>;

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because of
/// a number of reasons. In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
class LoopVectorizationCostModel {
public:
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {}

  /// \return An upper bound for the vectorization factors (both fixed and
  /// scalable). If the factors are 0, vectorization and interleaving should be
  /// avoided up front.
  FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
  bool runtimeChecksRequired();

  /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every VF in \p CandidateVFs. If UserVF is not ZERO
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
  VectorizationFactor
  selectVectorizationFactor(const ElementCountSet &CandidateVFs);

  VectorizationFactor
  selectEpilogueVectorizationFactor(const ElementCount MaxVF,
                                    const LoopVectorizationPlanner &LVP);

  /// Setup cost-based decisions for user vectorization factor.
  /// \return true if the UserVF is a feasible VF to be chosen.
  bool selectUserVectorizationFactor(ElementCount UserVF) {
    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);
    return expectedCost(UserVF).first.isValid();
  }

  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// \return The desired interleave count.
  /// If interleave count has been specified by metadata it will be returned.
  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
  /// are the selected vectorization factor and the cost of the selected VF.
  unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);

  /// Memory access instruction may be vectorized in more than one way.
  /// Form of instruction after vectorization depends on cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decisions map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with the widening decision in order to
  /// avoid redundant calculations.
  void setCostBasedWideningDecision(ElementCount VF);

  /// A struct that represents some properties of the register usage
  /// of a loop.
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
    /// Holds the maximum number of concurrent live intervals in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
  };

  /// \return Returns information about the register usages of the loop for the
  /// given vectorization factors.
  SmallVector<RegisterUsage, 8>
  calculateRegisterUsage(ArrayRef<ElementCount> VFs);

  /// Collect values we want to ignore in the cost model.
  void collectValuesToIgnore();

  /// Collect all element types in the loop for which widening is needed.
  void collectElementTypesForWidening();

  /// Split reductions into those that happen in the loop, and those that
  /// happen outside. In-loop reductions are collected into
  /// InLoopReductionChains.
  void collectInLoopReductions();

  /// Returns true if we should use strict in-order reductions for the given
  /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
  /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
  /// of FP operations.
  bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) {
    return !Hints->allowReordering() && RdxDesc.isOrdered();
  }

  /// \returns The smallest bitwidth each instruction can be represented with.
  /// The vector equivalents of these instructions should be truncated to this
  /// type.
  const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
    return MinBWs;
  }

  /// \returns True if it is more profitable to scalarize instruction \p I for
  /// vectorization factor \p VF.
1338 bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
1339 assert(VF.isVector() &&
1340 "Profitable to scalarize relevant only for VF > 1.");
1341
1342 // Cost model is not run in the VPlan-native path - return conservative
1343 // result until this changes.
1344 if (EnableVPlanNativePath)
1345 return false;
1346
1347 auto Scalars = InstsToScalarize.find(VF);
1348 assert(Scalars != InstsToScalarize.end() &&
1349 "VF not yet analyzed for scalarization profitability");
1350 return Scalars->second.find(I) != Scalars->second.end();
1351 }
1352
1353 /// Returns true if \p I is known to be uniform after vectorization.
1354 bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
1355 if (VF.isScalar())
1356 return true;
1357
1358 // Cost model is not run in the VPlan-native path - return conservative
1359 // result until this changes.
1360 if (EnableVPlanNativePath)
1361 return false;
1362
1363 auto UniformsPerVF = Uniforms.find(VF);
1364 assert(UniformsPerVF != Uniforms.end() &&
1365 "VF not yet analyzed for uniformity");
1366 return UniformsPerVF->second.count(I);
1367 }
1368
1369 /// Returns true if \p I is known to be scalar after vectorization.
1370 bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
1371 if (VF.isScalar())
1372 return true;
1373
1374 // Cost model is not run in the VPlan-native path - return conservative
1375 // result until this changes.
1376 if (EnableVPlanNativePath)
1377 return false;
1378
1379 auto ScalarsPerVF = Scalars.find(VF);
1380 assert(ScalarsPerVF != Scalars.end() &&
1381 "Scalar values are not calculated for VF");
1382 return ScalarsPerVF->second.count(I);
1383 }
1384
1385 /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1386 /// for vectorization factor \p VF.
1387 bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
1388 return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
1389 !isProfitableToScalarize(I, VF) &&
1390 !isScalarAfterVectorization(I, VF);
1391 }
1392
1393 /// Decision that was taken during cost calculation for memory instruction.
1394 enum InstWidening {
1395 CM_Unknown,
1396 CM_Widen, // For consecutive accesses with stride +1.
1397 CM_Widen_Reverse, // For consecutive accesses with stride -1.
1398 CM_Interleave,
1399 CM_GatherScatter,
1400 CM_Scalarize
1401 };
1402
1403 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1404 /// instruction \p I and vector width \p VF.
1405 void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
1406 InstructionCost Cost) {
1407 assert(VF.isVector() && "Expected VF >=2");
1408 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1409 }
1410
1411 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1412 /// interleaving group \p Grp and vector width \p VF.
1413 void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1414 ElementCount VF, InstWidening W,
1415 InstructionCost Cost) {
1416 assert(VF.isVector() && "Expected VF >=2");
1417 /// Broadcast this decision to all instructions inside the group.
1418 /// But the cost will be assigned to one instruction only.
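/// For example (illustrative): for a factor-2 group {A[2*i], A[2*i+1]} whose
/// insert position is the A[2*i] access, the loop below records (W, Cost)
/// for A[2*i] and (W, 0) for A[2*i+1].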
1419 for (unsigned i = 0; i < Grp->getFactor(); ++i) { 1420 if (auto *I = Grp->getMember(i)) { 1421 if (Grp->getInsertPos() == I) 1422 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost); 1423 else 1424 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0); 1425 } 1426 } 1427 } 1428 1429 /// Return the cost model decision for the given instruction \p I and vector 1430 /// width \p VF. Return CM_Unknown if this instruction did not pass 1431 /// through the cost modeling. 1432 InstWidening getWideningDecision(Instruction *I, ElementCount VF) const { 1433 assert(VF.isVector() && "Expected VF to be a vector VF"); 1434 // Cost model is not run in the VPlan-native path - return conservative 1435 // result until this changes. 1436 if (EnableVPlanNativePath) 1437 return CM_GatherScatter; 1438 1439 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF); 1440 auto Itr = WideningDecisions.find(InstOnVF); 1441 if (Itr == WideningDecisions.end()) 1442 return CM_Unknown; 1443 return Itr->second.first; 1444 } 1445 1446 /// Return the vectorization cost for the given instruction \p I and vector 1447 /// width \p VF. 1448 InstructionCost getWideningCost(Instruction *I, ElementCount VF) { 1449 assert(VF.isVector() && "Expected VF >=2"); 1450 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF); 1451 assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() && 1452 "The cost is not calculated"); 1453 return WideningDecisions[InstOnVF].second; 1454 } 1455 1456 /// Return True if instruction \p I is an optimizable truncate whose operand 1457 /// is an induction variable. Such a truncate will be removed by adding a new 1458 /// induction variable with the destination type. 1459 bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) { 1460 // If the instruction is not a truncate, return false. 1461 auto *Trunc = dyn_cast<TruncInst>(I); 1462 if (!Trunc) 1463 return false; 1464 1465 // Get the source and destination types of the truncate. 1466 Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF); 1467 Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF); 1468 1469 // If the truncate is free for the given types, return false. Replacing a 1470 // free truncate with an induction variable would add an induction variable 1471 // update instruction to each iteration of the loop. We exclude from this 1472 // check the primary induction variable since it will need an update 1473 // instruction regardless. 1474 Value *Op = Trunc->getOperand(0); 1475 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy)) 1476 return false; 1477 1478 // If the truncated value is not an induction variable, return false. 1479 return Legal->isInductionPhi(Op); 1480 } 1481 1482 /// Collects the instructions to scalarize for each predicated instruction in 1483 /// the loop. 1484 void collectInstsToScalarize(ElementCount VF); 1485 1486 /// Collect Uniform and Scalar values for the given \p VF. 1487 /// The sets depend on CM decision for Load/Store instructions 1488 /// that may be vectorized as interleave, gather-scatter or scalarized. 1489 void collectUniformsAndScalars(ElementCount VF) { 1490 // Do the analysis once. 
1491 if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end())
1492 return;
1493 setCostBasedWideningDecision(VF);
1494 collectLoopUniforms(VF);
1495 collectLoopScalars(VF);
1496 }
1497
1498 /// Returns true if the target machine supports masked store operation
1499 /// for the given \p DataType and kind of access to \p Ptr.
1500 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const {
1501 return Legal->isConsecutivePtr(DataType, Ptr) &&
1502 TTI.isLegalMaskedStore(DataType, Alignment);
1503 }
1504
1505 /// Returns true if the target machine supports masked load operation
1506 /// for the given \p DataType and kind of access to \p Ptr.
1507 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const {
1508 return Legal->isConsecutivePtr(DataType, Ptr) &&
1509 TTI.isLegalMaskedLoad(DataType, Alignment);
1510 }
1511
1512 /// Returns true if the target machine can represent \p V as a masked gather
1513 /// or scatter operation.
1514 bool isLegalGatherOrScatter(Value *V) {
1515 bool LI = isa<LoadInst>(V);
1516 bool SI = isa<StoreInst>(V);
1517 if (!LI && !SI)
1518 return false;
1519 auto *Ty = getLoadStoreType(V);
1520 Align Align = getLoadStoreAlignment(V);
1521 return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
1522 (SI && TTI.isLegalMaskedScatter(Ty, Align));
1523 }
1524
1525 /// Returns true if the target machine supports all of the reduction
1526 /// variables found for the given VF.
1527 bool canVectorizeReductions(ElementCount VF) const {
1528 return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
1529 const RecurrenceDescriptor &RdxDesc = Reduction.second;
1530 return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1531 }));
1532 }
1533
1534 /// Returns true if \p I is an instruction that will be scalarized with
1535 /// predication. Such instructions include conditional stores and
1536 /// instructions that may divide by zero.
1537 /// If a non-zero VF has been calculated, we check if I will be scalarized
1538 /// with predication for that VF.
1539 bool isScalarWithPredication(Instruction *I) const;
1540
1541 // Returns true if \p I is an instruction that will be predicated either
1542 // through scalar predication or masked load/store or masked gather/scatter.
1543 // Superset of instructions that return true for isScalarWithPredication.
1544 bool isPredicatedInst(Instruction *I) {
1545 if (!blockNeedsPredication(I->getParent()))
1546 return false;
1547 // Loads and stores that need some form of masked operation are predicated
1548 // instructions.
1549 if (isa<LoadInst>(I) || isa<StoreInst>(I))
1550 return Legal->isMaskRequired(I);
1551 return isScalarWithPredication(I);
1552 }
1553
1554 /// Returns true if \p I is a memory instruction with consecutive memory
1555 /// access that can be widened.
1556 bool
1557 memoryInstructionCanBeWidened(Instruction *I,
1558 ElementCount VF = ElementCount::getFixed(1));
1559
1560 /// Returns true if \p I is a memory instruction in an interleaved-group
1561 /// of memory accesses that can be vectorized with wide vector loads/stores
1562 /// and shuffles.
1563 bool
1564 interleavedAccessCanBeWidened(Instruction *I,
1565 ElementCount VF = ElementCount::getFixed(1));
1566
1567 /// Check if \p Instr belongs to any interleaved access group.
1568 bool isAccessInterleaved(Instruction *Instr) {
1569 return InterleaveInfo.isInterleaved(Instr);
1570 }
1571
1572 /// Get the interleaved access group that \p Instr belongs to.
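/// E.g. (illustrative) the two loads in 'x = A[2*i]; y = A[2*i + 1];' form a
/// single interleave group with factor 2, and both loads map to that group.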
1573 const InterleaveGroup<Instruction> *
1574 getInterleavedAccessGroup(Instruction *Instr) {
1575 return InterleaveInfo.getInterleaveGroup(Instr);
1576 }
1577
1578 /// Returns true if we're required to use a scalar epilogue for at least
1579 /// the final iteration of the original loop.
1580 bool requiresScalarEpilogue(ElementCount VF) const {
1581 if (!isScalarEpilogueAllowed())
1582 return false;
1583 // If we might exit from anywhere but the latch, we must run the exiting
1584 // iteration in scalar form.
1585 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
1586 return true;
1587 return VF.isVector() && InterleaveInfo.requiresScalarEpilogue();
1588 }
1589
1590 /// Returns true if a scalar epilogue is allowed, i.e. not disallowed due to
1591 /// optsize or a loop hint annotation.
1592 bool isScalarEpilogueAllowed() const {
1593 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1594 }
1595
1596 /// Returns true if all loop blocks should be masked to fold the tail of the loop.
1597 bool foldTailByMasking() const { return FoldTailByMasking; }
1598
1599 bool blockNeedsPredication(BasicBlock *BB) const {
1600 return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1601 }
1602
1603 /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
1604 /// nodes to the chain of instructions representing the reductions. Uses a
1605 /// MapVector to ensure deterministic iteration order.
1606 using ReductionChainMap =
1607 SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;
1608
1609 /// Return the chain of instructions representing an inloop reduction.
1610 const ReductionChainMap &getInLoopReductionChains() const {
1611 return InLoopReductionChains;
1612 }
1613
1614 /// Returns true if the Phi is part of an inloop reduction.
1615 bool isInLoopReduction(PHINode *Phi) const {
1616 return InLoopReductionChains.count(Phi);
1617 }
1618
1619 /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1620 /// with factor VF. Return the cost of the instruction, including
1621 /// scalarization overhead if it's needed.
1622 InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;
1623
1624 /// Estimate cost of a call instruction CI if it were vectorized with factor
1625 /// VF. Return the cost of the instruction, including scalarization overhead
1626 /// if it's needed. The flag NeedToScalarize shows if the call needs to be
1627 /// scalarized -
1628 /// i.e. either a vector version isn't available or it is too expensive.
1629 InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF,
1630 bool &NeedToScalarize) const;
1631
1632 /// Returns true if the per-lane cost of VectorizationFactor A is lower than
1633 /// that of B.
1634 bool isMoreProfitable(const VectorizationFactor &A,
1635 const VectorizationFactor &B) const;
1636
1637 /// Invalidates decisions already taken by the cost model.
1638 void invalidateCostModelingDecisions() {
1639 WideningDecisions.clear();
1640 Uniforms.clear();
1641 Scalars.clear();
1642 }
1643
1644 private:
1645 unsigned NumPredStores = 0;
1646
1647 /// \return An upper bound for the vectorization factors for both
1648 /// fixed and scalable vectorization, where the minimum-known number of
1649 /// elements is a power-of-2 larger than zero. If scalable vectorization is
1650 /// disabled or unsupported, then the scalable part will be equal to
1651 /// ElementCount::getScalable(0).
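/// E.g. (illustrative) with 128-bit vector registers and a widest loop type
/// of i32, the fixed part would typically be ElementCount::getFixed(4); a
/// target with scalable vectors might also report ElementCount::getScalable(4).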
1652 FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount,
1653 ElementCount UserVF);
1654
1655 /// \return the maximized element count based on the target's vector
1656 /// registers and the loop trip-count, but limited to a maximum safe VF.
1657 /// This is a helper function of computeFeasibleMaxVF.
1658 /// FIXME: MaxSafeVF is currently passed by reference to avoid some obscure
1659 /// issue that occurred on one of the buildbots which cannot be reproduced
1660 /// without having access to the proprietary compiler (see comments on
1661 /// D98509). The issue is currently under investigation and this workaround
1662 /// will be removed as soon as possible.
1663 ElementCount getMaximizedVFForTarget(unsigned ConstTripCount,
1664 unsigned SmallestType,
1665 unsigned WidestType,
1666 const ElementCount &MaxSafeVF);
1667
1668 /// \return the maximum legal scalable VF, based on the safe max number
1669 /// of elements.
1670 ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
1671
1672 /// The vectorization cost is a combination of the cost itself and a boolean
1673 /// indicating whether any of the contributing operations will actually
1674 /// operate on vector values after type legalization in the backend. If this
1675 /// latter value is false, then all operations will be scalarized (i.e. no
1676 /// vectorization has actually taken place).
1677 using VectorizationCostTy = std::pair<InstructionCost, bool>;
1678
1679 /// Returns the expected execution cost. The unit of the cost does
1680 /// not matter because we use the 'cost' units to compare different
1681 /// vector widths. The cost that is returned is *not* normalized by
1682 /// the factor width. If \p Invalid is not nullptr, this function
1683 /// will add a pair(Instruction*, ElementCount) to \p Invalid for
1684 /// each instruction that has an Invalid cost for the given VF.
1685 using InstructionVFPair = std::pair<Instruction *, ElementCount>;
1686 VectorizationCostTy
1687 expectedCost(ElementCount VF,
1688 SmallVectorImpl<InstructionVFPair> *Invalid = nullptr);
1689
1690 /// Returns the execution time cost of an instruction for a given vector
1691 /// width. A vector width of one means scalar.
1692 VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
1693
1694 /// The cost-computation logic from getInstructionCost which provides
1695 /// the vector type as an output parameter.
1696 InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
1697 Type *&VectorTy);
1698
1699 /// Return the cost of instructions in an inloop reduction pattern, if I is
1700 /// part of that pattern.
1701 Optional<InstructionCost>
1702 getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
1703 TTI::TargetCostKind CostKind);
1704
1705 /// Calculate vectorization cost of memory instruction \p I.
1706 InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1707
1708 /// The cost computation for a scalarized memory instruction.
1709 InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1710
1711 /// The cost computation for an interleaving group of memory instructions.
1712 InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1713
1714 /// The cost computation for a Gather/Scatter instruction.
1715 InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1716
1717 /// The cost computation for widening instruction \p I with consecutive
1718 /// memory access.
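/// E.g. (illustrative) a unit-stride load of A[i] costs one wide vector load
/// per unrolled part (CM_Widen); a stride of -1 additionally pays for a
/// reverse shuffle (CM_Widen_Reverse).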
1719 InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1720
1721 /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1722 /// Load: scalar load + broadcast.
1723 /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1724 /// element)
1725 InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
1726
1727 /// Estimate the overhead of scalarizing an instruction. This is a
1728 /// convenience wrapper for the type-based getScalarizationOverhead API.
1729 InstructionCost getScalarizationOverhead(Instruction *I,
1730 ElementCount VF) const;
1731
1732 /// Returns whether the instruction is a load or store and will be emitted
1733 /// as a vector operation.
1734 bool isConsecutiveLoadOrStore(Instruction *I);
1735
1736 /// Returns true if an artificially high cost for emulated masked memrefs
1737 /// should be used.
1738 bool useEmulatedMaskMemRefHack(Instruction *I);
1739
1740 /// Map of scalar integer values to the smallest bitwidth they can be legally
1741 /// represented as. The vector equivalents of these values should be truncated
1742 /// to this type.
1743 MapVector<Instruction *, uint64_t> MinBWs;
1744
1745 /// A type representing the costs for instructions if they were to be
1746 /// scalarized rather than vectorized. The entries are Instruction-Cost
1747 /// pairs.
1748 using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
1749
1750 /// A set containing all BasicBlocks that are known to be present after
1751 /// vectorization as predicated blocks.
1752 SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1753
1754 /// Records whether it is allowed to have the original scalar loop execute at
1755 /// least once. This may be needed as a fallback loop in case runtime
1756 /// aliasing/dependence checks fail, or to handle the tail/remainder
1757 /// iterations when the trip count is unknown or is not a multiple of the VF,
1758 /// or as a peel-loop to handle gaps in interleave-groups.
1759 /// Under optsize and when the trip count is very small we don't allow any
1760 /// iterations to execute in the scalar loop.
1761 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1762
1763 /// All blocks of the loop are to be masked to fold the tail of the scalar iterations.
1764 bool FoldTailByMasking = false;
1765
1766 /// A map holding scalar costs for different vectorization factors. The
1767 /// presence of a cost for an instruction in the mapping indicates that the
1768 /// instruction will be scalarized when vectorizing with the associated
1769 /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1770 DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
1771
1772 /// Holds the instructions known to be uniform after vectorization.
1773 /// Entries in Uniforms may demand either the first or last lane.
1774 /// The data is collected per VF.
1775 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1776
1777 /// Holds the instructions known to be scalar after vectorization.
1778 /// The data is collected per VF.
1779 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
1780
1781 /// Holds the instructions (address computations) that are forced to be
1782 /// scalarized.
1783 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars; 1784 1785 /// PHINodes of the reductions that should be expanded in-loop along with 1786 /// their associated chains of reduction operations, in program order from top 1787 /// (PHI) to bottom 1788 ReductionChainMap InLoopReductionChains; 1789 1790 /// A Map of inloop reduction operations and their immediate chain operand. 1791 /// FIXME: This can be removed once reductions can be costed correctly in 1792 /// vplan. This was added to allow quick lookup to the inloop operations, 1793 /// without having to loop through InLoopReductionChains. 1794 DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains; 1795 1796 /// Returns the expected difference in cost from scalarizing the expression 1797 /// feeding a predicated instruction \p PredInst. The instructions to 1798 /// scalarize and their scalar costs are collected in \p ScalarCosts. A 1799 /// non-negative return value implies the expression will be scalarized. 1800 /// Currently, only single-use chains are considered for scalarization. 1801 int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts, 1802 ElementCount VF); 1803 1804 /// Collect the instructions that are uniform after vectorization. An 1805 /// instruction is uniform if we represent it with a single scalar value in 1806 /// the vectorized loop corresponding to each vector iteration. Examples of 1807 /// uniform instructions include pointer operands of consecutive or 1808 /// interleaved memory accesses. Note that although uniformity implies an 1809 /// instruction will be scalar, the reverse is not true. In general, a 1810 /// scalarized instruction will be represented by VF scalar values in the 1811 /// vectorized loop, each corresponding to an iteration of the original 1812 /// scalar loop. 1813 void collectLoopUniforms(ElementCount VF); 1814 1815 /// Collect the instructions that are scalar after vectorization. An 1816 /// instruction is scalar if it is known to be uniform or will be scalarized 1817 /// during vectorization. Non-uniform scalarized instructions will be 1818 /// represented by VF values in the vectorized loop, each corresponding to an 1819 /// iteration of the original scalar loop. 1820 void collectLoopScalars(ElementCount VF); 1821 1822 /// Keeps cost model vectorization decision and cost for instructions. 1823 /// Right now it is used for memory instructions only. 1824 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>, 1825 std::pair<InstWidening, InstructionCost>>; 1826 1827 DecisionList WideningDecisions; 1828 1829 /// Returns true if \p V is expected to be vectorized and it needs to be 1830 /// extracted. 1831 bool needsExtract(Value *V, ElementCount VF) const { 1832 Instruction *I = dyn_cast<Instruction>(V); 1833 if (VF.isScalar() || !I || !TheLoop->contains(I) || 1834 TheLoop->isLoopInvariant(I)) 1835 return false; 1836 1837 // Assume we can vectorize V (and hence we need extraction) if the 1838 // scalars are not computed yet. This can happen, because it is called 1839 // via getScalarizationOverhead from setCostBasedWideningDecision, before 1840 // the scalars are collected. That should be a safe assumption in most 1841 // cases, because we check if the operands have vectorizable types 1842 // beforehand in LoopVectorizationLegality. 1843 return Scalars.find(VF) == Scalars.end() || 1844 !isScalarAfterVectorization(I, VF); 1845 }; 1846 1847 /// Returns a range containing only operands needing to be extracted. 
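/// E.g. (illustrative) operands that are loop-invariant or that remain scalar
/// after vectorization need no extract and are filtered out; only operands
/// expected to live in vector registers are kept.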
1848 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, 1849 ElementCount VF) const { 1850 return SmallVector<Value *, 4>(make_filter_range( 1851 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); 1852 } 1853 1854 /// Determines if we have the infrastructure to vectorize loop \p L and its 1855 /// epilogue, assuming the main loop is vectorized by \p VF. 1856 bool isCandidateForEpilogueVectorization(const Loop &L, 1857 const ElementCount VF) const; 1858 1859 /// Returns true if epilogue vectorization is considered profitable, and 1860 /// false otherwise. 1861 /// \p VF is the vectorization factor chosen for the original loop. 1862 bool isEpilogueVectorizationProfitable(const ElementCount VF) const; 1863 1864 public: 1865 /// The loop that we evaluate. 1866 Loop *TheLoop; 1867 1868 /// Predicated scalar evolution analysis. 1869 PredicatedScalarEvolution &PSE; 1870 1871 /// Loop Info analysis. 1872 LoopInfo *LI; 1873 1874 /// Vectorization legality. 1875 LoopVectorizationLegality *Legal; 1876 1877 /// Vector target information. 1878 const TargetTransformInfo &TTI; 1879 1880 /// Target Library Info. 1881 const TargetLibraryInfo *TLI; 1882 1883 /// Demanded bits analysis. 1884 DemandedBits *DB; 1885 1886 /// Assumption cache. 1887 AssumptionCache *AC; 1888 1889 /// Interface to emit optimization remarks. 1890 OptimizationRemarkEmitter *ORE; 1891 1892 const Function *TheFunction; 1893 1894 /// Loop Vectorize Hint. 1895 const LoopVectorizeHints *Hints; 1896 1897 /// The interleave access information contains groups of interleaved accesses 1898 /// with the same stride and close to each other. 1899 InterleavedAccessInfo &InterleaveInfo; 1900 1901 /// Values to ignore in the cost model. 1902 SmallPtrSet<const Value *, 16> ValuesToIgnore; 1903 1904 /// Values to ignore in the cost model when VF > 1. 1905 SmallPtrSet<const Value *, 16> VecValuesToIgnore; 1906 1907 /// All element types found in the loop. 1908 SmallPtrSet<Type *, 16> ElementTypesInLoop; 1909 1910 /// Profitable vector factors. 1911 SmallVector<VectorizationFactor, 8> ProfitableVFs; 1912 }; 1913 } // end namespace llvm 1914 1915 /// Helper struct to manage generating runtime checks for vectorization. 1916 /// 1917 /// The runtime checks are created up-front in temporary blocks to allow better 1918 /// estimating the cost and un-linked from the existing IR. After deciding to 1919 /// vectorize, the checks are moved back. If deciding not to vectorize, the 1920 /// temporary blocks are completely removed. 1921 class GeneratedRTChecks { 1922 /// Basic block which contains the generated SCEV checks, if any. 1923 BasicBlock *SCEVCheckBlock = nullptr; 1924 1925 /// The value representing the result of the generated SCEV checks. If it is 1926 /// nullptr, either no SCEV checks have been generated or they have been used. 1927 Value *SCEVCheckCond = nullptr; 1928 1929 /// Basic block which contains the generated memory runtime checks, if any. 1930 BasicBlock *MemCheckBlock = nullptr; 1931 1932 /// The value representing the result of the generated memory runtime checks. 1933 /// If it is nullptr, either no memory runtime checks have been generated or 1934 /// they have been used. 
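/// (Illustrative) for two pointers A and B accessed by the loop, the
/// generated condition is typically true when the ranges [A, A+N) and
/// [B, B+N) overlap, in which case execution falls back to the scalar loop.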
1935 Value *MemRuntimeCheckCond = nullptr;
1936
1937 DominatorTree *DT;
1938 LoopInfo *LI;
1939
1940 SCEVExpander SCEVExp;
1941 SCEVExpander MemCheckExp;
1942
1943 public:
1944 GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
1945 const DataLayout &DL)
1946 : DT(DT), LI(LI), SCEVExp(SE, DL, "scev.check"),
1947 MemCheckExp(SE, DL, "scev.check") {}
1948
1949 /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
1950 /// accurately estimate the cost of the runtime checks. The blocks are
1951 /// un-linked from the IR and are added back during vector code generation. If
1952 /// there is no vector code generation, the check blocks are removed
1953 /// completely.
1954 void Create(Loop *L, const LoopAccessInfo &LAI,
1955 const SCEVUnionPredicate &UnionPred) {
1956
1957 BasicBlock *LoopHeader = L->getHeader();
1958 BasicBlock *Preheader = L->getLoopPreheader();
1959
1960 // Use SplitBlock to create blocks for SCEV & memory runtime checks to
1961 // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
1962 // may be used by SCEVExpander. The blocks will be un-linked from their
1963 // predecessors and removed from LI & DT at the end of the function.
1964 if (!UnionPred.isAlwaysTrue()) {
1965 SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
1966 nullptr, "vector.scevcheck");
1967
1968 SCEVCheckCond = SCEVExp.expandCodeForPredicate(
1969 &UnionPred, SCEVCheckBlock->getTerminator());
1970 }
1971
1972 const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
1973 if (RtPtrChecking.Need) {
1974 auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
1975 MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
1976 "vector.memcheck");
1977
1978 MemRuntimeCheckCond =
1979 addRuntimeChecks(MemCheckBlock->getTerminator(), L,
1980 RtPtrChecking.getChecks(), MemCheckExp);
1981 assert(MemRuntimeCheckCond &&
1982 "no RT checks generated although RtPtrChecking "
1983 "claimed checks are required");
1984 }
1985
1986 if (!MemCheckBlock && !SCEVCheckBlock)
1987 return;
1988
1989 // Unhook the temporary blocks with the checks and update various places
1990 // accordingly.
1991 if (SCEVCheckBlock)
1992 SCEVCheckBlock->replaceAllUsesWith(Preheader);
1993 if (MemCheckBlock)
1994 MemCheckBlock->replaceAllUsesWith(Preheader);
1995
1996 if (SCEVCheckBlock) {
1997 SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
1998 new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
1999 Preheader->getTerminator()->eraseFromParent();
2000 }
2001 if (MemCheckBlock) {
2002 MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
2003 new UnreachableInst(Preheader->getContext(), MemCheckBlock);
2004 Preheader->getTerminator()->eraseFromParent();
2005 }
2006
2007 DT->changeImmediateDominator(LoopHeader, Preheader);
2008 if (MemCheckBlock) {
2009 DT->eraseNode(MemCheckBlock);
2010 LI->removeBlock(MemCheckBlock);
2011 }
2012 if (SCEVCheckBlock) {
2013 DT->eraseNode(SCEVCheckBlock);
2014 LI->removeBlock(SCEVCheckBlock);
2015 }
2016 }
2017
2018 /// Remove the created SCEV & memory runtime check blocks & instructions, if
2019 /// unused.
2020 ~GeneratedRTChecks() { 2021 SCEVExpanderCleaner SCEVCleaner(SCEVExp, *DT); 2022 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp, *DT); 2023 if (!SCEVCheckCond) 2024 SCEVCleaner.markResultUsed(); 2025 2026 if (!MemRuntimeCheckCond) 2027 MemCheckCleaner.markResultUsed(); 2028 2029 if (MemRuntimeCheckCond) { 2030 auto &SE = *MemCheckExp.getSE(); 2031 // Memory runtime check generation creates compares that use expanded 2032 // values. Remove them before running the SCEVExpanderCleaners. 2033 for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) { 2034 if (MemCheckExp.isInsertedInstruction(&I)) 2035 continue; 2036 SE.forgetValue(&I); 2037 I.eraseFromParent(); 2038 } 2039 } 2040 MemCheckCleaner.cleanup(); 2041 SCEVCleaner.cleanup(); 2042 2043 if (SCEVCheckCond) 2044 SCEVCheckBlock->eraseFromParent(); 2045 if (MemRuntimeCheckCond) 2046 MemCheckBlock->eraseFromParent(); 2047 } 2048 2049 /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and 2050 /// adjusts the branches to branch to the vector preheader or \p Bypass, 2051 /// depending on the generated condition. 2052 BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass, 2053 BasicBlock *LoopVectorPreHeader, 2054 BasicBlock *LoopExitBlock) { 2055 if (!SCEVCheckCond) 2056 return nullptr; 2057 if (auto *C = dyn_cast<ConstantInt>(SCEVCheckCond)) 2058 if (C->isZero()) 2059 return nullptr; 2060 2061 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2062 2063 BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock); 2064 // Create new preheader for vector loop. 2065 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 2066 PL->addBasicBlockToLoop(SCEVCheckBlock, *LI); 2067 2068 SCEVCheckBlock->getTerminator()->eraseFromParent(); 2069 SCEVCheckBlock->moveBefore(LoopVectorPreHeader); 2070 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2071 SCEVCheckBlock); 2072 2073 DT->addNewBlock(SCEVCheckBlock, Pred); 2074 DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock); 2075 2076 ReplaceInstWithInst( 2077 SCEVCheckBlock->getTerminator(), 2078 BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheckCond)); 2079 // Mark the check as used, to prevent it from being removed during cleanup. 2080 SCEVCheckCond = nullptr; 2081 return SCEVCheckBlock; 2082 } 2083 2084 /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts 2085 /// the branches to branch to the vector preheader or \p Bypass, depending on 2086 /// the generated condition. 2087 BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass, 2088 BasicBlock *LoopVectorPreHeader) { 2089 // Check if we generated code that checks in runtime if arrays overlap. 2090 if (!MemRuntimeCheckCond) 2091 return nullptr; 2092 2093 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2094 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2095 MemCheckBlock); 2096 2097 DT->addNewBlock(MemCheckBlock, Pred); 2098 DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock); 2099 MemCheckBlock->moveBefore(LoopVectorPreHeader); 2100 2101 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 2102 PL->addBasicBlockToLoop(MemCheckBlock, *LI); 2103 2104 ReplaceInstWithInst( 2105 MemCheckBlock->getTerminator(), 2106 BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond)); 2107 MemCheckBlock->getTerminator()->setDebugLoc( 2108 Pred->getTerminator()->getDebugLoc()); 2109 2110 // Mark the check as used, to prevent it from being removed during cleanup. 
2111 MemRuntimeCheckCond = nullptr;
2112 return MemCheckBlock;
2113 }
2114 };
2115
2116 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
2117 // vectorization. The loop needs to be annotated with #pragma omp simd
2118 // simdlen(#) or #pragma clang loop vectorize(enable) vectorize_width(#). If the
2119 // vector length information is not provided, vectorization is not considered
2120 // explicit. Interleave hints are not allowed either. These limitations will be
2121 // relaxed in the future.
2122 // Please note that we are currently forced to abuse the pragma 'clang loop
2123 // vectorize' semantics. This pragma provides *auto-vectorization hints*
2124 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
2125 // provides *explicit vectorization hints* (LV can bypass legal checks and
2126 // assume that vectorization is legal). However, both hints are implemented
2127 // using the same metadata (llvm.loop.vectorize, processed by
2128 // LoopVectorizeHints). This will be fixed in the future when the native IR
2129 // representation for pragma 'omp simd' is introduced.
2130 static bool isExplicitVecOuterLoop(Loop *OuterLp,
2131 OptimizationRemarkEmitter *ORE) {
2132 assert(!OuterLp->isInnermost() && "This is not an outer loop");
2133 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
2134
2135 // Only outer loops with an explicit vectorization hint are supported.
2136 // Unannotated outer loops are ignored.
2137 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
2138 return false;
2139
2140 Function *Fn = OuterLp->getHeader()->getParent();
2141 if (!Hints.allowVectorization(Fn, OuterLp,
2142 true /*VectorizeOnlyWhenForced*/)) {
2143 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
2144 return false;
2145 }
2146
2147 if (Hints.getInterleave() > 1) {
2148 // TODO: Interleave support is future work.
2149 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
2150 "outer loops.\n");
2151 Hints.emitRemarkWithHints();
2152 return false;
2153 }
2154
2155 return true;
2156 }
2157
2158 static void collectSupportedLoops(Loop &L, LoopInfo *LI,
2159 OptimizationRemarkEmitter *ORE,
2160 SmallVectorImpl<Loop *> &V) {
2161 // Collect inner loops and outer loops without irreducible control flow. For
2162 // now, only collect outer loops that have explicit vectorization hints. If we
2163 // are stress testing the VPlan H-CFG construction, we collect the outermost
2164 // loop of every loop nest.
2165 if (L.isInnermost() || VPlanBuildStressTest ||
2166 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
2167 LoopBlocksRPO RPOT(&L);
2168 RPOT.perform(LI);
2169 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
2170 V.push_back(&L);
2171 // TODO: Collect inner loops inside marked outer loops in case
2172 // vectorization fails for the outer loop. Do not invoke
2173 // 'containsIrreducibleCFG' again for inner loops when the outer loop is
2174 // already known to be reducible. We can use an inherited attribute for
2175 // that.
2176 return;
2177 }
2178 }
2179 for (Loop *InnerL : L)
2180 collectSupportedLoops(*InnerL, LI, ORE, V);
2181 }
2182
2183 namespace {
2184
2185 /// The LoopVectorize Pass.
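/// (Illustrative usage) the legacy pass can be exercised with
/// 'opt -loop-vectorize', and the new pass manager equivalent with
/// 'opt -passes=loop-vectorize'.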
2186 struct LoopVectorize : public FunctionPass { 2187 /// Pass identification, replacement for typeid 2188 static char ID; 2189 2190 LoopVectorizePass Impl; 2191 2192 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false, 2193 bool VectorizeOnlyWhenForced = false) 2194 : FunctionPass(ID), 2195 Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) { 2196 initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); 2197 } 2198 2199 bool runOnFunction(Function &F) override { 2200 if (skipFunction(F)) 2201 return false; 2202 2203 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 2204 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 2205 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 2206 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 2207 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); 2208 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); 2209 auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr; 2210 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); 2211 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 2212 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); 2213 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); 2214 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 2215 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 2216 2217 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 2218 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; 2219 2220 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC, 2221 GetLAA, *ORE, PSI).MadeAnyChange; 2222 } 2223 2224 void getAnalysisUsage(AnalysisUsage &AU) const override { 2225 AU.addRequired<AssumptionCacheTracker>(); 2226 AU.addRequired<BlockFrequencyInfoWrapperPass>(); 2227 AU.addRequired<DominatorTreeWrapperPass>(); 2228 AU.addRequired<LoopInfoWrapperPass>(); 2229 AU.addRequired<ScalarEvolutionWrapperPass>(); 2230 AU.addRequired<TargetTransformInfoWrapperPass>(); 2231 AU.addRequired<AAResultsWrapperPass>(); 2232 AU.addRequired<LoopAccessLegacyAnalysis>(); 2233 AU.addRequired<DemandedBitsWrapperPass>(); 2234 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 2235 AU.addRequired<InjectTLIMappingsLegacy>(); 2236 2237 // We currently do not preserve loopinfo/dominator analyses with outer loop 2238 // vectorization. Until this is addressed, mark these analyses as preserved 2239 // only for non-VPlan-native path. 2240 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 2241 if (!EnableVPlanNativePath) { 2242 AU.addPreserved<LoopInfoWrapperPass>(); 2243 AU.addPreserved<DominatorTreeWrapperPass>(); 2244 } 2245 2246 AU.addPreserved<BasicAAWrapperPass>(); 2247 AU.addPreserved<GlobalsAAWrapperPass>(); 2248 AU.addRequired<ProfileSummaryInfoWrapperPass>(); 2249 } 2250 }; 2251 2252 } // end anonymous namespace 2253 2254 //===----------------------------------------------------------------------===// 2255 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 2256 // LoopVectorizationCostModel and LoopVectorizationPlanner. 2257 //===----------------------------------------------------------------------===// 2258 2259 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { 2260 // We need to place the broadcast of invariant variables outside the loop, 2261 // but only if it's proven safe to do so. Else, broadcast will be inside 2262 // vector loop body. 
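// A sketch of the IR emitted for VF = 4 (illustrative; CreateVectorSplat
// chooses the exact form):
//   %splatinsert = insertelement <4 x i32> poison, i32 %v, i64 0
//   %broadcast   = shufflevector <4 x i32> %splatinsert, <4 x i32> poison,
//                                <4 x i32> zeroinitializer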
2263 Instruction *Instr = dyn_cast<Instruction>(V); 2264 bool SafeToHoist = OrigLoop->isLoopInvariant(V) && 2265 (!Instr || 2266 DT->dominates(Instr->getParent(), LoopVectorPreHeader)); 2267 // Place the code for broadcasting invariant variables in the new preheader. 2268 IRBuilder<>::InsertPointGuard Guard(Builder); 2269 if (SafeToHoist) 2270 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2271 2272 // Broadcast the scalar into all locations in the vector. 2273 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); 2274 2275 return Shuf; 2276 } 2277 2278 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( 2279 const InductionDescriptor &II, Value *Step, Value *Start, 2280 Instruction *EntryVal, VPValue *Def, VPValue *CastDef, 2281 VPTransformState &State) { 2282 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 2283 "Expected either an induction phi-node or a truncate of it!"); 2284 2285 // Construct the initial value of the vector IV in the vector loop preheader 2286 auto CurrIP = Builder.saveIP(); 2287 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2288 if (isa<TruncInst>(EntryVal)) { 2289 assert(Start->getType()->isIntegerTy() && 2290 "Truncation requires an integer type"); 2291 auto *TruncType = cast<IntegerType>(EntryVal->getType()); 2292 Step = Builder.CreateTrunc(Step, TruncType); 2293 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); 2294 } 2295 2296 Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0); 2297 Value *SplatStart = Builder.CreateVectorSplat(VF, Start); 2298 Value *SteppedStart = 2299 getStepVector(SplatStart, Zero, Step, II.getInductionOpcode()); 2300 2301 // We create vector phi nodes for both integer and floating-point induction 2302 // variables. Here, we determine the kind of arithmetic we will perform. 2303 Instruction::BinaryOps AddOp; 2304 Instruction::BinaryOps MulOp; 2305 if (Step->getType()->isIntegerTy()) { 2306 AddOp = Instruction::Add; 2307 MulOp = Instruction::Mul; 2308 } else { 2309 AddOp = II.getInductionOpcode(); 2310 MulOp = Instruction::FMul; 2311 } 2312 2313 // Multiply the vectorization factor by the step using integer or 2314 // floating-point arithmetic as appropriate. 2315 Type *StepType = Step->getType(); 2316 Value *RuntimeVF; 2317 if (Step->getType()->isFloatingPointTy()) 2318 RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, VF); 2319 else 2320 RuntimeVF = getRuntimeVF(Builder, StepType, VF); 2321 Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF); 2322 2323 // Create a vector splat to use in the induction update. 2324 // 2325 // FIXME: If the step is non-constant, we create the vector splat with 2326 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't 2327 // handle a constant vector splat. 2328 Value *SplatVF = isa<Constant>(Mul) 2329 ? ConstantVector::getSplat(VF, cast<Constant>(Mul)) 2330 : Builder.CreateVectorSplat(VF, Mul); 2331 Builder.restoreIP(CurrIP); 2332 2333 // We may need to add the step a number of times, depending on the unroll 2334 // factor. The last of those goes into the PHI. 
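// E.g. (illustrative) for UF = 2, VF = 4 and step 1: part 0 holds
// <i, i+1, i+2, i+3>, part 1 adds SplatVF (= 4 * step) to that, and the value
// fed back into the PHI adds SplatVF once more, so the induction advances by
// UF * VF * step per vector iteration.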
2335 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", 2336 &*LoopVectorBody->getFirstInsertionPt()); 2337 VecInd->setDebugLoc(EntryVal->getDebugLoc()); 2338 Instruction *LastInduction = VecInd; 2339 for (unsigned Part = 0; Part < UF; ++Part) { 2340 State.set(Def, LastInduction, Part); 2341 2342 if (isa<TruncInst>(EntryVal)) 2343 addMetadata(LastInduction, EntryVal); 2344 recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, CastDef, 2345 State, Part); 2346 2347 LastInduction = cast<Instruction>( 2348 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")); 2349 LastInduction->setDebugLoc(EntryVal->getDebugLoc()); 2350 } 2351 2352 // Move the last step to the end of the latch block. This ensures consistent 2353 // placement of all induction updates. 2354 auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 2355 auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator()); 2356 auto *ICmp = cast<Instruction>(Br->getCondition()); 2357 LastInduction->moveBefore(ICmp); 2358 LastInduction->setName("vec.ind.next"); 2359 2360 VecInd->addIncoming(SteppedStart, LoopVectorPreHeader); 2361 VecInd->addIncoming(LastInduction, LoopVectorLatch); 2362 } 2363 2364 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const { 2365 return Cost->isScalarAfterVectorization(I, VF) || 2366 Cost->isProfitableToScalarize(I, VF); 2367 } 2368 2369 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const { 2370 if (shouldScalarizeInstruction(IV)) 2371 return true; 2372 auto isScalarInst = [&](User *U) -> bool { 2373 auto *I = cast<Instruction>(U); 2374 return (OrigLoop->contains(I) && shouldScalarizeInstruction(I)); 2375 }; 2376 return llvm::any_of(IV->users(), isScalarInst); 2377 } 2378 2379 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast( 2380 const InductionDescriptor &ID, const Instruction *EntryVal, 2381 Value *VectorLoopVal, VPValue *CastDef, VPTransformState &State, 2382 unsigned Part, unsigned Lane) { 2383 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 2384 "Expected either an induction phi-node or a truncate of it!"); 2385 2386 // This induction variable is not the phi from the original loop but the 2387 // newly-created IV based on the proof that casted Phi is equal to the 2388 // uncasted Phi in the vectorized loop (under a runtime guard possibly). It 2389 // re-uses the same InductionDescriptor that original IV uses but we don't 2390 // have to do any recording in this case - that is done when original IV is 2391 // processed. 2392 if (isa<TruncInst>(EntryVal)) 2393 return; 2394 2395 const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts(); 2396 if (Casts.empty()) 2397 return; 2398 // Only the first Cast instruction in the Casts vector is of interest. 2399 // The rest of the Casts (if exist) have no uses outside the 2400 // induction update chain itself. 
2401 if (Lane < UINT_MAX) 2402 State.set(CastDef, VectorLoopVal, VPIteration(Part, Lane)); 2403 else 2404 State.set(CastDef, VectorLoopVal, Part); 2405 } 2406 2407 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start, 2408 TruncInst *Trunc, VPValue *Def, 2409 VPValue *CastDef, 2410 VPTransformState &State) { 2411 assert((IV->getType()->isIntegerTy() || IV != OldInduction) && 2412 "Primary induction variable must have an integer type"); 2413 2414 auto II = Legal->getInductionVars().find(IV); 2415 assert(II != Legal->getInductionVars().end() && "IV is not an induction"); 2416 2417 auto ID = II->second; 2418 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); 2419 2420 // The value from the original loop to which we are mapping the new induction 2421 // variable. 2422 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; 2423 2424 auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 2425 2426 // Generate code for the induction step. Note that induction steps are 2427 // required to be loop-invariant 2428 auto CreateStepValue = [&](const SCEV *Step) -> Value * { 2429 assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) && 2430 "Induction step should be loop invariant"); 2431 if (PSE.getSE()->isSCEVable(IV->getType())) { 2432 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 2433 return Exp.expandCodeFor(Step, Step->getType(), 2434 LoopVectorPreHeader->getTerminator()); 2435 } 2436 return cast<SCEVUnknown>(Step)->getValue(); 2437 }; 2438 2439 // The scalar value to broadcast. This is derived from the canonical 2440 // induction variable. If a truncation type is given, truncate the canonical 2441 // induction variable and step. Otherwise, derive these values from the 2442 // induction descriptor. 2443 auto CreateScalarIV = [&](Value *&Step) -> Value * { 2444 Value *ScalarIV = Induction; 2445 if (IV != OldInduction) { 2446 ScalarIV = IV->getType()->isIntegerTy() 2447 ? Builder.CreateSExtOrTrunc(Induction, IV->getType()) 2448 : Builder.CreateCast(Instruction::SIToFP, Induction, 2449 IV->getType()); 2450 ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID); 2451 ScalarIV->setName("offset.idx"); 2452 } 2453 if (Trunc) { 2454 auto *TruncType = cast<IntegerType>(Trunc->getType()); 2455 assert(Step->getType()->isIntegerTy() && 2456 "Truncation requires an integer step"); 2457 ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType); 2458 Step = Builder.CreateTrunc(Step, TruncType); 2459 } 2460 return ScalarIV; 2461 }; 2462 2463 // Create the vector values from the scalar IV, in the absence of creating a 2464 // vector IV. 2465 auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) { 2466 Value *Broadcasted = getBroadcastInstrs(ScalarIV); 2467 for (unsigned Part = 0; Part < UF; ++Part) { 2468 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2469 Value *StartIdx; 2470 if (Step->getType()->isFloatingPointTy()) 2471 StartIdx = getRuntimeVFAsFloat(Builder, Step->getType(), VF * Part); 2472 else 2473 StartIdx = getRuntimeVF(Builder, Step->getType(), VF * Part); 2474 2475 Value *EntryPart = 2476 getStepVector(Broadcasted, StartIdx, Step, ID.getInductionOpcode()); 2477 State.set(Def, EntryPart, Part); 2478 if (Trunc) 2479 addMetadata(EntryPart, Trunc); 2480 recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, CastDef, 2481 State, Part); 2482 } 2483 }; 2484 2485 // Fast-math-flags propagate from the original induction instruction. 
2486 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 2487 if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp())) 2488 Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags()); 2489 2490 // Now do the actual transformations, and start with creating the step value. 2491 Value *Step = CreateStepValue(ID.getStep()); 2492 if (VF.isZero() || VF.isScalar()) { 2493 Value *ScalarIV = CreateScalarIV(Step); 2494 CreateSplatIV(ScalarIV, Step); 2495 return; 2496 } 2497 2498 // Determine if we want a scalar version of the induction variable. This is 2499 // true if the induction variable itself is not widened, or if it has at 2500 // least one user in the loop that is not widened. 2501 auto NeedsScalarIV = needsScalarInduction(EntryVal); 2502 if (!NeedsScalarIV) { 2503 createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef, 2504 State); 2505 return; 2506 } 2507 2508 // Try to create a new independent vector induction variable. If we can't 2509 // create the phi node, we will splat the scalar induction variable in each 2510 // loop iteration. 2511 if (!shouldScalarizeInstruction(EntryVal)) { 2512 createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef, 2513 State); 2514 Value *ScalarIV = CreateScalarIV(Step); 2515 // Create scalar steps that can be used by instructions we will later 2516 // scalarize. Note that the addition of the scalar steps will not increase 2517 // the number of instructions in the loop in the common case prior to 2518 // InstCombine. We will be trading one vector extract for each scalar step. 2519 buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State); 2520 return; 2521 } 2522 2523 // All IV users are scalar instructions, so only emit a scalar IV, not a 2524 // vectorised IV. Except when we tail-fold, then the splat IV feeds the 2525 // predicate used by the masked loads/stores. 2526 Value *ScalarIV = CreateScalarIV(Step); 2527 if (!Cost->isScalarEpilogueAllowed()) 2528 CreateSplatIV(ScalarIV, Step); 2529 buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State); 2530 } 2531 2532 Value *InnerLoopVectorizer::getStepVector(Value *Val, Value *StartIdx, 2533 Value *Step, 2534 Instruction::BinaryOps BinOp) { 2535 // Create and check the types. 2536 auto *ValVTy = cast<VectorType>(Val->getType()); 2537 ElementCount VLen = ValVTy->getElementCount(); 2538 2539 Type *STy = Val->getType()->getScalarType(); 2540 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && 2541 "Induction Step must be an integer or FP"); 2542 assert(Step->getType() == STy && "Step has wrong type"); 2543 2544 SmallVector<Constant *, 8> Indices; 2545 2546 // Create a vector of consecutive numbers from zero to VF. 2547 VectorType *InitVecValVTy = ValVTy; 2548 Type *InitVecValSTy = STy; 2549 if (STy->isFloatingPointTy()) { 2550 InitVecValSTy = 2551 IntegerType::get(STy->getContext(), STy->getScalarSizeInBits()); 2552 InitVecValVTy = VectorType::get(InitVecValSTy, VLen); 2553 } 2554 Value *InitVec = Builder.CreateStepVector(InitVecValVTy); 2555 2556 // Splat the StartIdx 2557 Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx); 2558 2559 if (STy->isIntegerTy()) { 2560 InitVec = Builder.CreateAdd(InitVec, StartIdxSplat); 2561 Step = Builder.CreateVectorSplat(VLen, Step); 2562 assert(Step->getType() == Val->getType() && "Invalid step vec"); 2563 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 2564 // which can be found from the original scalar operations. 
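// E.g. (illustrative) for VF = 4, start index 2 and step s, InitVec is
// <2, 3, 4, 5>, so the result below is Val + <2*s, 3*s, 4*s, 5*s>.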
2565 Step = Builder.CreateMul(InitVec, Step); 2566 return Builder.CreateAdd(Val, Step, "induction"); 2567 } 2568 2569 // Floating point induction. 2570 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && 2571 "Binary Opcode should be specified for FP induction"); 2572 InitVec = Builder.CreateUIToFP(InitVec, ValVTy); 2573 InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat); 2574 2575 Step = Builder.CreateVectorSplat(VLen, Step); 2576 Value *MulOp = Builder.CreateFMul(InitVec, Step); 2577 return Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); 2578 } 2579 2580 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, 2581 Instruction *EntryVal, 2582 const InductionDescriptor &ID, 2583 VPValue *Def, VPValue *CastDef, 2584 VPTransformState &State) { 2585 // We shouldn't have to build scalar steps if we aren't vectorizing. 2586 assert(VF.isVector() && "VF should be greater than one"); 2587 // Get the value type and ensure it and the step have the same integer type. 2588 Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); 2589 assert(ScalarIVTy == Step->getType() && 2590 "Val and Step should have the same type"); 2591 2592 // We build scalar steps for both integer and floating-point induction 2593 // variables. Here, we determine the kind of arithmetic we will perform. 2594 Instruction::BinaryOps AddOp; 2595 Instruction::BinaryOps MulOp; 2596 if (ScalarIVTy->isIntegerTy()) { 2597 AddOp = Instruction::Add; 2598 MulOp = Instruction::Mul; 2599 } else { 2600 AddOp = ID.getInductionOpcode(); 2601 MulOp = Instruction::FMul; 2602 } 2603 2604 // Determine the number of scalars we need to generate for each unroll 2605 // iteration. If EntryVal is uniform, we only need to generate the first 2606 // lane. Otherwise, we generate all VF values. 2607 bool IsUniform = 2608 Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF); 2609 unsigned Lanes = IsUniform ? 1 : VF.getKnownMinValue(); 2610 // Compute the scalar steps and save the results in State. 2611 Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(), 2612 ScalarIVTy->getScalarSizeInBits()); 2613 Type *VecIVTy = nullptr; 2614 Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr; 2615 if (!IsUniform && VF.isScalable()) { 2616 VecIVTy = VectorType::get(ScalarIVTy, VF); 2617 UnitStepVec = Builder.CreateStepVector(VectorType::get(IntStepTy, VF)); 2618 SplatStep = Builder.CreateVectorSplat(VF, Step); 2619 SplatIV = Builder.CreateVectorSplat(VF, ScalarIV); 2620 } 2621 2622 for (unsigned Part = 0; Part < UF; ++Part) { 2623 Value *StartIdx0 = createStepForVF(Builder, IntStepTy, VF, Part); 2624 2625 if (!IsUniform && VF.isScalable()) { 2626 auto *SplatStartIdx = Builder.CreateVectorSplat(VF, StartIdx0); 2627 auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec); 2628 if (ScalarIVTy->isFloatingPointTy()) 2629 InitVec = Builder.CreateSIToFP(InitVec, VecIVTy); 2630 auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep); 2631 auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul); 2632 State.set(Def, Add, Part); 2633 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State, 2634 Part); 2635 // It's useful to record the lane values too for the known minimum number 2636 // of elements so we do those below. This improves the code quality when 2637 // trying to extract the first element, for example. 
2638 } 2639 2640 if (ScalarIVTy->isFloatingPointTy()) 2641 StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy); 2642 2643 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 2644 Value *StartIdx = Builder.CreateBinOp( 2645 AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane)); 2646 // The step returned by `createStepForVF` is a runtime-evaluated value 2647 // when VF is scalable. Otherwise, it should be folded into a Constant. 2648 assert((VF.isScalable() || isa<Constant>(StartIdx)) && 2649 "Expected StartIdx to be folded to a constant when VF is not " 2650 "scalable"); 2651 auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step); 2652 auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul); 2653 State.set(Def, Add, VPIteration(Part, Lane)); 2654 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State, 2655 Part, Lane); 2656 } 2657 } 2658 } 2659 2660 void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def, 2661 const VPIteration &Instance, 2662 VPTransformState &State) { 2663 Value *ScalarInst = State.get(Def, Instance); 2664 Value *VectorValue = State.get(Def, Instance.Part); 2665 VectorValue = Builder.CreateInsertElement( 2666 VectorValue, ScalarInst, 2667 Instance.Lane.getAsRuntimeExpr(State.Builder, VF)); 2668 State.set(Def, VectorValue, Instance.Part); 2669 } 2670 2671 Value *InnerLoopVectorizer::reverseVector(Value *Vec) { 2672 assert(Vec->getType()->isVectorTy() && "Invalid type"); 2673 return Builder.CreateVectorReverse(Vec, "reverse"); 2674 } 2675 2676 // Return whether we allow using masked interleave-groups (for dealing with 2677 // strided loads/stores that reside in predicated blocks, or for dealing 2678 // with gaps). 2679 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { 2680 // If an override option has been passed in for interleaved accesses, use it. 2681 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) 2682 return EnableMaskedInterleavedMemAccesses; 2683 2684 return TTI.enableMaskedInterleavedAccessVectorization(); 2685 } 2686 2687 // Try to vectorize the interleave group that \p Instr belongs to. 2688 // 2689 // E.g. Translate following interleaved load group (factor = 3): 2690 // for (i = 0; i < N; i+=3) { 2691 // R = Pic[i]; // Member of index 0 2692 // G = Pic[i+1]; // Member of index 1 2693 // B = Pic[i+2]; // Member of index 2 2694 // ... // do something to R, G, B 2695 // } 2696 // To: 2697 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2698 // %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements 2699 // %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements 2700 // %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements 2701 // 2702 // Or translate following interleaved store group (factor = 3): 2703 // for (i = 0; i < N; i+=3) { 2704 // ... 
do something to R, G, B 2705 // Pic[i] = R; // Member of index 0 2706 // Pic[i+1] = G; // Member of index 1 2707 // Pic[i+2] = B; // Member of index 2 2708 // } 2709 // To: 2710 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> 2711 // %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u> 2712 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, 2713 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements 2714 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B 2715 void InnerLoopVectorizer::vectorizeInterleaveGroup( 2716 const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs, 2717 VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues, 2718 VPValue *BlockInMask) { 2719 Instruction *Instr = Group->getInsertPos(); 2720 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2721 2722 // Prepare for the vector type of the interleaved load/store. 2723 Type *ScalarTy = getLoadStoreType(Instr); 2724 unsigned InterleaveFactor = Group->getFactor(); 2725 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2726 auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor); 2727 2728 // Prepare for the new pointers. 2729 SmallVector<Value *, 2> AddrParts; 2730 unsigned Index = Group->getIndex(Instr); 2731 2732 // TODO: extend the masked interleaved-group support to reversed access. 2733 assert((!BlockInMask || !Group->isReverse()) && 2734 "Reversed masked interleave-group not supported."); 2735 2736 // If the group is reverse, adjust the index to refer to the last vector lane 2737 // instead of the first. We adjust the index from the first vector lane, 2738 // rather than directly getting the pointer for lane VF - 1, because the 2739 // pointer operand of the interleaved access is supposed to be uniform. For 2740 // uniform instructions, we're only required to generate a value for the 2741 // first vector lane in each unroll iteration. 2742 if (Group->isReverse()) 2743 Index += (VF.getKnownMinValue() - 1) * Group->getFactor(); 2744 2745 for (unsigned Part = 0; Part < UF; Part++) { 2746 Value *AddrPart = State.get(Addr, VPIteration(Part, 0)); 2747 setDebugLocFromInst(AddrPart); 2748 2749 // Notice current instruction could be any index. Need to adjust the address 2750 // to the member of index 0. 2751 // 2752 // E.g. a = A[i+1]; // Member of index 1 (Current instruction) 2753 // b = A[i]; // Member of index 0 2754 // Current pointer is pointed to A[i+1], adjust it to A[i]. 2755 // 2756 // E.g. A[i+1] = a; // Member of index 1 2757 // A[i] = b; // Member of index 0 2758 // A[i+2] = c; // Member of index 2 (Current instruction) 2759 // Current pointer is pointed to A[i+2], adjust it to A[i]. 2760 2761 bool InBounds = false; 2762 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts())) 2763 InBounds = gep->isInBounds(); 2764 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index)); 2765 cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds); 2766 2767 // Cast to the vector pointer type. 
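    // For example (illustrative, assuming a fixed VF = 4, factor = 3 and i32
    // members): the member-0 address computed above is bitcast to <12 x i32>*
    // below, so a single wide load or store covers all interleaved members of
    // this unroll part.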
2768 unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace(); 2769 Type *PtrTy = VecTy->getPointerTo(AddressSpace); 2770 AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy)); 2771 } 2772 2773 setDebugLocFromInst(Instr); 2774 Value *PoisonVec = PoisonValue::get(VecTy); 2775 2776 Value *MaskForGaps = nullptr; 2777 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) { 2778 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); 2779 assert(MaskForGaps && "Mask for Gaps is required but it is null"); 2780 } 2781 2782 // Vectorize the interleaved load group. 2783 if (isa<LoadInst>(Instr)) { 2784 // For each unroll part, create a wide load for the group. 2785 SmallVector<Value *, 2> NewLoads; 2786 for (unsigned Part = 0; Part < UF; Part++) { 2787 Instruction *NewLoad; 2788 if (BlockInMask || MaskForGaps) { 2789 assert(useMaskedInterleavedAccesses(*TTI) && 2790 "masked interleaved groups are not allowed."); 2791 Value *GroupMask = MaskForGaps; 2792 if (BlockInMask) { 2793 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2794 Value *ShuffledMask = Builder.CreateShuffleVector( 2795 BlockInMaskPart, 2796 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2797 "interleaved.mask"); 2798 GroupMask = MaskForGaps 2799 ? Builder.CreateBinOp(Instruction::And, ShuffledMask, 2800 MaskForGaps) 2801 : ShuffledMask; 2802 } 2803 NewLoad = 2804 Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(), 2805 GroupMask, PoisonVec, "wide.masked.vec"); 2806 } 2807 else 2808 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], 2809 Group->getAlign(), "wide.vec"); 2810 Group->addMetadata(NewLoad); 2811 NewLoads.push_back(NewLoad); 2812 } 2813 2814 // For each member in the group, shuffle out the appropriate data from the 2815 // wide loads. 2816 unsigned J = 0; 2817 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2818 Instruction *Member = Group->getMember(I); 2819 2820 // Skip the gaps in the group. 2821 if (!Member) 2822 continue; 2823 2824 auto StrideMask = 2825 createStrideMask(I, InterleaveFactor, VF.getKnownMinValue()); 2826 for (unsigned Part = 0; Part < UF; Part++) { 2827 Value *StridedVec = Builder.CreateShuffleVector( 2828 NewLoads[Part], StrideMask, "strided.vec"); 2829 2830 // If this member has different type, cast the result type. 2831 if (Member->getType() != ScalarTy) { 2832 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2833 VectorType *OtherVTy = VectorType::get(Member->getType(), VF); 2834 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); 2835 } 2836 2837 if (Group->isReverse()) 2838 StridedVec = reverseVector(StridedVec); 2839 2840 State.set(VPDefs[J], StridedVec, Part); 2841 } 2842 ++J; 2843 } 2844 return; 2845 } 2846 2847 // The sub vector type for current instruction. 2848 auto *SubVT = VectorType::get(ScalarTy, VF); 2849 2850 // Vectorize the interleaved store group. 2851 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); 2852 assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) && 2853 "masked interleaved groups are not allowed."); 2854 assert((!MaskForGaps || !VF.isScalable()) && 2855 "masking gaps for scalable vectors is not yet supported."); 2856 for (unsigned Part = 0; Part < UF; Part++) { 2857 // Collect the stored vector from each member. 
2858 SmallVector<Value *, 4> StoredVecs; 2859 for (unsigned i = 0; i < InterleaveFactor; i++) { 2860 assert((Group->getMember(i) || MaskForGaps) && 2861 "Fail to get a member from an interleaved store group"); 2862 Instruction *Member = Group->getMember(i); 2863 2864 // Skip the gaps in the group. 2865 if (!Member) { 2866 Value *Undef = PoisonValue::get(SubVT); 2867 StoredVecs.push_back(Undef); 2868 continue; 2869 } 2870 2871 Value *StoredVec = State.get(StoredValues[i], Part); 2872 2873 if (Group->isReverse()) 2874 StoredVec = reverseVector(StoredVec); 2875 2876 // If this member has different type, cast it to a unified type. 2877 2878 if (StoredVec->getType() != SubVT) 2879 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL); 2880 2881 StoredVecs.push_back(StoredVec); 2882 } 2883 2884 // Concatenate all vectors into a wide vector. 2885 Value *WideVec = concatenateVectors(Builder, StoredVecs); 2886 2887 // Interleave the elements in the wide vector. 2888 Value *IVec = Builder.CreateShuffleVector( 2889 WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor), 2890 "interleaved.vec"); 2891 2892 Instruction *NewStoreInstr; 2893 if (BlockInMask || MaskForGaps) { 2894 Value *GroupMask = MaskForGaps; 2895 if (BlockInMask) { 2896 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2897 Value *ShuffledMask = Builder.CreateShuffleVector( 2898 BlockInMaskPart, 2899 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2900 "interleaved.mask"); 2901 GroupMask = MaskForGaps ? Builder.CreateBinOp(Instruction::And, 2902 ShuffledMask, MaskForGaps) 2903 : ShuffledMask; 2904 } 2905 NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part], 2906 Group->getAlign(), GroupMask); 2907 } else 2908 NewStoreInstr = 2909 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign()); 2910 2911 Group->addMetadata(NewStoreInstr); 2912 } 2913 } 2914 2915 void InnerLoopVectorizer::vectorizeMemoryInstruction( 2916 Instruction *Instr, VPTransformState &State, VPValue *Def, VPValue *Addr, 2917 VPValue *StoredValue, VPValue *BlockInMask, bool ConsecutiveStride, 2918 bool Reverse) { 2919 // Attempt to issue a wide load. 2920 LoadInst *LI = dyn_cast<LoadInst>(Instr); 2921 StoreInst *SI = dyn_cast<StoreInst>(Instr); 2922 2923 assert((LI || SI) && "Invalid Load/Store instruction"); 2924 assert((!SI || StoredValue) && "No stored value provided for widened store"); 2925 assert((!LI || !StoredValue) && "Stored value provided for widened load"); 2926 2927 Type *ScalarDataTy = getLoadStoreType(Instr); 2928 2929 auto *DataTy = VectorType::get(ScalarDataTy, VF); 2930 const Align Alignment = getLoadStoreAlignment(Instr); 2931 bool CreateGatherScatter = !ConsecutiveStride; 2932 2933 VectorParts BlockInMaskParts(UF); 2934 bool isMaskRequired = BlockInMask; 2935 if (isMaskRequired) 2936 for (unsigned Part = 0; Part < UF; ++Part) 2937 BlockInMaskParts[Part] = State.get(BlockInMask, Part); 2938 2939 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { 2940 // Calculate the pointer for the specific unroll-part. 2941 GetElementPtrInst *PartPtr = nullptr; 2942 2943 bool InBounds = false; 2944 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) 2945 InBounds = gep->isInBounds(); 2946 if (Reverse) { 2947 // If the address is consecutive but reversed, then the 2948 // wide store needs to start at the last vector element. 
2949 // RunTimeVF = VScale * VF.getKnownMinValue() 2950 // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue() 2951 Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), VF); 2952 // NumElt = -Part * RunTimeVF 2953 Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF); 2954 // LastLane = 1 - RunTimeVF 2955 Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF); 2956 PartPtr = 2957 cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt)); 2958 PartPtr->setIsInBounds(InBounds); 2959 PartPtr = cast<GetElementPtrInst>( 2960 Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane)); 2961 PartPtr->setIsInBounds(InBounds); 2962 if (isMaskRequired) // Reverse of a null all-one mask is a null mask. 2963 BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]); 2964 } else { 2965 Value *Increment = 2966 createStepForVF(Builder, Builder.getInt32Ty(), VF, Part); 2967 PartPtr = cast<GetElementPtrInst>( 2968 Builder.CreateGEP(ScalarDataTy, Ptr, Increment)); 2969 PartPtr->setIsInBounds(InBounds); 2970 } 2971 2972 unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); 2973 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); 2974 }; 2975 2976 // Handle Stores: 2977 if (SI) { 2978 setDebugLocFromInst(SI); 2979 2980 for (unsigned Part = 0; Part < UF; ++Part) { 2981 Instruction *NewSI = nullptr; 2982 Value *StoredVal = State.get(StoredValue, Part); 2983 if (CreateGatherScatter) { 2984 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 2985 Value *VectorGep = State.get(Addr, Part); 2986 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, 2987 MaskPart); 2988 } else { 2989 if (Reverse) { 2990 // If we store to reverse consecutive memory locations, then we need 2991 // to reverse the order of elements in the stored value. 2992 StoredVal = reverseVector(StoredVal); 2993 // We don't want to update the value in the map as it might be used in 2994 // another expression. So don't call resetVectorValue(StoredVal). 2995 } 2996 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0))); 2997 if (isMaskRequired) 2998 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, 2999 BlockInMaskParts[Part]); 3000 else 3001 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); 3002 } 3003 addMetadata(NewSI, SI); 3004 } 3005 return; 3006 } 3007 3008 // Handle loads. 3009 assert(LI && "Must have a load instruction"); 3010 setDebugLocFromInst(LI); 3011 for (unsigned Part = 0; Part < UF; ++Part) { 3012 Value *NewLI; 3013 if (CreateGatherScatter) { 3014 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 3015 Value *VectorGep = State.get(Addr, Part); 3016 NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart, 3017 nullptr, "wide.masked.gather"); 3018 addMetadata(NewLI, LI); 3019 } else { 3020 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0))); 3021 if (isMaskRequired) 3022 NewLI = Builder.CreateMaskedLoad( 3023 DataTy, VecPtr, Alignment, BlockInMaskParts[Part], 3024 PoisonValue::get(DataTy), "wide.masked.load"); 3025 else 3026 NewLI = 3027 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); 3028 3029 // Add metadata to the load, but setVectorValue to the reverse shuffle. 
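      // For example (illustrative, assuming a fixed VF = 4 and a reversed
      // consecutive access): the wide load above reads the elements in memory
      // order, roughly {a[i-3], a[i-2], a[i-1], a[i]}, and the reverse shuffle
      // below reorders them so that lane 0 holds the value of the first scalar
      // iteration of this part.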
3030 addMetadata(NewLI, LI); 3031 if (Reverse) 3032 NewLI = reverseVector(NewLI); 3033 } 3034 3035 State.set(Def, NewLI, Part); 3036 } 3037 } 3038 3039 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPValue *Def, 3040 VPUser &User, 3041 const VPIteration &Instance, 3042 bool IfPredicateInstr, 3043 VPTransformState &State) { 3044 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 3045 3046 // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for 3047 // the first lane and part. 3048 if (isa<NoAliasScopeDeclInst>(Instr)) 3049 if (!Instance.isFirstIteration()) 3050 return; 3051 3052 setDebugLocFromInst(Instr); 3053 3054 // Does this instruction return a value ? 3055 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 3056 3057 Instruction *Cloned = Instr->clone(); 3058 if (!IsVoidRetTy) 3059 Cloned->setName(Instr->getName() + ".cloned"); 3060 3061 State.Builder.SetInsertPoint(Builder.GetInsertBlock(), 3062 Builder.GetInsertPoint()); 3063 // Replace the operands of the cloned instructions with their scalar 3064 // equivalents in the new loop. 3065 for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) { 3066 auto *Operand = dyn_cast<Instruction>(Instr->getOperand(op)); 3067 auto InputInstance = Instance; 3068 if (!Operand || !OrigLoop->contains(Operand) || 3069 (Cost->isUniformAfterVectorization(Operand, State.VF))) 3070 InputInstance.Lane = VPLane::getFirstLane(); 3071 auto *NewOp = State.get(User.getOperand(op), InputInstance); 3072 Cloned->setOperand(op, NewOp); 3073 } 3074 addNewMetadata(Cloned, Instr); 3075 3076 // Place the cloned scalar in the new loop. 3077 Builder.Insert(Cloned); 3078 3079 State.set(Def, Cloned, Instance); 3080 3081 // If we just cloned a new assumption, add it the assumption cache. 3082 if (auto *II = dyn_cast<AssumeInst>(Cloned)) 3083 AC->registerAssumption(II); 3084 3085 // End if-block. 3086 if (IfPredicateInstr) 3087 PredicatedInstructions.push_back(Cloned); 3088 } 3089 3090 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start, 3091 Value *End, Value *Step, 3092 Instruction *DL) { 3093 BasicBlock *Header = L->getHeader(); 3094 BasicBlock *Latch = L->getLoopLatch(); 3095 // As we're just creating this loop, it's possible no latch exists 3096 // yet. If so, use the header as this will be a single block loop. 3097 if (!Latch) 3098 Latch = Header; 3099 3100 IRBuilder<> B(&*Header->getFirstInsertionPt()); 3101 Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction); 3102 setDebugLocFromInst(OldInst, &B); 3103 auto *Induction = B.CreatePHI(Start->getType(), 2, "index"); 3104 3105 B.SetInsertPoint(Latch->getTerminator()); 3106 setDebugLocFromInst(OldInst, &B); 3107 3108 // Create i+1 and fill the PHINode. 3109 // 3110 // If the tail is not folded, we know that End - Start >= Step (either 3111 // statically or through the minimum iteration checks). We also know that both 3112 // Start % Step == 0 and End % Step == 0. We exit the vector loop if %IV + 3113 // %Step == %End. Hence we must exit the loop before %IV + %Step unsigned 3114 // overflows and we can mark the induction increment as NUW. 3115 Value *Next = B.CreateAdd(Induction, Step, "index.next", 3116 /*NUW=*/!Cost->foldTailByMasking(), /*NSW=*/false); 3117 Induction->addIncoming(Start, L->getLoopPreheader()); 3118 Induction->addIncoming(Next, Latch); 3119 // Create the compare. 
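  // Illustrative sketch of the resulting canonical induction (assuming
  // VF * UF = 8, a start value of 0 and a vector trip count %n.vec; all names
  // are illustrative):
  //   vector.body:
  //     %index      = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  //     %index.next = add nuw i64 %index, 8
  //     %cmp        = icmp eq i64 %index.next, %n.vec
  //     br i1 %cmp, label %middle.block, label %vector.body
  // The nuw flag is only set when the tail is not folded by masking, per the
  // reasoning above.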
3120 Value *ICmp = B.CreateICmpEQ(Next, End); 3121 B.CreateCondBr(ICmp, L->getUniqueExitBlock(), Header); 3122 3123 // Now we have two terminators. Remove the old one from the block. 3124 Latch->getTerminator()->eraseFromParent(); 3125 3126 return Induction; 3127 } 3128 3129 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { 3130 if (TripCount) 3131 return TripCount; 3132 3133 assert(L && "Create Trip Count for null loop."); 3134 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 3135 // Find the loop boundaries. 3136 ScalarEvolution *SE = PSE.getSE(); 3137 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 3138 assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && 3139 "Invalid loop count"); 3140 3141 Type *IdxTy = Legal->getWidestInductionType(); 3142 assert(IdxTy && "No type for induction"); 3143 3144 // The exit count might have the type of i64 while the phi is i32. This can 3145 // happen if we have an induction variable that is sign extended before the 3146 // compare. The only way that we get a backedge taken count is that the 3147 // induction variable was signed and as such will not overflow. In such a case 3148 // truncation is legal. 3149 if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) > 3150 IdxTy->getPrimitiveSizeInBits()) 3151 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy); 3152 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); 3153 3154 // Get the total trip count from the count by adding 1. 3155 const SCEV *ExitCount = SE->getAddExpr( 3156 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 3157 3158 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); 3159 3160 // Expand the trip count and place the new instructions in the preheader. 3161 // Notice that the pre-header does not change, only the loop body. 3162 SCEVExpander Exp(*SE, DL, "induction"); 3163 3164 // Count holds the overall loop count (N). 3165 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), 3166 L->getLoopPreheader()->getTerminator()); 3167 3168 if (TripCount->getType()->isPointerTy()) 3169 TripCount = 3170 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", 3171 L->getLoopPreheader()->getTerminator()); 3172 3173 return TripCount; 3174 } 3175 3176 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { 3177 if (VectorTripCount) 3178 return VectorTripCount; 3179 3180 Value *TC = getOrCreateTripCount(L); 3181 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 3182 3183 Type *Ty = TC->getType(); 3184 // This is where we can make the step a runtime constant. 3185 Value *Step = createStepForVF(Builder, Ty, VF, UF); 3186 3187 // If the tail is to be folded by masking, round the number of iterations N 3188 // up to a multiple of Step instead of rounding down. This is done by first 3189 // adding Step-1 and then rounding down. Note that it's ok if this addition 3190 // overflows: the vector induction variable will eventually wrap to zero given 3191 // that it starts at zero and its Step is a power of two; the loop will then 3192 // exit, with the last early-exit vector comparison also producing all-true. 
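  // For example (illustrative): with a trip count of 10 and VF * UF = 4,
  // folding the tail first computes 10 + (4 - 1) = 13, so n.mod.vf = 13 % 4 = 1
  // and n.vec = 13 - 1 = 12, i.e. three masked vector iterations cover all 10
  // scalar iterations. Without tail folding the same trip count yields
  // n.vec = 8 plus a two-iteration scalar remainder.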
3193 if (Cost->foldTailByMasking()) { 3194 assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && 3195 "VF*UF must be a power of 2 when folding tail by masking"); 3196 assert(!VF.isScalable() && 3197 "Tail folding not yet supported for scalable vectors"); 3198 TC = Builder.CreateAdd( 3199 TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up"); 3200 } 3201 3202 // Now we need to generate the expression for the part of the loop that the 3203 // vectorized body will execute. This is equal to N - (N % Step) if scalar 3204 // iterations are not required for correctness, or N - Step, otherwise. Step 3205 // is equal to the vectorization factor (number of SIMD elements) times the 3206 // unroll factor (number of SIMD instructions). 3207 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); 3208 3209 // There are cases where we *must* run at least one iteration in the remainder 3210 // loop. See the cost model for when this can happen. If the step evenly 3211 // divides the trip count, we set the remainder to be equal to the step. If 3212 // the step does not evenly divide the trip count, no adjustment is necessary 3213 // since there will already be scalar iterations. Note that the minimum 3214 // iterations check ensures that N >= Step. 3215 if (Cost->requiresScalarEpilogue(VF)) { 3216 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); 3217 R = Builder.CreateSelect(IsZero, Step, R); 3218 } 3219 3220 VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); 3221 3222 return VectorTripCount; 3223 } 3224 3225 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, 3226 const DataLayout &DL) { 3227 // Verify that V is a vector type with same number of elements as DstVTy. 3228 auto *DstFVTy = cast<FixedVectorType>(DstVTy); 3229 unsigned VF = DstFVTy->getNumElements(); 3230 auto *SrcVecTy = cast<FixedVectorType>(V->getType()); 3231 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); 3232 Type *SrcElemTy = SrcVecTy->getElementType(); 3233 Type *DstElemTy = DstFVTy->getElementType(); 3234 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 3235 "Vector elements must have same size"); 3236 3237 // Do a direct cast if element types are castable. 3238 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 3239 return Builder.CreateBitOrPointerCast(V, DstFVTy); 3240 } 3241 // V cannot be directly casted to desired vector type. 3242 // May happen when V is a floating point vector but DstVTy is a vector of 3243 // pointers or vice-versa. Handle this using a two-step bitcast using an 3244 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 3245 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 3246 "Only one type should be a pointer type"); 3247 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 3248 "Only one type should be a floating point type"); 3249 Type *IntTy = 3250 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 3251 auto *VecIntTy = FixedVectorType::get(IntTy, VF); 3252 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 3253 return Builder.CreateBitOrPointerCast(CastVal, DstFVTy); 3254 } 3255 3256 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, 3257 BasicBlock *Bypass) { 3258 Value *Count = getOrCreateTripCount(L); 3259 // Reuse existing vector loop preheader for TC checks. 3260 // Note that new preheader block is generated for vector loop. 
3261 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 3262 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 3263 3264 // Generate code to check if the loop's trip count is less than VF * UF, or 3265 // equal to it in case a scalar epilogue is required; this implies that the 3266 // vector trip count is zero. This check also covers the case where adding one 3267 // to the backedge-taken count overflowed leading to an incorrect trip count 3268 // of zero. In this case we will also jump to the scalar loop. 3269 auto P = Cost->requiresScalarEpilogue(VF) ? ICmpInst::ICMP_ULE 3270 : ICmpInst::ICMP_ULT; 3271 3272 // If tail is to be folded, vector loop takes care of all iterations. 3273 Value *CheckMinIters = Builder.getFalse(); 3274 if (!Cost->foldTailByMasking()) { 3275 Value *Step = createStepForVF(Builder, Count->getType(), VF, UF); 3276 CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check"); 3277 } 3278 // Create new preheader for vector loop. 3279 LoopVectorPreHeader = 3280 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, 3281 "vector.ph"); 3282 3283 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 3284 DT->getNode(Bypass)->getIDom()) && 3285 "TC check is expected to dominate Bypass"); 3286 3287 // Update dominator for Bypass & LoopExit (if needed). 3288 DT->changeImmediateDominator(Bypass, TCCheckBlock); 3289 if (!Cost->requiresScalarEpilogue(VF)) 3290 // If there is an epilogue which must run, there's no edge from the 3291 // middle block to exit blocks and thus no need to update the immediate 3292 // dominator of the exit blocks. 3293 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 3294 3295 ReplaceInstWithInst( 3296 TCCheckBlock->getTerminator(), 3297 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 3298 LoopBypassBlocks.push_back(TCCheckBlock); 3299 } 3300 3301 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { 3302 3303 BasicBlock *const SCEVCheckBlock = 3304 RTChecks.emitSCEVChecks(L, Bypass, LoopVectorPreHeader, LoopExitBlock); 3305 if (!SCEVCheckBlock) 3306 return nullptr; 3307 3308 assert(!(SCEVCheckBlock->getParent()->hasOptSize() || 3309 (OptForSizeBasedOnProfile && 3310 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) && 3311 "Cannot SCEV check stride or overflow when optimizing for size"); 3312 3313 3314 // Update dominator only if this is first RT check. 3315 if (LoopBypassBlocks.empty()) { 3316 DT->changeImmediateDominator(Bypass, SCEVCheckBlock); 3317 if (!Cost->requiresScalarEpilogue(VF)) 3318 // If there is an epilogue which must run, there's no edge from the 3319 // middle block to exit blocks and thus no need to update the immediate 3320 // dominator of the exit blocks. 3321 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock); 3322 } 3323 3324 LoopBypassBlocks.push_back(SCEVCheckBlock); 3325 AddedSafetyChecks = true; 3326 return SCEVCheckBlock; 3327 } 3328 3329 BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, 3330 BasicBlock *Bypass) { 3331 // VPlan-native path does not do any analysis for runtime checks currently. 3332 if (EnableVPlanNativePath) 3333 return nullptr; 3334 3335 BasicBlock *const MemCheckBlock = 3336 RTChecks.emitMemRuntimeChecks(L, Bypass, LoopVectorPreHeader); 3337 3338 // Check if we generated code that checks in runtime if arrays overlap. We put 3339 // the checks into a separate block to make the more common case of few 3340 // elements faster. 
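  // Illustrative sketch (the actual checks are produced by LoopAccessAnalysis,
  // not constructed here): for two pointers A and B accessed over N
  // iterations, the no-overlap condition placed in MemCheckBlock is roughly
  //   (A + N * sizeof(*A) <= B) || (B + N * sizeof(*B) <= A)
  // and if it fails we branch to Bypass, falling back to the scalar loop.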
3341 if (!MemCheckBlock) 3342 return nullptr; 3343 3344 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) { 3345 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && 3346 "Cannot emit memory checks when optimizing for size, unless forced " 3347 "to vectorize."); 3348 ORE->emit([&]() { 3349 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize", 3350 L->getStartLoc(), L->getHeader()) 3351 << "Code-size may be reduced by not forcing " 3352 "vectorization, or by source-code modifications " 3353 "eliminating the need for runtime checks " 3354 "(e.g., adding 'restrict')."; 3355 }); 3356 } 3357 3358 LoopBypassBlocks.push_back(MemCheckBlock); 3359 3360 AddedSafetyChecks = true; 3361 3362 // We currently don't use LoopVersioning for the actual loop cloning but we 3363 // still use it to add the noalias metadata. 3364 LVer = std::make_unique<LoopVersioning>( 3365 *Legal->getLAI(), 3366 Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, 3367 DT, PSE.getSE()); 3368 LVer->prepareNoAliasMetadata(); 3369 return MemCheckBlock; 3370 } 3371 3372 Value *InnerLoopVectorizer::emitTransformedIndex( 3373 IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL, 3374 const InductionDescriptor &ID) const { 3375 3376 SCEVExpander Exp(*SE, DL, "induction"); 3377 auto Step = ID.getStep(); 3378 auto StartValue = ID.getStartValue(); 3379 assert(Index->getType()->getScalarType() == Step->getType() && 3380 "Index scalar type does not match StepValue type"); 3381 3382 // Note: the IR at this point is broken. We cannot use SE to create any new 3383 // SCEV and then expand it, hoping that SCEV's simplification will give us 3384 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may 3385 // lead to various SCEV crashes. So all we can do is to use builder and rely 3386 // on InstCombine for future simplifications. Here we handle some trivial 3387 // cases only. 3388 auto CreateAdd = [&B](Value *X, Value *Y) { 3389 assert(X->getType() == Y->getType() && "Types don't match!"); 3390 if (auto *CX = dyn_cast<ConstantInt>(X)) 3391 if (CX->isZero()) 3392 return Y; 3393 if (auto *CY = dyn_cast<ConstantInt>(Y)) 3394 if (CY->isZero()) 3395 return X; 3396 return B.CreateAdd(X, Y); 3397 }; 3398 3399 // We allow X to be a vector type, in which case Y will potentially be 3400 // splatted into a vector with the same element count. 3401 auto CreateMul = [&B](Value *X, Value *Y) { 3402 assert(X->getType()->getScalarType() == Y->getType() && 3403 "Types don't match!"); 3404 if (auto *CX = dyn_cast<ConstantInt>(X)) 3405 if (CX->isOne()) 3406 return Y; 3407 if (auto *CY = dyn_cast<ConstantInt>(Y)) 3408 if (CY->isOne()) 3409 return X; 3410 VectorType *XVTy = dyn_cast<VectorType>(X->getType()); 3411 if (XVTy && !isa<VectorType>(Y->getType())) 3412 Y = B.CreateVectorSplat(XVTy->getElementCount(), Y); 3413 return B.CreateMul(X, Y); 3414 }; 3415 3416 // Get a suitable insert point for SCEV expansion. For blocks in the vector 3417 // loop, choose the end of the vector loop header (=LoopVectorBody), because 3418 // the DomTree is not kept up-to-date for additional blocks generated in the 3419 // vector loop. By using the header as insertion point, we guarantee that the 3420 // expanded instructions dominate all their uses. 
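  // For reference, the cases handled in the switch below compute, in
  // illustrative shorthand:
  //   integer induction: StartValue + Index * Step
  //   pointer induction: gep ElementType, StartValue, Index * Step
  //   FP induction:      StartValue fadd/fsub Index * Step
  // e.g. an integer IV starting at 7 with step 3 maps Index = 4 to
  // 7 + 4 * 3 = 19.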
3421 auto GetInsertPoint = [this, &B]() { 3422 BasicBlock *InsertBB = B.GetInsertPoint()->getParent(); 3423 if (InsertBB != LoopVectorBody && 3424 LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB)) 3425 return LoopVectorBody->getTerminator(); 3426 return &*B.GetInsertPoint(); 3427 }; 3428 3429 switch (ID.getKind()) { 3430 case InductionDescriptor::IK_IntInduction: { 3431 assert(!isa<VectorType>(Index->getType()) && 3432 "Vector indices not supported for integer inductions yet"); 3433 assert(Index->getType() == StartValue->getType() && 3434 "Index type does not match StartValue type"); 3435 if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne()) 3436 return B.CreateSub(StartValue, Index); 3437 auto *Offset = CreateMul( 3438 Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint())); 3439 return CreateAdd(StartValue, Offset); 3440 } 3441 case InductionDescriptor::IK_PtrInduction: { 3442 assert(isa<SCEVConstant>(Step) && 3443 "Expected constant step for pointer induction"); 3444 return B.CreateGEP( 3445 ID.getElementType(), StartValue, 3446 CreateMul(Index, 3447 Exp.expandCodeFor(Step, Index->getType()->getScalarType(), 3448 GetInsertPoint()))); 3449 } 3450 case InductionDescriptor::IK_FpInduction: { 3451 assert(!isa<VectorType>(Index->getType()) && 3452 "Vector indices not supported for FP inductions yet"); 3453 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); 3454 auto InductionBinOp = ID.getInductionBinOp(); 3455 assert(InductionBinOp && 3456 (InductionBinOp->getOpcode() == Instruction::FAdd || 3457 InductionBinOp->getOpcode() == Instruction::FSub) && 3458 "Original bin op should be defined for FP induction"); 3459 3460 Value *StepValue = cast<SCEVUnknown>(Step)->getValue(); 3461 Value *MulExp = B.CreateFMul(StepValue, Index); 3462 return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, 3463 "induction"); 3464 } 3465 case InductionDescriptor::IK_NoInduction: 3466 return nullptr; 3467 } 3468 llvm_unreachable("invalid enum"); 3469 } 3470 3471 Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { 3472 LoopScalarBody = OrigLoop->getHeader(); 3473 LoopVectorPreHeader = OrigLoop->getLoopPreheader(); 3474 assert(LoopVectorPreHeader && "Invalid loop structure"); 3475 LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr 3476 assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF)) && 3477 "multiple exit loop without required epilogue?"); 3478 3479 LoopMiddleBlock = 3480 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 3481 LI, nullptr, Twine(Prefix) + "middle.block"); 3482 LoopScalarPreHeader = 3483 SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI, 3484 nullptr, Twine(Prefix) + "scalar.ph"); 3485 3486 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3487 3488 // Set up the middle block terminator. Two cases: 3489 // 1) If we know that we must execute the scalar epilogue, emit an 3490 // unconditional branch. 3491 // 2) Otherwise, we must have a single unique exit block (due to how we 3492 // implement the multiple exit case). In this case, set up a conditonal 3493 // branch from the middle block to the loop scalar preheader, and the 3494 // exit block. completeLoopSkeleton will update the condition to use an 3495 // iteration check, if required to decide whether to execute the remainder. 3496 BranchInst *BrInst = Cost->requiresScalarEpilogue(VF) ? 
3497 BranchInst::Create(LoopScalarPreHeader) : 3498 BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, 3499 Builder.getTrue()); 3500 BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3501 ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst); 3502 3503 // We intentionally don't let SplitBlock to update LoopInfo since 3504 // LoopVectorBody should belong to another loop than LoopVectorPreHeader. 3505 // LoopVectorBody is explicitly added to the correct place few lines later. 3506 LoopVectorBody = 3507 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 3508 nullptr, nullptr, Twine(Prefix) + "vector.body"); 3509 3510 // Update dominator for loop exit. 3511 if (!Cost->requiresScalarEpilogue(VF)) 3512 // If there is an epilogue which must run, there's no edge from the 3513 // middle block to exit blocks and thus no need to update the immediate 3514 // dominator of the exit blocks. 3515 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); 3516 3517 // Create and register the new vector loop. 3518 Loop *Lp = LI->AllocateLoop(); 3519 Loop *ParentLoop = OrigLoop->getParentLoop(); 3520 3521 // Insert the new loop into the loop nest and register the new basic blocks 3522 // before calling any utilities such as SCEV that require valid LoopInfo. 3523 if (ParentLoop) { 3524 ParentLoop->addChildLoop(Lp); 3525 } else { 3526 LI->addTopLevelLoop(Lp); 3527 } 3528 Lp->addBasicBlockToLoop(LoopVectorBody, *LI); 3529 return Lp; 3530 } 3531 3532 void InnerLoopVectorizer::createInductionResumeValues( 3533 Loop *L, Value *VectorTripCount, 3534 std::pair<BasicBlock *, Value *> AdditionalBypass) { 3535 assert(VectorTripCount && L && "Expected valid arguments"); 3536 assert(((AdditionalBypass.first && AdditionalBypass.second) || 3537 (!AdditionalBypass.first && !AdditionalBypass.second)) && 3538 "Inconsistent information about additional bypass."); 3539 // We are going to resume the execution of the scalar loop. 3540 // Go over all of the induction variables that we found and fix the 3541 // PHIs that are left in the scalar version of the loop. 3542 // The starting values of PHI nodes depend on the counter of the last 3543 // iteration in the vectorized loop. 3544 // If we come from a bypass edge then we need to start from the original 3545 // start value. 3546 for (auto &InductionEntry : Legal->getInductionVars()) { 3547 PHINode *OrigPhi = InductionEntry.first; 3548 InductionDescriptor II = InductionEntry.second; 3549 3550 // Create phi nodes to merge from the backedge-taken check block. 3551 PHINode *BCResumeVal = 3552 PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", 3553 LoopScalarPreHeader->getTerminator()); 3554 // Copy original phi DL over to the new one. 3555 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); 3556 Value *&EndValue = IVEndValues[OrigPhi]; 3557 Value *EndValueFromAdditionalBypass = AdditionalBypass.second; 3558 if (OrigPhi == OldInduction) { 3559 // We know what the end value is. 3560 EndValue = VectorTripCount; 3561 } else { 3562 IRBuilder<> B(L->getLoopPreheader()->getTerminator()); 3563 3564 // Fast-math-flags propagate from the original induction instruction. 
3565 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) 3566 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); 3567 3568 Type *StepType = II.getStep()->getType(); 3569 Instruction::CastOps CastOp = 3570 CastInst::getCastOpcode(VectorTripCount, true, StepType, true); 3571 Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd"); 3572 const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout(); 3573 EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3574 EndValue->setName("ind.end"); 3575 3576 // Compute the end value for the additional bypass (if applicable). 3577 if (AdditionalBypass.first) { 3578 B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt())); 3579 CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true, 3580 StepType, true); 3581 CRD = 3582 B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd"); 3583 EndValueFromAdditionalBypass = 3584 emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3585 EndValueFromAdditionalBypass->setName("ind.end"); 3586 } 3587 } 3588 // The new PHI merges the original incoming value, in case of a bypass, 3589 // or the value at the end of the vectorized loop. 3590 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); 3591 3592 // Fix the scalar body counter (PHI node). 3593 // The old induction's phi node in the scalar body needs the truncated 3594 // value. 3595 for (BasicBlock *BB : LoopBypassBlocks) 3596 BCResumeVal->addIncoming(II.getStartValue(), BB); 3597 3598 if (AdditionalBypass.first) 3599 BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first, 3600 EndValueFromAdditionalBypass); 3601 3602 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); 3603 } 3604 } 3605 3606 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L, 3607 MDNode *OrigLoopID) { 3608 assert(L && "Expected valid loop."); 3609 3610 // The trip counts should be cached by now. 3611 Value *Count = getOrCreateTripCount(L); 3612 Value *VectorTripCount = getOrCreateVectorTripCount(L); 3613 3614 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3615 3616 // Add a check in the middle block to see if we have completed 3617 // all of the iterations in the first vector loop. Three cases: 3618 // 1) If we require a scalar epilogue, there is no conditional branch as 3619 // we unconditionally branch to the scalar preheader. Do nothing. 3620 // 2) If (N - N%VF) == N, then we *don't* need to run the remainder. 3621 // Thus if tail is to be folded, we know we don't need to run the 3622 // remainder and we can use the previous value for the condition (true). 3623 // 3) Otherwise, construct a runtime check. 3624 if (!Cost->requiresScalarEpilogue(VF) && !Cost->foldTailByMasking()) { 3625 Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, 3626 Count, VectorTripCount, "cmp.n", 3627 LoopMiddleBlock->getTerminator()); 3628 3629 // Here we use the same DebugLoc as the scalar loop latch terminator instead 3630 // of the corresponding compare because they may have ended up with 3631 // different line numbers and we want to avoid awkward line stepping while 3632 // debugging. Eg. if the compare has got a line number inside the loop. 3633 CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3634 cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN); 3635 } 3636 3637 // Get ready to start creating new instructions into the vectorized body. 
3638 assert(LoopVectorPreHeader == L->getLoopPreheader() && 3639 "Inconsistent vector loop preheader"); 3640 Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); 3641 3642 Optional<MDNode *> VectorizedLoopID = 3643 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 3644 LLVMLoopVectorizeFollowupVectorized}); 3645 if (VectorizedLoopID.hasValue()) { 3646 L->setLoopID(VectorizedLoopID.getValue()); 3647 3648 // Do not setAlreadyVectorized if loop attributes have been defined 3649 // explicitly. 3650 return LoopVectorPreHeader; 3651 } 3652 3653 // Keep all loop hints from the original loop on the vector loop (we'll 3654 // replace the vectorizer-specific hints below). 3655 if (MDNode *LID = OrigLoop->getLoopID()) 3656 L->setLoopID(LID); 3657 3658 LoopVectorizeHints Hints(L, true, *ORE); 3659 Hints.setAlreadyVectorized(); 3660 3661 #ifdef EXPENSIVE_CHECKS 3662 assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 3663 LI->verify(*DT); 3664 #endif 3665 3666 return LoopVectorPreHeader; 3667 } 3668 3669 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { 3670 /* 3671 In this function we generate a new loop. The new loop will contain 3672 the vectorized instructions while the old loop will continue to run the 3673 scalar remainder. 3674 3675 [ ] <-- loop iteration number check. 3676 / | 3677 / v 3678 | [ ] <-- vector loop bypass (may consist of multiple blocks). 3679 | / | 3680 | / v 3681 || [ ] <-- vector pre header. 3682 |/ | 3683 | v 3684 | [ ] \ 3685 | [ ]_| <-- vector loop. 3686 | | 3687 | v 3688 \ -[ ] <--- middle-block. 3689 \/ | 3690 /\ v 3691 | ->[ ] <--- new preheader. 3692 | | 3693 (opt) v <-- edge from middle to exit iff epilogue is not required. 3694 | [ ] \ 3695 | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue). 3696 \ | 3697 \ v 3698 >[ ] <-- exit block(s). 3699 ... 3700 */ 3701 3702 // Get the metadata of the original loop before it gets modified. 3703 MDNode *OrigLoopID = OrigLoop->getLoopID(); 3704 3705 // Workaround! Compute the trip count of the original loop and cache it 3706 // before we start modifying the CFG. This code has a systemic problem 3707 // wherein it tries to run analysis over partially constructed IR; this is 3708 // wrong, and not simply for SCEV. The trip count of the original loop 3709 // simply happens to be prone to hitting this in practice. In theory, we 3710 // can hit the same issue for any SCEV, or ValueTracking query done during 3711 // mutation. See PR49900. 3712 getOrCreateTripCount(OrigLoop); 3713 3714 // Create an empty vector loop, and prepare basic blocks for the runtime 3715 // checks. 3716 Loop *Lp = createVectorLoopSkeleton(""); 3717 3718 // Now, compare the new count to zero. If it is zero skip the vector loop and 3719 // jump to the scalar loop. This check also covers the case where the 3720 // backedge-taken count is uint##_max: adding one to it will overflow leading 3721 // to an incorrect trip count of zero. In this (rare) case we will also jump 3722 // to the scalar loop. 3723 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader); 3724 3725 // Generate the code to check any assumptions that we've made for SCEV 3726 // expressions. 3727 emitSCEVChecks(Lp, LoopScalarPreHeader); 3728 3729 // Generate the code that checks in runtime if arrays overlap. We put the 3730 // checks into a separate block to make the more common case of few elements 3731 // faster. 
3732 emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 3733 3734 // Some loops have a single integer induction variable, while other loops 3735 // don't. One example is c++ iterators that often have multiple pointer 3736 // induction variables. In the code below we also support a case where we 3737 // don't have a single induction variable. 3738 // 3739 // We try to obtain an induction variable from the original loop as hard 3740 // as possible. However if we don't find one that: 3741 // - is an integer 3742 // - counts from zero, stepping by one 3743 // - is the size of the widest induction variable type 3744 // then we create a new one. 3745 OldInduction = Legal->getPrimaryInduction(); 3746 Type *IdxTy = Legal->getWidestInductionType(); 3747 Value *StartIdx = ConstantInt::get(IdxTy, 0); 3748 // The loop step is equal to the vectorization factor (num of SIMD elements) 3749 // times the unroll factor (num of SIMD instructions). 3750 Builder.SetInsertPoint(&*Lp->getHeader()->getFirstInsertionPt()); 3751 Value *Step = createStepForVF(Builder, IdxTy, VF, UF); 3752 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 3753 Induction = 3754 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 3755 getDebugLocFromInstOrOperands(OldInduction)); 3756 3757 // Emit phis for the new starting index of the scalar loop. 3758 createInductionResumeValues(Lp, CountRoundDown); 3759 3760 return completeLoopSkeleton(Lp, OrigLoopID); 3761 } 3762 3763 // Fix up external users of the induction variable. At this point, we are 3764 // in LCSSA form, with all external PHIs that use the IV having one input value, 3765 // coming from the remainder loop. We need those PHIs to also have a correct 3766 // value for the IV when arriving directly from the middle block. 3767 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, 3768 const InductionDescriptor &II, 3769 Value *CountRoundDown, Value *EndValue, 3770 BasicBlock *MiddleBlock) { 3771 // There are two kinds of external IV usages - those that use the value 3772 // computed in the last iteration (the PHI) and those that use the penultimate 3773 // value (the value that feeds into the phi from the loop latch). 3774 // We allow both, but they, obviously, have different values. 3775 3776 assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block"); 3777 3778 DenseMap<Value *, Value *> MissingVals; 3779 3780 // An external user of the last iteration's value should see the value that 3781 // the remainder loop uses to initialize its own IV. 3782 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch()); 3783 for (User *U : PostInc->users()) { 3784 Instruction *UI = cast<Instruction>(U); 3785 if (!OrigLoop->contains(UI)) { 3786 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3787 MissingVals[UI] = EndValue; 3788 } 3789 } 3790 3791 // An external user of the penultimate value need to see EndValue - Step. 3792 // The simplest way to get this is to recompute it from the constituent SCEVs, 3793 // that is Start + (Step * (CRD - 1)). 3794 for (User *U : OrigPhi->users()) { 3795 auto *UI = cast<Instruction>(U); 3796 if (!OrigLoop->contains(UI)) { 3797 const DataLayout &DL = 3798 OrigLoop->getHeader()->getModule()->getDataLayout(); 3799 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3800 3801 IRBuilder<> B(MiddleBlock->getTerminator()); 3802 3803 // Fast-math-flags propagate from the original induction instruction. 
3804 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) 3805 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); 3806 3807 Value *CountMinusOne = B.CreateSub( 3808 CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1)); 3809 Value *CMO = 3810 !II.getStep()->getType()->isIntegerTy() 3811 ? B.CreateCast(Instruction::SIToFP, CountMinusOne, 3812 II.getStep()->getType()) 3813 : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType()); 3814 CMO->setName("cast.cmo"); 3815 Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II); 3816 Escape->setName("ind.escape"); 3817 MissingVals[UI] = Escape; 3818 } 3819 } 3820 3821 for (auto &I : MissingVals) { 3822 PHINode *PHI = cast<PHINode>(I.first); 3823 // One corner case we have to handle is two IVs "chasing" each-other, 3824 // that is %IV2 = phi [...], [ %IV1, %latch ] 3825 // In this case, if IV1 has an external use, we need to avoid adding both 3826 // "last value of IV1" and "penultimate value of IV2". So, verify that we 3827 // don't already have an incoming value for the middle block. 3828 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) 3829 PHI->addIncoming(I.second, MiddleBlock); 3830 } 3831 } 3832 3833 namespace { 3834 3835 struct CSEDenseMapInfo { 3836 static bool canHandle(const Instruction *I) { 3837 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || 3838 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); 3839 } 3840 3841 static inline Instruction *getEmptyKey() { 3842 return DenseMapInfo<Instruction *>::getEmptyKey(); 3843 } 3844 3845 static inline Instruction *getTombstoneKey() { 3846 return DenseMapInfo<Instruction *>::getTombstoneKey(); 3847 } 3848 3849 static unsigned getHashValue(const Instruction *I) { 3850 assert(canHandle(I) && "Unknown instruction!"); 3851 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), 3852 I->value_op_end())); 3853 } 3854 3855 static bool isEqual(const Instruction *LHS, const Instruction *RHS) { 3856 if (LHS == getEmptyKey() || RHS == getEmptyKey() || 3857 LHS == getTombstoneKey() || RHS == getTombstoneKey()) 3858 return LHS == RHS; 3859 return LHS->isIdenticalTo(RHS); 3860 } 3861 }; 3862 3863 } // end anonymous namespace 3864 3865 ///Perform cse of induction variable instructions. 3866 static void cse(BasicBlock *BB) { 3867 // Perform simple cse. 3868 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 3869 for (Instruction &In : llvm::make_early_inc_range(*BB)) { 3870 if (!CSEDenseMapInfo::canHandle(&In)) 3871 continue; 3872 3873 // Check if we can replace this instruction with any of the 3874 // visited instructions. 3875 if (Instruction *V = CSEMap.lookup(&In)) { 3876 In.replaceAllUsesWith(V); 3877 In.eraseFromParent(); 3878 continue; 3879 } 3880 3881 CSEMap[&In] = &In; 3882 } 3883 } 3884 3885 InstructionCost 3886 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF, 3887 bool &NeedToScalarize) const { 3888 Function *F = CI->getCalledFunction(); 3889 Type *ScalarRetTy = CI->getType(); 3890 SmallVector<Type *, 4> Tys, ScalarTys; 3891 for (auto &ArgOp : CI->args()) 3892 ScalarTys.push_back(ArgOp->getType()); 3893 3894 // Estimate cost of scalarized vector call. The source operands are assumed 3895 // to be vectors, so we need to extract individual elements from there, 3896 // execute VF scalar calls, and then gather the result into the vector return 3897 // value. 
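  // Illustrative example (the numbers are assumed, not taken from TTI): with
  // VF = 4, a scalar call cost of 10 and a scalarization overhead of 6, the
  // scalarized estimate computed below is 4 * 10 + 6 = 46. If the target
  // provides a vector variant of the callee costing, say, 20, NeedToScalarize
  // is cleared and 20 is returned instead.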
3898 InstructionCost ScalarCallCost = 3899 TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput); 3900 if (VF.isScalar()) 3901 return ScalarCallCost; 3902 3903 // Compute corresponding vector type for return value and arguments. 3904 Type *RetTy = ToVectorTy(ScalarRetTy, VF); 3905 for (Type *ScalarTy : ScalarTys) 3906 Tys.push_back(ToVectorTy(ScalarTy, VF)); 3907 3908 // Compute costs of unpacking argument values for the scalar calls and 3909 // packing the return values to a vector. 3910 InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF); 3911 3912 InstructionCost Cost = 3913 ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost; 3914 3915 // If we can't emit a vector call for this function, then the currently found 3916 // cost is the cost we need to return. 3917 NeedToScalarize = true; 3918 VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 3919 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); 3920 3921 if (!TLI || CI->isNoBuiltin() || !VecFunc) 3922 return Cost; 3923 3924 // If the corresponding vector cost is cheaper, return its cost. 3925 InstructionCost VectorCallCost = 3926 TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput); 3927 if (VectorCallCost < Cost) { 3928 NeedToScalarize = false; 3929 Cost = VectorCallCost; 3930 } 3931 return Cost; 3932 } 3933 3934 static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) { 3935 if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy())) 3936 return Elt; 3937 return VectorType::get(Elt, VF); 3938 } 3939 3940 InstructionCost 3941 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, 3942 ElementCount VF) const { 3943 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 3944 assert(ID && "Expected intrinsic call!"); 3945 Type *RetTy = MaybeVectorizeType(CI->getType(), VF); 3946 FastMathFlags FMF; 3947 if (auto *FPMO = dyn_cast<FPMathOperator>(CI)) 3948 FMF = FPMO->getFastMathFlags(); 3949 3950 SmallVector<const Value *> Arguments(CI->args()); 3951 FunctionType *FTy = CI->getCalledFunction()->getFunctionType(); 3952 SmallVector<Type *> ParamTys; 3953 std::transform(FTy->param_begin(), FTy->param_end(), 3954 std::back_inserter(ParamTys), 3955 [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); }); 3956 3957 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF, 3958 dyn_cast<IntrinsicInst>(CI)); 3959 return TTI.getIntrinsicInstrCost(CostAttrs, 3960 TargetTransformInfo::TCK_RecipThroughput); 3961 } 3962 3963 static Type *smallestIntegerVectorType(Type *T1, Type *T2) { 3964 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3965 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3966 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; 3967 } 3968 3969 static Type *largestIntegerVectorType(Type *T1, Type *T2) { 3970 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3971 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3972 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; 3973 } 3974 3975 void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) { 3976 // For every instruction `I` in MinBWs, truncate the operands, create a 3977 // truncated version of `I` and reextend its result. InstCombine runs 3978 // later and will remove any ext/trunc pairs. 
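  // Illustrative sketch (assuming VF = 4 and a MinBWs entry of 8 bits for a
  // 32-bit add):
  //   %sum = add <4 x i32> %a, %b
  // becomes
  //   %a.tr   = trunc <4 x i32> %a to <4 x i8>
  //   %b.tr   = trunc <4 x i32> %b to <4 x i8>
  //   %sum.tr = add <4 x i8> %a.tr, %b.tr
  //   %sum    = zext <4 x i8> %sum.tr to <4 x i32>
  // and InstCombine later removes any ext/trunc pairs that cancel out.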
3979 SmallPtrSet<Value *, 4> Erased; 3980 for (const auto &KV : Cost->getMinimalBitwidths()) { 3981 // If the value wasn't vectorized, we must maintain the original scalar 3982 // type. The absence of the value from State indicates that it 3983 // wasn't vectorized. 3984 // FIXME: Should not rely on getVPValue at this point. 3985 VPValue *Def = State.Plan->getVPValue(KV.first, true); 3986 if (!State.hasAnyVectorValue(Def)) 3987 continue; 3988 for (unsigned Part = 0; Part < UF; ++Part) { 3989 Value *I = State.get(Def, Part); 3990 if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I)) 3991 continue; 3992 Type *OriginalTy = I->getType(); 3993 Type *ScalarTruncatedTy = 3994 IntegerType::get(OriginalTy->getContext(), KV.second); 3995 auto *TruncatedTy = VectorType::get( 3996 ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getElementCount()); 3997 if (TruncatedTy == OriginalTy) 3998 continue; 3999 4000 IRBuilder<> B(cast<Instruction>(I)); 4001 auto ShrinkOperand = [&](Value *V) -> Value * { 4002 if (auto *ZI = dyn_cast<ZExtInst>(V)) 4003 if (ZI->getSrcTy() == TruncatedTy) 4004 return ZI->getOperand(0); 4005 return B.CreateZExtOrTrunc(V, TruncatedTy); 4006 }; 4007 4008 // The actual instruction modification depends on the instruction type, 4009 // unfortunately. 4010 Value *NewI = nullptr; 4011 if (auto *BO = dyn_cast<BinaryOperator>(I)) { 4012 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)), 4013 ShrinkOperand(BO->getOperand(1))); 4014 4015 // Any wrapping introduced by shrinking this operation shouldn't be 4016 // considered undefined behavior. So, we can't unconditionally copy 4017 // arithmetic wrapping flags to NewI. 4018 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false); 4019 } else if (auto *CI = dyn_cast<ICmpInst>(I)) { 4020 NewI = 4021 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)), 4022 ShrinkOperand(CI->getOperand(1))); 4023 } else if (auto *SI = dyn_cast<SelectInst>(I)) { 4024 NewI = B.CreateSelect(SI->getCondition(), 4025 ShrinkOperand(SI->getTrueValue()), 4026 ShrinkOperand(SI->getFalseValue())); 4027 } else if (auto *CI = dyn_cast<CastInst>(I)) { 4028 switch (CI->getOpcode()) { 4029 default: 4030 llvm_unreachable("Unhandled cast!"); 4031 case Instruction::Trunc: 4032 NewI = ShrinkOperand(CI->getOperand(0)); 4033 break; 4034 case Instruction::SExt: 4035 NewI = B.CreateSExtOrTrunc( 4036 CI->getOperand(0), 4037 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 4038 break; 4039 case Instruction::ZExt: 4040 NewI = B.CreateZExtOrTrunc( 4041 CI->getOperand(0), 4042 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 4043 break; 4044 } 4045 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { 4046 auto Elements0 = 4047 cast<VectorType>(SI->getOperand(0)->getType())->getElementCount(); 4048 auto *O0 = B.CreateZExtOrTrunc( 4049 SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0)); 4050 auto Elements1 = 4051 cast<VectorType>(SI->getOperand(1)->getType())->getElementCount(); 4052 auto *O1 = B.CreateZExtOrTrunc( 4053 SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1)); 4054 4055 NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask()); 4056 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) { 4057 // Don't do anything with the operands, just extend the result. 
4058 continue; 4059 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { 4060 auto Elements = 4061 cast<VectorType>(IE->getOperand(0)->getType())->getElementCount(); 4062 auto *O0 = B.CreateZExtOrTrunc( 4063 IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 4064 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); 4065 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); 4066 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { 4067 auto Elements = 4068 cast<VectorType>(EE->getOperand(0)->getType())->getElementCount(); 4069 auto *O0 = B.CreateZExtOrTrunc( 4070 EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 4071 NewI = B.CreateExtractElement(O0, EE->getOperand(2)); 4072 } else { 4073 // If we don't know what to do, be conservative and don't do anything. 4074 continue; 4075 } 4076 4077 // Lastly, extend the result. 4078 NewI->takeName(cast<Instruction>(I)); 4079 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); 4080 I->replaceAllUsesWith(Res); 4081 cast<Instruction>(I)->eraseFromParent(); 4082 Erased.insert(I); 4083 State.reset(Def, Res, Part); 4084 } 4085 } 4086 4087 // We'll have created a bunch of ZExts that are now parentless. Clean up. 4088 for (const auto &KV : Cost->getMinimalBitwidths()) { 4089 // If the value wasn't vectorized, we must maintain the original scalar 4090 // type. The absence of the value from State indicates that it 4091 // wasn't vectorized. 4092 // FIXME: Should not rely on getVPValue at this point. 4093 VPValue *Def = State.Plan->getVPValue(KV.first, true); 4094 if (!State.hasAnyVectorValue(Def)) 4095 continue; 4096 for (unsigned Part = 0; Part < UF; ++Part) { 4097 Value *I = State.get(Def, Part); 4098 ZExtInst *Inst = dyn_cast<ZExtInst>(I); 4099 if (Inst && Inst->use_empty()) { 4100 Value *NewI = Inst->getOperand(0); 4101 Inst->eraseFromParent(); 4102 State.reset(Def, NewI, Part); 4103 } 4104 } 4105 } 4106 } 4107 4108 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) { 4109 // Insert truncates and extends for any truncated instructions as hints to 4110 // InstCombine. 4111 if (VF.isVector()) 4112 truncateToMinimalBitwidths(State); 4113 4114 // Fix widened non-induction PHIs by setting up the PHI operands. 4115 if (OrigPHIsToFix.size()) { 4116 assert(EnableVPlanNativePath && 4117 "Unexpected non-induction PHIs for fixup in non VPlan-native path"); 4118 fixNonInductionPHIs(State); 4119 } 4120 4121 // At this point every instruction in the original loop is widened to a 4122 // vector form. Now we need to fix the recurrences in the loop. These PHI 4123 // nodes are currently empty because we did not want to introduce cycles. 4124 // This is the second stage of vectorizing recurrences. 4125 fixCrossIterationPHIs(State); 4126 4127 // Forget the original basic block. 4128 PSE.getSE()->forgetLoop(OrigLoop); 4129 4130 // If we inserted an edge from the middle block to the unique exit block, 4131 // update uses outside the loop (phis) to account for the newly inserted 4132 // edge. 4133 if (!Cost->requiresScalarEpilogue(VF)) { 4134 // Fix-up external users of the induction variables. 4135 for (auto &Entry : Legal->getInductionVars()) 4136 fixupIVUsers(Entry.first, Entry.second, 4137 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)), 4138 IVEndValues[Entry.first], LoopMiddleBlock); 4139 4140 fixLCSSAPHIs(State); 4141 } 4142 4143 for (Instruction *PI : PredicatedInstructions) 4144 sinkScalarOperands(&*PI); 4145 4146 // Remove redundant induction instructions. 
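// (For example, identical vector induction step computations emitted once per
// unroll part are common-subexpression-eliminated down to a single
// instruction.)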
4147 cse(LoopVectorBody);
4148
4149 // Set/update profile weights for the vector and remainder loops as original
4150 // loop iterations are now distributed among them. Note that original loop
4151 // represented by LoopScalarBody becomes remainder loop after vectorization.
4152 //
4153 // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
4154 // end up with a slightly imprecise result, but that should be OK since the
4155 // profile is not inherently precise anyway. Note also that a possible bypass
4156 // of the vector code caused by legality checks is ignored, optimistically
4157 // assigning all the weight to the vector loop.
4158 //
4159 // For scalable vectorization we can't know at compile time how many iterations
4160 // of the loop are handled in one vector iteration, so we instead assume a
4161 // pessimistic vscale of '1'.
4162 setProfileInfoAfterUnrolling(
4163 LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody),
4164 LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF);
4165 }
4166
4167 void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) {
4168 // In order to support recurrences we need to be able to vectorize Phi nodes.
4169 // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4170 // stage #2: We now need to fix the recurrences by adding incoming edges to
4171 // the currently empty PHI nodes. At this point every instruction in the
4172 // original loop is widened to a vector form so we can use them to construct
4173 // the incoming edges.
4174 VPBasicBlock *Header = State.Plan->getEntry()->getEntryBasicBlock();
4175 for (VPRecipeBase &R : Header->phis()) {
4176 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R))
4177 fixReduction(ReductionPhi, State);
4178 else if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
4179 fixFirstOrderRecurrence(FOR, State);
4180 }
4181 }
4182
4183 void InnerLoopVectorizer::fixFirstOrderRecurrence(VPWidenPHIRecipe *PhiR,
4184 VPTransformState &State) {
4185 // This is the second phase of vectorizing first-order recurrences. An
4186 // overview of the transformation is described below. Suppose we have the
4187 // following loop.
4188 //
4189 // for (int i = 0; i < n; ++i)
4190 // b[i] = a[i] - a[i - 1];
4191 //
4192 // There is a first-order recurrence on "a". For this loop, the shorthand
4193 // scalar IR looks like:
4194 //
4195 // scalar.ph:
4196 // s_init = a[-1]
4197 // br scalar.body
4198 //
4199 // scalar.body:
4200 // i = phi [0, scalar.ph], [i+1, scalar.body]
4201 // s1 = phi [s_init, scalar.ph], [s2, scalar.body]
4202 // s2 = a[i]
4203 // b[i] = s2 - s1
4204 // br cond, scalar.body, ...
4205 //
4206 // In this example, s1 is a recurrence because its value depends on the
4207 // previous iteration. In the first phase of vectorization, we created a
4208 // vector phi v1 for s1. We now complete the vectorization and produce the
4209 // shorthand vector IR shown below (for VF = 4, UF = 1).
4210 // 4211 // vector.ph: 4212 // v_init = vector(..., ..., ..., a[-1]) 4213 // br vector.body 4214 // 4215 // vector.body 4216 // i = phi [0, vector.ph], [i+4, vector.body] 4217 // v1 = phi [v_init, vector.ph], [v2, vector.body] 4218 // v2 = a[i, i+1, i+2, i+3]; 4219 // v3 = vector(v1(3), v2(0, 1, 2)) 4220 // b[i, i+1, i+2, i+3] = v2 - v3 4221 // br cond, vector.body, middle.block 4222 // 4223 // middle.block: 4224 // x = v2(3) 4225 // br scalar.ph 4226 // 4227 // scalar.ph: 4228 // s_init = phi [x, middle.block], [a[-1], otherwise] 4229 // br scalar.body 4230 // 4231 // After execution completes the vector loop, we extract the next value of 4232 // the recurrence (x) to use as the initial value in the scalar loop. 4233 4234 // Extract the last vector element in the middle block. This will be the 4235 // initial value for the recurrence when jumping to the scalar loop. 4236 VPValue *PreviousDef = PhiR->getBackedgeValue(); 4237 Value *Incoming = State.get(PreviousDef, UF - 1); 4238 auto *ExtractForScalar = Incoming; 4239 auto *IdxTy = Builder.getInt32Ty(); 4240 if (VF.isVector()) { 4241 auto *One = ConstantInt::get(IdxTy, 1); 4242 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4243 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF); 4244 auto *LastIdx = Builder.CreateSub(RuntimeVF, One); 4245 ExtractForScalar = Builder.CreateExtractElement(ExtractForScalar, LastIdx, 4246 "vector.recur.extract"); 4247 } 4248 // Extract the second last element in the middle block if the 4249 // Phi is used outside the loop. We need to extract the phi itself 4250 // and not the last element (the phi update in the current iteration). This 4251 // will be the value when jumping to the exit block from the LoopMiddleBlock, 4252 // when the scalar loop is not run at all. 4253 Value *ExtractForPhiUsedOutsideLoop = nullptr; 4254 if (VF.isVector()) { 4255 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF); 4256 auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2)); 4257 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement( 4258 Incoming, Idx, "vector.recur.extract.for.phi"); 4259 } else if (UF > 1) 4260 // When loop is unrolled without vectorizing, initialize 4261 // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value 4262 // of `Incoming`. This is analogous to the vectorized case above: extracting 4263 // the second last element when VF > 1. 4264 ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2); 4265 4266 // Fix the initial value of the original recurrence in the scalar loop. 4267 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin()); 4268 PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue()); 4269 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init"); 4270 auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue(); 4271 for (auto *BB : predecessors(LoopScalarPreHeader)) { 4272 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit; 4273 Start->addIncoming(Incoming, BB); 4274 } 4275 4276 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start); 4277 Phi->setName("scalar.recur"); 4278 4279 // Finally, fix users of the recurrence outside the loop. The users will need 4280 // either the last value of the scalar recurrence or the last value of the 4281 // vector recurrence we extracted in the middle block. Since the loop is in 4282 // LCSSA form, we just need to find all the phi nodes for the original scalar 4283 // recurrence in the exit block, and then add an edge for the middle block. 
4284 // Note that LCSSA does not imply single entry when the original scalar loop
4285 // had multiple exiting edges (as we always run the last iteration in the
4286 // scalar epilogue); in that case, there is no edge from middle to exit,
4287 // and thus no phis need to be updated.
4288 if (!Cost->requiresScalarEpilogue(VF))
4289 for (PHINode &LCSSAPhi : LoopExitBlock->phis())
4290 if (llvm::is_contained(LCSSAPhi.incoming_values(), Phi))
4291 LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
4292 }
4293
4294 void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
4295 VPTransformState &State) {
4296 PHINode *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
4297 // Get its reduction variable descriptor.
4298 assert(Legal->isReductionVariable(OrigPhi) &&
4299 "Unable to find the reduction variable");
4300 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
4301
4302 RecurKind RK = RdxDesc.getRecurrenceKind();
4303 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
4304 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
4305 setDebugLocFromInst(ReductionStartValue);
4306
4307 VPValue *LoopExitInstDef = PhiR->getBackedgeValue();
4308 // This is the vector-clone of the value that leaves the loop.
4309 Type *VecTy = State.get(LoopExitInstDef, 0)->getType();
4310
4311 // Wrap flags are in general invalid after vectorization, clear them.
4312 clearReductionWrapFlags(RdxDesc, State);
4313
4314 // Before each round, move the insertion point right between
4315 // the PHIs and the values we are going to write.
4316 // This allows us to write both PHINodes and the extractelement
4317 // instructions.
4318 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
4319
4320 setDebugLocFromInst(LoopExitInst);
4321
4322 Type *PhiTy = OrigPhi->getType();
4323 // If tail is folded by masking, the vector value to leave the loop should be
4324 // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
4325 // instead of the former. For an inloop reduction the reduction will already
4326 // be predicated, and does not need to be handled here.
4327 if (Cost->foldTailByMasking() && !PhiR->isInLoop()) {
4328 for (unsigned Part = 0; Part < UF; ++Part) {
4329 Value *VecLoopExitInst = State.get(LoopExitInstDef, Part);
4330 Value *Sel = nullptr;
4331 for (User *U : VecLoopExitInst->users()) {
4332 if (isa<SelectInst>(U)) {
4333 assert(!Sel && "Reduction exit feeding two selects");
4334 Sel = U;
4335 } else
4336 assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
4337 }
4338 assert(Sel && "Reduction exit feeds no select");
4339 State.reset(LoopExitInstDef, Sel, Part);
4340
4341 // If the target can create a predicated operator for the reduction at no
4342 // extra cost in the loop (for example a predicated vadd), it can be
4343 // cheaper for the select to remain in the loop than be sunk out of it,
4344 // and so use the select value for the phi instead of the old
4345 // LoopExitValue.
4346 if (PreferPredicatedReductionSelect || 4347 TTI->preferPredicatedReductionSelect( 4348 RdxDesc.getOpcode(), PhiTy, 4349 TargetTransformInfo::ReductionFlags())) { 4350 auto *VecRdxPhi = 4351 cast<PHINode>(State.get(PhiR, Part)); 4352 VecRdxPhi->setIncomingValueForBlock( 4353 LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel); 4354 } 4355 } 4356 } 4357 4358 // If the vector reduction can be performed in a smaller type, we truncate 4359 // then extend the loop exit value to enable InstCombine to evaluate the 4360 // entire expression in the smaller type. 4361 if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) { 4362 assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!"); 4363 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); 4364 Builder.SetInsertPoint( 4365 LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator()); 4366 VectorParts RdxParts(UF); 4367 for (unsigned Part = 0; Part < UF; ++Part) { 4368 RdxParts[Part] = State.get(LoopExitInstDef, Part); 4369 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4370 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy) 4371 : Builder.CreateZExt(Trunc, VecTy); 4372 for (User *U : llvm::make_early_inc_range(RdxParts[Part]->users())) 4373 if (U != Trunc) { 4374 U->replaceUsesOfWith(RdxParts[Part], Extnd); 4375 RdxParts[Part] = Extnd; 4376 } 4377 } 4378 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 4379 for (unsigned Part = 0; Part < UF; ++Part) { 4380 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4381 State.reset(LoopExitInstDef, RdxParts[Part], Part); 4382 } 4383 } 4384 4385 // Reduce all of the unrolled parts into a single vector. 4386 Value *ReducedPartRdx = State.get(LoopExitInstDef, 0); 4387 unsigned Op = RecurrenceDescriptor::getOpcode(RK); 4388 4389 // The middle block terminator has already been assigned a DebugLoc here (the 4390 // OrigLoop's single latch terminator). We want the whole middle block to 4391 // appear to execute on this line because: (a) it is all compiler generated, 4392 // (b) these instructions are always executed after evaluating the latch 4393 // conditional branch, and (c) other passes may add new predecessors which 4394 // terminate on this line. This is the easiest way to ensure we don't 4395 // accidentally cause an extra step back into the loop while debugging. 4396 setDebugLocFromInst(LoopMiddleBlock->getTerminator()); 4397 if (PhiR->isOrdered()) 4398 ReducedPartRdx = State.get(LoopExitInstDef, UF - 1); 4399 else { 4400 // Floating-point operations should have some FMF to enable the reduction. 4401 IRBuilderBase::FastMathFlagGuard FMFG(Builder); 4402 Builder.setFastMathFlags(RdxDesc.getFastMathFlags()); 4403 for (unsigned Part = 1; Part < UF; ++Part) { 4404 Value *RdxPart = State.get(LoopExitInstDef, Part); 4405 if (Op != Instruction::ICmp && Op != Instruction::FCmp) { 4406 ReducedPartRdx = Builder.CreateBinOp( 4407 (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx"); 4408 } else if (RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK)) 4409 ReducedPartRdx = createSelectCmpOp(Builder, ReductionStartValue, RK, 4410 ReducedPartRdx, RdxPart); 4411 else 4412 ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart); 4413 } 4414 } 4415 4416 // Create the reduction after the loop. Note that inloop reductions create the 4417 // target reduction in the loop using a Reduction recipe. 
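// For an out-of-loop integer add reduction this typically materializes in the
// middle block as a call to the corresponding reduction intrinsic, e.g.
// (illustrative, VF = 4):
//   %rdx = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %bin.rdx)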
4418 if (VF.isVector() && !PhiR->isInLoop()) {
4419 ReducedPartRdx =
4420 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, OrigPhi);
4421 // If the reduction can be performed in a smaller type, we need to extend
4422 // the reduction to the wider type before we branch to the original loop.
4423 if (PhiTy != RdxDesc.getRecurrenceType())
4424 ReducedPartRdx = RdxDesc.isSigned()
4425 ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
4426 : Builder.CreateZExt(ReducedPartRdx, PhiTy);
4427 }
4428
4429 // Create a phi node that merges control-flow from the backedge-taken check
4430 // block and the middle block.
4431 PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx",
4432 LoopScalarPreHeader->getTerminator());
4433 for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
4434 BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
4435 BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
4436
4437 // Now, we need to fix the users of the reduction variable
4438 // inside and outside of the scalar remainder loop.
4439
4440 // We know that the loop is in LCSSA form. We need to update the PHI nodes
4441 // in the exit blocks. See comment on analogous loop in
4442 // fixFirstOrderRecurrence for a more complete explanation of the logic.
4443 if (!Cost->requiresScalarEpilogue(VF))
4444 for (PHINode &LCSSAPhi : LoopExitBlock->phis())
4445 if (llvm::is_contained(LCSSAPhi.incoming_values(), LoopExitInst))
4446 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
4447
4448 // Fix the scalar loop reduction variable with the incoming reduction sum
4449 // from the vector body and from the backedge value.
4450 int IncomingEdgeBlockIdx =
4451 OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch());
4452 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
4453 // Pick the other block.
4454 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
4455 OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
4456 OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
4457 }
4458
4459 void InnerLoopVectorizer::clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc,
4460 VPTransformState &State) {
4461 RecurKind RK = RdxDesc.getRecurrenceKind();
4462 if (RK != RecurKind::Add && RK != RecurKind::Mul)
4463 return;
4464
4465 Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr();
4466 assert(LoopExitInstr && "null loop exit instruction");
4467 SmallVector<Instruction *, 8> Worklist;
4468 SmallPtrSet<Instruction *, 8> Visited;
4469 Worklist.push_back(LoopExitInstr);
4470 Visited.insert(LoopExitInstr);
4471
4472 while (!Worklist.empty()) {
4473 Instruction *Cur = Worklist.pop_back_val();
4474 if (isa<OverflowingBinaryOperator>(Cur))
4475 for (unsigned Part = 0; Part < UF; ++Part) {
4476 // FIXME: Should not rely on getVPValue at this point.
4477 Value *V = State.get(State.Plan->getVPValue(Cur, true), Part);
4478 cast<Instruction>(V)->dropPoisonGeneratingFlags();
4479 }
4480
4481 for (User *U : Cur->users()) {
4482 Instruction *UI = cast<Instruction>(U);
4483 if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) &&
4484 Visited.insert(UI).second)
4485 Worklist.push_back(UI);
4486 }
4487 }
4488 }
4489
4490 void InnerLoopVectorizer::fixLCSSAPHIs(VPTransformState &State) {
4491 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
4492 if (LCSSAPhi.getBasicBlockIndex(LoopMiddleBlock) != -1)
4493 // Some phis were already hand-updated by the reduction and recurrence
4494 // code above; leave them alone.
4495 continue;
4496
4497 auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
4498 // Non-instruction incoming values will have only one value.
4499
4500 VPLane Lane = VPLane::getFirstLane();
4501 if (isa<Instruction>(IncomingValue) &&
4502 !Cost->isUniformAfterVectorization(cast<Instruction>(IncomingValue),
4503 VF))
4504 Lane = VPLane::getLastLaneForVF(VF);
4505
4506 // Can be a loop invariant incoming value or the last scalar value to be
4507 // extracted from the vectorized loop.
4508 // FIXME: Should not rely on getVPValue at this point.
4509 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
4510 Value *lastIncomingValue =
4511 OrigLoop->isLoopInvariant(IncomingValue)
4512 ? IncomingValue
4513 : State.get(State.Plan->getVPValue(IncomingValue, true),
4514 VPIteration(UF - 1, Lane));
4515 LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
4516 }
4517 }
4518
4519 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
4520 // The basic block and loop containing the predicated instruction.
4521 auto *PredBB = PredInst->getParent();
4522 auto *VectorLoop = LI->getLoopFor(PredBB);
4523
4524 // Initialize a worklist with the operands of the predicated instruction.
4525 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
4526
4527 // Holds instructions that we need to analyze again. An instruction may be
4528 // reanalyzed if we don't yet know if we can sink it or not.
4529 SmallVector<Instruction *, 8> InstsToReanalyze;
4530
4531 // Returns true if a given use occurs in the predicated block. Phi nodes use
4532 // their operands in their corresponding predecessor blocks.
4533 auto isBlockOfUsePredicated = [&](Use &U) -> bool {
4534 auto *I = cast<Instruction>(U.getUser());
4535 BasicBlock *BB = I->getParent();
4536 if (auto *Phi = dyn_cast<PHINode>(I))
4537 BB = Phi->getIncomingBlock(
4538 PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
4539 return BB == PredBB;
4540 };
4541
4542 // Iteratively sink the scalarized operands of the predicated instruction
4543 // into the block we created for it. When an instruction is sunk, its
4544 // operands are then added to the worklist. The algorithm ends once a full
4545 // pass through the worklist sinks no further instructions.
4546 bool Changed;
4547 do {
4548 // Add the instructions that need to be reanalyzed to the worklist, and
4549 // reset the changed indicator.
4550 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
4551 InstsToReanalyze.clear();
4552 Changed = false;
4553
4554 while (!Worklist.empty()) {
4555 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
4556
4557 // We can't sink an instruction if it is a phi node, is not in the loop,
4558 // or may have side effects.
4559 if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) ||
4560 I->mayHaveSideEffects())
4561 continue;
4562
4563 // If the instruction is already in PredBB, check if we can sink its
4564 // operands. In that case, VPlan's sinkScalarOperands() succeeded in
4565 // sinking the scalar instruction I, hence it appears in PredBB; but it
4566 // may have failed to sink I's operands (recursively), which we try
4567 // (again) here.
4568 if (I->getParent() == PredBB) {
4569 Worklist.insert(I->op_begin(), I->op_end());
4570 continue;
4571 }
4572
4573 // It's legal to sink the instruction if all its uses occur in the
4574 // predicated block. Otherwise, there's nothing to do yet, and we may
4575 // need to reanalyze the instruction.
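// (An instruction rejected here may still become sinkable once its remaining
// users are themselves sunk into PredBB on a later pass, which is why it is
// queued for reanalysis rather than discarded.)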
4576 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
4577 InstsToReanalyze.push_back(I);
4578 continue;
4579 }
4580
4581 // Move the instruction to the beginning of the predicated block, and add
4582 // its operands to the worklist.
4583 I->moveBefore(&*PredBB->getFirstInsertionPt());
4584 Worklist.insert(I->op_begin(), I->op_end());
4585
4586 // The sinking may have enabled other instructions to be sunk, so we will
4587 // need to iterate.
4588 Changed = true;
4589 }
4590 } while (Changed);
4591 }
4592
4593 void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) {
4594 for (PHINode *OrigPhi : OrigPHIsToFix) {
4595 VPWidenPHIRecipe *VPPhi =
4596 cast<VPWidenPHIRecipe>(State.Plan->getVPValue(OrigPhi));
4597 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0));
4598 // Make sure the builder has a valid insert point.
4599 Builder.SetInsertPoint(NewPhi);
4600 for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) {
4601 VPValue *Inc = VPPhi->getIncomingValue(i);
4602 VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i);
4603 NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]);
4604 }
4605 }
4606 }
4607
4608 bool InnerLoopVectorizer::useOrderedReductions(RecurrenceDescriptor &RdxDesc) {
4609 return Cost->useOrderedReductions(RdxDesc);
4610 }
4611
4612 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPValue *VPDef,
4613 VPUser &Operands, unsigned UF,
4614 ElementCount VF, bool IsPtrLoopInvariant,
4615 SmallBitVector &IsIndexLoopInvariant,
4616 VPTransformState &State) {
4617 // Construct a vector GEP by widening the operands of the scalar GEP as
4618 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
4619 // results in a vector of pointers when at least one operand of the GEP
4620 // is vector-typed. Thus, to keep the representation compact, we only use
4621 // vector-typed operands for loop-varying values.
4622
4623 if (VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
4624 // If we are vectorizing, but the GEP has only loop-invariant operands,
4625 // the GEP we build (by only using vector-typed operands for
4626 // loop-varying values) would be a scalar pointer. Thus, to ensure we
4627 // produce a vector of pointers, we need to either arbitrarily pick an
4628 // operand to broadcast, or broadcast a clone of the original GEP.
4629 // Here, we broadcast a clone of the original.
4630 //
4631 // TODO: If at some point we decide to scalarize instructions having
4632 // loop-invariant operands, this special case will no longer be
4633 // required. We would add the scalarization decision to
4634 // collectLoopScalars() and teach getVectorValue() to broadcast
4635 // the lane-zero scalar value.
4636 auto *Clone = Builder.Insert(GEP->clone());
4637 for (unsigned Part = 0; Part < UF; ++Part) {
4638 Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
4639 State.set(VPDef, EntryPart, Part);
4640 addMetadata(EntryPart, GEP);
4641 }
4642 } else {
4643 // If the GEP has at least one loop-varying operand, we are sure to
4644 // produce a vector of pointers. But if we are only unrolling, we want
4645 // to produce a scalar GEP for each unroll part. Thus, the GEP we
4646 // produce with the code below will be scalar (if VF == 1) or vector
4647 // (otherwise). Note that for the unroll-only case, we still maintain
4648 // values in the vector mapping with initVector, as we do for other
4649 // instructions.
4650 for (unsigned Part = 0; Part < UF; ++Part) {
4651 // The pointer operand of the new GEP.
If it's loop-invariant, we 4652 // won't broadcast it. 4653 auto *Ptr = IsPtrLoopInvariant 4654 ? State.get(Operands.getOperand(0), VPIteration(0, 0)) 4655 : State.get(Operands.getOperand(0), Part); 4656 4657 // Collect all the indices for the new GEP. If any index is 4658 // loop-invariant, we won't broadcast it. 4659 SmallVector<Value *, 4> Indices; 4660 for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) { 4661 VPValue *Operand = Operands.getOperand(I); 4662 if (IsIndexLoopInvariant[I - 1]) 4663 Indices.push_back(State.get(Operand, VPIteration(0, 0))); 4664 else 4665 Indices.push_back(State.get(Operand, Part)); 4666 } 4667 4668 // Create the new GEP. Note that this GEP may be a scalar if VF == 1, 4669 // but it should be a vector, otherwise. 4670 auto *NewGEP = 4671 GEP->isInBounds() 4672 ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr, 4673 Indices) 4674 : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices); 4675 assert((VF.isScalar() || NewGEP->getType()->isVectorTy()) && 4676 "NewGEP is not a pointer vector"); 4677 State.set(VPDef, NewGEP, Part); 4678 addMetadata(NewGEP, GEP); 4679 } 4680 } 4681 } 4682 4683 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, 4684 VPWidenPHIRecipe *PhiR, 4685 VPTransformState &State) { 4686 PHINode *P = cast<PHINode>(PN); 4687 if (EnableVPlanNativePath) { 4688 // Currently we enter here in the VPlan-native path for non-induction 4689 // PHIs where all control flow is uniform. We simply widen these PHIs. 4690 // Create a vector phi with no operands - the vector phi operands will be 4691 // set at the end of vector code generation. 4692 Type *VecTy = (State.VF.isScalar()) 4693 ? PN->getType() 4694 : VectorType::get(PN->getType(), State.VF); 4695 Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi"); 4696 State.set(PhiR, VecPhi, 0); 4697 OrigPHIsToFix.push_back(P); 4698 4699 return; 4700 } 4701 4702 assert(PN->getParent() == OrigLoop->getHeader() && 4703 "Non-header phis should have been handled elsewhere"); 4704 4705 // In order to support recurrences we need to be able to vectorize Phi nodes. 4706 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 4707 // stage #1: We create a new vector PHI node with no incoming edges. We'll use 4708 // this value when we vectorize all of the instructions that use the PHI. 4709 4710 assert(!Legal->isReductionVariable(P) && 4711 "reductions should be handled elsewhere"); 4712 4713 setDebugLocFromInst(P); 4714 4715 // This PHINode must be an induction variable. 4716 // Make sure that we know about it. 4717 assert(Legal->getInductionVars().count(P) && "Not an induction variable"); 4718 4719 InductionDescriptor II = Legal->getInductionVars().lookup(P); 4720 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 4721 4722 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 4723 // which can be found from the original scalar operations. 4724 switch (II.getKind()) { 4725 case InductionDescriptor::IK_NoInduction: 4726 llvm_unreachable("Unknown induction"); 4727 case InductionDescriptor::IK_IntInduction: 4728 case InductionDescriptor::IK_FpInduction: 4729 llvm_unreachable("Integer/fp induction is handled elsewhere."); 4730 case InductionDescriptor::IK_PtrInduction: { 4731 // Handle the pointer induction variable case. 
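// If the pointer IV is scalar after vectorization, a scalar pointer (named
// "next.gep") is materialized per demanded lane below; e.g. for VF = 4,
// UF = 1 the lanes use offsets i, i+1, i+2 and i+3. Otherwise a "pointer.phi"
// plus wide per-part GEPs are created further down.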
4732 assert(P->getType()->isPointerTy() && "Unexpected type."); 4733 4734 if (Cost->isScalarAfterVectorization(P, State.VF)) { 4735 // This is the normalized GEP that starts counting at zero. 4736 Value *PtrInd = 4737 Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType()); 4738 // Determine the number of scalars we need to generate for each unroll 4739 // iteration. If the instruction is uniform, we only need to generate the 4740 // first lane. Otherwise, we generate all VF values. 4741 bool IsUniform = Cost->isUniformAfterVectorization(P, State.VF); 4742 unsigned Lanes = IsUniform ? 1 : State.VF.getKnownMinValue(); 4743 4744 bool NeedsVectorIndex = !IsUniform && VF.isScalable(); 4745 Value *UnitStepVec = nullptr, *PtrIndSplat = nullptr; 4746 if (NeedsVectorIndex) { 4747 Type *VecIVTy = VectorType::get(PtrInd->getType(), VF); 4748 UnitStepVec = Builder.CreateStepVector(VecIVTy); 4749 PtrIndSplat = Builder.CreateVectorSplat(VF, PtrInd); 4750 } 4751 4752 for (unsigned Part = 0; Part < UF; ++Part) { 4753 Value *PartStart = 4754 createStepForVF(Builder, PtrInd->getType(), VF, Part); 4755 4756 if (NeedsVectorIndex) { 4757 // Here we cache the whole vector, which means we can support the 4758 // extraction of any lane. However, in some cases the extractelement 4759 // instruction that is generated for scalar uses of this vector (e.g. 4760 // a load instruction) is not folded away. Therefore we still 4761 // calculate values for the first n lanes to avoid redundant moves 4762 // (when extracting the 0th element) and to produce scalar code (i.e. 4763 // additional add/gep instructions instead of expensive extractelement 4764 // instructions) when extracting higher-order elements. 4765 Value *PartStartSplat = Builder.CreateVectorSplat(VF, PartStart); 4766 Value *Indices = Builder.CreateAdd(PartStartSplat, UnitStepVec); 4767 Value *GlobalIndices = Builder.CreateAdd(PtrIndSplat, Indices); 4768 Value *SclrGep = 4769 emitTransformedIndex(Builder, GlobalIndices, PSE.getSE(), DL, II); 4770 SclrGep->setName("next.gep"); 4771 State.set(PhiR, SclrGep, Part); 4772 } 4773 4774 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 4775 Value *Idx = Builder.CreateAdd( 4776 PartStart, ConstantInt::get(PtrInd->getType(), Lane)); 4777 Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); 4778 Value *SclrGep = 4779 emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II); 4780 SclrGep->setName("next.gep"); 4781 State.set(PhiR, SclrGep, VPIteration(Part, Lane)); 4782 } 4783 } 4784 return; 4785 } 4786 assert(isa<SCEVConstant>(II.getStep()) && 4787 "Induction step not a SCEV constant!"); 4788 Type *PhiType = II.getStep()->getType(); 4789 4790 // Build a pointer phi 4791 Value *ScalarStartValue = II.getStartValue(); 4792 Type *ScStValueType = ScalarStartValue->getType(); 4793 PHINode *NewPointerPhi = 4794 PHINode::Create(ScStValueType, 2, "pointer.phi", Induction); 4795 NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader); 4796 4797 // A pointer induction, performed by using a gep 4798 BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 4799 Instruction *InductionLoc = LoopLatch->getTerminator(); 4800 const SCEV *ScalarStep = II.getStep(); 4801 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 4802 Value *ScalarStepValue = 4803 Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc); 4804 Value *RuntimeVF = getRuntimeVF(Builder, PhiType, VF); 4805 Value *NumUnrolledElems = 4806 Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF)); 4807 Value *InductionGEP = 
GetElementPtrInst::Create( 4808 II.getElementType(), NewPointerPhi, 4809 Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind", 4810 InductionLoc); 4811 NewPointerPhi->addIncoming(InductionGEP, LoopLatch); 4812 4813 // Create UF many actual address geps that use the pointer 4814 // phi as base and a vectorized version of the step value 4815 // (<step*0, ..., step*N>) as offset. 4816 for (unsigned Part = 0; Part < State.UF; ++Part) { 4817 Type *VecPhiType = VectorType::get(PhiType, State.VF); 4818 Value *StartOffsetScalar = 4819 Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part)); 4820 Value *StartOffset = 4821 Builder.CreateVectorSplat(State.VF, StartOffsetScalar); 4822 // Create a vector of consecutive numbers from zero to VF. 4823 StartOffset = 4824 Builder.CreateAdd(StartOffset, Builder.CreateStepVector(VecPhiType)); 4825 4826 Value *GEP = Builder.CreateGEP( 4827 II.getElementType(), NewPointerPhi, 4828 Builder.CreateMul( 4829 StartOffset, Builder.CreateVectorSplat(State.VF, ScalarStepValue), 4830 "vector.gep")); 4831 State.set(PhiR, GEP, Part); 4832 } 4833 } 4834 } 4835 } 4836 4837 /// A helper function for checking whether an integer division-related 4838 /// instruction may divide by zero (in which case it must be predicated if 4839 /// executed conditionally in the scalar code). 4840 /// TODO: It may be worthwhile to generalize and check isKnownNonZero(). 4841 /// Non-zero divisors that are non compile-time constants will not be 4842 /// converted into multiplication, so we will still end up scalarizing 4843 /// the division, but can do so w/o predication. 4844 static bool mayDivideByZero(Instruction &I) { 4845 assert((I.getOpcode() == Instruction::UDiv || 4846 I.getOpcode() == Instruction::SDiv || 4847 I.getOpcode() == Instruction::URem || 4848 I.getOpcode() == Instruction::SRem) && 4849 "Unexpected instruction"); 4850 Value *Divisor = I.getOperand(1); 4851 auto *CInt = dyn_cast<ConstantInt>(Divisor); 4852 return !CInt || CInt->isZero(); 4853 } 4854 4855 void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def, 4856 VPUser &User, 4857 VPTransformState &State) { 4858 switch (I.getOpcode()) { 4859 case Instruction::Call: 4860 case Instruction::Br: 4861 case Instruction::PHI: 4862 case Instruction::GetElementPtr: 4863 case Instruction::Select: 4864 llvm_unreachable("This instruction is handled by a different recipe."); 4865 case Instruction::UDiv: 4866 case Instruction::SDiv: 4867 case Instruction::SRem: 4868 case Instruction::URem: 4869 case Instruction::Add: 4870 case Instruction::FAdd: 4871 case Instruction::Sub: 4872 case Instruction::FSub: 4873 case Instruction::FNeg: 4874 case Instruction::Mul: 4875 case Instruction::FMul: 4876 case Instruction::FDiv: 4877 case Instruction::FRem: 4878 case Instruction::Shl: 4879 case Instruction::LShr: 4880 case Instruction::AShr: 4881 case Instruction::And: 4882 case Instruction::Or: 4883 case Instruction::Xor: { 4884 // Just widen unops and binops. 4885 setDebugLocFromInst(&I); 4886 4887 for (unsigned Part = 0; Part < UF; ++Part) { 4888 SmallVector<Value *, 2> Ops; 4889 for (VPValue *VPOp : User.operands()) 4890 Ops.push_back(State.get(VPOp, Part)); 4891 4892 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); 4893 4894 if (auto *VecOp = dyn_cast<Instruction>(V)) 4895 VecOp->copyIRFlags(&I); 4896 4897 // Use this vector value for all users of the original instruction. 
4898 State.set(Def, V, Part); 4899 addMetadata(V, &I); 4900 } 4901 4902 break; 4903 } 4904 case Instruction::ICmp: 4905 case Instruction::FCmp: { 4906 // Widen compares. Generate vector compares. 4907 bool FCmp = (I.getOpcode() == Instruction::FCmp); 4908 auto *Cmp = cast<CmpInst>(&I); 4909 setDebugLocFromInst(Cmp); 4910 for (unsigned Part = 0; Part < UF; ++Part) { 4911 Value *A = State.get(User.getOperand(0), Part); 4912 Value *B = State.get(User.getOperand(1), Part); 4913 Value *C = nullptr; 4914 if (FCmp) { 4915 // Propagate fast math flags. 4916 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 4917 Builder.setFastMathFlags(Cmp->getFastMathFlags()); 4918 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); 4919 } else { 4920 C = Builder.CreateICmp(Cmp->getPredicate(), A, B); 4921 } 4922 State.set(Def, C, Part); 4923 addMetadata(C, &I); 4924 } 4925 4926 break; 4927 } 4928 4929 case Instruction::ZExt: 4930 case Instruction::SExt: 4931 case Instruction::FPToUI: 4932 case Instruction::FPToSI: 4933 case Instruction::FPExt: 4934 case Instruction::PtrToInt: 4935 case Instruction::IntToPtr: 4936 case Instruction::SIToFP: 4937 case Instruction::UIToFP: 4938 case Instruction::Trunc: 4939 case Instruction::FPTrunc: 4940 case Instruction::BitCast: { 4941 auto *CI = cast<CastInst>(&I); 4942 setDebugLocFromInst(CI); 4943 4944 /// Vectorize casts. 4945 Type *DestTy = 4946 (VF.isScalar()) ? CI->getType() : VectorType::get(CI->getType(), VF); 4947 4948 for (unsigned Part = 0; Part < UF; ++Part) { 4949 Value *A = State.get(User.getOperand(0), Part); 4950 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); 4951 State.set(Def, Cast, Part); 4952 addMetadata(Cast, &I); 4953 } 4954 break; 4955 } 4956 default: 4957 // This instruction is not vectorized by simple widening. 4958 LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); 4959 llvm_unreachable("Unhandled instruction!"); 4960 } // end of switch. 4961 } 4962 4963 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, 4964 VPUser &ArgOperands, 4965 VPTransformState &State) { 4966 assert(!isa<DbgInfoIntrinsic>(I) && 4967 "DbgInfoIntrinsic should have been dropped during VPlan construction"); 4968 setDebugLocFromInst(&I); 4969 4970 Module *M = I.getParent()->getParent()->getParent(); 4971 auto *CI = cast<CallInst>(&I); 4972 4973 SmallVector<Type *, 4> Tys; 4974 for (Value *ArgOperand : CI->args()) 4975 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue())); 4976 4977 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 4978 4979 // The flag shows whether we use Intrinsic or a usual Call for vectorized 4980 // version of the instruction. 4981 // Is it beneficial to perform intrinsic call compared to lib call? 4982 bool NeedToScalarize = false; 4983 InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); 4984 InstructionCost IntrinsicCost = ID ? Cost->getVectorIntrinsicCost(CI, VF) : 0; 4985 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 4986 assert((UseVectorIntrinsic || !NeedToScalarize) && 4987 "Instruction should be scalarized elsewhere."); 4988 assert((IntrinsicCost.isValid() || CallCost.isValid()) && 4989 "Either the intrinsic cost or vector call cost must be valid"); 4990 4991 for (unsigned Part = 0; Part < UF; ++Part) { 4992 SmallVector<Type *, 2> TysForDecl = {CI->getType()}; 4993 SmallVector<Value *, 4> Args; 4994 for (auto &I : enumerate(ArgOperands.operands())) { 4995 // Some intrinsics have a scalar argument - don't replace it with a 4996 // vector. 
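// (For instance, the i32 exponent of llvm.powi must stay scalar; when such a
// scalar operand is also overloaded, its type is appended to TysForDecl
// below.)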
4997 Value *Arg; 4998 if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index())) 4999 Arg = State.get(I.value(), Part); 5000 else { 5001 Arg = State.get(I.value(), VPIteration(0, 0)); 5002 if (hasVectorInstrinsicOverloadedScalarOpd(ID, I.index())) 5003 TysForDecl.push_back(Arg->getType()); 5004 } 5005 Args.push_back(Arg); 5006 } 5007 5008 Function *VectorF; 5009 if (UseVectorIntrinsic) { 5010 // Use vector version of the intrinsic. 5011 if (VF.isVector()) 5012 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); 5013 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); 5014 assert(VectorF && "Can't retrieve vector intrinsic."); 5015 } else { 5016 // Use vector version of the function call. 5017 const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 5018 #ifndef NDEBUG 5019 assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr && 5020 "Can't create vector function."); 5021 #endif 5022 VectorF = VFDatabase(*CI).getVectorizedFunction(Shape); 5023 } 5024 SmallVector<OperandBundleDef, 1> OpBundles; 5025 CI->getOperandBundlesAsDefs(OpBundles); 5026 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles); 5027 5028 if (isa<FPMathOperator>(V)) 5029 V->copyFastMathFlags(CI); 5030 5031 State.set(Def, V, Part); 5032 addMetadata(V, &I); 5033 } 5034 } 5035 5036 void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, VPValue *VPDef, 5037 VPUser &Operands, 5038 bool InvariantCond, 5039 VPTransformState &State) { 5040 setDebugLocFromInst(&I); 5041 5042 // The condition can be loop invariant but still defined inside the 5043 // loop. This means that we can't just use the original 'cond' value. 5044 // We have to take the 'vectorized' value and pick the first lane. 5045 // Instcombine will make this a no-op. 5046 auto *InvarCond = InvariantCond 5047 ? State.get(Operands.getOperand(0), VPIteration(0, 0)) 5048 : nullptr; 5049 5050 for (unsigned Part = 0; Part < UF; ++Part) { 5051 Value *Cond = 5052 InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part); 5053 Value *Op0 = State.get(Operands.getOperand(1), Part); 5054 Value *Op1 = State.get(Operands.getOperand(2), Part); 5055 Value *Sel = Builder.CreateSelect(Cond, Op0, Op1); 5056 State.set(VPDef, Sel, Part); 5057 addMetadata(Sel, &I); 5058 } 5059 } 5060 5061 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { 5062 // We should not collect Scalars more than once per VF. Right now, this 5063 // function is called from collectUniformsAndScalars(), which already does 5064 // this check. Collecting Scalars for VF=1 does not make any sense. 5065 assert(VF.isVector() && Scalars.find(VF) == Scalars.end() && 5066 "This function should not be visited twice for the same VF"); 5067 5068 SmallSetVector<Instruction *, 8> Worklist; 5069 5070 // These sets are used to seed the analysis with pointers used by memory 5071 // accesses that will remain scalar. 5072 SmallSetVector<Instruction *, 8> ScalarPtrs; 5073 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; 5074 auto *Latch = TheLoop->getLoopLatch(); 5075 5076 // A helper that returns true if the use of Ptr by MemAccess will be scalar. 5077 // The pointer operands of loads and stores will be scalar as long as the 5078 // memory access is not a gather or scatter operation. The value operand of a 5079 // store will remain scalar if the store is scalarized. 
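// For example, a store whose widening decision is CM_Scalarize keeps its
// value operand scalar, whereas a CM_GatherScatter access needs a vector of
// pointers and therefore does not give its pointer operand a scalar use.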
5080 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) { 5081 InstWidening WideningDecision = getWideningDecision(MemAccess, VF); 5082 assert(WideningDecision != CM_Unknown && 5083 "Widening decision should be ready at this moment"); 5084 if (auto *Store = dyn_cast<StoreInst>(MemAccess)) 5085 if (Ptr == Store->getValueOperand()) 5086 return WideningDecision == CM_Scalarize; 5087 assert(Ptr == getLoadStorePointerOperand(MemAccess) && 5088 "Ptr is neither a value or pointer operand"); 5089 return WideningDecision != CM_GatherScatter; 5090 }; 5091 5092 // A helper that returns true if the given value is a bitcast or 5093 // getelementptr instruction contained in the loop. 5094 auto isLoopVaryingBitCastOrGEP = [&](Value *V) { 5095 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) || 5096 isa<GetElementPtrInst>(V)) && 5097 !TheLoop->isLoopInvariant(V); 5098 }; 5099 5100 auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) { 5101 if (!isa<PHINode>(Ptr) || 5102 !Legal->getInductionVars().count(cast<PHINode>(Ptr))) 5103 return false; 5104 auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)]; 5105 if (Induction.getKind() != InductionDescriptor::IK_PtrInduction) 5106 return false; 5107 return isScalarUse(MemAccess, Ptr); 5108 }; 5109 5110 // A helper that evaluates a memory access's use of a pointer. If the 5111 // pointer is actually the pointer induction of a loop, it is being 5112 // inserted into Worklist. If the use will be a scalar use, and the 5113 // pointer is only used by memory accesses, we place the pointer in 5114 // ScalarPtrs. Otherwise, the pointer is placed in PossibleNonScalarPtrs. 5115 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) { 5116 if (isScalarPtrInduction(MemAccess, Ptr)) { 5117 Worklist.insert(cast<Instruction>(Ptr)); 5118 LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr 5119 << "\n"); 5120 5121 Instruction *Update = cast<Instruction>( 5122 cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch)); 5123 5124 // If there is more than one user of Update (Ptr), we shouldn't assume it 5125 // will be scalar after vectorisation as other users of the instruction 5126 // may require widening. Otherwise, add it to ScalarPtrs. 5127 if (Update->hasOneUse() && cast<Value>(*Update->user_begin()) == Ptr) { 5128 ScalarPtrs.insert(Update); 5129 return; 5130 } 5131 } 5132 // We only care about bitcast and getelementptr instructions contained in 5133 // the loop. 5134 if (!isLoopVaryingBitCastOrGEP(Ptr)) 5135 return; 5136 5137 // If the pointer has already been identified as scalar (e.g., if it was 5138 // also identified as uniform), there's nothing to do. 5139 auto *I = cast<Instruction>(Ptr); 5140 if (Worklist.count(I)) 5141 return; 5142 5143 // If the use of the pointer will be a scalar use, and all users of the 5144 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise, 5145 // place the pointer in PossibleNonScalarPtrs. 5146 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) { 5147 return isa<LoadInst>(U) || isa<StoreInst>(U); 5148 })) 5149 ScalarPtrs.insert(I); 5150 else 5151 PossibleNonScalarPtrs.insert(I); 5152 }; 5153 5154 // We seed the scalars analysis with three classes of instructions: (1) 5155 // instructions marked uniform-after-vectorization and (2) bitcast, 5156 // getelementptr and (pointer) phi instructions used by memory accesses 5157 // requiring a scalar use. 
5158 // 5159 // (1) Add to the worklist all instructions that have been identified as 5160 // uniform-after-vectorization. 5161 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end()); 5162 5163 // (2) Add to the worklist all bitcast and getelementptr instructions used by 5164 // memory accesses requiring a scalar use. The pointer operands of loads and 5165 // stores will be scalar as long as the memory accesses is not a gather or 5166 // scatter operation. The value operand of a store will remain scalar if the 5167 // store is scalarized. 5168 for (auto *BB : TheLoop->blocks()) 5169 for (auto &I : *BB) { 5170 if (auto *Load = dyn_cast<LoadInst>(&I)) { 5171 evaluatePtrUse(Load, Load->getPointerOperand()); 5172 } else if (auto *Store = dyn_cast<StoreInst>(&I)) { 5173 evaluatePtrUse(Store, Store->getPointerOperand()); 5174 evaluatePtrUse(Store, Store->getValueOperand()); 5175 } 5176 } 5177 for (auto *I : ScalarPtrs) 5178 if (!PossibleNonScalarPtrs.count(I)) { 5179 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n"); 5180 Worklist.insert(I); 5181 } 5182 5183 // Insert the forced scalars. 5184 // FIXME: Currently widenPHIInstruction() often creates a dead vector 5185 // induction variable when the PHI user is scalarized. 5186 auto ForcedScalar = ForcedScalars.find(VF); 5187 if (ForcedScalar != ForcedScalars.end()) 5188 for (auto *I : ForcedScalar->second) 5189 Worklist.insert(I); 5190 5191 // Expand the worklist by looking through any bitcasts and getelementptr 5192 // instructions we've already identified as scalar. This is similar to the 5193 // expansion step in collectLoopUniforms(); however, here we're only 5194 // expanding to include additional bitcasts and getelementptr instructions. 5195 unsigned Idx = 0; 5196 while (Idx != Worklist.size()) { 5197 Instruction *Dst = Worklist[Idx++]; 5198 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0))) 5199 continue; 5200 auto *Src = cast<Instruction>(Dst->getOperand(0)); 5201 if (llvm::all_of(Src->users(), [&](User *U) -> bool { 5202 auto *J = cast<Instruction>(U); 5203 return !TheLoop->contains(J) || Worklist.count(J) || 5204 ((isa<LoadInst>(J) || isa<StoreInst>(J)) && 5205 isScalarUse(J, Src)); 5206 })) { 5207 Worklist.insert(Src); 5208 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); 5209 } 5210 } 5211 5212 // An induction variable will remain scalar if all users of the induction 5213 // variable and induction variable update remain scalar. 5214 for (auto &Induction : Legal->getInductionVars()) { 5215 auto *Ind = Induction.first; 5216 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 5217 5218 // If tail-folding is applied, the primary induction variable will be used 5219 // to feed a vector compare. 5220 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking()) 5221 continue; 5222 5223 // Determine if all users of the induction variable are scalar after 5224 // vectorization. 5225 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 5226 auto *I = cast<Instruction>(U); 5227 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I); 5228 }); 5229 if (!ScalarInd) 5230 continue; 5231 5232 // Determine if all users of the induction variable update instruction are 5233 // scalar after vectorization. 
5234 auto ScalarIndUpdate = 5235 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 5236 auto *I = cast<Instruction>(U); 5237 return I == Ind || !TheLoop->contains(I) || Worklist.count(I); 5238 }); 5239 if (!ScalarIndUpdate) 5240 continue; 5241 5242 // The induction variable and its update instruction will remain scalar. 5243 Worklist.insert(Ind); 5244 Worklist.insert(IndUpdate); 5245 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 5246 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 5247 << "\n"); 5248 } 5249 5250 Scalars[VF].insert(Worklist.begin(), Worklist.end()); 5251 } 5252 5253 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I) const { 5254 if (!blockNeedsPredication(I->getParent())) 5255 return false; 5256 switch(I->getOpcode()) { 5257 default: 5258 break; 5259 case Instruction::Load: 5260 case Instruction::Store: { 5261 if (!Legal->isMaskRequired(I)) 5262 return false; 5263 auto *Ptr = getLoadStorePointerOperand(I); 5264 auto *Ty = getLoadStoreType(I); 5265 const Align Alignment = getLoadStoreAlignment(I); 5266 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) || 5267 TTI.isLegalMaskedGather(Ty, Alignment)) 5268 : !(isLegalMaskedStore(Ty, Ptr, Alignment) || 5269 TTI.isLegalMaskedScatter(Ty, Alignment)); 5270 } 5271 case Instruction::UDiv: 5272 case Instruction::SDiv: 5273 case Instruction::SRem: 5274 case Instruction::URem: 5275 return mayDivideByZero(*I); 5276 } 5277 return false; 5278 } 5279 5280 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( 5281 Instruction *I, ElementCount VF) { 5282 assert(isAccessInterleaved(I) && "Expecting interleaved access."); 5283 assert(getWideningDecision(I, VF) == CM_Unknown && 5284 "Decision should not be set yet."); 5285 auto *Group = getInterleavedAccessGroup(I); 5286 assert(Group && "Must have a group."); 5287 5288 // If the instruction's allocated size doesn't equal it's type size, it 5289 // requires padding and will be scalarized. 5290 auto &DL = I->getModule()->getDataLayout(); 5291 auto *ScalarTy = getLoadStoreType(I); 5292 if (hasIrregularType(ScalarTy, DL)) 5293 return false; 5294 5295 // Check if masking is required. 5296 // A Group may need masking for one of two reasons: it resides in a block that 5297 // needs predication, or it was decided to use masking to deal with gaps 5298 // (either a gap at the end of a load-access that may result in a speculative 5299 // load, or any gaps in a store-access). 5300 bool PredicatedAccessRequiresMasking = 5301 blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I); 5302 bool LoadAccessWithGapsRequiresEpilogMasking = 5303 isa<LoadInst>(I) && Group->requiresScalarEpilogue() && 5304 !isScalarEpilogueAllowed(); 5305 bool StoreAccessWithGapsRequiresMasking = 5306 isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()); 5307 if (!PredicatedAccessRequiresMasking && 5308 !LoadAccessWithGapsRequiresEpilogMasking && 5309 !StoreAccessWithGapsRequiresMasking) 5310 return true; 5311 5312 // If masked interleaving is required, we expect that the user/target had 5313 // enabled it, because otherwise it either wouldn't have been created or 5314 // it should have been invalidated by the CostModel. 
5315 assert(useMaskedInterleavedAccesses(TTI) && 5316 "Masked interleave-groups for predicated accesses are not enabled."); 5317 5318 if (Group->isReverse()) 5319 return false; 5320 5321 auto *Ty = getLoadStoreType(I); 5322 const Align Alignment = getLoadStoreAlignment(I); 5323 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment) 5324 : TTI.isLegalMaskedStore(Ty, Alignment); 5325 } 5326 5327 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( 5328 Instruction *I, ElementCount VF) { 5329 // Get and ensure we have a valid memory instruction. 5330 assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction"); 5331 5332 auto *Ptr = getLoadStorePointerOperand(I); 5333 auto *ScalarTy = getLoadStoreType(I); 5334 5335 // In order to be widened, the pointer should be consecutive, first of all. 5336 if (!Legal->isConsecutivePtr(ScalarTy, Ptr)) 5337 return false; 5338 5339 // If the instruction is a store located in a predicated block, it will be 5340 // scalarized. 5341 if (isScalarWithPredication(I)) 5342 return false; 5343 5344 // If the instruction's allocated size doesn't equal it's type size, it 5345 // requires padding and will be scalarized. 5346 auto &DL = I->getModule()->getDataLayout(); 5347 if (hasIrregularType(ScalarTy, DL)) 5348 return false; 5349 5350 return true; 5351 } 5352 5353 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { 5354 // We should not collect Uniforms more than once per VF. Right now, 5355 // this function is called from collectUniformsAndScalars(), which 5356 // already does this check. Collecting Uniforms for VF=1 does not make any 5357 // sense. 5358 5359 assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() && 5360 "This function should not be visited twice for the same VF"); 5361 5362 // Visit the list of Uniforms. If we'll not find any uniform value, we'll 5363 // not analyze again. Uniforms.count(VF) will return 1. 5364 Uniforms[VF].clear(); 5365 5366 // We now know that the loop is vectorizable! 5367 // Collect instructions inside the loop that will remain uniform after 5368 // vectorization. 5369 5370 // Global values, params and instructions outside of current loop are out of 5371 // scope. 5372 auto isOutOfScope = [&](Value *V) -> bool { 5373 Instruction *I = dyn_cast<Instruction>(V); 5374 return (!I || !TheLoop->contains(I)); 5375 }; 5376 5377 // Worklist containing uniform instructions demanding lane 0. 5378 SetVector<Instruction *> Worklist; 5379 BasicBlock *Latch = TheLoop->getLoopLatch(); 5380 5381 // Add uniform instructions demanding lane 0 to the worklist. Instructions 5382 // that are scalar with predication must not be considered uniform after 5383 // vectorization, because that would create an erroneous replicating region 5384 // where only a single instance out of VF should be formed. 5385 // TODO: optimize such seldom cases if found important, see PR40816. 5386 auto addToWorklistIfAllowed = [&](Instruction *I) -> void { 5387 if (isOutOfScope(I)) { 5388 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: " 5389 << *I << "\n"); 5390 return; 5391 } 5392 if (isScalarWithPredication(I)) { 5393 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " 5394 << *I << "\n"); 5395 return; 5396 } 5397 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n"); 5398 Worklist.insert(I); 5399 }; 5400 5401 // Start with the conditional branch. If the branch condition is an 5402 // instruction contained in the loop that is only used by the branch, it is 5403 // uniform. 
5404 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 5405 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) 5406 addToWorklistIfAllowed(Cmp); 5407 5408 auto isUniformDecision = [&](Instruction *I, ElementCount VF) { 5409 InstWidening WideningDecision = getWideningDecision(I, VF); 5410 assert(WideningDecision != CM_Unknown && 5411 "Widening decision should be ready at this moment"); 5412 5413 // A uniform memory op is itself uniform. 5414 if (Legal->isUniformMemOp(*I)) { 5415 assert(WideningDecision == CM_Scalarize); 5416 return true; 5417 } 5418 5419 return (WideningDecision == CM_Widen || 5420 WideningDecision == CM_Widen_Reverse || 5421 WideningDecision == CM_Interleave); 5422 }; 5423 5424 5425 // Returns true if Ptr is the pointer operand of a memory access instruction 5426 // I, and I is known to not require scalarization. 5427 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { 5428 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); 5429 }; 5430 5431 // Holds a list of values which are known to have at least one uniform use. 5432 // Note that there may be other uses which aren't uniform. A "uniform use" 5433 // here is something which only demands lane 0 of the unrolled iterations; 5434 // it does not imply that all lanes produce the same value (e.g. this is not 5435 // the usual meaning of uniform) 5436 SetVector<Value *> HasUniformUse; 5437 5438 // Scan the loop for instructions which are either a) known to have only 5439 // lane 0 or the last lane demanded or b) are uses which demand only 5440 // lane 0 of their operand. 5441 for (auto *BB : TheLoop->blocks()) 5442 for (auto &I : *BB) { 5443 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) { 5444 switch (II->getIntrinsicID()) { 5445 case Intrinsic::sideeffect: 5446 case Intrinsic::experimental_noalias_scope_decl: 5447 case Intrinsic::assume: 5448 case Intrinsic::lifetime_start: 5449 case Intrinsic::lifetime_end: 5450 if (TheLoop->hasLoopInvariantOperands(&I)) 5451 addToWorklistIfAllowed(&I); 5452 break; 5453 default: 5454 break; 5455 } 5456 } 5457 5458 // ExtractValue instructions must be uniform, because the operands are 5459 // known to be loop-invariant. 5460 if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) { 5461 assert(isOutOfScope(EVI->getAggregateOperand()) && 5462 "Expected aggregate value to be loop invariant"); 5463 addToWorklistIfAllowed(EVI); 5464 continue; 5465 } 5466 5467 // If there's no pointer operand, there's nothing to do. 5468 auto *Ptr = getLoadStorePointerOperand(&I); 5469 if (!Ptr) 5470 continue; 5471 5472 // A uniform memory op is itself uniform. Load instructions are added 5473 // to the worklist as they demand the first lane. Since store instructions 5474 // demand the last lane, we instead add these to Uniforms only. 5475 if (Legal->isUniformMemOp(I)) { 5476 if (isa<LoadInst>(I)) 5477 addToWorklistIfAllowed(&I); 5478 else if (!isOutOfScope(&I) && !isScalarWithPredication(&I)) 5479 Uniforms[VF].insert(&I); 5480 } 5481 5482 if (isUniformDecision(&I, VF)) { 5483 assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check"); 5484 HasUniformUse.insert(Ptr); 5485 } 5486 } 5487 5488 // Add to the worklist any operands which have *only* uniform (e.g. lane 0 5489 // demanding) users. Since loops are assumed to be in LCSSA form, this 5490 // disallows uses outside the loop as well. 
5491 for (auto *V : HasUniformUse) { 5492 if (isOutOfScope(V)) 5493 continue; 5494 auto *I = cast<Instruction>(V); 5495 auto UsersAreMemAccesses = 5496 llvm::all_of(I->users(), [&](User *U) -> bool { 5497 return isVectorizedMemAccessUse(cast<Instruction>(U), V); 5498 }); 5499 if (UsersAreMemAccesses) 5500 addToWorklistIfAllowed(I); 5501 } 5502 5503 // Expand Worklist in topological order: whenever a new instruction 5504 // is added , its users should be already inside Worklist. It ensures 5505 // a uniform instruction will only be used by uniform instructions. 5506 unsigned idx = 0; 5507 while (idx != Worklist.size()) { 5508 Instruction *I = Worklist[idx++]; 5509 5510 for (auto OV : I->operand_values()) { 5511 // isOutOfScope operands cannot be uniform instructions. 5512 if (isOutOfScope(OV)) 5513 continue; 5514 // First order recurrence Phi's should typically be considered 5515 // non-uniform. 5516 auto *OP = dyn_cast<PHINode>(OV); 5517 if (OP && Legal->isFirstOrderRecurrence(OP)) 5518 continue; 5519 // If all the users of the operand are uniform, then add the 5520 // operand into the uniform worklist. 5521 auto *OI = cast<Instruction>(OV); 5522 if (llvm::all_of(OI->users(), [&](User *U) -> bool { 5523 auto *J = cast<Instruction>(U); 5524 return Worklist.count(J) || isVectorizedMemAccessUse(J, OI); 5525 })) 5526 addToWorklistIfAllowed(OI); 5527 } 5528 } 5529 5530 // For an instruction to be added into Worklist above, all its users inside 5531 // the loop should also be in Worklist. However, this condition cannot be 5532 // true for phi nodes that form a cyclic dependence. We must process phi 5533 // nodes separately. An induction variable will remain uniform if all users 5534 // of the induction variable and induction variable update remain uniform. 5535 // The code below handles both pointer and non-pointer induction variables. 5536 for (auto &Induction : Legal->getInductionVars()) { 5537 auto *Ind = Induction.first; 5538 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 5539 5540 // Determine if all users of the induction variable are uniform after 5541 // vectorization. 5542 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 5543 auto *I = cast<Instruction>(U); 5544 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 5545 isVectorizedMemAccessUse(I, Ind); 5546 }); 5547 if (!UniformInd) 5548 continue; 5549 5550 // Determine if all users of the induction variable update instruction are 5551 // uniform after vectorization. 5552 auto UniformIndUpdate = 5553 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 5554 auto *I = cast<Instruction>(U); 5555 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 5556 isVectorizedMemAccessUse(I, IndUpdate); 5557 }); 5558 if (!UniformIndUpdate) 5559 continue; 5560 5561 // The induction variable and its update instruction will remain uniform. 5562 addToWorklistIfAllowed(Ind); 5563 addToWorklistIfAllowed(IndUpdate); 5564 } 5565 5566 Uniforms[VF].insert(Worklist.begin(), Worklist.end()); 5567 } 5568 5569 bool LoopVectorizationCostModel::runtimeChecksRequired() { 5570 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); 5571 5572 if (Legal->getRuntimePointerChecking()->Need) { 5573 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz", 5574 "runtime pointer checks needed. 
Enable vectorization of this " 5575 "loop with '#pragma clang loop vectorize(enable)' when " 5576 "compiling with -Os/-Oz", 5577 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5578 return true; 5579 } 5580 5581 if (!PSE.getUnionPredicate().getPredicates().empty()) { 5582 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz", 5583 "runtime SCEV checks needed. Enable vectorization of this " 5584 "loop with '#pragma clang loop vectorize(enable)' when " 5585 "compiling with -Os/-Oz", 5586 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5587 return true; 5588 } 5589 5590 // FIXME: Avoid specializing for stride==1 instead of bailing out. 5591 if (!Legal->getLAI()->getSymbolicStrides().empty()) { 5592 reportVectorizationFailure("Runtime stride check for small trip count", 5593 "runtime stride == 1 checks needed. Enable vectorization of " 5594 "this loop without such check by compiling with -Os/-Oz", 5595 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5596 return true; 5597 } 5598 5599 return false; 5600 } 5601 5602 ElementCount 5603 LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) { 5604 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) 5605 return ElementCount::getScalable(0); 5606 5607 if (Hints->isScalableVectorizationDisabled()) { 5608 reportVectorizationInfo("Scalable vectorization is explicitly disabled", 5609 "ScalableVectorizationDisabled", ORE, TheLoop); 5610 return ElementCount::getScalable(0); 5611 } 5612 5613 LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n"); 5614 5615 auto MaxScalableVF = ElementCount::getScalable( 5616 std::numeric_limits<ElementCount::ScalarTy>::max()); 5617 5618 // Test that the loop-vectorizer can legalize all operations for this MaxVF. 5619 // FIXME: While for scalable vectors this is currently sufficient, this should 5620 // be replaced by a more detailed mechanism that filters out specific VFs, 5621 // instead of invalidating vectorization for a whole set of VFs based on the 5622 // MaxVF. 5623 5624 // Disable scalable vectorization if the loop contains unsupported reductions. 5625 if (!canVectorizeReductions(MaxScalableVF)) { 5626 reportVectorizationInfo( 5627 "Scalable vectorization not supported for the reduction " 5628 "operations found in this loop.", 5629 "ScalableVFUnfeasible", ORE, TheLoop); 5630 return ElementCount::getScalable(0); 5631 } 5632 5633 // Disable scalable vectorization if the loop contains any instructions 5634 // with element types not supported for scalable vectors. 5635 if (any_of(ElementTypesInLoop, [&](Type *Ty) { 5636 return !Ty->isVoidTy() && 5637 !this->TTI.isElementTypeLegalForScalableVector(Ty); 5638 })) { 5639 reportVectorizationInfo("Scalable vectorization is not supported " 5640 "for all element types found in this loop.", 5641 "ScalableVFUnfeasible", ORE, TheLoop); 5642 return ElementCount::getScalable(0); 5643 } 5644 5645 if (Legal->isSafeForAnyVectorWidth()) 5646 return MaxScalableVF; 5647 5648 // Limit MaxScalableVF by the maximum safe dependence distance. 5649 Optional<unsigned> MaxVScale = TTI.getMaxVScale(); 5650 if (!MaxVScale && TheFunction->hasFnAttribute(Attribute::VScaleRange)) { 5651 unsigned VScaleMax = TheFunction->getFnAttribute(Attribute::VScaleRange) 5652 .getVScaleRangeArgs() 5653 .second; 5654 if (VScaleMax > 0) 5655 MaxVScale = VScaleMax; 5656 } 5657 MaxScalableVF = ElementCount::getScalable( 5658 MaxVScale ? 
(MaxSafeElements / MaxVScale.getValue()) : 0); 5659 if (!MaxScalableVF) 5660 reportVectorizationInfo( 5661 "Max legal vector width too small, scalable vectorization " 5662 "unfeasible.", 5663 "ScalableVFUnfeasible", ORE, TheLoop); 5664 5665 return MaxScalableVF; 5666 } 5667 5668 FixedScalableVFPair 5669 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount, 5670 ElementCount UserVF) { 5671 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); 5672 unsigned SmallestType, WidestType; 5673 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); 5674 5675 // Get the maximum safe dependence distance in bits computed by LAA. 5676 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from 5677 // the memory accesses that is most restrictive (involved in the smallest 5678 // dependence distance). 5679 unsigned MaxSafeElements = 5680 PowerOf2Floor(Legal->getMaxSafeVectorWidthInBits() / WidestType); 5681 5682 auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements); 5683 auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements); 5684 5685 LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF 5686 << ".\n"); 5687 LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF 5688 << ".\n"); 5689 5690 // First analyze the UserVF, fall back if the UserVF should be ignored. 5691 if (UserVF) { 5692 auto MaxSafeUserVF = 5693 UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF; 5694 5695 if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) { 5696 // If `VF=vscale x N` is safe, then so is `VF=N` 5697 if (UserVF.isScalable()) 5698 return FixedScalableVFPair( 5699 ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF); 5700 else 5701 return UserVF; 5702 } 5703 5704 assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF)); 5705 5706 // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it 5707 // is better to ignore the hint and let the compiler choose a suitable VF. 5708 if (!UserVF.isScalable()) { 5709 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 5710 << " is unsafe, clamping to max safe VF=" 5711 << MaxSafeFixedVF << ".\n"); 5712 ORE->emit([&]() { 5713 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 5714 TheLoop->getStartLoc(), 5715 TheLoop->getHeader()) 5716 << "User-specified vectorization factor " 5717 << ore::NV("UserVectorizationFactor", UserVF) 5718 << " is unsafe, clamping to maximum safe vectorization factor " 5719 << ore::NV("VectorizationFactor", MaxSafeFixedVF); 5720 }); 5721 return MaxSafeFixedVF; 5722 } 5723 5724 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) { 5725 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 5726 << " is ignored because scalable vectors are not " 5727 "available.\n"); 5728 ORE->emit([&]() { 5729 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 5730 TheLoop->getStartLoc(), 5731 TheLoop->getHeader()) 5732 << "User-specified vectorization factor " 5733 << ore::NV("UserVectorizationFactor", UserVF) 5734 << " is ignored because the target does not support scalable " 5735 "vectors. The compiler will pick a more suitable value."; 5736 }); 5737 } else { 5738 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 5739 << " is unsafe. 
Ignoring scalable UserVF.\n"); 5740 ORE->emit([&]() { 5741 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 5742 TheLoop->getStartLoc(), 5743 TheLoop->getHeader()) 5744 << "User-specified vectorization factor " 5745 << ore::NV("UserVectorizationFactor", UserVF) 5746 << " is unsafe. Ignoring the hint to let the compiler pick a " 5747 "more suitable value."; 5748 }); 5749 } 5750 } 5751 5752 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType 5753 << " / " << WidestType << " bits.\n"); 5754 5755 FixedScalableVFPair Result(ElementCount::getFixed(1), 5756 ElementCount::getScalable(0)); 5757 if (auto MaxVF = getMaximizedVFForTarget(ConstTripCount, SmallestType, 5758 WidestType, MaxSafeFixedVF)) 5759 Result.FixedVF = MaxVF; 5760 5761 if (auto MaxVF = getMaximizedVFForTarget(ConstTripCount, SmallestType, 5762 WidestType, MaxSafeScalableVF)) 5763 if (MaxVF.isScalable()) { 5764 Result.ScalableVF = MaxVF; 5765 LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF 5766 << "\n"); 5767 } 5768 5769 return Result; 5770 } 5771 5772 FixedScalableVFPair 5773 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { 5774 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { 5775 // TODO: It may by useful to do since it's still likely to be dynamically 5776 // uniform if the target can skip. 5777 reportVectorizationFailure( 5778 "Not inserting runtime ptr check for divergent target", 5779 "runtime pointer checks needed. Not enabled for divergent target", 5780 "CantVersionLoopWithDivergentTarget", ORE, TheLoop); 5781 return FixedScalableVFPair::getNone(); 5782 } 5783 5784 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); 5785 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); 5786 if (TC == 1) { 5787 reportVectorizationFailure("Single iteration (non) loop", 5788 "loop trip count is one, irrelevant for vectorization", 5789 "SingleIterationLoop", ORE, TheLoop); 5790 return FixedScalableVFPair::getNone(); 5791 } 5792 5793 switch (ScalarEpilogueStatus) { 5794 case CM_ScalarEpilogueAllowed: 5795 return computeFeasibleMaxVF(TC, UserVF); 5796 case CM_ScalarEpilogueNotAllowedUsePredicate: 5797 LLVM_FALLTHROUGH; 5798 case CM_ScalarEpilogueNotNeededUsePredicate: 5799 LLVM_DEBUG( 5800 dbgs() << "LV: vector predicate hint/switch found.\n" 5801 << "LV: Not allowing scalar epilogue, creating predicated " 5802 << "vector loop.\n"); 5803 break; 5804 case CM_ScalarEpilogueNotAllowedLowTripLoop: 5805 // fallthrough as a special case of OptForSize 5806 case CM_ScalarEpilogueNotAllowedOptSize: 5807 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize) 5808 LLVM_DEBUG( 5809 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n"); 5810 else 5811 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip " 5812 << "count.\n"); 5813 5814 // Bail if runtime checks are required, which are not good when optimising 5815 // for size. 5816 if (runtimeChecksRequired()) 5817 return FixedScalableVFPair::getNone(); 5818 5819 break; 5820 } 5821 5822 // The only loops we can vectorize without a scalar epilogue, are loops with 5823 // a bottom-test and a single exiting block. We'd have to handle the fact 5824 // that not every instruction executes on the last iteration. This will 5825 // require a lane mask which varies through the vector loop body. 
(TODO)
5826   if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
5827     // If there was a tail-folding hint/switch, but we can't fold the tail by
5828     // masking, fall back to a vectorization with a scalar epilogue.
5829     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5830       LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5831                            "scalar epilogue instead.\n");
5832       ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5833       return computeFeasibleMaxVF(TC, UserVF);
5834     }
5835     return FixedScalableVFPair::getNone();
5836   }
5837 
5838   // Now try tail folding.
5839 
5840   // Invalidate interleave groups that require an epilogue if we can't mask
5841   // the interleave-group.
5842   if (!useMaskedInterleavedAccesses(TTI)) {
5843     assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
5844            "No decisions should have been taken at this point");
5845     // Note: There is no need to invalidate any cost modeling decisions here, as
5846     // none were taken so far.
5847     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
5848   }
5849 
5850   FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF);
5851   // Avoid tail folding if the trip count is known to be a multiple of any VF
5852   // we chose.
5853   // FIXME: The condition below pessimises the case for fixed-width vectors,
5854   // when scalable VFs are also candidates for vectorization.
5855   if (MaxFactors.FixedVF.isVector() && !MaxFactors.ScalableVF) {
5856     ElementCount MaxFixedVF = MaxFactors.FixedVF;
5857     assert((UserVF.isNonZero() || isPowerOf2_32(MaxFixedVF.getFixedValue())) &&
5858            "MaxFixedVF must be a power of 2");
5859     unsigned MaxVFtimesIC = UserIC ? MaxFixedVF.getFixedValue() * UserIC
5860                                    : MaxFixedVF.getFixedValue();
5861     ScalarEvolution *SE = PSE.getSE();
5862     const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
5863     const SCEV *ExitCount = SE->getAddExpr(
5864         BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
5865     const SCEV *Rem = SE->getURemExpr(
5866         SE->applyLoopGuards(ExitCount, TheLoop),
5867         SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
5868     if (Rem->isZero()) {
5869       // Accept MaxFixedVF if we do not have a tail.
5870       LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
5871       return MaxFactors;
5872     }
5873   }
5874 
5875   // For scalable vectors, don't use tail folding as this is currently not yet
5876   // supported. The code is likely to have ended up here if the trip count is
5877   // low, in which case it makes sense not to use scalable vectors.
5878   if (MaxFactors.ScalableVF.isVector())
5879     MaxFactors.ScalableVF = ElementCount::getScalable(0);
5880 
5881   // If we don't know the precise trip count, or if the trip count that we
5882   // found modulo the vectorization factor is not zero, try to fold the tail
5883   // by masking.
5884   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
5885   if (Legal->prepareToFoldTailByMasking()) {
5886     FoldTailByMasking = true;
5887     return MaxFactors;
5888   }
5889 
5890   // If there was a tail-folding hint/switch, but we can't fold the tail by
5891   // masking, fall back to a vectorization with a scalar epilogue.
5892 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { 5893 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " 5894 "scalar epilogue instead.\n"); 5895 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 5896 return MaxFactors; 5897 } 5898 5899 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) { 5900 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n"); 5901 return FixedScalableVFPair::getNone(); 5902 } 5903 5904 if (TC == 0) { 5905 reportVectorizationFailure( 5906 "Unable to calculate the loop count due to complex control flow", 5907 "unable to calculate the loop count due to complex control flow", 5908 "UnknownLoopCountComplexCFG", ORE, TheLoop); 5909 return FixedScalableVFPair::getNone(); 5910 } 5911 5912 reportVectorizationFailure( 5913 "Cannot optimize for size and vectorize at the same time.", 5914 "cannot optimize for size and vectorize at the same time. " 5915 "Enable vectorization of this loop with '#pragma clang loop " 5916 "vectorize(enable)' when compiling with -Os/-Oz", 5917 "NoTailLoopWithOptForSize", ORE, TheLoop); 5918 return FixedScalableVFPair::getNone(); 5919 } 5920 5921 ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( 5922 unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType, 5923 const ElementCount &MaxSafeVF) { 5924 bool ComputeScalableMaxVF = MaxSafeVF.isScalable(); 5925 TypeSize WidestRegister = TTI.getRegisterBitWidth( 5926 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector 5927 : TargetTransformInfo::RGK_FixedWidthVector); 5928 5929 // Convenience function to return the minimum of two ElementCounts. 5930 auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) { 5931 assert((LHS.isScalable() == RHS.isScalable()) && 5932 "Scalable flags must match"); 5933 return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS; 5934 }; 5935 5936 // Ensure MaxVF is a power of 2; the dependence distance bound may not be. 5937 // Note that both WidestRegister and WidestType may not be a powers of 2. 5938 auto MaxVectorElementCount = ElementCount::get( 5939 PowerOf2Floor(WidestRegister.getKnownMinSize() / WidestType), 5940 ComputeScalableMaxVF); 5941 MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF); 5942 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " 5943 << (MaxVectorElementCount * WidestType) << " bits.\n"); 5944 5945 if (!MaxVectorElementCount) { 5946 LLVM_DEBUG(dbgs() << "LV: The target has no " 5947 << (ComputeScalableMaxVF ? "scalable" : "fixed") 5948 << " vector registers.\n"); 5949 return ElementCount::getFixed(1); 5950 } 5951 5952 const auto TripCountEC = ElementCount::getFixed(ConstTripCount); 5953 if (ConstTripCount && 5954 ElementCount::isKnownLE(TripCountEC, MaxVectorElementCount) && 5955 isPowerOf2_32(ConstTripCount)) { 5956 // We need to clamp the VF to be the ConstTripCount. There is no point in 5957 // choosing a higher viable VF as done in the loop below. If 5958 // MaxVectorElementCount is scalable, we only fall back on a fixed VF when 5959 // the TC is less than or equal to the known number of lanes. 
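  // Illustrative example (values are made up, not taken from any particular
  // target): with ConstTripCount == 8 and a fixed-width MaxVectorElementCount
  // of 16, the clamp below returns VF 8, so a single vector iteration covers
  // the whole trip count without leaving dead lanes.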
5960 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: " 5961 << ConstTripCount << "\n"); 5962 return TripCountEC; 5963 } 5964 5965 ElementCount MaxVF = MaxVectorElementCount; 5966 if (TTI.shouldMaximizeVectorBandwidth() || 5967 (MaximizeBandwidth && isScalarEpilogueAllowed())) { 5968 auto MaxVectorElementCountMaxBW = ElementCount::get( 5969 PowerOf2Floor(WidestRegister.getKnownMinSize() / SmallestType), 5970 ComputeScalableMaxVF); 5971 MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF); 5972 5973 // Collect all viable vectorization factors larger than the default MaxVF 5974 // (i.e. MaxVectorElementCount). 5975 SmallVector<ElementCount, 8> VFs; 5976 for (ElementCount VS = MaxVectorElementCount * 2; 5977 ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2) 5978 VFs.push_back(VS); 5979 5980 // For each VF calculate its register usage. 5981 auto RUs = calculateRegisterUsage(VFs); 5982 5983 // Select the largest VF which doesn't require more registers than existing 5984 // ones. 5985 for (int i = RUs.size() - 1; i >= 0; --i) { 5986 bool Selected = true; 5987 for (auto &pair : RUs[i].MaxLocalUsers) { 5988 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5989 if (pair.second > TargetNumRegisters) 5990 Selected = false; 5991 } 5992 if (Selected) { 5993 MaxVF = VFs[i]; 5994 break; 5995 } 5996 } 5997 if (ElementCount MinVF = 5998 TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) { 5999 if (ElementCount::isKnownLT(MaxVF, MinVF)) { 6000 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF 6001 << ") with target's minimum: " << MinVF << '\n'); 6002 MaxVF = MinVF; 6003 } 6004 } 6005 } 6006 return MaxVF; 6007 } 6008 6009 bool LoopVectorizationCostModel::isMoreProfitable( 6010 const VectorizationFactor &A, const VectorizationFactor &B) const { 6011 InstructionCost CostA = A.Cost; 6012 InstructionCost CostB = B.Cost; 6013 6014 unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop); 6015 6016 if (!A.Width.isScalable() && !B.Width.isScalable() && FoldTailByMasking && 6017 MaxTripCount) { 6018 // If we are folding the tail and the trip count is a known (possibly small) 6019 // constant, the trip count will be rounded up to an integer number of 6020 // iterations. The total cost will be PerIterationCost*ceil(TripCount/VF), 6021 // which we compare directly. When not folding the tail, the total cost will 6022 // be PerIterationCost*floor(TC/VF) + Scalar remainder cost, and so is 6023 // approximated with the per-lane cost below instead of using the tripcount 6024 // as here. 6025 auto RTCostA = CostA * divideCeil(MaxTripCount, A.Width.getFixedValue()); 6026 auto RTCostB = CostB * divideCeil(MaxTripCount, B.Width.getFixedValue()); 6027 return RTCostA < RTCostB; 6028 } 6029 6030 // Improve estimate for the vector width if it is scalable. 6031 unsigned EstimatedWidthA = A.Width.getKnownMinValue(); 6032 unsigned EstimatedWidthB = B.Width.getKnownMinValue(); 6033 if (Optional<unsigned> VScale = TTI.getVScaleForTuning()) { 6034 if (A.Width.isScalable()) 6035 EstimatedWidthA *= VScale.getValue(); 6036 if (B.Width.isScalable()) 6037 EstimatedWidthB *= VScale.getValue(); 6038 } 6039 6040 // When set to preferred, for now assume vscale may be larger than 1 (or the 6041 // one being tuned for), so that scalable vectorization is slightly favorable 6042 // over fixed-width vectorization. 
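  // Rough illustration with made-up numbers: comparing A = {vscale x 4,
  // cost 8} against B = {8, cost 16}, and assuming a tuning vscale of 1,
  // gives 8 * 8 <= 16 * 4, i.e. 64 <= 64, so the scalable factor is treated
  // as more profitable even though the per-lane costs tie; the "<=" below is
  // what breaks such ties in favour of scalable vectorization.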
6043 if (Hints->isScalableVectorizationPreferred()) 6044 if (A.Width.isScalable() && !B.Width.isScalable()) 6045 return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA); 6046 6047 // To avoid the need for FP division: 6048 // (CostA / A.Width) < (CostB / B.Width) 6049 // <=> (CostA * B.Width) < (CostB * A.Width) 6050 return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA); 6051 } 6052 6053 VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor( 6054 const ElementCountSet &VFCandidates) { 6055 InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first; 6056 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n"); 6057 assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop"); 6058 assert(VFCandidates.count(ElementCount::getFixed(1)) && 6059 "Expected Scalar VF to be a candidate"); 6060 6061 const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost); 6062 VectorizationFactor ChosenFactor = ScalarCost; 6063 6064 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; 6065 if (ForceVectorization && VFCandidates.size() > 1) { 6066 // Ignore scalar width, because the user explicitly wants vectorization. 6067 // Initialize cost to max so that VF = 2 is, at least, chosen during cost 6068 // evaluation. 6069 ChosenFactor.Cost = InstructionCost::getMax(); 6070 } 6071 6072 SmallVector<InstructionVFPair> InvalidCosts; 6073 for (const auto &i : VFCandidates) { 6074 // The cost for scalar VF=1 is already calculated, so ignore it. 6075 if (i.isScalar()) 6076 continue; 6077 6078 VectorizationCostTy C = expectedCost(i, &InvalidCosts); 6079 VectorizationFactor Candidate(i, C.first); 6080 6081 #ifndef NDEBUG 6082 unsigned AssumedMinimumVscale = 1; 6083 if (Optional<unsigned> VScale = TTI.getVScaleForTuning()) 6084 AssumedMinimumVscale = VScale.getValue(); 6085 unsigned Width = 6086 Candidate.Width.isScalable() 6087 ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale 6088 : Candidate.Width.getFixedValue(); 6089 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i 6090 << " costs: " << (Candidate.Cost / Width)); 6091 if (i.isScalable()) 6092 LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of " 6093 << AssumedMinimumVscale << ")"); 6094 LLVM_DEBUG(dbgs() << ".\n"); 6095 #endif 6096 6097 if (!C.second && !ForceVectorization) { 6098 LLVM_DEBUG( 6099 dbgs() << "LV: Not considering vector loop of width " << i 6100 << " because it will not generate any vector instructions.\n"); 6101 continue; 6102 } 6103 6104 // If profitable add it to ProfitableVF list. 6105 if (isMoreProfitable(Candidate, ScalarCost)) 6106 ProfitableVFs.push_back(Candidate); 6107 6108 if (isMoreProfitable(Candidate, ChosenFactor)) 6109 ChosenFactor = Candidate; 6110 } 6111 6112 // Emit a report of VFs with invalid costs in the loop. 6113 if (!InvalidCosts.empty()) { 6114 // Group the remarks per instruction, keeping the instruction order from 6115 // InvalidCosts. 6116 std::map<Instruction *, unsigned> Numbering; 6117 unsigned I = 0; 6118 for (auto &Pair : InvalidCosts) 6119 if (!Numbering.count(Pair.first)) 6120 Numbering[Pair.first] = I++; 6121 6122 // Sort the list, first on instruction(number) then on VF. 
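  // For example (with placeholder instructions), the list
  //   [(%load, VF=8), (%store, VF=4), (%load, VF=4)]
  // sorts to
  //   [(%load, VF=4), (%load, VF=8), (%store, VF=4)]:
  // %load keeps its earlier number, and the VFs are ordered ascending within
  // each instruction. The grouping loop below relies on this order.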
6123 llvm::sort(InvalidCosts, 6124 [&Numbering](InstructionVFPair &A, InstructionVFPair &B) { 6125 if (Numbering[A.first] != Numbering[B.first]) 6126 return Numbering[A.first] < Numbering[B.first]; 6127 ElementCountComparator ECC; 6128 return ECC(A.second, B.second); 6129 }); 6130 6131 // For a list of ordered instruction-vf pairs: 6132 // [(load, vf1), (load, vf2), (store, vf1)] 6133 // Group the instructions together to emit separate remarks for: 6134 // load (vf1, vf2) 6135 // store (vf1) 6136 auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts); 6137 auto Subset = ArrayRef<InstructionVFPair>(); 6138 do { 6139 if (Subset.empty()) 6140 Subset = Tail.take_front(1); 6141 6142 Instruction *I = Subset.front().first; 6143 6144 // If the next instruction is different, or if there are no other pairs, 6145 // emit a remark for the collated subset. e.g. 6146 // [(load, vf1), (load, vf2))] 6147 // to emit: 6148 // remark: invalid costs for 'load' at VF=(vf, vf2) 6149 if (Subset == Tail || Tail[Subset.size()].first != I) { 6150 std::string OutString; 6151 raw_string_ostream OS(OutString); 6152 assert(!Subset.empty() && "Unexpected empty range"); 6153 OS << "Instruction with invalid costs prevented vectorization at VF=("; 6154 for (auto &Pair : Subset) 6155 OS << (Pair.second == Subset.front().second ? "" : ", ") 6156 << Pair.second; 6157 OS << "):"; 6158 if (auto *CI = dyn_cast<CallInst>(I)) 6159 OS << " call to " << CI->getCalledFunction()->getName(); 6160 else 6161 OS << " " << I->getOpcodeName(); 6162 OS.flush(); 6163 reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I); 6164 Tail = Tail.drop_front(Subset.size()); 6165 Subset = {}; 6166 } else 6167 // Grow the subset by one element 6168 Subset = Tail.take_front(Subset.size() + 1); 6169 } while (!Tail.empty()); 6170 } 6171 6172 if (!EnableCondStoresVectorization && NumPredStores) { 6173 reportVectorizationFailure("There are conditional stores.", 6174 "store that is conditionally executed prevents vectorization", 6175 "ConditionalStore", ORE, TheLoop); 6176 ChosenFactor = ScalarCost; 6177 } 6178 6179 LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() && 6180 ChosenFactor.Cost >= ScalarCost.Cost) dbgs() 6181 << "LV: Vectorization seems to be not beneficial, " 6182 << "but was forced by a user.\n"); 6183 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n"); 6184 return ChosenFactor; 6185 } 6186 6187 bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization( 6188 const Loop &L, ElementCount VF) const { 6189 // Cross iteration phis such as reductions need special handling and are 6190 // currently unsupported. 6191 if (any_of(L.getHeader()->phis(), [&](PHINode &Phi) { 6192 return Legal->isFirstOrderRecurrence(&Phi) || 6193 Legal->isReductionVariable(&Phi); 6194 })) 6195 return false; 6196 6197 // Phis with uses outside of the loop require special handling and are 6198 // currently unsupported. 6199 for (auto &Entry : Legal->getInductionVars()) { 6200 // Look for uses of the value of the induction at the last iteration. 6201 Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch()); 6202 for (User *U : PostInc->users()) 6203 if (!L.contains(cast<Instruction>(U))) 6204 return false; 6205 // Look for uses of penultimate value of the induction. 6206 for (User *U : Entry.first->users()) 6207 if (!L.contains(cast<Instruction>(U))) 6208 return false; 6209 } 6210 6211 // Induction variables that are widened require special handling that is 6212 // currently not supported. 
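  // For example, an induction whose value feeds widened arithmetic would be
  // materialized as a vector, so the check below rejects the loop; an
  // induction used only for its own update and for address computation
  // remains scalar after vectorization and is acceptable.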
6213   if (any_of(Legal->getInductionVars(), [&](auto &Entry) {
6214         return !(this->isScalarAfterVectorization(Entry.first, VF) ||
6215                  this->isProfitableToScalarize(Entry.first, VF));
6216       }))
6217     return false;
6218 
6219   // Epilogue vectorization code has not been audited to ensure it handles
6220   // non-latch exits properly. It may be fine, but it needs to be audited and
6221   // tested.
6222   if (L.getExitingBlock() != L.getLoopLatch())
6223     return false;
6224 
6225   return true;
6226 }
6227 
6228 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
6229     const ElementCount VF) const {
6230   // FIXME: We need a much better cost-model to take different parameters such
6231   // as register pressure, code size increase and cost of extra branches into
6232   // account. For now we apply a very crude heuristic and only consider loops
6233   // with vectorization factors larger than a certain value.
6234   // We also consider epilogue vectorization unprofitable for targets that don't
6235   // consider interleaving beneficial (e.g. MVE).
6236   if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1)
6237     return false;
6238   if (VF.getFixedValue() >= EpilogueVectorizationMinVF)
6239     return true;
6240   return false;
6241 }
6242 
6243 VectorizationFactor
6244 LoopVectorizationCostModel::selectEpilogueVectorizationFactor(
6245     const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) {
6246   VectorizationFactor Result = VectorizationFactor::Disabled();
6247   if (!EnableEpilogueVectorization) {
6248     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";);
6249     return Result;
6250   }
6251 
6252   if (!isScalarEpilogueAllowed()) {
6253     LLVM_DEBUG(
6254         dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is "
6255                   "allowed.\n";);
6256     return Result;
6257   }
6258 
6259   // Not really a cost consideration, but check for unsupported cases here to
6260   // simplify the logic.
6261   if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) {
6262     LLVM_DEBUG(
6263         dbgs() << "LEV: Unable to vectorize epilogue because the loop is "
6264                   "not a supported candidate.\n";);
6265     return Result;
6266   }
6267 
6268   if (EpilogueVectorizationForceVF > 1) {
6269     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";);
6270     ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF);
6271     if (LVP.hasPlanWithVF(ForcedEC))
6272       return {ForcedEC, 0};
6273     else {
6274       LLVM_DEBUG(
6275           dbgs()
6276           << "LEV: Epilogue vectorization forced factor is not viable.\n";);
6277       return Result;
6278     }
6279   }
6280 
6281   if (TheLoop->getHeader()->getParent()->hasOptSize() ||
6282       TheLoop->getHeader()->getParent()->hasMinSize()) {
6283     LLVM_DEBUG(
6284         dbgs()
6285         << "LEV: Epilogue vectorization skipped due to opt for size.\n";);
6286     return Result;
6287   }
6288 
6289   auto FixedMainLoopVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue());
6290   if (MainLoopVF.isScalable())
6291     LLVM_DEBUG(
6292         dbgs() << "LEV: Epilogue vectorization using scalable vectors not "
6293                   "yet supported. 
Converting to fixed-width (VF=" 6294 << FixedMainLoopVF << ") instead\n"); 6295 6296 if (!isEpilogueVectorizationProfitable(FixedMainLoopVF)) { 6297 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for " 6298 "this loop\n"); 6299 return Result; 6300 } 6301 6302 for (auto &NextVF : ProfitableVFs) 6303 if (ElementCount::isKnownLT(NextVF.Width, FixedMainLoopVF) && 6304 (Result.Width.getFixedValue() == 1 || 6305 isMoreProfitable(NextVF, Result)) && 6306 LVP.hasPlanWithVF(NextVF.Width)) 6307 Result = NextVF; 6308 6309 if (Result != VectorizationFactor::Disabled()) 6310 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = " 6311 << Result.Width.getFixedValue() << "\n";); 6312 return Result; 6313 } 6314 6315 std::pair<unsigned, unsigned> 6316 LoopVectorizationCostModel::getSmallestAndWidestTypes() { 6317 unsigned MinWidth = -1U; 6318 unsigned MaxWidth = 8; 6319 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 6320 for (Type *T : ElementTypesInLoop) { 6321 MinWidth = std::min<unsigned>( 6322 MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize()); 6323 MaxWidth = std::max<unsigned>( 6324 MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize()); 6325 } 6326 return {MinWidth, MaxWidth}; 6327 } 6328 6329 void LoopVectorizationCostModel::collectElementTypesForWidening() { 6330 ElementTypesInLoop.clear(); 6331 // For each block. 6332 for (BasicBlock *BB : TheLoop->blocks()) { 6333 // For each instruction in the loop. 6334 for (Instruction &I : BB->instructionsWithoutDebug()) { 6335 Type *T = I.getType(); 6336 6337 // Skip ignored values. 6338 if (ValuesToIgnore.count(&I)) 6339 continue; 6340 6341 // Only examine Loads, Stores and PHINodes. 6342 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I)) 6343 continue; 6344 6345 // Examine PHI nodes that are reduction variables. Update the type to 6346 // account for the recurrence type. 6347 if (auto *PN = dyn_cast<PHINode>(&I)) { 6348 if (!Legal->isReductionVariable(PN)) 6349 continue; 6350 const RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[PN]; 6351 if (PreferInLoopReductions || useOrderedReductions(RdxDesc) || 6352 TTI.preferInLoopReduction(RdxDesc.getOpcode(), 6353 RdxDesc.getRecurrenceType(), 6354 TargetTransformInfo::ReductionFlags())) 6355 continue; 6356 T = RdxDesc.getRecurrenceType(); 6357 } 6358 6359 // Examine the stored values. 6360 if (auto *ST = dyn_cast<StoreInst>(&I)) 6361 T = ST->getValueOperand()->getType(); 6362 6363 // Ignore loaded pointer types and stored pointer types that are not 6364 // vectorizable. 6365 // 6366 // FIXME: The check here attempts to predict whether a load or store will 6367 // be vectorized. We only know this for certain after a VF has 6368 // been selected. Here, we assume that if an access can be 6369 // vectorized, it will be. We should also look at extending this 6370 // optimization to non-pointer types. 6371 // 6372 if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) && 6373 !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I)) 6374 continue; 6375 6376 ElementTypesInLoop.insert(T); 6377 } 6378 } 6379 } 6380 6381 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, 6382 unsigned LoopCost) { 6383 // -- The interleave heuristics -- 6384 // We interleave the loop in order to expose ILP and reduce the loop overhead. 6385 // There are many micro-architectural considerations that we can't predict 6386 // at this level. 
For example, frontend pressure (on decode or fetch) due to 6387 // code size, or the number and capabilities of the execution ports. 6388 // 6389 // We use the following heuristics to select the interleave count: 6390 // 1. If the code has reductions, then we interleave to break the cross 6391 // iteration dependency. 6392 // 2. If the loop is really small, then we interleave to reduce the loop 6393 // overhead. 6394 // 3. We don't interleave if we think that we will spill registers to memory 6395 // due to the increased register pressure. 6396 6397 if (!isScalarEpilogueAllowed()) 6398 return 1; 6399 6400 // We used the distance for the interleave count. 6401 if (Legal->getMaxSafeDepDistBytes() != -1U) 6402 return 1; 6403 6404 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); 6405 const bool HasReductions = !Legal->getReductionVars().empty(); 6406 // Do not interleave loops with a relatively small known or estimated trip 6407 // count. But we will interleave when InterleaveSmallLoopScalarReduction is 6408 // enabled, and the code has scalar reductions(HasReductions && VF = 1), 6409 // because with the above conditions interleaving can expose ILP and break 6410 // cross iteration dependences for reductions. 6411 if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) && 6412 !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar())) 6413 return 1; 6414 6415 RegisterUsage R = calculateRegisterUsage({VF})[0]; 6416 // We divide by these constants so assume that we have at least one 6417 // instruction that uses at least one register. 6418 for (auto& pair : R.MaxLocalUsers) { 6419 pair.second = std::max(pair.second, 1U); 6420 } 6421 6422 // We calculate the interleave count using the following formula. 6423 // Subtract the number of loop invariants from the number of available 6424 // registers. These registers are used by all of the interleaved instances. 6425 // Next, divide the remaining registers by the number of registers that is 6426 // required by the loop, in order to estimate how many parallel instances 6427 // fit without causing spills. All of this is rounded down if necessary to be 6428 // a power of two. We want power of two interleave count to simplify any 6429 // addressing operations or alignment considerations. 6430 // We also want power of two interleave counts to ensure that the induction 6431 // variable of the vector loop wraps to zero, when tail is folded by masking; 6432 // this currently happens when OptForSize, in which case IC is set to 1 above. 6433 unsigned IC = UINT_MAX; 6434 6435 for (auto& pair : R.MaxLocalUsers) { 6436 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 6437 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters 6438 << " registers of " 6439 << TTI.getRegisterClassName(pair.first) << " register class\n"); 6440 if (VF.isScalar()) { 6441 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) 6442 TargetNumRegisters = ForceTargetNumScalarRegs; 6443 } else { 6444 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) 6445 TargetNumRegisters = ForceTargetNumVectorRegs; 6446 } 6447 unsigned MaxLocalUsers = pair.second; 6448 unsigned LoopInvariantRegs = 0; 6449 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) 6450 LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; 6451 6452 unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers); 6453 // Don't count the induction variable as interleaved. 
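    // Rough worked example (all counts are illustrative): with 8 registers in
    // a class, 1 loop-invariant value and 2 values live at the peak, the
    // formula above gives PowerOf2Floor((8 - 1) / 2) = 2, while the adjusted
    // formula below, which stops counting the induction variable, gives
    // PowerOf2Floor((8 - 1 - 1) / max(1, 2 - 1)) = 4.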
6454 if (EnableIndVarRegisterHeur) { 6455 TmpIC = 6456 PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) / 6457 std::max(1U, (MaxLocalUsers - 1))); 6458 } 6459 6460 IC = std::min(IC, TmpIC); 6461 } 6462 6463 // Clamp the interleave ranges to reasonable counts. 6464 unsigned MaxInterleaveCount = 6465 TTI.getMaxInterleaveFactor(VF.getKnownMinValue()); 6466 6467 // Check if the user has overridden the max. 6468 if (VF.isScalar()) { 6469 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) 6470 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; 6471 } else { 6472 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) 6473 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; 6474 } 6475 6476 // If trip count is known or estimated compile time constant, limit the 6477 // interleave count to be less than the trip count divided by VF, provided it 6478 // is at least 1. 6479 // 6480 // For scalable vectors we can't know if interleaving is beneficial. It may 6481 // not be beneficial for small loops if none of the lanes in the second vector 6482 // iterations is enabled. However, for larger loops, there is likely to be a 6483 // similar benefit as for fixed-width vectors. For now, we choose to leave 6484 // the InterleaveCount as if vscale is '1', although if some information about 6485 // the vector is known (e.g. min vector size), we can make a better decision. 6486 if (BestKnownTC) { 6487 MaxInterleaveCount = 6488 std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount); 6489 // Make sure MaxInterleaveCount is greater than 0. 6490 MaxInterleaveCount = std::max(1u, MaxInterleaveCount); 6491 } 6492 6493 assert(MaxInterleaveCount > 0 && 6494 "Maximum interleave count must be greater than 0"); 6495 6496 // Clamp the calculated IC to be between the 1 and the max interleave count 6497 // that the target and trip count allows. 6498 if (IC > MaxInterleaveCount) 6499 IC = MaxInterleaveCount; 6500 else 6501 // Make sure IC is greater than 0. 6502 IC = std::max(1u, IC); 6503 6504 assert(IC > 0 && "Interleave count must be greater than 0."); 6505 6506 // If we did not calculate the cost for VF (because the user selected the VF) 6507 // then we calculate the cost of VF here. 6508 if (LoopCost == 0) { 6509 InstructionCost C = expectedCost(VF).first; 6510 assert(C.isValid() && "Expected to have chosen a VF with valid cost"); 6511 LoopCost = *C.getValue(); 6512 } 6513 6514 assert(LoopCost && "Non-zero loop cost expected"); 6515 6516 // Interleave if we vectorized this loop and there is a reduction that could 6517 // benefit from interleaving. 6518 if (VF.isVector() && HasReductions) { 6519 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); 6520 return IC; 6521 } 6522 6523 // Note that if we've already vectorized the loop we will have done the 6524 // runtime check and so interleaving won't require further checks. 6525 bool InterleavingRequiresRuntimePointerCheck = 6526 (VF.isScalar() && Legal->getRuntimePointerChecking()->Need); 6527 6528 // We want to interleave small loops in order to reduce the loop overhead and 6529 // potentially expose ILP opportunities. 
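  // As a rough illustration of the heuristic below (numbers are made up): if
  // SmallLoopCost were 20 and LoopCost were 3, then
  // SmallIC = min(IC, PowerOf2Floor(20 / 3)) = min(IC, 4), i.e. we keep
  // interleaving until the loop overhead is a small fraction of the loop cost.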
6530 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n' 6531 << "LV: IC is " << IC << '\n' 6532 << "LV: VF is " << VF << '\n'); 6533 const bool AggressivelyInterleaveReductions = 6534 TTI.enableAggressiveInterleaving(HasReductions); 6535 if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) { 6536 // We assume that the cost overhead is 1 and we use the cost model 6537 // to estimate the cost of the loop and interleave until the cost of the 6538 // loop overhead is about 5% of the cost of the loop. 6539 unsigned SmallIC = 6540 std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost)); 6541 6542 // Interleave until store/load ports (estimated by max interleave count) are 6543 // saturated. 6544 unsigned NumStores = Legal->getNumStores(); 6545 unsigned NumLoads = Legal->getNumLoads(); 6546 unsigned StoresIC = IC / (NumStores ? NumStores : 1); 6547 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1); 6548 6549 // There is little point in interleaving for reductions containing selects 6550 // and compares when VF=1 since it may just create more overhead than it's 6551 // worth for loops with small trip counts. This is because we still have to 6552 // do the final reduction after the loop. 6553 bool HasSelectCmpReductions = 6554 HasReductions && 6555 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 6556 const RecurrenceDescriptor &RdxDesc = Reduction.second; 6557 return RecurrenceDescriptor::isSelectCmpRecurrenceKind( 6558 RdxDesc.getRecurrenceKind()); 6559 }); 6560 if (HasSelectCmpReductions) { 6561 LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n"); 6562 return 1; 6563 } 6564 6565 // If we have a scalar reduction (vector reductions are already dealt with 6566 // by this point), we can increase the critical path length if the loop 6567 // we're interleaving is inside another loop. For tree-wise reductions 6568 // set the limit to 2, and for ordered reductions it's best to disable 6569 // interleaving entirely. 6570 if (HasReductions && TheLoop->getLoopDepth() > 1) { 6571 bool HasOrderedReductions = 6572 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 6573 const RecurrenceDescriptor &RdxDesc = Reduction.second; 6574 return RdxDesc.isOrdered(); 6575 }); 6576 if (HasOrderedReductions) { 6577 LLVM_DEBUG( 6578 dbgs() << "LV: Not interleaving scalar ordered reductions.\n"); 6579 return 1; 6580 } 6581 6582 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC); 6583 SmallIC = std::min(SmallIC, F); 6584 StoresIC = std::min(StoresIC, F); 6585 LoadsIC = std::min(LoadsIC, F); 6586 } 6587 6588 if (EnableLoadStoreRuntimeInterleave && 6589 std::max(StoresIC, LoadsIC) > SmallIC) { 6590 LLVM_DEBUG( 6591 dbgs() << "LV: Interleaving to saturate store or load ports.\n"); 6592 return std::max(StoresIC, LoadsIC); 6593 } 6594 6595 // If there are scalar reductions and TTI has enabled aggressive 6596 // interleaving for reductions, we will interleave to expose ILP. 6597 if (InterleaveSmallLoopScalarReduction && VF.isScalar() && 6598 AggressivelyInterleaveReductions) { 6599 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 6600 // Interleave no less than SmallIC but not as aggressive as the normal IC 6601 // to satisfy the rare situation when resources are too limited. 
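      // E.g. (illustrative values) IC = 8 and SmallIC = 2 yield
      // std::max(8 / 2, 2) = 4: more than the small-loop count, but only half
      // of what register pressure alone would allow.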
6602 return std::max(IC / 2, SmallIC); 6603 } else { 6604 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n"); 6605 return SmallIC; 6606 } 6607 } 6608 6609 // Interleave if this is a large loop (small loops are already dealt with by 6610 // this point) that could benefit from interleaving. 6611 if (AggressivelyInterleaveReductions) { 6612 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 6613 return IC; 6614 } 6615 6616 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n"); 6617 return 1; 6618 } 6619 6620 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8> 6621 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) { 6622 // This function calculates the register usage by measuring the highest number 6623 // of values that are alive at a single location. Obviously, this is a very 6624 // rough estimation. We scan the loop in a topological order in order and 6625 // assign a number to each instruction. We use RPO to ensure that defs are 6626 // met before their users. We assume that each instruction that has in-loop 6627 // users starts an interval. We record every time that an in-loop value is 6628 // used, so we have a list of the first and last occurrences of each 6629 // instruction. Next, we transpose this data structure into a multi map that 6630 // holds the list of intervals that *end* at a specific location. This multi 6631 // map allows us to perform a linear search. We scan the instructions linearly 6632 // and record each time that a new interval starts, by placing it in a set. 6633 // If we find this value in the multi-map then we remove it from the set. 6634 // The max register usage is the maximum size of the set. 6635 // We also search for instructions that are defined outside the loop, but are 6636 // used inside the loop. We need this number separately from the max-interval 6637 // usage number because when we unroll, loop-invariant values do not take 6638 // more register. 6639 LoopBlocksDFS DFS(TheLoop); 6640 DFS.perform(LI); 6641 6642 RegisterUsage RU; 6643 6644 // Each 'key' in the map opens a new interval. The values 6645 // of the map are the index of the 'last seen' usage of the 6646 // instruction that is the key. 6647 using IntervalMap = DenseMap<Instruction *, unsigned>; 6648 6649 // Maps instruction to its index. 6650 SmallVector<Instruction *, 64> IdxToInstr; 6651 // Marks the end of each interval. 6652 IntervalMap EndPoint; 6653 // Saves the list of instruction indices that are used in the loop. 6654 SmallPtrSet<Instruction *, 8> Ends; 6655 // Saves the list of values that are used in the loop but are 6656 // defined outside the loop, such as arguments and constants. 6657 SmallPtrSet<Value *, 8> LoopInvariants; 6658 6659 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 6660 for (Instruction &I : BB->instructionsWithoutDebug()) { 6661 IdxToInstr.push_back(&I); 6662 6663 // Save the end location of each USE. 6664 for (Value *U : I.operands()) { 6665 auto *Instr = dyn_cast<Instruction>(U); 6666 6667 // Ignore non-instruction values such as arguments, constants, etc. 6668 if (!Instr) 6669 continue; 6670 6671 // If this instruction is outside the loop then record it and continue. 6672 if (!TheLoop->contains(Instr)) { 6673 LoopInvariants.insert(Instr); 6674 continue; 6675 } 6676 6677 // Overwrite previous end points. 6678 EndPoint[Instr] = IdxToInstr.size(); 6679 Ends.insert(Instr); 6680 } 6681 } 6682 } 6683 6684 // Saves the list of intervals that end with the index in 'key'. 
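  // For example (hypothetical values): if %a and %b are both recorded as
  // ending at index 7, then TransposeEnds[7] == {%a, %b}, and the scan below
  // erases both from OpenIntervals when it reaches that index.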
6685 using InstrList = SmallVector<Instruction *, 2>; 6686 DenseMap<unsigned, InstrList> TransposeEnds; 6687 6688 // Transpose the EndPoints to a list of values that end at each index. 6689 for (auto &Interval : EndPoint) 6690 TransposeEnds[Interval.second].push_back(Interval.first); 6691 6692 SmallPtrSet<Instruction *, 8> OpenIntervals; 6693 SmallVector<RegisterUsage, 8> RUs(VFs.size()); 6694 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size()); 6695 6696 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); 6697 6698 // A lambda that gets the register usage for the given type and VF. 6699 const auto &TTICapture = TTI; 6700 auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned { 6701 if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty)) 6702 return 0; 6703 InstructionCost::CostType RegUsage = 6704 *TTICapture.getRegUsageForType(VectorType::get(Ty, VF)).getValue(); 6705 assert(RegUsage >= 0 && RegUsage <= std::numeric_limits<unsigned>::max() && 6706 "Nonsensical values for register usage."); 6707 return RegUsage; 6708 }; 6709 6710 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { 6711 Instruction *I = IdxToInstr[i]; 6712 6713 // Remove all of the instructions that end at this location. 6714 InstrList &List = TransposeEnds[i]; 6715 for (Instruction *ToRemove : List) 6716 OpenIntervals.erase(ToRemove); 6717 6718 // Ignore instructions that are never used within the loop. 6719 if (!Ends.count(I)) 6720 continue; 6721 6722 // Skip ignored values. 6723 if (ValuesToIgnore.count(I)) 6724 continue; 6725 6726 // For each VF find the maximum usage of registers. 6727 for (unsigned j = 0, e = VFs.size(); j < e; ++j) { 6728 // Count the number of live intervals. 6729 SmallMapVector<unsigned, unsigned, 4> RegUsage; 6730 6731 if (VFs[j].isScalar()) { 6732 for (auto Inst : OpenIntervals) { 6733 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6734 if (RegUsage.find(ClassID) == RegUsage.end()) 6735 RegUsage[ClassID] = 1; 6736 else 6737 RegUsage[ClassID] += 1; 6738 } 6739 } else { 6740 collectUniformsAndScalars(VFs[j]); 6741 for (auto Inst : OpenIntervals) { 6742 // Skip ignored values for VF > 1. 6743 if (VecValuesToIgnore.count(Inst)) 6744 continue; 6745 if (isScalarAfterVectorization(Inst, VFs[j])) { 6746 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6747 if (RegUsage.find(ClassID) == RegUsage.end()) 6748 RegUsage[ClassID] = 1; 6749 else 6750 RegUsage[ClassID] += 1; 6751 } else { 6752 unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType()); 6753 if (RegUsage.find(ClassID) == RegUsage.end()) 6754 RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]); 6755 else 6756 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); 6757 } 6758 } 6759 } 6760 6761 for (auto& pair : RegUsage) { 6762 if (MaxUsages[j].find(pair.first) != MaxUsages[j].end()) 6763 MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second); 6764 else 6765 MaxUsages[j][pair.first] = pair.second; 6766 } 6767 } 6768 6769 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " 6770 << OpenIntervals.size() << '\n'); 6771 6772 // Add the current instruction to the list of open intervals. 6773 OpenIntervals.insert(I); 6774 } 6775 6776 for (unsigned i = 0, e = VFs.size(); i < e; ++i) { 6777 SmallMapVector<unsigned, unsigned, 4> Invariant; 6778 6779 for (auto Inst : LoopInvariants) { 6780 unsigned Usage = 6781 VFs[i].isScalar() ? 
1 : GetRegUsage(Inst->getType(), VFs[i]); 6782 unsigned ClassID = 6783 TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType()); 6784 if (Invariant.find(ClassID) == Invariant.end()) 6785 Invariant[ClassID] = Usage; 6786 else 6787 Invariant[ClassID] += Usage; 6788 } 6789 6790 LLVM_DEBUG({ 6791 dbgs() << "LV(REG): VF = " << VFs[i] << '\n'; 6792 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size() 6793 << " item\n"; 6794 for (const auto &pair : MaxUsages[i]) { 6795 dbgs() << "LV(REG): RegisterClass: " 6796 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6797 << " registers\n"; 6798 } 6799 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size() 6800 << " item\n"; 6801 for (const auto &pair : Invariant) { 6802 dbgs() << "LV(REG): RegisterClass: " 6803 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6804 << " registers\n"; 6805 } 6806 }); 6807 6808 RU.LoopInvariantRegs = Invariant; 6809 RU.MaxLocalUsers = MaxUsages[i]; 6810 RUs[i] = RU; 6811 } 6812 6813 return RUs; 6814 } 6815 6816 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){ 6817 // TODO: Cost model for emulated masked load/store is completely 6818 // broken. This hack guides the cost model to use an artificially 6819 // high enough value to practically disable vectorization with such 6820 // operations, except where previously deployed legality hack allowed 6821 // using very low cost values. This is to avoid regressions coming simply 6822 // from moving "masked load/store" check from legality to cost model. 6823 // Masked Load/Gather emulation was previously never allowed. 6824 // Limited number of Masked Store/Scatter emulation was allowed. 6825 assert(isPredicatedInst(I) && 6826 "Expecting a scalar emulated instruction"); 6827 return isa<LoadInst>(I) || 6828 (isa<StoreInst>(I) && 6829 NumPredStores > NumberOfStoresToPredicate); 6830 } 6831 6832 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) { 6833 // If we aren't vectorizing the loop, or if we've already collected the 6834 // instructions to scalarize, there's nothing to do. Collection may already 6835 // have occurred if we have a user-selected VF and are now computing the 6836 // expected cost for interleaving. 6837 if (VF.isScalar() || VF.isZero() || 6838 InstsToScalarize.find(VF) != InstsToScalarize.end()) 6839 return; 6840 6841 // Initialize a mapping for VF in InstsToScalalarize. If we find that it's 6842 // not profitable to scalarize any instructions, the presence of VF in the 6843 // map will indicate that we've analyzed it already. 6844 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF]; 6845 6846 // Find all the instructions that are scalar with predication in the loop and 6847 // determine if it would be better to not if-convert the blocks they are in. 6848 // If so, we also record the instructions to scalarize. 6849 for (BasicBlock *BB : TheLoop->blocks()) { 6850 if (!blockNeedsPredication(BB)) 6851 continue; 6852 for (Instruction &I : *BB) 6853 if (isScalarWithPredication(&I)) { 6854 ScalarCostsTy ScalarCosts; 6855 // Do not apply discount if scalable, because that would lead to 6856 // invalid scalarization costs. 6857 // Do not apply discount logic if hacked cost is needed 6858 // for emulated masked memrefs. 6859 if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I) && 6860 computePredInstDiscount(&I, ScalarCosts, VF) >= 0) 6861 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end()); 6862 // Remember that BB will remain after vectorization. 
6863 PredicatedBBsAfterVectorization.insert(BB); 6864 } 6865 } 6866 } 6867 6868 int LoopVectorizationCostModel::computePredInstDiscount( 6869 Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) { 6870 assert(!isUniformAfterVectorization(PredInst, VF) && 6871 "Instruction marked uniform-after-vectorization will be predicated"); 6872 6873 // Initialize the discount to zero, meaning that the scalar version and the 6874 // vector version cost the same. 6875 InstructionCost Discount = 0; 6876 6877 // Holds instructions to analyze. The instructions we visit are mapped in 6878 // ScalarCosts. Those instructions are the ones that would be scalarized if 6879 // we find that the scalar version costs less. 6880 SmallVector<Instruction *, 8> Worklist; 6881 6882 // Returns true if the given instruction can be scalarized. 6883 auto canBeScalarized = [&](Instruction *I) -> bool { 6884 // We only attempt to scalarize instructions forming a single-use chain 6885 // from the original predicated block that would otherwise be vectorized. 6886 // Although not strictly necessary, we give up on instructions we know will 6887 // already be scalar to avoid traversing chains that are unlikely to be 6888 // beneficial. 6889 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() || 6890 isScalarAfterVectorization(I, VF)) 6891 return false; 6892 6893 // If the instruction is scalar with predication, it will be analyzed 6894 // separately. We ignore it within the context of PredInst. 6895 if (isScalarWithPredication(I)) 6896 return false; 6897 6898 // If any of the instruction's operands are uniform after vectorization, 6899 // the instruction cannot be scalarized. This prevents, for example, a 6900 // masked load from being scalarized. 6901 // 6902 // We assume we will only emit a value for lane zero of an instruction 6903 // marked uniform after vectorization, rather than VF identical values. 6904 // Thus, if we scalarize an instruction that uses a uniform, we would 6905 // create uses of values corresponding to the lanes we aren't emitting code 6906 // for. This behavior can be changed by allowing getScalarValue to clone 6907 // the lane zero values for uniforms rather than asserting. 6908 for (Use &U : I->operands()) 6909 if (auto *J = dyn_cast<Instruction>(U.get())) 6910 if (isUniformAfterVectorization(J, VF)) 6911 return false; 6912 6913 // Otherwise, we can scalarize the instruction. 6914 return true; 6915 }; 6916 6917 // Compute the expected cost discount from scalarizing the entire expression 6918 // feeding the predicated instruction. We currently only consider expressions 6919 // that are single-use instruction chains. 6920 Worklist.push_back(PredInst); 6921 while (!Worklist.empty()) { 6922 Instruction *I = Worklist.pop_back_val(); 6923 6924 // If we've already analyzed the instruction, there's nothing to do. 6925 if (ScalarCosts.find(I) != ScalarCosts.end()) 6926 continue; 6927 6928 // Compute the cost of the vector instruction. Note that this cost already 6929 // includes the scalarization overhead of the predicated instruction. 6930 InstructionCost VectorCost = getInstructionCost(I, VF).first; 6931 6932 // Compute the cost of the scalarized instruction. This cost is the cost of 6933 // the instruction as if it wasn't if-converted and instead remained in the 6934 // predicated block. We will scale this cost by block probability after 6935 // computing the scalarization overhead. 
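    // As a rough, illustrative example: with VF = 4 and a scalar cost of 1
    // for the instruction, the unscaled scalar cost below starts at 4 * 1 = 4;
    // insert/extract and phi overhead is then added, and the total is divided
    // by the reciprocal block probability further down.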
6936 InstructionCost ScalarCost = 6937 VF.getFixedValue() * 6938 getInstructionCost(I, ElementCount::getFixed(1)).first; 6939 6940 // Compute the scalarization overhead of needed insertelement instructions 6941 // and phi nodes. 6942 if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) { 6943 ScalarCost += TTI.getScalarizationOverhead( 6944 cast<VectorType>(ToVectorTy(I->getType(), VF)), 6945 APInt::getAllOnes(VF.getFixedValue()), true, false); 6946 ScalarCost += 6947 VF.getFixedValue() * 6948 TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput); 6949 } 6950 6951 // Compute the scalarization overhead of needed extractelement 6952 // instructions. For each of the instruction's operands, if the operand can 6953 // be scalarized, add it to the worklist; otherwise, account for the 6954 // overhead. 6955 for (Use &U : I->operands()) 6956 if (auto *J = dyn_cast<Instruction>(U.get())) { 6957 assert(VectorType::isValidElementType(J->getType()) && 6958 "Instruction has non-scalar type"); 6959 if (canBeScalarized(J)) 6960 Worklist.push_back(J); 6961 else if (needsExtract(J, VF)) { 6962 ScalarCost += TTI.getScalarizationOverhead( 6963 cast<VectorType>(ToVectorTy(J->getType(), VF)), 6964 APInt::getAllOnes(VF.getFixedValue()), false, true); 6965 } 6966 } 6967 6968 // Scale the total scalar cost by block probability. 6969 ScalarCost /= getReciprocalPredBlockProb(); 6970 6971 // Compute the discount. A non-negative discount means the vector version 6972 // of the instruction costs more, and scalarizing would be beneficial. 6973 Discount += VectorCost - ScalarCost; 6974 ScalarCosts[I] = ScalarCost; 6975 } 6976 6977 return *Discount.getValue(); 6978 } 6979 6980 LoopVectorizationCostModel::VectorizationCostTy 6981 LoopVectorizationCostModel::expectedCost( 6982 ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) { 6983 VectorizationCostTy Cost; 6984 6985 // For each block. 6986 for (BasicBlock *BB : TheLoop->blocks()) { 6987 VectorizationCostTy BlockCost; 6988 6989 // For each instruction in the old loop. 6990 for (Instruction &I : BB->instructionsWithoutDebug()) { 6991 // Skip ignored values. 6992 if (ValuesToIgnore.count(&I) || 6993 (VF.isVector() && VecValuesToIgnore.count(&I))) 6994 continue; 6995 6996 VectorizationCostTy C = getInstructionCost(&I, VF); 6997 6998 // Check if we should override the cost. 6999 if (C.first.isValid() && 7000 ForceTargetInstructionCost.getNumOccurrences() > 0) 7001 C.first = InstructionCost(ForceTargetInstructionCost); 7002 7003 // Keep a list of instructions with invalid costs. 7004 if (Invalid && !C.first.isValid()) 7005 Invalid->emplace_back(&I, VF); 7006 7007 BlockCost.first += C.first; 7008 BlockCost.second |= C.second; 7009 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first 7010 << " for VF " << VF << " For instruction: " << I 7011 << '\n'); 7012 } 7013 7014 // If we are vectorizing a predicated block, it will have been 7015 // if-converted. This means that the block's instructions (aside from 7016 // stores and instructions that may divide by zero) will now be 7017 // unconditionally executed. For the scalar case, we may not always execute 7018 // the predicated block, if it is an if-else block. Thus, scale the block's 7019 // cost by the probability of executing it. blockNeedsPredication from 7020 // Legal is used so as to not include all blocks in tail folded loops. 
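    // For instance (illustrative numbers): if a predicated block's
    // instructions sum to a scalar cost of 8, and predicated blocks are
    // assumed to execute only half the time, dividing by the reciprocal
    // probability (2 under that assumption) charges the block 4 here.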
7021 if (VF.isScalar() && Legal->blockNeedsPredication(BB)) 7022 BlockCost.first /= getReciprocalPredBlockProb(); 7023 7024 Cost.first += BlockCost.first; 7025 Cost.second |= BlockCost.second; 7026 } 7027 7028 return Cost; 7029 } 7030 7031 /// Gets Address Access SCEV after verifying that the access pattern 7032 /// is loop invariant except the induction variable dependence. 7033 /// 7034 /// This SCEV can be sent to the Target in order to estimate the address 7035 /// calculation cost. 7036 static const SCEV *getAddressAccessSCEV( 7037 Value *Ptr, 7038 LoopVectorizationLegality *Legal, 7039 PredicatedScalarEvolution &PSE, 7040 const Loop *TheLoop) { 7041 7042 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); 7043 if (!Gep) 7044 return nullptr; 7045 7046 // We are looking for a gep with all loop invariant indices except for one 7047 // which should be an induction variable. 7048 auto SE = PSE.getSE(); 7049 unsigned NumOperands = Gep->getNumOperands(); 7050 for (unsigned i = 1; i < NumOperands; ++i) { 7051 Value *Opd = Gep->getOperand(i); 7052 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 7053 !Legal->isInductionVariable(Opd)) 7054 return nullptr; 7055 } 7056 7057 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. 7058 return PSE.getSCEV(Ptr); 7059 } 7060 7061 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { 7062 return Legal->hasStride(I->getOperand(0)) || 7063 Legal->hasStride(I->getOperand(1)); 7064 } 7065 7066 InstructionCost 7067 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, 7068 ElementCount VF) { 7069 assert(VF.isVector() && 7070 "Scalarization cost of instruction implies vectorization."); 7071 if (VF.isScalable()) 7072 return InstructionCost::getInvalid(); 7073 7074 Type *ValTy = getLoadStoreType(I); 7075 auto SE = PSE.getSE(); 7076 7077 unsigned AS = getLoadStoreAddressSpace(I); 7078 Value *Ptr = getLoadStorePointerOperand(I); 7079 Type *PtrTy = ToVectorTy(Ptr->getType(), VF); 7080 7081 // Figure out whether the access is strided and get the stride value 7082 // if it's known in compile time 7083 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); 7084 7085 // Get the cost of the scalar memory instruction and address computation. 7086 InstructionCost Cost = 7087 VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); 7088 7089 // Don't pass *I here, since it is scalar but will actually be part of a 7090 // vectorized loop where the user of it is a vectorized instruction. 7091 const Align Alignment = getLoadStoreAlignment(I); 7092 Cost += VF.getKnownMinValue() * 7093 TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, 7094 AS, TTI::TCK_RecipThroughput); 7095 7096 // Get the overhead of the extractelement and insertelement instructions 7097 // we might create due to scalarization. 7098 Cost += getScalarizationOverhead(I, VF); 7099 7100 // If we have a predicated load/store, it will need extra i1 extracts and 7101 // conditional branches, but may not be executed for each vector lane. Scale 7102 // the cost by the probability of executing the predicated block. 
7103 if (isPredicatedInst(I)) { 7104 Cost /= getReciprocalPredBlockProb(); 7105 7106 // Add the cost of an i1 extract and a branch 7107 auto *Vec_i1Ty = 7108 VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF); 7109 Cost += TTI.getScalarizationOverhead( 7110 Vec_i1Ty, APInt::getAllOnes(VF.getKnownMinValue()), 7111 /*Insert=*/false, /*Extract=*/true); 7112 Cost += TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput); 7113 7114 if (useEmulatedMaskMemRefHack(I)) 7115 // Artificially setting to a high enough value to practically disable 7116 // vectorization with such operations. 7117 Cost = 3000000; 7118 } 7119 7120 return Cost; 7121 } 7122 7123 InstructionCost 7124 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, 7125 ElementCount VF) { 7126 Type *ValTy = getLoadStoreType(I); 7127 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 7128 Value *Ptr = getLoadStorePointerOperand(I); 7129 unsigned AS = getLoadStoreAddressSpace(I); 7130 int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr); 7131 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 7132 7133 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 7134 "Stride should be 1 or -1 for consecutive memory access"); 7135 const Align Alignment = getLoadStoreAlignment(I); 7136 InstructionCost Cost = 0; 7137 if (Legal->isMaskRequired(I)) 7138 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 7139 CostKind); 7140 else 7141 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 7142 CostKind, I); 7143 7144 bool Reverse = ConsecutiveStride < 0; 7145 if (Reverse) 7146 Cost += 7147 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0); 7148 return Cost; 7149 } 7150 7151 InstructionCost 7152 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, 7153 ElementCount VF) { 7154 assert(Legal->isUniformMemOp(*I)); 7155 7156 Type *ValTy = getLoadStoreType(I); 7157 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 7158 const Align Alignment = getLoadStoreAlignment(I); 7159 unsigned AS = getLoadStoreAddressSpace(I); 7160 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 7161 if (isa<LoadInst>(I)) { 7162 return TTI.getAddressComputationCost(ValTy) + 7163 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS, 7164 CostKind) + 7165 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); 7166 } 7167 StoreInst *SI = cast<StoreInst>(I); 7168 7169 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); 7170 return TTI.getAddressComputationCost(ValTy) + 7171 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, 7172 CostKind) + 7173 (isLoopInvariantStoreValue 7174 ? 
                 0
                 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
                                          VF.getKnownMinValue() - 1));
}

InstructionCost
LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
                                                 ElementCount VF) {
  Type *ValTy = getLoadStoreType(I);
  auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
  const Align Alignment = getLoadStoreAlignment(I);
  const Value *Ptr = getLoadStorePointerOperand(I);

  return TTI.getAddressComputationCost(VectorTy) +
         TTI.getGatherScatterOpCost(
             I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
             TargetTransformInfo::TCK_RecipThroughput, I);
}

InstructionCost
LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
                                                   ElementCount VF) {
  // TODO: Once we have support for interleaving with scalable vectors
  // we can calculate the cost properly here.
  if (VF.isScalable())
    return InstructionCost::getInvalid();

  Type *ValTy = getLoadStoreType(I);
  auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
  unsigned AS = getLoadStoreAddressSpace(I);

  auto Group = getInterleavedAccessGroup(I);
  assert(Group && "Fail to get an interleaved access group.");

  unsigned InterleaveFactor = Group->getFactor();
  auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);

  // Holds the indices of existing members in the interleaved group.
  SmallVector<unsigned, 4> Indices;
  for (unsigned IF = 0; IF < InterleaveFactor; IF++)
    if (Group->getMember(IF))
      Indices.push_back(IF);

  // Calculate the cost of the whole interleaved group.
  bool UseMaskForGaps =
      (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
      (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()));
  InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
      I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
      AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps);

  if (Group->isReverse()) {
    // TODO: Add support for reversed masked interleaved access.
    assert(!Legal->isMaskRequired(I) &&
           "Reverse masked interleaved access not supported.");
    Cost +=
        Group->getNumMembers() *
        TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0);
  }
  return Cost;
}

Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost(
    Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) {
  using namespace llvm::PatternMatch;
  // Early exit for no in-loop reductions.
  if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty))
    return None;
  auto *VectorTy = cast<VectorType>(Ty);

  // We are looking for one of the following patterns, and for the minimal
  // acceptable cost of it:
  //   reduce(mul(ext(A), ext(B))) or
  //   reduce(mul(A, B)) or
  //   reduce(ext(A)) or
  //   reduce(A).
  // The basic idea is that we walk down the tree to do that, finding the root
  // reduction instruction in InLoopReductionImmediateChains. From there we
  // find the pattern of mul/ext and test the cost of the entire pattern vs
  // the cost of the components. If the reduction cost is lower, we return it
  // for the reduction instruction and 0 for the other instructions in the
  // pattern. If it is not, we return None so that the original cost model is
  // used instead.
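  // For example (illustrative IR), with an i32 add reduction the chain
  //   %a.ext = sext i8 %a to i32
  //   %b.ext = sext i8 %b to i32
  //   %mul   = mul i32 %a.ext, %b.ext
  //   %sum   = add i32 %mul, %sum.phi
  // can be costed as one extended multiply-accumulate reduction instead of
  // the sum of the individual extend, multiply and add costs.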
7256 Instruction *RetI = I; 7257 if (match(RetI, m_ZExtOrSExt(m_Value()))) { 7258 if (!RetI->hasOneUser()) 7259 return None; 7260 RetI = RetI->user_back(); 7261 } 7262 if (match(RetI, m_Mul(m_Value(), m_Value())) && 7263 RetI->user_back()->getOpcode() == Instruction::Add) { 7264 if (!RetI->hasOneUser()) 7265 return None; 7266 RetI = RetI->user_back(); 7267 } 7268 7269 // Test if the found instruction is a reduction, and if not return an invalid 7270 // cost specifying the parent to use the original cost modelling. 7271 if (!InLoopReductionImmediateChains.count(RetI)) 7272 return None; 7273 7274 // Find the reduction this chain is a part of and calculate the basic cost of 7275 // the reduction on its own. 7276 Instruction *LastChain = InLoopReductionImmediateChains[RetI]; 7277 Instruction *ReductionPhi = LastChain; 7278 while (!isa<PHINode>(ReductionPhi)) 7279 ReductionPhi = InLoopReductionImmediateChains[ReductionPhi]; 7280 7281 const RecurrenceDescriptor &RdxDesc = 7282 Legal->getReductionVars()[cast<PHINode>(ReductionPhi)]; 7283 7284 InstructionCost BaseCost = TTI.getArithmeticReductionCost( 7285 RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind); 7286 7287 // If we're using ordered reductions then we can just return the base cost 7288 // here, since getArithmeticReductionCost calculates the full ordered 7289 // reduction cost when FP reassociation is not allowed. 7290 if (useOrderedReductions(RdxDesc)) 7291 return BaseCost; 7292 7293 // Get the operand that was not the reduction chain and match it to one of the 7294 // patterns, returning the better cost if it is found. 7295 Instruction *RedOp = RetI->getOperand(1) == LastChain 7296 ? dyn_cast<Instruction>(RetI->getOperand(0)) 7297 : dyn_cast<Instruction>(RetI->getOperand(1)); 7298 7299 VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy); 7300 7301 Instruction *Op0, *Op1; 7302 if (RedOp && 7303 match(RedOp, 7304 m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) && 7305 match(Op0, m_ZExtOrSExt(m_Value())) && 7306 Op0->getOpcode() == Op1->getOpcode() && 7307 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() && 7308 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) && 7309 (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) { 7310 7311 // Matched reduce(ext(mul(ext(A), ext(B))) 7312 // Note that the extend opcodes need to all match, or if A==B they will have 7313 // been converted to zext(mul(sext(A), sext(A))) as it is known positive, 7314 // which is equally fine. 7315 bool IsUnsigned = isa<ZExtInst>(Op0); 7316 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy); 7317 auto *MulType = VectorType::get(Op0->getType(), VectorTy); 7318 7319 InstructionCost ExtCost = 7320 TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType, 7321 TTI::CastContextHint::None, CostKind, Op0); 7322 InstructionCost MulCost = 7323 TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind); 7324 InstructionCost Ext2Cost = 7325 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType, 7326 TTI::CastContextHint::None, CostKind, RedOp); 7327 7328 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 7329 /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 7330 CostKind); 7331 7332 if (RedCost.isValid() && 7333 RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost) 7334 return I == RetI ? 
RedCost : 0; 7335 } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) && 7336 !TheLoop->isLoopInvariant(RedOp)) { 7337 // Matched reduce(ext(A)) 7338 bool IsUnsigned = isa<ZExtInst>(RedOp); 7339 auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy); 7340 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 7341 /*IsMLA=*/false, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 7342 CostKind); 7343 7344 InstructionCost ExtCost = 7345 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType, 7346 TTI::CastContextHint::None, CostKind, RedOp); 7347 if (RedCost.isValid() && RedCost < BaseCost + ExtCost) 7348 return I == RetI ? RedCost : 0; 7349 } else if (RedOp && 7350 match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) { 7351 if (match(Op0, m_ZExtOrSExt(m_Value())) && 7352 Op0->getOpcode() == Op1->getOpcode() && 7353 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() && 7354 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) { 7355 bool IsUnsigned = isa<ZExtInst>(Op0); 7356 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy); 7357 // Matched reduce(mul(ext, ext)) 7358 InstructionCost ExtCost = 7359 TTI.getCastInstrCost(Op0->getOpcode(), VectorTy, ExtType, 7360 TTI::CastContextHint::None, CostKind, Op0); 7361 InstructionCost MulCost = 7362 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 7363 7364 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 7365 /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 7366 CostKind); 7367 7368 if (RedCost.isValid() && RedCost < ExtCost * 2 + MulCost + BaseCost) 7369 return I == RetI ? RedCost : 0; 7370 } else if (!match(I, m_ZExtOrSExt(m_Value()))) { 7371 // Matched reduce(mul()) 7372 InstructionCost MulCost = 7373 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 7374 7375 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 7376 /*IsMLA=*/true, true, RdxDesc.getRecurrenceType(), VectorTy, 7377 CostKind); 7378 7379 if (RedCost.isValid() && RedCost < MulCost + BaseCost) 7380 return I == RetI ? RedCost : 0; 7381 } 7382 } 7383 7384 return I == RetI ? Optional<InstructionCost>(BaseCost) : None; 7385 } 7386 7387 InstructionCost 7388 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, 7389 ElementCount VF) { 7390 // Calculate scalar cost only. Vectorization cost should be ready at this 7391 // moment. 7392 if (VF.isScalar()) { 7393 Type *ValTy = getLoadStoreType(I); 7394 const Align Alignment = getLoadStoreAlignment(I); 7395 unsigned AS = getLoadStoreAddressSpace(I); 7396 7397 return TTI.getAddressComputationCost(ValTy) + 7398 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, 7399 TTI::TCK_RecipThroughput, I); 7400 } 7401 return getWideningCost(I, VF); 7402 } 7403 7404 LoopVectorizationCostModel::VectorizationCostTy 7405 LoopVectorizationCostModel::getInstructionCost(Instruction *I, 7406 ElementCount VF) { 7407 // If we know that this instruction will remain uniform, check the cost of 7408 // the scalar version. 7409 if (isUniformAfterVectorization(I, VF)) 7410 VF = ElementCount::getFixed(1); 7411 7412 if (VF.isVector() && isProfitableToScalarize(I, VF)) 7413 return VectorizationCostTy(InstsToScalarize[VF][I], false); 7414 7415 // Forced scalars do not have any scalarization overhead. 
7416 auto ForcedScalar = ForcedScalars.find(VF); 7417 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) { 7418 auto InstSet = ForcedScalar->second; 7419 if (InstSet.count(I)) 7420 return VectorizationCostTy( 7421 (getInstructionCost(I, ElementCount::getFixed(1)).first * 7422 VF.getKnownMinValue()), 7423 false); 7424 } 7425 7426 Type *VectorTy; 7427 InstructionCost C = getInstructionCost(I, VF, VectorTy); 7428 7429 bool TypeNotScalarized = 7430 VF.isVector() && VectorTy->isVectorTy() && 7431 TTI.getNumberOfParts(VectorTy) < VF.getKnownMinValue(); 7432 return VectorizationCostTy(C, TypeNotScalarized); 7433 } 7434 7435 InstructionCost 7436 LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, 7437 ElementCount VF) const { 7438 7439 // There is no mechanism yet to create a scalable scalarization loop, 7440 // so this is currently Invalid. 7441 if (VF.isScalable()) 7442 return InstructionCost::getInvalid(); 7443 7444 if (VF.isScalar()) 7445 return 0; 7446 7447 InstructionCost Cost = 0; 7448 Type *RetTy = ToVectorTy(I->getType(), VF); 7449 if (!RetTy->isVoidTy() && 7450 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) 7451 Cost += TTI.getScalarizationOverhead( 7452 cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()), true, 7453 false); 7454 7455 // Some targets keep addresses scalar. 7456 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) 7457 return Cost; 7458 7459 // Some targets support efficient element stores. 7460 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore()) 7461 return Cost; 7462 7463 // Collect operands to consider. 7464 CallInst *CI = dyn_cast<CallInst>(I); 7465 Instruction::op_range Ops = CI ? CI->args() : I->operands(); 7466 7467 // Skip operands that do not require extraction/scalarization and do not incur 7468 // any overhead. 7469 SmallVector<Type *> Tys; 7470 for (auto *V : filterExtractingOperands(Ops, VF)) 7471 Tys.push_back(MaybeVectorizeType(V->getType(), VF)); 7472 return Cost + TTI.getOperandsScalarizationOverhead( 7473 filterExtractingOperands(Ops, VF), Tys); 7474 } 7475 7476 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { 7477 if (VF.isScalar()) 7478 return; 7479 NumPredStores = 0; 7480 for (BasicBlock *BB : TheLoop->blocks()) { 7481 // For each instruction in the old loop. 7482 for (Instruction &I : *BB) { 7483 Value *Ptr = getLoadStorePointerOperand(&I); 7484 if (!Ptr) 7485 continue; 7486 7487 // TODO: We should generate better code and update the cost model for 7488 // predicated uniform stores. Today they are treated as any other 7489 // predicated store (see added test cases in 7490 // invariant-store-vectorization.ll). 7491 if (isa<StoreInst>(&I) && isScalarWithPredication(&I)) 7492 NumPredStores++; 7493 7494 if (Legal->isUniformMemOp(I)) { 7495 // TODO: Avoid replicating loads and stores instead of 7496 // relying on instcombine to remove them. 7497 // Load: Scalar load + broadcast 7498 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract 7499 InstructionCost Cost = getUniformMemOpCost(&I, VF); 7500 setWideningDecision(&I, VF, CM_Scalarize, Cost); 7501 continue; 7502 } 7503 7504 // We assume that widening is the best solution when possible. 
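      // For example (illustrative): a load of a[i] through a unit-stride
      // pointer can be widened into a single wide load below, whereas an
      // access such as a[3 * i] is not consecutive and falls through to the
      // interleave / gather-scatter / scalarization comparison that follows.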
      if (memoryInstructionCanBeWidened(&I, VF)) {
        InstructionCost Cost = getConsecutiveMemOpCost(&I, VF);
        int ConsecutiveStride = Legal->isConsecutivePtr(
            getLoadStoreType(&I), getLoadStorePointerOperand(&I));
        assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
               "Expected consecutive stride.");
        InstWidening Decision =
            ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
        setWideningDecision(&I, VF, Decision, Cost);
        continue;
      }

      // Choose between Interleaving, Gather/Scatter or Scalarization.
      InstructionCost InterleaveCost = InstructionCost::getInvalid();
      unsigned NumAccesses = 1;
      if (isAccessInterleaved(&I)) {
        auto Group = getInterleavedAccessGroup(&I);
        assert(Group && "Fail to get an interleaved access group.");

        // Make one decision for the whole group.
        if (getWideningDecision(&I, VF) != CM_Unknown)
          continue;

        NumAccesses = Group->getNumMembers();
        if (interleavedAccessCanBeWidened(&I, VF))
          InterleaveCost = getInterleaveGroupCost(&I, VF);
      }

      InstructionCost GatherScatterCost =
          isLegalGatherOrScatter(&I)
              ? getGatherScatterCost(&I, VF) * NumAccesses
              : InstructionCost::getInvalid();

      InstructionCost ScalarizationCost =
          getMemInstScalarizationCost(&I, VF) * NumAccesses;

      // Choose the better solution for the current VF, record this decision,
      // and use it during vectorization.
      InstructionCost Cost;
      InstWidening Decision;
      if (InterleaveCost <= GatherScatterCost &&
          InterleaveCost < ScalarizationCost) {
        Decision = CM_Interleave;
        Cost = InterleaveCost;
      } else if (GatherScatterCost < ScalarizationCost) {
        Decision = CM_GatherScatter;
        Cost = GatherScatterCost;
      } else {
        Decision = CM_Scalarize;
        Cost = ScalarizationCost;
      }
      // If the instruction belongs to an interleave group, the whole group
      // receives the same decision. The whole group receives the cost, but
      // the cost will actually be assigned to one instruction.
      if (auto Group = getInterleavedAccessGroup(&I))
        setWideningDecision(Group, VF, Decision, Cost);
      else
        setWideningDecision(&I, VF, Decision, Cost);
    }
  }

  // Make sure that any load of an address and any other address computation
  // remain scalar unless there is gather/scatter support. This avoids
  // inevitable extracts into address registers, and also has the benefit of
  // activating LSR more, since that pass can't optimize vectorized
  // addresses.
  if (TTI.prefersVectorizedAddressing())
    return;

  // Start with all scalar pointer uses.
  SmallPtrSet<Instruction *, 8> AddrDefs;
  for (BasicBlock *BB : TheLoop->blocks())
    for (Instruction &I : *BB) {
      Instruction *PtrDef =
          dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
      if (PtrDef && TheLoop->contains(PtrDef) &&
          getWideningDecision(&I, VF) != CM_GatherScatter)
        AddrDefs.insert(PtrDef);
    }

  // Add all instructions used to generate the addresses.
  SmallVector<Instruction *, 4> Worklist;
  append_range(Worklist, AddrDefs);
  while (!Worklist.empty()) {
    Instruction *I = Worklist.pop_back_val();
    for (auto &Op : I->operands())
      if (auto *InstOp = dyn_cast<Instruction>(Op))
        if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
            AddrDefs.insert(InstOp).second)
          Worklist.push_back(InstOp);
  }

  for (auto *I : AddrDefs) {
    if (isa<LoadInst>(I)) {
      // Setting the desired widening decision should ideally be handled by
      // the cost functions, but since this involves the task of finding out
      // if the loaded register is involved in an address computation, it is
      // instead changed here when we know this is the case.
      InstWidening Decision = getWideningDecision(I, VF);
      if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
        // Scalarize a widened load of an address.
        setWideningDecision(
            I, VF, CM_Scalarize,
            (VF.getKnownMinValue() *
             getMemoryInstructionCost(I, ElementCount::getFixed(1))));
      else if (auto Group = getInterleavedAccessGroup(I)) {
        // Scalarize an interleave group of address loads.
        for (unsigned I = 0; I < Group->getFactor(); ++I) {
          if (Instruction *Member = Group->getMember(I))
            setWideningDecision(
                Member, VF, CM_Scalarize,
                (VF.getKnownMinValue() *
                 getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
        }
      }
    } else
      // Make sure I gets scalarized and a cost estimate without
      // scalarization overhead.
      ForcedScalars[VF].insert(I);
  }
}

InstructionCost
LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
                                               Type *&VectorTy) {
  Type *RetTy = I->getType();
  if (canTruncateToMinimalBitwidth(I, VF))
    RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
  auto SE = PSE.getSE();
  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;

  auto hasSingleCopyAfterVectorization = [this](Instruction *I,
                                                ElementCount VF) -> bool {
    if (VF.isScalar())
      return true;

    auto Scalarized = InstsToScalarize.find(VF);
    assert(Scalarized != InstsToScalarize.end() &&
           "VF not yet analyzed for scalarization profitability");
    return !Scalarized->second.count(I) &&
           llvm::all_of(I->users(), [&](User *U) {
             auto *UI = cast<Instruction>(U);
             return !Scalarized->second.count(UI);
           });
  };
  (void) hasSingleCopyAfterVectorization;

  if (isScalarAfterVectorization(I, VF)) {
    // With the exception of GEPs and PHIs, after scalarization there should
    // only be one copy of the instruction generated in the loop. This is
    // because the VF is either 1, or any instructions that need scalarizing
    // have already been dealt with by the time we get here. As a result, we
    // don't have to multiply the instruction cost by VF.
    assert(I->getOpcode() == Instruction::GetElementPtr ||
           I->getOpcode() == Instruction::PHI ||
           (I->getOpcode() == Instruction::BitCast &&
            I->getType()->isPointerTy()) ||
           hasSingleCopyAfterVectorization(I, VF));
    VectorTy = RetTy;
  } else
    VectorTy = ToVectorTy(RetTy, VF);

  // TODO: We need to estimate the cost of intrinsic calls.
7668 switch (I->getOpcode()) { 7669 case Instruction::GetElementPtr: 7670 // We mark this instruction as zero-cost because the cost of GEPs in 7671 // vectorized code depends on whether the corresponding memory instruction 7672 // is scalarized or not. Therefore, we handle GEPs with the memory 7673 // instruction cost. 7674 return 0; 7675 case Instruction::Br: { 7676 // In cases of scalarized and predicated instructions, there will be VF 7677 // predicated blocks in the vectorized loop. Each branch around these 7678 // blocks requires also an extract of its vector compare i1 element. 7679 bool ScalarPredicatedBB = false; 7680 BranchInst *BI = cast<BranchInst>(I); 7681 if (VF.isVector() && BI->isConditional() && 7682 (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) || 7683 PredicatedBBsAfterVectorization.count(BI->getSuccessor(1)))) 7684 ScalarPredicatedBB = true; 7685 7686 if (ScalarPredicatedBB) { 7687 // Not possible to scalarize scalable vector with predicated instructions. 7688 if (VF.isScalable()) 7689 return InstructionCost::getInvalid(); 7690 // Return cost for branches around scalarized and predicated blocks. 7691 auto *Vec_i1Ty = 7692 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); 7693 return ( 7694 TTI.getScalarizationOverhead( 7695 Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()), false, true) + 7696 (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue())); 7697 } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar()) 7698 // The back-edge branch will remain, as will all scalar branches. 7699 return TTI.getCFInstrCost(Instruction::Br, CostKind); 7700 else 7701 // This branch will be eliminated by if-conversion. 7702 return 0; 7703 // Note: We currently assume zero cost for an unconditional branch inside 7704 // a predicated block since it will become a fall-through, although we 7705 // may decide in the future to call TTI for all branches. 7706 } 7707 case Instruction::PHI: { 7708 auto *Phi = cast<PHINode>(I); 7709 7710 // First-order recurrences are replaced by vector shuffles inside the loop. 7711 // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. 7712 if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi)) 7713 return TTI.getShuffleCost( 7714 TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy), 7715 None, VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1)); 7716 7717 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 7718 // converted into select instructions. We require N - 1 selects per phi 7719 // node, where N is the number of incoming values. 7720 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) 7721 return (Phi->getNumIncomingValues() - 1) * 7722 TTI.getCmpSelInstrCost( 7723 Instruction::Select, ToVectorTy(Phi->getType(), VF), 7724 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF), 7725 CmpInst::BAD_ICMP_PREDICATE, CostKind); 7726 7727 return TTI.getCFInstrCost(Instruction::PHI, CostKind); 7728 } 7729 case Instruction::UDiv: 7730 case Instruction::SDiv: 7731 case Instruction::URem: 7732 case Instruction::SRem: 7733 // If we have a predicated instruction, it may not be executed for each 7734 // vector lane. Get the scalarization cost and scale this amount by the 7735 // probability of executing the predicated block. If the instruction is not 7736 // predicated, we fall through to the next case. 
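    // For example (illustrative): a udiv that the source guards with a
    // non-zero check must stay scalar and predicated, because speculatively
    // executing it for masked-off lanes could divide by zero; its cost is
    // built below from the per-lane scalar cost plus phi and insert/extract
    // overhead, scaled by the block's execution probability.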
    if (VF.isVector() && isScalarWithPredication(I)) {
      InstructionCost Cost = 0;

      // These instructions have a non-void type, so account for the phi
      // nodes that we will create. This cost is likely to be zero. The phi
      // node cost, if any, should be scaled by the block probability because
      // it models a copy at the end of each predicated block.
      Cost += VF.getKnownMinValue() *
              TTI.getCFInstrCost(Instruction::PHI, CostKind);

      // The cost of the non-predicated instruction.
      Cost += VF.getKnownMinValue() *
              TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind);

      // The cost of insertelement and extractelement instructions needed for
      // scalarization.
      Cost += getScalarizationOverhead(I, VF);

      // Scale the cost by the probability of executing the predicated
      // blocks. This assumes the predicated block for each vector lane is
      // equally likely.
      return Cost / getReciprocalPredBlockProb();
    }
    LLVM_FALLTHROUGH;
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    // Since we will replace the stride by 1, the multiplication should go
    // away.
    if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
      return 0;

    // Detect reduction patterns.
    if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
      return *RedCost;

    // Certain instructions can be cheaper to vectorize if they have a
    // constant second vector operand. One example of this is shifts on x86.
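    // For instance (illustrative): a shift whose amount is a constant or a
    // loop-invariant value is reported to the target as having a uniform
    // second operand, which some targets cost more cheaply than a shift by a
    // per-lane variable amount; the operand-info query below is what exposes
    // that distinction.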
7785 Value *Op2 = I->getOperand(1); 7786 TargetTransformInfo::OperandValueProperties Op2VP; 7787 TargetTransformInfo::OperandValueKind Op2VK = 7788 TTI.getOperandInfo(Op2, Op2VP); 7789 if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) 7790 Op2VK = TargetTransformInfo::OK_UniformValue; 7791 7792 SmallVector<const Value *, 4> Operands(I->operand_values()); 7793 return TTI.getArithmeticInstrCost( 7794 I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, 7795 Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); 7796 } 7797 case Instruction::FNeg: { 7798 return TTI.getArithmeticInstrCost( 7799 I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, 7800 TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None, 7801 TargetTransformInfo::OP_None, I->getOperand(0), I); 7802 } 7803 case Instruction::Select: { 7804 SelectInst *SI = cast<SelectInst>(I); 7805 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 7806 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 7807 7808 const Value *Op0, *Op1; 7809 using namespace llvm::PatternMatch; 7810 if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) || 7811 match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) { 7812 // select x, y, false --> x & y 7813 // select x, true, y --> x | y 7814 TTI::OperandValueProperties Op1VP = TTI::OP_None; 7815 TTI::OperandValueProperties Op2VP = TTI::OP_None; 7816 TTI::OperandValueKind Op1VK = TTI::getOperandInfo(Op0, Op1VP); 7817 TTI::OperandValueKind Op2VK = TTI::getOperandInfo(Op1, Op2VP); 7818 assert(Op0->getType()->getScalarSizeInBits() == 1 && 7819 Op1->getType()->getScalarSizeInBits() == 1); 7820 7821 SmallVector<const Value *, 2> Operands{Op0, Op1}; 7822 return TTI.getArithmeticInstrCost( 7823 match(I, m_LogicalOr()) ? 
Instruction::Or : Instruction::And, VectorTy, 7824 CostKind, Op1VK, Op2VK, Op1VP, Op2VP, Operands, I); 7825 } 7826 7827 Type *CondTy = SI->getCondition()->getType(); 7828 if (!ScalarCond) 7829 CondTy = VectorType::get(CondTy, VF); 7830 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, 7831 CmpInst::BAD_ICMP_PREDICATE, CostKind, I); 7832 } 7833 case Instruction::ICmp: 7834 case Instruction::FCmp: { 7835 Type *ValTy = I->getOperand(0)->getType(); 7836 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 7837 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 7838 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); 7839 VectorTy = ToVectorTy(ValTy, VF); 7840 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, 7841 CmpInst::BAD_ICMP_PREDICATE, CostKind, I); 7842 } 7843 case Instruction::Store: 7844 case Instruction::Load: { 7845 ElementCount Width = VF; 7846 if (Width.isVector()) { 7847 InstWidening Decision = getWideningDecision(I, Width); 7848 assert(Decision != CM_Unknown && 7849 "CM decision should be taken at this point"); 7850 if (Decision == CM_Scalarize) 7851 Width = ElementCount::getFixed(1); 7852 } 7853 VectorTy = ToVectorTy(getLoadStoreType(I), Width); 7854 return getMemoryInstructionCost(I, VF); 7855 } 7856 case Instruction::BitCast: 7857 if (I->getType()->isPointerTy()) 7858 return 0; 7859 LLVM_FALLTHROUGH; 7860 case Instruction::ZExt: 7861 case Instruction::SExt: 7862 case Instruction::FPToUI: 7863 case Instruction::FPToSI: 7864 case Instruction::FPExt: 7865 case Instruction::PtrToInt: 7866 case Instruction::IntToPtr: 7867 case Instruction::SIToFP: 7868 case Instruction::UIToFP: 7869 case Instruction::Trunc: 7870 case Instruction::FPTrunc: { 7871 // Computes the CastContextHint from a Load/Store instruction. 7872 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint { 7873 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 7874 "Expected a load or a store!"); 7875 7876 if (VF.isScalar() || !TheLoop->contains(I)) 7877 return TTI::CastContextHint::Normal; 7878 7879 switch (getWideningDecision(I, VF)) { 7880 case LoopVectorizationCostModel::CM_GatherScatter: 7881 return TTI::CastContextHint::GatherScatter; 7882 case LoopVectorizationCostModel::CM_Interleave: 7883 return TTI::CastContextHint::Interleave; 7884 case LoopVectorizationCostModel::CM_Scalarize: 7885 case LoopVectorizationCostModel::CM_Widen: 7886 return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked 7887 : TTI::CastContextHint::Normal; 7888 case LoopVectorizationCostModel::CM_Widen_Reverse: 7889 return TTI::CastContextHint::Reversed; 7890 case LoopVectorizationCostModel::CM_Unknown: 7891 llvm_unreachable("Instr did not go through cost modelling?"); 7892 } 7893 7894 llvm_unreachable("Unhandled case!"); 7895 }; 7896 7897 unsigned Opcode = I->getOpcode(); 7898 TTI::CastContextHint CCH = TTI::CastContextHint::None; 7899 // For Trunc, the context is the only user, which must be a StoreInst. 7900 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) { 7901 if (I->hasOneUse()) 7902 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin())) 7903 CCH = ComputeCCH(Store); 7904 } 7905 // For Z/Sext, the context is the operand, which must be a LoadInst. 
7906 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || 7907 Opcode == Instruction::FPExt) { 7908 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0))) 7909 CCH = ComputeCCH(Load); 7910 } 7911 7912 // We optimize the truncation of induction variables having constant 7913 // integer steps. The cost of these truncations is the same as the scalar 7914 // operation. 7915 if (isOptimizableIVTruncate(I, VF)) { 7916 auto *Trunc = cast<TruncInst>(I); 7917 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 7918 Trunc->getSrcTy(), CCH, CostKind, Trunc); 7919 } 7920 7921 // Detect reduction patterns 7922 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7923 return *RedCost; 7924 7925 Type *SrcScalarTy = I->getOperand(0)->getType(); 7926 Type *SrcVecTy = 7927 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; 7928 if (canTruncateToMinimalBitwidth(I, VF)) { 7929 // This cast is going to be shrunk. This may remove the cast or it might 7930 // turn it into slightly different cast. For example, if MinBW == 16, 7931 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". 7932 // 7933 // Calculate the modified src and dest types. 7934 Type *MinVecTy = VectorTy; 7935 if (Opcode == Instruction::Trunc) { 7936 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); 7937 VectorTy = 7938 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7939 } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) { 7940 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); 7941 VectorTy = 7942 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7943 } 7944 } 7945 7946 return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); 7947 } 7948 case Instruction::Call: { 7949 bool NeedToScalarize; 7950 CallInst *CI = cast<CallInst>(I); 7951 InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize); 7952 if (getVectorIntrinsicIDForCall(CI, TLI)) { 7953 InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF); 7954 return std::min(CallCost, IntrinsicCost); 7955 } 7956 return CallCost; 7957 } 7958 case Instruction::ExtractValue: 7959 return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput); 7960 case Instruction::Alloca: 7961 // We cannot easily widen alloca to a scalable alloca, as 7962 // the result would need to be a vector of pointers. 7963 if (VF.isScalable()) 7964 return InstructionCost::getInvalid(); 7965 LLVM_FALLTHROUGH; 7966 default: 7967 // This opcode is unknown. Assume that it is the same as 'mul'. 7968 return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 7969 } // end of switch. 
7970 } 7971 7972 char LoopVectorize::ID = 0; 7973 7974 static const char lv_name[] = "Loop Vectorization"; 7975 7976 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 7977 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 7978 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) 7979 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 7980 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) 7981 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 7982 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 7983 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 7984 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 7985 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 7986 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) 7987 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) 7988 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 7989 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 7990 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy) 7991 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 7992 7993 namespace llvm { 7994 7995 Pass *createLoopVectorizePass() { return new LoopVectorize(); } 7996 7997 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced, 7998 bool VectorizeOnlyWhenForced) { 7999 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced); 8000 } 8001 8002 } // end namespace llvm 8003 8004 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { 8005 // Check if the pointer operand of a load or store instruction is 8006 // consecutive. 8007 if (auto *Ptr = getLoadStorePointerOperand(Inst)) 8008 return Legal->isConsecutivePtr(getLoadStoreType(Inst), Ptr); 8009 return false; 8010 } 8011 8012 void LoopVectorizationCostModel::collectValuesToIgnore() { 8013 // Ignore ephemeral values. 8014 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); 8015 8016 // Ignore type-promoting instructions we identified during reduction 8017 // detection. 8018 for (auto &Reduction : Legal->getReductionVars()) { 8019 RecurrenceDescriptor &RedDes = Reduction.second; 8020 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); 8021 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 8022 } 8023 // Ignore type-casting instructions we identified during induction 8024 // detection. 8025 for (auto &Induction : Legal->getInductionVars()) { 8026 InductionDescriptor &IndDes = Induction.second; 8027 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 8028 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 8029 } 8030 } 8031 8032 void LoopVectorizationCostModel::collectInLoopReductions() { 8033 for (auto &Reduction : Legal->getReductionVars()) { 8034 PHINode *Phi = Reduction.first; 8035 RecurrenceDescriptor &RdxDesc = Reduction.second; 8036 8037 // We don't collect reductions that are type promoted (yet). 8038 if (RdxDesc.getRecurrenceType() != Phi->getType()) 8039 continue; 8040 8041 // If the target would prefer this reduction to happen "in-loop", then we 8042 // want to record it as such. 8043 unsigned Opcode = RdxDesc.getOpcode(); 8044 if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) && 8045 !TTI.preferInLoopReduction(Opcode, Phi->getType(), 8046 TargetTransformInfo::ReductionFlags())) 8047 continue; 8048 8049 // Check that we can correctly put the reductions into the loop, by 8050 // finding the chain of operations that leads from the phi to the loop 8051 // exit value. 
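    // For example (illustrative): for an integer reduction such as
    //   %sum.next = add i32 %sum.phi, %val
    // the chain is just the single add; if no such chain from the phi to the
    // exit value can be found, the list comes back empty and the reduction is
    // treated as out-of-loop.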
    SmallVector<Instruction *, 4> ReductionOperations =
        RdxDesc.getReductionOpChain(Phi, TheLoop);
    bool InLoop = !ReductionOperations.empty();
    if (InLoop) {
      InLoopReductionChains[Phi] = ReductionOperations;
      // Add the elements to InLoopReductionImmediateChains for cost modelling.
      Instruction *LastChain = Phi;
      for (auto *I : ReductionOperations) {
        InLoopReductionImmediateChains[I] = LastChain;
        LastChain = I;
      }
    }
    LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
                      << " reduction for phi: " << *Phi << "\n");
  }
}

// TODO: We could return a pair of values that specify the max VF and
// min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
// `buildVPlans(VF, VF)`. We cannot do it because VPlan at the moment
// doesn't have a cost model that can choose which plan to execute if
// more than one is generated.
static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
                                 LoopVectorizationCostModel &CM) {
  unsigned WidestType;
  std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
  return WidestVectorRegBits / WidestType;
}

VectorizationFactor
LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
  assert(!UserVF.isScalable() && "scalable vectors not yet supported");
  ElementCount VF = UserVF;
  // Outer loop handling: outer loops may require CFG and instruction-level
  // transformations before even evaluating whether vectorization is
  // profitable. Since we cannot modify the incoming IR, we need to build
  // VPlan upfront in the vectorization pipeline.
  if (!OrigLoop->isInnermost()) {
    // If the user doesn't provide a vectorization factor, determine a
    // reasonable one.
    if (UserVF.isZero()) {
      VF = ElementCount::getFixed(determineVPlanVF(
          TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
              .getFixedSize(),
          CM));
      LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");

      // Make sure we have a VF > 1 for stress testing.
      if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
        LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
                          << "overriding computed VF.\n");
        VF = ElementCount::getFixed(4);
      }
    }
    assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
    assert(isPowerOf2_32(VF.getKnownMinValue()) &&
           "VF needs to be a power of two");
    LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
                      << "VF " << VF << " to build VPlans.\n");
    buildVPlans(VF, VF);

    // For VPlan build stress testing, we bail out after VPlan construction.
    if (VPlanBuildStressTest)
      return VectorizationFactor::Disabled();

    return {VF, 0 /*Cost*/};
  }

  LLVM_DEBUG(
      dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
                "VPlan-native path.\n");
  return VectorizationFactor::Disabled();
}

Optional<VectorizationFactor>
LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
  assert(OrigLoop->isInnermost() && "Inner loop expected.");
  FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
  if (!MaxFactors) // Cases that should not be vectorized nor interleaved.
    return None;

  // Invalidate interleave groups if all blocks of the loop will be predicated.
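  // For example (illustrative): when the tail is folded by masking, every
  // block becomes predicated; if the target cannot emit masked interleaved
  // loads/stores, a group built for accesses like a[2*i] and a[2*i+1] can no
  // longer be lowered as one wide masked access, so the group and every
  // decision derived from it must be dropped.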
8134 if (CM.blockNeedsPredication(OrigLoop->getHeader()) && 8135 !useMaskedInterleavedAccesses(*TTI)) { 8136 LLVM_DEBUG( 8137 dbgs() 8138 << "LV: Invalidate all interleaved groups due to fold-tail by masking " 8139 "which requires masked-interleaved support.\n"); 8140 if (CM.InterleaveInfo.invalidateGroups()) 8141 // Invalidating interleave groups also requires invalidating all decisions 8142 // based on them, which includes widening decisions and uniform and scalar 8143 // values. 8144 CM.invalidateCostModelingDecisions(); 8145 } 8146 8147 ElementCount MaxUserVF = 8148 UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF; 8149 bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF); 8150 if (!UserVF.isZero() && UserVFIsLegal) { 8151 assert(isPowerOf2_32(UserVF.getKnownMinValue()) && 8152 "VF needs to be a power of two"); 8153 // Collect the instructions (and their associated costs) that will be more 8154 // profitable to scalarize. 8155 if (CM.selectUserVectorizationFactor(UserVF)) { 8156 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); 8157 CM.collectInLoopReductions(); 8158 buildVPlansWithVPRecipes(UserVF, UserVF); 8159 LLVM_DEBUG(printPlans(dbgs())); 8160 return {{UserVF, 0}}; 8161 } else 8162 reportVectorizationInfo("UserVF ignored because of invalid costs.", 8163 "InvalidCost", ORE, OrigLoop); 8164 } 8165 8166 // Populate the set of Vectorization Factor Candidates. 8167 ElementCountSet VFCandidates; 8168 for (auto VF = ElementCount::getFixed(1); 8169 ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2) 8170 VFCandidates.insert(VF); 8171 for (auto VF = ElementCount::getScalable(1); 8172 ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2) 8173 VFCandidates.insert(VF); 8174 8175 for (const auto &VF : VFCandidates) { 8176 // Collect Uniform and Scalar instructions after vectorization with VF. 8177 CM.collectUniformsAndScalars(VF); 8178 8179 // Collect the instructions (and their associated costs) that will be more 8180 // profitable to scalarize. 8181 if (VF.isVector()) 8182 CM.collectInstsToScalarize(VF); 8183 } 8184 8185 CM.collectInLoopReductions(); 8186 buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF); 8187 buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF); 8188 8189 LLVM_DEBUG(printPlans(dbgs())); 8190 if (!MaxFactors.hasVector()) 8191 return VectorizationFactor::Disabled(); 8192 8193 // Select the optimal vectorization factor. 8194 auto SelectedVF = CM.selectVectorizationFactor(VFCandidates); 8195 8196 // Check if it is profitable to vectorize with runtime checks. 
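  // For example (illustrative): a loop that needs, say, dozens of pointer
  // overlap checks to prove that its memory operations can be reordered may
  // spend more in those checks than the vector body saves, so vectorization
  // is abandoned below once the relevant threshold is exceeded and reordering
  // was not explicitly allowed.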
  unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks();
  if (SelectedVF.Width.getKnownMinValue() > 1 && NumRuntimePointerChecks) {
    bool PragmaThresholdReached =
        NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold;
    bool ThresholdReached =
        NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold;
    if ((ThresholdReached && !Hints.allowReordering()) ||
        PragmaThresholdReached) {
      ORE->emit([&]() {
        return OptimizationRemarkAnalysisAliasing(
                   DEBUG_TYPE, "CantReorderMemOps", OrigLoop->getStartLoc(),
                   OrigLoop->getHeader())
               << "loop not vectorized: cannot prove it is safe to reorder "
                  "memory operations";
      });
      LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
      Hints.emitRemarkWithHints();
      return VectorizationFactor::Disabled();
    }
  }
  return SelectedVF;
}

VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const {
  assert(count_if(VPlans,
                  [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) ==
             1 &&
         "Best VF does not have a single VPlan.");

  for (const VPlanPtr &Plan : VPlans) {
    if (Plan->hasVF(VF))
      return *Plan.get();
  }
  llvm_unreachable("No plan found!");
}

void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF,
                                           VPlan &BestVPlan,
                                           InnerLoopVectorizer &ILV,
                                           DominatorTree *DT) {
  LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF
                    << ", UF=" << BestUF << '\n');

  // Perform the actual loop transformation.

  // 1. Create a new empty loop. Unlink the old loop and connect the new one.
  VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan};
  State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
  State.TripCount = ILV.getOrCreateTripCount(nullptr);
  State.CanonicalIV = ILV.Induction;

  ILV.printDebugTracesAtStart();

  //===------------------------------------------------===//
  //
  // Notice: any optimization or new instruction that goes
  // into the code below should also be implemented in
  // the cost-model.
  //
  //===------------------------------------------------===//

  // 2. Copy and widen instructions from the old loop into the new loop.
  BestVPlan.execute(&State);

  // 3. Fix the vectorized code: take care of header phis, live-outs,
  //    predication, updating analyses.
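  // (Rough mental model, not verbatim from this file: step 1 creates the empty
  // vector-loop skeleton (preheader, vector body, middle block, scalar
  // preheader) around the original loop; step 2 lets each recipe of the chosen
  // VPlan emit its widened IR into that skeleton; step 3 patches up what is
  // only known once all recipes have executed, e.g. the resume values feeding
  // the remaining scalar iterations.)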
  ILV.fixVectorizedLoop(State);

  ILV.printDebugTracesAtEnd();
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void LoopVectorizationPlanner::printPlans(raw_ostream &O) {
  for (const auto &Plan : VPlans)
    if (PrintVPlansInDotFormat)
      Plan->printDOT(O);
    else
      Plan->print(O);
}
#endif

void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
    SmallPtrSetImpl<Instruction *> &DeadInstructions) {

  // We create new control-flow for the vectorized loop, so the original exit
  // conditions will be dead after vectorization if they are only used by the
  // terminator.
  SmallVector<BasicBlock *> ExitingBlocks;
  OrigLoop->getExitingBlocks(ExitingBlocks);
  for (auto *BB : ExitingBlocks) {
    auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0));
    if (!Cmp || !Cmp->hasOneUse())
      continue;

    // TODO: we should introduce a getUniqueExitingBlocks on Loop
    if (!DeadInstructions.insert(Cmp).second)
      continue;

    // The operands of the icmp are often dead truncs, used by IndUpdate.
    // TODO: can recurse through operands in general
    for (Value *Op : Cmp->operands()) {
      if (isa<TruncInst>(Op) && Op->hasOneUse())
        DeadInstructions.insert(cast<Instruction>(Op));
    }
  }

  // We create new "steps" for induction variable updates to which the original
  // induction variables map. An original update instruction will be dead if
  // all its users except the induction variable are dead.
  auto *Latch = OrigLoop->getLoopLatch();
  for (auto &Induction : Legal->getInductionVars()) {
    PHINode *Ind = Induction.first;
    auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));

    // If the tail is to be folded by masking, the primary induction variable,
    // if it exists, isn't dead: it will be used for masking. Don't kill it.
    if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction())
      continue;

    if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
          return U == Ind || DeadInstructions.count(cast<Instruction>(U));
        }))
      DeadInstructions.insert(IndUpdate);

    // We record as "Dead" also the type-casting instructions we had identified
    // during induction analysis. We don't need any handling for them in the
    // vectorized loop because we have proven that, under a proper runtime test
    // guarding the vectorized loop, the value of the phi, and the casted value
    // of the phi, are the same. The last instruction in this casting chain
    // will get its scalar/vector/widened def from the scalar/vector/widened
    // def of the respective phi node. Any other casts in the induction def-use
    // chain have no other uses outside the phi update chain, and will be
    // ignored.
    InductionDescriptor &IndDes = Induction.second;
    const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
    DeadInstructions.insert(Casts.begin(), Casts.end());
  }
}

Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }

Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }

Value *InnerLoopUnroller::getStepVector(Value *Val, Value *StartIdx,
                                        Value *Step,
                                        Instruction::BinaryOps BinOp) {
  // When unrolling and the VF is 1, we only need to add a simple scalar.
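  // Illustrative example (hypothetical values): with VF=1 and UF=4, the "step
  // vector" for unrolled part j of an induction with step %step degenerates to
  // the scalar
  //   %induction = add %val, (%start.idx * %step)   ; %start.idx == j
  // which is exactly the scalar mul+add (or fmul+fadd) built below.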
8343 Type *Ty = Val->getType(); 8344 assert(!Ty->isVectorTy() && "Val must be a scalar"); 8345 8346 if (Ty->isFloatingPointTy()) { 8347 // Floating-point operations inherit FMF via the builder's flags. 8348 Value *MulOp = Builder.CreateFMul(StartIdx, Step); 8349 return Builder.CreateBinOp(BinOp, Val, MulOp); 8350 } 8351 return Builder.CreateAdd(Val, Builder.CreateMul(StartIdx, Step), "induction"); 8352 } 8353 8354 static void AddRuntimeUnrollDisableMetaData(Loop *L) { 8355 SmallVector<Metadata *, 4> MDs; 8356 // Reserve first location for self reference to the LoopID metadata node. 8357 MDs.push_back(nullptr); 8358 bool IsUnrollMetadata = false; 8359 MDNode *LoopID = L->getLoopID(); 8360 if (LoopID) { 8361 // First find existing loop unrolling disable metadata. 8362 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { 8363 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); 8364 if (MD) { 8365 const auto *S = dyn_cast<MDString>(MD->getOperand(0)); 8366 IsUnrollMetadata = 8367 S && S->getString().startswith("llvm.loop.unroll.disable"); 8368 } 8369 MDs.push_back(LoopID->getOperand(i)); 8370 } 8371 } 8372 8373 if (!IsUnrollMetadata) { 8374 // Add runtime unroll disable metadata. 8375 LLVMContext &Context = L->getHeader()->getContext(); 8376 SmallVector<Metadata *, 1> DisableOperands; 8377 DisableOperands.push_back( 8378 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 8379 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 8380 MDs.push_back(DisableNode); 8381 MDNode *NewLoopID = MDNode::get(Context, MDs); 8382 // Set operand 0 to refer to the loop id itself. 8383 NewLoopID->replaceOperandWith(0, NewLoopID); 8384 L->setLoopID(NewLoopID); 8385 } 8386 } 8387 8388 //===--------------------------------------------------------------------===// 8389 // EpilogueVectorizerMainLoop 8390 //===--------------------------------------------------------------------===// 8391 8392 /// This function is partially responsible for generating the control flow 8393 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 8394 BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { 8395 MDNode *OrigLoopID = OrigLoop->getLoopID(); 8396 Loop *Lp = createVectorLoopSkeleton(""); 8397 8398 // Generate the code to check the minimum iteration count of the vector 8399 // epilogue (see below). 8400 EPI.EpilogueIterationCountCheck = 8401 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, true); 8402 EPI.EpilogueIterationCountCheck->setName("iter.check"); 8403 8404 // Generate the code to check any assumptions that we've made for SCEV 8405 // expressions. 8406 EPI.SCEVSafetyCheck = emitSCEVChecks(Lp, LoopScalarPreHeader); 8407 8408 // Generate the code that checks at runtime if arrays overlap. We put the 8409 // checks into a separate block to make the more common case of few elements 8410 // faster. 8411 EPI.MemSafetyCheck = emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 8412 8413 // Generate the iteration count check for the main loop, *after* the check 8414 // for the epilogue loop, so that the path-length is shorter for the case 8415 // that goes directly through the vector epilogue. The longer-path length for 8416 // the main loop is compensated for, by the gain from vectorizing the larger 8417 // trip count. Note: the branch will get updated later on when we vectorize 8418 // the epilogue. 
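  // A rough sketch of the check order produced here (block names illustrative,
  // based on the names set below and in the second pass):
  //
  //   iter.check:                  too few iterations even for the epilogue VF?
  //       yes -> scalar.ph
  //   ...SCEV and memory runtime checks...
  //   vector.main.loop.iter.check: too few iterations for the main VF * UF?
  //       yes -> bypass (retargeted at vec.epilog.ph in the second pass)
  //       no  -> vector.ph (main vectorized loop)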
8419 EPI.MainLoopIterationCountCheck = 8420 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false); 8421 8422 // Generate the induction variable. 8423 OldInduction = Legal->getPrimaryInduction(); 8424 Type *IdxTy = Legal->getWidestInductionType(); 8425 Value *StartIdx = ConstantInt::get(IdxTy, 0); 8426 8427 IRBuilder<> B(&*Lp->getLoopPreheader()->getFirstInsertionPt()); 8428 Value *Step = getRuntimeVF(B, IdxTy, VF * UF); 8429 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 8430 EPI.VectorTripCount = CountRoundDown; 8431 Induction = 8432 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 8433 getDebugLocFromInstOrOperands(OldInduction)); 8434 8435 // Skip induction resume value creation here because they will be created in 8436 // the second pass. If we created them here, they wouldn't be used anyway, 8437 // because the vplan in the second pass still contains the inductions from the 8438 // original loop. 8439 8440 return completeLoopSkeleton(Lp, OrigLoopID); 8441 } 8442 8443 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { 8444 LLVM_DEBUG({ 8445 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n" 8446 << "Main Loop VF:" << EPI.MainLoopVF 8447 << ", Main Loop UF:" << EPI.MainLoopUF 8448 << ", Epilogue Loop VF:" << EPI.EpilogueVF 8449 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 8450 }); 8451 } 8452 8453 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() { 8454 DEBUG_WITH_TYPE(VerboseDebug, { 8455 dbgs() << "intermediate fn:\n" << *Induction->getFunction() << "\n"; 8456 }); 8457 } 8458 8459 BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck( 8460 Loop *L, BasicBlock *Bypass, bool ForEpilogue) { 8461 assert(L && "Expected valid Loop."); 8462 assert(Bypass && "Expected valid bypass basic block."); 8463 ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF; 8464 unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF; 8465 Value *Count = getOrCreateTripCount(L); 8466 // Reuse existing vector loop preheader for TC checks. 8467 // Note that new preheader block is generated for vector loop. 8468 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 8469 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 8470 8471 // Generate code to check if the loop's trip count is less than VF * UF of the 8472 // main vector loop. 8473 auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF : VF) ? 8474 ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 8475 8476 Value *CheckMinIters = Builder.CreateICmp( 8477 P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor), 8478 "min.iters.check"); 8479 8480 if (!ForEpilogue) 8481 TCCheckBlock->setName("vector.main.loop.iter.check"); 8482 8483 // Create new preheader for vector loop. 8484 LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), 8485 DT, LI, nullptr, "vector.ph"); 8486 8487 if (ForEpilogue) { 8488 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 8489 DT->getNode(Bypass)->getIDom()) && 8490 "TC check is expected to dominate Bypass"); 8491 8492 // Update dominator for Bypass & LoopExit. 8493 DT->changeImmediateDominator(Bypass, TCCheckBlock); 8494 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF)) 8495 // For loops with multiple exits, there's no edge from the middle block 8496 // to exit blocks (as the epilogue must run) and thus no need to update 8497 // the immediate dominator of the exit blocks. 
8498 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 8499 8500 LoopBypassBlocks.push_back(TCCheckBlock); 8501 8502 // Save the trip count so we don't have to regenerate it in the 8503 // vec.epilog.iter.check. This is safe to do because the trip count 8504 // generated here dominates the vector epilog iter check. 8505 EPI.TripCount = Count; 8506 } 8507 8508 ReplaceInstWithInst( 8509 TCCheckBlock->getTerminator(), 8510 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 8511 8512 return TCCheckBlock; 8513 } 8514 8515 //===--------------------------------------------------------------------===// 8516 // EpilogueVectorizerEpilogueLoop 8517 //===--------------------------------------------------------------------===// 8518 8519 /// This function is partially responsible for generating the control flow 8520 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 8521 BasicBlock * 8522 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { 8523 MDNode *OrigLoopID = OrigLoop->getLoopID(); 8524 Loop *Lp = createVectorLoopSkeleton("vec.epilog."); 8525 8526 // Now, compare the remaining count and if there aren't enough iterations to 8527 // execute the vectorized epilogue skip to the scalar part. 8528 BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader; 8529 VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check"); 8530 LoopVectorPreHeader = 8531 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 8532 LI, nullptr, "vec.epilog.ph"); 8533 emitMinimumVectorEpilogueIterCountCheck(Lp, LoopScalarPreHeader, 8534 VecEpilogueIterationCountCheck); 8535 8536 // Adjust the control flow taking the state info from the main loop 8537 // vectorization into account. 8538 assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck && 8539 "expected this to be saved from the previous pass."); 8540 EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith( 8541 VecEpilogueIterationCountCheck, LoopVectorPreHeader); 8542 8543 DT->changeImmediateDominator(LoopVectorPreHeader, 8544 EPI.MainLoopIterationCountCheck); 8545 8546 EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith( 8547 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8548 8549 if (EPI.SCEVSafetyCheck) 8550 EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith( 8551 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8552 if (EPI.MemSafetyCheck) 8553 EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith( 8554 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8555 8556 DT->changeImmediateDominator( 8557 VecEpilogueIterationCountCheck, 8558 VecEpilogueIterationCountCheck->getSinglePredecessor()); 8559 8560 DT->changeImmediateDominator(LoopScalarPreHeader, 8561 EPI.EpilogueIterationCountCheck); 8562 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF)) 8563 // If there is an epilogue which must run, there's no edge from the 8564 // middle block to exit blocks and thus no need to update the immediate 8565 // dominator of the exit blocks. 8566 DT->changeImmediateDominator(LoopExitBlock, 8567 EPI.EpilogueIterationCountCheck); 8568 8569 // Keep track of bypass blocks, as they feed start values to the induction 8570 // phis in the scalar loop preheader. 
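  // (Illustrative note: a "bypass" block here is any block that can branch
  // around the vector epilogue straight to the scalar preheader, e.g. a failed
  // SCEV or memory check. Each such predecessor must later supply an incoming
  // value to the induction resume phis created further down.)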
  if (EPI.SCEVSafetyCheck)
    LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck);
  if (EPI.MemSafetyCheck)
    LoopBypassBlocks.push_back(EPI.MemSafetyCheck);
  LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck);

  // Generate a resume induction for the vector epilogue and put it in the
  // vector epilogue preheader.
  Type *IdxTy = Legal->getWidestInductionType();
  PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val",
                                         LoopVectorPreHeader->getFirstNonPHI());
  EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
  EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
                           EPI.MainLoopIterationCountCheck);

  // Generate the induction variable.
  OldInduction = Legal->getPrimaryInduction();
  Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
  Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF);
  Value *StartIdx = EPResumeVal;
  Induction =
      createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
                              getDebugLocFromInstOrOperands(OldInduction));

  // Generate induction resume values. These variables save the new starting
  // indexes for the scalar loop. They are used to test if there are any tail
  // iterations left once the vector loop has completed.
  // Note that when the vectorized epilogue is skipped due to the iteration
  // count check, then the resume value for the induction variable comes from
  // the trip count of the main vector loop, hence passing the AdditionalBypass
  // argument.
  createInductionResumeValues(Lp, CountRoundDown,
                              {VecEpilogueIterationCountCheck,
                               EPI.VectorTripCount} /* AdditionalBypass */);

  AddRuntimeUnrollDisableMetaData(Lp);
  return completeLoopSkeleton(Lp, OrigLoopID);
}

BasicBlock *
EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
    Loop *L, BasicBlock *Bypass, BasicBlock *Insert) {

  assert(EPI.TripCount &&
         "Expected trip count to have been saved in the first pass.");
  assert(
      (!isa<Instruction>(EPI.TripCount) ||
       DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
      "saved trip count does not dominate insertion point.");
  Value *TC = EPI.TripCount;
  IRBuilder<> Builder(Insert->getTerminator());
  Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");

  // Generate code to check if the loop's trip count is less than VF * UF of
  // the vector epilogue loop.
  auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF) ?
8627 ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 8628 8629 Value *CheckMinIters = 8630 Builder.CreateICmp(P, Count, 8631 createStepForVF(Builder, Count->getType(), 8632 EPI.EpilogueVF, EPI.EpilogueUF), 8633 "min.epilog.iters.check"); 8634 8635 ReplaceInstWithInst( 8636 Insert->getTerminator(), 8637 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 8638 8639 LoopBypassBlocks.push_back(Insert); 8640 return Insert; 8641 } 8642 8643 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() { 8644 LLVM_DEBUG({ 8645 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n" 8646 << "Epilogue Loop VF:" << EPI.EpilogueVF 8647 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 8648 }); 8649 } 8650 8651 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() { 8652 DEBUG_WITH_TYPE(VerboseDebug, { 8653 dbgs() << "final fn:\n" << *Induction->getFunction() << "\n"; 8654 }); 8655 } 8656 8657 bool LoopVectorizationPlanner::getDecisionAndClampRange( 8658 const std::function<bool(ElementCount)> &Predicate, VFRange &Range) { 8659 assert(!Range.isEmpty() && "Trying to test an empty VF range."); 8660 bool PredicateAtRangeStart = Predicate(Range.Start); 8661 8662 for (ElementCount TmpVF = Range.Start * 2; 8663 ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2) 8664 if (Predicate(TmpVF) != PredicateAtRangeStart) { 8665 Range.End = TmpVF; 8666 break; 8667 } 8668 8669 return PredicateAtRangeStart; 8670 } 8671 8672 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, 8673 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range 8674 /// of VF's starting at a given VF and extending it as much as possible. Each 8675 /// vectorization decision can potentially shorten this sub-range during 8676 /// buildVPlan(). 8677 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF, 8678 ElementCount MaxVF) { 8679 auto MaxVFPlusOne = MaxVF.getWithIncrement(1); 8680 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { 8681 VFRange SubRange = {VF, MaxVFPlusOne}; 8682 VPlans.push_back(buildVPlan(SubRange)); 8683 VF = SubRange.End; 8684 } 8685 } 8686 8687 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, 8688 VPlanPtr &Plan) { 8689 assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); 8690 8691 // Look for cached value. 8692 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 8693 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); 8694 if (ECEntryIt != EdgeMaskCache.end()) 8695 return ECEntryIt->second; 8696 8697 VPValue *SrcMask = createBlockInMask(Src, Plan); 8698 8699 // The terminator has to be a branch inst! 8700 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 8701 assert(BI && "Unexpected terminator found"); 8702 8703 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) 8704 return EdgeMaskCache[Edge] = SrcMask; 8705 8706 // If source is an exiting block, we know the exit edge is dynamically dead 8707 // in the vector loop, and thus we don't need to restrict the mask. Avoid 8708 // adding uses of an otherwise potentially dead instruction. 8709 if (OrigLoop->isLoopExiting(Src)) 8710 return EdgeMaskCache[Edge] = SrcMask; 8711 8712 VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition()); 8713 assert(EdgeMask && "No Edge Mask found for condition"); 8714 8715 if (BI->getSuccessor(0) != Dst) 8716 EdgeMask = Builder.createNot(EdgeMask); 8717 8718 if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND. 
8719 // The condition is 'SrcMask && EdgeMask', which is equivalent to 8720 // 'select i1 SrcMask, i1 EdgeMask, i1 false'. 8721 // The select version does not introduce new UB if SrcMask is false and 8722 // EdgeMask is poison. Using 'and' here introduces undefined behavior. 8723 VPValue *False = Plan->getOrAddVPValue( 8724 ConstantInt::getFalse(BI->getCondition()->getType())); 8725 EdgeMask = Builder.createSelect(SrcMask, EdgeMask, False); 8726 } 8727 8728 return EdgeMaskCache[Edge] = EdgeMask; 8729 } 8730 8731 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { 8732 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 8733 8734 // Look for cached value. 8735 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); 8736 if (BCEntryIt != BlockMaskCache.end()) 8737 return BCEntryIt->second; 8738 8739 // All-one mask is modelled as no-mask following the convention for masked 8740 // load/store/gather/scatter. Initialize BlockMask to no-mask. 8741 VPValue *BlockMask = nullptr; 8742 8743 if (OrigLoop->getHeader() == BB) { 8744 if (!CM.blockNeedsPredication(BB)) 8745 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. 8746 8747 // Create the block in mask as the first non-phi instruction in the block. 8748 VPBuilder::InsertPointGuard Guard(Builder); 8749 auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi(); 8750 Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint); 8751 8752 // Introduce the early-exit compare IV <= BTC to form header block mask. 8753 // This is used instead of IV < TC because TC may wrap, unlike BTC. 8754 // Start by constructing the desired canonical IV. 8755 VPValue *IV = nullptr; 8756 if (Legal->getPrimaryInduction()) 8757 IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction()); 8758 else { 8759 auto *IVRecipe = new VPWidenCanonicalIVRecipe(); 8760 Builder.getInsertBlock()->insert(IVRecipe, NewInsertionPoint); 8761 IV = IVRecipe; 8762 } 8763 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); 8764 bool TailFolded = !CM.isScalarEpilogueAllowed(); 8765 8766 if (TailFolded && CM.TTI.emitGetActiveLaneMask()) { 8767 // While ActiveLaneMask is a binary op that consumes the loop tripcount 8768 // as a second argument, we only pass the IV here and extract the 8769 // tripcount from the transform state where codegen of the VP instructions 8770 // happen. 8771 BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV}); 8772 } else { 8773 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); 8774 } 8775 return BlockMaskCache[BB] = BlockMask; 8776 } 8777 8778 // This is the block mask. We OR all incoming edges. 8779 for (auto *Predecessor : predecessors(BB)) { 8780 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); 8781 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too. 8782 return BlockMaskCache[BB] = EdgeMask; 8783 8784 if (!BlockMask) { // BlockMask has its initialized nullptr value. 
8785 BlockMask = EdgeMask; 8786 continue; 8787 } 8788 8789 BlockMask = Builder.createOr(BlockMask, EdgeMask); 8790 } 8791 8792 return BlockMaskCache[BB] = BlockMask; 8793 } 8794 8795 VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, 8796 ArrayRef<VPValue *> Operands, 8797 VFRange &Range, 8798 VPlanPtr &Plan) { 8799 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 8800 "Must be called with either a load or store"); 8801 8802 auto willWiden = [&](ElementCount VF) -> bool { 8803 if (VF.isScalar()) 8804 return false; 8805 LoopVectorizationCostModel::InstWidening Decision = 8806 CM.getWideningDecision(I, VF); 8807 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 8808 "CM decision should be taken at this point."); 8809 if (Decision == LoopVectorizationCostModel::CM_Interleave) 8810 return true; 8811 if (CM.isScalarAfterVectorization(I, VF) || 8812 CM.isProfitableToScalarize(I, VF)) 8813 return false; 8814 return Decision != LoopVectorizationCostModel::CM_Scalarize; 8815 }; 8816 8817 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8818 return nullptr; 8819 8820 VPValue *Mask = nullptr; 8821 if (Legal->isMaskRequired(I)) 8822 Mask = createBlockInMask(I->getParent(), Plan); 8823 8824 // Determine if the pointer operand of the access is either consecutive or 8825 // reverse consecutive. 8826 LoopVectorizationCostModel::InstWidening Decision = 8827 CM.getWideningDecision(I, Range.Start); 8828 bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse; 8829 bool Consecutive = 8830 Reverse || Decision == LoopVectorizationCostModel::CM_Widen; 8831 8832 if (LoadInst *Load = dyn_cast<LoadInst>(I)) 8833 return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask, 8834 Consecutive, Reverse); 8835 8836 StoreInst *Store = cast<StoreInst>(I); 8837 return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0], 8838 Mask, Consecutive, Reverse); 8839 } 8840 8841 VPWidenIntOrFpInductionRecipe * 8842 VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi, 8843 ArrayRef<VPValue *> Operands) const { 8844 // Check if this is an integer or fp induction. If so, build the recipe that 8845 // produces its scalar and vector values. 8846 InductionDescriptor II = Legal->getInductionVars().lookup(Phi); 8847 if (II.getKind() == InductionDescriptor::IK_IntInduction || 8848 II.getKind() == InductionDescriptor::IK_FpInduction) { 8849 assert(II.getStartValue() == 8850 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); 8851 const SmallVectorImpl<Instruction *> &Casts = II.getCastInsts(); 8852 return new VPWidenIntOrFpInductionRecipe( 8853 Phi, Operands[0], Casts.empty() ? nullptr : Casts.front()); 8854 } 8855 8856 return nullptr; 8857 } 8858 8859 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate( 8860 TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range, 8861 VPlan &Plan) const { 8862 // Optimize the special case where the source is a constant integer 8863 // induction variable. Notice that we can only optimize the 'trunc' case 8864 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and 8865 // (c) other casts depend on pointer size. 8866 8867 // Determine whether \p K is a truncation based on an induction variable that 8868 // can be optimized. 
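// Illustrative example (hypothetical IR, not from this file): for
//   %iv = phi i64 [ 0, %ph ], [ %iv.next, %latch ]
//   %t  = trunc i64 %iv to i32
// the truncation can be folded into the widened induction itself, yielding a
// single i32 vector induction rather than a wide i64 induction followed by a
// vector truncate, whenever CM.isOptimizableIVTruncate agrees for every VF in
// the (possibly clamped) range handled below.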
8869 auto isOptimizableIVTruncate = 8870 [&](Instruction *K) -> std::function<bool(ElementCount)> { 8871 return [=](ElementCount VF) -> bool { 8872 return CM.isOptimizableIVTruncate(K, VF); 8873 }; 8874 }; 8875 8876 if (LoopVectorizationPlanner::getDecisionAndClampRange( 8877 isOptimizableIVTruncate(I), Range)) { 8878 8879 InductionDescriptor II = 8880 Legal->getInductionVars().lookup(cast<PHINode>(I->getOperand(0))); 8881 VPValue *Start = Plan.getOrAddVPValue(II.getStartValue()); 8882 return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)), 8883 Start, nullptr, I); 8884 } 8885 return nullptr; 8886 } 8887 8888 VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi, 8889 ArrayRef<VPValue *> Operands, 8890 VPlanPtr &Plan) { 8891 // If all incoming values are equal, the incoming VPValue can be used directly 8892 // instead of creating a new VPBlendRecipe. 8893 VPValue *FirstIncoming = Operands[0]; 8894 if (all_of(Operands, [FirstIncoming](const VPValue *Inc) { 8895 return FirstIncoming == Inc; 8896 })) { 8897 return Operands[0]; 8898 } 8899 8900 // We know that all PHIs in non-header blocks are converted into selects, so 8901 // we don't have to worry about the insertion order and we can just use the 8902 // builder. At this point we generate the predication tree. There may be 8903 // duplications since this is a simple recursive scan, but future 8904 // optimizations will clean it up. 8905 SmallVector<VPValue *, 2> OperandsWithMask; 8906 unsigned NumIncoming = Phi->getNumIncomingValues(); 8907 8908 for (unsigned In = 0; In < NumIncoming; In++) { 8909 VPValue *EdgeMask = 8910 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan); 8911 assert((EdgeMask || NumIncoming == 1) && 8912 "Multiple predecessors with one having a full mask"); 8913 OperandsWithMask.push_back(Operands[In]); 8914 if (EdgeMask) 8915 OperandsWithMask.push_back(EdgeMask); 8916 } 8917 return toVPRecipeResult(new VPBlendRecipe(Phi, OperandsWithMask)); 8918 } 8919 8920 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, 8921 ArrayRef<VPValue *> Operands, 8922 VFRange &Range) const { 8923 8924 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8925 [this, CI](ElementCount VF) { return CM.isScalarWithPredication(CI); }, 8926 Range); 8927 8928 if (IsPredicated) 8929 return nullptr; 8930 8931 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8932 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || 8933 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect || 8934 ID == Intrinsic::pseudoprobe || 8935 ID == Intrinsic::experimental_noalias_scope_decl)) 8936 return nullptr; 8937 8938 auto willWiden = [&](ElementCount VF) -> bool { 8939 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8940 // The following case may be scalarized depending on the VF. 8941 // The flag shows whether we use Intrinsic or a usual Call for vectorized 8942 // version of the instruction. 8943 // Is it beneficial to perform intrinsic call compared to lib call? 8944 bool NeedToScalarize = false; 8945 InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); 8946 InstructionCost IntrinsicCost = ID ? 
CM.getVectorIntrinsicCost(CI, VF) : 0; 8947 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 8948 return UseVectorIntrinsic || !NeedToScalarize; 8949 }; 8950 8951 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8952 return nullptr; 8953 8954 ArrayRef<VPValue *> Ops = Operands.take_front(CI->arg_size()); 8955 return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end())); 8956 } 8957 8958 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { 8959 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) && 8960 !isa<StoreInst>(I) && "Instruction should have been handled earlier"); 8961 // Instruction should be widened, unless it is scalar after vectorization, 8962 // scalarization is profitable or it is predicated. 8963 auto WillScalarize = [this, I](ElementCount VF) -> bool { 8964 return CM.isScalarAfterVectorization(I, VF) || 8965 CM.isProfitableToScalarize(I, VF) || CM.isScalarWithPredication(I); 8966 }; 8967 return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize, 8968 Range); 8969 } 8970 8971 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, 8972 ArrayRef<VPValue *> Operands) const { 8973 auto IsVectorizableOpcode = [](unsigned Opcode) { 8974 switch (Opcode) { 8975 case Instruction::Add: 8976 case Instruction::And: 8977 case Instruction::AShr: 8978 case Instruction::BitCast: 8979 case Instruction::FAdd: 8980 case Instruction::FCmp: 8981 case Instruction::FDiv: 8982 case Instruction::FMul: 8983 case Instruction::FNeg: 8984 case Instruction::FPExt: 8985 case Instruction::FPToSI: 8986 case Instruction::FPToUI: 8987 case Instruction::FPTrunc: 8988 case Instruction::FRem: 8989 case Instruction::FSub: 8990 case Instruction::ICmp: 8991 case Instruction::IntToPtr: 8992 case Instruction::LShr: 8993 case Instruction::Mul: 8994 case Instruction::Or: 8995 case Instruction::PtrToInt: 8996 case Instruction::SDiv: 8997 case Instruction::Select: 8998 case Instruction::SExt: 8999 case Instruction::Shl: 9000 case Instruction::SIToFP: 9001 case Instruction::SRem: 9002 case Instruction::Sub: 9003 case Instruction::Trunc: 9004 case Instruction::UDiv: 9005 case Instruction::UIToFP: 9006 case Instruction::URem: 9007 case Instruction::Xor: 9008 case Instruction::ZExt: 9009 return true; 9010 } 9011 return false; 9012 }; 9013 9014 if (!IsVectorizableOpcode(I->getOpcode())) 9015 return nullptr; 9016 9017 // Success: widen this instruction. 9018 return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end())); 9019 } 9020 9021 void VPRecipeBuilder::fixHeaderPhis() { 9022 BasicBlock *OrigLatch = OrigLoop->getLoopLatch(); 9023 for (VPWidenPHIRecipe *R : PhisToFix) { 9024 auto *PN = cast<PHINode>(R->getUnderlyingValue()); 9025 VPRecipeBase *IncR = 9026 getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch))); 9027 R->addOperand(IncR->getVPSingleValue()); 9028 } 9029 } 9030 9031 VPBasicBlock *VPRecipeBuilder::handleReplication( 9032 Instruction *I, VFRange &Range, VPBasicBlock *VPBB, 9033 VPlanPtr &Plan) { 9034 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( 9035 [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); }, 9036 Range); 9037 9038 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 9039 [&](ElementCount VF) { return CM.isPredicatedInst(I); }, Range); 9040 9041 // Even if the instruction is not marked as uniform, there are certain 9042 // intrinsic calls that can be effectively treated as such, so we check for 9043 // them here. 
Conservatively, we only do this for scalable vectors, since 9044 // for fixed-width VFs we can always fall back on full scalarization. 9045 if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) { 9046 switch (cast<IntrinsicInst>(I)->getIntrinsicID()) { 9047 case Intrinsic::assume: 9048 case Intrinsic::lifetime_start: 9049 case Intrinsic::lifetime_end: 9050 // For scalable vectors if one of the operands is variant then we still 9051 // want to mark as uniform, which will generate one instruction for just 9052 // the first lane of the vector. We can't scalarize the call in the same 9053 // way as for fixed-width vectors because we don't know how many lanes 9054 // there are. 9055 // 9056 // The reasons for doing it this way for scalable vectors are: 9057 // 1. For the assume intrinsic generating the instruction for the first 9058 // lane is still be better than not generating any at all. For 9059 // example, the input may be a splat across all lanes. 9060 // 2. For the lifetime start/end intrinsics the pointer operand only 9061 // does anything useful when the input comes from a stack object, 9062 // which suggests it should always be uniform. For non-stack objects 9063 // the effect is to poison the object, which still allows us to 9064 // remove the call. 9065 IsUniform = true; 9066 break; 9067 default: 9068 break; 9069 } 9070 } 9071 9072 auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()), 9073 IsUniform, IsPredicated); 9074 setRecipe(I, Recipe); 9075 Plan->addVPValue(I, Recipe); 9076 9077 // Find if I uses a predicated instruction. If so, it will use its scalar 9078 // value. Avoid hoisting the insert-element which packs the scalar value into 9079 // a vector value, as that happens iff all users use the vector value. 9080 for (VPValue *Op : Recipe->operands()) { 9081 auto *PredR = dyn_cast_or_null<VPPredInstPHIRecipe>(Op->getDef()); 9082 if (!PredR) 9083 continue; 9084 auto *RepR = 9085 cast_or_null<VPReplicateRecipe>(PredR->getOperand(0)->getDef()); 9086 assert(RepR->isPredicated() && 9087 "expected Replicate recipe to be predicated"); 9088 RepR->setAlsoPack(false); 9089 } 9090 9091 // Finalize the recipe for Instr, first if it is not predicated. 9092 if (!IsPredicated) { 9093 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n"); 9094 VPBB->appendRecipe(Recipe); 9095 return VPBB; 9096 } 9097 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); 9098 assert(VPBB->getSuccessors().empty() && 9099 "VPBB has successors when handling predicated replication."); 9100 // Record predicated instructions for above packing optimizations. 9101 VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan); 9102 VPBlockUtils::insertBlockAfter(Region, VPBB); 9103 auto *RegSucc = new VPBasicBlock(); 9104 VPBlockUtils::insertBlockAfter(RegSucc, Region); 9105 return RegSucc; 9106 } 9107 9108 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr, 9109 VPRecipeBase *PredRecipe, 9110 VPlanPtr &Plan) { 9111 // Instructions marked for predication are replicated and placed under an 9112 // if-then construct to prevent side-effects. 9113 9114 // Generate recipes to compute the block mask for this region. 9115 VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan); 9116 9117 // Build the triangular if-then region. 
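  // The triangle created below has roughly this shape (names derive from the
  // instruction's opcode, e.g. "pred.store", "pred.udiv"):
  //
  //   pred.<op>.entry:      BranchOnMask(BlockInMask)
  //    |          \
  //    |       pred.<op>.if: the replicated, predicated recipe
  //    |          /
  //   pred.<op>.continue:   optional VPPredInstPHI merging the result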
9118 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str(); 9119 assert(Instr->getParent() && "Predicated instruction not in any basic block"); 9120 auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask); 9121 auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); 9122 auto *PHIRecipe = Instr->getType()->isVoidTy() 9123 ? nullptr 9124 : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr)); 9125 if (PHIRecipe) { 9126 Plan->removeVPValueFor(Instr); 9127 Plan->addVPValue(Instr, PHIRecipe); 9128 } 9129 auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); 9130 auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); 9131 VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true); 9132 9133 // Note: first set Entry as region entry and then connect successors starting 9134 // from it in order, to propagate the "parent" of each VPBasicBlock. 9135 VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry); 9136 VPBlockUtils::connectBlocks(Pred, Exit); 9137 9138 return Region; 9139 } 9140 9141 VPRecipeOrVPValueTy 9142 VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, 9143 ArrayRef<VPValue *> Operands, 9144 VFRange &Range, VPlanPtr &Plan) { 9145 // First, check for specific widening recipes that deal with calls, memory 9146 // operations, inductions and Phi nodes. 9147 if (auto *CI = dyn_cast<CallInst>(Instr)) 9148 return toVPRecipeResult(tryToWidenCall(CI, Operands, Range)); 9149 9150 if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr)) 9151 return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan)); 9152 9153 VPRecipeBase *Recipe; 9154 if (auto Phi = dyn_cast<PHINode>(Instr)) { 9155 if (Phi->getParent() != OrigLoop->getHeader()) 9156 return tryToBlend(Phi, Operands, Plan); 9157 if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands))) 9158 return toVPRecipeResult(Recipe); 9159 9160 VPWidenPHIRecipe *PhiRecipe = nullptr; 9161 if (Legal->isReductionVariable(Phi) || Legal->isFirstOrderRecurrence(Phi)) { 9162 VPValue *StartV = Operands[0]; 9163 if (Legal->isReductionVariable(Phi)) { 9164 RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi]; 9165 assert(RdxDesc.getRecurrenceStartValue() == 9166 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); 9167 PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV, 9168 CM.isInLoopReduction(Phi), 9169 CM.useOrderedReductions(RdxDesc)); 9170 } else { 9171 PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV); 9172 } 9173 9174 // Record the incoming value from the backedge, so we can add the incoming 9175 // value from the backedge after all recipes have been created. 9176 recordRecipeOf(cast<Instruction>( 9177 Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch()))); 9178 PhisToFix.push_back(PhiRecipe); 9179 } else { 9180 // TODO: record start and backedge value for remaining pointer induction 9181 // phis. 
      assert(Phi->getType()->isPointerTy() &&
             "only pointer phis should be handled here");
      PhiRecipe = new VPWidenPHIRecipe(Phi);
    }

    return toVPRecipeResult(PhiRecipe);
  }

  if (isa<TruncInst>(Instr) &&
      (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Operands,
                                               Range, *Plan)))
    return toVPRecipeResult(Recipe);

  if (!shouldWiden(Instr, Range))
    return nullptr;

  if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
    return toVPRecipeResult(new VPWidenGEPRecipe(
        GEP, make_range(Operands.begin(), Operands.end()), OrigLoop));

  if (auto *SI = dyn_cast<SelectInst>(Instr)) {
    bool InvariantCond =
        PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
    return toVPRecipeResult(new VPWidenSelectRecipe(
        *SI, make_range(Operands.begin(), Operands.end()), InvariantCond));
  }

  return toVPRecipeResult(tryToWiden(Instr, Operands));
}

void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
                                                        ElementCount MaxVF) {
  assert(OrigLoop->isInnermost() && "Inner loop expected.");

  // Collect instructions from the original loop that will become trivially
  // dead in the vectorized loop. We don't need to vectorize these
  // instructions. For example, original induction update instructions can
  // become dead because we separately emit induction "steps" when generating
  // code for the new loop. Similarly, we create a new latch condition when
  // setting up the structure of the new loop, so the old one can become dead.
  SmallPtrSet<Instruction *, 4> DeadInstructions;
  collectTriviallyDeadInstructions(DeadInstructions);

  // Add assume instructions we need to drop to DeadInstructions, to prevent
  // them from being added to the VPlan.
  // TODO: We only need to drop assumes in blocks that get flattened. If the
  // control flow is preserved, we should keep them.
  auto &ConditionalAssumes = Legal->getConditionalAssumes();
  DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());

  MapVector<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
  // Dead instructions do not need sinking. Remove them from SinkAfter.
  for (Instruction *I : DeadInstructions)
    SinkAfter.erase(I);

  // Cannot sink instructions after dead instructions (there won't be any
  // recipes for them). Instead, find the first non-dead previous instruction.
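  // Illustrative example (hypothetical IR): if the recorded sink-after target
  // is an induction update such as
  //   %iv.next = add nuw nsw i64 %iv, 1   ; trivially dead in the vector loop
  // there will be no recipe to sink after, so the loop below walks backwards
  // from it to the closest preceding instruction that does get a recipe and
  // retargets the entry there.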
  for (auto &P : Legal->getSinkAfter()) {
    Instruction *SinkTarget = P.second;
    Instruction *FirstInst = &*SinkTarget->getParent()->begin();
    (void)FirstInst;
    while (DeadInstructions.contains(SinkTarget)) {
      assert(
          SinkTarget != FirstInst &&
          "Must find a live instruction (at least the one feeding the "
          "first-order recurrence PHI) before reaching beginning of the block");
      SinkTarget = SinkTarget->getPrevNode();
      assert(SinkTarget != P.first &&
             "sink source equals target, no sinking required");
    }
    P.second = SinkTarget;
  }

  auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
  for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
    VFRange SubRange = {VF, MaxVFPlusOne};
    VPlans.push_back(
        buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter));
    VF = SubRange.End;
  }
}

VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
    VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions,
    const MapVector<Instruction *, Instruction *> &SinkAfter) {

  SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;

  VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);

  // ---------------------------------------------------------------------------
  // Pre-construction: record ingredients whose recipes we'll need to further
  // process after constructing the initial VPlan.
  // ---------------------------------------------------------------------------

  // Mark instructions we'll need to sink later and their targets as
  // ingredients whose recipe we'll need to record.
  for (auto &Entry : SinkAfter) {
    RecipeBuilder.recordRecipeOf(Entry.first);
    RecipeBuilder.recordRecipeOf(Entry.second);
  }
  for (auto &Reduction : CM.getInLoopReductionChains()) {
    PHINode *Phi = Reduction.first;
    RecurKind Kind = Legal->getReductionVars()[Phi].getRecurrenceKind();
    const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;

    RecipeBuilder.recordRecipeOf(Phi);
    for (auto &R : ReductionOperations) {
      RecipeBuilder.recordRecipeOf(R);
      // For min/max reductions, where we have a pair of icmp/select, we also
      // need to record the ICmp recipe, so it can be removed later.
      assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
             "Only min/max recurrences allowed for inloop reductions");
      if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
        RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0)));
    }
  }

  // For each interleave group which is relevant for this (possibly trimmed)
  // Range, add it to the set of groups to be later applied to the VPlan and
  // add placeholders for its members' Recipes which we'll be replacing with a
  // single VPInterleaveRecipe.
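  // Illustrative example in the spirit of the interleaved-access work cited in
  // the file header (identifiers hypothetical):
  //   for (i = 0; i < N; ++i) {
  //     sum += A[2 * i];       // even elements
  //     sum += A[2 * i + 1];   // odd elements
  //   }
  // The two strided loads form a single interleave group with factor 2; their
  // individual widening recipes are later replaced by one VPInterleaveRecipe
  // that emits a single wide load plus shuffles to separate even and odd
  // lanes.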
9304 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) { 9305 auto applyIG = [IG, this](ElementCount VF) -> bool { 9306 return (VF.isVector() && // Query is illegal for VF == 1 9307 CM.getWideningDecision(IG->getInsertPos(), VF) == 9308 LoopVectorizationCostModel::CM_Interleave); 9309 }; 9310 if (!getDecisionAndClampRange(applyIG, Range)) 9311 continue; 9312 InterleaveGroups.insert(IG); 9313 for (unsigned i = 0; i < IG->getFactor(); i++) 9314 if (Instruction *Member = IG->getMember(i)) 9315 RecipeBuilder.recordRecipeOf(Member); 9316 }; 9317 9318 // --------------------------------------------------------------------------- 9319 // Build initial VPlan: Scan the body of the loop in a topological order to 9320 // visit each basic block after having visited its predecessor basic blocks. 9321 // --------------------------------------------------------------------------- 9322 9323 // Create a dummy pre-entry VPBasicBlock to start building the VPlan. 9324 auto Plan = std::make_unique<VPlan>(); 9325 9326 // Scan the body of the loop in a topological order to visit each basic block 9327 // after having visited its predecessor basic blocks. 9328 LoopBlocksDFS DFS(OrigLoop); 9329 DFS.perform(LI); 9330 9331 VPBasicBlock *VPBB = nullptr; 9332 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 9333 // Relevant instructions from basic block BB will be grouped into VPRecipe 9334 // ingredients and fill a new VPBasicBlock. 9335 unsigned VPBBsForBB = 0; 9336 auto *FirstVPBBForBB = new VPBasicBlock(BB->getName()); 9337 if (VPBB) 9338 VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB); 9339 else 9340 Plan->setEntry(FirstVPBBForBB); 9341 VPBB = FirstVPBBForBB; 9342 Builder.setInsertPoint(VPBB); 9343 9344 // Introduce each ingredient into VPlan. 9345 // TODO: Model and preserve debug instrinsics in VPlan. 9346 for (Instruction &I : BB->instructionsWithoutDebug()) { 9347 Instruction *Instr = &I; 9348 9349 // First filter out irrelevant instructions, to ensure no recipes are 9350 // built for them. 9351 if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr)) 9352 continue; 9353 9354 SmallVector<VPValue *, 4> Operands; 9355 auto *Phi = dyn_cast<PHINode>(Instr); 9356 if (Phi && Phi->getParent() == OrigLoop->getHeader()) { 9357 Operands.push_back(Plan->getOrAddVPValue( 9358 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()))); 9359 } else { 9360 auto OpRange = Plan->mapToVPValues(Instr->operands()); 9361 Operands = {OpRange.begin(), OpRange.end()}; 9362 } 9363 if (auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe( 9364 Instr, Operands, Range, Plan)) { 9365 // If Instr can be simplified to an existing VPValue, use it. 9366 if (RecipeOrValue.is<VPValue *>()) { 9367 auto *VPV = RecipeOrValue.get<VPValue *>(); 9368 Plan->addVPValue(Instr, VPV); 9369 // If the re-used value is a recipe, register the recipe for the 9370 // instruction, in case the recipe for Instr needs to be recorded. 9371 if (auto *R = dyn_cast_or_null<VPRecipeBase>(VPV->getDef())) 9372 RecipeBuilder.setRecipe(Instr, R); 9373 continue; 9374 } 9375 // Otherwise, add the new recipe. 9376 VPRecipeBase *Recipe = RecipeOrValue.get<VPRecipeBase *>(); 9377 for (auto *Def : Recipe->definedValues()) { 9378 auto *UV = Def->getUnderlyingValue(); 9379 Plan->addVPValue(UV, Def); 9380 } 9381 9382 RecipeBuilder.setRecipe(Instr, Recipe); 9383 if (isa<VPWidenIntOrFpInductionRecipe>(Recipe)) { 9384 // Make sure induction recipes are all kept in the header block. 
9385 // VPWidenIntOrFpInductionRecipe may be generated when reaching a 9386 // Trunc of an induction Phi, where Trunc may not be in the header. 9387 auto *Header = Plan->getEntry()->getEntryBasicBlock(); 9388 Header->insert(Recipe, Header->getFirstNonPhi()); 9389 } else 9390 VPBB->appendRecipe(Recipe); 9391 continue; 9392 } 9393 9394 // Otherwise, if all widening options failed, Instruction is to be 9395 // replicated. This may create a successor for VPBB. 9396 VPBasicBlock *NextVPBB = 9397 RecipeBuilder.handleReplication(Instr, Range, VPBB, Plan); 9398 if (NextVPBB != VPBB) { 9399 VPBB = NextVPBB; 9400 VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++) 9401 : ""); 9402 } 9403 } 9404 } 9405 9406 assert(isa<VPBasicBlock>(Plan->getEntry()) && 9407 !Plan->getEntry()->getEntryBasicBlock()->empty() && 9408 "entry block must be set to a non-empty VPBasicBlock"); 9409 RecipeBuilder.fixHeaderPhis(); 9410 9411 // --------------------------------------------------------------------------- 9412 // Transform initial VPlan: Apply previously taken decisions, in order, to 9413 // bring the VPlan to its final state. 9414 // --------------------------------------------------------------------------- 9415 9416 // Apply Sink-After legal constraints. 9417 auto GetReplicateRegion = [](VPRecipeBase *R) -> VPRegionBlock * { 9418 auto *Region = dyn_cast_or_null<VPRegionBlock>(R->getParent()->getParent()); 9419 if (Region && Region->isReplicator()) { 9420 assert(Region->getNumSuccessors() == 1 && 9421 Region->getNumPredecessors() == 1 && "Expected SESE region!"); 9422 assert(R->getParent()->size() == 1 && 9423 "A recipe in an original replicator region must be the only " 9424 "recipe in its block"); 9425 return Region; 9426 } 9427 return nullptr; 9428 }; 9429 for (auto &Entry : SinkAfter) { 9430 VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first); 9431 VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second); 9432 9433 auto *TargetRegion = GetReplicateRegion(Target); 9434 auto *SinkRegion = GetReplicateRegion(Sink); 9435 if (!SinkRegion) { 9436 // If the sink source is not a replicate region, sink the recipe directly. 9437 if (TargetRegion) { 9438 // The target is in a replication region, make sure to move Sink to 9439 // the block after it, not into the replication region itself. 9440 VPBasicBlock *NextBlock = 9441 cast<VPBasicBlock>(TargetRegion->getSuccessors().front()); 9442 Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi()); 9443 } else 9444 Sink->moveAfter(Target); 9445 continue; 9446 } 9447 9448 // The sink source is in a replicate region. Unhook the region from the CFG. 9449 auto *SinkPred = SinkRegion->getSinglePredecessor(); 9450 auto *SinkSucc = SinkRegion->getSingleSuccessor(); 9451 VPBlockUtils::disconnectBlocks(SinkPred, SinkRegion); 9452 VPBlockUtils::disconnectBlocks(SinkRegion, SinkSucc); 9453 VPBlockUtils::connectBlocks(SinkPred, SinkSucc); 9454 9455 if (TargetRegion) { 9456 // The target recipe is also in a replicate region, move the sink region 9457 // after the target region. 9458 auto *TargetSucc = TargetRegion->getSingleSuccessor(); 9459 VPBlockUtils::disconnectBlocks(TargetRegion, TargetSucc); 9460 VPBlockUtils::connectBlocks(TargetRegion, SinkRegion); 9461 VPBlockUtils::connectBlocks(SinkRegion, TargetSucc); 9462 } else { 9463 // The sink source is in a replicate region, we need to move the whole 9464 // replicate region, which should only contain a single recipe in the 9465 // main block. 
9466 auto *SplitBlock = 9467 Target->getParent()->splitAt(std::next(Target->getIterator())); 9468 9469 auto *SplitPred = SplitBlock->getSinglePredecessor(); 9470 9471 VPBlockUtils::disconnectBlocks(SplitPred, SplitBlock); 9472 VPBlockUtils::connectBlocks(SplitPred, SinkRegion); 9473 VPBlockUtils::connectBlocks(SinkRegion, SplitBlock); 9474 if (VPBB == SplitPred) 9475 VPBB = SplitBlock; 9476 } 9477 } 9478 9479 // Adjust the recipes for any inloop reductions. 9480 adjustRecipesForReductions(VPBB, Plan, RecipeBuilder, Range.Start); 9481 9482 // Introduce a recipe to combine the incoming and previous values of a 9483 // first-order recurrence. 9484 for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) { 9485 auto *RecurPhi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R); 9486 if (!RecurPhi) 9487 continue; 9488 9489 VPRecipeBase *PrevRecipe = RecurPhi->getBackedgeRecipe(); 9490 VPBasicBlock *InsertBlock = PrevRecipe->getParent(); 9491 auto *Region = GetReplicateRegion(PrevRecipe); 9492 if (Region) 9493 InsertBlock = cast<VPBasicBlock>(Region->getSingleSuccessor()); 9494 if (Region || PrevRecipe->isPhi()) 9495 Builder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi()); 9496 else 9497 Builder.setInsertPoint(InsertBlock, std::next(PrevRecipe->getIterator())); 9498 9499 auto *RecurSplice = cast<VPInstruction>( 9500 Builder.createNaryOp(VPInstruction::FirstOrderRecurrenceSplice, 9501 {RecurPhi, RecurPhi->getBackedgeValue()})); 9502 9503 RecurPhi->replaceAllUsesWith(RecurSplice); 9504 // Set the first operand of RecurSplice to RecurPhi again, after replacing 9505 // all users. 9506 RecurSplice->setOperand(0, RecurPhi); 9507 } 9508 9509 // Interleave memory: for each Interleave Group we marked earlier as relevant 9510 // for this VPlan, replace the Recipes widening its memory instructions with a 9511 // single VPInterleaveRecipe at its insertion point. 9512 for (auto IG : InterleaveGroups) { 9513 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( 9514 RecipeBuilder.getRecipe(IG->getInsertPos())); 9515 SmallVector<VPValue *, 4> StoredValues; 9516 for (unsigned i = 0; i < IG->getFactor(); ++i) 9517 if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) { 9518 auto *StoreR = 9519 cast<VPWidenMemoryInstructionRecipe>(RecipeBuilder.getRecipe(SI)); 9520 StoredValues.push_back(StoreR->getStoredValue()); 9521 } 9522 9523 auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues, 9524 Recipe->getMask()); 9525 VPIG->insertBefore(Recipe); 9526 unsigned J = 0; 9527 for (unsigned i = 0; i < IG->getFactor(); ++i) 9528 if (Instruction *Member = IG->getMember(i)) { 9529 if (!Member->getType()->isVoidTy()) { 9530 VPValue *OriginalV = Plan->getVPValue(Member); 9531 Plan->removeVPValueFor(Member); 9532 Plan->addVPValue(Member, VPIG->getVPValue(J)); 9533 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J)); 9534 J++; 9535 } 9536 RecipeBuilder.getRecipe(Member)->eraseFromParent(); 9537 } 9538 } 9539 9540 // From this point onwards, VPlan-to-VPlan transformations may change the plan 9541 // in ways that accessing values using original IR values is incorrect. 
9542 Plan->disableValue2VPValue();
9543
9544 VPlanTransforms::sinkScalarOperands(*Plan);
9545 VPlanTransforms::mergeReplicateRegions(*Plan);
9546
9547 std::string PlanName;
9548 raw_string_ostream RSO(PlanName);
9549 ElementCount VF = Range.Start;
9550 Plan->addVF(VF);
9551 RSO << "Initial VPlan for VF={" << VF;
9552 for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) {
9553 Plan->addVF(VF);
9554 RSO << "," << VF;
9555 }
9556 RSO << "},UF>=1";
9557 RSO.flush();
9558 Plan->setName(PlanName);
9559
9560 assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid");
9561 return Plan;
9562 }
9563
9564 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
9565 // Outer loop handling: They may require CFG and instruction level
9566 // transformations before even evaluating whether vectorization is profitable.
9567 // Since we cannot modify the incoming IR, we need to build VPlan upfront in
9568 // the vectorization pipeline.
9569 assert(!OrigLoop->isInnermost());
9570 assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
9571
9572 // Create new empty VPlan
9573 auto Plan = std::make_unique<VPlan>();
9574
9575 // Build hierarchical CFG
9576 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
9577 HCFGBuilder.buildHierarchicalCFG();
9578
9579 for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End);
9580 VF *= 2)
9581 Plan->addVF(VF);
9582
9583 if (EnableVPlanPredication) {
9584 VPlanPredicator VPP(*Plan);
9585 VPP.predicate();
9586
9587 // Avoid running transformation to recipes until masked code generation in
9588 // VPlan-native path is in place.
9589 return Plan;
9590 }
9591
9592 SmallPtrSet<Instruction *, 1> DeadInstructions;
9593 VPlanTransforms::VPInstructionsToVPRecipes(OrigLoop, Plan,
9594 Legal->getInductionVars(),
9595 DeadInstructions, *PSE.getSE());
9596 return Plan;
9597 }
9598
9599 // Adjust the recipes for reductions. For in-loop reductions the chain of
9600 // instructions leading from the loop exit instr to the phi needs to be converted
9601 // to reductions, with one operand being vector and the other being the scalar
9602 // reduction chain. For other reductions, a select is introduced between the phi
9603 // and live-out recipes when folding the tail.
9604 void LoopVectorizationPlanner::adjustRecipesForReductions(
9605 VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder,
9606 ElementCount MinVF) {
9607 for (auto &Reduction : CM.getInLoopReductionChains()) {
9608 PHINode *Phi = Reduction.first;
9609 RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
9610 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
9611
9612 if (MinVF.isScalar() && !CM.useOrderedReductions(RdxDesc))
9613 continue;
9614
9615 // ReductionOperations are ordered top-down from the phi's use to the
9616 // LoopExitValue. We keep track of the previous item (the Chain) to tell
9617 // which of the two operands will remain scalar and which will be reduced.
9618 // For minmax the chain will be the select instructions.
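// For illustration (hypothetical IR), an integer sum reduction such as
//   %sum = phi i32 [ 0, %preheader ], [ %sum.next, %loop ]
//   %sum.next = add i32 %sum, %val
// produces the chain Phi -> add: the add keeps %sum as its scalar chain
// operand, while %val becomes the vector operand of the VPReductionRecipe
// created below.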
9619 Instruction *Chain = Phi; 9620 for (Instruction *R : ReductionOperations) { 9621 VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R); 9622 RecurKind Kind = RdxDesc.getRecurrenceKind(); 9623 9624 VPValue *ChainOp = Plan->getVPValue(Chain); 9625 unsigned FirstOpId; 9626 assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) && 9627 "Only min/max recurrences allowed for inloop reductions"); 9628 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9629 assert(isa<VPWidenSelectRecipe>(WidenRecipe) && 9630 "Expected to replace a VPWidenSelectSC"); 9631 FirstOpId = 1; 9632 } else { 9633 assert((MinVF.isScalar() || isa<VPWidenRecipe>(WidenRecipe)) && 9634 "Expected to replace a VPWidenSC"); 9635 FirstOpId = 0; 9636 } 9637 unsigned VecOpId = 9638 R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId; 9639 VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId)); 9640 9641 auto *CondOp = CM.foldTailByMasking() 9642 ? RecipeBuilder.createBlockInMask(R->getParent(), Plan) 9643 : nullptr; 9644 VPReductionRecipe *RedRecipe = new VPReductionRecipe( 9645 &RdxDesc, R, ChainOp, VecOp, CondOp, TTI); 9646 WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe); 9647 Plan->removeVPValueFor(R); 9648 Plan->addVPValue(R, RedRecipe); 9649 WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator()); 9650 WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe); 9651 WidenRecipe->eraseFromParent(); 9652 9653 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9654 VPRecipeBase *CompareRecipe = 9655 RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0))); 9656 assert(isa<VPWidenRecipe>(CompareRecipe) && 9657 "Expected to replace a VPWidenSC"); 9658 assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 && 9659 "Expected no remaining users"); 9660 CompareRecipe->eraseFromParent(); 9661 } 9662 Chain = R; 9663 } 9664 } 9665 9666 // If tail is folded by masking, introduce selects between the phi 9667 // and the live-out instruction of each reduction, at the end of the latch. 
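// Conceptually (illustrative only), each such select has the form
//   %rdx.select = select <header mask>, %rdx.backedge, %rdx.phi
// so lanes disabled by the fold-tail mask keep the phi's value instead of a
// partially updated reduction value.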
9668 if (CM.foldTailByMasking()) { 9669 for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) { 9670 VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R); 9671 if (!PhiR || PhiR->isInLoop()) 9672 continue; 9673 Builder.setInsertPoint(LatchVPBB); 9674 VPValue *Cond = 9675 RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan); 9676 VPValue *Red = PhiR->getBackedgeValue(); 9677 Builder.createNaryOp(Instruction::Select, {Cond, Red, PhiR}); 9678 } 9679 } 9680 } 9681 9682 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 9683 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, 9684 VPSlotTracker &SlotTracker) const { 9685 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; 9686 IG->getInsertPos()->printAsOperand(O, false); 9687 O << ", "; 9688 getAddr()->printAsOperand(O, SlotTracker); 9689 VPValue *Mask = getMask(); 9690 if (Mask) { 9691 O << ", "; 9692 Mask->printAsOperand(O, SlotTracker); 9693 } 9694 9695 unsigned OpIdx = 0; 9696 for (unsigned i = 0; i < IG->getFactor(); ++i) { 9697 if (!IG->getMember(i)) 9698 continue; 9699 if (getNumStoreOperands() > 0) { 9700 O << "\n" << Indent << " store "; 9701 getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker); 9702 O << " to index " << i; 9703 } else { 9704 O << "\n" << Indent << " "; 9705 getVPValue(OpIdx)->printAsOperand(O, SlotTracker); 9706 O << " = load from index " << i; 9707 } 9708 ++OpIdx; 9709 } 9710 } 9711 #endif 9712 9713 void VPWidenCallRecipe::execute(VPTransformState &State) { 9714 State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this, 9715 *this, State); 9716 } 9717 9718 void VPWidenSelectRecipe::execute(VPTransformState &State) { 9719 State.ILV->widenSelectInstruction(*cast<SelectInst>(getUnderlyingInstr()), 9720 this, *this, InvariantCond, State); 9721 } 9722 9723 void VPWidenRecipe::execute(VPTransformState &State) { 9724 State.ILV->widenInstruction(*getUnderlyingInstr(), this, *this, State); 9725 } 9726 9727 void VPWidenGEPRecipe::execute(VPTransformState &State) { 9728 State.ILV->widenGEP(cast<GetElementPtrInst>(getUnderlyingInstr()), this, 9729 *this, State.UF, State.VF, IsPtrLoopInvariant, 9730 IsIndexLoopInvariant, State); 9731 } 9732 9733 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { 9734 assert(!State.Instance && "Int or FP induction being replicated."); 9735 State.ILV->widenIntOrFpInduction(IV, getStartValue()->getLiveInIRValue(), 9736 getTruncInst(), getVPValue(0), 9737 getCastValue(), State); 9738 } 9739 9740 void VPWidenPHIRecipe::execute(VPTransformState &State) { 9741 State.ILV->widenPHIInstruction(cast<PHINode>(getUnderlyingValue()), this, 9742 State); 9743 } 9744 9745 void VPBlendRecipe::execute(VPTransformState &State) { 9746 State.ILV->setDebugLocFromInst(Phi, &State.Builder); 9747 // We know that all PHIs in non-header blocks are converted into 9748 // selects, so we don't have to worry about the insertion order and we 9749 // can just use the builder. 9750 // At this point we generate the predication tree. There may be 9751 // duplications since this is a simple recursive scan, but future 9752 // optimizations will clean it up. 9753 9754 unsigned NumIncoming = getNumIncomingValues(); 9755 9756 // Generate a sequence of selects of the form: 9757 // SELECT(Mask3, In3, 9758 // SELECT(Mask2, In2, 9759 // SELECT(Mask1, In1, 9760 // In0))) 9761 // Note that Mask0 is never used: lanes for which no path reaches this phi and 9762 // are essentially undef are taken from In0. 
9763 InnerLoopVectorizer::VectorParts Entry(State.UF); 9764 for (unsigned In = 0; In < NumIncoming; ++In) { 9765 for (unsigned Part = 0; Part < State.UF; ++Part) { 9766 // We might have single edge PHIs (blocks) - use an identity 9767 // 'select' for the first PHI operand. 9768 Value *In0 = State.get(getIncomingValue(In), Part); 9769 if (In == 0) 9770 Entry[Part] = In0; // Initialize with the first incoming value. 9771 else { 9772 // Select between the current value and the previous incoming edge 9773 // based on the incoming mask. 9774 Value *Cond = State.get(getMask(In), Part); 9775 Entry[Part] = 9776 State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi"); 9777 } 9778 } 9779 } 9780 for (unsigned Part = 0; Part < State.UF; ++Part) 9781 State.set(this, Entry[Part], Part); 9782 } 9783 9784 void VPInterleaveRecipe::execute(VPTransformState &State) { 9785 assert(!State.Instance && "Interleave group being replicated."); 9786 State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(), 9787 getStoredValues(), getMask()); 9788 } 9789 9790 void VPReductionRecipe::execute(VPTransformState &State) { 9791 assert(!State.Instance && "Reduction being replicated."); 9792 Value *PrevInChain = State.get(getChainOp(), 0); 9793 RecurKind Kind = RdxDesc->getRecurrenceKind(); 9794 bool IsOrdered = State.ILV->useOrderedReductions(*RdxDesc); 9795 // Propagate the fast-math flags carried by the underlying instruction. 9796 IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder); 9797 State.Builder.setFastMathFlags(RdxDesc->getFastMathFlags()); 9798 for (unsigned Part = 0; Part < State.UF; ++Part) { 9799 Value *NewVecOp = State.get(getVecOp(), Part); 9800 if (VPValue *Cond = getCondOp()) { 9801 Value *NewCond = State.get(Cond, Part); 9802 VectorType *VecTy = cast<VectorType>(NewVecOp->getType()); 9803 Value *Iden = RdxDesc->getRecurrenceIdentity( 9804 Kind, VecTy->getElementType(), RdxDesc->getFastMathFlags()); 9805 Value *IdenVec = 9806 State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden); 9807 Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec); 9808 NewVecOp = Select; 9809 } 9810 Value *NewRed; 9811 Value *NextInChain; 9812 if (IsOrdered) { 9813 if (State.VF.isVector()) 9814 NewRed = createOrderedReduction(State.Builder, *RdxDesc, NewVecOp, 9815 PrevInChain); 9816 else 9817 NewRed = State.Builder.CreateBinOp( 9818 (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), PrevInChain, 9819 NewVecOp); 9820 PrevInChain = NewRed; 9821 } else { 9822 PrevInChain = State.get(getChainOp(), Part); 9823 NewRed = createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp); 9824 } 9825 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9826 NextInChain = 9827 createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(), 9828 NewRed, PrevInChain); 9829 } else if (IsOrdered) 9830 NextInChain = NewRed; 9831 else 9832 NextInChain = State.Builder.CreateBinOp( 9833 (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), NewRed, 9834 PrevInChain); 9835 State.set(this, NextInChain, Part); 9836 } 9837 } 9838 9839 void VPReplicateRecipe::execute(VPTransformState &State) { 9840 if (State.Instance) { // Generate a single instance. 9841 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector"); 9842 State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *this, 9843 *State.Instance, IsPredicated, State); 9844 // Insert scalar instance packing it into a vector. 9845 if (AlsoPack && State.VF.isVector()) { 9846 // If we're constructing lane 0, initialize to start from poison. 
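// Illustrative: with VF = 4, the first predicated lane seeds a <4 x Ty>
// poison vector for this part; packScalarIntoVectorValue then inserts each
// lane's scalar result into that vector as the remaining lanes are generated.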
9847 if (State.Instance->Lane.isFirstLane()) {
9848 assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
9849 Value *Poison = PoisonValue::get(
9850 VectorType::get(getUnderlyingValue()->getType(), State.VF));
9851 State.set(this, Poison, State.Instance->Part);
9852 }
9853 State.ILV->packScalarIntoVectorValue(this, *State.Instance, State);
9854 }
9855 return;
9856 }
9857
9858 // If the instruction is a store to a uniform address, we only need to
9859 // generate the last lane for the last UF part.
9860 Instruction *I = getUnderlyingInstr();
9861 if (State.VF.isVector() && IsUniform && isa<StoreInst>(I)) {
9862 VPLane Lane = VPLane::getLastLaneForVF(State.VF);
9863 State.ILV->scalarizeInstruction(
9864 I, this, *this, VPIteration(State.UF - 1, Lane), IsPredicated, State);
9865 return;
9866 }
9867
9868 // Generate scalar instances for all VF lanes of all UF parts, unless the
9869 // instruction is uniform in which case generate only the first lane for each
9870 // of the UF parts.
9871 unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();
9872 assert((!State.VF.isScalable() || IsUniform) &&
9873 "Can't scalarize a scalable vector");
9874 for (unsigned Part = 0; Part < State.UF; ++Part)
9875 for (unsigned Lane = 0; Lane < EndLane; ++Lane)
9876 State.ILV->scalarizeInstruction(I, this, *this, VPIteration(Part, Lane),
9877 IsPredicated, State);
9878 }
9879
9880 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
9881 assert(State.Instance && "Branch on Mask works only on single instance.");
9882
9883 unsigned Part = State.Instance->Part;
9884 unsigned Lane = State.Instance->Lane.getKnownLane();
9885
9886 Value *ConditionBit = nullptr;
9887 VPValue *BlockInMask = getMask();
9888 if (BlockInMask) {
9889 ConditionBit = State.get(BlockInMask, Part);
9890 if (ConditionBit->getType()->isVectorTy())
9891 ConditionBit = State.Builder.CreateExtractElement(
9892 ConditionBit, State.Builder.getInt32(Lane));
9893 } else // Block in mask is all-one.
9894 ConditionBit = State.Builder.getTrue();
9895
9896 // Replace the temporary unreachable terminator with a new conditional branch,
9897 // whose two destinations will be set later when they are created.
9898 auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
9899 assert(isa<UnreachableInst>(CurrentTerminator) &&
9900 "Expected to replace unreachable terminator with conditional branch.");
9901 auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
9902 CondBr->setSuccessor(0, nullptr);
9903 ReplaceInstWithInst(CurrentTerminator, CondBr);
9904 }
9905
9906 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
9907 assert(State.Instance && "Predicated instruction PHI works per instance.");
9908 Instruction *ScalarPredInst =
9909 cast<Instruction>(State.get(getOperand(0), *State.Instance));
9910 BasicBlock *PredicatedBB = ScalarPredInst->getParent();
9911 BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
9912 assert(PredicatingBB && "Predicated block has no single predecessor.");
9913 assert(isa<VPReplicateRecipe>(getOperand(0)) &&
9914 "operand must be VPReplicateRecipe");
9915
9916 // By current pack/unpack logic we need to generate only a single phi node: if
9917 // a vector value for the predicated instruction exists at this point it means
9918 // the instruction has vector users only, and a phi for the vector value is
9919 // needed.
In this case the recipe of the predicated instruction is marked to 9920 // also do that packing, thereby "hoisting" the insert-element sequence. 9921 // Otherwise, a phi node for the scalar value is needed. 9922 unsigned Part = State.Instance->Part; 9923 if (State.hasVectorValue(getOperand(0), Part)) { 9924 Value *VectorValue = State.get(getOperand(0), Part); 9925 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); 9926 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); 9927 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. 9928 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. 9929 if (State.hasVectorValue(this, Part)) 9930 State.reset(this, VPhi, Part); 9931 else 9932 State.set(this, VPhi, Part); 9933 // NOTE: Currently we need to update the value of the operand, so the next 9934 // predicated iteration inserts its generated value in the correct vector. 9935 State.reset(getOperand(0), VPhi, Part); 9936 } else { 9937 Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType(); 9938 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); 9939 Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()), 9940 PredicatingBB); 9941 Phi->addIncoming(ScalarPredInst, PredicatedBB); 9942 if (State.hasScalarValue(this, *State.Instance)) 9943 State.reset(this, Phi, *State.Instance); 9944 else 9945 State.set(this, Phi, *State.Instance); 9946 // NOTE: Currently we need to update the value of the operand, so the next 9947 // predicated iteration inserts its generated value in the correct vector. 9948 State.reset(getOperand(0), Phi, *State.Instance); 9949 } 9950 } 9951 9952 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { 9953 VPValue *StoredValue = isStore() ? getStoredValue() : nullptr; 9954 State.ILV->vectorizeMemoryInstruction( 9955 &Ingredient, State, StoredValue ? nullptr : getVPSingleValue(), getAddr(), 9956 StoredValue, getMask(), Consecutive, Reverse); 9957 } 9958 9959 // Determine how to lower the scalar epilogue, which depends on 1) optimising 9960 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing 9961 // predication, and 4) a TTI hook that analyses whether the loop is suitable 9962 // for predication. 9963 static ScalarEpilogueLowering getScalarEpilogueLowering( 9964 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, 9965 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, 9966 AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, 9967 LoopVectorizationLegality &LVL) { 9968 // 1) OptSize takes precedence over all other options, i.e. if this is set, 9969 // don't look at hints or options, and don't request a scalar epilogue. 9970 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from 9971 // LoopAccessInfo (due to code dependency and not being able to reliably get 9972 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection 9973 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without 9974 // versioning when the vectorization is forced, unlike hasOptSize. So revert 9975 // back to the old way and vectorize with versioning when forced. See D81345.) 
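// For example (illustrative): a function marked optsize always yields
// CM_ScalarEpilogueNotAllowedOptSize below, even if loop hints or the
// PreferPredicateOverEpilogue option would otherwise request predication.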
9976 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, 9977 PGSOQueryType::IRPass) && 9978 Hints.getForce() != LoopVectorizeHints::FK_Enabled)) 9979 return CM_ScalarEpilogueNotAllowedOptSize; 9980 9981 // 2) If set, obey the directives 9982 if (PreferPredicateOverEpilogue.getNumOccurrences()) { 9983 switch (PreferPredicateOverEpilogue) { 9984 case PreferPredicateTy::ScalarEpilogue: 9985 return CM_ScalarEpilogueAllowed; 9986 case PreferPredicateTy::PredicateElseScalarEpilogue: 9987 return CM_ScalarEpilogueNotNeededUsePredicate; 9988 case PreferPredicateTy::PredicateOrDontVectorize: 9989 return CM_ScalarEpilogueNotAllowedUsePredicate; 9990 }; 9991 } 9992 9993 // 3) If set, obey the hints 9994 switch (Hints.getPredicate()) { 9995 case LoopVectorizeHints::FK_Enabled: 9996 return CM_ScalarEpilogueNotNeededUsePredicate; 9997 case LoopVectorizeHints::FK_Disabled: 9998 return CM_ScalarEpilogueAllowed; 9999 }; 10000 10001 // 4) if the TTI hook indicates this is profitable, request predication. 10002 if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, 10003 LVL.getLAI())) 10004 return CM_ScalarEpilogueNotNeededUsePredicate; 10005 10006 return CM_ScalarEpilogueAllowed; 10007 } 10008 10009 Value *VPTransformState::get(VPValue *Def, unsigned Part) { 10010 // If Values have been set for this Def return the one relevant for \p Part. 10011 if (hasVectorValue(Def, Part)) 10012 return Data.PerPartOutput[Def][Part]; 10013 10014 if (!hasScalarValue(Def, {Part, 0})) { 10015 Value *IRV = Def->getLiveInIRValue(); 10016 Value *B = ILV->getBroadcastInstrs(IRV); 10017 set(Def, B, Part); 10018 return B; 10019 } 10020 10021 Value *ScalarValue = get(Def, {Part, 0}); 10022 // If we aren't vectorizing, we can just copy the scalar map values over 10023 // to the vector map. 10024 if (VF.isScalar()) { 10025 set(Def, ScalarValue, Part); 10026 return ScalarValue; 10027 } 10028 10029 auto *RepR = dyn_cast<VPReplicateRecipe>(Def); 10030 bool IsUniform = RepR && RepR->isUniform(); 10031 10032 unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1; 10033 // Check if there is a scalar value for the selected lane. 10034 if (!hasScalarValue(Def, {Part, LastLane})) { 10035 // At the moment, VPWidenIntOrFpInductionRecipes can also be uniform. 10036 assert(isa<VPWidenIntOrFpInductionRecipe>(Def->getDef()) && 10037 "unexpected recipe found to be invariant"); 10038 IsUniform = true; 10039 LastLane = 0; 10040 } 10041 10042 auto *LastInst = cast<Instruction>(get(Def, {Part, LastLane})); 10043 // Set the insert point after the last scalarized instruction or after the 10044 // last PHI, if LastInst is a PHI. This ensures the insertelement sequence 10045 // will directly follow the scalar definitions. 10046 auto OldIP = Builder.saveIP(); 10047 auto NewIP = 10048 isa<PHINode>(LastInst) 10049 ? BasicBlock::iterator(LastInst->getParent()->getFirstNonPHI()) 10050 : std::next(BasicBlock::iterator(LastInst)); 10051 Builder.SetInsertPoint(&*NewIP); 10052 10053 // However, if we are vectorizing, we need to construct the vector values. 10054 // If the value is known to be uniform after vectorization, we can just 10055 // broadcast the scalar value corresponding to lane zero for each unroll 10056 // iteration. Otherwise, we construct the vector values using 10057 // insertelement instructions. Since the resulting vectors are stored in 10058 // State, we will only generate the insertelements once. 
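// Illustrative IR for the non-uniform case with VF = 4 (element type and
// value names hypothetical):
//   %pack.0 = insertelement <4 x i32> poison, i32 %s0, i32 0
//   %pack.1 = insertelement <4 x i32> %pack.0, i32 %s1, i32 1
//   %pack.2 = insertelement <4 x i32> %pack.1, i32 %s2, i32 2
//   %pack.3 = insertelement <4 x i32> %pack.2, i32 %s3, i32 3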
10059 Value *VectorValue = nullptr;
10060 if (IsUniform) {
10061 VectorValue = ILV->getBroadcastInstrs(ScalarValue);
10062 set(Def, VectorValue, Part);
10063 } else {
10064 // Initialize packing with insertelements to start from poison.
10065 assert(!VF.isScalable() && "VF is assumed to be non scalable.");
10066 Value *Poison = PoisonValue::get(VectorType::get(LastInst->getType(), VF));
10067 set(Def, Poison, Part);
10068 for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
10069 ILV->packScalarIntoVectorValue(Def, {Part, Lane}, *this);
10070 VectorValue = get(Def, Part);
10071 }
10072 Builder.restoreIP(OldIP);
10073 return VectorValue;
10074 }
10075
10076 // Process the loop in the VPlan-native vectorization path. This path builds
10077 // VPlan upfront in the vectorization pipeline, which allows applying
10078 // VPlan-to-VPlan transformations from the very beginning without modifying the
10079 // input LLVM IR.
10080 static bool processLoopInVPlanNativePath(
10081 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
10082 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
10083 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
10084 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
10085 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints,
10086 LoopVectorizationRequirements &Requirements) {
10087
10088 if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
10089 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
10090 return false;
10091 }
10092 assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
10093 Function *F = L->getHeader()->getParent();
10094 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
10095
10096 ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
10097 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);
10098
10099 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
10100 &Hints, IAI);
10101 // Use the planner for outer loop vectorization.
10102 // TODO: CM is not used at this point inside the planner. Turn CM into an
10103 // optional argument if we don't need it in the future.
10104 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints,
10105 Requirements, ORE);
10106
10107 // Get user vectorization factor.
10108 ElementCount UserVF = Hints.getWidth();
10109
10110 CM.collectElementTypesForWidening();
10111
10112 // Plan how to best vectorize, return the best VF and its cost.
10113 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
10114
10115 // If we are stress testing VPlan builds, do not attempt to generate vector
10116 // code. Masked vector code generation support will follow soon.
10117 // Also, do not attempt to vectorize if no vector code will be produced.
10118 if (VPlanBuildStressTest || EnableVPlanPredication ||
10119 VectorizationFactor::Disabled() == VF)
10120 return false;
10121
10122 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
10123
10124 {
10125 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
10126 F->getParent()->getDataLayout());
10127 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
10128 &CM, BFI, PSI, Checks);
10129 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
10130 << L->getHeader()->getParent()->getName() << "\"\n");
10131 LVP.executePlan(VF.Width, 1, BestPlan, LB, DT);
10132 }
10133
10134 // Mark the loop as already vectorized to avoid vectorizing again.
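// This typically amounts to attaching loop metadata of the (illustrative)
// form !{!"llvm.loop.isvectorized", i32 1}, which a later run of the
// vectorizer recognizes and bails out on.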
10135 Hints.setAlreadyVectorized();
10136 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
10137 return true;
10138 }
10139
10140 // Emit a remark if there are stores to floats that required a floating point
10141 // extension. If the vectorized loop was generated with double precision there
10142 // will be a performance penalty from the conversion overhead and the change in
10143 // the vector width.
10144 static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
10145 SmallVector<Instruction *, 4> Worklist;
10146 for (BasicBlock *BB : L->getBlocks()) {
10147 for (Instruction &Inst : *BB) {
10148 if (auto *S = dyn_cast<StoreInst>(&Inst)) {
10149 if (S->getValueOperand()->getType()->isFloatTy())
10150 Worklist.push_back(S);
10151 }
10152 }
10153 }
10154
10155 // Traverse the floating point stores upwards, searching for floating point
10156 // conversions.
10157 SmallPtrSet<const Instruction *, 4> Visited;
10158 SmallPtrSet<const Instruction *, 4> EmittedRemark;
10159 while (!Worklist.empty()) {
10160 auto *I = Worklist.pop_back_val();
10161 if (!L->contains(I))
10162 continue;
10163 if (!Visited.insert(I).second)
10164 continue;
10165
10166 // Emit a remark if the floating point store required a floating
10167 // point conversion.
10168 // TODO: More work could be done to identify the root cause such as a
10169 // constant or a function return type and point the user to it.
10170 if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second)
10171 ORE->emit([&]() {
10172 return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
10173 I->getDebugLoc(), L->getHeader())
10174 << "floating point conversion changes vector width. "
10175 << "Mixed floating point precision requires an up/down "
10176 << "cast that will negatively impact performance.";
10177 });
10178
10179 for (Use &Op : I->operands())
10180 if (auto *OpI = dyn_cast<Instruction>(Op))
10181 Worklist.push_back(OpI);
10182 }
10183 }
10184
10185 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
10186 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
10187 !EnableLoopInterleaving),
10188 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
10189 !EnableLoopVectorization) {}
10190
10191 bool LoopVectorizePass::processLoop(Loop *L) {
10192 assert((EnableVPlanNativePath || L->isInnermost()) &&
10193 "VPlan-native path is not enabled. Only process inner loops.");
10194
10195 #ifndef NDEBUG
10196 const std::string DebugLocStr = getDebugLocString(L);
10197 #endif /* NDEBUG */
10198
10199 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
10200 << L->getHeader()->getParent()->getName() << "\" from "
10201 << DebugLocStr << "\n");
10202
10203 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE);
10204
10205 LLVM_DEBUG(
10206 dbgs() << "LV: Loop hints:"
10207 << " force="
10208 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
10209 ? "disabled"
10210 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
10211 ? "enabled"
10212 : "?"))
10213 << " width=" << Hints.getWidth()
10214 << " interleave=" << Hints.getInterleave() << "\n");
10215
10216 // Function containing loop
10217 Function *F = L->getHeader()->getParent();
10218
10219 // Looking at the diagnostic output is the only way to determine if a loop
10220 // was vectorized (other than looking at the IR or machine code), so it
10221 // is important to generate an optimization remark for each loop. Most of
10222 // these messages are generated as OptimizationRemarkAnalysis. Remarks
10223 // generated as OptimizationRemark and OptimizationRemarkMissed report, less
10224 // verbosely, vectorized loops and unvectorized loops that may benefit from
10225 // vectorization, respectively.
10226
10227 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
10228 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
10229 return false;
10230 }
10231
10232 PredicatedScalarEvolution PSE(*SE, *L);
10233
10234 // Check if it is legal to vectorize the loop.
10235 LoopVectorizationRequirements Requirements;
10236 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
10237 &Requirements, &Hints, DB, AC, BFI, PSI);
10238 if (!LVL.canVectorize(EnableVPlanNativePath)) {
10239 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
10240 Hints.emitRemarkWithHints();
10241 return false;
10242 }
10243
10244 // Check the function attributes and profiles to find out if this function
10245 // should be optimized for size.
10246 ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
10247 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL);
10248
10249 // Entrance to the VPlan-native vectorization path. Outer loops are processed
10250 // here. They may require CFG and instruction level transformations before
10251 // even evaluating whether vectorization is profitable. Since we cannot modify
10252 // the incoming IR, we need to build VPlan upfront in the vectorization
10253 // pipeline.
10254 if (!L->isInnermost())
10255 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
10256 ORE, BFI, PSI, Hints, Requirements);
10257
10258 assert(L->isInnermost() && "Inner loop expected.");
10259
10260 // Check the loop for a trip count threshold: vectorize loops with a tiny trip
10261 // count by optimizing for size, to minimize overheads.
10262 auto ExpectedTC = getSmallBestKnownTC(*SE, L);
10263 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
10264 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
10265 << "This loop is worth vectorizing only if no scalar "
10266 << "iteration overheads are incurred.");
10267 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
10268 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
10269 else {
10270 LLVM_DEBUG(dbgs() << "\n");
10271 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
10272 }
10273 }
10274
10275 // Check the function attributes to see if implicit floats are allowed.
10276 // FIXME: This check doesn't seem possibly correct -- what if the loop is
10277 // an integer loop and the vector instructions selected are purely integer
10278 // vector instructions?
10279 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
10280 reportVectorizationFailure(
10281 "Can't vectorize when the NoImplicitFloat attribute is used",
10282 "loop not vectorized due to NoImplicitFloat attribute",
10283 "NoImplicitFloat", ORE, L);
10284 Hints.emitRemarkWithHints();
10285 return false;
10286 }
10287
10288 // Check if the target supports potentially unsafe FP vectorization.
10289 // FIXME: Add a check for the type of safety issue (denormal, signaling)
10290 // for the target we're vectorizing for, to make sure none of the
10291 // additional fp-math flags can help.
10292 if (Hints.isPotentiallyUnsafe() && 10293 TTI->isFPVectorizationPotentiallyUnsafe()) { 10294 reportVectorizationFailure( 10295 "Potentially unsafe FP op prevents vectorization", 10296 "loop not vectorized due to unsafe FP support.", 10297 "UnsafeFP", ORE, L); 10298 Hints.emitRemarkWithHints(); 10299 return false; 10300 } 10301 10302 bool AllowOrderedReductions; 10303 // If the flag is set, use that instead and override the TTI behaviour. 10304 if (ForceOrderedReductions.getNumOccurrences() > 0) 10305 AllowOrderedReductions = ForceOrderedReductions; 10306 else 10307 AllowOrderedReductions = TTI->enableOrderedReductions(); 10308 if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) { 10309 ORE->emit([&]() { 10310 auto *ExactFPMathInst = Requirements.getExactFPInst(); 10311 return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps", 10312 ExactFPMathInst->getDebugLoc(), 10313 ExactFPMathInst->getParent()) 10314 << "loop not vectorized: cannot prove it is safe to reorder " 10315 "floating-point operations"; 10316 }); 10317 LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to " 10318 "reorder floating-point operations\n"); 10319 Hints.emitRemarkWithHints(); 10320 return false; 10321 } 10322 10323 bool UseInterleaved = TTI->enableInterleavedAccessVectorization(); 10324 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI()); 10325 10326 // If an override option has been passed in for interleaved accesses, use it. 10327 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0) 10328 UseInterleaved = EnableInterleavedMemAccesses; 10329 10330 // Analyze interleaved memory accesses. 10331 if (UseInterleaved) { 10332 IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI)); 10333 } 10334 10335 // Use the cost model. 10336 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, 10337 F, &Hints, IAI); 10338 CM.collectValuesToIgnore(); 10339 CM.collectElementTypesForWidening(); 10340 10341 // Use the planner for vectorization. 10342 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints, 10343 Requirements, ORE); 10344 10345 // Get user vectorization factor and interleave count. 10346 ElementCount UserVF = Hints.getWidth(); 10347 unsigned UserIC = Hints.getInterleave(); 10348 10349 // Plan how to best vectorize, return the best VF and its cost. 10350 Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC); 10351 10352 VectorizationFactor VF = VectorizationFactor::Disabled(); 10353 unsigned IC = 1; 10354 10355 if (MaybeVF) { 10356 VF = *MaybeVF; 10357 // Select the interleave count. 10358 IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue()); 10359 } 10360 10361 // Identify the diagnostic messages that should be produced. 10362 std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg; 10363 bool VectorizeLoop = true, InterleaveLoop = true; 10364 if (VF.Width.isScalar()) { 10365 LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n"); 10366 VecDiagMsg = std::make_pair( 10367 "VectorizationNotBeneficial", 10368 "the cost-model indicates that vectorization is not beneficial"); 10369 VectorizeLoop = false; 10370 } 10371 10372 if (!MaybeVF && UserIC > 1) { 10373 // Tell the user interleaving was avoided up-front, despite being explicitly 10374 // requested. 
10375 LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
10376 "interleaving should be avoided up front\n");
10377 IntDiagMsg = std::make_pair(
10378 "InterleavingAvoided",
10379 "Ignoring UserIC, because interleaving was avoided up front");
10380 InterleaveLoop = false;
10381 } else if (IC == 1 && UserIC <= 1) {
10382 // Tell the user interleaving is not beneficial.
10383 LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
10384 IntDiagMsg = std::make_pair(
10385 "InterleavingNotBeneficial",
10386 "the cost-model indicates that interleaving is not beneficial");
10387 InterleaveLoop = false;
10388 if (UserIC == 1) {
10389 IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
10390 IntDiagMsg.second +=
10391 " and is explicitly disabled or interleave count is set to 1";
10392 }
10393 } else if (IC > 1 && UserIC == 1) {
10394 // Tell the user interleaving is beneficial, but it is explicitly disabled.
10395 LLVM_DEBUG(
10396 dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
10397 IntDiagMsg = std::make_pair(
10398 "InterleavingBeneficialButDisabled",
10399 "the cost-model indicates that interleaving is beneficial "
10400 "but is explicitly disabled or interleave count is set to 1");
10401 InterleaveLoop = false;
10402 }
10403
10404 // Override IC if user provided an interleave count.
10405 IC = UserIC > 0 ? UserIC : IC;
10406
10407 // Emit diagnostic messages, if any.
10408 const char *VAPassName = Hints.vectorizeAnalysisPassName();
10409 if (!VectorizeLoop && !InterleaveLoop) {
10410 // Do not vectorize or interleave the loop.
10411 ORE->emit([&]() {
10412 return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
10413 L->getStartLoc(), L->getHeader())
10414 << VecDiagMsg.second;
10415 });
10416 ORE->emit([&]() {
10417 return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
10418 L->getStartLoc(), L->getHeader())
10419 << IntDiagMsg.second;
10420 });
10421 return false;
10422 } else if (!VectorizeLoop && InterleaveLoop) {
10423 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10424 ORE->emit([&]() {
10425 return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
10426 L->getStartLoc(), L->getHeader())
10427 << VecDiagMsg.second;
10428 });
10429 } else if (VectorizeLoop && !InterleaveLoop) {
10430 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10431 << ") in " << DebugLocStr << '\n');
10432 ORE->emit([&]() {
10433 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
10434 L->getStartLoc(), L->getHeader())
10435 << IntDiagMsg.second;
10436 });
10437 } else if (VectorizeLoop && InterleaveLoop) {
10438 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10439 << ") in " << DebugLocStr << '\n');
10440 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10441 }
10442
10443 bool DisableRuntimeUnroll = false;
10444 MDNode *OrigLoopID = L->getLoopID();
10445 {
10446 // Optimistically generate runtime checks. Drop them if they turn out to not
10447 // be profitable. Limit the scope of Checks, so the cleanup happens
10448 // immediately after vector code generation is done.
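// The generated checks are of two kinds (sketch, not exhaustive): SCEV
// predicate checks taken from PSE.getUnionPredicate(), and memory runtime
// checks taken from the LoopAccessInfo; both guard entry to the vector loop.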
10449 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, 10450 F->getParent()->getDataLayout()); 10451 if (!VF.Width.isScalar() || IC > 1) 10452 Checks.Create(L, *LVL.getLAI(), PSE.getUnionPredicate()); 10453 10454 using namespace ore; 10455 if (!VectorizeLoop) { 10456 assert(IC > 1 && "interleave count should not be 1 or 0"); 10457 // If we decided that it is not legal to vectorize the loop, then 10458 // interleave it. 10459 InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, 10460 &CM, BFI, PSI, Checks); 10461 10462 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); 10463 LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT); 10464 10465 ORE->emit([&]() { 10466 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(), 10467 L->getHeader()) 10468 << "interleaved loop (interleaved count: " 10469 << NV("InterleaveCount", IC) << ")"; 10470 }); 10471 } else { 10472 // If we decided that it is *legal* to vectorize the loop, then do it. 10473 10474 // Consider vectorizing the epilogue too if it's profitable. 10475 VectorizationFactor EpilogueVF = 10476 CM.selectEpilogueVectorizationFactor(VF.Width, LVP); 10477 if (EpilogueVF.Width.isVector()) { 10478 10479 // The first pass vectorizes the main loop and creates a scalar epilogue 10480 // to be vectorized by executing the plan (potentially with a different 10481 // factor) again shortly afterwards. 10482 EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1); 10483 EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, 10484 EPI, &LVL, &CM, BFI, PSI, Checks); 10485 10486 VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF); 10487 LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV, 10488 DT); 10489 ++LoopsVectorized; 10490 10491 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */); 10492 formLCSSARecursively(*L, *DT, LI, SE); 10493 10494 // Second pass vectorizes the epilogue and adjusts the control flow 10495 // edges from the first pass. 10496 EPI.MainLoopVF = EPI.EpilogueVF; 10497 EPI.MainLoopUF = EPI.EpilogueUF; 10498 EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC, 10499 ORE, EPI, &LVL, &CM, BFI, PSI, 10500 Checks); 10501 10502 VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF); 10503 LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV, 10504 DT); 10505 ++LoopsEpilogueVectorized; 10506 10507 if (!MainILV.areSafetyChecksAdded()) 10508 DisableRuntimeUnroll = true; 10509 } else { 10510 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC, 10511 &LVL, &CM, BFI, PSI, Checks); 10512 10513 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); 10514 LVP.executePlan(VF.Width, IC, BestPlan, LB, DT); 10515 ++LoopsVectorized; 10516 10517 // Add metadata to disable runtime unrolling a scalar loop when there 10518 // are no runtime checks about strides and memory. A scalar loop that is 10519 // rarely used is not worth unrolling. 10520 if (!LB.areSafetyChecksAdded()) 10521 DisableRuntimeUnroll = true; 10522 } 10523 // Report the vectorization decision. 
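// For example, the emitted remark takes the (illustrative) form:
//   "vectorized loop (vectorization width: 4, interleaved count: 2)"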
10524 ORE->emit([&]() {
10525 return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
10526 L->getHeader())
10527 << "vectorized loop (vectorization width: "
10528 << NV("VectorizationFactor", VF.Width)
10529 << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
10530 });
10531 }
10532
10533 if (ORE->allowExtraAnalysis(LV_NAME))
10534 checkMixedPrecision(L, ORE);
10535 }
10536
10537 Optional<MDNode *> RemainderLoopID =
10538 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
10539 LLVMLoopVectorizeFollowupEpilogue});
10540 if (RemainderLoopID.hasValue()) {
10541 L->setLoopID(RemainderLoopID.getValue());
10542 } else {
10543 if (DisableRuntimeUnroll)
10544 AddRuntimeUnrollDisableMetaData(L);
10545
10546 // Mark the loop as already vectorized to avoid vectorizing again.
10547 Hints.setAlreadyVectorized();
10548 }
10549
10550 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
10551 return true;
10552 }
10553
10554 LoopVectorizeResult LoopVectorizePass::runImpl(
10555 Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
10556 DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
10557 DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_,
10558 std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
10559 OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
10560 SE = &SE_;
10561 LI = &LI_;
10562 TTI = &TTI_;
10563 DT = &DT_;
10564 BFI = &BFI_;
10565 TLI = TLI_;
10566 AA = &AA_;
10567 AC = &AC_;
10568 GetLAA = &GetLAA_;
10569 DB = &DB_;
10570 ORE = &ORE_;
10571 PSI = PSI_;
10572
10573 // Don't attempt if
10574 // 1. the target claims to have no vector registers, and
10575 // 2. interleaving won't help ILP.
10576 //
10577 // The second condition is necessary because, even if the target has no
10578 // vector registers, loop vectorization may still enable scalar
10579 // interleaving.
10580 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
10581 TTI->getMaxInterleaveFactor(1) < 2)
10582 return LoopVectorizeResult(false, false);
10583
10584 bool Changed = false, CFGChanged = false;
10585
10586 // The vectorizer requires loops to be in simplified form.
10587 // Since simplification may add new inner loops, it has to run before the
10588 // legality and profitability checks. This means running the loop vectorizer
10589 // will simplify all loops, regardless of whether anything ends up being
10590 // vectorized.
10591 for (auto &L : *LI)
10592 Changed |= CFGChanged |=
10593 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
10594
10595 // Build up a worklist of inner loops to vectorize. This is necessary as
10596 // the act of vectorizing or partially unrolling a loop creates new loops
10597 // and can invalidate iterators across the loops.
10598 SmallVector<Loop *, 8> Worklist;
10599
10600 for (Loop *L : *LI)
10601 collectSupportedLoops(*L, LI, ORE, Worklist);
10602
10603 LoopsAnalyzed += Worklist.size();
10604
10605 // Now walk the identified inner loops.
10606 while (!Worklist.empty()) {
10607 Loop *L = Worklist.pop_back_val();
10608
10609 // For the inner loops we actually process, form LCSSA to simplify the
10610 // transform.
10611 Changed |= formLCSSARecursively(*L, *DT, LI, SE);
10612
10613 Changed |= CFGChanged |= processLoop(L);
10614 }
10615
10616 // Process each loop nest in the function.
10617 return LoopVectorizeResult(Changed, CFGChanged); 10618 } 10619 10620 PreservedAnalyses LoopVectorizePass::run(Function &F, 10621 FunctionAnalysisManager &AM) { 10622 auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F); 10623 auto &LI = AM.getResult<LoopAnalysis>(F); 10624 auto &TTI = AM.getResult<TargetIRAnalysis>(F); 10625 auto &DT = AM.getResult<DominatorTreeAnalysis>(F); 10626 auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F); 10627 auto &TLI = AM.getResult<TargetLibraryAnalysis>(F); 10628 auto &AA = AM.getResult<AAManager>(F); 10629 auto &AC = AM.getResult<AssumptionAnalysis>(F); 10630 auto &DB = AM.getResult<DemandedBitsAnalysis>(F); 10631 auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F); 10632 10633 auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager(); 10634 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 10635 [&](Loop &L) -> const LoopAccessInfo & { 10636 LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, 10637 TLI, TTI, nullptr, nullptr, nullptr}; 10638 return LAM.getResult<LoopAccessAnalysis>(L, AR); 10639 }; 10640 auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F); 10641 ProfileSummaryInfo *PSI = 10642 MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent()); 10643 LoopVectorizeResult Result = 10644 runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI); 10645 if (!Result.MadeAnyChange) 10646 return PreservedAnalyses::all(); 10647 PreservedAnalyses PA; 10648 10649 // We currently do not preserve loopinfo/dominator analyses with outer loop 10650 // vectorization. Until this is addressed, mark these analyses as preserved 10651 // only for non-VPlan-native path. 10652 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 10653 if (!EnableVPlanNativePath) { 10654 PA.preserve<LoopAnalysis>(); 10655 PA.preserve<DominatorTreeAnalysis>(); 10656 } 10657 if (!Result.MadeCFGChange) 10658 PA.preserveSet<CFGAnalyses>(); 10659 return PA; 10660 } 10661 10662 void LoopVectorizePass::printPipeline( 10663 raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) { 10664 static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline( 10665 OS, MapClassName2PassName); 10666 10667 OS << "<"; 10668 OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;"; 10669 OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;"; 10670 OS << ">"; 10671 } 10672