//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
//  Data for SIMD
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
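//
// As an illustrative sketch of the 'wide' iteration transformation described
// above (an editorial example, not taken from the references): with a
// vectorization factor of 4, a scalar loop such as
//
//   for (i = 0; i < n; i++)
//     a[i] = b[i] + 42;
//
// is conceptually rewritten so that each vector iteration processes four
// consecutive elements and the induction variable advances by the vector
// width, with a scalar remainder (epilogue) loop covering the last n % 4
// iterations:
//
//   for (i = 0; i + 4 <= n; i += 4)
//     a[i:i+3] = b[i:i+3] + <42, 42, 42, 42>;  // one 'wide' iteration
//   for (; i < n; i++)                         // scalar epilogue
//     a[i] = b[i] + 42;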
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "LoopVectorizationPlanner.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanHCFGBuilder.h"
#include "VPlanPredicator.h"
#include "VPlanTransforms.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/InstructionCost.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
"llvm/Transforms/Vectorize/LoopVectorizationLegality.h" 144 #include <algorithm> 145 #include <cassert> 146 #include <cstdint> 147 #include <cstdlib> 148 #include <functional> 149 #include <iterator> 150 #include <limits> 151 #include <memory> 152 #include <string> 153 #include <tuple> 154 #include <utility> 155 156 using namespace llvm; 157 158 #define LV_NAME "loop-vectorize" 159 #define DEBUG_TYPE LV_NAME 160 161 #ifndef NDEBUG 162 const char VerboseDebug[] = DEBUG_TYPE "-verbose"; 163 #endif 164 165 /// @{ 166 /// Metadata attribute names 167 const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all"; 168 const char LLVMLoopVectorizeFollowupVectorized[] = 169 "llvm.loop.vectorize.followup_vectorized"; 170 const char LLVMLoopVectorizeFollowupEpilogue[] = 171 "llvm.loop.vectorize.followup_epilogue"; 172 /// @} 173 174 STATISTIC(LoopsVectorized, "Number of loops vectorized"); 175 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization"); 176 STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized"); 177 178 static cl::opt<bool> EnableEpilogueVectorization( 179 "enable-epilogue-vectorization", cl::init(true), cl::Hidden, 180 cl::desc("Enable vectorization of epilogue loops.")); 181 182 static cl::opt<unsigned> EpilogueVectorizationForceVF( 183 "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden, 184 cl::desc("When epilogue vectorization is enabled, and a value greater than " 185 "1 is specified, forces the given VF for all applicable epilogue " 186 "loops.")); 187 188 static cl::opt<unsigned> EpilogueVectorizationMinVF( 189 "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden, 190 cl::desc("Only loops with vectorization factor equal to or larger than " 191 "the specified value are considered for epilogue vectorization.")); 192 193 /// Loops with a known constant trip count below this number are vectorized only 194 /// if no scalar iteration overheads are incurred. 195 static cl::opt<unsigned> TinyTripCountVectorThreshold( 196 "vectorizer-min-trip-count", cl::init(16), cl::Hidden, 197 cl::desc("Loops with a constant trip count that is smaller than this " 198 "value are vectorized only if no scalar iteration overheads " 199 "are incurred.")); 200 201 static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold( 202 "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden, 203 cl::desc("The maximum allowed number of runtime memory checks with a " 204 "vectorize(enable) pragma.")); 205 206 // Option prefer-predicate-over-epilogue indicates that an epilogue is undesired, 207 // that predication is preferred, and this lists all options. I.e., the 208 // vectorizer will try to fold the tail-loop (epilogue) into the vector body 209 // and predicate the instructions accordingly. 
// Option prefer-predicate-over-epilogue indicates that an epilogue is undesired,
// that predication is preferred, and this lists all options. I.e., the
// vectorizer will try to fold the tail-loop (epilogue) into the vector body
// and predicate the instructions accordingly. If tail-folding fails, there are
// different fallback strategies depending on these values:
namespace PreferPredicateTy {
enum Option {
  ScalarEpilogue = 0,
  PredicateElseScalarEpilogue,
  PredicateOrDontVectorize
};
} // namespace PreferPredicateTy

static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
    "prefer-predicate-over-epilogue",
    cl::init(PreferPredicateTy::ScalarEpilogue),
    cl::Hidden,
    cl::desc("Tail-folding and predication preferences over creating a scalar "
             "epilogue loop."),
    cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
                          "scalar-epilogue",
                          "Don't tail-predicate loops, create scalar epilogue"),
               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
                          "predicate-else-scalar-epilogue",
                          "prefer tail-folding, create scalar epilogue if tail "
                          "folding fails."),
               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
                          "predicate-dont-vectorize",
                          "prefer tail-folding, don't attempt vectorization if "
                          "tail-folding fails.")));

static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));

static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
             "below this number"));

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));
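
// To illustrate the trade-off controlled by -prefer-predicate-over-epilogue
// above (an editorial sketch; the numbers are only an example): for a loop
// with trip count 10 and VF = 4, the scalar-epilogue strategy executes two
// unmasked vector iterations followed by a two-iteration scalar remainder
// loop, whereas tail-folding executes three predicated vector iterations and
// masks off the two inactive lanes of the last one, so no remainder loop is
// emitted.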
static cl::opt<bool> ForceTargetSupportsScalableVectors(
    "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
    cl::desc(
        "Pretend that scalable vectors are supported, even if the target does "
        "not support them. This flag should only be used for testing."));

static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// Interleave small loops with scalar reductions.
static cl::opt<bool> InterleaveSmallLoopScalarReduction(
    "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
    cl::desc("Enable interleaving for loops with small iteration counts that "
             "contain scalar reductions to expose ILP."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if-predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

static cl::opt<bool>
    PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
                           cl::Hidden,
                           cl::desc("Prefer in-loop vector reductions, "
                                    "overriding the target's preference."));

static cl::opt<bool> ForceOrderedReductions(
    "force-ordered-reductions", cl::init(false), cl::Hidden,
    cl::desc("Enable the vectorization of loops with in-order (strict) "
             "FP reductions"));

static cl::opt<bool> PreferPredicatedReductionSelect(
    "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
    cl::desc(
        "Prefer predicating a reduction operation over an after-loop select."));

cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));

// FIXME: Remove this switch once we have divergence analysis. Currently we
// assume divergent non-backedge branches when this switch is true.
cl::opt<bool> EnableVPlanPredication(
    "enable-vplan-predication", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path predicator with "
             "support for outer loop vectorization."));

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

cl::opt<bool> PrintVPlansInDotFormat(
    "vplan-print-in-dot-format", cl::init(false), cl::Hidden,
    cl::desc("Use dot format instead of plain text when dumping VPlans"));

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type.
static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
  // Determine if an array of N elements of type Ty is "bitcast compatible"
  // with a <N x Ty> vector.
  // This is only true if there is no padding between the array elements.
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}

/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

/// Returns "best known" trip count for the specified loop \p L as defined by
/// the following procedure:
///   1) Returns exact trip count if it is known.
///   2) Returns expected trip count according to profile data if any.
///   3) Returns upper bound estimate if it is known.
///   4) Returns None if all of the above failed.
static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
  // Check if exact trip count is known.
  if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
    return ExpectedTC;

  // Check if there is an expected trip count available from profile data.
  if (LoopVectorizeWithBlockFrequency)
    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
      return EstimatedTC;

  // Check if upper bound estimate is known.
  if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
    return ExpectedTC;

  return None;
}
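
// For example (an illustrative scenario, not implied by the code above): for a
// loop whose exact trip count is unknown at compile time but whose profile
// data estimates roughly 100 iterations, step 2 returns that estimate; without
// profile data, a compile-time upper bound on the trip count (say, implied by
// a guard such as 'i < 64') is returned by step 3, and only if all of these
// fail does the function return None.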
// Forward declare GeneratedRTChecks.
class GeneratedRTChecks;

namespace llvm {

AnalysisKey ShouldRunExtraVectorPasses::Key;

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found to a given vectorization factor.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                      ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
        PSI(PSI), RTChecks(RTChecks) {
    // Query this against the original loop and save it here because the profile
    // of the original loop header may change as the transformation happens.
    OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
        OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
  }

  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop that will contain vectorized instructions later
  /// on, while the old loop will be used as the scalar remainder. Control flow
  /// is generated around the vectorized (and scalar epilogue) loops consisting
  /// of various checks and bypasses. Return the pre-header block of the new
  /// loop and the start value for the canonical induction, if it is != 0. The
  /// latter is the case when vectorizing the epilogue loop. In the case of
  /// epilogue vectorization, this function is overridden to handle the more
  /// complex control flow around the loops.
  virtual std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton();

  /// Widen a single call instruction within the innermost loop.
  void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
                            VPTransformState &State);

  /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
  void fixVectorizedLoop(VPTransformState &State);

  // Return true if any runtime check is added.
  bool areSafetyChecksAdded() { return AddedSafetyChecks; }

  /// A type for vectorized values in the new loop. Each value from the
  /// original loop, when vectorized, is represented by UF vector values in the
  /// new unrolled loop, where UF is the unroll factor.
  using VectorParts = SmallVector<Value *, 2>;

  /// Vectorize a single first-order recurrence or pointer induction PHINode in
  /// a block. This method handles the induction variable canonicalization. It
  /// supports both VF = 1 for unrolled loops and arbitrary length vectors.
  void widenPHIInstruction(Instruction *PN, VPWidenPHIRecipe *PhiR,
                           VPTransformState &State);

  /// A helper function to scalarize a single Instruction in the innermost loop.
  /// Generates a sequence of scalar instances for each lane between \p MinLane
  /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
  /// inclusive. Uses the VPValue operands from \p RepRecipe instead of \p
  /// Instr's operands.
  void scalarizeInstruction(Instruction *Instr, VPReplicateRecipe *RepRecipe,
                            const VPIteration &Instance, bool IfPredicateInstr,
                            VPTransformState &State);

  /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
  /// is provided, the integer induction variable will first be truncated to
  /// the corresponding type. \p CanonicalIV is the scalar value generated for
  /// the canonical induction variable.
  void widenIntOrFpInduction(PHINode *IV, VPWidenIntOrFpInductionRecipe *Def,
                             VPTransformState &State, Value *CanonicalIV);

  /// Construct the vector value of a scalarized value \p V one lane at a time.
  void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance,
                                 VPTransformState &State);

  /// Try to vectorize interleaved access group \p Group with the base address
  /// given in \p Addr, optionally masking the vector operations if \p
  /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
  /// values in the vectorized loop.
  void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
                                ArrayRef<VPValue *> VPDefs,
                                VPTransformState &State, VPValue *Addr,
                                ArrayRef<VPValue *> StoredValues,
                                VPValue *BlockInMask = nullptr);

  /// Set the debug location in the builder \p CustomBuilder using the debug
  /// location in \p V. If \p CustomBuilder is None then it uses the class
  /// member's Builder.
  void setDebugLocFromInst(const Value *V,
                           Optional<IRBuilderBase *> CustomBuilder = None);

  /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
  void fixNonInductionPHIs(VPTransformState &State);

  /// Returns true if the reordering of FP operations is not allowed, but we are
  /// able to vectorize with strict in-order reductions for the given RdxDesc.
  bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc);

  /// Create a broadcast instruction. This method generates a broadcast
  /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, ...
  /// this is needed because each iteration in the loop corresponds to a SIMD
  /// element.
  virtual Value *getBroadcastInstrs(Value *V);

  /// Add metadata from one instruction to another.
  ///
  /// This includes both the original MDs from \p From and additional ones (\see
  /// addNewMetadata). Use this for *newly created* instructions in the vector
  /// loop.
  void addMetadata(Instruction *To, Instruction *From);

  /// Similar to the previous function but it adds the metadata to a
  /// vector of instructions.
  void addMetadata(ArrayRef<Value *> To, Instruction *From);

  // Returns the resume value (bc.merge.rdx) for a reduction as
  // generated by fixReduction.
  PHINode *getReductionResumeValue(const RecurrenceDescriptor &RdxDesc);

protected:
  friend class LoopVectorizationPlanner;

  /// A small list of PHINodes.
  using PhiVector = SmallVector<PHINode *, 4>;

  /// A type for scalarized values in the new loop. Each value from the
  /// original loop, when scalarized, is represented by UF x VF scalar values
  /// in the new unrolled loop, where UF is the unroll factor and VF is the
  /// vectorization factor.
  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;

  /// Set up the values of the IVs correctly when exiting the vector loop.
  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                    Value *CountRoundDown, Value *EndValue,
                    BasicBlock *MiddleBlock);

  /// Introduce a conditional branch (on true, condition to be set later) at the
  /// end of the header=latch connecting it to itself (across the backedge) and
  /// to the exit block of \p L.
  void createHeaderBranch(Loop *L);

  /// Handle all cross-iteration phis in the header.
  void fixCrossIterationPHIs(VPTransformState &State);

  /// Create the exit value of first order recurrences in the middle block and
  /// update their users.
  void fixFirstOrderRecurrence(VPFirstOrderRecurrencePHIRecipe *PhiR,
                               VPTransformState &State);

  /// Create code for the loop exit value of the reduction.
  void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State);

  /// Clear NSW/NUW flags from reduction instructions if necessary.
  void clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc,
                               VPTransformState &State);

  /// Fixup the LCSSA phi nodes in the unique exit block. This simply
  /// means we need to add the appropriate incoming value from the middle
  /// block as exiting edges from the scalar epilogue loop (if present) are
  /// already in place, and we exit the vector loop exclusively to the middle
  /// block.
  void fixLCSSAPHIs(VPTransformState &State);

  /// Iteratively sink the scalarized operands of a predicated instruction into
  /// the block that was created for it.
  void sinkScalarOperands(Instruction *PredInst);

  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
  /// represented as.
  void truncateToMinimalBitwidths(VPTransformState &State);

  /// Create a vector induction phi node based on an existing scalar one. \p
  /// EntryVal is the value from the original loop that maps to the vector phi
  /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
  /// truncate instruction, instead of widening the original IV, we widen a
  /// version of the IV truncated to \p EntryVal's type.
  void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
                                       Value *Step, Value *Start,
                                       Instruction *EntryVal, VPValue *Def,
                                       VPTransformState &State);

  /// Returns (and creates if needed) the original loop trip count.
  Value *getOrCreateTripCount(Loop *NewLoop);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(Loop *NewLoop);

  /// Returns a bitcasted value to the requested vector type.
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                const DataLayout &DL);

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct. Returns the block containing the checks or
  /// nullptr if no checks have been added.
  BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  /// Returns the block containing the checks or nullptr if no checks have been
  /// added.
  BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);

  /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
  /// vector loop preheader, middle block and scalar preheader. Also
  /// allocate a loop object for the new vector loop and return it.
  Loop *createVectorLoopSkeleton(StringRef Prefix);

  /// Create new phi nodes for the induction variables to resume iteration count
  /// in the scalar epilogue, from where the vectorized loop left off.
  /// In cases where the loop skeleton is more complicated (e.g. epilogue
  /// vectorization) and the resume values can come from an additional bypass
  /// block, the \p AdditionalBypass pair provides information about the bypass
  /// block and the end value on the edge from bypass to this loop.
  void createInductionResumeValues(
      Loop *L,
      std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});

  /// Complete the loop skeleton by adding debug MDs, creating appropriate
  /// conditional branches in the middle block, preparing the builder and
  /// running the verifier. Take in the vector loop \p L as argument, and return
  /// the preheader of the completed vector loop.
  BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);

  /// Add additional metadata to \p To that was not present on \p Orig.
  ///
  /// Currently this is used to add the noalias annotations based on the
  /// inserted memchecks. Use this for instructions that are *cloned* into the
  /// vector loop.
  void addNewMetadata(Instruction *To, const Instruction *Orig);

  /// Collect poison-generating recipes that may generate a poison value that is
  /// used after vectorization, even when their operands are not poison. Those
  /// recipes meet the following conditions:
  /// * Contribute to the address computation of a recipe generating a widen
  ///   memory load/store (VPWidenMemoryInstructionRecipe or
  ///   VPInterleaveRecipe).
  /// * Such a widen memory load/store has at least one underlying Instruction
  ///   that is in a basic block that needs predication and after vectorization
  ///   the generated instruction won't be predicated.
  void collectPoisonGeneratingRecipes(VPTransformState &State);

  /// Allow subclasses to override and print debug traces before/after vplan
  /// execution, when trace information is requested.
  virtual void printDebugTracesAtStart(){};
  virtual void printDebugTracesAtEnd(){};

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Alias Analysis.
  AAResults *AA;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// LoopVersioning. It's only set up (non-null) if memchecks were
  /// used.
  ///
  /// This is currently only used to add no-alias metadata based on the
  /// memchecks. The actual versioning is performed manually.
  std::unique_ptr<LoopVersioning> LVer;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  ElementCount VF;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use.
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;

  /// The unique ExitBlock of the scalar loop if one exists. Note that
  /// there can be multiple exiting edges reaching this block.
  BasicBlock *LoopExitBlock;

  /// The vector loop body.
  BasicBlock *LoopVectorBody;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF)).
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
  // fixed up at the end of vector code generation.
  SmallVector<PHINode *, 8> OrigPHIsToFix;

  /// BFI and PSI are used to check for profile-guided size optimizations.
  BlockFrequencyInfo *BFI;
  ProfileSummaryInfo *PSI;

  // Whether this loop should be optimized for size based on profile-guided
  // size optimizations.
  bool OptForSizeBasedOnProfile;

  /// Structure to hold information about generated runtime checks, responsible
  /// for cleaning the checks, if vectorization turns out unprofitable.
  GeneratedRTChecks &RTChecks;

  // Holds the resume values for reductions in the loops, used to set the
  // correct start value of reduction PHIs when vectorizing the epilogue.
  SmallMapVector<const RecurrenceDescriptor *, PHINode *, 4>
      ReductionResumeValues;
};

class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                    ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            ElementCount::getFixed(1), UnrollFactor, LVL, CM,
                            BFI, PSI, Check) {}

private:
  Value *getBroadcastInstrs(Value *V) override;
};

/// Encapsulate information regarding vectorization of a loop and its epilogue.
/// This information is meant to be updated and used across two stages of
/// epilogue vectorization.
struct EpilogueLoopVectorizationInfo {
  ElementCount MainLoopVF = ElementCount::getFixed(0);
  unsigned MainLoopUF = 0;
  ElementCount EpilogueVF = ElementCount::getFixed(0);
  unsigned EpilogueUF = 0;
  BasicBlock *MainLoopIterationCountCheck = nullptr;
  BasicBlock *EpilogueIterationCountCheck = nullptr;
  BasicBlock *SCEVSafetyCheck = nullptr;
  BasicBlock *MemSafetyCheck = nullptr;
  Value *TripCount = nullptr;
  Value *VectorTripCount = nullptr;

  EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF,
                                ElementCount EVF, unsigned EUF)
      : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) {
    assert(EUF == 1 &&
           "A high UF for the epilogue loop is likely not beneficial.");
  }
};

/// An extension of the inner loop vectorizer that creates a skeleton for a
/// vectorized loop that has its epilogue (residual) also vectorized.
/// The idea is to run the vplan on a given loop twice, first to set up the
/// skeleton and vectorize the main loop, and second to complete the skeleton
/// from the first step and vectorize the epilogue. This is achieved by
/// deriving two concrete strategy classes from this base class and invoking
/// them in succession from the loop vectorizer planner.
class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
public:
  InnerLoopAndEpilogueVectorizer(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI,
                            Checks),
        EPI(EPI) {}

  // Override this function to handle the more complex control flow around the
  // three loops.
  std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton() final override {
    return createEpilogueVectorizedLoopSkeleton();
  }

  /// The interface for creating a vectorized skeleton using one of two
  /// different strategies, each corresponding to one execution of the vplan
  /// as described above.
  virtual std::pair<BasicBlock *, Value *>
  createEpilogueVectorizedLoopSkeleton() = 0;

  /// Holds and updates state information required to vectorize the main loop
  /// and its epilogue in two separate passes. This setup helps us avoid
  /// regenerating and recomputing runtime safety checks. It also helps us to
  /// shorten the iteration-count-check path length for the cases where the
  /// iteration count of the loop is so small that the main vector loop is
  /// completely skipped.
  EpilogueLoopVectorizationInfo &EPI;
};

/// A specialized derived class of inner loop vectorizer that performs
/// vectorization of *main* loops in the process of vectorizing loops and their
/// epilogues.
class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerMainLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Check)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Check) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *main loop* strategy (i.e. the first pass of vplan execution).
  std::pair<BasicBlock *, Value *>
  createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check once for the main loop (when \p
  /// ForEpilogue is false) and once for the epilogue loop (when \p
  /// ForEpilogue is true).
  BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass,
                                             bool ForEpilogue);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};

// A specialized derived class of inner loop vectorizer that performs
// vectorization of *epilogue* loops in the process of vectorizing loops and
// their epilogues.
class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerEpilogueLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Checks) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *epilogue loop* strategy (i.e. the second pass of vplan execution).
  std::pair<BasicBlock *, Value *>
  createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check after the main vector loop has
  /// finished to see if there are any iterations left to execute by either
  /// the vector epilogue or the scalar epilogue.
  BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L,
                                                      BasicBlock *Bypass,
                                                      BasicBlock *Insert);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};
} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its
/// operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return I;

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I;

  for (Use &Op : I->operands()) {
    if (Instruction *OpInst = dyn_cast<Instruction>(Op))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }

  return I;
}

void InnerLoopVectorizer::setDebugLocFromInst(
    const Value *V, Optional<IRBuilderBase *> CustomBuilder) {
  IRBuilderBase *B = (CustomBuilder == None) ? &Builder : *CustomBuilder;
  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(V)) {
    const DILocation *DIL = Inst->getDebugLoc();

    // When an FSDiscriminator is enabled, we don't need to add the multiply
    // factors to the discriminators.
    if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
        !isa<DbgInfoIntrinsic>(Inst) && !EnableFSDiscriminator) {
      // FIXME: For scalable vectors, assume vscale=1.
      auto NewDIL =
          DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
      if (NewDIL)
        B->SetCurrentDebugLocation(NewDIL.getValue());
      else
        LLVM_DEBUG(dbgs() << "Failed to create new discriminator: "
                          << DIL->getFilename() << " Line: " << DIL->getLine());
    } else
      B->SetCurrentDebugLocation(DIL);
  } else
    B->SetCurrentDebugLocation(DebugLoc());
}

/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
/// is passed, the message relates to that particular instruction.
#ifndef NDEBUG
static void debugVectorizationMessage(const StringRef Prefix,
                                      const StringRef DebugMsg,
                                      Instruction *I) {
  dbgs() << "LV: " << Prefix << DebugMsg;
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}
#endif

/// Create an analysis remark that explains why vectorization failed
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
/// RemarkName is the identifier for the remark. If \p I is passed it is an
/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
/// the location of the remark. \return the remark object that can be
/// streamed to.
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
                                                   StringRef RemarkName,
                                                   Loop *TheLoop,
                                                   Instruction *I) {
  Value *CodeRegion = TheLoop->getHeader();
  DebugLoc DL = TheLoop->getStartLoc();

  if (I) {
    CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, fall back to
    // using the loop's.
    if (I->getDebugLoc())
      DL = I->getDebugLoc();
  }

  return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
}

namespace llvm {

/// Return a value for Step multiplied by VF.
Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
                       int64_t Step) {
  assert(Ty->isIntegerTy() && "Expected an integer step");
  Constant *StepVal = ConstantInt::get(Ty, Step * VF.getKnownMinValue());
  return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
}
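
// For illustration (example values, not additional requirements of the
// helpers): with an i64 type and Step = 2, a fixed VF of 4 makes
// createStepForVF return the constant i64 8, while a scalable VF of
// <vscale x 4> returns the runtime value 8 * vscale via CreateVScale;
// getRuntimeVF below does the same for an implicit step of 1.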
/// Return the runtime value for VF.
Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) {
  Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue());
  return VF.isScalable() ? B.CreateVScale(EC) : EC;
}

static Value *getRuntimeVFAsFloat(IRBuilderBase &B, Type *FTy,
                                  ElementCount VF) {
  assert(FTy->isFloatingPointTy() && "Expected floating point type!");
  Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits());
  Value *RuntimeVF = getRuntimeVF(B, IntTy, VF);
  return B.CreateUIToFP(RuntimeVF, FTy);
}

void reportVectorizationFailure(const StringRef DebugMsg,
                                const StringRef OREMsg, const StringRef ORETag,
                                OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                                Instruction *I) {
  LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(
      createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
      << "loop not vectorized: " << OREMsg);
}

void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
                             OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                             Instruction *I) {
  LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(
      createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
      << Msg);
}

} // end namespace llvm

#ifndef NDEBUG
/// \return string containing a file name and a line # for the given loop.
static std::string getDebugLocString(const Loop *L) {
  std::string Result;
  if (L) {
    raw_string_ostream OS(Result);
    if (const DebugLoc LoopDbgLoc = L->getStartLoc())
      LoopDbgLoc.print(OS);
    else
      // Just print the module name.
      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
    OS.flush();
  }
  return Result;
}
#endif

void InnerLoopVectorizer::addNewMetadata(Instruction *To,
                                         const Instruction *Orig) {
  // If the loop was versioned with memchecks, add the corresponding no-alias
  // metadata.
  if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
    LVer->annotateInstWithNoAlias(To, Orig);
}

void InnerLoopVectorizer::collectPoisonGeneratingRecipes(
    VPTransformState &State) {

  // Collect recipes in the backward slice of `Root` that may generate a poison
  // value that is used after vectorization.
  SmallPtrSet<VPRecipeBase *, 16> Visited;
  auto collectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
    SmallVector<VPRecipeBase *, 16> Worklist;
    Worklist.push_back(Root);

    // Traverse the backward slice of Root through its use-def chain.
    while (!Worklist.empty()) {
      VPRecipeBase *CurRec = Worklist.back();
      Worklist.pop_back();

      if (!Visited.insert(CurRec).second)
        continue;

      // Prune search if we find another recipe generating a widen memory
      // instruction. Widen memory instructions involved in address computation
      // will lead to gather/scatter instructions, which don't need to be
      // handled.
      if (isa<VPWidenMemoryInstructionRecipe>(CurRec) ||
          isa<VPInterleaveRecipe>(CurRec) ||
          isa<VPCanonicalIVPHIRecipe>(CurRec))
        continue;

      // This recipe contributes to the address computation of a widen
      // load/store. Collect recipe if its underlying instruction has
      // poison-generating flags.
      Instruction *Instr = CurRec->getUnderlyingInstr();
      if (Instr && Instr->hasPoisonGeneratingFlags())
        State.MayGeneratePoisonRecipes.insert(CurRec);

      // Add new definitions to the worklist.
      for (VPValue *operand : CurRec->operands())
        if (VPDef *OpDef = operand->getDef())
          Worklist.push_back(cast<VPRecipeBase>(OpDef));
    }
  });

  // Traverse all the recipes in the VPlan and collect the poison-generating
  // recipes in the backward slice starting at the address of a
  // VPWidenMemoryInstructionRecipe or VPInterleaveRecipe.
  auto Iter = depth_first(
      VPBlockRecursiveTraversalWrapper<VPBlockBase *>(State.Plan->getEntry()));
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
    for (VPRecipeBase &Recipe : *VPBB) {
      if (auto *WidenRec = dyn_cast<VPWidenMemoryInstructionRecipe>(&Recipe)) {
        Instruction *UnderlyingInstr = WidenRec->getUnderlyingInstr();
        VPDef *AddrDef = WidenRec->getAddr()->getDef();
        if (AddrDef && WidenRec->isConsecutive() && UnderlyingInstr &&
            Legal->blockNeedsPredication(UnderlyingInstr->getParent()))
          collectPoisonGeneratingInstrsInBackwardSlice(
              cast<VPRecipeBase>(AddrDef));
      } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
        VPDef *AddrDef = InterleaveRec->getAddr()->getDef();
        if (AddrDef) {
          // Check if any member of the interleave group needs predication.
          const InterleaveGroup<Instruction> *InterGroup =
              InterleaveRec->getInterleaveGroup();
          bool NeedPredication = false;
          for (int I = 0, NumMembers = InterGroup->getNumMembers();
               I < NumMembers; ++I) {
            Instruction *Member = InterGroup->getMember(I);
            if (Member)
              NeedPredication |=
                  Legal->blockNeedsPredication(Member->getParent());
          }

          if (NeedPredication)
            collectPoisonGeneratingInstrsInBackwardSlice(
                cast<VPRecipeBase>(AddrDef));
        }
      }
    }
  }
}

void InnerLoopVectorizer::addMetadata(Instruction *To,
                                      Instruction *From) {
  propagateMetadata(To, From);
  addNewMetadata(To, From);
}

void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
                                      Instruction *From) {
  for (Value *V : To) {
    if (Instruction *I = dyn_cast<Instruction>(V))
      addMetadata(I, From);
  }
}

PHINode *InnerLoopVectorizer::getReductionResumeValue(
    const RecurrenceDescriptor &RdxDesc) {
  auto It = ReductionResumeValues.find(&RdxDesc);
  assert(It != ReductionResumeValues.end() &&
         "Expected to find a resume value for the reduction.");
  return It->second;
}

namespace llvm {

// Loop vectorization cost-model hints how the scalar epilogue loop should be
// lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorization with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate,

  // Directive indicating we must either tail fold or not vectorize.
  CM_ScalarEpilogueNotAllowedUsePredicate
};

/// ElementCountComparator creates a total ordering for ElementCount
/// for the purposes of using it in a set structure.
struct ElementCountComparator {
  bool operator()(const ElementCount &LHS, const ElementCount &RHS) const {
    return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) <
           std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue());
  }
};
using ElementCountSet = SmallSet<ElementCount, 16, ElementCountComparator>;

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because of
/// a number of reasons. In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
class LoopVectorizationCostModel {
public:
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {}

  /// \return An upper bound for the vectorization factors (both fixed and
  /// scalable). If the factors are 0, vectorization and interleaving should be
  /// avoided up front.
  FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
  bool runtimeChecksRequired();

  /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every VF in \p CandidateVFs. If UserVF is not ZERO
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
  VectorizationFactor
  selectVectorizationFactor(const ElementCountSet &CandidateVFs);

  VectorizationFactor
  selectEpilogueVectorizationFactor(const ElementCount MaxVF,
                                    const LoopVectorizationPlanner &LVP);

  /// Set up cost-based decisions for user vectorization factor.
  /// \return true if the UserVF is a feasible VF to be chosen.
  bool selectUserVectorizationFactor(ElementCount UserVF) {
    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);
    return expectedCost(UserVF).first.isValid();
  }

  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// \return The desired interleave count.
  /// If interleave count has been specified by metadata it will be returned.
  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
  /// are the selected vectorization factor and the cost of the selected VF.
  unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);

  /// Memory access instruction may be vectorized in more than one way.
  /// Form of instruction after vectorization depends on cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decisions map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with widening decision in order to
  /// avoid redundant calculations.
  void setCostBasedWideningDecision(ElementCount VF);

  /// A struct that represents some properties of the register usage
  /// of a loop.
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
    /// Holds the maximum number of concurrent live intervals in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
  };

  /// \return Returns information about the register usages of the loop for the
  /// given vectorization factors.
  SmallVector<RegisterUsage, 8>
  calculateRegisterUsage(ArrayRef<ElementCount> VFs);

  /// Collect values we want to ignore in the cost model.
  void collectValuesToIgnore();

  /// Collect all element types in the loop for which widening is needed.
  void collectElementTypesForWidening();

  /// Split reductions into those that happen in the loop, and those that happen
  /// outside. In-loop reductions are collected into InLoopReductionChains.
  void collectInLoopReductions();

  /// Returns true if we should use strict in-order reductions for the given
  /// RdxDesc. This is true if the -force-ordered-reductions flag is passed,
  /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
  /// of FP operations.
  bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) {
    return !Hints->allowReordering() && RdxDesc.isOrdered();
  }

  /// \returns The smallest bitwidth each instruction can be represented with.
  /// The vector equivalents of these instructions should be truncated to this
  /// type.
  const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
    return MinBWs;
  }

  /// \returns True if it is more profitable to scalarize instruction \p I for
  /// vectorization factor \p VF.
  bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
    assert(VF.isVector() &&
           "Profitable to scalarize relevant only for VF > 1.");

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto Scalars = InstsToScalarize.find(VF);
    assert(Scalars != InstsToScalarize.end() &&
           "VF not yet analyzed for scalarization profitability");
    return Scalars->second.find(I) != Scalars->second.end();
  }

  /// Returns true if \p I is known to be uniform after vectorization.
  bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
    if (VF.isScalar())
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto UniformsPerVF = Uniforms.find(VF);
    assert(UniformsPerVF != Uniforms.end() &&
           "VF not yet analyzed for uniformity");
    return UniformsPerVF->second.count(I);
  }

  /// Returns true if \p I is known to be scalar after vectorization.
  bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
    if (VF.isScalar())
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto ScalarsPerVF = Scalars.find(VF);
    assert(ScalarsPerVF != Scalars.end() &&
           "Scalar values are not calculated for VF");
    return ScalarsPerVF->second.count(I);
  }

  /// \returns True if instruction \p I can be truncated to a smaller bitwidth
  /// for vectorization factor \p VF.
  bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
    return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
           !isProfitableToScalarize(I, VF) &&
           !isScalarAfterVectorization(I, VF);
  }

  /// Decision that was taken during cost calculation for memory instruction.
  enum InstWidening {
    CM_Unknown,
    CM_Widen,         // For consecutive accesses with stride +1.
    CM_Widen_Reverse, // For consecutive accesses with stride -1.
    CM_Interleave,
    CM_GatherScatter,
    CM_Scalarize
  };

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// instruction \p I and vector width \p VF.
  void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
                           InstructionCost Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
  }

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// interleaving group \p Grp and vector width \p VF.
  void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
                           ElementCount VF, InstWidening W,
                           InstructionCost Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    // Broadcast this decision to all instructions inside the group.
    // But the cost will be assigned to one instruction only.
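    // For example, for a group {A, B, C} with insert position B, decision W
    // and cost C, the loop below records (A,VF)->(W,0), (B,VF)->(W,C) and
    // (C,VF)->(W,0), so the group's cost is counted exactly once.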
1436 for (unsigned i = 0; i < Grp->getFactor(); ++i) { 1437 if (auto *I = Grp->getMember(i)) { 1438 if (Grp->getInsertPos() == I) 1439 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost); 1440 else 1441 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0); 1442 } 1443 } 1444 } 1445 1446 /// Return the cost model decision for the given instruction \p I and vector 1447 /// width \p VF. Return CM_Unknown if this instruction did not pass 1448 /// through the cost modeling. 1449 InstWidening getWideningDecision(Instruction *I, ElementCount VF) const { 1450 assert(VF.isVector() && "Expected VF to be a vector VF"); 1451 // Cost model is not run in the VPlan-native path - return conservative 1452 // result until this changes. 1453 if (EnableVPlanNativePath) 1454 return CM_GatherScatter; 1455 1456 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF); 1457 auto Itr = WideningDecisions.find(InstOnVF); 1458 if (Itr == WideningDecisions.end()) 1459 return CM_Unknown; 1460 return Itr->second.first; 1461 } 1462 1463 /// Return the vectorization cost for the given instruction \p I and vector 1464 /// width \p VF. 1465 InstructionCost getWideningCost(Instruction *I, ElementCount VF) { 1466 assert(VF.isVector() && "Expected VF >=2"); 1467 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF); 1468 assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() && 1469 "The cost is not calculated"); 1470 return WideningDecisions[InstOnVF].second; 1471 } 1472 1473 /// Return True if instruction \p I is an optimizable truncate whose operand 1474 /// is an induction variable. Such a truncate will be removed by adding a new 1475 /// induction variable with the destination type. 1476 bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) { 1477 // If the instruction is not a truncate, return false. 1478 auto *Trunc = dyn_cast<TruncInst>(I); 1479 if (!Trunc) 1480 return false; 1481 1482 // Get the source and destination types of the truncate. 1483 Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF); 1484 Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF); 1485 1486 // If the truncate is free for the given types, return false. Replacing a 1487 // free truncate with an induction variable would add an induction variable 1488 // update instruction to each iteration of the loop. We exclude from this 1489 // check the primary induction variable since it will need an update 1490 // instruction regardless. 1491 Value *Op = Trunc->getOperand(0); 1492 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy)) 1493 return false; 1494 1495 // If the truncated value is not an induction variable, return false. 1496 return Legal->isInductionPhi(Op); 1497 } 1498 1499 /// Collects the instructions to scalarize for each predicated instruction in 1500 /// the loop. 1501 void collectInstsToScalarize(ElementCount VF); 1502 1503 /// Collect Uniform and Scalar values for the given \p VF. 1504 /// The sets depend on CM decision for Load/Store instructions 1505 /// that may be vectorized as interleave, gather-scatter or scalarized. 1506 void collectUniformsAndScalars(ElementCount VF) { 1507 // Do the analysis once. 
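    // The results are cached per VF in the Uniforms and Scalars maps, so the
    // early return below makes repeated queries for an analyzed VF cheap.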
1508 if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end()) 1509 return; 1510 setCostBasedWideningDecision(VF); 1511 collectLoopUniforms(VF); 1512 collectLoopScalars(VF); 1513 } 1514 1515 /// Returns true if the target machine supports masked store operation 1516 /// for the given \p DataType and kind of access to \p Ptr. 1517 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const { 1518 return Legal->isConsecutivePtr(DataType, Ptr) && 1519 TTI.isLegalMaskedStore(DataType, Alignment); 1520 } 1521 1522 /// Returns true if the target machine supports masked load operation 1523 /// for the given \p DataType and kind of access to \p Ptr. 1524 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const { 1525 return Legal->isConsecutivePtr(DataType, Ptr) && 1526 TTI.isLegalMaskedLoad(DataType, Alignment); 1527 } 1528 1529 /// Returns true if the target machine can represent \p V as a masked gather 1530 /// or scatter operation. 1531 bool isLegalGatherOrScatter(Value *V, 1532 ElementCount VF = ElementCount::getFixed(1)) { 1533 bool LI = isa<LoadInst>(V); 1534 bool SI = isa<StoreInst>(V); 1535 if (!LI && !SI) 1536 return false; 1537 auto *Ty = getLoadStoreType(V); 1538 Align Align = getLoadStoreAlignment(V); 1539 if (VF.isVector()) 1540 Ty = VectorType::get(Ty, VF); 1541 return (LI && TTI.isLegalMaskedGather(Ty, Align)) || 1542 (SI && TTI.isLegalMaskedScatter(Ty, Align)); 1543 } 1544 1545 /// Returns true if the target machine supports all of the reduction 1546 /// variables found for the given VF. 1547 bool canVectorizeReductions(ElementCount VF) const { 1548 return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 1549 const RecurrenceDescriptor &RdxDesc = Reduction.second; 1550 return TTI.isLegalToVectorizeReduction(RdxDesc, VF); 1551 })); 1552 } 1553 1554 /// Returns true if \p I is an instruction that will be scalarized with 1555 /// predication when vectorizing \p I with vectorization factor \p VF. Such 1556 /// instructions include conditional stores and instructions that may divide 1557 /// by zero. 1558 bool isScalarWithPredication(Instruction *I, ElementCount VF) const; 1559 1560 // Returns true if \p I is an instruction that will be predicated either 1561 // through scalar predication or masked load/store or masked gather/scatter. 1562 // \p VF is the vectorization factor that will be used to vectorize \p I. 1563 // Superset of instructions that return true for isScalarWithPredication. 1564 bool isPredicatedInst(Instruction *I, ElementCount VF, 1565 bool IsKnownUniform = false) { 1566 // When we know the load is uniform and the original scalar loop was not 1567 // predicated we don't need to mark it as a predicated instruction. Any 1568 // vectorised blocks created when tail-folding are something artificial we 1569 // have introduced and we know there is always at least one active lane. 1570 // That's why we call Legal->blockNeedsPredication here because it doesn't 1571 // query tail-folding. 1572 if (IsKnownUniform && isa<LoadInst>(I) && 1573 !Legal->blockNeedsPredication(I->getParent())) 1574 return false; 1575 if (!blockNeedsPredicationForAnyReason(I->getParent())) 1576 return false; 1577 // Loads and stores that need some form of masked operation are predicated 1578 // instructions. 
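    // E.g. a store that is executed only under a non-uniform condition, or an
    // access that becomes masked because the tail of the loop is folded.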
1579 if (isa<LoadInst>(I) || isa<StoreInst>(I)) 1580 return Legal->isMaskRequired(I); 1581 return isScalarWithPredication(I, VF); 1582 } 1583 1584 /// Returns true if \p I is a memory instruction with consecutive memory 1585 /// access that can be widened. 1586 bool 1587 memoryInstructionCanBeWidened(Instruction *I, 1588 ElementCount VF = ElementCount::getFixed(1)); 1589 1590 /// Returns true if \p I is a memory instruction in an interleaved-group 1591 /// of memory accesses that can be vectorized with wide vector loads/stores 1592 /// and shuffles. 1593 bool 1594 interleavedAccessCanBeWidened(Instruction *I, 1595 ElementCount VF = ElementCount::getFixed(1)); 1596 1597 /// Check if \p Instr belongs to any interleaved access group. 1598 bool isAccessInterleaved(Instruction *Instr) { 1599 return InterleaveInfo.isInterleaved(Instr); 1600 } 1601 1602 /// Get the interleaved access group that \p Instr belongs to. 1603 const InterleaveGroup<Instruction> * 1604 getInterleavedAccessGroup(Instruction *Instr) { 1605 return InterleaveInfo.getInterleaveGroup(Instr); 1606 } 1607 1608 /// Returns true if we're required to use a scalar epilogue for at least 1609 /// the final iteration of the original loop. 1610 bool requiresScalarEpilogue(ElementCount VF) const { 1611 if (!isScalarEpilogueAllowed()) 1612 return false; 1613 // If we might exit from anywhere but the latch, must run the exiting 1614 // iteration in scalar form. 1615 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) 1616 return true; 1617 return VF.isVector() && InterleaveInfo.requiresScalarEpilogue(); 1618 } 1619 1620 /// Returns true if a scalar epilogue is not allowed due to optsize or a 1621 /// loop hint annotation. 1622 bool isScalarEpilogueAllowed() const { 1623 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed; 1624 } 1625 1626 /// Returns true if all loop blocks should be masked to fold tail loop. 1627 bool foldTailByMasking() const { return FoldTailByMasking; } 1628 1629 /// Returns true if the instructions in this block requires predication 1630 /// for any reason, e.g. because tail folding now requires a predicate 1631 /// or because the block in the original loop was predicated. 1632 bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const { 1633 return foldTailByMasking() || Legal->blockNeedsPredication(BB); 1634 } 1635 1636 /// A SmallMapVector to store the InLoop reduction op chains, mapping phi 1637 /// nodes to the chain of instructions representing the reductions. Uses a 1638 /// MapVector to ensure deterministic iteration order. 1639 using ReductionChainMap = 1640 SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>; 1641 1642 /// Return the chain of instructions representing an inloop reduction. 1643 const ReductionChainMap &getInLoopReductionChains() const { 1644 return InLoopReductionChains; 1645 } 1646 1647 /// Returns true if the Phi is part of an inloop reduction. 1648 bool isInLoopReduction(PHINode *Phi) const { 1649 return InLoopReductionChains.count(Phi); 1650 } 1651 1652 /// Estimate cost of an intrinsic call instruction CI if it were vectorized 1653 /// with factor VF. Return the cost of the instruction, including 1654 /// scalarization overhead if it's needed. 1655 InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const; 1656 1657 /// Estimate cost of a call instruction CI if it were vectorized with factor 1658 /// VF. Return the cost of the instruction, including scalarization overhead 1659 /// if it's needed. 
  /// The flag NeedToScalarize shows if the call needs to be scalarized, i.e.,
  /// either a vector version isn't available or it is too expensive.
  InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF,
                                    bool &NeedToScalarize) const;

  /// Returns true if the per-lane cost of VectorizationFactor A is lower than
  /// that of B.
  bool isMoreProfitable(const VectorizationFactor &A,
                        const VectorizationFactor &B) const;

  /// Invalidates decisions already taken by the cost model.
  void invalidateCostModelingDecisions() {
    WideningDecisions.clear();
    Uniforms.clear();
    Scalars.clear();
  }

private:
  unsigned NumPredStores = 0;

  /// Convenience function that returns the value of vscale_range iff
  /// vscale_range.min == vscale_range.max, and otherwise returns the value
  /// returned by the corresponding TLI method.
  Optional<unsigned> getVScaleForTuning() const;

  /// \return An upper bound for the vectorization factors for both
  /// fixed and scalable vectorization, where the minimum-known number of
  /// elements is a power-of-2 larger than zero. If scalable vectorization is
  /// disabled or unsupported, then the scalable part will be equal to
  /// ElementCount::getScalable(0).
  FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount,
                                           ElementCount UserVF,
                                           bool FoldTailByMasking);

  /// \return the maximized element count based on the target's vector
  /// registers and the loop trip-count, but limited to a maximum safe VF.
  /// This is a helper function of computeFeasibleMaxVF.
  /// FIXME: MaxSafeVF is currently passed by reference to avoid some obscure
  /// issue that occurred on one of the buildbots which cannot be reproduced
  /// without having access to the proprietary compiler (see comments on
  /// D98509). The issue is currently under investigation and this workaround
  /// will be removed as soon as possible.
  ElementCount getMaximizedVFForTarget(unsigned ConstTripCount,
                                       unsigned SmallestType,
                                       unsigned WidestType,
                                       const ElementCount &MaxSafeVF,
                                       bool FoldTailByMasking);

  /// \return the maximum legal scalable VF, based on the safe max number
  /// of elements.
  ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);

  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
  using VectorizationCostTy = std::pair<InstructionCost, bool>;

  /// Returns the expected execution cost. The unit of the cost does
  /// not matter because we use the 'cost' units to compare different
  /// vector widths. The cost that is returned is *not* normalized by
  /// the factor width. If \p Invalid is not nullptr, this function
  /// will add a pair(Instruction*, ElementCount) to \p Invalid for
  /// each instruction that has an Invalid cost for the given VF.
  using InstructionVFPair = std::pair<Instruction *, ElementCount>;
  VectorizationCostTy
  expectedCost(ElementCount VF,
               SmallVectorImpl<InstructionVFPair> *Invalid = nullptr);

  /// Returns the execution time cost of an instruction for a given vector
  /// width. Vector width of one means scalar.
  VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);

  /// The cost-computation logic from getInstructionCost which provides
  /// the vector type as an output parameter.
  InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
                                     Type *&VectorTy);

  /// Return the cost of instructions in an inloop reduction pattern, if I is
  /// part of that pattern.
  Optional<InstructionCost>
  getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
                          TTI::TargetCostKind CostKind);

  /// Calculate vectorization cost of memory instruction \p I.
  InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);

  /// The cost computation for scalarized memory instruction.
  InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);

  /// The cost computation for interleaving group of memory instructions.
  InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);

  /// The cost computation for Gather/Scatter instruction.
  InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);

  /// The cost computation for widening instruction \p I with consecutive
  /// memory access.
  InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);

  /// The cost calculation for Load/Store instruction \p I with uniform pointer -
  /// Load: scalar load + broadcast.
  /// Store: scalar store + (loop invariant value stored? 0 : extract of last
  /// element)
  InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);

  /// Estimate the overhead of scalarizing an instruction. This is a
  /// convenience wrapper for the type-based getScalarizationOverhead API.
  InstructionCost getScalarizationOverhead(Instruction *I,
                                           ElementCount VF) const;

  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
  bool isConsecutiveLoadOrStore(Instruction *I);

  /// Map of scalar integer values to the smallest bitwidth they can be legally
  /// represented as. The vector equivalents of these values should be truncated
  /// to this type.
  MapVector<Instruction *, uint64_t> MinBWs;

  /// A type representing the costs for instructions if they were to be
  /// scalarized rather than vectorized. The entries are Instruction-Cost
  /// pairs.
  using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;

  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as predicated blocks.
  SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;

  /// Records whether it is allowed to have the original scalar loop execute at
  /// least once. This may be needed as a fallback loop in case runtime
  /// aliasing/dependence checks fail, or to handle the tail/remainder
  /// iterations when the trip count is unknown or doesn't divide by the VF,
  /// or as a peel-loop to handle gaps in interleave-groups.
  /// Under optsize and when the trip count is very small we don't allow any
  /// iterations to execute in the scalar loop.
  ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;

  /// All blocks of the loop are to be masked to fold the tail of the scalar
  /// iterations.
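  /// For example, with VF = 4 and a trip count of 10, folding the tail by
  /// masking lets the last vector iteration cover iterations 8..11 with the
  /// lanes for 10 and 11 masked off, instead of running a scalar epilogue.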
1800 bool FoldTailByMasking = false; 1801 1802 /// A map holding scalar costs for different vectorization factors. The 1803 /// presence of a cost for an instruction in the mapping indicates that the 1804 /// instruction will be scalarized when vectorizing with the associated 1805 /// vectorization factor. The entries are VF-ScalarCostTy pairs. 1806 DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize; 1807 1808 /// Holds the instructions known to be uniform after vectorization. 1809 /// The data is collected per VF. 1810 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms; 1811 1812 /// Holds the instructions known to be scalar after vectorization. 1813 /// The data is collected per VF. 1814 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars; 1815 1816 /// Holds the instructions (address computations) that are forced to be 1817 /// scalarized. 1818 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars; 1819 1820 /// PHINodes of the reductions that should be expanded in-loop along with 1821 /// their associated chains of reduction operations, in program order from top 1822 /// (PHI) to bottom 1823 ReductionChainMap InLoopReductionChains; 1824 1825 /// A Map of inloop reduction operations and their immediate chain operand. 1826 /// FIXME: This can be removed once reductions can be costed correctly in 1827 /// vplan. This was added to allow quick lookup to the inloop operations, 1828 /// without having to loop through InLoopReductionChains. 1829 DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains; 1830 1831 /// Returns the expected difference in cost from scalarizing the expression 1832 /// feeding a predicated instruction \p PredInst. The instructions to 1833 /// scalarize and their scalar costs are collected in \p ScalarCosts. A 1834 /// non-negative return value implies the expression will be scalarized. 1835 /// Currently, only single-use chains are considered for scalarization. 1836 int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts, 1837 ElementCount VF); 1838 1839 /// Collect the instructions that are uniform after vectorization. An 1840 /// instruction is uniform if we represent it with a single scalar value in 1841 /// the vectorized loop corresponding to each vector iteration. Examples of 1842 /// uniform instructions include pointer operands of consecutive or 1843 /// interleaved memory accesses. Note that although uniformity implies an 1844 /// instruction will be scalar, the reverse is not true. In general, a 1845 /// scalarized instruction will be represented by VF scalar values in the 1846 /// vectorized loop, each corresponding to an iteration of the original 1847 /// scalar loop. 1848 void collectLoopUniforms(ElementCount VF); 1849 1850 /// Collect the instructions that are scalar after vectorization. An 1851 /// instruction is scalar if it is known to be uniform or will be scalarized 1852 /// during vectorization. collectLoopScalars should only add non-uniform nodes 1853 /// to the list if they are used by a load/store instruction that is marked as 1854 /// CM_Scalarize. Non-uniform scalarized instructions will be represented by 1855 /// VF values in the vectorized loop, each corresponding to an iteration of 1856 /// the original scalar loop. 1857 void collectLoopScalars(ElementCount VF); 1858 1859 /// Keeps cost model vectorization decision and cost for instructions. 1860 /// Right now it is used for memory instructions only. 
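  /// The key is an (instruction, VF) pair and the mapped value is the chosen
  /// InstWidening kind together with the InstructionCost computed for it.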
1861 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>, 1862 std::pair<InstWidening, InstructionCost>>; 1863 1864 DecisionList WideningDecisions; 1865 1866 /// Returns true if \p V is expected to be vectorized and it needs to be 1867 /// extracted. 1868 bool needsExtract(Value *V, ElementCount VF) const { 1869 Instruction *I = dyn_cast<Instruction>(V); 1870 if (VF.isScalar() || !I || !TheLoop->contains(I) || 1871 TheLoop->isLoopInvariant(I)) 1872 return false; 1873 1874 // Assume we can vectorize V (and hence we need extraction) if the 1875 // scalars are not computed yet. This can happen, because it is called 1876 // via getScalarizationOverhead from setCostBasedWideningDecision, before 1877 // the scalars are collected. That should be a safe assumption in most 1878 // cases, because we check if the operands have vectorizable types 1879 // beforehand in LoopVectorizationLegality. 1880 return Scalars.find(VF) == Scalars.end() || 1881 !isScalarAfterVectorization(I, VF); 1882 }; 1883 1884 /// Returns a range containing only operands needing to be extracted. 1885 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, 1886 ElementCount VF) const { 1887 return SmallVector<Value *, 4>(make_filter_range( 1888 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); 1889 } 1890 1891 /// Determines if we have the infrastructure to vectorize loop \p L and its 1892 /// epilogue, assuming the main loop is vectorized by \p VF. 1893 bool isCandidateForEpilogueVectorization(const Loop &L, 1894 const ElementCount VF) const; 1895 1896 /// Returns true if epilogue vectorization is considered profitable, and 1897 /// false otherwise. 1898 /// \p VF is the vectorization factor chosen for the original loop. 1899 bool isEpilogueVectorizationProfitable(const ElementCount VF) const; 1900 1901 public: 1902 /// The loop that we evaluate. 1903 Loop *TheLoop; 1904 1905 /// Predicated scalar evolution analysis. 1906 PredicatedScalarEvolution &PSE; 1907 1908 /// Loop Info analysis. 1909 LoopInfo *LI; 1910 1911 /// Vectorization legality. 1912 LoopVectorizationLegality *Legal; 1913 1914 /// Vector target information. 1915 const TargetTransformInfo &TTI; 1916 1917 /// Target Library Info. 1918 const TargetLibraryInfo *TLI; 1919 1920 /// Demanded bits analysis. 1921 DemandedBits *DB; 1922 1923 /// Assumption cache. 1924 AssumptionCache *AC; 1925 1926 /// Interface to emit optimization remarks. 1927 OptimizationRemarkEmitter *ORE; 1928 1929 const Function *TheFunction; 1930 1931 /// Loop Vectorize Hint. 1932 const LoopVectorizeHints *Hints; 1933 1934 /// The interleave access information contains groups of interleaved accesses 1935 /// with the same stride and close to each other. 1936 InterleavedAccessInfo &InterleaveInfo; 1937 1938 /// Values to ignore in the cost model. 1939 SmallPtrSet<const Value *, 16> ValuesToIgnore; 1940 1941 /// Values to ignore in the cost model when VF > 1. 1942 SmallPtrSet<const Value *, 16> VecValuesToIgnore; 1943 1944 /// All element types found in the loop. 1945 SmallPtrSet<Type *, 16> ElementTypesInLoop; 1946 1947 /// Profitable vector factors. 1948 SmallVector<VectorizationFactor, 8> ProfitableVFs; 1949 }; 1950 } // end namespace llvm 1951 1952 /// Helper struct to manage generating runtime checks for vectorization. 1953 /// 1954 /// The runtime checks are created up-front in temporary blocks to allow better 1955 /// estimating the cost and un-linked from the existing IR. After deciding to 1956 /// vectorize, the checks are moved back. 
If deciding not to vectorize, the 1957 /// temporary blocks are completely removed. 1958 class GeneratedRTChecks { 1959 /// Basic block which contains the generated SCEV checks, if any. 1960 BasicBlock *SCEVCheckBlock = nullptr; 1961 1962 /// The value representing the result of the generated SCEV checks. If it is 1963 /// nullptr, either no SCEV checks have been generated or they have been used. 1964 Value *SCEVCheckCond = nullptr; 1965 1966 /// Basic block which contains the generated memory runtime checks, if any. 1967 BasicBlock *MemCheckBlock = nullptr; 1968 1969 /// The value representing the result of the generated memory runtime checks. 1970 /// If it is nullptr, either no memory runtime checks have been generated or 1971 /// they have been used. 1972 Value *MemRuntimeCheckCond = nullptr; 1973 1974 DominatorTree *DT; 1975 LoopInfo *LI; 1976 1977 SCEVExpander SCEVExp; 1978 SCEVExpander MemCheckExp; 1979 1980 public: 1981 GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI, 1982 const DataLayout &DL) 1983 : DT(DT), LI(LI), SCEVExp(SE, DL, "scev.check"), 1984 MemCheckExp(SE, DL, "scev.check") {} 1985 1986 /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can 1987 /// accurately estimate the cost of the runtime checks. The blocks are 1988 /// un-linked from the IR and is added back during vector code generation. If 1989 /// there is no vector code generation, the check blocks are removed 1990 /// completely. 1991 void Create(Loop *L, const LoopAccessInfo &LAI, 1992 const SCEVUnionPredicate &UnionPred) { 1993 1994 BasicBlock *LoopHeader = L->getHeader(); 1995 BasicBlock *Preheader = L->getLoopPreheader(); 1996 1997 // Use SplitBlock to create blocks for SCEV & memory runtime checks to 1998 // ensure the blocks are properly added to LoopInfo & DominatorTree. Those 1999 // may be used by SCEVExpander. The blocks will be un-linked from their 2000 // predecessors and removed from LI & DT at the end of the function. 2001 if (!UnionPred.isAlwaysTrue()) { 2002 SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI, 2003 nullptr, "vector.scevcheck"); 2004 2005 SCEVCheckCond = SCEVExp.expandCodeForPredicate( 2006 &UnionPred, SCEVCheckBlock->getTerminator()); 2007 } 2008 2009 const auto &RtPtrChecking = *LAI.getRuntimePointerChecking(); 2010 if (RtPtrChecking.Need) { 2011 auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader; 2012 MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr, 2013 "vector.memcheck"); 2014 2015 MemRuntimeCheckCond = 2016 addRuntimeChecks(MemCheckBlock->getTerminator(), L, 2017 RtPtrChecking.getChecks(), MemCheckExp); 2018 assert(MemRuntimeCheckCond && 2019 "no RT checks generated although RtPtrChecking " 2020 "claimed checks are required"); 2021 } 2022 2023 if (!MemCheckBlock && !SCEVCheckBlock) 2024 return; 2025 2026 // Unhook the temporary block with the checks, update various places 2027 // accordingly. 
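    // Roughly: redirect uses of the check blocks back to the preheader, move
    // their terminators back into the preheader, cap each check block with an
    // 'unreachable', and drop the blocks from DT and LI. They are re-linked
    // later by emitSCEVChecks/emitMemRuntimeChecks if we decide to vectorize.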
2028 if (SCEVCheckBlock) 2029 SCEVCheckBlock->replaceAllUsesWith(Preheader); 2030 if (MemCheckBlock) 2031 MemCheckBlock->replaceAllUsesWith(Preheader); 2032 2033 if (SCEVCheckBlock) { 2034 SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 2035 new UnreachableInst(Preheader->getContext(), SCEVCheckBlock); 2036 Preheader->getTerminator()->eraseFromParent(); 2037 } 2038 if (MemCheckBlock) { 2039 MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 2040 new UnreachableInst(Preheader->getContext(), MemCheckBlock); 2041 Preheader->getTerminator()->eraseFromParent(); 2042 } 2043 2044 DT->changeImmediateDominator(LoopHeader, Preheader); 2045 if (MemCheckBlock) { 2046 DT->eraseNode(MemCheckBlock); 2047 LI->removeBlock(MemCheckBlock); 2048 } 2049 if (SCEVCheckBlock) { 2050 DT->eraseNode(SCEVCheckBlock); 2051 LI->removeBlock(SCEVCheckBlock); 2052 } 2053 } 2054 2055 /// Remove the created SCEV & memory runtime check blocks & instructions, if 2056 /// unused. 2057 ~GeneratedRTChecks() { 2058 SCEVExpanderCleaner SCEVCleaner(SCEVExp); 2059 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp); 2060 if (!SCEVCheckCond) 2061 SCEVCleaner.markResultUsed(); 2062 2063 if (!MemRuntimeCheckCond) 2064 MemCheckCleaner.markResultUsed(); 2065 2066 if (MemRuntimeCheckCond) { 2067 auto &SE = *MemCheckExp.getSE(); 2068 // Memory runtime check generation creates compares that use expanded 2069 // values. Remove them before running the SCEVExpanderCleaners. 2070 for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) { 2071 if (MemCheckExp.isInsertedInstruction(&I)) 2072 continue; 2073 SE.forgetValue(&I); 2074 I.eraseFromParent(); 2075 } 2076 } 2077 MemCheckCleaner.cleanup(); 2078 SCEVCleaner.cleanup(); 2079 2080 if (SCEVCheckCond) 2081 SCEVCheckBlock->eraseFromParent(); 2082 if (MemRuntimeCheckCond) 2083 MemCheckBlock->eraseFromParent(); 2084 } 2085 2086 /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and 2087 /// adjusts the branches to branch to the vector preheader or \p Bypass, 2088 /// depending on the generated condition. 2089 BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass, 2090 BasicBlock *LoopVectorPreHeader, 2091 BasicBlock *LoopExitBlock) { 2092 if (!SCEVCheckCond) 2093 return nullptr; 2094 if (auto *C = dyn_cast<ConstantInt>(SCEVCheckCond)) 2095 if (C->isZero()) 2096 return nullptr; 2097 2098 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2099 2100 BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock); 2101 // Create new preheader for vector loop. 2102 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 2103 PL->addBasicBlockToLoop(SCEVCheckBlock, *LI); 2104 2105 SCEVCheckBlock->getTerminator()->eraseFromParent(); 2106 SCEVCheckBlock->moveBefore(LoopVectorPreHeader); 2107 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2108 SCEVCheckBlock); 2109 2110 DT->addNewBlock(SCEVCheckBlock, Pred); 2111 DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock); 2112 2113 ReplaceInstWithInst( 2114 SCEVCheckBlock->getTerminator(), 2115 BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheckCond)); 2116 // Mark the check as used, to prevent it from being removed during cleanup. 2117 SCEVCheckCond = nullptr; 2118 return SCEVCheckBlock; 2119 } 2120 2121 /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts 2122 /// the branches to branch to the vector preheader or \p Bypass, depending on 2123 /// the generated condition. 
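  /// Afterwards MemRuntimeCheckCond is cleared so that the destructor treats
  /// the check block as used and does not erase it.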
2124 BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass, 2125 BasicBlock *LoopVectorPreHeader) { 2126 // Check if we generated code that checks in runtime if arrays overlap. 2127 if (!MemRuntimeCheckCond) 2128 return nullptr; 2129 2130 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2131 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2132 MemCheckBlock); 2133 2134 DT->addNewBlock(MemCheckBlock, Pred); 2135 DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock); 2136 MemCheckBlock->moveBefore(LoopVectorPreHeader); 2137 2138 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 2139 PL->addBasicBlockToLoop(MemCheckBlock, *LI); 2140 2141 ReplaceInstWithInst( 2142 MemCheckBlock->getTerminator(), 2143 BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond)); 2144 MemCheckBlock->getTerminator()->setDebugLoc( 2145 Pred->getTerminator()->getDebugLoc()); 2146 2147 // Mark the check as used, to prevent it from being removed during cleanup. 2148 MemRuntimeCheckCond = nullptr; 2149 return MemCheckBlock; 2150 } 2151 }; 2152 2153 // Return true if \p OuterLp is an outer loop annotated with hints for explicit 2154 // vectorization. The loop needs to be annotated with #pragma omp simd 2155 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the 2156 // vector length information is not provided, vectorization is not considered 2157 // explicit. Interleave hints are not allowed either. These limitations will be 2158 // relaxed in the future. 2159 // Please, note that we are currently forced to abuse the pragma 'clang 2160 // vectorize' semantics. This pragma provides *auto-vectorization hints* 2161 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' 2162 // provides *explicit vectorization hints* (LV can bypass legal checks and 2163 // assume that vectorization is legal). However, both hints are implemented 2164 // using the same metadata (llvm.loop.vectorize, processed by 2165 // LoopVectorizeHints). This will be fixed in the future when the native IR 2166 // representation for pragma 'omp simd' is introduced. 2167 static bool isExplicitVecOuterLoop(Loop *OuterLp, 2168 OptimizationRemarkEmitter *ORE) { 2169 assert(!OuterLp->isInnermost() && "This is not an outer loop"); 2170 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); 2171 2172 // Only outer loops with an explicit vectorization hint are supported. 2173 // Unannotated outer loops are ignored. 2174 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) 2175 return false; 2176 2177 Function *Fn = OuterLp->getHeader()->getParent(); 2178 if (!Hints.allowVectorization(Fn, OuterLp, 2179 true /*VectorizeOnlyWhenForced*/)) { 2180 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); 2181 return false; 2182 } 2183 2184 if (Hints.getInterleave() > 1) { 2185 // TODO: Interleave support is future work. 2186 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " 2187 "outer loops.\n"); 2188 Hints.emitRemarkWithHints(); 2189 return false; 2190 } 2191 2192 return true; 2193 } 2194 2195 static void collectSupportedLoops(Loop &L, LoopInfo *LI, 2196 OptimizationRemarkEmitter *ORE, 2197 SmallVectorImpl<Loop *> &V) { 2198 // Collect inner loops and outer loops without irreducible control flow. For 2199 // now, only collect outer loops that have explicit vectorization hints. If we 2200 // are stress testing the VPlan H-CFG construction, we collect the outermost 2201 // loop of every loop nest. 
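  // For example, on the VPlan-native path an outer loop annotated with
  // '#pragma omp simd simdlen(4)' is collected here, and the early return
  // below means its inner loops are currently not collected separately.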
2202 if (L.isInnermost() || VPlanBuildStressTest || 2203 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { 2204 LoopBlocksRPO RPOT(&L); 2205 RPOT.perform(LI); 2206 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { 2207 V.push_back(&L); 2208 // TODO: Collect inner loops inside marked outer loops in case 2209 // vectorization fails for the outer loop. Do not invoke 2210 // 'containsIrreducibleCFG' again for inner loops when the outer loop is 2211 // already known to be reducible. We can use an inherited attribute for 2212 // that. 2213 return; 2214 } 2215 } 2216 for (Loop *InnerL : L) 2217 collectSupportedLoops(*InnerL, LI, ORE, V); 2218 } 2219 2220 namespace { 2221 2222 /// The LoopVectorize Pass. 2223 struct LoopVectorize : public FunctionPass { 2224 /// Pass identification, replacement for typeid 2225 static char ID; 2226 2227 LoopVectorizePass Impl; 2228 2229 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false, 2230 bool VectorizeOnlyWhenForced = false) 2231 : FunctionPass(ID), 2232 Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) { 2233 initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); 2234 } 2235 2236 bool runOnFunction(Function &F) override { 2237 if (skipFunction(F)) 2238 return false; 2239 2240 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 2241 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 2242 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 2243 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 2244 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); 2245 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); 2246 auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr; 2247 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); 2248 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 2249 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); 2250 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); 2251 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 2252 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 2253 2254 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 2255 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; 2256 2257 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC, 2258 GetLAA, *ORE, PSI).MadeAnyChange; 2259 } 2260 2261 void getAnalysisUsage(AnalysisUsage &AU) const override { 2262 AU.addRequired<AssumptionCacheTracker>(); 2263 AU.addRequired<BlockFrequencyInfoWrapperPass>(); 2264 AU.addRequired<DominatorTreeWrapperPass>(); 2265 AU.addRequired<LoopInfoWrapperPass>(); 2266 AU.addRequired<ScalarEvolutionWrapperPass>(); 2267 AU.addRequired<TargetTransformInfoWrapperPass>(); 2268 AU.addRequired<AAResultsWrapperPass>(); 2269 AU.addRequired<LoopAccessLegacyAnalysis>(); 2270 AU.addRequired<DemandedBitsWrapperPass>(); 2271 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 2272 AU.addRequired<InjectTLIMappingsLegacy>(); 2273 2274 // We currently do not preserve loopinfo/dominator analyses with outer loop 2275 // vectorization. Until this is addressed, mark these analyses as preserved 2276 // only for non-VPlan-native path. 2277 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 
2278 if (!EnableVPlanNativePath) { 2279 AU.addPreserved<LoopInfoWrapperPass>(); 2280 AU.addPreserved<DominatorTreeWrapperPass>(); 2281 } 2282 2283 AU.addPreserved<BasicAAWrapperPass>(); 2284 AU.addPreserved<GlobalsAAWrapperPass>(); 2285 AU.addRequired<ProfileSummaryInfoWrapperPass>(); 2286 } 2287 }; 2288 2289 } // end anonymous namespace 2290 2291 //===----------------------------------------------------------------------===// 2292 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 2293 // LoopVectorizationCostModel and LoopVectorizationPlanner. 2294 //===----------------------------------------------------------------------===// 2295 2296 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { 2297 // We need to place the broadcast of invariant variables outside the loop, 2298 // but only if it's proven safe to do so. Else, broadcast will be inside 2299 // vector loop body. 2300 Instruction *Instr = dyn_cast<Instruction>(V); 2301 bool SafeToHoist = OrigLoop->isLoopInvariant(V) && 2302 (!Instr || 2303 DT->dominates(Instr->getParent(), LoopVectorPreHeader)); 2304 // Place the code for broadcasting invariant variables in the new preheader. 2305 IRBuilder<>::InsertPointGuard Guard(Builder); 2306 if (SafeToHoist) 2307 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2308 2309 // Broadcast the scalar into all locations in the vector. 2310 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); 2311 2312 return Shuf; 2313 } 2314 2315 /// This function adds 2316 /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...) 2317 /// to each vector element of Val. The sequence starts at StartIndex. 2318 /// \p Opcode is relevant for FP induction variable. 2319 static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step, 2320 Instruction::BinaryOps BinOp, ElementCount VF, 2321 IRBuilderBase &Builder) { 2322 assert(VF.isVector() && "only vector VFs are supported"); 2323 2324 // Create and check the types. 2325 auto *ValVTy = cast<VectorType>(Val->getType()); 2326 ElementCount VLen = ValVTy->getElementCount(); 2327 2328 Type *STy = Val->getType()->getScalarType(); 2329 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && 2330 "Induction Step must be an integer or FP"); 2331 assert(Step->getType() == STy && "Step has wrong type"); 2332 2333 SmallVector<Constant *, 8> Indices; 2334 2335 // Create a vector of consecutive numbers from zero to VF. 2336 VectorType *InitVecValVTy = ValVTy; 2337 Type *InitVecValSTy = STy; 2338 if (STy->isFloatingPointTy()) { 2339 InitVecValSTy = 2340 IntegerType::get(STy->getContext(), STy->getScalarSizeInBits()); 2341 InitVecValVTy = VectorType::get(InitVecValSTy, VLen); 2342 } 2343 Value *InitVec = Builder.CreateStepVector(InitVecValVTy); 2344 2345 // Splat the StartIdx 2346 Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx); 2347 2348 if (STy->isIntegerTy()) { 2349 InitVec = Builder.CreateAdd(InitVec, StartIdxSplat); 2350 Step = Builder.CreateVectorSplat(VLen, Step); 2351 assert(Step->getType() == Val->getType() && "Invalid step vec"); 2352 // FIXME: The newly created binary instructions should contain nsw/nuw 2353 // flags, which can be found from the original scalar operations. 2354 Step = Builder.CreateMul(InitVec, Step); 2355 return Builder.CreateAdd(Val, Step, "induction"); 2356 } 2357 2358 // Floating point induction. 
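  // For example, with VF = 4, Val = <x, x, x, x>, StartIdx = 0, Step = s and
  // BinOp = FAdd, this produces <x, x + s, x + 2*s, x + 3*s>.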
2359 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && 2360 "Binary Opcode should be specified for FP induction"); 2361 InitVec = Builder.CreateUIToFP(InitVec, ValVTy); 2362 InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat); 2363 2364 Step = Builder.CreateVectorSplat(VLen, Step); 2365 Value *MulOp = Builder.CreateFMul(InitVec, Step); 2366 return Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); 2367 } 2368 2369 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( 2370 const InductionDescriptor &II, Value *Step, Value *Start, 2371 Instruction *EntryVal, VPValue *Def, VPTransformState &State) { 2372 IRBuilderBase &Builder = State.Builder; 2373 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 2374 "Expected either an induction phi-node or a truncate of it!"); 2375 2376 // Construct the initial value of the vector IV in the vector loop preheader 2377 auto CurrIP = Builder.saveIP(); 2378 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2379 if (isa<TruncInst>(EntryVal)) { 2380 assert(Start->getType()->isIntegerTy() && 2381 "Truncation requires an integer type"); 2382 auto *TruncType = cast<IntegerType>(EntryVal->getType()); 2383 Step = Builder.CreateTrunc(Step, TruncType); 2384 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); 2385 } 2386 2387 Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0); 2388 Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start); 2389 Value *SteppedStart = getStepVector( 2390 SplatStart, Zero, Step, II.getInductionOpcode(), State.VF, State.Builder); 2391 2392 // We create vector phi nodes for both integer and floating-point induction 2393 // variables. Here, we determine the kind of arithmetic we will perform. 2394 Instruction::BinaryOps AddOp; 2395 Instruction::BinaryOps MulOp; 2396 if (Step->getType()->isIntegerTy()) { 2397 AddOp = Instruction::Add; 2398 MulOp = Instruction::Mul; 2399 } else { 2400 AddOp = II.getInductionOpcode(); 2401 MulOp = Instruction::FMul; 2402 } 2403 2404 // Multiply the vectorization factor by the step using integer or 2405 // floating-point arithmetic as appropriate. 2406 Type *StepType = Step->getType(); 2407 Value *RuntimeVF; 2408 if (Step->getType()->isFloatingPointTy()) 2409 RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF); 2410 else 2411 RuntimeVF = getRuntimeVF(Builder, StepType, State.VF); 2412 Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF); 2413 2414 // Create a vector splat to use in the induction update. 2415 // 2416 // FIXME: If the step is non-constant, we create the vector splat with 2417 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't 2418 // handle a constant vector splat. 2419 Value *SplatVF = isa<Constant>(Mul) 2420 ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul)) 2421 : Builder.CreateVectorSplat(State.VF, Mul); 2422 Builder.restoreIP(CurrIP); 2423 2424 // We may need to add the step a number of times, depending on the unroll 2425 // factor. The last of those goes into the PHI. 
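  // For example, with UF = 2 the phi value "vec.ind" itself is used for part
  // 0, vec.ind + VF*Step ("step.add") for part 1, and vec.ind + 2*VF*Step
  // ("vec.ind.next") is the value fed back into the phi from the latch.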
2426 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", 2427 &*LoopVectorBody->getFirstInsertionPt()); 2428 VecInd->setDebugLoc(EntryVal->getDebugLoc()); 2429 Instruction *LastInduction = VecInd; 2430 for (unsigned Part = 0; Part < UF; ++Part) { 2431 State.set(Def, LastInduction, Part); 2432 2433 if (isa<TruncInst>(EntryVal)) 2434 addMetadata(LastInduction, EntryVal); 2435 2436 LastInduction = cast<Instruction>( 2437 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")); 2438 LastInduction->setDebugLoc(EntryVal->getDebugLoc()); 2439 } 2440 2441 // Move the last step to the end of the latch block. This ensures consistent 2442 // placement of all induction updates. 2443 auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 2444 auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator()); 2445 LastInduction->moveBefore(Br); 2446 LastInduction->setName("vec.ind.next"); 2447 2448 VecInd->addIncoming(SteppedStart, LoopVectorPreHeader); 2449 VecInd->addIncoming(LastInduction, LoopVectorLatch); 2450 } 2451 2452 /// Compute scalar induction steps. \p ScalarIV is the scalar induction 2453 /// variable on which to base the steps, \p Step is the size of the step, and 2454 /// \p EntryVal is the value from the original loop that maps to the steps. 2455 /// Note that \p EntryVal doesn't have to be an induction variable - it 2456 /// can also be a truncate instruction. 2457 static void buildScalarSteps(Value *ScalarIV, Value *Step, 2458 Instruction *EntryVal, 2459 const InductionDescriptor &ID, VPValue *Def, 2460 VPTransformState &State) { 2461 IRBuilderBase &Builder = State.Builder; 2462 // We shouldn't have to build scalar steps if we aren't vectorizing. 2463 assert(State.VF.isVector() && "VF should be greater than one"); 2464 // Get the value type and ensure it and the step have the same integer type. 2465 Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); 2466 assert(ScalarIVTy == Step->getType() && 2467 "Val and Step should have the same type"); 2468 2469 // We build scalar steps for both integer and floating-point induction 2470 // variables. Here, we determine the kind of arithmetic we will perform. 2471 Instruction::BinaryOps AddOp; 2472 Instruction::BinaryOps MulOp; 2473 if (ScalarIVTy->isIntegerTy()) { 2474 AddOp = Instruction::Add; 2475 MulOp = Instruction::Mul; 2476 } else { 2477 AddOp = ID.getInductionOpcode(); 2478 MulOp = Instruction::FMul; 2479 } 2480 2481 // Determine the number of scalars we need to generate for each unroll 2482 // iteration. 2483 bool FirstLaneOnly = vputils::onlyFirstLaneUsed(Def); 2484 unsigned Lanes = FirstLaneOnly ? 1 : State.VF.getKnownMinValue(); 2485 // Compute the scalar steps and save the results in State. 
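  // For example, for a fixed-width integer IV with VF = 4, UF = 2 and step S,
  // part 0 records the lane values ScalarIV + {0,1,2,3}*S and part 1 records
  // ScalarIV + {4,5,6,7}*S.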
2486 Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(), 2487 ScalarIVTy->getScalarSizeInBits()); 2488 Type *VecIVTy = nullptr; 2489 Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr; 2490 if (!FirstLaneOnly && State.VF.isScalable()) { 2491 VecIVTy = VectorType::get(ScalarIVTy, State.VF); 2492 UnitStepVec = 2493 Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF)); 2494 SplatStep = Builder.CreateVectorSplat(State.VF, Step); 2495 SplatIV = Builder.CreateVectorSplat(State.VF, ScalarIV); 2496 } 2497 2498 for (unsigned Part = 0; Part < State.UF; ++Part) { 2499 Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part); 2500 2501 if (!FirstLaneOnly && State.VF.isScalable()) { 2502 auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0); 2503 auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec); 2504 if (ScalarIVTy->isFloatingPointTy()) 2505 InitVec = Builder.CreateSIToFP(InitVec, VecIVTy); 2506 auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep); 2507 auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul); 2508 State.set(Def, Add, Part); 2509 // It's useful to record the lane values too for the known minimum number 2510 // of elements so we do those below. This improves the code quality when 2511 // trying to extract the first element, for example. 2512 } 2513 2514 if (ScalarIVTy->isFloatingPointTy()) 2515 StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy); 2516 2517 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 2518 Value *StartIdx = Builder.CreateBinOp( 2519 AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane)); 2520 // The step returned by `createStepForVF` is a runtime-evaluated value 2521 // when VF is scalable. Otherwise, it should be folded into a Constant. 2522 assert((State.VF.isScalable() || isa<Constant>(StartIdx)) && 2523 "Expected StartIdx to be folded to a constant when VF is not " 2524 "scalable"); 2525 auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step); 2526 auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul); 2527 State.set(Def, Add, VPIteration(Part, Lane)); 2528 } 2529 } 2530 } 2531 2532 // Generate code for the induction step. Note that induction steps are 2533 // required to be loop-invariant 2534 static Value *CreateStepValue(const SCEV *Step, ScalarEvolution &SE, 2535 Instruction *InsertBefore, 2536 Loop *OrigLoop = nullptr) { 2537 const DataLayout &DL = SE.getDataLayout(); 2538 assert((!OrigLoop || SE.isLoopInvariant(Step, OrigLoop)) && 2539 "Induction step should be loop invariant"); 2540 if (auto *E = dyn_cast<SCEVUnknown>(Step)) 2541 return E->getValue(); 2542 2543 SCEVExpander Exp(SE, DL, "induction"); 2544 return Exp.expandCodeFor(Step, Step->getType(), InsertBefore); 2545 } 2546 2547 /// Compute the transformed value of Index at offset StartValue using step 2548 /// StepValue. 2549 /// For integer induction, returns StartValue + Index * StepValue. 2550 /// For pointer induction, returns StartValue[Index * StepValue]. 2551 /// FIXME: The newly created binary instructions should contain nsw/nuw 2552 /// flags, which can be found from the original scalar operations. 2553 static Value *emitTransformedIndex(IRBuilderBase &B, Value *Index, Value *Step, 2554 const InductionDescriptor &ID) { 2555 2556 auto StartValue = ID.getStartValue(); 2557 assert(Index->getType()->getScalarType() == Step->getType() && 2558 "Index scalar type does not match StepValue type"); 2559 2560 // Note: the IR at this point is broken. 
We cannot use SE to create any new 2561 // SCEV and then expand it, hoping that SCEV's simplification will give us 2562 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may 2563 // lead to various SCEV crashes. So all we can do is to use builder and rely 2564 // on InstCombine for future simplifications. Here we handle some trivial 2565 // cases only. 2566 auto CreateAdd = [&B](Value *X, Value *Y) { 2567 assert(X->getType() == Y->getType() && "Types don't match!"); 2568 if (auto *CX = dyn_cast<ConstantInt>(X)) 2569 if (CX->isZero()) 2570 return Y; 2571 if (auto *CY = dyn_cast<ConstantInt>(Y)) 2572 if (CY->isZero()) 2573 return X; 2574 return B.CreateAdd(X, Y); 2575 }; 2576 2577 // We allow X to be a vector type, in which case Y will potentially be 2578 // splatted into a vector with the same element count. 2579 auto CreateMul = [&B](Value *X, Value *Y) { 2580 assert(X->getType()->getScalarType() == Y->getType() && 2581 "Types don't match!"); 2582 if (auto *CX = dyn_cast<ConstantInt>(X)) 2583 if (CX->isOne()) 2584 return Y; 2585 if (auto *CY = dyn_cast<ConstantInt>(Y)) 2586 if (CY->isOne()) 2587 return X; 2588 VectorType *XVTy = dyn_cast<VectorType>(X->getType()); 2589 if (XVTy && !isa<VectorType>(Y->getType())) 2590 Y = B.CreateVectorSplat(XVTy->getElementCount(), Y); 2591 return B.CreateMul(X, Y); 2592 }; 2593 2594 switch (ID.getKind()) { 2595 case InductionDescriptor::IK_IntInduction: { 2596 assert(!isa<VectorType>(Index->getType()) && 2597 "Vector indices not supported for integer inductions yet"); 2598 assert(Index->getType() == StartValue->getType() && 2599 "Index type does not match StartValue type"); 2600 if (isa<ConstantInt>(Step) && cast<ConstantInt>(Step)->isMinusOne()) 2601 return B.CreateSub(StartValue, Index); 2602 auto *Offset = CreateMul(Index, Step); 2603 return CreateAdd(StartValue, Offset); 2604 } 2605 case InductionDescriptor::IK_PtrInduction: { 2606 assert(isa<Constant>(Step) && 2607 "Expected constant step for pointer induction"); 2608 return B.CreateGEP(ID.getElementType(), StartValue, CreateMul(Index, Step)); 2609 } 2610 case InductionDescriptor::IK_FpInduction: { 2611 assert(!isa<VectorType>(Index->getType()) && 2612 "Vector indices not supported for FP inductions yet"); 2613 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); 2614 auto InductionBinOp = ID.getInductionBinOp(); 2615 assert(InductionBinOp && 2616 (InductionBinOp->getOpcode() == Instruction::FAdd || 2617 InductionBinOp->getOpcode() == Instruction::FSub) && 2618 "Original bin op should be defined for FP induction"); 2619 2620 Value *MulExp = B.CreateFMul(Step, Index); 2621 return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, 2622 "induction"); 2623 } 2624 case InductionDescriptor::IK_NoInduction: 2625 return nullptr; 2626 } 2627 llvm_unreachable("invalid enum"); 2628 } 2629 2630 void InnerLoopVectorizer::widenIntOrFpInduction( 2631 PHINode *IV, VPWidenIntOrFpInductionRecipe *Def, VPTransformState &State, 2632 Value *CanonicalIV) { 2633 Value *Start = Def->getStartValue()->getLiveInIRValue(); 2634 const InductionDescriptor &ID = Def->getInductionDescriptor(); 2635 TruncInst *Trunc = Def->getTruncInst(); 2636 IRBuilderBase &Builder = State.Builder; 2637 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); 2638 assert(!State.VF.isZero() && "VF must be non-zero"); 2639 2640 // The value from the original loop to which we are mapping the new induction 2641 // variable. 2642 Instruction *EntryVal = Trunc ? 
cast<Instruction>(Trunc) : IV; 2643 2644 auto &DL = EntryVal->getModule()->getDataLayout(); 2645 2646 // Generate code for the induction step. Note that induction steps are 2647 // required to be loop-invariant 2648 auto CreateStepValue = [&](const SCEV *Step) -> Value * { 2649 assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) && 2650 "Induction step should be loop invariant"); 2651 if (PSE.getSE()->isSCEVable(IV->getType())) { 2652 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 2653 return Exp.expandCodeFor(Step, Step->getType(), 2654 State.CFG.VectorPreHeader->getTerminator()); 2655 } 2656 return cast<SCEVUnknown>(Step)->getValue(); 2657 }; 2658 2659 // The scalar value to broadcast. This is derived from the canonical 2660 // induction variable. If a truncation type is given, truncate the canonical 2661 // induction variable and step. Otherwise, derive these values from the 2662 // induction descriptor. 2663 auto CreateScalarIV = [&](Value *&Step) -> Value * { 2664 Value *ScalarIV = CanonicalIV; 2665 Type *NeededType = IV->getType(); 2666 if (!Def->isCanonical() || ScalarIV->getType() != NeededType) { 2667 ScalarIV = 2668 NeededType->isIntegerTy() 2669 ? Builder.CreateSExtOrTrunc(ScalarIV, NeededType) 2670 : Builder.CreateCast(Instruction::SIToFP, ScalarIV, NeededType); 2671 ScalarIV = emitTransformedIndex(Builder, ScalarIV, Step, ID); 2672 ScalarIV->setName("offset.idx"); 2673 } 2674 if (Trunc) { 2675 auto *TruncType = cast<IntegerType>(Trunc->getType()); 2676 assert(Step->getType()->isIntegerTy() && 2677 "Truncation requires an integer step"); 2678 ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType); 2679 Step = Builder.CreateTrunc(Step, TruncType); 2680 } 2681 return ScalarIV; 2682 }; 2683 2684 // Fast-math-flags propagate from the original induction instruction. 2685 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 2686 if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp())) 2687 Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags()); 2688 2689 // Now do the actual transformations, and start with creating the step value. 2690 Value *Step = CreateStepValue(ID.getStep()); 2691 if (State.VF.isScalar()) { 2692 Value *ScalarIV = CreateScalarIV(Step); 2693 Type *ScalarTy = IntegerType::get(ScalarIV->getContext(), 2694 Step->getType()->getScalarSizeInBits()); 2695 2696 Instruction::BinaryOps IncOp = ID.getInductionOpcode(); 2697 if (IncOp == Instruction::BinaryOpsEnd) 2698 IncOp = Instruction::Add; 2699 for (unsigned Part = 0; Part < UF; ++Part) { 2700 Value *StartIdx = ConstantInt::get(ScalarTy, Part); 2701 Instruction::BinaryOps MulOp = Instruction::Mul; 2702 if (Step->getType()->isFloatingPointTy()) { 2703 StartIdx = Builder.CreateUIToFP(StartIdx, Step->getType()); 2704 MulOp = Instruction::FMul; 2705 } 2706 2707 Value *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step); 2708 Value *EntryPart = Builder.CreateBinOp(IncOp, ScalarIV, Mul, "induction"); 2709 State.set(Def, EntryPart, Part); 2710 if (Trunc) { 2711 assert(!Step->getType()->isFloatingPointTy() && 2712 "fp inductions shouldn't be truncated"); 2713 addMetadata(EntryPart, Trunc); 2714 } 2715 } 2716 return; 2717 } 2718 2719 // Create a new independent vector induction variable, if one is needed. 2720 if (Def->needsVectorIV()) 2721 createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, State); 2722 2723 if (Def->needsScalarIV()) { 2724 // Create scalar steps that can be used by instructions we will later 2725 // scalarize. 
Note that the addition of the scalar steps will not increase 2726 // the number of instructions in the loop in the common case prior to 2727 // InstCombine. We will be trading one vector extract for each scalar step. 2728 Value *ScalarIV = CreateScalarIV(Step); 2729 buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, State); 2730 } 2731 } 2732 2733 void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def, 2734 const VPIteration &Instance, 2735 VPTransformState &State) { 2736 Value *ScalarInst = State.get(Def, Instance); 2737 Value *VectorValue = State.get(Def, Instance.Part); 2738 VectorValue = Builder.CreateInsertElement( 2739 VectorValue, ScalarInst, 2740 Instance.Lane.getAsRuntimeExpr(State.Builder, VF)); 2741 State.set(Def, VectorValue, Instance.Part); 2742 } 2743 2744 // Return whether we allow using masked interleave-groups (for dealing with 2745 // strided loads/stores that reside in predicated blocks, or for dealing 2746 // with gaps). 2747 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { 2748 // If an override option has been passed in for interleaved accesses, use it. 2749 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) 2750 return EnableMaskedInterleavedMemAccesses; 2751 2752 return TTI.enableMaskedInterleavedAccessVectorization(); 2753 } 2754 2755 // Try to vectorize the interleave group that \p Instr belongs to. 2756 // 2757 // E.g. Translate following interleaved load group (factor = 3): 2758 // for (i = 0; i < N; i+=3) { 2759 // R = Pic[i]; // Member of index 0 2760 // G = Pic[i+1]; // Member of index 1 2761 // B = Pic[i+2]; // Member of index 2 2762 // ... // do something to R, G, B 2763 // } 2764 // To: 2765 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2766 // %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements 2767 // %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements 2768 // %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements 2769 // 2770 // Or translate following interleaved store group (factor = 3): 2771 // for (i = 0; i < N; i+=3) { 2772 // ... do something to R, G, B 2773 // Pic[i] = R; // Member of index 0 2774 // Pic[i+1] = G; // Member of index 1 2775 // Pic[i+2] = B; // Member of index 2 2776 // } 2777 // To: 2778 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> 2779 // %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u> 2780 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, 2781 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements 2782 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B 2783 void InnerLoopVectorizer::vectorizeInterleaveGroup( 2784 const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs, 2785 VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues, 2786 VPValue *BlockInMask) { 2787 Instruction *Instr = Group->getInsertPos(); 2788 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2789 2790 // Prepare for the vector type of the interleaved load/store. 2791 Type *ScalarTy = getLoadStoreType(Instr); 2792 unsigned InterleaveFactor = Group->getFactor(); 2793 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2794 auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor); 2795 2796 // Prepare for the new pointers. 2797 SmallVector<Value *, 2> AddrParts; 2798 unsigned Index = Group->getIndex(Instr); 2799 2800 // TODO: extend the masked interleaved-group support to reversed access. 
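// As an illustration (reusing the R,G,B load group above, with assumed VF = 4
// and factor = 3): if the insert position happens to be the G access (member
// index 1), Group->getIndex returns 1, and each per-part address computed
// below is stepped back by one element so that every wide access starts at
// the R member of the tuple.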
2801 assert((!BlockInMask || !Group->isReverse()) && 2802 "Reversed masked interleave-group not supported."); 2803 2804 // If the group is reverse, adjust the index to refer to the last vector lane 2805 // instead of the first. We adjust the index from the first vector lane, 2806 // rather than directly getting the pointer for lane VF - 1, because the 2807 // pointer operand of the interleaved access is supposed to be uniform. For 2808 // uniform instructions, we're only required to generate a value for the 2809 // first vector lane in each unroll iteration. 2810 if (Group->isReverse()) 2811 Index += (VF.getKnownMinValue() - 1) * Group->getFactor(); 2812 2813 for (unsigned Part = 0; Part < UF; Part++) { 2814 Value *AddrPart = State.get(Addr, VPIteration(Part, 0)); 2815 setDebugLocFromInst(AddrPart); 2816 2817 // Notice current instruction could be any index. Need to adjust the address 2818 // to the member of index 0. 2819 // 2820 // E.g. a = A[i+1]; // Member of index 1 (Current instruction) 2821 // b = A[i]; // Member of index 0 2822 // Current pointer is pointed to A[i+1], adjust it to A[i]. 2823 // 2824 // E.g. A[i+1] = a; // Member of index 1 2825 // A[i] = b; // Member of index 0 2826 // A[i+2] = c; // Member of index 2 (Current instruction) 2827 // Current pointer is pointed to A[i+2], adjust it to A[i]. 2828 2829 bool InBounds = false; 2830 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts())) 2831 InBounds = gep->isInBounds(); 2832 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index)); 2833 cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds); 2834 2835 // Cast to the vector pointer type. 2836 unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace(); 2837 Type *PtrTy = VecTy->getPointerTo(AddressSpace); 2838 AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy)); 2839 } 2840 2841 setDebugLocFromInst(Instr); 2842 Value *PoisonVec = PoisonValue::get(VecTy); 2843 2844 Value *MaskForGaps = nullptr; 2845 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) { 2846 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); 2847 assert(MaskForGaps && "Mask for Gaps is required but it is null"); 2848 } 2849 2850 // Vectorize the interleaved load group. 2851 if (isa<LoadInst>(Instr)) { 2852 // For each unroll part, create a wide load for the group. 2853 SmallVector<Value *, 2> NewLoads; 2854 for (unsigned Part = 0; Part < UF; Part++) { 2855 Instruction *NewLoad; 2856 if (BlockInMask || MaskForGaps) { 2857 assert(useMaskedInterleavedAccesses(*TTI) && 2858 "masked interleaved groups are not allowed."); 2859 Value *GroupMask = MaskForGaps; 2860 if (BlockInMask) { 2861 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2862 Value *ShuffledMask = Builder.CreateShuffleVector( 2863 BlockInMaskPart, 2864 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2865 "interleaved.mask"); 2866 GroupMask = MaskForGaps 2867 ? Builder.CreateBinOp(Instruction::And, ShuffledMask, 2868 MaskForGaps) 2869 : ShuffledMask; 2870 } 2871 NewLoad = 2872 Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(), 2873 GroupMask, PoisonVec, "wide.masked.vec"); 2874 } 2875 else 2876 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], 2877 Group->getAlign(), "wide.vec"); 2878 Group->addMetadata(NewLoad); 2879 NewLoads.push_back(NewLoad); 2880 } 2881 2882 // For each member in the group, shuffle out the appropriate data from the 2883 // wide loads. 
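// As a sketch with assumed VF = 4 and factor = 3 (matching the R,G,B example
// above), createStrideMask(I, 3, 4) yields <0, 3, 6, 9> for member 0,
// <1, 4, 7, 10> for member 1 and <2, 5, 8, 11> for member 2, so each shuffle
// below strides through the wide load to collect the elements of one member.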
2884 unsigned J = 0; 2885 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2886 Instruction *Member = Group->getMember(I); 2887 2888 // Skip the gaps in the group. 2889 if (!Member) 2890 continue; 2891 2892 auto StrideMask = 2893 createStrideMask(I, InterleaveFactor, VF.getKnownMinValue()); 2894 for (unsigned Part = 0; Part < UF; Part++) { 2895 Value *StridedVec = Builder.CreateShuffleVector( 2896 NewLoads[Part], StrideMask, "strided.vec"); 2897 2898 // If this member has different type, cast the result type. 2899 if (Member->getType() != ScalarTy) { 2900 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2901 VectorType *OtherVTy = VectorType::get(Member->getType(), VF); 2902 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); 2903 } 2904 2905 if (Group->isReverse()) 2906 StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse"); 2907 2908 State.set(VPDefs[J], StridedVec, Part); 2909 } 2910 ++J; 2911 } 2912 return; 2913 } 2914 2915 // The sub vector type for current instruction. 2916 auto *SubVT = VectorType::get(ScalarTy, VF); 2917 2918 // Vectorize the interleaved store group. 2919 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); 2920 assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) && 2921 "masked interleaved groups are not allowed."); 2922 assert((!MaskForGaps || !VF.isScalable()) && 2923 "masking gaps for scalable vectors is not yet supported."); 2924 for (unsigned Part = 0; Part < UF; Part++) { 2925 // Collect the stored vector from each member. 2926 SmallVector<Value *, 4> StoredVecs; 2927 for (unsigned i = 0; i < InterleaveFactor; i++) { 2928 assert((Group->getMember(i) || MaskForGaps) && 2929 "Fail to get a member from an interleaved store group"); 2930 Instruction *Member = Group->getMember(i); 2931 2932 // Skip the gaps in the group. 2933 if (!Member) { 2934 Value *Undef = PoisonValue::get(SubVT); 2935 StoredVecs.push_back(Undef); 2936 continue; 2937 } 2938 2939 Value *StoredVec = State.get(StoredValues[i], Part); 2940 2941 if (Group->isReverse()) 2942 StoredVec = Builder.CreateVectorReverse(StoredVec, "reverse"); 2943 2944 // If this member has different type, cast it to a unified type. 2945 2946 if (StoredVec->getType() != SubVT) 2947 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL); 2948 2949 StoredVecs.push_back(StoredVec); 2950 } 2951 2952 // Concatenate all vectors into a wide vector. 2953 Value *WideVec = concatenateVectors(Builder, StoredVecs); 2954 2955 // Interleave the elements in the wide vector. 2956 Value *IVec = Builder.CreateShuffleVector( 2957 WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor), 2958 "interleaved.vec"); 2959 2960 Instruction *NewStoreInstr; 2961 if (BlockInMask || MaskForGaps) { 2962 Value *GroupMask = MaskForGaps; 2963 if (BlockInMask) { 2964 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2965 Value *ShuffledMask = Builder.CreateShuffleVector( 2966 BlockInMaskPart, 2967 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2968 "interleaved.mask"); 2969 GroupMask = MaskForGaps ? 
Builder.CreateBinOp(Instruction::And, 2970 ShuffledMask, MaskForGaps) 2971 : ShuffledMask; 2972 } 2973 NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part], 2974 Group->getAlign(), GroupMask); 2975 } else 2976 NewStoreInstr = 2977 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign()); 2978 2979 Group->addMetadata(NewStoreInstr); 2980 } 2981 } 2982 2983 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, 2984 VPReplicateRecipe *RepRecipe, 2985 const VPIteration &Instance, 2986 bool IfPredicateInstr, 2987 VPTransformState &State) { 2988 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 2989 2990 // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for 2991 // the first lane and part. 2992 if (isa<NoAliasScopeDeclInst>(Instr)) 2993 if (!Instance.isFirstIteration()) 2994 return; 2995 2996 setDebugLocFromInst(Instr); 2997 2998 // Does this instruction return a value ? 2999 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 3000 3001 Instruction *Cloned = Instr->clone(); 3002 if (!IsVoidRetTy) 3003 Cloned->setName(Instr->getName() + ".cloned"); 3004 3005 // If the scalarized instruction contributes to the address computation of a 3006 // widen masked load/store which was in a basic block that needed predication 3007 // and is not predicated after vectorization, we can't propagate 3008 // poison-generating flags (nuw/nsw, exact, inbounds, etc.). The scalarized 3009 // instruction could feed a poison value to the base address of the widen 3010 // load/store. 3011 if (State.MayGeneratePoisonRecipes.contains(RepRecipe)) 3012 Cloned->dropPoisonGeneratingFlags(); 3013 3014 State.Builder.SetInsertPoint(Builder.GetInsertBlock(), 3015 Builder.GetInsertPoint()); 3016 // Replace the operands of the cloned instructions with their scalar 3017 // equivalents in the new loop. 3018 for (auto &I : enumerate(RepRecipe->operands())) { 3019 auto InputInstance = Instance; 3020 VPValue *Operand = I.value(); 3021 VPReplicateRecipe *OperandR = dyn_cast<VPReplicateRecipe>(Operand); 3022 if (OperandR && OperandR->isUniform()) 3023 InputInstance.Lane = VPLane::getFirstLane(); 3024 Cloned->setOperand(I.index(), State.get(Operand, InputInstance)); 3025 } 3026 addNewMetadata(Cloned, Instr); 3027 3028 // Place the cloned scalar in the new loop. 3029 Builder.Insert(Cloned); 3030 3031 State.set(RepRecipe, Cloned, Instance); 3032 3033 // If we just cloned a new assumption, add it the assumption cache. 3034 if (auto *II = dyn_cast<AssumeInst>(Cloned)) 3035 AC->registerAssumption(II); 3036 3037 // End if-block. 3038 if (IfPredicateInstr) 3039 PredicatedInstructions.push_back(Cloned); 3040 } 3041 3042 void InnerLoopVectorizer::createHeaderBranch(Loop *L) { 3043 BasicBlock *Header = L->getHeader(); 3044 assert(!L->getLoopLatch() && "loop should not have a latch at this point"); 3045 3046 IRBuilder<> B(Header->getTerminator()); 3047 Instruction *OldInst = 3048 getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()); 3049 setDebugLocFromInst(OldInst, &B); 3050 3051 // Connect the header to the exit and header blocks and replace the old 3052 // terminator. 3053 B.CreateCondBr(B.getTrue(), L->getUniqueExitBlock(), Header); 3054 3055 // Now we have two terminators. Remove the old one from the block. 
3056 Header->getTerminator()->eraseFromParent(); 3057 } 3058 3059 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { 3060 if (TripCount) 3061 return TripCount; 3062 3063 assert(L && "Create Trip Count for null loop."); 3064 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 3065 // Find the loop boundaries. 3066 ScalarEvolution *SE = PSE.getSE(); 3067 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 3068 assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && 3069 "Invalid loop count"); 3070 3071 Type *IdxTy = Legal->getWidestInductionType(); 3072 assert(IdxTy && "No type for induction"); 3073 3074 // The exit count might have the type of i64 while the phi is i32. This can 3075 // happen if we have an induction variable that is sign extended before the 3076 // compare. The only way that we get a backedge taken count is that the 3077 // induction variable was signed and as such will not overflow. In such a case 3078 // truncation is legal. 3079 if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) > 3080 IdxTy->getPrimitiveSizeInBits()) 3081 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy); 3082 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); 3083 3084 // Get the total trip count from the count by adding 1. 3085 const SCEV *ExitCount = SE->getAddExpr( 3086 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 3087 3088 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); 3089 3090 // Expand the trip count and place the new instructions in the preheader. 3091 // Notice that the pre-header does not change, only the loop body. 3092 SCEVExpander Exp(*SE, DL, "induction"); 3093 3094 // Count holds the overall loop count (N). 3095 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), 3096 L->getLoopPreheader()->getTerminator()); 3097 3098 if (TripCount->getType()->isPointerTy()) 3099 TripCount = 3100 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", 3101 L->getLoopPreheader()->getTerminator()); 3102 3103 return TripCount; 3104 } 3105 3106 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { 3107 if (VectorTripCount) 3108 return VectorTripCount; 3109 3110 Value *TC = getOrCreateTripCount(L); 3111 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 3112 3113 Type *Ty = TC->getType(); 3114 // This is where we can make the step a runtime constant. 3115 Value *Step = createStepForVF(Builder, Ty, VF, UF); 3116 3117 // If the tail is to be folded by masking, round the number of iterations N 3118 // up to a multiple of Step instead of rounding down. This is done by first 3119 // adding Step-1 and then rounding down. Note that it's ok if this addition 3120 // overflows: the vector induction variable will eventually wrap to zero given 3121 // that it starts at zero and its Step is a power of two; the loop will then 3122 // exit, with the last early-exit vector comparison also producing all-true. 3123 if (Cost->foldTailByMasking()) { 3124 assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && 3125 "VF*UF must be a power of 2 when folding tail by masking"); 3126 Value *NumLanes = getRuntimeVF(Builder, Ty, VF * UF); 3127 TC = Builder.CreateAdd( 3128 TC, Builder.CreateSub(NumLanes, ConstantInt::get(Ty, 1)), "n.rnd.up"); 3129 } 3130 3131 // Now we need to generate the expression for the part of the loop that the 3132 // vectorized body will execute. 
This is equal to N - (N % Step) if scalar 3133 // iterations are not required for correctness, or N - Step, otherwise. Step 3134 // is equal to the vectorization factor (number of SIMD elements) times the 3135 // unroll factor (number of SIMD instructions). 3136 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); 3137 3138 // There are cases where we *must* run at least one iteration in the remainder 3139 // loop. See the cost model for when this can happen. If the step evenly 3140 // divides the trip count, we set the remainder to be equal to the step. If 3141 // the step does not evenly divide the trip count, no adjustment is necessary 3142 // since there will already be scalar iterations. Note that the minimum 3143 // iterations check ensures that N >= Step. 3144 if (Cost->requiresScalarEpilogue(VF)) { 3145 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); 3146 R = Builder.CreateSelect(IsZero, Step, R); 3147 } 3148 3149 VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); 3150 3151 return VectorTripCount; 3152 } 3153 3154 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, 3155 const DataLayout &DL) { 3156 // Verify that V is a vector type with same number of elements as DstVTy. 3157 auto *DstFVTy = cast<FixedVectorType>(DstVTy); 3158 unsigned VF = DstFVTy->getNumElements(); 3159 auto *SrcVecTy = cast<FixedVectorType>(V->getType()); 3160 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); 3161 Type *SrcElemTy = SrcVecTy->getElementType(); 3162 Type *DstElemTy = DstFVTy->getElementType(); 3163 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 3164 "Vector elements must have same size"); 3165 3166 // Do a direct cast if element types are castable. 3167 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 3168 return Builder.CreateBitOrPointerCast(V, DstFVTy); 3169 } 3170 // V cannot be directly casted to desired vector type. 3171 // May happen when V is a floating point vector but DstVTy is a vector of 3172 // pointers or vice-versa. Handle this using a two-step bitcast using an 3173 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 3174 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 3175 "Only one type should be a pointer type"); 3176 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 3177 "Only one type should be a floating point type"); 3178 Type *IntTy = 3179 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 3180 auto *VecIntTy = FixedVectorType::get(IntTy, VF); 3181 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 3182 return Builder.CreateBitOrPointerCast(CastVal, DstFVTy); 3183 } 3184 3185 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, 3186 BasicBlock *Bypass) { 3187 Value *Count = getOrCreateTripCount(L); 3188 // Reuse existing vector loop preheader for TC checks. 3189 // Note that new preheader block is generated for vector loop. 3190 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 3191 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 3192 3193 // Generate code to check if the loop's trip count is less than VF * UF, or 3194 // equal to it in case a scalar epilogue is required; this implies that the 3195 // vector trip count is zero. This check also covers the case where adding one 3196 // to the backedge-taken count overflowed leading to an incorrect trip count 3197 // of zero. 
In this case we will also jump to the scalar loop. 3198 auto P = Cost->requiresScalarEpilogue(VF) ? ICmpInst::ICMP_ULE 3199 : ICmpInst::ICMP_ULT; 3200 3201 // If tail is to be folded, vector loop takes care of all iterations. 3202 Value *CheckMinIters = Builder.getFalse(); 3203 if (!Cost->foldTailByMasking()) { 3204 Value *Step = createStepForVF(Builder, Count->getType(), VF, UF); 3205 CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check"); 3206 } 3207 // Create new preheader for vector loop. 3208 LoopVectorPreHeader = 3209 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, 3210 "vector.ph"); 3211 3212 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 3213 DT->getNode(Bypass)->getIDom()) && 3214 "TC check is expected to dominate Bypass"); 3215 3216 // Update dominator for Bypass & LoopExit (if needed). 3217 DT->changeImmediateDominator(Bypass, TCCheckBlock); 3218 if (!Cost->requiresScalarEpilogue(VF)) 3219 // If there is an epilogue which must run, there's no edge from the 3220 // middle block to exit blocks and thus no need to update the immediate 3221 // dominator of the exit blocks. 3222 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 3223 3224 ReplaceInstWithInst( 3225 TCCheckBlock->getTerminator(), 3226 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 3227 LoopBypassBlocks.push_back(TCCheckBlock); 3228 } 3229 3230 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { 3231 3232 BasicBlock *const SCEVCheckBlock = 3233 RTChecks.emitSCEVChecks(L, Bypass, LoopVectorPreHeader, LoopExitBlock); 3234 if (!SCEVCheckBlock) 3235 return nullptr; 3236 3237 assert(!(SCEVCheckBlock->getParent()->hasOptSize() || 3238 (OptForSizeBasedOnProfile && 3239 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) && 3240 "Cannot SCEV check stride or overflow when optimizing for size"); 3241 3242 3243 // Update dominator only if this is first RT check. 3244 if (LoopBypassBlocks.empty()) { 3245 DT->changeImmediateDominator(Bypass, SCEVCheckBlock); 3246 if (!Cost->requiresScalarEpilogue(VF)) 3247 // If there is an epilogue which must run, there's no edge from the 3248 // middle block to exit blocks and thus no need to update the immediate 3249 // dominator of the exit blocks. 3250 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock); 3251 } 3252 3253 LoopBypassBlocks.push_back(SCEVCheckBlock); 3254 AddedSafetyChecks = true; 3255 return SCEVCheckBlock; 3256 } 3257 3258 BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, 3259 BasicBlock *Bypass) { 3260 // VPlan-native path does not do any analysis for runtime checks currently. 3261 if (EnableVPlanNativePath) 3262 return nullptr; 3263 3264 BasicBlock *const MemCheckBlock = 3265 RTChecks.emitMemRuntimeChecks(L, Bypass, LoopVectorPreHeader); 3266 3267 // Check if we generated code that checks in runtime if arrays overlap. We put 3268 // the checks into a separate block to make the more common case of few 3269 // elements faster. 
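// For example (a hypothetical loop, not taken from this function):
//   for (i = 0; i < n; ++i)
//     a[i] = b[i] + c[i];
// the emitted block compares the address ranges accessed through 'a', 'b' and
// 'c'; if any of them may overlap, the branch bypasses the vector loop and
// execution falls back to the scalar loop.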
3270 if (!MemCheckBlock) 3271 return nullptr; 3272 3273 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) { 3274 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && 3275 "Cannot emit memory checks when optimizing for size, unless forced " 3276 "to vectorize."); 3277 ORE->emit([&]() { 3278 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize", 3279 L->getStartLoc(), L->getHeader()) 3280 << "Code-size may be reduced by not forcing " 3281 "vectorization, or by source-code modifications " 3282 "eliminating the need for runtime checks " 3283 "(e.g., adding 'restrict')."; 3284 }); 3285 } 3286 3287 LoopBypassBlocks.push_back(MemCheckBlock); 3288 3289 AddedSafetyChecks = true; 3290 3291 // We currently don't use LoopVersioning for the actual loop cloning but we 3292 // still use it to add the noalias metadata. 3293 LVer = std::make_unique<LoopVersioning>( 3294 *Legal->getLAI(), 3295 Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, 3296 DT, PSE.getSE()); 3297 LVer->prepareNoAliasMetadata(); 3298 return MemCheckBlock; 3299 } 3300 3301 Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { 3302 LoopScalarBody = OrigLoop->getHeader(); 3303 LoopVectorPreHeader = OrigLoop->getLoopPreheader(); 3304 assert(LoopVectorPreHeader && "Invalid loop structure"); 3305 LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr 3306 assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF)) && 3307 "multiple exit loop without required epilogue?"); 3308 3309 LoopMiddleBlock = 3310 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 3311 LI, nullptr, Twine(Prefix) + "middle.block"); 3312 LoopScalarPreHeader = 3313 SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI, 3314 nullptr, Twine(Prefix) + "scalar.ph"); 3315 3316 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3317 3318 // Set up the middle block terminator. Two cases: 3319 // 1) If we know that we must execute the scalar epilogue, emit an 3320 // unconditional branch. 3321 // 2) Otherwise, we must have a single unique exit block (due to how we 3322 // implement the multiple exit case). In this case, set up a conditonal 3323 // branch from the middle block to the loop scalar preheader, and the 3324 // exit block. completeLoopSkeleton will update the condition to use an 3325 // iteration check, if required to decide whether to execute the remainder. 3326 BranchInst *BrInst = Cost->requiresScalarEpilogue(VF) ? 3327 BranchInst::Create(LoopScalarPreHeader) : 3328 BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, 3329 Builder.getTrue()); 3330 BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3331 ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst); 3332 3333 // We intentionally don't let SplitBlock to update LoopInfo since 3334 // LoopVectorBody should belong to another loop than LoopVectorPreHeader. 3335 // LoopVectorBody is explicitly added to the correct place few lines later. 3336 LoopVectorBody = 3337 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 3338 nullptr, nullptr, Twine(Prefix) + "vector.body"); 3339 3340 // Update dominator for loop exit. 3341 if (!Cost->requiresScalarEpilogue(VF)) 3342 // If there is an epilogue which must run, there's no edge from the 3343 // middle block to exit blocks and thus no need to update the immediate 3344 // dominator of the exit blocks. 
3345 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); 3346 3347 // Create and register the new vector loop. 3348 Loop *Lp = LI->AllocateLoop(); 3349 Loop *ParentLoop = OrigLoop->getParentLoop(); 3350 3351 // Insert the new loop into the loop nest and register the new basic blocks 3352 // before calling any utilities such as SCEV that require valid LoopInfo. 3353 if (ParentLoop) { 3354 ParentLoop->addChildLoop(Lp); 3355 } else { 3356 LI->addTopLevelLoop(Lp); 3357 } 3358 Lp->addBasicBlockToLoop(LoopVectorBody, *LI); 3359 return Lp; 3360 } 3361 3362 void InnerLoopVectorizer::createInductionResumeValues( 3363 Loop *L, std::pair<BasicBlock *, Value *> AdditionalBypass) { 3364 assert(((AdditionalBypass.first && AdditionalBypass.second) || 3365 (!AdditionalBypass.first && !AdditionalBypass.second)) && 3366 "Inconsistent information about additional bypass."); 3367 3368 Value *VectorTripCount = getOrCreateVectorTripCount(L); 3369 assert(VectorTripCount && L && "Expected valid arguments"); 3370 // We are going to resume the execution of the scalar loop. 3371 // Go over all of the induction variables that we found and fix the 3372 // PHIs that are left in the scalar version of the loop. 3373 // The starting values of PHI nodes depend on the counter of the last 3374 // iteration in the vectorized loop. 3375 // If we come from a bypass edge then we need to start from the original 3376 // start value. 3377 Instruction *OldInduction = Legal->getPrimaryInduction(); 3378 for (auto &InductionEntry : Legal->getInductionVars()) { 3379 PHINode *OrigPhi = InductionEntry.first; 3380 InductionDescriptor II = InductionEntry.second; 3381 3382 // Create phi nodes to merge from the backedge-taken check block. 3383 PHINode *BCResumeVal = 3384 PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", 3385 LoopScalarPreHeader->getTerminator()); 3386 // Copy original phi DL over to the new one. 3387 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); 3388 Value *&EndValue = IVEndValues[OrigPhi]; 3389 Value *EndValueFromAdditionalBypass = AdditionalBypass.second; 3390 if (OrigPhi == OldInduction) { 3391 // We know what the end value is. 3392 EndValue = VectorTripCount; 3393 } else { 3394 IRBuilder<> B(L->getLoopPreheader()->getTerminator()); 3395 3396 // Fast-math-flags propagate from the original induction instruction. 3397 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) 3398 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); 3399 3400 Type *StepType = II.getStep()->getType(); 3401 Instruction::CastOps CastOp = 3402 CastInst::getCastOpcode(VectorTripCount, true, StepType, true); 3403 Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd"); 3404 Value *Step = 3405 CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint()); 3406 EndValue = emitTransformedIndex(B, CRD, Step, II); 3407 EndValue->setName("ind.end"); 3408 3409 // Compute the end value for the additional bypass (if applicable). 
3410 if (AdditionalBypass.first) { 3411 B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt())); 3412 CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true, 3413 StepType, true); 3414 Value *Step = 3415 CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint()); 3416 CRD = 3417 B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd"); 3418 EndValueFromAdditionalBypass = emitTransformedIndex(B, CRD, Step, II); 3419 EndValueFromAdditionalBypass->setName("ind.end"); 3420 } 3421 } 3422 // The new PHI merges the original incoming value, in case of a bypass, 3423 // or the value at the end of the vectorized loop. 3424 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); 3425 3426 // Fix the scalar body counter (PHI node). 3427 // The old induction's phi node in the scalar body needs the truncated 3428 // value. 3429 for (BasicBlock *BB : LoopBypassBlocks) 3430 BCResumeVal->addIncoming(II.getStartValue(), BB); 3431 3432 if (AdditionalBypass.first) 3433 BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first, 3434 EndValueFromAdditionalBypass); 3435 3436 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); 3437 } 3438 } 3439 3440 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L, 3441 MDNode *OrigLoopID) { 3442 assert(L && "Expected valid loop."); 3443 3444 // The trip counts should be cached by now. 3445 Value *Count = getOrCreateTripCount(L); 3446 Value *VectorTripCount = getOrCreateVectorTripCount(L); 3447 3448 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3449 3450 // Add a check in the middle block to see if we have completed 3451 // all of the iterations in the first vector loop. Three cases: 3452 // 1) If we require a scalar epilogue, there is no conditional branch as 3453 // we unconditionally branch to the scalar preheader. Do nothing. 3454 // 2) If (N - N%VF) == N, then we *don't* need to run the remainder. 3455 // Thus if tail is to be folded, we know we don't need to run the 3456 // remainder and we can use the previous value for the condition (true). 3457 // 3) Otherwise, construct a runtime check. 3458 if (!Cost->requiresScalarEpilogue(VF) && !Cost->foldTailByMasking()) { 3459 Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, 3460 Count, VectorTripCount, "cmp.n", 3461 LoopMiddleBlock->getTerminator()); 3462 3463 // Here we use the same DebugLoc as the scalar loop latch terminator instead 3464 // of the corresponding compare because they may have ended up with 3465 // different line numbers and we want to avoid awkward line stepping while 3466 // debugging. Eg. if the compare has got a line number inside the loop. 3467 CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3468 cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN); 3469 } 3470 3471 // Get ready to start creating new instructions into the vectorized body. 3472 assert(LoopVectorPreHeader == L->getLoopPreheader() && 3473 "Inconsistent vector loop preheader"); 3474 Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); 3475 3476 #ifdef EXPENSIVE_CHECKS 3477 assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 3478 LI->verify(*DT); 3479 #endif 3480 3481 return LoopVectorPreHeader; 3482 } 3483 3484 std::pair<BasicBlock *, Value *> 3485 InnerLoopVectorizer::createVectorizedLoopSkeleton() { 3486 /* 3487 In this function we generate a new loop. The new loop will contain 3488 the vectorized instructions while the old loop will continue to run the 3489 scalar remainder. 
3490 3491 [ ] <-- loop iteration number check. 3492 / | 3493 / v 3494 | [ ] <-- vector loop bypass (may consist of multiple blocks). 3495 | / | 3496 | / v 3497 || [ ] <-- vector pre header. 3498 |/ | 3499 | v 3500 | [ ] \ 3501 | [ ]_| <-- vector loop. 3502 | | 3503 | v 3504 \ -[ ] <--- middle-block. 3505 \/ | 3506 /\ v 3507 | ->[ ] <--- new preheader. 3508 | | 3509 (opt) v <-- edge from middle to exit iff epilogue is not required. 3510 | [ ] \ 3511 | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue). 3512 \ | 3513 \ v 3514 >[ ] <-- exit block(s). 3515 ... 3516 */ 3517 3518 // Get the metadata of the original loop before it gets modified. 3519 MDNode *OrigLoopID = OrigLoop->getLoopID(); 3520 3521 // Workaround! Compute the trip count of the original loop and cache it 3522 // before we start modifying the CFG. This code has a systemic problem 3523 // wherein it tries to run analysis over partially constructed IR; this is 3524 // wrong, and not simply for SCEV. The trip count of the original loop 3525 // simply happens to be prone to hitting this in practice. In theory, we 3526 // can hit the same issue for any SCEV, or ValueTracking query done during 3527 // mutation. See PR49900. 3528 getOrCreateTripCount(OrigLoop); 3529 3530 // Create an empty vector loop, and prepare basic blocks for the runtime 3531 // checks. 3532 Loop *Lp = createVectorLoopSkeleton(""); 3533 3534 // Now, compare the new count to zero. If it is zero skip the vector loop and 3535 // jump to the scalar loop. This check also covers the case where the 3536 // backedge-taken count is uint##_max: adding one to it will overflow leading 3537 // to an incorrect trip count of zero. In this (rare) case we will also jump 3538 // to the scalar loop. 3539 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader); 3540 3541 // Generate the code to check any assumptions that we've made for SCEV 3542 // expressions. 3543 emitSCEVChecks(Lp, LoopScalarPreHeader); 3544 3545 // Generate the code that checks in runtime if arrays overlap. We put the 3546 // checks into a separate block to make the more common case of few elements 3547 // faster. 3548 emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 3549 3550 createHeaderBranch(Lp); 3551 3552 // Emit phis for the new starting index of the scalar loop. 3553 createInductionResumeValues(Lp); 3554 3555 return {completeLoopSkeleton(Lp, OrigLoopID), nullptr}; 3556 } 3557 3558 // Fix up external users of the induction variable. At this point, we are 3559 // in LCSSA form, with all external PHIs that use the IV having one input value, 3560 // coming from the remainder loop. We need those PHIs to also have a correct 3561 // value for the IV when arriving directly from the middle block. 3562 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, 3563 const InductionDescriptor &II, 3564 Value *CountRoundDown, Value *EndValue, 3565 BasicBlock *MiddleBlock) { 3566 // There are two kinds of external IV usages - those that use the value 3567 // computed in the last iteration (the PHI) and those that use the penultimate 3568 // value (the value that feeds into the phi from the loop latch). 3569 // We allow both, but they, obviously, have different values. 3570 3571 assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block"); 3572 3573 DenseMap<Value *, Value *> MissingVals; 3574 3575 // An external user of the last iteration's value should see the value that 3576 // the remainder loop uses to initialize its own IV. 
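// As an illustration (shorthand IR, names assumed): for an induction
//   %iv      = phi [ 0, %preheader ], [ %iv.next, %latch ]
//   %iv.next = add %iv, 1
// an LCSSA phi of %iv.next outside the loop wants the last value (EndValue),
// while an LCSSA phi of %iv wants the penultimate value, recomputed below as
// Start + Step * (CountRoundDown - 1).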
3577 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch()); 3578 for (User *U : PostInc->users()) { 3579 Instruction *UI = cast<Instruction>(U); 3580 if (!OrigLoop->contains(UI)) { 3581 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3582 MissingVals[UI] = EndValue; 3583 } 3584 } 3585 3586 // An external user of the penultimate value need to see EndValue - Step. 3587 // The simplest way to get this is to recompute it from the constituent SCEVs, 3588 // that is Start + (Step * (CRD - 1)). 3589 for (User *U : OrigPhi->users()) { 3590 auto *UI = cast<Instruction>(U); 3591 if (!OrigLoop->contains(UI)) { 3592 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3593 3594 IRBuilder<> B(MiddleBlock->getTerminator()); 3595 3596 // Fast-math-flags propagate from the original induction instruction. 3597 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) 3598 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); 3599 3600 Value *CountMinusOne = B.CreateSub( 3601 CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1)); 3602 Value *CMO = 3603 !II.getStep()->getType()->isIntegerTy() 3604 ? B.CreateCast(Instruction::SIToFP, CountMinusOne, 3605 II.getStep()->getType()) 3606 : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType()); 3607 CMO->setName("cast.cmo"); 3608 3609 Value *Step = CreateStepValue(II.getStep(), *PSE.getSE(), 3610 LoopVectorBody->getTerminator()); 3611 Value *Escape = emitTransformedIndex(B, CMO, Step, II); 3612 Escape->setName("ind.escape"); 3613 MissingVals[UI] = Escape; 3614 } 3615 } 3616 3617 for (auto &I : MissingVals) { 3618 PHINode *PHI = cast<PHINode>(I.first); 3619 // One corner case we have to handle is two IVs "chasing" each-other, 3620 // that is %IV2 = phi [...], [ %IV1, %latch ] 3621 // In this case, if IV1 has an external use, we need to avoid adding both 3622 // "last value of IV1" and "penultimate value of IV2". So, verify that we 3623 // don't already have an incoming value for the middle block. 3624 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) 3625 PHI->addIncoming(I.second, MiddleBlock); 3626 } 3627 } 3628 3629 namespace { 3630 3631 struct CSEDenseMapInfo { 3632 static bool canHandle(const Instruction *I) { 3633 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || 3634 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); 3635 } 3636 3637 static inline Instruction *getEmptyKey() { 3638 return DenseMapInfo<Instruction *>::getEmptyKey(); 3639 } 3640 3641 static inline Instruction *getTombstoneKey() { 3642 return DenseMapInfo<Instruction *>::getTombstoneKey(); 3643 } 3644 3645 static unsigned getHashValue(const Instruction *I) { 3646 assert(canHandle(I) && "Unknown instruction!"); 3647 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), 3648 I->value_op_end())); 3649 } 3650 3651 static bool isEqual(const Instruction *LHS, const Instruction *RHS) { 3652 if (LHS == getEmptyKey() || RHS == getEmptyKey() || 3653 LHS == getTombstoneKey() || RHS == getTombstoneKey()) 3654 return LHS == RHS; 3655 return LHS->isIdenticalTo(RHS); 3656 } 3657 }; 3658 3659 } // end anonymous namespace 3660 3661 ///Perform cse of induction variable instructions. 3662 static void cse(BasicBlock *BB) { 3663 // Perform simple cse. 
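// As a sketch (hypothetical values): unrolling can leave several identical
// address computations in the vector body, e.g. two copies of
//   %gep = getelementptr inbounds i32, i32* %base, i64 %offset
// The map below keys such instructions on opcode and operands, so the second
// copy is replaced by the first and erased.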
3664 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 3665 for (Instruction &In : llvm::make_early_inc_range(*BB)) { 3666 if (!CSEDenseMapInfo::canHandle(&In)) 3667 continue; 3668 3669 // Check if we can replace this instruction with any of the 3670 // visited instructions. 3671 if (Instruction *V = CSEMap.lookup(&In)) { 3672 In.replaceAllUsesWith(V); 3673 In.eraseFromParent(); 3674 continue; 3675 } 3676 3677 CSEMap[&In] = &In; 3678 } 3679 } 3680 3681 InstructionCost 3682 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF, 3683 bool &NeedToScalarize) const { 3684 Function *F = CI->getCalledFunction(); 3685 Type *ScalarRetTy = CI->getType(); 3686 SmallVector<Type *, 4> Tys, ScalarTys; 3687 for (auto &ArgOp : CI->args()) 3688 ScalarTys.push_back(ArgOp->getType()); 3689 3690 // Estimate cost of scalarized vector call. The source operands are assumed 3691 // to be vectors, so we need to extract individual elements from there, 3692 // execute VF scalar calls, and then gather the result into the vector return 3693 // value. 3694 InstructionCost ScalarCallCost = 3695 TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput); 3696 if (VF.isScalar()) 3697 return ScalarCallCost; 3698 3699 // Compute corresponding vector type for return value and arguments. 3700 Type *RetTy = ToVectorTy(ScalarRetTy, VF); 3701 for (Type *ScalarTy : ScalarTys) 3702 Tys.push_back(ToVectorTy(ScalarTy, VF)); 3703 3704 // Compute costs of unpacking argument values for the scalar calls and 3705 // packing the return values to a vector. 3706 InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF); 3707 3708 InstructionCost Cost = 3709 ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost; 3710 3711 // If we can't emit a vector call for this function, then the currently found 3712 // cost is the cost we need to return. 3713 NeedToScalarize = true; 3714 VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 3715 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); 3716 3717 if (!TLI || CI->isNoBuiltin() || !VecFunc) 3718 return Cost; 3719 3720 // If the corresponding vector cost is cheaper, return its cost. 
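// Worked example with assumed (not target-accurate) costs: for VF = 4, a
// scalar call cost of 10 and a scalarization overhead of 6, the scalarized
// cost is 4 * 10 + 6 = 46; if the vector library variant below costs 20, we
// return 20 and clear NeedToScalarize.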
3721 InstructionCost VectorCallCost = 3722 TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput); 3723 if (VectorCallCost < Cost) { 3724 NeedToScalarize = false; 3725 Cost = VectorCallCost; 3726 } 3727 return Cost; 3728 } 3729 3730 static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) { 3731 if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy())) 3732 return Elt; 3733 return VectorType::get(Elt, VF); 3734 } 3735 3736 InstructionCost 3737 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, 3738 ElementCount VF) const { 3739 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 3740 assert(ID && "Expected intrinsic call!"); 3741 Type *RetTy = MaybeVectorizeType(CI->getType(), VF); 3742 FastMathFlags FMF; 3743 if (auto *FPMO = dyn_cast<FPMathOperator>(CI)) 3744 FMF = FPMO->getFastMathFlags(); 3745 3746 SmallVector<const Value *> Arguments(CI->args()); 3747 FunctionType *FTy = CI->getCalledFunction()->getFunctionType(); 3748 SmallVector<Type *> ParamTys; 3749 std::transform(FTy->param_begin(), FTy->param_end(), 3750 std::back_inserter(ParamTys), 3751 [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); }); 3752 3753 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF, 3754 dyn_cast<IntrinsicInst>(CI)); 3755 return TTI.getIntrinsicInstrCost(CostAttrs, 3756 TargetTransformInfo::TCK_RecipThroughput); 3757 } 3758 3759 static Type *smallestIntegerVectorType(Type *T1, Type *T2) { 3760 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3761 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3762 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; 3763 } 3764 3765 static Type *largestIntegerVectorType(Type *T1, Type *T2) { 3766 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3767 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3768 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; 3769 } 3770 3771 void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) { 3772 // For every instruction `I` in MinBWs, truncate the operands, create a 3773 // truncated version of `I` and reextend its result. InstCombine runs 3774 // later and will remove any ext/trunc pairs. 3775 SmallPtrSet<Value *, 4> Erased; 3776 for (const auto &KV : Cost->getMinimalBitwidths()) { 3777 // If the value wasn't vectorized, we must maintain the original scalar 3778 // type. The absence of the value from State indicates that it 3779 // wasn't vectorized. 3780 // FIXME: Should not rely on getVPValue at this point. 3781 VPValue *Def = State.Plan->getVPValue(KV.first, true); 3782 if (!State.hasAnyVectorValue(Def)) 3783 continue; 3784 for (unsigned Part = 0; Part < UF; ++Part) { 3785 Value *I = State.get(Def, Part); 3786 if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I)) 3787 continue; 3788 Type *OriginalTy = I->getType(); 3789 Type *ScalarTruncatedTy = 3790 IntegerType::get(OriginalTy->getContext(), KV.second); 3791 auto *TruncatedTy = VectorType::get( 3792 ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getElementCount()); 3793 if (TruncatedTy == OriginalTy) 3794 continue; 3795 3796 IRBuilder<> B(cast<Instruction>(I)); 3797 auto ShrinkOperand = [&](Value *V) -> Value * { 3798 if (auto *ZI = dyn_cast<ZExtInst>(V)) 3799 if (ZI->getSrcTy() == TruncatedTy) 3800 return ZI->getOperand(0); 3801 return B.CreateZExtOrTrunc(V, TruncatedTy); 3802 }; 3803 3804 // The actual instruction modification depends on the instruction type, 3805 // unfortunately. 
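// As a sketch (shorthand IR, assuming VF = 4 and a minimal bitwidth of 8 for
// a 32-bit add):
//   %a = add <4 x i32> %x, %y
// becomes
//   %x.tr = trunc <4 x i32> %x to <4 x i8>
//   %y.tr = trunc <4 x i32> %y to <4 x i8>
//   %a.tr = add <4 x i8> %x.tr, %y.tr
//   %a    = zext <4 x i8> %a.tr to <4 x i32>
// and InstCombine later removes ext/trunc pairs that cancel out.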
3806 Value *NewI = nullptr; 3807 if (auto *BO = dyn_cast<BinaryOperator>(I)) { 3808 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)), 3809 ShrinkOperand(BO->getOperand(1))); 3810 3811 // Any wrapping introduced by shrinking this operation shouldn't be 3812 // considered undefined behavior. So, we can't unconditionally copy 3813 // arithmetic wrapping flags to NewI. 3814 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false); 3815 } else if (auto *CI = dyn_cast<ICmpInst>(I)) { 3816 NewI = 3817 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)), 3818 ShrinkOperand(CI->getOperand(1))); 3819 } else if (auto *SI = dyn_cast<SelectInst>(I)) { 3820 NewI = B.CreateSelect(SI->getCondition(), 3821 ShrinkOperand(SI->getTrueValue()), 3822 ShrinkOperand(SI->getFalseValue())); 3823 } else if (auto *CI = dyn_cast<CastInst>(I)) { 3824 switch (CI->getOpcode()) { 3825 default: 3826 llvm_unreachable("Unhandled cast!"); 3827 case Instruction::Trunc: 3828 NewI = ShrinkOperand(CI->getOperand(0)); 3829 break; 3830 case Instruction::SExt: 3831 NewI = B.CreateSExtOrTrunc( 3832 CI->getOperand(0), 3833 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3834 break; 3835 case Instruction::ZExt: 3836 NewI = B.CreateZExtOrTrunc( 3837 CI->getOperand(0), 3838 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3839 break; 3840 } 3841 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { 3842 auto Elements0 = 3843 cast<VectorType>(SI->getOperand(0)->getType())->getElementCount(); 3844 auto *O0 = B.CreateZExtOrTrunc( 3845 SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0)); 3846 auto Elements1 = 3847 cast<VectorType>(SI->getOperand(1)->getType())->getElementCount(); 3848 auto *O1 = B.CreateZExtOrTrunc( 3849 SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1)); 3850 3851 NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask()); 3852 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) { 3853 // Don't do anything with the operands, just extend the result. 3854 continue; 3855 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { 3856 auto Elements = 3857 cast<VectorType>(IE->getOperand(0)->getType())->getElementCount(); 3858 auto *O0 = B.CreateZExtOrTrunc( 3859 IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 3860 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); 3861 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); 3862 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { 3863 auto Elements = 3864 cast<VectorType>(EE->getOperand(0)->getType())->getElementCount(); 3865 auto *O0 = B.CreateZExtOrTrunc( 3866 EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 3867 NewI = B.CreateExtractElement(O0, EE->getOperand(2)); 3868 } else { 3869 // If we don't know what to do, be conservative and don't do anything. 3870 continue; 3871 } 3872 3873 // Lastly, extend the result. 3874 NewI->takeName(cast<Instruction>(I)); 3875 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); 3876 I->replaceAllUsesWith(Res); 3877 cast<Instruction>(I)->eraseFromParent(); 3878 Erased.insert(I); 3879 State.reset(Def, Res, Part); 3880 } 3881 } 3882 3883 // We'll have created a bunch of ZExts that are now parentless. Clean up. 3884 for (const auto &KV : Cost->getMinimalBitwidths()) { 3885 // If the value wasn't vectorized, we must maintain the original scalar 3886 // type. The absence of the value from State indicates that it 3887 // wasn't vectorized. 3888 // FIXME: Should not rely on getVPValue at this point. 
3889 VPValue *Def = State.Plan->getVPValue(KV.first, true); 3890 if (!State.hasAnyVectorValue(Def)) 3891 continue; 3892 for (unsigned Part = 0; Part < UF; ++Part) { 3893 Value *I = State.get(Def, Part); 3894 ZExtInst *Inst = dyn_cast<ZExtInst>(I); 3895 if (Inst && Inst->use_empty()) { 3896 Value *NewI = Inst->getOperand(0); 3897 Inst->eraseFromParent(); 3898 State.reset(Def, NewI, Part); 3899 } 3900 } 3901 } 3902 } 3903 3904 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) { 3905 // Insert truncates and extends for any truncated instructions as hints to 3906 // InstCombine. 3907 if (VF.isVector()) 3908 truncateToMinimalBitwidths(State); 3909 3910 // Fix widened non-induction PHIs by setting up the PHI operands. 3911 if (OrigPHIsToFix.size()) { 3912 assert(EnableVPlanNativePath && 3913 "Unexpected non-induction PHIs for fixup in non VPlan-native path"); 3914 fixNonInductionPHIs(State); 3915 } 3916 3917 // At this point every instruction in the original loop is widened to a 3918 // vector form. Now we need to fix the recurrences in the loop. These PHI 3919 // nodes are currently empty because we did not want to introduce cycles. 3920 // This is the second stage of vectorizing recurrences. 3921 fixCrossIterationPHIs(State); 3922 3923 // Forget the original basic block. 3924 PSE.getSE()->forgetLoop(OrigLoop); 3925 3926 // If we inserted an edge from the middle block to the unique exit block, 3927 // update uses outside the loop (phis) to account for the newly inserted 3928 // edge. 3929 if (!Cost->requiresScalarEpilogue(VF)) { 3930 // Fix-up external users of the induction variables. 3931 for (auto &Entry : Legal->getInductionVars()) 3932 fixupIVUsers(Entry.first, Entry.second, 3933 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)), 3934 IVEndValues[Entry.first], LoopMiddleBlock); 3935 3936 fixLCSSAPHIs(State); 3937 } 3938 3939 for (Instruction *PI : PredicatedInstructions) 3940 sinkScalarOperands(&*PI); 3941 3942 // Remove redundant induction instructions. 3943 cse(LoopVectorBody); 3944 3945 // Set/update profile weights for the vector and remainder loops as original 3946 // loop iterations are now distributed among them. Note that original loop 3947 // represented by LoopScalarBody becomes remainder loop after vectorization. 3948 // 3949 // For cases like foldTailByMasking() and requiresScalarEpiloque() we may 3950 // end up getting slightly roughened result but that should be OK since 3951 // profile is not inherently precise anyway. Note also possible bypass of 3952 // vector code caused by legality checks is ignored, assigning all the weight 3953 // to the vector loop, optimistically. 3954 // 3955 // For scalable vectorization we can't know at compile time how many iterations 3956 // of the loop are handled in one vector iteration, so instead assume a pessimistic 3957 // vscale of '1'. 3958 setProfileInfoAfterUnrolling( 3959 LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody), 3960 LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF); 3961 } 3962 3963 void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) { 3964 // In order to support recurrences we need to be able to vectorize Phi nodes. 3965 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 3966 // stage #2: We now need to fix the recurrences by adding incoming edges to 3967 // the currently empty PHI nodes. 
At this point every instruction in the 3968 // original loop is widened to a vector form so we can use them to construct 3969 // the incoming edges. 3970 VPBasicBlock *Header = State.Plan->getEntry()->getEntryBasicBlock(); 3971 for (VPRecipeBase &R : Header->phis()) { 3972 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) 3973 fixReduction(ReductionPhi, State); 3974 else if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R)) 3975 fixFirstOrderRecurrence(FOR, State); 3976 } 3977 } 3978 3979 void InnerLoopVectorizer::fixFirstOrderRecurrence( 3980 VPFirstOrderRecurrencePHIRecipe *PhiR, VPTransformState &State) { 3981 // This is the second phase of vectorizing first-order recurrences. An 3982 // overview of the transformation is described below. Suppose we have the 3983 // following loop. 3984 // 3985 // for (int i = 0; i < n; ++i) 3986 // b[i] = a[i] - a[i - 1]; 3987 // 3988 // There is a first-order recurrence on "a". For this loop, the shorthand 3989 // scalar IR looks like: 3990 // 3991 // scalar.ph: 3992 // s_init = a[-1] 3993 // br scalar.body 3994 // 3995 // scalar.body: 3996 // i = phi [0, scalar.ph], [i+1, scalar.body] 3997 // s1 = phi [s_init, scalar.ph], [s2, scalar.body] 3998 // s2 = a[i] 3999 // b[i] = s2 - s1 4000 // br cond, scalar.body, ... 4001 // 4002 // In this example, s1 is a recurrence because it's value depends on the 4003 // previous iteration. In the first phase of vectorization, we created a 4004 // vector phi v1 for s1. We now complete the vectorization and produce the 4005 // shorthand vector IR shown below (for VF = 4, UF = 1). 4006 // 4007 // vector.ph: 4008 // v_init = vector(..., ..., ..., a[-1]) 4009 // br vector.body 4010 // 4011 // vector.body 4012 // i = phi [0, vector.ph], [i+4, vector.body] 4013 // v1 = phi [v_init, vector.ph], [v2, vector.body] 4014 // v2 = a[i, i+1, i+2, i+3]; 4015 // v3 = vector(v1(3), v2(0, 1, 2)) 4016 // b[i, i+1, i+2, i+3] = v2 - v3 4017 // br cond, vector.body, middle.block 4018 // 4019 // middle.block: 4020 // x = v2(3) 4021 // br scalar.ph 4022 // 4023 // scalar.ph: 4024 // s_init = phi [x, middle.block], [a[-1], otherwise] 4025 // br scalar.body 4026 // 4027 // After execution completes the vector loop, we extract the next value of 4028 // the recurrence (x) to use as the initial value in the scalar loop. 4029 4030 // Extract the last vector element in the middle block. This will be the 4031 // initial value for the recurrence when jumping to the scalar loop. 4032 VPValue *PreviousDef = PhiR->getBackedgeValue(); 4033 Value *Incoming = State.get(PreviousDef, UF - 1); 4034 auto *ExtractForScalar = Incoming; 4035 auto *IdxTy = Builder.getInt32Ty(); 4036 if (VF.isVector()) { 4037 auto *One = ConstantInt::get(IdxTy, 1); 4038 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4039 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF); 4040 auto *LastIdx = Builder.CreateSub(RuntimeVF, One); 4041 ExtractForScalar = Builder.CreateExtractElement(ExtractForScalar, LastIdx, 4042 "vector.recur.extract"); 4043 } 4044 // Extract the second last element in the middle block if the 4045 // Phi is used outside the loop. We need to extract the phi itself 4046 // and not the last element (the phi update in the current iteration). This 4047 // will be the value when jumping to the exit block from the LoopMiddleBlock, 4048 // when the scalar loop is not run at all. 
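// In the VF = 4, UF = 1 example above, this corresponds to extracting v2(3)
// as the resume value for the scalar loop and v2(2) for phi users outside the
// loop; the lane indices are computed below as RuntimeVF - 1 and RuntimeVF - 2
// so the same code also covers scalable vectors.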
4049 Value *ExtractForPhiUsedOutsideLoop = nullptr; 4050 if (VF.isVector()) { 4051 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF); 4052 auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2)); 4053 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement( 4054 Incoming, Idx, "vector.recur.extract.for.phi"); 4055 } else if (UF > 1) 4056 // When loop is unrolled without vectorizing, initialize 4057 // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value 4058 // of `Incoming`. This is analogous to the vectorized case above: extracting 4059 // the second last element when VF > 1. 4060 ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2); 4061 4062 // Fix the initial value of the original recurrence in the scalar loop. 4063 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin()); 4064 PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue()); 4065 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init"); 4066 auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue(); 4067 for (auto *BB : predecessors(LoopScalarPreHeader)) { 4068 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit; 4069 Start->addIncoming(Incoming, BB); 4070 } 4071 4072 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start); 4073 Phi->setName("scalar.recur"); 4074 4075 // Finally, fix users of the recurrence outside the loop. The users will need 4076 // either the last value of the scalar recurrence or the last value of the 4077 // vector recurrence we extracted in the middle block. Since the loop is in 4078 // LCSSA form, we just need to find all the phi nodes for the original scalar 4079 // recurrence in the exit block, and then add an edge for the middle block. 4080 // Note that LCSSA does not imply single entry when the original scalar loop 4081 // had multiple exiting edges (as we always run the last iteration in the 4082 // scalar epilogue); in that case, there is no edge from middle to exit and 4083 // and thus no phis which needed updated. 4084 if (!Cost->requiresScalarEpilogue(VF)) 4085 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) 4086 if (llvm::is_contained(LCSSAPhi.incoming_values(), Phi)) 4087 LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock); 4088 } 4089 4090 void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR, 4091 VPTransformState &State) { 4092 PHINode *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue()); 4093 // Get it's reduction variable descriptor. 4094 assert(Legal->isReductionVariable(OrigPhi) && 4095 "Unable to find the reduction variable"); 4096 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); 4097 4098 RecurKind RK = RdxDesc.getRecurrenceKind(); 4099 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue(); 4100 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr(); 4101 setDebugLocFromInst(ReductionStartValue); 4102 4103 VPValue *LoopExitInstDef = PhiR->getBackedgeValue(); 4104 // This is the vector-clone of the value that leaves the loop. 4105 Type *VecTy = State.get(LoopExitInstDef, 0)->getType(); 4106 4107 // Wrap flags are in general invalid after vectorization, clear them. 4108 clearReductionWrapFlags(RdxDesc, State); 4109 4110 // Before each round, move the insertion point right between 4111 // the PHIs and the values we are going to write. 4112 // This allows us to write both PHINodes and the extractelement 4113 // instructions. 
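// As a sketch (shorthand IR, assuming UF = 2 and an integer add reduction),
// the middle block ends up containing something like:
//   %bin.rdx = add <4 x i32> %rdx.part1, %rdx.part0
//   %final   = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %bin.rdx)
// whose result then feeds the bc.merge.rdx phi created in the scalar
// preheader further below.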
4114 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 4115 4116 setDebugLocFromInst(LoopExitInst); 4117 4118 Type *PhiTy = OrigPhi->getType(); 4119 // If tail is folded by masking, the vector value to leave the loop should be 4120 // a Select choosing between the vectorized LoopExitInst and vectorized Phi, 4121 // instead of the former. For an inloop reduction the reduction will already 4122 // be predicated, and does not need to be handled here. 4123 if (Cost->foldTailByMasking() && !PhiR->isInLoop()) { 4124 for (unsigned Part = 0; Part < UF; ++Part) { 4125 Value *VecLoopExitInst = State.get(LoopExitInstDef, Part); 4126 Value *Sel = nullptr; 4127 for (User *U : VecLoopExitInst->users()) { 4128 if (isa<SelectInst>(U)) { 4129 assert(!Sel && "Reduction exit feeding two selects"); 4130 Sel = U; 4131 } else 4132 assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select"); 4133 } 4134 assert(Sel && "Reduction exit feeds no select"); 4135 State.reset(LoopExitInstDef, Sel, Part); 4136 4137 // If the target can create a predicated operator for the reduction at no 4138 // extra cost in the loop (for example a predicated vadd), it can be 4139 // cheaper for the select to remain in the loop than be sunk out of it, 4140 // and so use the select value for the phi instead of the old 4141 // LoopExitValue. 4142 if (PreferPredicatedReductionSelect || 4143 TTI->preferPredicatedReductionSelect( 4144 RdxDesc.getOpcode(), PhiTy, 4145 TargetTransformInfo::ReductionFlags())) { 4146 auto *VecRdxPhi = 4147 cast<PHINode>(State.get(PhiR, Part)); 4148 VecRdxPhi->setIncomingValueForBlock( 4149 LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel); 4150 } 4151 } 4152 } 4153 4154 // If the vector reduction can be performed in a smaller type, we truncate 4155 // then extend the loop exit value to enable InstCombine to evaluate the 4156 // entire expression in the smaller type. 4157 if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) { 4158 assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!"); 4159 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); 4160 Builder.SetInsertPoint( 4161 LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator()); 4162 VectorParts RdxParts(UF); 4163 for (unsigned Part = 0; Part < UF; ++Part) { 4164 RdxParts[Part] = State.get(LoopExitInstDef, Part); 4165 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4166 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy) 4167 : Builder.CreateZExt(Trunc, VecTy); 4168 for (User *U : llvm::make_early_inc_range(RdxParts[Part]->users())) 4169 if (U != Trunc) { 4170 U->replaceUsesOfWith(RdxParts[Part], Extnd); 4171 RdxParts[Part] = Extnd; 4172 } 4173 } 4174 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 4175 for (unsigned Part = 0; Part < UF; ++Part) { 4176 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4177 State.reset(LoopExitInstDef, RdxParts[Part], Part); 4178 } 4179 } 4180 4181 // Reduce all of the unrolled parts into a single vector. 4182 Value *ReducedPartRdx = State.get(LoopExitInstDef, 0); 4183 unsigned Op = RecurrenceDescriptor::getOpcode(RK); 4184 4185 // The middle block terminator has already been assigned a DebugLoc here (the 4186 // OrigLoop's single latch terminator). 
We want the whole middle block to 4187 // appear to execute on this line because: (a) it is all compiler generated, 4188 // (b) these instructions are always executed after evaluating the latch 4189 // conditional branch, and (c) other passes may add new predecessors which 4190 // terminate on this line. This is the easiest way to ensure we don't 4191 // accidentally cause an extra step back into the loop while debugging. 4192 setDebugLocFromInst(LoopMiddleBlock->getTerminator()); 4193 if (PhiR->isOrdered()) 4194 ReducedPartRdx = State.get(LoopExitInstDef, UF - 1); 4195 else { 4196 // Floating-point operations should have some FMF to enable the reduction. 4197 IRBuilderBase::FastMathFlagGuard FMFG(Builder); 4198 Builder.setFastMathFlags(RdxDesc.getFastMathFlags()); 4199 for (unsigned Part = 1; Part < UF; ++Part) { 4200 Value *RdxPart = State.get(LoopExitInstDef, Part); 4201 if (Op != Instruction::ICmp && Op != Instruction::FCmp) { 4202 ReducedPartRdx = Builder.CreateBinOp( 4203 (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx"); 4204 } else if (RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK)) 4205 ReducedPartRdx = createSelectCmpOp(Builder, ReductionStartValue, RK, 4206 ReducedPartRdx, RdxPart); 4207 else 4208 ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart); 4209 } 4210 } 4211 4212 // Create the reduction after the loop. Note that inloop reductions create the 4213 // target reduction in the loop using a Reduction recipe. 4214 if (VF.isVector() && !PhiR->isInLoop()) { 4215 ReducedPartRdx = 4216 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, OrigPhi); 4217 // If the reduction can be performed in a smaller type, we need to extend 4218 // the reduction to the wider type before we branch to the original loop. 4219 if (PhiTy != RdxDesc.getRecurrenceType()) 4220 ReducedPartRdx = RdxDesc.isSigned() 4221 ? Builder.CreateSExt(ReducedPartRdx, PhiTy) 4222 : Builder.CreateZExt(ReducedPartRdx, PhiTy); 4223 } 4224 4225 PHINode *ResumePhi = 4226 dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue()); 4227 4228 // Create a phi node that merges control-flow from the backedge-taken check 4229 // block and the middle block. 4230 PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx", 4231 LoopScalarPreHeader->getTerminator()); 4232 4233 // If we are fixing reductions in the epilogue loop then we should already 4234 // have created a bc.merge.rdx Phi after the main vector body. Ensure that 4235 // we carry over the incoming values correctly. 4236 for (auto *Incoming : predecessors(LoopScalarPreHeader)) { 4237 if (Incoming == LoopMiddleBlock) 4238 BCBlockPhi->addIncoming(ReducedPartRdx, Incoming); 4239 else if (ResumePhi && llvm::is_contained(ResumePhi->blocks(), Incoming)) 4240 BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming), 4241 Incoming); 4242 else 4243 BCBlockPhi->addIncoming(ReductionStartValue, Incoming); 4244 } 4245 4246 // Set the resume value for this reduction 4247 ReductionResumeValues.insert({&RdxDesc, BCBlockPhi}); 4248 4249 // Now, we need to fix the users of the reduction variable 4250 // inside and outside of the scalar remainder loop. 4251 4252 // We know that the loop is in LCSSA form. We need to update the PHI nodes 4253 // in the exit blocks. See comment on analogous loop in 4254 // fixFirstOrderRecurrence for a more complete explaination of the logic. 
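// E.g. if the scalar loop computed "sum += a[i]" and sum is read after the
// loop, sum's LCSSA phi in the exit block receives the reduced value computed
// above as its incoming value for the middle block.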
4255 if (!Cost->requiresScalarEpilogue(VF)) 4256 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) 4257 if (llvm::is_contained(LCSSAPhi.incoming_values(), LoopExitInst)) 4258 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock); 4259 4260 // Fix the scalar loop reduction variable with the incoming reduction sum 4261 // from the vector body and from the backedge value. 4262 int IncomingEdgeBlockIdx = 4263 OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch()); 4264 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); 4265 // Pick the other block. 4266 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); 4267 OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); 4268 OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); 4269 } 4270 4271 void InnerLoopVectorizer::clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc, 4272 VPTransformState &State) { 4273 RecurKind RK = RdxDesc.getRecurrenceKind(); 4274 if (RK != RecurKind::Add && RK != RecurKind::Mul) 4275 return; 4276 4277 Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr(); 4278 assert(LoopExitInstr && "null loop exit instruction"); 4279 SmallVector<Instruction *, 8> Worklist; 4280 SmallPtrSet<Instruction *, 8> Visited; 4281 Worklist.push_back(LoopExitInstr); 4282 Visited.insert(LoopExitInstr); 4283 4284 while (!Worklist.empty()) { 4285 Instruction *Cur = Worklist.pop_back_val(); 4286 if (isa<OverflowingBinaryOperator>(Cur)) 4287 for (unsigned Part = 0; Part < UF; ++Part) { 4288 // FIXME: Should not rely on getVPValue at this point. 4289 Value *V = State.get(State.Plan->getVPValue(Cur, true), Part); 4290 cast<Instruction>(V)->dropPoisonGeneratingFlags(); 4291 } 4292 4293 for (User *U : Cur->users()) { 4294 Instruction *UI = cast<Instruction>(U); 4295 if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) && 4296 Visited.insert(UI).second) 4297 Worklist.push_back(UI); 4298 } 4299 } 4300 } 4301 4302 void InnerLoopVectorizer::fixLCSSAPHIs(VPTransformState &State) { 4303 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 4304 if (LCSSAPhi.getBasicBlockIndex(LoopMiddleBlock) != -1) 4305 // Some phis were already hand updated by the reduction and recurrence 4306 // code above, leave them alone. 4307 continue; 4308 4309 auto *IncomingValue = LCSSAPhi.getIncomingValue(0); 4310 // Non-instruction incoming values will have only one value. 4311 4312 VPLane Lane = VPLane::getFirstLane(); 4313 if (isa<Instruction>(IncomingValue) && 4314 !Cost->isUniformAfterVectorization(cast<Instruction>(IncomingValue), 4315 VF)) 4316 Lane = VPLane::getLastLaneForVF(VF); 4317 4318 // Can be a loop invariant incoming value or the last scalar value to be 4319 // extracted from the vectorized loop. 4320 // FIXME: Should not rely on getVPValue at this point. 4321 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4322 Value *lastIncomingValue = 4323 OrigLoop->isLoopInvariant(IncomingValue) 4324 ? IncomingValue 4325 : State.get(State.Plan->getVPValue(IncomingValue, true), 4326 VPIteration(UF - 1, Lane)); 4327 LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock); 4328 } 4329 } 4330 4331 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { 4332 // The basic block and loop containing the predicated instruction. 4333 auto *PredBB = PredInst->getParent(); 4334 auto *VectorLoop = LI->getLoopFor(PredBB); 4335 4336 // Initialize a worklist with the operands of the predicated instruction. 
4337 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end()); 4338 4339 // Holds instructions that we need to analyze again. An instruction may be 4340 // reanalyzed if we don't yet know if we can sink it or not. 4341 SmallVector<Instruction *, 8> InstsToReanalyze; 4342 4343 // Returns true if a given use occurs in the predicated block. Phi nodes use 4344 // their operands in their corresponding predecessor blocks. 4345 auto isBlockOfUsePredicated = [&](Use &U) -> bool { 4346 auto *I = cast<Instruction>(U.getUser()); 4347 BasicBlock *BB = I->getParent(); 4348 if (auto *Phi = dyn_cast<PHINode>(I)) 4349 BB = Phi->getIncomingBlock( 4350 PHINode::getIncomingValueNumForOperand(U.getOperandNo())); 4351 return BB == PredBB; 4352 }; 4353 4354 // Iteratively sink the scalarized operands of the predicated instruction 4355 // into the block we created for it. When an instruction is sunk, its 4356 // operands are then added to the worklist. The algorithm ends once a full 4357 // pass through the worklist fails to sink any instruction. 4358 bool Changed; 4359 do { 4360 // Add the instructions that need to be reanalyzed to the worklist, and 4361 // reset the changed indicator. 4362 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end()); 4363 InstsToReanalyze.clear(); 4364 Changed = false; 4365 4366 while (!Worklist.empty()) { 4367 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val()); 4368 4369 // We can't sink an instruction if it is a phi node, is not in the loop, 4370 // or may have side effects. 4371 if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) || 4372 I->mayHaveSideEffects()) 4373 continue; 4374 4375 // If the instruction is already in PredBB, check if we can sink its 4376 // operands. In that case, VPlan's sinkScalarOperands() succeeded in 4377 // sinking the scalar instruction I, hence it appears in PredBB; but it 4378 // may have failed to sink I's operands (recursively), which we try 4379 // (again) here. 4380 if (I->getParent() == PredBB) { 4381 Worklist.insert(I->op_begin(), I->op_end()); 4382 continue; 4383 } 4384 4385 // It's legal to sink the instruction if all its uses occur in the 4386 // predicated block. Otherwise, there's nothing to do yet, and we may 4387 // need to reanalyze the instruction. 4388 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) { 4389 InstsToReanalyze.push_back(I); 4390 continue; 4391 } 4392 4393 // Move the instruction to the beginning of the predicated block, and add 4394 // its operands to the worklist. 4395 I->moveBefore(&*PredBB->getFirstInsertionPt()); 4396 Worklist.insert(I->op_begin(), I->op_end()); 4397 4398 // The sinking may have enabled other instructions to be sunk, so we will 4399 // need to iterate. 4400 Changed = true; 4401 } 4402 } while (Changed); 4403 } 4404 4405 void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) { 4406 for (PHINode *OrigPhi : OrigPHIsToFix) { 4407 VPWidenPHIRecipe *VPPhi = 4408 cast<VPWidenPHIRecipe>(State.Plan->getVPValue(OrigPhi)); 4409 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0)); 4410 // Make sure the builder has a valid insert point.
4411 Builder.SetInsertPoint(NewPhi); 4412 for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) { 4413 VPValue *Inc = VPPhi->getIncomingValue(i); 4414 VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i); 4415 NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]); 4416 } 4417 } 4418 } 4419 4420 bool InnerLoopVectorizer::useOrderedReductions( 4421 const RecurrenceDescriptor &RdxDesc) { 4422 return Cost->useOrderedReductions(RdxDesc); 4423 } 4424 4425 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, 4426 VPWidenPHIRecipe *PhiR, 4427 VPTransformState &State) { 4428 PHINode *P = cast<PHINode>(PN); 4429 if (EnableVPlanNativePath) { 4430 // Currently we enter here in the VPlan-native path for non-induction 4431 // PHIs where all control flow is uniform. We simply widen these PHIs. 4432 // Create a vector phi with no operands - the vector phi operands will be 4433 // set at the end of vector code generation. 4434 Type *VecTy = (State.VF.isScalar()) 4435 ? PN->getType() 4436 : VectorType::get(PN->getType(), State.VF); 4437 Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi"); 4438 State.set(PhiR, VecPhi, 0); 4439 OrigPHIsToFix.push_back(P); 4440 4441 return; 4442 } 4443 4444 assert(PN->getParent() == OrigLoop->getHeader() && 4445 "Non-header phis should have been handled elsewhere"); 4446 4447 // In order to support recurrences we need to be able to vectorize Phi nodes. 4448 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 4449 // stage #1: We create a new vector PHI node with no incoming edges. We'll use 4450 // this value when we vectorize all of the instructions that use the PHI. 4451 4452 assert(!Legal->isReductionVariable(P) && 4453 "reductions should be handled elsewhere"); 4454 4455 setDebugLocFromInst(P); 4456 4457 // This PHINode must be an induction variable. 4458 // Make sure that we know about it. 4459 assert(Legal->getInductionVars().count(P) && "Not an induction variable"); 4460 4461 InductionDescriptor II = Legal->getInductionVars().lookup(P); 4462 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 4463 4464 auto *IVR = PhiR->getParent()->getPlan()->getCanonicalIV(); 4465 PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0)); 4466 4467 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 4468 // which can be found from the original scalar operations. 4469 switch (II.getKind()) { 4470 case InductionDescriptor::IK_NoInduction: 4471 llvm_unreachable("Unknown induction"); 4472 case InductionDescriptor::IK_IntInduction: 4473 case InductionDescriptor::IK_FpInduction: 4474 llvm_unreachable("Integer/fp induction is handled elsewhere."); 4475 case InductionDescriptor::IK_PtrInduction: { 4476 // Handle the pointer induction variable case. 4477 assert(P->getType()->isPointerTy() && "Unexpected type."); 4478 4479 if (Cost->isScalarAfterVectorization(P, State.VF)) { 4480 // This is the normalized GEP that starts counting at zero. 4481 Value *PtrInd = 4482 Builder.CreateSExtOrTrunc(CanonicalIV, II.getStep()->getType()); 4483 // Determine the number of scalars we need to generate for each unroll 4484 // iteration. If the instruction is uniform, we only need to generate the 4485 // first lane. Otherwise, we generate all VF values. 4486 bool IsUniform = vputils::onlyFirstLaneUsed(PhiR); 4487 assert((IsUniform || !State.VF.isScalable()) && 4488 "Cannot scalarize a scalable VF"); 4489 unsigned Lanes = IsUniform ? 
1 : State.VF.getFixedValue(); 4490 4491 for (unsigned Part = 0; Part < UF; ++Part) { 4492 Value *PartStart = 4493 createStepForVF(Builder, PtrInd->getType(), VF, Part); 4494 4495 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 4496 Value *Idx = Builder.CreateAdd( 4497 PartStart, ConstantInt::get(PtrInd->getType(), Lane)); 4498 Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); 4499 4500 Value *Step = CreateStepValue(II.getStep(), *PSE.getSE(), 4501 State.CFG.PrevBB->getTerminator()); 4502 Value *SclrGep = emitTransformedIndex(Builder, GlobalIdx, Step, II); 4503 SclrGep->setName("next.gep"); 4504 State.set(PhiR, SclrGep, VPIteration(Part, Lane)); 4505 } 4506 } 4507 return; 4508 } 4509 assert(isa<SCEVConstant>(II.getStep()) && 4510 "Induction step not a SCEV constant!"); 4511 Type *PhiType = II.getStep()->getType(); 4512 4513 // Build a pointer phi 4514 Value *ScalarStartValue = PhiR->getStartValue()->getLiveInIRValue(); 4515 Type *ScStValueType = ScalarStartValue->getType(); 4516 PHINode *NewPointerPhi = 4517 PHINode::Create(ScStValueType, 2, "pointer.phi", CanonicalIV); 4518 NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader); 4519 4520 // A pointer induction, performed by using a gep 4521 BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 4522 Instruction *InductionLoc = LoopLatch->getTerminator(); 4523 const SCEV *ScalarStep = II.getStep(); 4524 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 4525 Value *ScalarStepValue = 4526 Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc); 4527 Value *RuntimeVF = getRuntimeVF(Builder, PhiType, VF); 4528 Value *NumUnrolledElems = 4529 Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF)); 4530 Value *InductionGEP = GetElementPtrInst::Create( 4531 II.getElementType(), NewPointerPhi, 4532 Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind", 4533 InductionLoc); 4534 NewPointerPhi->addIncoming(InductionGEP, LoopLatch); 4535 4536 // Create UF many actual address geps that use the pointer 4537 // phi as base and a vectorized version of the step value 4538 // (<step*0, ..., step*N>) as offset. 4539 for (unsigned Part = 0; Part < State.UF; ++Part) { 4540 Type *VecPhiType = VectorType::get(PhiType, State.VF); 4541 Value *StartOffsetScalar = 4542 Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part)); 4543 Value *StartOffset = 4544 Builder.CreateVectorSplat(State.VF, StartOffsetScalar); 4545 // Create a vector of consecutive numbers from zero to VF. 4546 StartOffset = 4547 Builder.CreateAdd(StartOffset, Builder.CreateStepVector(VecPhiType)); 4548 4549 Value *GEP = Builder.CreateGEP( 4550 II.getElementType(), NewPointerPhi, 4551 Builder.CreateMul( 4552 StartOffset, Builder.CreateVectorSplat(State.VF, ScalarStepValue), 4553 "vector.gep")); 4554 State.set(PhiR, GEP, Part); 4555 } 4556 } 4557 } 4558 } 4559 4560 /// A helper function for checking whether an integer division-related 4561 /// instruction may divide by zero (in which case it must be predicated if 4562 /// executed conditionally in the scalar code). 4563 /// TODO: It may be worthwhile to generalize and check isKnownNonZero(). 4564 /// Non-zero divisors that are non compile-time constants will not be 4565 /// converted into multiplication, so we will still end up scalarizing 4566 /// the division, but can do so w/o predication. 
4567 static bool mayDivideByZero(Instruction &I) { 4568 assert((I.getOpcode() == Instruction::UDiv || 4569 I.getOpcode() == Instruction::SDiv || 4570 I.getOpcode() == Instruction::URem || 4571 I.getOpcode() == Instruction::SRem) && 4572 "Unexpected instruction"); 4573 Value *Divisor = I.getOperand(1); 4574 auto *CInt = dyn_cast<ConstantInt>(Divisor); 4575 return !CInt || CInt->isZero(); 4576 } 4577 4578 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, 4579 VPUser &ArgOperands, 4580 VPTransformState &State) { 4581 assert(!isa<DbgInfoIntrinsic>(I) && 4582 "DbgInfoIntrinsic should have been dropped during VPlan construction"); 4583 setDebugLocFromInst(&I); 4584 4585 Module *M = I.getParent()->getParent()->getParent(); 4586 auto *CI = cast<CallInst>(&I); 4587 4588 SmallVector<Type *, 4> Tys; 4589 for (Value *ArgOperand : CI->args()) 4590 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue())); 4591 4592 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 4593 4594 // The flag shows whether we use Intrinsic or a usual Call for vectorized 4595 // version of the instruction. 4596 // Is it beneficial to perform intrinsic call compared to lib call? 4597 bool NeedToScalarize = false; 4598 InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); 4599 InstructionCost IntrinsicCost = ID ? Cost->getVectorIntrinsicCost(CI, VF) : 0; 4600 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 4601 assert((UseVectorIntrinsic || !NeedToScalarize) && 4602 "Instruction should be scalarized elsewhere."); 4603 assert((IntrinsicCost.isValid() || CallCost.isValid()) && 4604 "Either the intrinsic cost or vector call cost must be valid"); 4605 4606 for (unsigned Part = 0; Part < UF; ++Part) { 4607 SmallVector<Type *, 2> TysForDecl = {CI->getType()}; 4608 SmallVector<Value *, 4> Args; 4609 for (auto &I : enumerate(ArgOperands.operands())) { 4610 // Some intrinsics have a scalar argument - don't replace it with a 4611 // vector. 4612 Value *Arg; 4613 if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index())) 4614 Arg = State.get(I.value(), Part); 4615 else { 4616 Arg = State.get(I.value(), VPIteration(0, 0)); 4617 if (hasVectorInstrinsicOverloadedScalarOpd(ID, I.index())) 4618 TysForDecl.push_back(Arg->getType()); 4619 } 4620 Args.push_back(Arg); 4621 } 4622 4623 Function *VectorF; 4624 if (UseVectorIntrinsic) { 4625 // Use vector version of the intrinsic. 4626 if (VF.isVector()) 4627 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); 4628 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); 4629 assert(VectorF && "Can't retrieve vector intrinsic."); 4630 } else { 4631 // Use vector version of the function call. 4632 const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 4633 #ifndef NDEBUG 4634 assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr && 4635 "Can't create vector function."); 4636 #endif 4637 VectorF = VFDatabase(*CI).getVectorizedFunction(Shape); 4638 } 4639 SmallVector<OperandBundleDef, 1> OpBundles; 4640 CI->getOperandBundlesAsDefs(OpBundles); 4641 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles); 4642 4643 if (isa<FPMathOperator>(V)) 4644 V->copyFastMathFlags(CI); 4645 4646 State.set(Def, V, Part); 4647 addMetadata(V, &I); 4648 } 4649 } 4650 4651 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { 4652 // We should not collect Scalars more than once per VF. 
Right now, this 4653 // function is called from collectUniformsAndScalars(), which already does 4654 // this check. Collecting Scalars for VF=1 does not make any sense. 4655 assert(VF.isVector() && Scalars.find(VF) == Scalars.end() && 4656 "This function should not be visited twice for the same VF"); 4657 4658 SmallSetVector<Instruction *, 8> Worklist; 4659 4660 // These sets are used to seed the analysis with pointers used by memory 4661 // accesses that will remain scalar. 4662 SmallSetVector<Instruction *, 8> ScalarPtrs; 4663 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; 4664 auto *Latch = TheLoop->getLoopLatch(); 4665 4666 // A helper that returns true if the use of Ptr by MemAccess will be scalar. 4667 // The pointer operands of loads and stores will be scalar as long as the 4668 // memory access is not a gather or scatter operation. The value operand of a 4669 // store will remain scalar if the store is scalarized. 4670 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) { 4671 InstWidening WideningDecision = getWideningDecision(MemAccess, VF); 4672 assert(WideningDecision != CM_Unknown && 4673 "Widening decision should be ready at this moment"); 4674 if (auto *Store = dyn_cast<StoreInst>(MemAccess)) 4675 if (Ptr == Store->getValueOperand()) 4676 return WideningDecision == CM_Scalarize; 4677 assert(Ptr == getLoadStorePointerOperand(MemAccess) && 4678 "Ptr is neither a value nor a pointer operand"); 4679 return WideningDecision != CM_GatherScatter; 4680 }; 4681 4682 // A helper that returns true if the given value is a bitcast or 4683 // getelementptr instruction contained in the loop. 4684 auto isLoopVaryingBitCastOrGEP = [&](Value *V) { 4685 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) || 4686 isa<GetElementPtrInst>(V)) && 4687 !TheLoop->isLoopInvariant(V); 4688 }; 4689 4690 // A helper that evaluates a memory access's use of a pointer. If the use will 4691 // be a scalar use and the pointer is only used by memory accesses, we place 4692 // the pointer in ScalarPtrs. Otherwise, the pointer is placed in 4693 // PossibleNonScalarPtrs. 4694 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) { 4695 // We only care about bitcast and getelementptr instructions contained in 4696 // the loop. 4697 if (!isLoopVaryingBitCastOrGEP(Ptr)) 4698 return; 4699 4700 // If the pointer has already been identified as scalar (e.g., if it was 4701 // also identified as uniform), there's nothing to do. 4702 auto *I = cast<Instruction>(Ptr); 4703 if (Worklist.count(I)) 4704 return; 4705 4706 // If the use of the pointer will be a scalar use, and all users of the 4707 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise, 4708 // place the pointer in PossibleNonScalarPtrs. 4709 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) { 4710 return isa<LoadInst>(U) || isa<StoreInst>(U); 4711 })) 4712 ScalarPtrs.insert(I); 4713 else 4714 PossibleNonScalarPtrs.insert(I); 4715 }; 4716 4717 // We seed the scalars analysis with two classes of instructions: (1) 4718 // instructions marked uniform-after-vectorization and (2) bitcast, 4719 // getelementptr and (pointer) phi instructions used by memory accesses 4720 // requiring a scalar use. 4721 // 4722 // (1) Add to the worklist all instructions that have been identified as 4723 // uniform-after-vectorization.
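// For example, a getelementptr that is used only as the address of
// consecutive (widened) loads or stores is uniform after vectorization and is
// therefore also kept scalar; only its lane-0 value is ever needed.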
4724 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end()); 4725 4726 // (2) Add to the worklist all bitcast and getelementptr instructions used by 4727 // memory accesses requiring a scalar use. The pointer operands of loads and 4728 // stores will be scalar as long as the memory accesses is not a gather or 4729 // scatter operation. The value operand of a store will remain scalar if the 4730 // store is scalarized. 4731 for (auto *BB : TheLoop->blocks()) 4732 for (auto &I : *BB) { 4733 if (auto *Load = dyn_cast<LoadInst>(&I)) { 4734 evaluatePtrUse(Load, Load->getPointerOperand()); 4735 } else if (auto *Store = dyn_cast<StoreInst>(&I)) { 4736 evaluatePtrUse(Store, Store->getPointerOperand()); 4737 evaluatePtrUse(Store, Store->getValueOperand()); 4738 } 4739 } 4740 for (auto *I : ScalarPtrs) 4741 if (!PossibleNonScalarPtrs.count(I)) { 4742 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n"); 4743 Worklist.insert(I); 4744 } 4745 4746 // Insert the forced scalars. 4747 // FIXME: Currently widenPHIInstruction() often creates a dead vector 4748 // induction variable when the PHI user is scalarized. 4749 auto ForcedScalar = ForcedScalars.find(VF); 4750 if (ForcedScalar != ForcedScalars.end()) 4751 for (auto *I : ForcedScalar->second) 4752 Worklist.insert(I); 4753 4754 // Expand the worklist by looking through any bitcasts and getelementptr 4755 // instructions we've already identified as scalar. This is similar to the 4756 // expansion step in collectLoopUniforms(); however, here we're only 4757 // expanding to include additional bitcasts and getelementptr instructions. 4758 unsigned Idx = 0; 4759 while (Idx != Worklist.size()) { 4760 Instruction *Dst = Worklist[Idx++]; 4761 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0))) 4762 continue; 4763 auto *Src = cast<Instruction>(Dst->getOperand(0)); 4764 if (llvm::all_of(Src->users(), [&](User *U) -> bool { 4765 auto *J = cast<Instruction>(U); 4766 return !TheLoop->contains(J) || Worklist.count(J) || 4767 ((isa<LoadInst>(J) || isa<StoreInst>(J)) && 4768 isScalarUse(J, Src)); 4769 })) { 4770 Worklist.insert(Src); 4771 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); 4772 } 4773 } 4774 4775 // An induction variable will remain scalar if all users of the induction 4776 // variable and induction variable update remain scalar. 4777 for (auto &Induction : Legal->getInductionVars()) { 4778 auto *Ind = Induction.first; 4779 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4780 4781 // If tail-folding is applied, the primary induction variable will be used 4782 // to feed a vector compare. 4783 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking()) 4784 continue; 4785 4786 // Returns true if \p Indvar is a pointer induction that is used directly by 4787 // load/store instruction \p I. 4788 auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar, 4789 Instruction *I) { 4790 return Induction.second.getKind() == 4791 InductionDescriptor::IK_PtrInduction && 4792 (isa<LoadInst>(I) || isa<StoreInst>(I)) && 4793 Indvar == getLoadStorePointerOperand(I) && isScalarUse(I, Indvar); 4794 }; 4795 4796 // Determine if all users of the induction variable are scalar after 4797 // vectorization. 
4798 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 4799 auto *I = cast<Instruction>(U); 4800 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 4801 IsDirectLoadStoreFromPtrIndvar(Ind, I); 4802 }); 4803 if (!ScalarInd) 4804 continue; 4805 4806 // Determine if all users of the induction variable update instruction are 4807 // scalar after vectorization. 4808 auto ScalarIndUpdate = 4809 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 4810 auto *I = cast<Instruction>(U); 4811 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 4812 IsDirectLoadStoreFromPtrIndvar(IndUpdate, I); 4813 }); 4814 if (!ScalarIndUpdate) 4815 continue; 4816 4817 // The induction variable and its update instruction will remain scalar. 4818 Worklist.insert(Ind); 4819 Worklist.insert(IndUpdate); 4820 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 4821 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 4822 << "\n"); 4823 } 4824 4825 Scalars[VF].insert(Worklist.begin(), Worklist.end()); 4826 } 4827 4828 bool LoopVectorizationCostModel::isScalarWithPredication( 4829 Instruction *I, ElementCount VF) const { 4830 if (!blockNeedsPredicationForAnyReason(I->getParent())) 4831 return false; 4832 switch(I->getOpcode()) { 4833 default: 4834 break; 4835 case Instruction::Load: 4836 case Instruction::Store: { 4837 if (!Legal->isMaskRequired(I)) 4838 return false; 4839 auto *Ptr = getLoadStorePointerOperand(I); 4840 auto *Ty = getLoadStoreType(I); 4841 Type *VTy = Ty; 4842 if (VF.isVector()) 4843 VTy = VectorType::get(Ty, VF); 4844 const Align Alignment = getLoadStoreAlignment(I); 4845 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) || 4846 TTI.isLegalMaskedGather(VTy, Alignment)) 4847 : !(isLegalMaskedStore(Ty, Ptr, Alignment) || 4848 TTI.isLegalMaskedScatter(VTy, Alignment)); 4849 } 4850 case Instruction::UDiv: 4851 case Instruction::SDiv: 4852 case Instruction::SRem: 4853 case Instruction::URem: 4854 return mayDivideByZero(*I); 4855 } 4856 return false; 4857 } 4858 4859 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( 4860 Instruction *I, ElementCount VF) { 4861 assert(isAccessInterleaved(I) && "Expecting interleaved access."); 4862 assert(getWideningDecision(I, VF) == CM_Unknown && 4863 "Decision should not be set yet."); 4864 auto *Group = getInterleavedAccessGroup(I); 4865 assert(Group && "Must have a group."); 4866 4867 // If the instruction's allocated size doesn't equal it's type size, it 4868 // requires padding and will be scalarized. 4869 auto &DL = I->getModule()->getDataLayout(); 4870 auto *ScalarTy = getLoadStoreType(I); 4871 if (hasIrregularType(ScalarTy, DL)) 4872 return false; 4873 4874 // Check if masking is required. 4875 // A Group may need masking for one of two reasons: it resides in a block that 4876 // needs predication, or it was decided to use masking to deal with gaps 4877 // (either a gap at the end of a load-access that may result in a speculative 4878 // load, or any gaps in a store-access). 
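// For instance, a group formed from stores to A[2*i] only (factor 2, a single
// member) leaves a gap in every wide store and so needs a masked store, while
// a load group with a gap at its end may read past the last scalar element
// and needs either a scalar epilogue or a masked load.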
4879 bool PredicatedAccessRequiresMasking = 4880 blockNeedsPredicationForAnyReason(I->getParent()) && 4881 Legal->isMaskRequired(I); 4882 bool LoadAccessWithGapsRequiresEpilogMasking = 4883 isa<LoadInst>(I) && Group->requiresScalarEpilogue() && 4884 !isScalarEpilogueAllowed(); 4885 bool StoreAccessWithGapsRequiresMasking = 4886 isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()); 4887 if (!PredicatedAccessRequiresMasking && 4888 !LoadAccessWithGapsRequiresEpilogMasking && 4889 !StoreAccessWithGapsRequiresMasking) 4890 return true; 4891 4892 // If masked interleaving is required, we expect that the user/target had 4893 // enabled it, because otherwise it either wouldn't have been created or 4894 // it should have been invalidated by the CostModel. 4895 assert(useMaskedInterleavedAccesses(TTI) && 4896 "Masked interleave-groups for predicated accesses are not enabled."); 4897 4898 if (Group->isReverse()) 4899 return false; 4900 4901 auto *Ty = getLoadStoreType(I); 4902 const Align Alignment = getLoadStoreAlignment(I); 4903 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment) 4904 : TTI.isLegalMaskedStore(Ty, Alignment); 4905 } 4906 4907 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( 4908 Instruction *I, ElementCount VF) { 4909 // Get and ensure we have a valid memory instruction. 4910 assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction"); 4911 4912 auto *Ptr = getLoadStorePointerOperand(I); 4913 auto *ScalarTy = getLoadStoreType(I); 4914 4915 // In order to be widened, the pointer should be consecutive, first of all. 4916 if (!Legal->isConsecutivePtr(ScalarTy, Ptr)) 4917 return false; 4918 4919 // If the instruction is a store located in a predicated block, it will be 4920 // scalarized. 4921 if (isScalarWithPredication(I, VF)) 4922 return false; 4923 4924 // If the instruction's allocated size doesn't equal it's type size, it 4925 // requires padding and will be scalarized. 4926 auto &DL = I->getModule()->getDataLayout(); 4927 if (hasIrregularType(ScalarTy, DL)) 4928 return false; 4929 4930 return true; 4931 } 4932 4933 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { 4934 // We should not collect Uniforms more than once per VF. Right now, 4935 // this function is called from collectUniformsAndScalars(), which 4936 // already does this check. Collecting Uniforms for VF=1 does not make any 4937 // sense. 4938 4939 assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() && 4940 "This function should not be visited twice for the same VF"); 4941 4942 // Visit the list of Uniforms. If we'll not find any uniform value, we'll 4943 // not analyze again. Uniforms.count(VF) will return 1. 4944 Uniforms[VF].clear(); 4945 4946 // We now know that the loop is vectorizable! 4947 // Collect instructions inside the loop that will remain uniform after 4948 // vectorization. 4949 4950 // Global values, params and instructions outside of current loop are out of 4951 // scope. 4952 auto isOutOfScope = [&](Value *V) -> bool { 4953 Instruction *I = dyn_cast<Instruction>(V); 4954 return (!I || !TheLoop->contains(I)); 4955 }; 4956 4957 // Worklist containing uniform instructions demanding lane 0. 4958 SetVector<Instruction *> Worklist; 4959 BasicBlock *Latch = TheLoop->getLoopLatch(); 4960 4961 // Add uniform instructions demanding lane 0 to the worklist. 
Instructions 4962 // that are scalar with predication must not be considered uniform after 4963 // vectorization, because that would create an erroneous replicating region 4964 // where only a single instance out of VF should be formed. 4965 // TODO: optimize such seldom cases if found important, see PR40816. 4966 auto addToWorklistIfAllowed = [&](Instruction *I) -> void { 4967 if (isOutOfScope(I)) { 4968 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: " 4969 << *I << "\n"); 4970 return; 4971 } 4972 if (isScalarWithPredication(I, VF)) { 4973 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " 4974 << *I << "\n"); 4975 return; 4976 } 4977 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n"); 4978 Worklist.insert(I); 4979 }; 4980 4981 // Start with the conditional branch. If the branch condition is an 4982 // instruction contained in the loop that is only used by the branch, it is 4983 // uniform. 4984 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 4985 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) 4986 addToWorklistIfAllowed(Cmp); 4987 4988 auto isUniformDecision = [&](Instruction *I, ElementCount VF) { 4989 InstWidening WideningDecision = getWideningDecision(I, VF); 4990 assert(WideningDecision != CM_Unknown && 4991 "Widening decision should be ready at this moment"); 4992 4993 // A uniform memory op is itself uniform. We exclude uniform stores 4994 // here as they demand the last lane, not the first one. 4995 if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) { 4996 assert(WideningDecision == CM_Scalarize); 4997 return true; 4998 } 4999 5000 return (WideningDecision == CM_Widen || 5001 WideningDecision == CM_Widen_Reverse || 5002 WideningDecision == CM_Interleave); 5003 }; 5004 5005 5006 // Returns true if Ptr is the pointer operand of a memory access instruction 5007 // I, and I is known to not require scalarization. 5008 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { 5009 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); 5010 }; 5011 5012 // Holds a list of values which are known to have at least one uniform use. 5013 // Note that there may be other uses which aren't uniform. A "uniform use" 5014 // here is something which only demands lane 0 of the unrolled iterations; 5015 // it does not imply that all lanes produce the same value (e.g. this is not 5016 // the usual meaning of uniform) 5017 SetVector<Value *> HasUniformUse; 5018 5019 // Scan the loop for instructions which are either a) known to have only 5020 // lane 0 demanded or b) are uses which demand only lane 0 of their operand. 5021 for (auto *BB : TheLoop->blocks()) 5022 for (auto &I : *BB) { 5023 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) { 5024 switch (II->getIntrinsicID()) { 5025 case Intrinsic::sideeffect: 5026 case Intrinsic::experimental_noalias_scope_decl: 5027 case Intrinsic::assume: 5028 case Intrinsic::lifetime_start: 5029 case Intrinsic::lifetime_end: 5030 if (TheLoop->hasLoopInvariantOperands(&I)) 5031 addToWorklistIfAllowed(&I); 5032 break; 5033 default: 5034 break; 5035 } 5036 } 5037 5038 // ExtractValue instructions must be uniform, because the operands are 5039 // known to be loop-invariant. 
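// E.g. "%lo = extractvalue { i64, i1 } %res, 0" with %res defined before the
// loop demands only lane 0 and can safely be treated as uniform.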
5040 if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) { 5041 assert(isOutOfScope(EVI->getAggregateOperand()) && 5042 "Expected aggregate value to be loop invariant"); 5043 addToWorklistIfAllowed(EVI); 5044 continue; 5045 } 5046 5047 // If there's no pointer operand, there's nothing to do. 5048 auto *Ptr = getLoadStorePointerOperand(&I); 5049 if (!Ptr) 5050 continue; 5051 5052 // A uniform memory op is itself uniform. We exclude uniform stores 5053 // here as they demand the last lane, not the first one. 5054 if (isa<LoadInst>(I) && Legal->isUniformMemOp(I)) 5055 addToWorklistIfAllowed(&I); 5056 5057 if (isUniformDecision(&I, VF)) { 5058 assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check"); 5059 HasUniformUse.insert(Ptr); 5060 } 5061 } 5062 5063 // Add to the worklist any operands which have *only* uniform (e.g. lane 0 5064 // demanding) users. Since loops are assumed to be in LCSSA form, this 5065 // disallows uses outside the loop as well. 5066 for (auto *V : HasUniformUse) { 5067 if (isOutOfScope(V)) 5068 continue; 5069 auto *I = cast<Instruction>(V); 5070 auto UsersAreMemAccesses = 5071 llvm::all_of(I->users(), [&](User *U) -> bool { 5072 return isVectorizedMemAccessUse(cast<Instruction>(U), V); 5073 }); 5074 if (UsersAreMemAccesses) 5075 addToWorklistIfAllowed(I); 5076 } 5077 5078 // Expand Worklist in topological order: whenever a new instruction 5079 // is added , its users should be already inside Worklist. It ensures 5080 // a uniform instruction will only be used by uniform instructions. 5081 unsigned idx = 0; 5082 while (idx != Worklist.size()) { 5083 Instruction *I = Worklist[idx++]; 5084 5085 for (auto OV : I->operand_values()) { 5086 // isOutOfScope operands cannot be uniform instructions. 5087 if (isOutOfScope(OV)) 5088 continue; 5089 // First order recurrence Phi's should typically be considered 5090 // non-uniform. 5091 auto *OP = dyn_cast<PHINode>(OV); 5092 if (OP && Legal->isFirstOrderRecurrence(OP)) 5093 continue; 5094 // If all the users of the operand are uniform, then add the 5095 // operand into the uniform worklist. 5096 auto *OI = cast<Instruction>(OV); 5097 if (llvm::all_of(OI->users(), [&](User *U) -> bool { 5098 auto *J = cast<Instruction>(U); 5099 return Worklist.count(J) || isVectorizedMemAccessUse(J, OI); 5100 })) 5101 addToWorklistIfAllowed(OI); 5102 } 5103 } 5104 5105 // For an instruction to be added into Worklist above, all its users inside 5106 // the loop should also be in Worklist. However, this condition cannot be 5107 // true for phi nodes that form a cyclic dependence. We must process phi 5108 // nodes separately. An induction variable will remain uniform if all users 5109 // of the induction variable and induction variable update remain uniform. 5110 // The code below handles both pointer and non-pointer induction variables. 5111 for (auto &Induction : Legal->getInductionVars()) { 5112 auto *Ind = Induction.first; 5113 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 5114 5115 // Determine if all users of the induction variable are uniform after 5116 // vectorization. 5117 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 5118 auto *I = cast<Instruction>(U); 5119 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 5120 isVectorizedMemAccessUse(I, Ind); 5121 }); 5122 if (!UniformInd) 5123 continue; 5124 5125 // Determine if all users of the induction variable update instruction are 5126 // uniform after vectorization. 
5127 auto UniformIndUpdate = 5128 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 5129 auto *I = cast<Instruction>(U); 5130 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 5131 isVectorizedMemAccessUse(I, IndUpdate); 5132 }); 5133 if (!UniformIndUpdate) 5134 continue; 5135 5136 // The induction variable and its update instruction will remain uniform. 5137 addToWorklistIfAllowed(Ind); 5138 addToWorklistIfAllowed(IndUpdate); 5139 } 5140 5141 Uniforms[VF].insert(Worklist.begin(), Worklist.end()); 5142 } 5143 5144 bool LoopVectorizationCostModel::runtimeChecksRequired() { 5145 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); 5146 5147 if (Legal->getRuntimePointerChecking()->Need) { 5148 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz", 5149 "runtime pointer checks needed. Enable vectorization of this " 5150 "loop with '#pragma clang loop vectorize(enable)' when " 5151 "compiling with -Os/-Oz", 5152 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5153 return true; 5154 } 5155 5156 if (!PSE.getUnionPredicate().getPredicates().empty()) { 5157 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz", 5158 "runtime SCEV checks needed. Enable vectorization of this " 5159 "loop with '#pragma clang loop vectorize(enable)' when " 5160 "compiling with -Os/-Oz", 5161 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5162 return true; 5163 } 5164 5165 // FIXME: Avoid specializing for stride==1 instead of bailing out. 5166 if (!Legal->getLAI()->getSymbolicStrides().empty()) { 5167 reportVectorizationFailure("Runtime stride check for small trip count", 5168 "runtime stride == 1 checks needed. Enable vectorization of " 5169 "this loop without such check by compiling with -Os/-Oz", 5170 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5171 return true; 5172 } 5173 5174 return false; 5175 } 5176 5177 ElementCount 5178 LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) { 5179 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) 5180 return ElementCount::getScalable(0); 5181 5182 if (Hints->isScalableVectorizationDisabled()) { 5183 reportVectorizationInfo("Scalable vectorization is explicitly disabled", 5184 "ScalableVectorizationDisabled", ORE, TheLoop); 5185 return ElementCount::getScalable(0); 5186 } 5187 5188 LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n"); 5189 5190 auto MaxScalableVF = ElementCount::getScalable( 5191 std::numeric_limits<ElementCount::ScalarTy>::max()); 5192 5193 // Test that the loop-vectorizer can legalize all operations for this MaxVF. 5194 // FIXME: While for scalable vectors this is currently sufficient, this should 5195 // be replaced by a more detailed mechanism that filters out specific VFs, 5196 // instead of invalidating vectorization for a whole set of VFs based on the 5197 // MaxVF. 5198 5199 // Disable scalable vectorization if the loop contains unsupported reductions. 5200 if (!canVectorizeReductions(MaxScalableVF)) { 5201 reportVectorizationInfo( 5202 "Scalable vectorization not supported for the reduction " 5203 "operations found in this loop.", 5204 "ScalableVFUnfeasible", ORE, TheLoop); 5205 return ElementCount::getScalable(0); 5206 } 5207 5208 // Disable scalable vectorization if the loop contains any instructions 5209 // with element types not supported for scalable vectors. 
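// E.g. a loop operating on fp128 values would typically be rejected here,
// since scalable vector registers usually support only the common integer
// and floating-point element types.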
5210 if (any_of(ElementTypesInLoop, [&](Type *Ty) { 5211 return !Ty->isVoidTy() && 5212 !this->TTI.isElementTypeLegalForScalableVector(Ty); 5213 })) { 5214 reportVectorizationInfo("Scalable vectorization is not supported " 5215 "for all element types found in this loop.", 5216 "ScalableVFUnfeasible", ORE, TheLoop); 5217 return ElementCount::getScalable(0); 5218 } 5219 5220 if (Legal->isSafeForAnyVectorWidth()) 5221 return MaxScalableVF; 5222 5223 // Limit MaxScalableVF by the maximum safe dependence distance. 5224 Optional<unsigned> MaxVScale = TTI.getMaxVScale(); 5225 if (!MaxVScale && TheFunction->hasFnAttribute(Attribute::VScaleRange)) 5226 MaxVScale = 5227 TheFunction->getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax(); 5228 MaxScalableVF = ElementCount::getScalable( 5229 MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0); 5230 if (!MaxScalableVF) 5231 reportVectorizationInfo( 5232 "Max legal vector width too small, scalable vectorization " 5233 "unfeasible.", 5234 "ScalableVFUnfeasible", ORE, TheLoop); 5235 5236 return MaxScalableVF; 5237 } 5238 5239 FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF( 5240 unsigned ConstTripCount, ElementCount UserVF, bool FoldTailByMasking) { 5241 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); 5242 unsigned SmallestType, WidestType; 5243 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); 5244 5245 // Get the maximum safe dependence distance in bits computed by LAA. 5246 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from 5247 // the memory accesses that is most restrictive (involved in the smallest 5248 // dependence distance). 5249 unsigned MaxSafeElements = 5250 PowerOf2Floor(Legal->getMaxSafeVectorWidthInBits() / WidestType); 5251 5252 auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements); 5253 auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements); 5254 5255 LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF 5256 << ".\n"); 5257 LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF 5258 << ".\n"); 5259 5260 // First analyze the UserVF, fall back if the UserVF should be ignored. 5261 if (UserVF) { 5262 auto MaxSafeUserVF = 5263 UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF; 5264 5265 if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) { 5266 // If `VF=vscale x N` is safe, then so is `VF=N` 5267 if (UserVF.isScalable()) 5268 return FixedScalableVFPair( 5269 ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF); 5270 else 5271 return UserVF; 5272 } 5273 5274 assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF)); 5275 5276 // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it 5277 // is better to ignore the hint and let the compiler choose a suitable VF. 
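// For illustration: a fixed-width request such as
// "#pragma clang loop vectorize_width(8)" is clamped to 4 if the memory
// dependences only allow 4 lanes, whereas an unsafe scalable request (e.g.
// vectorize_width(8, scalable)) is dropped and the compiler picks the VF.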
5278 if (!UserVF.isScalable()) { 5279 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 5280 << " is unsafe, clamping to max safe VF=" 5281 << MaxSafeFixedVF << ".\n"); 5282 ORE->emit([&]() { 5283 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 5284 TheLoop->getStartLoc(), 5285 TheLoop->getHeader()) 5286 << "User-specified vectorization factor " 5287 << ore::NV("UserVectorizationFactor", UserVF) 5288 << " is unsafe, clamping to maximum safe vectorization factor " 5289 << ore::NV("VectorizationFactor", MaxSafeFixedVF); 5290 }); 5291 return MaxSafeFixedVF; 5292 } 5293 5294 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) { 5295 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 5296 << " is ignored because scalable vectors are not " 5297 "available.\n"); 5298 ORE->emit([&]() { 5299 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 5300 TheLoop->getStartLoc(), 5301 TheLoop->getHeader()) 5302 << "User-specified vectorization factor " 5303 << ore::NV("UserVectorizationFactor", UserVF) 5304 << " is ignored because the target does not support scalable " 5305 "vectors. The compiler will pick a more suitable value."; 5306 }); 5307 } else { 5308 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 5309 << " is unsafe. Ignoring scalable UserVF.\n"); 5310 ORE->emit([&]() { 5311 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 5312 TheLoop->getStartLoc(), 5313 TheLoop->getHeader()) 5314 << "User-specified vectorization factor " 5315 << ore::NV("UserVectorizationFactor", UserVF) 5316 << " is unsafe. Ignoring the hint to let the compiler pick a " 5317 "more suitable value."; 5318 }); 5319 } 5320 } 5321 5322 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType 5323 << " / " << WidestType << " bits.\n"); 5324 5325 FixedScalableVFPair Result(ElementCount::getFixed(1), 5326 ElementCount::getScalable(0)); 5327 if (auto MaxVF = 5328 getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType, 5329 MaxSafeFixedVF, FoldTailByMasking)) 5330 Result.FixedVF = MaxVF; 5331 5332 if (auto MaxVF = 5333 getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType, 5334 MaxSafeScalableVF, FoldTailByMasking)) 5335 if (MaxVF.isScalable()) { 5336 Result.ScalableVF = MaxVF; 5337 LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF 5338 << "\n"); 5339 } 5340 5341 return Result; 5342 } 5343 5344 FixedScalableVFPair 5345 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { 5346 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { 5347 // TODO: It may by useful to do since it's still likely to be dynamically 5348 // uniform if the target can skip. 5349 reportVectorizationFailure( 5350 "Not inserting runtime ptr check for divergent target", 5351 "runtime pointer checks needed. 
Not enabled for divergent target", 5352 "CantVersionLoopWithDivergentTarget", ORE, TheLoop); 5353 return FixedScalableVFPair::getNone(); 5354 } 5355 5356 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); 5357 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); 5358 if (TC == 1) { 5359 reportVectorizationFailure("Single iteration (non) loop", 5360 "loop trip count is one, irrelevant for vectorization", 5361 "SingleIterationLoop", ORE, TheLoop); 5362 return FixedScalableVFPair::getNone(); 5363 } 5364 5365 switch (ScalarEpilogueStatus) { 5366 case CM_ScalarEpilogueAllowed: 5367 return computeFeasibleMaxVF(TC, UserVF, false); 5368 case CM_ScalarEpilogueNotAllowedUsePredicate: 5369 LLVM_FALLTHROUGH; 5370 case CM_ScalarEpilogueNotNeededUsePredicate: 5371 LLVM_DEBUG( 5372 dbgs() << "LV: vector predicate hint/switch found.\n" 5373 << "LV: Not allowing scalar epilogue, creating predicated " 5374 << "vector loop.\n"); 5375 break; 5376 case CM_ScalarEpilogueNotAllowedLowTripLoop: 5377 // fallthrough as a special case of OptForSize 5378 case CM_ScalarEpilogueNotAllowedOptSize: 5379 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize) 5380 LLVM_DEBUG( 5381 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n"); 5382 else 5383 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip " 5384 << "count.\n"); 5385 5386 // Bail if runtime checks are required, which are not good when optimising 5387 // for size. 5388 if (runtimeChecksRequired()) 5389 return FixedScalableVFPair::getNone(); 5390 5391 break; 5392 } 5393 5394 // The only loops we can vectorize without a scalar epilogue are loops with 5395 // a bottom-test and a single exiting block. We'd have to handle the fact 5396 // that not every instruction executes on the last iteration. This will 5397 // require a lane mask which varies through the vector loop body. (TODO) 5398 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) { 5399 // If there was a tail-folding hint/switch, but we can't fold the tail by 5400 // masking, fall back to a vectorization with a scalar epilogue. 5401 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { 5402 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " 5403 "scalar epilogue instead.\n"); 5404 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 5405 return computeFeasibleMaxVF(TC, UserVF, false); 5406 } 5407 return FixedScalableVFPair::getNone(); 5408 } 5409 5410 // Now try to fold the tail by masking. 5411 5412 // Invalidate interleave groups that require an epilogue if we can't mask 5413 // the interleave-group. 5414 if (!useMaskedInterleavedAccesses(TTI)) { 5415 assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() && 5416 "No decisions should have been taken at this point"); 5417 // Note: There is no need to invalidate any cost modeling decisions here, as 5418 // none were taken so far. 5419 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue(); 5420 } 5421 5422 FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF, true); 5423 // Avoid tail folding if the trip count is known to be a multiple of any VF 5424 // we chose. 5425 // FIXME: The condition below pessimises the case for fixed-width vectors, 5426 // when scalable VFs are also candidates for vectorization.
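// For illustration of the check below: with a trip count known to be 64,
// MaxFixedVF = 8 and no user-specified interleave count, the remainder
// 64 urem 8 is 0, so no tail remains and tail folding is skipped.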
5427 if (MaxFactors.FixedVF.isVector() && !MaxFactors.ScalableVF) { 5428 ElementCount MaxFixedVF = MaxFactors.FixedVF; 5429 assert((UserVF.isNonZero() || isPowerOf2_32(MaxFixedVF.getFixedValue())) && 5430 "MaxFixedVF must be a power of 2"); 5431 unsigned MaxVFtimesIC = UserIC ? MaxFixedVF.getFixedValue() * UserIC 5432 : MaxFixedVF.getFixedValue(); 5433 ScalarEvolution *SE = PSE.getSE(); 5434 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 5435 const SCEV *ExitCount = SE->getAddExpr( 5436 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 5437 const SCEV *Rem = SE->getURemExpr( 5438 SE->applyLoopGuards(ExitCount, TheLoop), 5439 SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC)); 5440 if (Rem->isZero()) { 5441 // Accept MaxFixedVF if we do not have a tail. 5442 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); 5443 return MaxFactors; 5444 } 5445 } 5446 5447 // For scalable vectors don't use tail folding for low trip counts or 5448 // optimizing for code size. We only permit this if the user has explicitly 5449 // requested it. 5450 if (ScalarEpilogueStatus != CM_ScalarEpilogueNotNeededUsePredicate && 5451 ScalarEpilogueStatus != CM_ScalarEpilogueNotAllowedUsePredicate && 5452 MaxFactors.ScalableVF.isVector()) 5453 MaxFactors.ScalableVF = ElementCount::getScalable(0); 5454 5455 // If we don't know the precise trip count, or if the trip count that we 5456 // found modulo the vectorization factor is not zero, try to fold the tail 5457 // by masking. 5458 // FIXME: look for a smaller MaxVF that does divide TC rather than masking. 5459 if (Legal->prepareToFoldTailByMasking()) { 5460 FoldTailByMasking = true; 5461 return MaxFactors; 5462 } 5463 5464 // If there was a tail-folding hint/switch, but we can't fold the tail by 5465 // masking, fallback to a vectorization with a scalar epilogue. 5466 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { 5467 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " 5468 "scalar epilogue instead.\n"); 5469 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 5470 return MaxFactors; 5471 } 5472 5473 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) { 5474 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n"); 5475 return FixedScalableVFPair::getNone(); 5476 } 5477 5478 if (TC == 0) { 5479 reportVectorizationFailure( 5480 "Unable to calculate the loop count due to complex control flow", 5481 "unable to calculate the loop count due to complex control flow", 5482 "UnknownLoopCountComplexCFG", ORE, TheLoop); 5483 return FixedScalableVFPair::getNone(); 5484 } 5485 5486 reportVectorizationFailure( 5487 "Cannot optimize for size and vectorize at the same time.", 5488 "cannot optimize for size and vectorize at the same time. " 5489 "Enable vectorization of this loop with '#pragma clang loop " 5490 "vectorize(enable)' when compiling with -Os/-Oz", 5491 "NoTailLoopWithOptForSize", ORE, TheLoop); 5492 return FixedScalableVFPair::getNone(); 5493 } 5494 5495 ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( 5496 unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType, 5497 const ElementCount &MaxSafeVF, bool FoldTailByMasking) { 5498 bool ComputeScalableMaxVF = MaxSafeVF.isScalable(); 5499 TypeSize WidestRegister = TTI.getRegisterBitWidth( 5500 ComputeScalableMaxVF ? 
TargetTransformInfo::RGK_ScalableVector 5501 : TargetTransformInfo::RGK_FixedWidthVector); 5502 5503 // Convenience function to return the minimum of two ElementCounts. 5504 auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) { 5505 assert((LHS.isScalable() == RHS.isScalable()) && 5506 "Scalable flags must match"); 5507 return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS; 5508 }; 5509 5510 // Ensure MaxVF is a power of 2; the dependence distance bound may not be. 5511 // Note that both WidestRegister and WidestType may not be a powers of 2. 5512 auto MaxVectorElementCount = ElementCount::get( 5513 PowerOf2Floor(WidestRegister.getKnownMinSize() / WidestType), 5514 ComputeScalableMaxVF); 5515 MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF); 5516 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " 5517 << (MaxVectorElementCount * WidestType) << " bits.\n"); 5518 5519 if (!MaxVectorElementCount) { 5520 LLVM_DEBUG(dbgs() << "LV: The target has no " 5521 << (ComputeScalableMaxVF ? "scalable" : "fixed") 5522 << " vector registers.\n"); 5523 return ElementCount::getFixed(1); 5524 } 5525 5526 const auto TripCountEC = ElementCount::getFixed(ConstTripCount); 5527 if (ConstTripCount && 5528 ElementCount::isKnownLE(TripCountEC, MaxVectorElementCount) && 5529 (!FoldTailByMasking || isPowerOf2_32(ConstTripCount))) { 5530 // If loop trip count (TC) is known at compile time there is no point in 5531 // choosing VF greater than TC (as done in the loop below). Select maximum 5532 // power of two which doesn't exceed TC. 5533 // If MaxVectorElementCount is scalable, we only fall back on a fixed VF 5534 // when the TC is less than or equal to the known number of lanes. 5535 auto ClampedConstTripCount = PowerOf2Floor(ConstTripCount); 5536 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not " 5537 "exceeding the constant trip count: " 5538 << ClampedConstTripCount << "\n"); 5539 return ElementCount::getFixed(ClampedConstTripCount); 5540 } 5541 5542 ElementCount MaxVF = MaxVectorElementCount; 5543 if (TTI.shouldMaximizeVectorBandwidth() || 5544 (MaximizeBandwidth && isScalarEpilogueAllowed())) { 5545 auto MaxVectorElementCountMaxBW = ElementCount::get( 5546 PowerOf2Floor(WidestRegister.getKnownMinSize() / SmallestType), 5547 ComputeScalableMaxVF); 5548 MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF); 5549 5550 // Collect all viable vectorization factors larger than the default MaxVF 5551 // (i.e. MaxVectorElementCount). 5552 SmallVector<ElementCount, 8> VFs; 5553 for (ElementCount VS = MaxVectorElementCount * 2; 5554 ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2) 5555 VFs.push_back(VS); 5556 5557 // For each VF calculate its register usage. 5558 auto RUs = calculateRegisterUsage(VFs); 5559 5560 // Select the largest VF which doesn't require more registers than existing 5561 // ones. 
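  // Rough sketch of the selection loop below (register counts hypothetical):
  // with MaxVectorElementCount = 4 and MaxVectorElementCountMaxBW = 16, the
  // candidate VFs are {8, 16}. If the register usage computed for VF=16
  // exceeds TTI.getNumberOfRegisters for some class while VF=8 fits, the loop
  // settles on MaxVF = 8; if nothing fits, MaxVF stays at
  // MaxVectorElementCount.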
5562 for (int i = RUs.size() - 1; i >= 0; --i) { 5563 bool Selected = true; 5564 for (auto &pair : RUs[i].MaxLocalUsers) { 5565 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5566 if (pair.second > TargetNumRegisters) 5567 Selected = false; 5568 } 5569 if (Selected) { 5570 MaxVF = VFs[i]; 5571 break; 5572 } 5573 } 5574 if (ElementCount MinVF = 5575 TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) { 5576 if (ElementCount::isKnownLT(MaxVF, MinVF)) { 5577 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF 5578 << ") with target's minimum: " << MinVF << '\n'); 5579 MaxVF = MinVF; 5580 } 5581 } 5582 } 5583 return MaxVF; 5584 } 5585 5586 Optional<unsigned> LoopVectorizationCostModel::getVScaleForTuning() const { 5587 if (TheFunction->hasFnAttribute(Attribute::VScaleRange)) { 5588 auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange); 5589 auto Min = Attr.getVScaleRangeMin(); 5590 auto Max = Attr.getVScaleRangeMax(); 5591 if (Max && Min == Max) 5592 return Max; 5593 } 5594 5595 return TTI.getVScaleForTuning(); 5596 } 5597 5598 bool LoopVectorizationCostModel::isMoreProfitable( 5599 const VectorizationFactor &A, const VectorizationFactor &B) const { 5600 InstructionCost CostA = A.Cost; 5601 InstructionCost CostB = B.Cost; 5602 5603 unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop); 5604 5605 if (!A.Width.isScalable() && !B.Width.isScalable() && FoldTailByMasking && 5606 MaxTripCount) { 5607 // If we are folding the tail and the trip count is a known (possibly small) 5608 // constant, the trip count will be rounded up to an integer number of 5609 // iterations. The total cost will be PerIterationCost*ceil(TripCount/VF), 5610 // which we compare directly. When not folding the tail, the total cost will 5611 // be PerIterationCost*floor(TC/VF) + Scalar remainder cost, and so is 5612 // approximated with the per-lane cost below instead of using the tripcount 5613 // as here. 5614 auto RTCostA = CostA * divideCeil(MaxTripCount, A.Width.getFixedValue()); 5615 auto RTCostB = CostB * divideCeil(MaxTripCount, B.Width.getFixedValue()); 5616 return RTCostA < RTCostB; 5617 } 5618 5619 // Improve estimate for the vector width if it is scalable. 5620 unsigned EstimatedWidthA = A.Width.getKnownMinValue(); 5621 unsigned EstimatedWidthB = B.Width.getKnownMinValue(); 5622 if (Optional<unsigned> VScale = getVScaleForTuning()) { 5623 if (A.Width.isScalable()) 5624 EstimatedWidthA *= VScale.getValue(); 5625 if (B.Width.isScalable()) 5626 EstimatedWidthB *= VScale.getValue(); 5627 } 5628 5629 // Assume vscale may be larger than 1 (or the value being tuned for), 5630 // so that scalable vectorization is slightly favorable over fixed-width 5631 // vectorization. 
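  // Hedged example of the comparison below: for A = {vscale x 4, cost 20} and
  // B = {fixed 8, cost 20} with a tuning vscale of 2, EstimatedWidthA is 8
  // and the scalable candidate wins the tie because the comparison uses <=
  // rather than <.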
5632 if (A.Width.isScalable() && !B.Width.isScalable()) 5633 return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA); 5634 5635 // To avoid the need for FP division: 5636 // (CostA / A.Width) < (CostB / B.Width) 5637 // <=> (CostA * B.Width) < (CostB * A.Width) 5638 return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA); 5639 } 5640 5641 VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor( 5642 const ElementCountSet &VFCandidates) { 5643 InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first; 5644 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n"); 5645 assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop"); 5646 assert(VFCandidates.count(ElementCount::getFixed(1)) && 5647 "Expected Scalar VF to be a candidate"); 5648 5649 const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost); 5650 VectorizationFactor ChosenFactor = ScalarCost; 5651 5652 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; 5653 if (ForceVectorization && VFCandidates.size() > 1) { 5654 // Ignore scalar width, because the user explicitly wants vectorization. 5655 // Initialize cost to max so that VF = 2 is, at least, chosen during cost 5656 // evaluation. 5657 ChosenFactor.Cost = InstructionCost::getMax(); 5658 } 5659 5660 SmallVector<InstructionVFPair> InvalidCosts; 5661 for (const auto &i : VFCandidates) { 5662 // The cost for scalar VF=1 is already calculated, so ignore it. 5663 if (i.isScalar()) 5664 continue; 5665 5666 VectorizationCostTy C = expectedCost(i, &InvalidCosts); 5667 VectorizationFactor Candidate(i, C.first); 5668 5669 #ifndef NDEBUG 5670 unsigned AssumedMinimumVscale = 1; 5671 if (Optional<unsigned> VScale = getVScaleForTuning()) 5672 AssumedMinimumVscale = VScale.getValue(); 5673 unsigned Width = 5674 Candidate.Width.isScalable() 5675 ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale 5676 : Candidate.Width.getFixedValue(); 5677 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i 5678 << " costs: " << (Candidate.Cost / Width)); 5679 if (i.isScalable()) 5680 LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of " 5681 << AssumedMinimumVscale << ")"); 5682 LLVM_DEBUG(dbgs() << ".\n"); 5683 #endif 5684 5685 if (!C.second && !ForceVectorization) { 5686 LLVM_DEBUG( 5687 dbgs() << "LV: Not considering vector loop of width " << i 5688 << " because it will not generate any vector instructions.\n"); 5689 continue; 5690 } 5691 5692 // If profitable add it to ProfitableVF list. 5693 if (isMoreProfitable(Candidate, ScalarCost)) 5694 ProfitableVFs.push_back(Candidate); 5695 5696 if (isMoreProfitable(Candidate, ChosenFactor)) 5697 ChosenFactor = Candidate; 5698 } 5699 5700 // Emit a report of VFs with invalid costs in the loop. 5701 if (!InvalidCosts.empty()) { 5702 // Group the remarks per instruction, keeping the instruction order from 5703 // InvalidCosts. 5704 std::map<Instruction *, unsigned> Numbering; 5705 unsigned I = 0; 5706 for (auto &Pair : InvalidCosts) 5707 if (!Numbering.count(Pair.first)) 5708 Numbering[Pair.first] = I++; 5709 5710 // Sort the list, first on instruction(number) then on VF. 
5711 llvm::sort(InvalidCosts, 5712 [&Numbering](InstructionVFPair &A, InstructionVFPair &B) { 5713 if (Numbering[A.first] != Numbering[B.first]) 5714 return Numbering[A.first] < Numbering[B.first]; 5715 ElementCountComparator ECC; 5716 return ECC(A.second, B.second); 5717 }); 5718 5719 // For a list of ordered instruction-vf pairs: 5720 // [(load, vf1), (load, vf2), (store, vf1)] 5721 // Group the instructions together to emit separate remarks for: 5722 // load (vf1, vf2) 5723 // store (vf1) 5724 auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts); 5725 auto Subset = ArrayRef<InstructionVFPair>(); 5726 do { 5727 if (Subset.empty()) 5728 Subset = Tail.take_front(1); 5729 5730 Instruction *I = Subset.front().first; 5731 5732 // If the next instruction is different, or if there are no other pairs, 5733 // emit a remark for the collated subset. e.g. 5734 // [(load, vf1), (load, vf2))] 5735 // to emit: 5736 // remark: invalid costs for 'load' at VF=(vf, vf2) 5737 if (Subset == Tail || Tail[Subset.size()].first != I) { 5738 std::string OutString; 5739 raw_string_ostream OS(OutString); 5740 assert(!Subset.empty() && "Unexpected empty range"); 5741 OS << "Instruction with invalid costs prevented vectorization at VF=("; 5742 for (auto &Pair : Subset) 5743 OS << (Pair.second == Subset.front().second ? "" : ", ") 5744 << Pair.second; 5745 OS << "):"; 5746 if (auto *CI = dyn_cast<CallInst>(I)) 5747 OS << " call to " << CI->getCalledFunction()->getName(); 5748 else 5749 OS << " " << I->getOpcodeName(); 5750 OS.flush(); 5751 reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I); 5752 Tail = Tail.drop_front(Subset.size()); 5753 Subset = {}; 5754 } else 5755 // Grow the subset by one element 5756 Subset = Tail.take_front(Subset.size() + 1); 5757 } while (!Tail.empty()); 5758 } 5759 5760 if (!EnableCondStoresVectorization && NumPredStores) { 5761 reportVectorizationFailure("There are conditional stores.", 5762 "store that is conditionally executed prevents vectorization", 5763 "ConditionalStore", ORE, TheLoop); 5764 ChosenFactor = ScalarCost; 5765 } 5766 5767 LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() && 5768 ChosenFactor.Cost >= ScalarCost.Cost) dbgs() 5769 << "LV: Vectorization seems to be not beneficial, " 5770 << "but was forced by a user.\n"); 5771 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n"); 5772 return ChosenFactor; 5773 } 5774 5775 bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization( 5776 const Loop &L, ElementCount VF) const { 5777 // Cross iteration phis such as reductions need special handling and are 5778 // currently unsupported. 5779 if (any_of(L.getHeader()->phis(), 5780 [&](PHINode &Phi) { return Legal->isFirstOrderRecurrence(&Phi); })) 5781 return false; 5782 5783 // Phis with uses outside of the loop require special handling and are 5784 // currently unsupported. 5785 for (auto &Entry : Legal->getInductionVars()) { 5786 // Look for uses of the value of the induction at the last iteration. 5787 Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch()); 5788 for (User *U : PostInc->users()) 5789 if (!L.contains(cast<Instruction>(U))) 5790 return false; 5791 // Look for uses of penultimate value of the induction. 5792 for (User *U : Entry.first->users()) 5793 if (!L.contains(cast<Instruction>(U))) 5794 return false; 5795 } 5796 5797 // Induction variables that are widened require special handling that is 5798 // currently not supported. 
  if (any_of(Legal->getInductionVars(), [&](auto &Entry) {
        return !(this->isScalarAfterVectorization(Entry.first, VF) ||
                 this->isProfitableToScalarize(Entry.first, VF));
      }))
    return false;

  // Epilogue vectorization code has not been audited to ensure it handles
  // non-latch exits properly. It may be fine, but it needs to be audited and
  // tested.
  if (L.getExitingBlock() != L.getLoopLatch())
    return false;

  return true;
}

bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
    const ElementCount VF) const {
  // FIXME: We need a much better cost-model to take different parameters such
  // as register pressure, code size increase and cost of extra branches into
  // account. For now we apply a very crude heuristic and only consider loops
  // with vectorization factors larger than a certain value.
  // We also consider epilogue vectorization unprofitable for targets that
  // don't consider interleaving beneficial (e.g. MVE).
  if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1)
    return false;
  // FIXME: We should consider changing the threshold for scalable
  // vectors to take VScaleForTuning into account.
  if (VF.getKnownMinValue() >= EpilogueVectorizationMinVF)
    return true;
  return false;
}

VectorizationFactor
LoopVectorizationCostModel::selectEpilogueVectorizationFactor(
    const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) {
  VectorizationFactor Result = VectorizationFactor::Disabled();
  if (!EnableEpilogueVectorization) {
    LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";);
    return Result;
  }

  if (!isScalarEpilogueAllowed()) {
    LLVM_DEBUG(
        dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is "
                  "allowed.\n";);
    return Result;
  }

  // Not really a cost consideration, but check for unsupported cases here to
  // simplify the logic.
  if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) {
    LLVM_DEBUG(
        dbgs() << "LEV: Unable to vectorize epilogue because the loop is "
                  "not a supported candidate.\n";);
    return Result;
  }

  if (EpilogueVectorizationForceVF > 1) {
    LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";);
    ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF);
    if (LVP.hasPlanWithVF(ForcedEC))
      return {ForcedEC, 0};
    else {
      LLVM_DEBUG(
          dbgs()
          << "LEV: Epilogue vectorization forced factor is not viable.\n";);
      return Result;
    }
  }

  if (TheLoop->getHeader()->getParent()->hasOptSize() ||
      TheLoop->getHeader()->getParent()->hasMinSize()) {
    LLVM_DEBUG(
        dbgs()
        << "LEV: Epilogue vectorization skipped due to opt for size.\n";);
    return Result;
  }

  if (!isEpilogueVectorizationProfitable(MainLoopVF)) {
    LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
                         "this loop\n");
    return Result;
  }

  // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know
  // the main loop handles 8 lanes per iteration. We could still benefit from
  // vectorizing the epilogue loop with VF=4.
5886 ElementCount EstimatedRuntimeVF = MainLoopVF; 5887 if (MainLoopVF.isScalable()) { 5888 EstimatedRuntimeVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue()); 5889 if (Optional<unsigned> VScale = getVScaleForTuning()) 5890 EstimatedRuntimeVF *= VScale.getValue(); 5891 } 5892 5893 for (auto &NextVF : ProfitableVFs) 5894 if (((!NextVF.Width.isScalable() && MainLoopVF.isScalable() && 5895 ElementCount::isKnownLT(NextVF.Width, EstimatedRuntimeVF)) || 5896 ElementCount::isKnownLT(NextVF.Width, MainLoopVF)) && 5897 (Result.Width.isScalar() || isMoreProfitable(NextVF, Result)) && 5898 LVP.hasPlanWithVF(NextVF.Width)) 5899 Result = NextVF; 5900 5901 if (Result != VectorizationFactor::Disabled()) 5902 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = " 5903 << Result.Width << "\n";); 5904 return Result; 5905 } 5906 5907 std::pair<unsigned, unsigned> 5908 LoopVectorizationCostModel::getSmallestAndWidestTypes() { 5909 unsigned MinWidth = -1U; 5910 unsigned MaxWidth = 8; 5911 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 5912 // For in-loop reductions, no element types are added to ElementTypesInLoop 5913 // if there are no loads/stores in the loop. In this case, check through the 5914 // reduction variables to determine the maximum width. 5915 if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) { 5916 // Reset MaxWidth so that we can find the smallest type used by recurrences 5917 // in the loop. 5918 MaxWidth = -1U; 5919 for (auto &PhiDescriptorPair : Legal->getReductionVars()) { 5920 const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second; 5921 // When finding the min width used by the recurrence we need to account 5922 // for casts on the input operands of the recurrence. 5923 MaxWidth = std::min<unsigned>( 5924 MaxWidth, std::min<unsigned>( 5925 RdxDesc.getMinWidthCastToRecurrenceTypeInBits(), 5926 RdxDesc.getRecurrenceType()->getScalarSizeInBits())); 5927 } 5928 } else { 5929 for (Type *T : ElementTypesInLoop) { 5930 MinWidth = std::min<unsigned>( 5931 MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize()); 5932 MaxWidth = std::max<unsigned>( 5933 MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize()); 5934 } 5935 } 5936 return {MinWidth, MaxWidth}; 5937 } 5938 5939 void LoopVectorizationCostModel::collectElementTypesForWidening() { 5940 ElementTypesInLoop.clear(); 5941 // For each block. 5942 for (BasicBlock *BB : TheLoop->blocks()) { 5943 // For each instruction in the loop. 5944 for (Instruction &I : BB->instructionsWithoutDebug()) { 5945 Type *T = I.getType(); 5946 5947 // Skip ignored values. 5948 if (ValuesToIgnore.count(&I)) 5949 continue; 5950 5951 // Only examine Loads, Stores and PHINodes. 5952 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I)) 5953 continue; 5954 5955 // Examine PHI nodes that are reduction variables. Update the type to 5956 // account for the recurrence type. 5957 if (auto *PN = dyn_cast<PHINode>(&I)) { 5958 if (!Legal->isReductionVariable(PN)) 5959 continue; 5960 const RecurrenceDescriptor &RdxDesc = 5961 Legal->getReductionVars().find(PN)->second; 5962 if (PreferInLoopReductions || useOrderedReductions(RdxDesc) || 5963 TTI.preferInLoopReduction(RdxDesc.getOpcode(), 5964 RdxDesc.getRecurrenceType(), 5965 TargetTransformInfo::ReductionFlags())) 5966 continue; 5967 T = RdxDesc.getRecurrenceType(); 5968 } 5969 5970 // Examine the stored values. 
5971 if (auto *ST = dyn_cast<StoreInst>(&I)) 5972 T = ST->getValueOperand()->getType(); 5973 5974 assert(T->isSized() && 5975 "Expected the load/store/recurrence type to be sized"); 5976 5977 ElementTypesInLoop.insert(T); 5978 } 5979 } 5980 } 5981 5982 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, 5983 unsigned LoopCost) { 5984 // -- The interleave heuristics -- 5985 // We interleave the loop in order to expose ILP and reduce the loop overhead. 5986 // There are many micro-architectural considerations that we can't predict 5987 // at this level. For example, frontend pressure (on decode or fetch) due to 5988 // code size, or the number and capabilities of the execution ports. 5989 // 5990 // We use the following heuristics to select the interleave count: 5991 // 1. If the code has reductions, then we interleave to break the cross 5992 // iteration dependency. 5993 // 2. If the loop is really small, then we interleave to reduce the loop 5994 // overhead. 5995 // 3. We don't interleave if we think that we will spill registers to memory 5996 // due to the increased register pressure. 5997 5998 if (!isScalarEpilogueAllowed()) 5999 return 1; 6000 6001 // We used the distance for the interleave count. 6002 if (Legal->getMaxSafeDepDistBytes() != -1U) 6003 return 1; 6004 6005 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); 6006 const bool HasReductions = !Legal->getReductionVars().empty(); 6007 // Do not interleave loops with a relatively small known or estimated trip 6008 // count. But we will interleave when InterleaveSmallLoopScalarReduction is 6009 // enabled, and the code has scalar reductions(HasReductions && VF = 1), 6010 // because with the above conditions interleaving can expose ILP and break 6011 // cross iteration dependences for reductions. 6012 if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) && 6013 !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar())) 6014 return 1; 6015 6016 RegisterUsage R = calculateRegisterUsage({VF})[0]; 6017 // We divide by these constants so assume that we have at least one 6018 // instruction that uses at least one register. 6019 for (auto& pair : R.MaxLocalUsers) { 6020 pair.second = std::max(pair.second, 1U); 6021 } 6022 6023 // We calculate the interleave count using the following formula. 6024 // Subtract the number of loop invariants from the number of available 6025 // registers. These registers are used by all of the interleaved instances. 6026 // Next, divide the remaining registers by the number of registers that is 6027 // required by the loop, in order to estimate how many parallel instances 6028 // fit without causing spills. All of this is rounded down if necessary to be 6029 // a power of two. We want power of two interleave count to simplify any 6030 // addressing operations or alignment considerations. 6031 // We also want power of two interleave counts to ensure that the induction 6032 // variable of the vector loop wraps to zero, when tail is folded by masking; 6033 // this currently happens when OptForSize, in which case IC is set to 1 above. 
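  // Worked example of the formula below (register counts are hypothetical):
  // with 32 registers in a class, 2 of them used by loop-invariant values and
  // a peak of 5 local users, the induction-variable heuristic yields
  //   IC = PowerOf2Floor((32 - 2 - 1) / max(1, 5 - 1)) = PowerOf2Floor(7) = 4.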
6034 unsigned IC = UINT_MAX; 6035 6036 for (auto& pair : R.MaxLocalUsers) { 6037 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 6038 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters 6039 << " registers of " 6040 << TTI.getRegisterClassName(pair.first) << " register class\n"); 6041 if (VF.isScalar()) { 6042 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) 6043 TargetNumRegisters = ForceTargetNumScalarRegs; 6044 } else { 6045 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) 6046 TargetNumRegisters = ForceTargetNumVectorRegs; 6047 } 6048 unsigned MaxLocalUsers = pair.second; 6049 unsigned LoopInvariantRegs = 0; 6050 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) 6051 LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; 6052 6053 unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers); 6054 // Don't count the induction variable as interleaved. 6055 if (EnableIndVarRegisterHeur) { 6056 TmpIC = 6057 PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) / 6058 std::max(1U, (MaxLocalUsers - 1))); 6059 } 6060 6061 IC = std::min(IC, TmpIC); 6062 } 6063 6064 // Clamp the interleave ranges to reasonable counts. 6065 unsigned MaxInterleaveCount = 6066 TTI.getMaxInterleaveFactor(VF.getKnownMinValue()); 6067 6068 // Check if the user has overridden the max. 6069 if (VF.isScalar()) { 6070 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) 6071 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; 6072 } else { 6073 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) 6074 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; 6075 } 6076 6077 // If trip count is known or estimated compile time constant, limit the 6078 // interleave count to be less than the trip count divided by VF, provided it 6079 // is at least 1. 6080 // 6081 // For scalable vectors we can't know if interleaving is beneficial. It may 6082 // not be beneficial for small loops if none of the lanes in the second vector 6083 // iterations is enabled. However, for larger loops, there is likely to be a 6084 // similar benefit as for fixed-width vectors. For now, we choose to leave 6085 // the InterleaveCount as if vscale is '1', although if some information about 6086 // the vector is known (e.g. min vector size), we can make a better decision. 6087 if (BestKnownTC) { 6088 MaxInterleaveCount = 6089 std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount); 6090 // Make sure MaxInterleaveCount is greater than 0. 6091 MaxInterleaveCount = std::max(1u, MaxInterleaveCount); 6092 } 6093 6094 assert(MaxInterleaveCount > 0 && 6095 "Maximum interleave count must be greater than 0"); 6096 6097 // Clamp the calculated IC to be between the 1 and the max interleave count 6098 // that the target and trip count allows. 6099 if (IC > MaxInterleaveCount) 6100 IC = MaxInterleaveCount; 6101 else 6102 // Make sure IC is greater than 0. 6103 IC = std::max(1u, IC); 6104 6105 assert(IC > 0 && "Interleave count must be greater than 0."); 6106 6107 // If we did not calculate the cost for VF (because the user selected the VF) 6108 // then we calculate the cost of VF here. 
6109 if (LoopCost == 0) { 6110 InstructionCost C = expectedCost(VF).first; 6111 assert(C.isValid() && "Expected to have chosen a VF with valid cost"); 6112 LoopCost = *C.getValue(); 6113 } 6114 6115 assert(LoopCost && "Non-zero loop cost expected"); 6116 6117 // Interleave if we vectorized this loop and there is a reduction that could 6118 // benefit from interleaving. 6119 if (VF.isVector() && HasReductions) { 6120 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); 6121 return IC; 6122 } 6123 6124 // For any scalar loop that either requires runtime checks or predication we 6125 // are better off leaving this to the unroller. Note that if we've already 6126 // vectorized the loop we will have done the runtime check and so interleaving 6127 // won't require further checks. 6128 bool ScalarInterleavingRequiresPredication = 6129 (VF.isScalar() && any_of(TheLoop->blocks(), [this](BasicBlock *BB) { 6130 return Legal->blockNeedsPredication(BB); 6131 })); 6132 bool ScalarInterleavingRequiresRuntimePointerCheck = 6133 (VF.isScalar() && Legal->getRuntimePointerChecking()->Need); 6134 6135 // We want to interleave small loops in order to reduce the loop overhead and 6136 // potentially expose ILP opportunities. 6137 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n' 6138 << "LV: IC is " << IC << '\n' 6139 << "LV: VF is " << VF << '\n'); 6140 const bool AggressivelyInterleaveReductions = 6141 TTI.enableAggressiveInterleaving(HasReductions); 6142 if (!ScalarInterleavingRequiresRuntimePointerCheck && 6143 !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) { 6144 // We assume that the cost overhead is 1 and we use the cost model 6145 // to estimate the cost of the loop and interleave until the cost of the 6146 // loop overhead is about 5% of the cost of the loop. 6147 unsigned SmallIC = 6148 std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost)); 6149 6150 // Interleave until store/load ports (estimated by max interleave count) are 6151 // saturated. 6152 unsigned NumStores = Legal->getNumStores(); 6153 unsigned NumLoads = Legal->getNumLoads(); 6154 unsigned StoresIC = IC / (NumStores ? NumStores : 1); 6155 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1); 6156 6157 // There is little point in interleaving for reductions containing selects 6158 // and compares when VF=1 since it may just create more overhead than it's 6159 // worth for loops with small trip counts. This is because we still have to 6160 // do the final reduction after the loop. 6161 bool HasSelectCmpReductions = 6162 HasReductions && 6163 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 6164 const RecurrenceDescriptor &RdxDesc = Reduction.second; 6165 return RecurrenceDescriptor::isSelectCmpRecurrenceKind( 6166 RdxDesc.getRecurrenceKind()); 6167 }); 6168 if (HasSelectCmpReductions) { 6169 LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n"); 6170 return 1; 6171 } 6172 6173 // If we have a scalar reduction (vector reductions are already dealt with 6174 // by this point), we can increase the critical path length if the loop 6175 // we're interleaving is inside another loop. For tree-wise reductions 6176 // set the limit to 2, and for ordered reductions it's best to disable 6177 // interleaving entirely. 
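  // Sketch of the clamping below (values hypothetical): in a nested scalar
  // reduction with MaxNestedScalarReductionIC at 2, a SmallIC of 4 is clamped
  // to 2, while an ordered reduction bails out with an interleave count of 1.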
    if (HasReductions && TheLoop->getLoopDepth() > 1) {
      bool HasOrderedReductions =
          any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
            const RecurrenceDescriptor &RdxDesc = Reduction.second;
            return RdxDesc.isOrdered();
          });
      if (HasOrderedReductions) {
        LLVM_DEBUG(
            dbgs() << "LV: Not interleaving scalar ordered reductions.\n");
        return 1;
      }

      unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
      SmallIC = std::min(SmallIC, F);
      StoresIC = std::min(StoresIC, F);
      LoadsIC = std::min(LoadsIC, F);
    }

    if (EnableLoadStoreRuntimeInterleave &&
        std::max(StoresIC, LoadsIC) > SmallIC) {
      LLVM_DEBUG(
          dbgs() << "LV: Interleaving to saturate store or load ports.\n");
      return std::max(StoresIC, LoadsIC);
    }

    // If there are scalar reductions and TTI has enabled aggressive
    // interleaving for reductions, we will interleave to expose ILP.
    if (InterleaveSmallLoopScalarReduction && VF.isScalar() &&
        AggressivelyInterleaveReductions) {
      LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
      // Interleave no less than SmallIC but not as aggressive as the normal IC
      // to satisfy the rare situation when resources are too limited.
      return std::max(IC / 2, SmallIC);
    } else {
      LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
      return SmallIC;
    }
  }

  // Interleave if this is a large loop (small loops are already dealt with by
  // this point) that could benefit from interleaving.
  if (AggressivelyInterleaveReductions) {
    LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
    return IC;
  }

  LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
  return 1;
}

SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
  // This function calculates the register usage by measuring the highest
  // number of values that are alive at a single location. Obviously, this is
  // a very rough estimation. We scan the loop in topological order and assign
  // a number to each instruction. We use RPO to ensure that defs are met
  // before their users. We assume that each instruction that has in-loop
  // users starts an interval. We record every time that an in-loop value is
  // used, so we have a list of the first and last occurrences of each
  // instruction. Next, we transpose this data structure into a multi-map that
  // holds the list of intervals that *end* at a specific location. This
  // multi-map allows us to perform a linear search. We scan the instructions
  // linearly and record each time that a new interval starts, by placing it
  // in a set. If we find this value in the multi-map then we remove it from
  // the set. The max register usage is the maximum size of the set.
  // We also search for instructions that are defined outside the loop, but
  // are used inside the loop. We need this number separately from the
  // max-interval usage number because, when we unroll, loop-invariant values
  // do not take more registers.
  LoopBlocksDFS DFS(TheLoop);
  DFS.perform(LI);

  RegisterUsage RU;

  // Each 'key' in the map opens a new interval. The values
  // of the map are the index of the 'last seen' usage of the
  // instruction that is the key.
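  // Rough trace of the interval scan for a hypothetical straight-line body
  // (indices 0:%a = load, 1:%b = load, 2:%c = add %a, %b, 3:store %c): both
  // loads are still open when the add is visited, so the peak for that
  // register class is two simultaneously live values, while the pointers,
  // being defined outside the loop, are only counted in LoopInvariantRegs.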
6255 using IntervalMap = DenseMap<Instruction *, unsigned>; 6256 6257 // Maps instruction to its index. 6258 SmallVector<Instruction *, 64> IdxToInstr; 6259 // Marks the end of each interval. 6260 IntervalMap EndPoint; 6261 // Saves the list of instruction indices that are used in the loop. 6262 SmallPtrSet<Instruction *, 8> Ends; 6263 // Saves the list of values that are used in the loop but are 6264 // defined outside the loop, such as arguments and constants. 6265 SmallPtrSet<Value *, 8> LoopInvariants; 6266 6267 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 6268 for (Instruction &I : BB->instructionsWithoutDebug()) { 6269 IdxToInstr.push_back(&I); 6270 6271 // Save the end location of each USE. 6272 for (Value *U : I.operands()) { 6273 auto *Instr = dyn_cast<Instruction>(U); 6274 6275 // Ignore non-instruction values such as arguments, constants, etc. 6276 if (!Instr) 6277 continue; 6278 6279 // If this instruction is outside the loop then record it and continue. 6280 if (!TheLoop->contains(Instr)) { 6281 LoopInvariants.insert(Instr); 6282 continue; 6283 } 6284 6285 // Overwrite previous end points. 6286 EndPoint[Instr] = IdxToInstr.size(); 6287 Ends.insert(Instr); 6288 } 6289 } 6290 } 6291 6292 // Saves the list of intervals that end with the index in 'key'. 6293 using InstrList = SmallVector<Instruction *, 2>; 6294 DenseMap<unsigned, InstrList> TransposeEnds; 6295 6296 // Transpose the EndPoints to a list of values that end at each index. 6297 for (auto &Interval : EndPoint) 6298 TransposeEnds[Interval.second].push_back(Interval.first); 6299 6300 SmallPtrSet<Instruction *, 8> OpenIntervals; 6301 SmallVector<RegisterUsage, 8> RUs(VFs.size()); 6302 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size()); 6303 6304 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); 6305 6306 // A lambda that gets the register usage for the given type and VF. 6307 const auto &TTICapture = TTI; 6308 auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned { 6309 if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty)) 6310 return 0; 6311 InstructionCost::CostType RegUsage = 6312 *TTICapture.getRegUsageForType(VectorType::get(Ty, VF)).getValue(); 6313 assert(RegUsage >= 0 && RegUsage <= std::numeric_limits<unsigned>::max() && 6314 "Nonsensical values for register usage."); 6315 return RegUsage; 6316 }; 6317 6318 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { 6319 Instruction *I = IdxToInstr[i]; 6320 6321 // Remove all of the instructions that end at this location. 6322 InstrList &List = TransposeEnds[i]; 6323 for (Instruction *ToRemove : List) 6324 OpenIntervals.erase(ToRemove); 6325 6326 // Ignore instructions that are never used within the loop. 6327 if (!Ends.count(I)) 6328 continue; 6329 6330 // Skip ignored values. 6331 if (ValuesToIgnore.count(I)) 6332 continue; 6333 6334 // For each VF find the maximum usage of registers. 6335 for (unsigned j = 0, e = VFs.size(); j < e; ++j) { 6336 // Count the number of live intervals. 6337 SmallMapVector<unsigned, unsigned, 4> RegUsage; 6338 6339 if (VFs[j].isScalar()) { 6340 for (auto Inst : OpenIntervals) { 6341 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6342 if (RegUsage.find(ClassID) == RegUsage.end()) 6343 RegUsage[ClassID] = 1; 6344 else 6345 RegUsage[ClassID] += 1; 6346 } 6347 } else { 6348 collectUniformsAndScalars(VFs[j]); 6349 for (auto Inst : OpenIntervals) { 6350 // Skip ignored values for VF > 1. 
6351 if (VecValuesToIgnore.count(Inst)) 6352 continue; 6353 if (isScalarAfterVectorization(Inst, VFs[j])) { 6354 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6355 if (RegUsage.find(ClassID) == RegUsage.end()) 6356 RegUsage[ClassID] = 1; 6357 else 6358 RegUsage[ClassID] += 1; 6359 } else { 6360 unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType()); 6361 if (RegUsage.find(ClassID) == RegUsage.end()) 6362 RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]); 6363 else 6364 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); 6365 } 6366 } 6367 } 6368 6369 for (auto& pair : RegUsage) { 6370 if (MaxUsages[j].find(pair.first) != MaxUsages[j].end()) 6371 MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second); 6372 else 6373 MaxUsages[j][pair.first] = pair.second; 6374 } 6375 } 6376 6377 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " 6378 << OpenIntervals.size() << '\n'); 6379 6380 // Add the current instruction to the list of open intervals. 6381 OpenIntervals.insert(I); 6382 } 6383 6384 for (unsigned i = 0, e = VFs.size(); i < e; ++i) { 6385 SmallMapVector<unsigned, unsigned, 4> Invariant; 6386 6387 for (auto Inst : LoopInvariants) { 6388 unsigned Usage = 6389 VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]); 6390 unsigned ClassID = 6391 TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType()); 6392 if (Invariant.find(ClassID) == Invariant.end()) 6393 Invariant[ClassID] = Usage; 6394 else 6395 Invariant[ClassID] += Usage; 6396 } 6397 6398 LLVM_DEBUG({ 6399 dbgs() << "LV(REG): VF = " << VFs[i] << '\n'; 6400 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size() 6401 << " item\n"; 6402 for (const auto &pair : MaxUsages[i]) { 6403 dbgs() << "LV(REG): RegisterClass: " 6404 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6405 << " registers\n"; 6406 } 6407 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size() 6408 << " item\n"; 6409 for (const auto &pair : Invariant) { 6410 dbgs() << "LV(REG): RegisterClass: " 6411 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6412 << " registers\n"; 6413 } 6414 }); 6415 6416 RU.LoopInvariantRegs = Invariant; 6417 RU.MaxLocalUsers = MaxUsages[i]; 6418 RUs[i] = RU; 6419 } 6420 6421 return RUs; 6422 } 6423 6424 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) { 6425 // If we aren't vectorizing the loop, or if we've already collected the 6426 // instructions to scalarize, there's nothing to do. Collection may already 6427 // have occurred if we have a user-selected VF and are now computing the 6428 // expected cost for interleaving. 6429 if (VF.isScalar() || VF.isZero() || 6430 InstsToScalarize.find(VF) != InstsToScalarize.end()) 6431 return; 6432 6433 // Initialize a mapping for VF in InstsToScalalarize. If we find that it's 6434 // not profitable to scalarize any instructions, the presence of VF in the 6435 // map will indicate that we've analyzed it already. 6436 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF]; 6437 6438 // Find all the instructions that are scalar with predication in the loop and 6439 // determine if it would be better to not if-convert the blocks they are in. 6440 // If so, we also record the instructions to scalarize. 
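  // Hedged illustration of the trade-off evaluated below (costs made up): if
  // a predicated sdiv and its single-use feeding chain cost 12 as vector code
  // but only 8 once scalarized and scaled by the block's execution
  // probability, computePredInstDiscount returns a non-negative discount and
  // the whole chain is recorded in ScalarCostsVF.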
6441 for (BasicBlock *BB : TheLoop->blocks()) { 6442 if (!blockNeedsPredicationForAnyReason(BB)) 6443 continue; 6444 for (Instruction &I : *BB) 6445 if (isScalarWithPredication(&I, VF)) { 6446 ScalarCostsTy ScalarCosts; 6447 // Do not apply discount if scalable, because that would lead to 6448 // invalid scalarization costs. 6449 if (!VF.isScalable() && 6450 computePredInstDiscount(&I, ScalarCosts, VF) >= 0) 6451 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end()); 6452 // Remember that BB will remain after vectorization. 6453 PredicatedBBsAfterVectorization.insert(BB); 6454 } 6455 } 6456 } 6457 6458 int LoopVectorizationCostModel::computePredInstDiscount( 6459 Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) { 6460 assert(!isUniformAfterVectorization(PredInst, VF) && 6461 "Instruction marked uniform-after-vectorization will be predicated"); 6462 6463 // Initialize the discount to zero, meaning that the scalar version and the 6464 // vector version cost the same. 6465 InstructionCost Discount = 0; 6466 6467 // Holds instructions to analyze. The instructions we visit are mapped in 6468 // ScalarCosts. Those instructions are the ones that would be scalarized if 6469 // we find that the scalar version costs less. 6470 SmallVector<Instruction *, 8> Worklist; 6471 6472 // Returns true if the given instruction can be scalarized. 6473 auto canBeScalarized = [&](Instruction *I) -> bool { 6474 // We only attempt to scalarize instructions forming a single-use chain 6475 // from the original predicated block that would otherwise be vectorized. 6476 // Although not strictly necessary, we give up on instructions we know will 6477 // already be scalar to avoid traversing chains that are unlikely to be 6478 // beneficial. 6479 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() || 6480 isScalarAfterVectorization(I, VF)) 6481 return false; 6482 6483 // If the instruction is scalar with predication, it will be analyzed 6484 // separately. We ignore it within the context of PredInst. 6485 if (isScalarWithPredication(I, VF)) 6486 return false; 6487 6488 // If any of the instruction's operands are uniform after vectorization, 6489 // the instruction cannot be scalarized. This prevents, for example, a 6490 // masked load from being scalarized. 6491 // 6492 // We assume we will only emit a value for lane zero of an instruction 6493 // marked uniform after vectorization, rather than VF identical values. 6494 // Thus, if we scalarize an instruction that uses a uniform, we would 6495 // create uses of values corresponding to the lanes we aren't emitting code 6496 // for. This behavior can be changed by allowing getScalarValue to clone 6497 // the lane zero values for uniforms rather than asserting. 6498 for (Use &U : I->operands()) 6499 if (auto *J = dyn_cast<Instruction>(U.get())) 6500 if (isUniformAfterVectorization(J, VF)) 6501 return false; 6502 6503 // Otherwise, we can scalarize the instruction. 6504 return true; 6505 }; 6506 6507 // Compute the expected cost discount from scalarizing the entire expression 6508 // feeding the predicated instruction. We currently only consider expressions 6509 // that are single-use instruction chains. 6510 Worklist.push_back(PredInst); 6511 while (!Worklist.empty()) { 6512 Instruction *I = Worklist.pop_back_val(); 6513 6514 // If we've already analyzed the instruction, there's nothing to do. 6515 if (ScalarCosts.find(I) != ScalarCosts.end()) 6516 continue; 6517 6518 // Compute the cost of the vector instruction. 
Note that this cost already 6519 // includes the scalarization overhead of the predicated instruction. 6520 InstructionCost VectorCost = getInstructionCost(I, VF).first; 6521 6522 // Compute the cost of the scalarized instruction. This cost is the cost of 6523 // the instruction as if it wasn't if-converted and instead remained in the 6524 // predicated block. We will scale this cost by block probability after 6525 // computing the scalarization overhead. 6526 InstructionCost ScalarCost = 6527 VF.getFixedValue() * 6528 getInstructionCost(I, ElementCount::getFixed(1)).first; 6529 6530 // Compute the scalarization overhead of needed insertelement instructions 6531 // and phi nodes. 6532 if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) { 6533 ScalarCost += TTI.getScalarizationOverhead( 6534 cast<VectorType>(ToVectorTy(I->getType(), VF)), 6535 APInt::getAllOnes(VF.getFixedValue()), true, false); 6536 ScalarCost += 6537 VF.getFixedValue() * 6538 TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput); 6539 } 6540 6541 // Compute the scalarization overhead of needed extractelement 6542 // instructions. For each of the instruction's operands, if the operand can 6543 // be scalarized, add it to the worklist; otherwise, account for the 6544 // overhead. 6545 for (Use &U : I->operands()) 6546 if (auto *J = dyn_cast<Instruction>(U.get())) { 6547 assert(VectorType::isValidElementType(J->getType()) && 6548 "Instruction has non-scalar type"); 6549 if (canBeScalarized(J)) 6550 Worklist.push_back(J); 6551 else if (needsExtract(J, VF)) { 6552 ScalarCost += TTI.getScalarizationOverhead( 6553 cast<VectorType>(ToVectorTy(J->getType(), VF)), 6554 APInt::getAllOnes(VF.getFixedValue()), false, true); 6555 } 6556 } 6557 6558 // Scale the total scalar cost by block probability. 6559 ScalarCost /= getReciprocalPredBlockProb(); 6560 6561 // Compute the discount. A non-negative discount means the vector version 6562 // of the instruction costs more, and scalarizing would be beneficial. 6563 Discount += VectorCost - ScalarCost; 6564 ScalarCosts[I] = ScalarCost; 6565 } 6566 6567 return *Discount.getValue(); 6568 } 6569 6570 LoopVectorizationCostModel::VectorizationCostTy 6571 LoopVectorizationCostModel::expectedCost( 6572 ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) { 6573 VectorizationCostTy Cost; 6574 6575 // For each block. 6576 for (BasicBlock *BB : TheLoop->blocks()) { 6577 VectorizationCostTy BlockCost; 6578 6579 // For each instruction in the old loop. 6580 for (Instruction &I : BB->instructionsWithoutDebug()) { 6581 // Skip ignored values. 6582 if (ValuesToIgnore.count(&I) || 6583 (VF.isVector() && VecValuesToIgnore.count(&I))) 6584 continue; 6585 6586 VectorizationCostTy C = getInstructionCost(&I, VF); 6587 6588 // Check if we should override the cost. 6589 if (C.first.isValid() && 6590 ForceTargetInstructionCost.getNumOccurrences() > 0) 6591 C.first = InstructionCost(ForceTargetInstructionCost); 6592 6593 // Keep a list of instructions with invalid costs. 6594 if (Invalid && !C.first.isValid()) 6595 Invalid->emplace_back(&I, VF); 6596 6597 BlockCost.first += C.first; 6598 BlockCost.second |= C.second; 6599 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first 6600 << " for VF " << VF << " For instruction: " << I 6601 << '\n'); 6602 } 6603 6604 // If we are vectorizing a predicated block, it will have been 6605 // if-converted. 
This means that the block's instructions (aside from 6606 // stores and instructions that may divide by zero) will now be 6607 // unconditionally executed. For the scalar case, we may not always execute 6608 // the predicated block, if it is an if-else block. Thus, scale the block's 6609 // cost by the probability of executing it. blockNeedsPredication from 6610 // Legal is used so as to not include all blocks in tail folded loops. 6611 if (VF.isScalar() && Legal->blockNeedsPredication(BB)) 6612 BlockCost.first /= getReciprocalPredBlockProb(); 6613 6614 Cost.first += BlockCost.first; 6615 Cost.second |= BlockCost.second; 6616 } 6617 6618 return Cost; 6619 } 6620 6621 /// Gets Address Access SCEV after verifying that the access pattern 6622 /// is loop invariant except the induction variable dependence. 6623 /// 6624 /// This SCEV can be sent to the Target in order to estimate the address 6625 /// calculation cost. 6626 static const SCEV *getAddressAccessSCEV( 6627 Value *Ptr, 6628 LoopVectorizationLegality *Legal, 6629 PredicatedScalarEvolution &PSE, 6630 const Loop *TheLoop) { 6631 6632 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); 6633 if (!Gep) 6634 return nullptr; 6635 6636 // We are looking for a gep with all loop invariant indices except for one 6637 // which should be an induction variable. 6638 auto SE = PSE.getSE(); 6639 unsigned NumOperands = Gep->getNumOperands(); 6640 for (unsigned i = 1; i < NumOperands; ++i) { 6641 Value *Opd = Gep->getOperand(i); 6642 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 6643 !Legal->isInductionVariable(Opd)) 6644 return nullptr; 6645 } 6646 6647 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. 6648 return PSE.getSCEV(Ptr); 6649 } 6650 6651 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { 6652 return Legal->hasStride(I->getOperand(0)) || 6653 Legal->hasStride(I->getOperand(1)); 6654 } 6655 6656 InstructionCost 6657 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, 6658 ElementCount VF) { 6659 assert(VF.isVector() && 6660 "Scalarization cost of instruction implies vectorization."); 6661 if (VF.isScalable()) 6662 return InstructionCost::getInvalid(); 6663 6664 Type *ValTy = getLoadStoreType(I); 6665 auto SE = PSE.getSE(); 6666 6667 unsigned AS = getLoadStoreAddressSpace(I); 6668 Value *Ptr = getLoadStorePointerOperand(I); 6669 Type *PtrTy = ToVectorTy(Ptr->getType(), VF); 6670 // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost` 6671 // that it is being called from this specific place. 6672 6673 // Figure out whether the access is strided and get the stride value 6674 // if it's known in compile time 6675 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); 6676 6677 // Get the cost of the scalar memory instruction and address computation. 6678 InstructionCost Cost = 6679 VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); 6680 6681 // Don't pass *I here, since it is scalar but will actually be part of a 6682 // vectorized loop where the user of it is a vectorized instruction. 6683 const Align Alignment = getLoadStoreAlignment(I); 6684 Cost += VF.getKnownMinValue() * 6685 TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, 6686 AS, TTI::TCK_RecipThroughput); 6687 6688 // Get the overhead of the extractelement and insertelement instructions 6689 // we might create due to scalarization. 
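  // Example with made-up unit costs: for a predicated store at VF=4, the code
  // below starts from 4 scalar address computations plus 4 scalar stores,
  // adds the insert/extract overhead, scales the running total down by the
  // block's execution probability (via getReciprocalPredBlockProb), and
  // finally adds 4 i1 extracts and one conditional branch cost.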
6690 Cost += getScalarizationOverhead(I, VF); 6691 6692 // If we have a predicated load/store, it will need extra i1 extracts and 6693 // conditional branches, but may not be executed for each vector lane. Scale 6694 // the cost by the probability of executing the predicated block. 6695 if (isPredicatedInst(I, VF)) { 6696 Cost /= getReciprocalPredBlockProb(); 6697 6698 // Add the cost of an i1 extract and a branch 6699 auto *Vec_i1Ty = 6700 VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF); 6701 Cost += TTI.getScalarizationOverhead( 6702 Vec_i1Ty, APInt::getAllOnes(VF.getKnownMinValue()), 6703 /*Insert=*/false, /*Extract=*/true); 6704 Cost += TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput); 6705 } 6706 6707 return Cost; 6708 } 6709 6710 InstructionCost 6711 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, 6712 ElementCount VF) { 6713 Type *ValTy = getLoadStoreType(I); 6714 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6715 Value *Ptr = getLoadStorePointerOperand(I); 6716 unsigned AS = getLoadStoreAddressSpace(I); 6717 int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr); 6718 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6719 6720 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6721 "Stride should be 1 or -1 for consecutive memory access"); 6722 const Align Alignment = getLoadStoreAlignment(I); 6723 InstructionCost Cost = 0; 6724 if (Legal->isMaskRequired(I)) 6725 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6726 CostKind); 6727 else 6728 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6729 CostKind, I); 6730 6731 bool Reverse = ConsecutiveStride < 0; 6732 if (Reverse) 6733 Cost += 6734 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0); 6735 return Cost; 6736 } 6737 6738 InstructionCost 6739 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, 6740 ElementCount VF) { 6741 assert(Legal->isUniformMemOp(*I)); 6742 6743 Type *ValTy = getLoadStoreType(I); 6744 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6745 const Align Alignment = getLoadStoreAlignment(I); 6746 unsigned AS = getLoadStoreAddressSpace(I); 6747 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6748 if (isa<LoadInst>(I)) { 6749 return TTI.getAddressComputationCost(ValTy) + 6750 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS, 6751 CostKind) + 6752 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); 6753 } 6754 StoreInst *SI = cast<StoreInst>(I); 6755 6756 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); 6757 return TTI.getAddressComputationCost(ValTy) + 6758 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, 6759 CostKind) + 6760 (isLoopInvariantStoreValue 6761 ? 
0
             : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
                                      VF.getKnownMinValue() - 1));
}

InstructionCost
LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
                                                 ElementCount VF) {
  Type *ValTy = getLoadStoreType(I);
  auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
  const Align Alignment = getLoadStoreAlignment(I);
  const Value *Ptr = getLoadStorePointerOperand(I);

  return TTI.getAddressComputationCost(VectorTy) +
         TTI.getGatherScatterOpCost(
             I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
             TargetTransformInfo::TCK_RecipThroughput, I);
}

InstructionCost
LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
                                                   ElementCount VF) {
  // TODO: Once we have support for interleaving with scalable vectors
  // we can calculate the cost properly here.
  if (VF.isScalable())
    return InstructionCost::getInvalid();

  Type *ValTy = getLoadStoreType(I);
  auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
  unsigned AS = getLoadStoreAddressSpace(I);

  auto Group = getInterleavedAccessGroup(I);
  assert(Group && "Fail to get an interleaved access group.");

  unsigned InterleaveFactor = Group->getFactor();
  auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);

  // Holds the indices of existing members in the interleaved group.
  SmallVector<unsigned, 4> Indices;
  for (unsigned IF = 0; IF < InterleaveFactor; IF++)
    if (Group->getMember(IF))
      Indices.push_back(IF);

  // Calculate the cost of the whole interleaved group.
  bool UseMaskForGaps =
      (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
      (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()));
  InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
      I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
      AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps);

  if (Group->isReverse()) {
    // TODO: Add support for reversed masked interleaved access.
    assert(!Legal->isMaskRequired(I) &&
           "Reverse masked interleaved access not supported.");
    Cost +=
        Group->getNumMembers() *
        TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0);
  }
  return Cost;
}

Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost(
    Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) {
  using namespace llvm::PatternMatch;
  // Early exit for no in-loop reductions.
  if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty))
    return None;
  auto *VectorTy = cast<VectorType>(Ty);

  // We are looking for a pattern of, and finding the minimal acceptable cost:
  //   reduce(mul(ext(A), ext(B))) or
  //   reduce(mul(A, B)) or
  //   reduce(ext(A)) or
  //   reduce(A).
  // The basic idea is that we walk down the tree to do that, finding the root
  // reduction instruction in InLoopReductionImmediateChains. From there we
  // find the pattern of mul/ext and test the cost of the entire pattern vs
  // the cost of the components. If the reduction cost is lower, then we
  // return it for the reduction instruction and 0 for the other instructions
  // in the pattern. If it is not, we return an invalid cost specifying that
  // the original cost method should be used.
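  // For instance (costs illustrative only): for an in-loop
  //   reduce.add(mul(sext(A to i32), sext(B to i32)))
  // chain, if TTI reports an extended multiply-accumulate reduction that is
  // cheaper than the separate ext + ext + mul + reduce costs, the reduction
  // instruction is given that combined cost and the mul/ext instructions in
  // the pattern are given a cost of 0.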
6843 Instruction *RetI = I; 6844 if (match(RetI, m_ZExtOrSExt(m_Value()))) { 6845 if (!RetI->hasOneUser()) 6846 return None; 6847 RetI = RetI->user_back(); 6848 } 6849 if (match(RetI, m_Mul(m_Value(), m_Value())) && 6850 RetI->user_back()->getOpcode() == Instruction::Add) { 6851 if (!RetI->hasOneUser()) 6852 return None; 6853 RetI = RetI->user_back(); 6854 } 6855 6856 // Test if the found instruction is a reduction, and if not return an invalid 6857 // cost specifying the parent to use the original cost modelling. 6858 if (!InLoopReductionImmediateChains.count(RetI)) 6859 return None; 6860 6861 // Find the reduction this chain is a part of and calculate the basic cost of 6862 // the reduction on its own. 6863 Instruction *LastChain = InLoopReductionImmediateChains[RetI]; 6864 Instruction *ReductionPhi = LastChain; 6865 while (!isa<PHINode>(ReductionPhi)) 6866 ReductionPhi = InLoopReductionImmediateChains[ReductionPhi]; 6867 6868 const RecurrenceDescriptor &RdxDesc = 6869 Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second; 6870 6871 InstructionCost BaseCost = TTI.getArithmeticReductionCost( 6872 RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind); 6873 6874 // For a call to the llvm.fmuladd intrinsic we need to add the cost of a 6875 // normal fmul instruction to the cost of the fadd reduction. 6876 if (RdxDesc.getRecurrenceKind() == RecurKind::FMulAdd) 6877 BaseCost += 6878 TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind); 6879 6880 // If we're using ordered reductions then we can just return the base cost 6881 // here, since getArithmeticReductionCost calculates the full ordered 6882 // reduction cost when FP reassociation is not allowed. 6883 if (useOrderedReductions(RdxDesc)) 6884 return BaseCost; 6885 6886 // Get the operand that was not the reduction chain and match it to one of the 6887 // patterns, returning the better cost if it is found. 6888 Instruction *RedOp = RetI->getOperand(1) == LastChain 6889 ? dyn_cast<Instruction>(RetI->getOperand(0)) 6890 : dyn_cast<Instruction>(RetI->getOperand(1)); 6891 6892 VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy); 6893 6894 Instruction *Op0, *Op1; 6895 if (RedOp && 6896 match(RedOp, 6897 m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) && 6898 match(Op0, m_ZExtOrSExt(m_Value())) && 6899 Op0->getOpcode() == Op1->getOpcode() && 6900 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() && 6901 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) && 6902 (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) { 6903 6904 // Matched reduce(ext(mul(ext(A), ext(B))) 6905 // Note that the extend opcodes need to all match, or if A==B they will have 6906 // been converted to zext(mul(sext(A), sext(A))) as it is known positive, 6907 // which is equally fine. 
6908 bool IsUnsigned = isa<ZExtInst>(Op0); 6909 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy); 6910 auto *MulType = VectorType::get(Op0->getType(), VectorTy); 6911 6912 InstructionCost ExtCost = 6913 TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType, 6914 TTI::CastContextHint::None, CostKind, Op0); 6915 InstructionCost MulCost = 6916 TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind); 6917 InstructionCost Ext2Cost = 6918 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType, 6919 TTI::CastContextHint::None, CostKind, RedOp); 6920 6921 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 6922 /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 6923 CostKind); 6924 6925 if (RedCost.isValid() && 6926 RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost) 6927 return I == RetI ? RedCost : 0; 6928 } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) && 6929 !TheLoop->isLoopInvariant(RedOp)) { 6930 // Matched reduce(ext(A)) 6931 bool IsUnsigned = isa<ZExtInst>(RedOp); 6932 auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy); 6933 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 6934 /*IsMLA=*/false, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 6935 CostKind); 6936 6937 InstructionCost ExtCost = 6938 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType, 6939 TTI::CastContextHint::None, CostKind, RedOp); 6940 if (RedCost.isValid() && RedCost < BaseCost + ExtCost) 6941 return I == RetI ? RedCost : 0; 6942 } else if (RedOp && 6943 match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) { 6944 if (match(Op0, m_ZExtOrSExt(m_Value())) && 6945 Op0->getOpcode() == Op1->getOpcode() && 6946 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) { 6947 bool IsUnsigned = isa<ZExtInst>(Op0); 6948 Type *Op0Ty = Op0->getOperand(0)->getType(); 6949 Type *Op1Ty = Op1->getOperand(0)->getType(); 6950 Type *LargestOpTy = 6951 Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty 6952 : Op0Ty; 6953 auto *ExtType = VectorType::get(LargestOpTy, VectorTy); 6954 6955 // Matched reduce(mul(ext(A), ext(B))), where the two ext may be of 6956 // different sizes. We take the largest type as the ext to reduce, and add 6957 // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))). 6958 InstructionCost ExtCost0 = TTI.getCastInstrCost( 6959 Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy), 6960 TTI::CastContextHint::None, CostKind, Op0); 6961 InstructionCost ExtCost1 = TTI.getCastInstrCost( 6962 Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy), 6963 TTI::CastContextHint::None, CostKind, Op1); 6964 InstructionCost MulCost = 6965 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 6966 6967 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 6968 /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 6969 CostKind); 6970 InstructionCost ExtraExtCost = 0; 6971 if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) { 6972 Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1; 6973 ExtraExtCost = TTI.getCastInstrCost( 6974 ExtraExtOp->getOpcode(), ExtType, 6975 VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy), 6976 TTI::CastContextHint::None, CostKind, ExtraExtOp); 6977 } 6978 6979 if (RedCost.isValid() && 6980 (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost)) 6981 return I == RetI ? 
RedCost : 0; 6982 } else if (!match(I, m_ZExtOrSExt(m_Value()))) { 6983 // Matched reduce(mul()) 6984 InstructionCost MulCost = 6985 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 6986 6987 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 6988 /*IsMLA=*/true, true, RdxDesc.getRecurrenceType(), VectorTy, 6989 CostKind); 6990 6991 if (RedCost.isValid() && RedCost < MulCost + BaseCost) 6992 return I == RetI ? RedCost : 0; 6993 } 6994 } 6995 6996 return I == RetI ? Optional<InstructionCost>(BaseCost) : None; 6997 } 6998 6999 InstructionCost 7000 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, 7001 ElementCount VF) { 7002 // Calculate scalar cost only. Vectorization cost should be ready at this 7003 // moment. 7004 if (VF.isScalar()) { 7005 Type *ValTy = getLoadStoreType(I); 7006 const Align Alignment = getLoadStoreAlignment(I); 7007 unsigned AS = getLoadStoreAddressSpace(I); 7008 7009 return TTI.getAddressComputationCost(ValTy) + 7010 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, 7011 TTI::TCK_RecipThroughput, I); 7012 } 7013 return getWideningCost(I, VF); 7014 } 7015 7016 LoopVectorizationCostModel::VectorizationCostTy 7017 LoopVectorizationCostModel::getInstructionCost(Instruction *I, 7018 ElementCount VF) { 7019 // If we know that this instruction will remain uniform, check the cost of 7020 // the scalar version. 7021 if (isUniformAfterVectorization(I, VF)) 7022 VF = ElementCount::getFixed(1); 7023 7024 if (VF.isVector() && isProfitableToScalarize(I, VF)) 7025 return VectorizationCostTy(InstsToScalarize[VF][I], false); 7026 7027 // Forced scalars do not have any scalarization overhead. 7028 auto ForcedScalar = ForcedScalars.find(VF); 7029 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) { 7030 auto InstSet = ForcedScalar->second; 7031 if (InstSet.count(I)) 7032 return VectorizationCostTy( 7033 (getInstructionCost(I, ElementCount::getFixed(1)).first * 7034 VF.getKnownMinValue()), 7035 false); 7036 } 7037 7038 Type *VectorTy; 7039 InstructionCost C = getInstructionCost(I, VF, VectorTy); 7040 7041 bool TypeNotScalarized = false; 7042 if (VF.isVector() && VectorTy->isVectorTy()) { 7043 unsigned NumParts = TTI.getNumberOfParts(VectorTy); 7044 if (NumParts) 7045 TypeNotScalarized = NumParts < VF.getKnownMinValue(); 7046 else 7047 C = InstructionCost::getInvalid(); 7048 } 7049 return VectorizationCostTy(C, TypeNotScalarized); 7050 } 7051 7052 InstructionCost 7053 LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, 7054 ElementCount VF) const { 7055 7056 // There is no mechanism yet to create a scalable scalarization loop, 7057 // so this is currently Invalid. 7058 if (VF.isScalable()) 7059 return InstructionCost::getInvalid(); 7060 7061 if (VF.isScalar()) 7062 return 0; 7063 7064 InstructionCost Cost = 0; 7065 Type *RetTy = ToVectorTy(I->getType(), VF); 7066 if (!RetTy->isVoidTy() && 7067 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) 7068 Cost += TTI.getScalarizationOverhead( 7069 cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()), true, 7070 false); 7071 7072 // Some targets keep addresses scalar. 7073 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) 7074 return Cost; 7075 7076 // Some targets support efficient element stores. 7077 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore()) 7078 return Cost; 7079 7080 // Collect operands to consider. 7081 CallInst *CI = dyn_cast<CallInst>(I); 7082 Instruction::op_range Ops = CI ? 
CI->args() : I->operands(); 7083 7084 // Skip operands that do not require extraction/scalarization and do not incur 7085 // any overhead. 7086 SmallVector<Type *> Tys; 7087 for (auto *V : filterExtractingOperands(Ops, VF)) 7088 Tys.push_back(MaybeVectorizeType(V->getType(), VF)); 7089 return Cost + TTI.getOperandsScalarizationOverhead( 7090 filterExtractingOperands(Ops, VF), Tys); 7091 } 7092 7093 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { 7094 if (VF.isScalar()) 7095 return; 7096 NumPredStores = 0; 7097 for (BasicBlock *BB : TheLoop->blocks()) { 7098 // For each instruction in the old loop. 7099 for (Instruction &I : *BB) { 7100 Value *Ptr = getLoadStorePointerOperand(&I); 7101 if (!Ptr) 7102 continue; 7103 7104 // TODO: We should generate better code and update the cost model for 7105 // predicated uniform stores. Today they are treated as any other 7106 // predicated store (see added test cases in 7107 // invariant-store-vectorization.ll). 7108 if (isa<StoreInst>(&I) && isScalarWithPredication(&I, VF)) 7109 NumPredStores++; 7110 7111 if (Legal->isUniformMemOp(I)) { 7112 // TODO: Avoid replicating loads and stores instead of 7113 // relying on instcombine to remove them. 7114 // Load: Scalar load + broadcast 7115 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract 7116 InstructionCost Cost; 7117 if (isa<StoreInst>(&I) && VF.isScalable() && 7118 isLegalGatherOrScatter(&I, VF)) { 7119 Cost = getGatherScatterCost(&I, VF); 7120 setWideningDecision(&I, VF, CM_GatherScatter, Cost); 7121 } else { 7122 assert((isa<LoadInst>(&I) || !VF.isScalable()) && 7123 "Cannot yet scalarize uniform stores"); 7124 Cost = getUniformMemOpCost(&I, VF); 7125 setWideningDecision(&I, VF, CM_Scalarize, Cost); 7126 } 7127 continue; 7128 } 7129 7130 // We assume that widening is the best solution when possible. 7131 if (memoryInstructionCanBeWidened(&I, VF)) { 7132 InstructionCost Cost = getConsecutiveMemOpCost(&I, VF); 7133 int ConsecutiveStride = Legal->isConsecutivePtr( 7134 getLoadStoreType(&I), getLoadStorePointerOperand(&I)); 7135 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 7136 "Expected consecutive stride."); 7137 InstWidening Decision = 7138 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; 7139 setWideningDecision(&I, VF, Decision, Cost); 7140 continue; 7141 } 7142 7143 // Choose between Interleaving, Gather/Scatter or Scalarization. 7144 InstructionCost InterleaveCost = InstructionCost::getInvalid(); 7145 unsigned NumAccesses = 1; 7146 if (isAccessInterleaved(&I)) { 7147 auto Group = getInterleavedAccessGroup(&I); 7148 assert(Group && "Fail to get an interleaved access group."); 7149 7150 // Make one decision for the whole group. 7151 if (getWideningDecision(&I, VF) != CM_Unknown) 7152 continue; 7153 7154 NumAccesses = Group->getNumMembers(); 7155 if (interleavedAccessCanBeWidened(&I, VF)) 7156 InterleaveCost = getInterleaveGroupCost(&I, VF); 7157 } 7158 7159 InstructionCost GatherScatterCost = 7160 isLegalGatherOrScatter(&I, VF) 7161 ? getGatherScatterCost(&I, VF) * NumAccesses 7162 : InstructionCost::getInvalid(); 7163 7164 InstructionCost ScalarizationCost = 7165 getMemInstScalarizationCost(&I, VF) * NumAccesses; 7166 7167 // Choose better solution for the current VF, 7168 // write down this decision and use it during vectorization. 
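      // Illustrative sketch of the choice below (made-up costs): with
      // InterleaveCost = 12, GatherScatterCost = 20 and ScalarizationCost = 16
      // we pick CM_Interleave at cost 12; with InterleaveCost invalid and
      // GatherScatterCost = 20 we fall through to CM_Scalarize at cost 16.
      // Invalid InstructionCosts compare greater than any valid cost, so an
      // infeasible option is never chosen while a feasible one exists.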
      InstructionCost Cost;
      InstWidening Decision;
      if (InterleaveCost <= GatherScatterCost &&
          InterleaveCost < ScalarizationCost) {
        Decision = CM_Interleave;
        Cost = InterleaveCost;
      } else if (GatherScatterCost < ScalarizationCost) {
        Decision = CM_GatherScatter;
        Cost = GatherScatterCost;
      } else {
        Decision = CM_Scalarize;
        Cost = ScalarizationCost;
      }
      // If the instruction belongs to an interleave group, the whole group
      // receives the same decision. The whole group receives the cost, but
      // the cost will actually be assigned to one instruction.
      if (auto Group = getInterleavedAccessGroup(&I))
        setWideningDecision(Group, VF, Decision, Cost);
      else
        setWideningDecision(&I, VF, Decision, Cost);
    }
  }

  // Make sure that any load of address and any other address computation
  // remains scalar unless there is gather/scatter support. This avoids
  // inevitable extracts into address registers, and also has the benefit of
  // activating LSR more, since that pass can't optimize vectorized
  // addresses.
  if (TTI.prefersVectorizedAddressing())
    return;

  // Start with all scalar pointer uses.
  SmallPtrSet<Instruction *, 8> AddrDefs;
  for (BasicBlock *BB : TheLoop->blocks())
    for (Instruction &I : *BB) {
      Instruction *PtrDef =
          dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
      if (PtrDef && TheLoop->contains(PtrDef) &&
          getWideningDecision(&I, VF) != CM_GatherScatter)
        AddrDefs.insert(PtrDef);
    }

  // Add all instructions used to generate the addresses.
  SmallVector<Instruction *, 4> Worklist;
  append_range(Worklist, AddrDefs);
  while (!Worklist.empty()) {
    Instruction *I = Worklist.pop_back_val();
    for (auto &Op : I->operands())
      if (auto *InstOp = dyn_cast<Instruction>(Op))
        if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
            AddrDefs.insert(InstOp).second)
          Worklist.push_back(InstOp);
  }

  for (auto *I : AddrDefs) {
    if (isa<LoadInst>(I)) {
      // Setting the desired widening decision should ideally be handled by
      // cost functions, but since this involves the task of finding out
      // if the loaded register is involved in an address computation, it is
      // instead changed here when we know this is the case.
      InstWidening Decision = getWideningDecision(I, VF);
      if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
        // Scalarize a widened load of address.
        setWideningDecision(
            I, VF, CM_Scalarize,
            (VF.getKnownMinValue() *
             getMemoryInstructionCost(I, ElementCount::getFixed(1))));
      else if (auto Group = getInterleavedAccessGroup(I)) {
        // Scalarize an interleave group of address loads.
        for (unsigned I = 0; I < Group->getFactor(); ++I) {
          if (Instruction *Member = Group->getMember(I))
            setWideningDecision(
                Member, VF, CM_Scalarize,
                (VF.getKnownMinValue() *
                 getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
        }
      }
    } else
      // Make sure I gets scalarized and a cost estimate without
      // scalarization overhead.
7249 ForcedScalars[VF].insert(I); 7250 } 7251 } 7252 7253 InstructionCost 7254 LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, 7255 Type *&VectorTy) { 7256 Type *RetTy = I->getType(); 7257 if (canTruncateToMinimalBitwidth(I, VF)) 7258 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); 7259 auto SE = PSE.getSE(); 7260 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 7261 7262 auto hasSingleCopyAfterVectorization = [this](Instruction *I, 7263 ElementCount VF) -> bool { 7264 if (VF.isScalar()) 7265 return true; 7266 7267 auto Scalarized = InstsToScalarize.find(VF); 7268 assert(Scalarized != InstsToScalarize.end() && 7269 "VF not yet analyzed for scalarization profitability"); 7270 return !Scalarized->second.count(I) && 7271 llvm::all_of(I->users(), [&](User *U) { 7272 auto *UI = cast<Instruction>(U); 7273 return !Scalarized->second.count(UI); 7274 }); 7275 }; 7276 (void) hasSingleCopyAfterVectorization; 7277 7278 if (isScalarAfterVectorization(I, VF)) { 7279 // With the exception of GEPs and PHIs, after scalarization there should 7280 // only be one copy of the instruction generated in the loop. This is 7281 // because the VF is either 1, or any instructions that need scalarizing 7282 // have already been dealt with by the the time we get here. As a result, 7283 // it means we don't have to multiply the instruction cost by VF. 7284 assert(I->getOpcode() == Instruction::GetElementPtr || 7285 I->getOpcode() == Instruction::PHI || 7286 (I->getOpcode() == Instruction::BitCast && 7287 I->getType()->isPointerTy()) || 7288 hasSingleCopyAfterVectorization(I, VF)); 7289 VectorTy = RetTy; 7290 } else 7291 VectorTy = ToVectorTy(RetTy, VF); 7292 7293 // TODO: We need to estimate the cost of intrinsic calls. 7294 switch (I->getOpcode()) { 7295 case Instruction::GetElementPtr: 7296 // We mark this instruction as zero-cost because the cost of GEPs in 7297 // vectorized code depends on whether the corresponding memory instruction 7298 // is scalarized or not. Therefore, we handle GEPs with the memory 7299 // instruction cost. 7300 return 0; 7301 case Instruction::Br: { 7302 // In cases of scalarized and predicated instructions, there will be VF 7303 // predicated blocks in the vectorized loop. Each branch around these 7304 // blocks requires also an extract of its vector compare i1 element. 7305 bool ScalarPredicatedBB = false; 7306 BranchInst *BI = cast<BranchInst>(I); 7307 if (VF.isVector() && BI->isConditional() && 7308 (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) || 7309 PredicatedBBsAfterVectorization.count(BI->getSuccessor(1)))) 7310 ScalarPredicatedBB = true; 7311 7312 if (ScalarPredicatedBB) { 7313 // Not possible to scalarize scalable vector with predicated instructions. 7314 if (VF.isScalable()) 7315 return InstructionCost::getInvalid(); 7316 // Return cost for branches around scalarized and predicated blocks. 7317 auto *Vec_i1Ty = 7318 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); 7319 return ( 7320 TTI.getScalarizationOverhead( 7321 Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()), false, true) + 7322 (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue())); 7323 } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar()) 7324 // The back-edge branch will remain, as will all scalar branches. 7325 return TTI.getCFInstrCost(Instruction::Br, CostKind); 7326 else 7327 // This branch will be eliminated by if-conversion. 
7328 return 0; 7329 // Note: We currently assume zero cost for an unconditional branch inside 7330 // a predicated block since it will become a fall-through, although we 7331 // may decide in the future to call TTI for all branches. 7332 } 7333 case Instruction::PHI: { 7334 auto *Phi = cast<PHINode>(I); 7335 7336 // First-order recurrences are replaced by vector shuffles inside the loop. 7337 // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. 7338 if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi)) 7339 return TTI.getShuffleCost( 7340 TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy), 7341 None, VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1)); 7342 7343 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 7344 // converted into select instructions. We require N - 1 selects per phi 7345 // node, where N is the number of incoming values. 7346 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) 7347 return (Phi->getNumIncomingValues() - 1) * 7348 TTI.getCmpSelInstrCost( 7349 Instruction::Select, ToVectorTy(Phi->getType(), VF), 7350 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF), 7351 CmpInst::BAD_ICMP_PREDICATE, CostKind); 7352 7353 return TTI.getCFInstrCost(Instruction::PHI, CostKind); 7354 } 7355 case Instruction::UDiv: 7356 case Instruction::SDiv: 7357 case Instruction::URem: 7358 case Instruction::SRem: 7359 // If we have a predicated instruction, it may not be executed for each 7360 // vector lane. Get the scalarization cost and scale this amount by the 7361 // probability of executing the predicated block. If the instruction is not 7362 // predicated, we fall through to the next case. 7363 if (VF.isVector() && isScalarWithPredication(I, VF)) { 7364 InstructionCost Cost = 0; 7365 7366 // These instructions have a non-void type, so account for the phi nodes 7367 // that we will create. This cost is likely to be zero. The phi node 7368 // cost, if any, should be scaled by the block probability because it 7369 // models a copy at the end of each predicated block. 7370 Cost += VF.getKnownMinValue() * 7371 TTI.getCFInstrCost(Instruction::PHI, CostKind); 7372 7373 // The cost of the non-predicated instruction. 7374 Cost += VF.getKnownMinValue() * 7375 TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind); 7376 7377 // The cost of insertelement and extractelement instructions needed for 7378 // scalarization. 7379 Cost += getScalarizationOverhead(I, VF); 7380 7381 // Scale the cost by the probability of executing the predicated blocks. 7382 // This assumes the predicated block for each vector lane is equally 7383 // likely. 7384 return Cost / getReciprocalPredBlockProb(); 7385 } 7386 LLVM_FALLTHROUGH; 7387 case Instruction::Add: 7388 case Instruction::FAdd: 7389 case Instruction::Sub: 7390 case Instruction::FSub: 7391 case Instruction::Mul: 7392 case Instruction::FMul: 7393 case Instruction::FDiv: 7394 case Instruction::FRem: 7395 case Instruction::Shl: 7396 case Instruction::LShr: 7397 case Instruction::AShr: 7398 case Instruction::And: 7399 case Instruction::Or: 7400 case Instruction::Xor: { 7401 // Since we will replace the stride by 1 the multiplication should go away. 
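    // Illustrative example (hypothetical source): for an access such as
    //   A[i * Stride]
    // where the loop has been versioned on the assumption Stride == 1, the
    // 'mul' feeding the GEP is effectively replaced by 'i' itself, so it is
    // costed at 0 here.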
7402 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) 7403 return 0; 7404 7405 // Detect reduction patterns 7406 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7407 return *RedCost; 7408 7409 // Certain instructions can be cheaper to vectorize if they have a constant 7410 // second vector operand. One example of this are shifts on x86. 7411 Value *Op2 = I->getOperand(1); 7412 TargetTransformInfo::OperandValueProperties Op2VP; 7413 TargetTransformInfo::OperandValueKind Op2VK = 7414 TTI.getOperandInfo(Op2, Op2VP); 7415 if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) 7416 Op2VK = TargetTransformInfo::OK_UniformValue; 7417 7418 SmallVector<const Value *, 4> Operands(I->operand_values()); 7419 return TTI.getArithmeticInstrCost( 7420 I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, 7421 Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); 7422 } 7423 case Instruction::FNeg: { 7424 return TTI.getArithmeticInstrCost( 7425 I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, 7426 TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None, 7427 TargetTransformInfo::OP_None, I->getOperand(0), I); 7428 } 7429 case Instruction::Select: { 7430 SelectInst *SI = cast<SelectInst>(I); 7431 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 7432 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 7433 7434 const Value *Op0, *Op1; 7435 using namespace llvm::PatternMatch; 7436 if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) || 7437 match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) { 7438 // select x, y, false --> x & y 7439 // select x, true, y --> x | y 7440 TTI::OperandValueProperties Op1VP = TTI::OP_None; 7441 TTI::OperandValueProperties Op2VP = TTI::OP_None; 7442 TTI::OperandValueKind Op1VK = TTI::getOperandInfo(Op0, Op1VP); 7443 TTI::OperandValueKind Op2VK = TTI::getOperandInfo(Op1, Op2VP); 7444 assert(Op0->getType()->getScalarSizeInBits() == 1 && 7445 Op1->getType()->getScalarSizeInBits() == 1); 7446 7447 SmallVector<const Value *, 2> Operands{Op0, Op1}; 7448 return TTI.getArithmeticInstrCost( 7449 match(I, m_LogicalOr()) ? 
Instruction::Or : Instruction::And, VectorTy, 7450 CostKind, Op1VK, Op2VK, Op1VP, Op2VP, Operands, I); 7451 } 7452 7453 Type *CondTy = SI->getCondition()->getType(); 7454 if (!ScalarCond) 7455 CondTy = VectorType::get(CondTy, VF); 7456 7457 CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE; 7458 if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition())) 7459 Pred = Cmp->getPredicate(); 7460 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred, 7461 CostKind, I); 7462 } 7463 case Instruction::ICmp: 7464 case Instruction::FCmp: { 7465 Type *ValTy = I->getOperand(0)->getType(); 7466 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 7467 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 7468 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); 7469 VectorTy = ToVectorTy(ValTy, VF); 7470 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, 7471 cast<CmpInst>(I)->getPredicate(), CostKind, 7472 I); 7473 } 7474 case Instruction::Store: 7475 case Instruction::Load: { 7476 ElementCount Width = VF; 7477 if (Width.isVector()) { 7478 InstWidening Decision = getWideningDecision(I, Width); 7479 assert(Decision != CM_Unknown && 7480 "CM decision should be taken at this point"); 7481 if (Decision == CM_Scalarize) 7482 Width = ElementCount::getFixed(1); 7483 } 7484 VectorTy = ToVectorTy(getLoadStoreType(I), Width); 7485 return getMemoryInstructionCost(I, VF); 7486 } 7487 case Instruction::BitCast: 7488 if (I->getType()->isPointerTy()) 7489 return 0; 7490 LLVM_FALLTHROUGH; 7491 case Instruction::ZExt: 7492 case Instruction::SExt: 7493 case Instruction::FPToUI: 7494 case Instruction::FPToSI: 7495 case Instruction::FPExt: 7496 case Instruction::PtrToInt: 7497 case Instruction::IntToPtr: 7498 case Instruction::SIToFP: 7499 case Instruction::UIToFP: 7500 case Instruction::Trunc: 7501 case Instruction::FPTrunc: { 7502 // Computes the CastContextHint from a Load/Store instruction. 7503 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint { 7504 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 7505 "Expected a load or a store!"); 7506 7507 if (VF.isScalar() || !TheLoop->contains(I)) 7508 return TTI::CastContextHint::Normal; 7509 7510 switch (getWideningDecision(I, VF)) { 7511 case LoopVectorizationCostModel::CM_GatherScatter: 7512 return TTI::CastContextHint::GatherScatter; 7513 case LoopVectorizationCostModel::CM_Interleave: 7514 return TTI::CastContextHint::Interleave; 7515 case LoopVectorizationCostModel::CM_Scalarize: 7516 case LoopVectorizationCostModel::CM_Widen: 7517 return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked 7518 : TTI::CastContextHint::Normal; 7519 case LoopVectorizationCostModel::CM_Widen_Reverse: 7520 return TTI::CastContextHint::Reversed; 7521 case LoopVectorizationCostModel::CM_Unknown: 7522 llvm_unreachable("Instr did not go through cost modelling?"); 7523 } 7524 7525 llvm_unreachable("Unhandled case!"); 7526 }; 7527 7528 unsigned Opcode = I->getOpcode(); 7529 TTI::CastContextHint CCH = TTI::CastContextHint::None; 7530 // For Trunc, the context is the only user, which must be a StoreInst. 7531 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) { 7532 if (I->hasOneUse()) 7533 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin())) 7534 CCH = ComputeCCH(Store); 7535 } 7536 // For Z/Sext, the context is the operand, which must be a LoadInst. 
7537 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || 7538 Opcode == Instruction::FPExt) { 7539 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0))) 7540 CCH = ComputeCCH(Load); 7541 } 7542 7543 // We optimize the truncation of induction variables having constant 7544 // integer steps. The cost of these truncations is the same as the scalar 7545 // operation. 7546 if (isOptimizableIVTruncate(I, VF)) { 7547 auto *Trunc = cast<TruncInst>(I); 7548 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 7549 Trunc->getSrcTy(), CCH, CostKind, Trunc); 7550 } 7551 7552 // Detect reduction patterns 7553 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7554 return *RedCost; 7555 7556 Type *SrcScalarTy = I->getOperand(0)->getType(); 7557 Type *SrcVecTy = 7558 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; 7559 if (canTruncateToMinimalBitwidth(I, VF)) { 7560 // This cast is going to be shrunk. This may remove the cast or it might 7561 // turn it into slightly different cast. For example, if MinBW == 16, 7562 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". 7563 // 7564 // Calculate the modified src and dest types. 7565 Type *MinVecTy = VectorTy; 7566 if (Opcode == Instruction::Trunc) { 7567 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); 7568 VectorTy = 7569 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7570 } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) { 7571 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); 7572 VectorTy = 7573 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7574 } 7575 } 7576 7577 return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); 7578 } 7579 case Instruction::Call: { 7580 if (RecurrenceDescriptor::isFMulAddIntrinsic(I)) 7581 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7582 return *RedCost; 7583 bool NeedToScalarize; 7584 CallInst *CI = cast<CallInst>(I); 7585 InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize); 7586 if (getVectorIntrinsicIDForCall(CI, TLI)) { 7587 InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF); 7588 return std::min(CallCost, IntrinsicCost); 7589 } 7590 return CallCost; 7591 } 7592 case Instruction::ExtractValue: 7593 return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput); 7594 case Instruction::Alloca: 7595 // We cannot easily widen alloca to a scalable alloca, as 7596 // the result would need to be a vector of pointers. 7597 if (VF.isScalable()) 7598 return InstructionCost::getInvalid(); 7599 LLVM_FALLTHROUGH; 7600 default: 7601 // This opcode is unknown. Assume that it is the same as 'mul'. 7602 return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 7603 } // end of switch. 
7604 } 7605 7606 char LoopVectorize::ID = 0; 7607 7608 static const char lv_name[] = "Loop Vectorization"; 7609 7610 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 7611 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 7612 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) 7613 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 7614 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) 7615 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 7616 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 7617 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 7618 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 7619 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 7620 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) 7621 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) 7622 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 7623 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 7624 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy) 7625 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 7626 7627 namespace llvm { 7628 7629 Pass *createLoopVectorizePass() { return new LoopVectorize(); } 7630 7631 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced, 7632 bool VectorizeOnlyWhenForced) { 7633 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced); 7634 } 7635 7636 } // end namespace llvm 7637 7638 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { 7639 // Check if the pointer operand of a load or store instruction is 7640 // consecutive. 7641 if (auto *Ptr = getLoadStorePointerOperand(Inst)) 7642 return Legal->isConsecutivePtr(getLoadStoreType(Inst), Ptr); 7643 return false; 7644 } 7645 7646 void LoopVectorizationCostModel::collectValuesToIgnore() { 7647 // Ignore ephemeral values. 7648 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); 7649 7650 // Ignore type-promoting instructions we identified during reduction 7651 // detection. 7652 for (auto &Reduction : Legal->getReductionVars()) { 7653 const RecurrenceDescriptor &RedDes = Reduction.second; 7654 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); 7655 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7656 } 7657 // Ignore type-casting instructions we identified during induction 7658 // detection. 7659 for (auto &Induction : Legal->getInductionVars()) { 7660 const InductionDescriptor &IndDes = Induction.second; 7661 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 7662 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7663 } 7664 } 7665 7666 void LoopVectorizationCostModel::collectInLoopReductions() { 7667 for (auto &Reduction : Legal->getReductionVars()) { 7668 PHINode *Phi = Reduction.first; 7669 const RecurrenceDescriptor &RdxDesc = Reduction.second; 7670 7671 // We don't collect reductions that are type promoted (yet). 7672 if (RdxDesc.getRecurrenceType() != Phi->getType()) 7673 continue; 7674 7675 // If the target would prefer this reduction to happen "in-loop", then we 7676 // want to record it as such. 7677 unsigned Opcode = RdxDesc.getOpcode(); 7678 if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) && 7679 !TTI.preferInLoopReduction(Opcode, Phi->getType(), 7680 TargetTransformInfo::ReductionFlags())) 7681 continue; 7682 7683 // Check that we can correctly put the reductions into the loop, by 7684 // finding the chain of operations that leads from the phi to the loop 7685 // exit value. 
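    // Illustrative example (hypothetical IR): for a reduction such as
    //   %sum  = phi i32 [ 0, %preheader ], [ %add2, %latch ]
    //   %add1 = add i32 %sum, %x
    //   %add2 = add i32 %add1, %y
    // the chain is {%add1, %add2}, and the immediate-chain map built below
    // records %add1 -> %sum and %add2 -> %add1, which getReductionPatternCost
    // later walks backwards to reach the reduction phi.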
7686 SmallVector<Instruction *, 4> ReductionOperations = 7687 RdxDesc.getReductionOpChain(Phi, TheLoop); 7688 bool InLoop = !ReductionOperations.empty(); 7689 if (InLoop) { 7690 InLoopReductionChains[Phi] = ReductionOperations; 7691 // Add the elements to InLoopReductionImmediateChains for cost modelling. 7692 Instruction *LastChain = Phi; 7693 for (auto *I : ReductionOperations) { 7694 InLoopReductionImmediateChains[I] = LastChain; 7695 LastChain = I; 7696 } 7697 } 7698 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop") 7699 << " reduction for phi: " << *Phi << "\n"); 7700 } 7701 } 7702 7703 // TODO: we could return a pair of values that specify the max VF and 7704 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of 7705 // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment 7706 // doesn't have a cost model that can choose which plan to execute if 7707 // more than one is generated. 7708 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits, 7709 LoopVectorizationCostModel &CM) { 7710 unsigned WidestType; 7711 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes(); 7712 return WidestVectorRegBits / WidestType; 7713 } 7714 7715 VectorizationFactor 7716 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { 7717 assert(!UserVF.isScalable() && "scalable vectors not yet supported"); 7718 ElementCount VF = UserVF; 7719 // Outer loop handling: They may require CFG and instruction level 7720 // transformations before even evaluating whether vectorization is profitable. 7721 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 7722 // the vectorization pipeline. 7723 if (!OrigLoop->isInnermost()) { 7724 // If the user doesn't provide a vectorization factor, determine a 7725 // reasonable one. 7726 if (UserVF.isZero()) { 7727 VF = ElementCount::getFixed(determineVPlanVF( 7728 TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector) 7729 .getFixedSize(), 7730 CM)); 7731 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n"); 7732 7733 // Make sure we have a VF > 1 for stress testing. 7734 if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) { 7735 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: " 7736 << "overriding computed VF.\n"); 7737 VF = ElementCount::getFixed(4); 7738 } 7739 } 7740 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 7741 assert(isPowerOf2_32(VF.getKnownMinValue()) && 7742 "VF needs to be a power of two"); 7743 LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "") 7744 << "VF " << VF << " to build VPlans.\n"); 7745 buildVPlans(VF, VF); 7746 7747 // For VPlan build stress testing, we bail out after VPlan construction. 7748 if (VPlanBuildStressTest) 7749 return VectorizationFactor::Disabled(); 7750 7751 return {VF, 0 /*Cost*/}; 7752 } 7753 7754 LLVM_DEBUG( 7755 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the " 7756 "VPlan-native path.\n"); 7757 return VectorizationFactor::Disabled(); 7758 } 7759 7760 Optional<VectorizationFactor> 7761 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { 7762 assert(OrigLoop->isInnermost() && "Inner loop expected."); 7763 FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC); 7764 if (!MaxFactors) // Cases that should not to be vectorized nor interleaved. 7765 return None; 7766 7767 // Invalidate interleave groups if all blocks of loop will be predicated. 
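  // Illustrative example: when the tail is folded by masking, every block of
  // the loop is predicated, so an interleave group such as the pair
  //   A[2*i], A[2*i+1]
  // could only be vectorized with masked wide memory operations. If the
  // target reports no masked-interleaved support, the groups are dropped
  // below and the member accesses are costed individually instead.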
7768 if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) && 7769 !useMaskedInterleavedAccesses(*TTI)) { 7770 LLVM_DEBUG( 7771 dbgs() 7772 << "LV: Invalidate all interleaved groups due to fold-tail by masking " 7773 "which requires masked-interleaved support.\n"); 7774 if (CM.InterleaveInfo.invalidateGroups()) 7775 // Invalidating interleave groups also requires invalidating all decisions 7776 // based on them, which includes widening decisions and uniform and scalar 7777 // values. 7778 CM.invalidateCostModelingDecisions(); 7779 } 7780 7781 ElementCount MaxUserVF = 7782 UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF; 7783 bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF); 7784 if (!UserVF.isZero() && UserVFIsLegal) { 7785 assert(isPowerOf2_32(UserVF.getKnownMinValue()) && 7786 "VF needs to be a power of two"); 7787 // Collect the instructions (and their associated costs) that will be more 7788 // profitable to scalarize. 7789 if (CM.selectUserVectorizationFactor(UserVF)) { 7790 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); 7791 CM.collectInLoopReductions(); 7792 buildVPlansWithVPRecipes(UserVF, UserVF); 7793 LLVM_DEBUG(printPlans(dbgs())); 7794 return {{UserVF, 0}}; 7795 } else 7796 reportVectorizationInfo("UserVF ignored because of invalid costs.", 7797 "InvalidCost", ORE, OrigLoop); 7798 } 7799 7800 // Populate the set of Vectorization Factor Candidates. 7801 ElementCountSet VFCandidates; 7802 for (auto VF = ElementCount::getFixed(1); 7803 ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2) 7804 VFCandidates.insert(VF); 7805 for (auto VF = ElementCount::getScalable(1); 7806 ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2) 7807 VFCandidates.insert(VF); 7808 7809 for (const auto &VF : VFCandidates) { 7810 // Collect Uniform and Scalar instructions after vectorization with VF. 7811 CM.collectUniformsAndScalars(VF); 7812 7813 // Collect the instructions (and their associated costs) that will be more 7814 // profitable to scalarize. 7815 if (VF.isVector()) 7816 CM.collectInstsToScalarize(VF); 7817 } 7818 7819 CM.collectInLoopReductions(); 7820 buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF); 7821 buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF); 7822 7823 LLVM_DEBUG(printPlans(dbgs())); 7824 if (!MaxFactors.hasVector()) 7825 return VectorizationFactor::Disabled(); 7826 7827 // Select the optimal vectorization factor. 7828 auto SelectedVF = CM.selectVectorizationFactor(VFCandidates); 7829 7830 // Check if it is profitable to vectorize with runtime checks. 
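  // Illustrative example of a loop that needs such checks (hypothetical C):
  //   void f(int *a, int *b, int n) {
  //     for (int i = 0; i < n; ++i) a[i] += b[i];
  //   }
  // Without 'restrict', vectorization must guard against a[] and b[]
  // overlapping at runtime. The logic below gives up when the number of
  // required pointer checks exceeds the pragma threshold, or exceeds the
  // default threshold while reordering of memory operations was not
  // explicitly allowed.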
7831 unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks(); 7832 if (SelectedVF.Width.getKnownMinValue() > 1 && NumRuntimePointerChecks) { 7833 bool PragmaThresholdReached = 7834 NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold; 7835 bool ThresholdReached = 7836 NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold; 7837 if ((ThresholdReached && !Hints.allowReordering()) || 7838 PragmaThresholdReached) { 7839 ORE->emit([&]() { 7840 return OptimizationRemarkAnalysisAliasing( 7841 DEBUG_TYPE, "CantReorderMemOps", OrigLoop->getStartLoc(), 7842 OrigLoop->getHeader()) 7843 << "loop not vectorized: cannot prove it is safe to reorder " 7844 "memory operations"; 7845 }); 7846 LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n"); 7847 Hints.emitRemarkWithHints(); 7848 return VectorizationFactor::Disabled(); 7849 } 7850 } 7851 return SelectedVF; 7852 } 7853 7854 VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const { 7855 assert(count_if(VPlans, 7856 [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) == 7857 1 && 7858 "Best VF has not a single VPlan."); 7859 7860 for (const VPlanPtr &Plan : VPlans) { 7861 if (Plan->hasVF(VF)) 7862 return *Plan.get(); 7863 } 7864 llvm_unreachable("No plan found!"); 7865 } 7866 7867 static void AddRuntimeUnrollDisableMetaData(Loop *L) { 7868 SmallVector<Metadata *, 4> MDs; 7869 // Reserve first location for self reference to the LoopID metadata node. 7870 MDs.push_back(nullptr); 7871 bool IsUnrollMetadata = false; 7872 MDNode *LoopID = L->getLoopID(); 7873 if (LoopID) { 7874 // First find existing loop unrolling disable metadata. 7875 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { 7876 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); 7877 if (MD) { 7878 const auto *S = dyn_cast<MDString>(MD->getOperand(0)); 7879 IsUnrollMetadata = 7880 S && S->getString().startswith("llvm.loop.unroll.disable"); 7881 } 7882 MDs.push_back(LoopID->getOperand(i)); 7883 } 7884 } 7885 7886 if (!IsUnrollMetadata) { 7887 // Add runtime unroll disable metadata. 7888 LLVMContext &Context = L->getHeader()->getContext(); 7889 SmallVector<Metadata *, 1> DisableOperands; 7890 DisableOperands.push_back( 7891 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 7892 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 7893 MDs.push_back(DisableNode); 7894 MDNode *NewLoopID = MDNode::get(Context, MDs); 7895 // Set operand 0 to refer to the loop id itself. 7896 NewLoopID->replaceOperandWith(0, NewLoopID); 7897 L->setLoopID(NewLoopID); 7898 } 7899 } 7900 7901 void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF, 7902 VPlan &BestVPlan, 7903 InnerLoopVectorizer &ILV, 7904 DominatorTree *DT) { 7905 LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF << ", UF=" << BestUF 7906 << '\n'); 7907 7908 // Perform the actual loop transformation. 7909 7910 // 1. Create a new empty loop. Unlink the old loop and connect the new one. 7911 VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan}; 7912 Value *CanonicalIVStartValue; 7913 std::tie(State.CFG.PrevBB, CanonicalIVStartValue) = 7914 ILV.createVectorizedLoopSkeleton(); 7915 ILV.collectPoisonGeneratingRecipes(State); 7916 7917 ILV.printDebugTracesAtStart(); 7918 7919 //===------------------------------------------------===// 7920 // 7921 // Notice: any optimization or new instruction that go 7922 // into the code below should also be implemented in 7923 // the cost-model. 
  //
  //===------------------------------------------------===//

  // 2. Copy and widen instructions from the old loop into the new loop.
  BestVPlan.prepareToExecute(ILV.getOrCreateTripCount(nullptr),
                             ILV.getOrCreateVectorTripCount(nullptr),
                             CanonicalIVStartValue, State);
  BestVPlan.execute(&State);

  // Keep all loop hints from the original loop on the vector loop (we'll
  // replace the vectorizer-specific hints below).
  MDNode *OrigLoopID = OrigLoop->getLoopID();

  Optional<MDNode *> VectorizedLoopID =
      makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
                                      LLVMLoopVectorizeFollowupVectorized});

  Loop *L = LI->getLoopFor(State.CFG.PrevBB);
  if (VectorizedLoopID.hasValue())
    L->setLoopID(VectorizedLoopID.getValue());
  else {
    // Keep all loop hints from the original loop on the vector loop (we'll
    // replace the vectorizer-specific hints below).
    if (MDNode *LID = OrigLoop->getLoopID())
      L->setLoopID(LID);

    LoopVectorizeHints Hints(L, true, *ORE);
    Hints.setAlreadyVectorized();
  }
  // Disable runtime unrolling when vectorizing the epilogue loop.
  if (CanonicalIVStartValue)
    AddRuntimeUnrollDisableMetaData(L);

  // 3. Fix the vectorized code: take care of header phis, live-outs,
  // predication, updating analyses.
  ILV.fixVectorizedLoop(State);

  ILV.printDebugTracesAtEnd();
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void LoopVectorizationPlanner::printPlans(raw_ostream &O) {
  for (const auto &Plan : VPlans)
    if (PrintVPlansInDotFormat)
      Plan->printDOT(O);
    else
      Plan->print(O);
}
#endif

void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
    SmallPtrSetImpl<Instruction *> &DeadInstructions) {

  // We create new control-flow for the vectorized loop, so the original exit
  // conditions will be dead after vectorization if they are only used by the
  // terminator.
  SmallVector<BasicBlock*> ExitingBlocks;
  OrigLoop->getExitingBlocks(ExitingBlocks);
  for (auto *BB : ExitingBlocks) {
    auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0));
    if (!Cmp || !Cmp->hasOneUse())
      continue;

    // TODO: we should introduce a getUniqueExitingBlocks on Loop
    if (!DeadInstructions.insert(Cmp).second)
      continue;

    // The operand of the icmp is often a dead trunc, used by IndUpdate.
    // TODO: can recurse through operands in general
    for (Value *Op : Cmp->operands()) {
      if (isa<TruncInst>(Op) && Op->hasOneUse())
        DeadInstructions.insert(cast<Instruction>(Op));
    }
  }

  // We create new "steps" for induction variable updates to which the original
  // induction variables map. An original update instruction will be dead if
  // all its users except the induction variable are dead.
  auto *Latch = OrigLoop->getLoopLatch();
  for (auto &Induction : Legal->getInductionVars()) {
    PHINode *Ind = Induction.first;
    auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));

    // If the tail is to be folded by masking, the primary induction variable,
    // if it exists, isn't dead: it will be used for masking. Don't kill it.
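    // Illustrative example (hypothetical IR): for
    //   %iv      = phi i64 [ 0, %ph ], [ %iv.next, %latch ]
    //   %iv.next = add nuw nsw i64 %iv, 1
    //   %exit    = icmp eq i64 %iv.next, %n
    //   br i1 %exit, label %end, label %header
    // the icmp (and a single-use trunc feeding it, if any) was collected above
    // as dead, and %iv.next becomes dead too once its only remaining users are
    // the phi and the dead compare, unless the check below keeps it alive for
    // tail folding.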
8009 if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction()) 8010 continue; 8011 8012 if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 8013 return U == Ind || DeadInstructions.count(cast<Instruction>(U)); 8014 })) 8015 DeadInstructions.insert(IndUpdate); 8016 } 8017 } 8018 8019 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } 8020 8021 //===--------------------------------------------------------------------===// 8022 // EpilogueVectorizerMainLoop 8023 //===--------------------------------------------------------------------===// 8024 8025 /// This function is partially responsible for generating the control flow 8026 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 8027 std::pair<BasicBlock *, Value *> 8028 EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { 8029 MDNode *OrigLoopID = OrigLoop->getLoopID(); 8030 Loop *Lp = createVectorLoopSkeleton(""); 8031 8032 // Generate the code to check the minimum iteration count of the vector 8033 // epilogue (see below). 8034 EPI.EpilogueIterationCountCheck = 8035 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, true); 8036 EPI.EpilogueIterationCountCheck->setName("iter.check"); 8037 8038 // Generate the code to check any assumptions that we've made for SCEV 8039 // expressions. 8040 EPI.SCEVSafetyCheck = emitSCEVChecks(Lp, LoopScalarPreHeader); 8041 8042 // Generate the code that checks at runtime if arrays overlap. We put the 8043 // checks into a separate block to make the more common case of few elements 8044 // faster. 8045 EPI.MemSafetyCheck = emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 8046 8047 // Generate the iteration count check for the main loop, *after* the check 8048 // for the epilogue loop, so that the path-length is shorter for the case 8049 // that goes directly through the vector epilogue. The longer-path length for 8050 // the main loop is compensated for, by the gain from vectorizing the larger 8051 // trip count. Note: the branch will get updated later on when we vectorize 8052 // the epilogue. 8053 EPI.MainLoopIterationCountCheck = 8054 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false); 8055 8056 // Generate the induction variable. 8057 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 8058 EPI.VectorTripCount = CountRoundDown; 8059 createHeaderBranch(Lp); 8060 8061 // Skip induction resume value creation here because they will be created in 8062 // the second pass. If we created them here, they wouldn't be used anyway, 8063 // because the vplan in the second pass still contains the inductions from the 8064 // original loop. 
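  // Rough shape of the control flow created by this first pass (simplified
  // sketch; see the documentation link in the function comment above):
  //   iter.check:                  trip count too small even for EpilogueVF
  //                                -> skip all vector code (scalar loop)
  //   vector.main.loop.iter.check: trip count too small for MainVF * MainUF
  //                                -> bypass the main vector loop; this branch
  //                                   is re-wired in the second pass so the
  //                                   path reaches the vector epilogue
  // otherwise execution proceeds into the main vector loop.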
8065 8066 return {completeLoopSkeleton(Lp, OrigLoopID), nullptr}; 8067 } 8068 8069 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { 8070 LLVM_DEBUG({ 8071 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n" 8072 << "Main Loop VF:" << EPI.MainLoopVF 8073 << ", Main Loop UF:" << EPI.MainLoopUF 8074 << ", Epilogue Loop VF:" << EPI.EpilogueVF 8075 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 8076 }); 8077 } 8078 8079 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() { 8080 DEBUG_WITH_TYPE(VerboseDebug, { 8081 dbgs() << "intermediate fn:\n" 8082 << *OrigLoop->getHeader()->getParent() << "\n"; 8083 }); 8084 } 8085 8086 BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck( 8087 Loop *L, BasicBlock *Bypass, bool ForEpilogue) { 8088 assert(L && "Expected valid Loop."); 8089 assert(Bypass && "Expected valid bypass basic block."); 8090 ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF; 8091 unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF; 8092 Value *Count = getOrCreateTripCount(L); 8093 // Reuse existing vector loop preheader for TC checks. 8094 // Note that new preheader block is generated for vector loop. 8095 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 8096 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 8097 8098 // Generate code to check if the loop's trip count is less than VF * UF of the 8099 // main vector loop. 8100 auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF : VF) ? 8101 ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 8102 8103 Value *CheckMinIters = Builder.CreateICmp( 8104 P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor), 8105 "min.iters.check"); 8106 8107 if (!ForEpilogue) 8108 TCCheckBlock->setName("vector.main.loop.iter.check"); 8109 8110 // Create new preheader for vector loop. 8111 LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), 8112 DT, LI, nullptr, "vector.ph"); 8113 8114 if (ForEpilogue) { 8115 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 8116 DT->getNode(Bypass)->getIDom()) && 8117 "TC check is expected to dominate Bypass"); 8118 8119 // Update dominator for Bypass & LoopExit. 8120 DT->changeImmediateDominator(Bypass, TCCheckBlock); 8121 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF)) 8122 // For loops with multiple exits, there's no edge from the middle block 8123 // to exit blocks (as the epilogue must run) and thus no need to update 8124 // the immediate dominator of the exit blocks. 8125 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 8126 8127 LoopBypassBlocks.push_back(TCCheckBlock); 8128 8129 // Save the trip count so we don't have to regenerate it in the 8130 // vec.epilog.iter.check. This is safe to do because the trip count 8131 // generated here dominates the vector epilog iter check. 8132 EPI.TripCount = Count; 8133 } 8134 8135 ReplaceInstWithInst( 8136 TCCheckBlock->getTerminator(), 8137 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 8138 8139 return TCCheckBlock; 8140 } 8141 8142 //===--------------------------------------------------------------------===// 8143 // EpilogueVectorizerEpilogueLoop 8144 //===--------------------------------------------------------------------===// 8145 8146 /// This function is partially responsible for generating the control flow 8147 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 
8148 std::pair<BasicBlock *, Value *> 8149 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { 8150 MDNode *OrigLoopID = OrigLoop->getLoopID(); 8151 Loop *Lp = createVectorLoopSkeleton("vec.epilog."); 8152 8153 // Now, compare the remaining count and if there aren't enough iterations to 8154 // execute the vectorized epilogue skip to the scalar part. 8155 BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader; 8156 VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check"); 8157 LoopVectorPreHeader = 8158 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 8159 LI, nullptr, "vec.epilog.ph"); 8160 emitMinimumVectorEpilogueIterCountCheck(Lp, LoopScalarPreHeader, 8161 VecEpilogueIterationCountCheck); 8162 8163 // Adjust the control flow taking the state info from the main loop 8164 // vectorization into account. 8165 assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck && 8166 "expected this to be saved from the previous pass."); 8167 EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith( 8168 VecEpilogueIterationCountCheck, LoopVectorPreHeader); 8169 8170 DT->changeImmediateDominator(LoopVectorPreHeader, 8171 EPI.MainLoopIterationCountCheck); 8172 8173 EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith( 8174 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8175 8176 if (EPI.SCEVSafetyCheck) 8177 EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith( 8178 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8179 if (EPI.MemSafetyCheck) 8180 EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith( 8181 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8182 8183 DT->changeImmediateDominator( 8184 VecEpilogueIterationCountCheck, 8185 VecEpilogueIterationCountCheck->getSinglePredecessor()); 8186 8187 DT->changeImmediateDominator(LoopScalarPreHeader, 8188 EPI.EpilogueIterationCountCheck); 8189 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF)) 8190 // If there is an epilogue which must run, there's no edge from the 8191 // middle block to exit blocks and thus no need to update the immediate 8192 // dominator of the exit blocks. 8193 DT->changeImmediateDominator(LoopExitBlock, 8194 EPI.EpilogueIterationCountCheck); 8195 8196 // Keep track of bypass blocks, as they feed start values to the induction 8197 // phis in the scalar loop preheader. 8198 if (EPI.SCEVSafetyCheck) 8199 LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck); 8200 if (EPI.MemSafetyCheck) 8201 LoopBypassBlocks.push_back(EPI.MemSafetyCheck); 8202 LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck); 8203 8204 // The vec.epilog.iter.check block may contain Phi nodes from reductions which 8205 // merge control-flow from the latch block and the middle block. Update the 8206 // incoming values here and move the Phi into the preheader. 
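  // Illustrative example (hypothetical IR): a reduction resume phi such as
  //   %bc.merge.rdx = phi i32 [ 0, %iter.check ], [ %rdx, %middle.block ], ...
  // sitting in vec.epilog.iter.check is handled below by rewriting its
  // incoming block from the old single predecessor to vec.epilog.iter.check,
  // dropping the incoming values that came from the bypass blocks (the
  // epilogue iteration-count check and the SCEV/memory check blocks), and
  // moving the phi into vec.epilog.ph.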
  SmallVector<PHINode *, 4> PhisInBlock;
  for (PHINode &Phi : VecEpilogueIterationCountCheck->phis())
    PhisInBlock.push_back(&Phi);

  for (PHINode *Phi : PhisInBlock) {
    Phi->replaceIncomingBlockWith(
        VecEpilogueIterationCountCheck->getSinglePredecessor(),
        VecEpilogueIterationCountCheck);
    Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck);
    if (EPI.SCEVSafetyCheck)
      Phi->removeIncomingValue(EPI.SCEVSafetyCheck);
    if (EPI.MemSafetyCheck)
      Phi->removeIncomingValue(EPI.MemSafetyCheck);
    Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHI());
  }

  // Generate a resume induction for the vector epilogue and put it in the
  // vector epilogue preheader.
  Type *IdxTy = Legal->getWidestInductionType();
  PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val",
                                         LoopVectorPreHeader->getFirstNonPHI());
  EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
  EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
                           EPI.MainLoopIterationCountCheck);

  // Generate the induction variable.
  createHeaderBranch(Lp);

  // Generate induction resume values. These variables save the new starting
  // indexes for the scalar loop. They are used to test if there are any tail
  // iterations left once the vector loop has completed.
  // Note that when the vectorized epilogue is skipped due to the iteration
  // count check, the resume value for the induction variable comes from the
  // trip count of the main vector loop, hence passing the AdditionalBypass
  // argument.
  createInductionResumeValues(Lp, {VecEpilogueIterationCountCheck,
                                   EPI.VectorTripCount} /* AdditionalBypass */);

  return {completeLoopSkeleton(Lp, OrigLoopID), EPResumeVal};
}

BasicBlock *
EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
    Loop *L, BasicBlock *Bypass, BasicBlock *Insert) {

  assert(EPI.TripCount &&
         "Expected trip count to have been saved in the first pass.");
  assert(
      (!isa<Instruction>(EPI.TripCount) ||
       DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
      "saved trip count does not dominate insertion point.");
  Value *TC = EPI.TripCount;
  IRBuilder<> Builder(Insert->getTerminator());
  Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");

  // Generate code to check if the loop's trip count is less than VF * UF of
  // the vector epilogue loop.
  auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF) ?
8265 ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 8266 8267 Value *CheckMinIters = 8268 Builder.CreateICmp(P, Count, 8269 createStepForVF(Builder, Count->getType(), 8270 EPI.EpilogueVF, EPI.EpilogueUF), 8271 "min.epilog.iters.check"); 8272 8273 ReplaceInstWithInst( 8274 Insert->getTerminator(), 8275 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 8276 8277 LoopBypassBlocks.push_back(Insert); 8278 return Insert; 8279 } 8280 8281 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() { 8282 LLVM_DEBUG({ 8283 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n" 8284 << "Epilogue Loop VF:" << EPI.EpilogueVF 8285 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 8286 }); 8287 } 8288 8289 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() { 8290 DEBUG_WITH_TYPE(VerboseDebug, { 8291 dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n"; 8292 }); 8293 } 8294 8295 bool LoopVectorizationPlanner::getDecisionAndClampRange( 8296 const std::function<bool(ElementCount)> &Predicate, VFRange &Range) { 8297 assert(!Range.isEmpty() && "Trying to test an empty VF range."); 8298 bool PredicateAtRangeStart = Predicate(Range.Start); 8299 8300 for (ElementCount TmpVF = Range.Start * 2; 8301 ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2) 8302 if (Predicate(TmpVF) != PredicateAtRangeStart) { 8303 Range.End = TmpVF; 8304 break; 8305 } 8306 8307 return PredicateAtRangeStart; 8308 } 8309 8310 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, 8311 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range 8312 /// of VF's starting at a given VF and extending it as much as possible. Each 8313 /// vectorization decision can potentially shorten this sub-range during 8314 /// buildVPlan(). 8315 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF, 8316 ElementCount MaxVF) { 8317 auto MaxVFPlusOne = MaxVF.getWithIncrement(1); 8318 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { 8319 VFRange SubRange = {VF, MaxVFPlusOne}; 8320 VPlans.push_back(buildVPlan(SubRange)); 8321 VF = SubRange.End; 8322 } 8323 } 8324 8325 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, 8326 VPlanPtr &Plan) { 8327 assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); 8328 8329 // Look for cached value. 8330 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 8331 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); 8332 if (ECEntryIt != EdgeMaskCache.end()) 8333 return ECEntryIt->second; 8334 8335 VPValue *SrcMask = createBlockInMask(Src, Plan); 8336 8337 // The terminator has to be a branch inst! 8338 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 8339 assert(BI && "Unexpected terminator found"); 8340 8341 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) 8342 return EdgeMaskCache[Edge] = SrcMask; 8343 8344 // If source is an exiting block, we know the exit edge is dynamically dead 8345 // in the vector loop, and thus we don't need to restrict the mask. Avoid 8346 // adding uses of an otherwise potentially dead instruction. 
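  // Illustrative example (hypothetical blocks): for an in-loop conditional
  //   guard:  br i1 %c, label %if.then, label %if.else
  // the mask of the edge guard->if.then is materialized further below as
  //   select i1 <mask of guard>, i1 %c, i1 false
  // and guard->if.else uses the negated condition instead. When 'guard' is an
  // exiting block, the exit edge cannot be taken inside the vector loop, so
  // the surviving in-loop edge simply reuses guard's block mask, per the
  // early return below.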
8347 if (OrigLoop->isLoopExiting(Src)) 8348 return EdgeMaskCache[Edge] = SrcMask; 8349 8350 VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition()); 8351 assert(EdgeMask && "No Edge Mask found for condition"); 8352 8353 if (BI->getSuccessor(0) != Dst) 8354 EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc()); 8355 8356 if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND. 8357 // The condition is 'SrcMask && EdgeMask', which is equivalent to 8358 // 'select i1 SrcMask, i1 EdgeMask, i1 false'. 8359 // The select version does not introduce new UB if SrcMask is false and 8360 // EdgeMask is poison. Using 'and' here introduces undefined behavior. 8361 VPValue *False = Plan->getOrAddVPValue( 8362 ConstantInt::getFalse(BI->getCondition()->getType())); 8363 EdgeMask = 8364 Builder.createSelect(SrcMask, EdgeMask, False, BI->getDebugLoc()); 8365 } 8366 8367 return EdgeMaskCache[Edge] = EdgeMask; 8368 } 8369 8370 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { 8371 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 8372 8373 // Look for cached value. 8374 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); 8375 if (BCEntryIt != BlockMaskCache.end()) 8376 return BCEntryIt->second; 8377 8378 // All-one mask is modelled as no-mask following the convention for masked 8379 // load/store/gather/scatter. Initialize BlockMask to no-mask. 8380 VPValue *BlockMask = nullptr; 8381 8382 if (OrigLoop->getHeader() == BB) { 8383 if (!CM.blockNeedsPredicationForAnyReason(BB)) 8384 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. 8385 8386 // Introduce the early-exit compare IV <= BTC to form header block mask. 8387 // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by 8388 // constructing the desired canonical IV in the header block as its first 8389 // non-phi instructions. 8390 assert(CM.foldTailByMasking() && "must fold the tail"); 8391 VPBasicBlock *HeaderVPBB = Plan->getEntry()->getEntryBasicBlock(); 8392 auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi(); 8393 auto *IV = new VPWidenCanonicalIVRecipe(Plan->getCanonicalIV()); 8394 HeaderVPBB->insert(IV, HeaderVPBB->getFirstNonPhi()); 8395 8396 VPBuilder::InsertPointGuard Guard(Builder); 8397 Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint); 8398 if (CM.TTI.emitGetActiveLaneMask()) { 8399 VPValue *TC = Plan->getOrCreateTripCount(); 8400 BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC}); 8401 } else { 8402 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); 8403 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); 8404 } 8405 return BlockMaskCache[BB] = BlockMask; 8406 } 8407 8408 // This is the block mask. We OR all incoming edges. 8409 for (auto *Predecessor : predecessors(BB)) { 8410 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); 8411 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too. 8412 return BlockMaskCache[BB] = EdgeMask; 8413 8414 if (!BlockMask) { // BlockMask has its initialized nullptr value. 
8415 BlockMask = EdgeMask; 8416 continue; 8417 } 8418 8419 BlockMask = Builder.createOr(BlockMask, EdgeMask, {}); 8420 } 8421 8422 return BlockMaskCache[BB] = BlockMask; 8423 } 8424 8425 VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, 8426 ArrayRef<VPValue *> Operands, 8427 VFRange &Range, 8428 VPlanPtr &Plan) { 8429 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 8430 "Must be called with either a load or store"); 8431 8432 auto willWiden = [&](ElementCount VF) -> bool { 8433 if (VF.isScalar()) 8434 return false; 8435 LoopVectorizationCostModel::InstWidening Decision = 8436 CM.getWideningDecision(I, VF); 8437 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 8438 "CM decision should be taken at this point."); 8439 if (Decision == LoopVectorizationCostModel::CM_Interleave) 8440 return true; 8441 if (CM.isScalarAfterVectorization(I, VF) || 8442 CM.isProfitableToScalarize(I, VF)) 8443 return false; 8444 return Decision != LoopVectorizationCostModel::CM_Scalarize; 8445 }; 8446 8447 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8448 return nullptr; 8449 8450 VPValue *Mask = nullptr; 8451 if (Legal->isMaskRequired(I)) 8452 Mask = createBlockInMask(I->getParent(), Plan); 8453 8454 // Determine if the pointer operand of the access is either consecutive or 8455 // reverse consecutive. 8456 LoopVectorizationCostModel::InstWidening Decision = 8457 CM.getWideningDecision(I, Range.Start); 8458 bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse; 8459 bool Consecutive = 8460 Reverse || Decision == LoopVectorizationCostModel::CM_Widen; 8461 8462 if (LoadInst *Load = dyn_cast<LoadInst>(I)) 8463 return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask, 8464 Consecutive, Reverse); 8465 8466 StoreInst *Store = cast<StoreInst>(I); 8467 return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0], 8468 Mask, Consecutive, Reverse); 8469 } 8470 8471 static VPWidenIntOrFpInductionRecipe * 8472 createWidenInductionRecipe(PHINode *Phi, Instruction *PhiOrTrunc, 8473 VPValue *Start, const InductionDescriptor &IndDesc, 8474 LoopVectorizationCostModel &CM, Loop &OrigLoop, 8475 VFRange &Range) { 8476 // Returns true if an instruction \p I should be scalarized instead of 8477 // vectorized for the chosen vectorization factor. 8478 auto ShouldScalarizeInstruction = [&CM](Instruction *I, ElementCount VF) { 8479 return CM.isScalarAfterVectorization(I, VF) || 8480 CM.isProfitableToScalarize(I, VF); 8481 }; 8482 8483 bool NeedsScalarIV = LoopVectorizationPlanner::getDecisionAndClampRange( 8484 [&](ElementCount VF) { 8485 // Returns true if we should generate a scalar version of \p IV. 
8486 if (ShouldScalarizeInstruction(PhiOrTrunc, VF)) 8487 return true; 8488 auto isScalarInst = [&](User *U) -> bool { 8489 auto *I = cast<Instruction>(U); 8490 return OrigLoop.contains(I) && ShouldScalarizeInstruction(I, VF); 8491 }; 8492 return any_of(PhiOrTrunc->users(), isScalarInst); 8493 }, 8494 Range); 8495 bool NeedsScalarIVOnly = LoopVectorizationPlanner::getDecisionAndClampRange( 8496 [&](ElementCount VF) { 8497 return ShouldScalarizeInstruction(PhiOrTrunc, VF); 8498 }, 8499 Range); 8500 assert(IndDesc.getStartValue() == 8501 Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader())); 8502 if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) { 8503 return new VPWidenIntOrFpInductionRecipe(Phi, Start, IndDesc, TruncI, 8504 NeedsScalarIV, !NeedsScalarIVOnly); 8505 } 8506 assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here"); 8507 return new VPWidenIntOrFpInductionRecipe(Phi, Start, IndDesc, NeedsScalarIV, 8508 !NeedsScalarIVOnly); 8509 } 8510 8511 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionPHI( 8512 PHINode *Phi, ArrayRef<VPValue *> Operands, VFRange &Range) const { 8513 8514 // Check if this is an integer or fp induction. If so, build the recipe that 8515 // produces its scalar and vector values. 8516 if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi)) 8517 return createWidenInductionRecipe(Phi, Phi, Operands[0], *II, CM, *OrigLoop, 8518 Range); 8519 8520 return nullptr; 8521 } 8522 8523 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate( 8524 TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range, 8525 VPlan &Plan) const { 8526 // Optimize the special case where the source is a constant integer 8527 // induction variable. Notice that we can only optimize the 'trunc' case 8528 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and 8529 // (c) other casts depend on pointer size. 8530 8531 // Determine whether \p K is a truncation based on an induction variable that 8532 // can be optimized. 8533 auto isOptimizableIVTruncate = 8534 [&](Instruction *K) -> std::function<bool(ElementCount)> { 8535 return [=](ElementCount VF) -> bool { 8536 return CM.isOptimizableIVTruncate(K, VF); 8537 }; 8538 }; 8539 8540 if (LoopVectorizationPlanner::getDecisionAndClampRange( 8541 isOptimizableIVTruncate(I), Range)) { 8542 8543 auto *Phi = cast<PHINode>(I->getOperand(0)); 8544 const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi); 8545 VPValue *Start = Plan.getOrAddVPValue(II.getStartValue()); 8546 return createWidenInductionRecipe(Phi, I, Start, II, CM, *OrigLoop, Range); 8547 } 8548 return nullptr; 8549 } 8550 8551 VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi, 8552 ArrayRef<VPValue *> Operands, 8553 VPlanPtr &Plan) { 8554 // If all incoming values are equal, the incoming VPValue can be used directly 8555 // instead of creating a new VPBlendRecipe. 8556 VPValue *FirstIncoming = Operands[0]; 8557 if (all_of(Operands, [FirstIncoming](const VPValue *Inc) { 8558 return FirstIncoming == Inc; 8559 })) { 8560 return Operands[0]; 8561 } 8562 8563 // We know that all PHIs in non-header blocks are converted into selects, so 8564 // we don't have to worry about the insertion order and we can just use the 8565 // builder. At this point we generate the predication tree. There may be 8566 // duplications since this is a simple recursive scan, but future 8567 // optimizations will clean it up. 
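  // Illustrative example (names are invented): a non-header phi such as
  //   %p = phi i32 [ %a, %if.then ], [ %b, %if.else ]
  // becomes a VPBlendRecipe whose operands interleave each incoming value with
  // the mask of its incoming edge, i.e. {%a, mask(if.then->bb), %b,
  // mask(if.else->bb)}; the recipe later lowers to a chain of selects.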
8568 SmallVector<VPValue *, 2> OperandsWithMask; 8569 unsigned NumIncoming = Phi->getNumIncomingValues(); 8570 8571 for (unsigned In = 0; In < NumIncoming; In++) { 8572 VPValue *EdgeMask = 8573 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan); 8574 assert((EdgeMask || NumIncoming == 1) && 8575 "Multiple predecessors with one having a full mask"); 8576 OperandsWithMask.push_back(Operands[In]); 8577 if (EdgeMask) 8578 OperandsWithMask.push_back(EdgeMask); 8579 } 8580 return toVPRecipeResult(new VPBlendRecipe(Phi, OperandsWithMask)); 8581 } 8582 8583 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, 8584 ArrayRef<VPValue *> Operands, 8585 VFRange &Range) const { 8586 8587 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8588 [this, CI](ElementCount VF) { 8589 return CM.isScalarWithPredication(CI, VF); 8590 }, 8591 Range); 8592 8593 if (IsPredicated) 8594 return nullptr; 8595 8596 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8597 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || 8598 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect || 8599 ID == Intrinsic::pseudoprobe || 8600 ID == Intrinsic::experimental_noalias_scope_decl)) 8601 return nullptr; 8602 8603 auto willWiden = [&](ElementCount VF) -> bool { 8604 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8605 // The following case may be scalarized depending on the VF. 8606 // The flag shows whether we use Intrinsic or a usual Call for vectorized 8607 // version of the instruction. 8608 // Is it beneficial to perform intrinsic call compared to lib call? 8609 bool NeedToScalarize = false; 8610 InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); 8611 InstructionCost IntrinsicCost = ID ? CM.getVectorIntrinsicCost(CI, VF) : 0; 8612 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 8613 return UseVectorIntrinsic || !NeedToScalarize; 8614 }; 8615 8616 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8617 return nullptr; 8618 8619 ArrayRef<VPValue *> Ops = Operands.take_front(CI->arg_size()); 8620 return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end())); 8621 } 8622 8623 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { 8624 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) && 8625 !isa<StoreInst>(I) && "Instruction should have been handled earlier"); 8626 // Instruction should be widened, unless it is scalar after vectorization, 8627 // scalarization is profitable or it is predicated. 
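  // For illustration: if WillScalarize (defined below) is false for VF=2 and
  // VF=4 but becomes true at VF=8, getDecisionAndClampRange clamps Range.End
  // to 8, and this query answers "widen" uniformly for the remaining
  // sub-range {2,4}.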
8628 auto WillScalarize = [this, I](ElementCount VF) -> bool { 8629 return CM.isScalarAfterVectorization(I, VF) || 8630 CM.isProfitableToScalarize(I, VF) || 8631 CM.isScalarWithPredication(I, VF); 8632 }; 8633 return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize, 8634 Range); 8635 } 8636 8637 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, 8638 ArrayRef<VPValue *> Operands) const { 8639 auto IsVectorizableOpcode = [](unsigned Opcode) { 8640 switch (Opcode) { 8641 case Instruction::Add: 8642 case Instruction::And: 8643 case Instruction::AShr: 8644 case Instruction::BitCast: 8645 case Instruction::FAdd: 8646 case Instruction::FCmp: 8647 case Instruction::FDiv: 8648 case Instruction::FMul: 8649 case Instruction::FNeg: 8650 case Instruction::FPExt: 8651 case Instruction::FPToSI: 8652 case Instruction::FPToUI: 8653 case Instruction::FPTrunc: 8654 case Instruction::FRem: 8655 case Instruction::FSub: 8656 case Instruction::ICmp: 8657 case Instruction::IntToPtr: 8658 case Instruction::LShr: 8659 case Instruction::Mul: 8660 case Instruction::Or: 8661 case Instruction::PtrToInt: 8662 case Instruction::SDiv: 8663 case Instruction::Select: 8664 case Instruction::SExt: 8665 case Instruction::Shl: 8666 case Instruction::SIToFP: 8667 case Instruction::SRem: 8668 case Instruction::Sub: 8669 case Instruction::Trunc: 8670 case Instruction::UDiv: 8671 case Instruction::UIToFP: 8672 case Instruction::URem: 8673 case Instruction::Xor: 8674 case Instruction::ZExt: 8675 return true; 8676 } 8677 return false; 8678 }; 8679 8680 if (!IsVectorizableOpcode(I->getOpcode())) 8681 return nullptr; 8682 8683 // Success: widen this instruction. 8684 return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end())); 8685 } 8686 8687 void VPRecipeBuilder::fixHeaderPhis() { 8688 BasicBlock *OrigLatch = OrigLoop->getLoopLatch(); 8689 for (VPHeaderPHIRecipe *R : PhisToFix) { 8690 auto *PN = cast<PHINode>(R->getUnderlyingValue()); 8691 VPRecipeBase *IncR = 8692 getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch))); 8693 R->addOperand(IncR->getVPSingleValue()); 8694 } 8695 } 8696 8697 VPBasicBlock *VPRecipeBuilder::handleReplication( 8698 Instruction *I, VFRange &Range, VPBasicBlock *VPBB, 8699 VPlanPtr &Plan) { 8700 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( 8701 [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); }, 8702 Range); 8703 8704 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8705 [&](ElementCount VF) { return CM.isPredicatedInst(I, VF, IsUniform); }, 8706 Range); 8707 8708 // Even if the instruction is not marked as uniform, there are certain 8709 // intrinsic calls that can be effectively treated as such, so we check for 8710 // them here. Conservatively, we only do this for scalable vectors, since 8711 // for fixed-width VFs we can always fall back on full scalarization. 8712 if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) { 8713 switch (cast<IntrinsicInst>(I)->getIntrinsicID()) { 8714 case Intrinsic::assume: 8715 case Intrinsic::lifetime_start: 8716 case Intrinsic::lifetime_end: 8717 // For scalable vectors if one of the operands is variant then we still 8718 // want to mark as uniform, which will generate one instruction for just 8719 // the first lane of the vector. We can't scalarize the call in the same 8720 // way as for fixed-width vectors because we don't know how many lanes 8721 // there are. 
8722 // 8723 // The reasons for doing it this way for scalable vectors are: 8724 // 1. For the assume intrinsic generating the instruction for the first 8725 // lane is still better than not generating any at all. For 8726 // example, the input may be a splat across all lanes. 8727 // 2. For the lifetime start/end intrinsics the pointer operand only 8728 // does anything useful when the input comes from a stack object, 8729 // which suggests it should always be uniform. For non-stack objects 8730 // the effect is to poison the object, which still allows us to 8731 // remove the call. 8732 IsUniform = true; 8733 break; 8734 default: 8735 break; 8736 } 8737 } 8738 8739 auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()), 8740 IsUniform, IsPredicated); 8741 setRecipe(I, Recipe); 8742 Plan->addVPValue(I, Recipe); 8743 8744 // Find if I uses a predicated instruction. If so, it will use its scalar 8745 // value. Avoid hoisting the insert-element which packs the scalar value into 8746 // a vector value, as that happens iff all users use the vector value. 8747 for (VPValue *Op : Recipe->operands()) { 8748 auto *PredR = dyn_cast_or_null<VPPredInstPHIRecipe>(Op->getDef()); 8749 if (!PredR) 8750 continue; 8751 auto *RepR = 8752 cast_or_null<VPReplicateRecipe>(PredR->getOperand(0)->getDef()); 8753 assert(RepR->isPredicated() && 8754 "expected Replicate recipe to be predicated"); 8755 RepR->setAlsoPack(false); 8756 } 8757 8758 // Finalize the recipe for Instr, first if it is not predicated. 8759 if (!IsPredicated) { 8760 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n"); 8761 VPBB->appendRecipe(Recipe); 8762 return VPBB; 8763 } 8764 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); 8765 8766 VPBlockBase *SingleSucc = VPBB->getSingleSuccessor(); 8767 assert(SingleSucc && "VPBB must have a single successor when handling " 8768 "predicated replication."); 8769 VPBlockUtils::disconnectBlocks(VPBB, SingleSucc); 8770 // Record predicated instructions for above packing optimizations. 8771 VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan); 8772 VPBlockUtils::insertBlockAfter(Region, VPBB); 8773 auto *RegSucc = new VPBasicBlock(); 8774 VPBlockUtils::insertBlockAfter(RegSucc, Region); 8775 VPBlockUtils::connectBlocks(RegSucc, SingleSucc); 8776 return RegSucc; 8777 } 8778 8779 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr, 8780 VPRecipeBase *PredRecipe, 8781 VPlanPtr &Plan) { 8782 // Instructions marked for predication are replicated and placed under an 8783 // if-then construct to prevent side-effects. 8784 8785 // Generate recipes to compute the block mask for this region. 8786 VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan); 8787 8788 // Build the triangular if-then region. 8789 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str(); 8790 assert(Instr->getParent() && "Predicated instruction not in any basic block"); 8791 auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask); 8792 auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); 8793 auto *PHIRecipe = Instr->getType()->isVoidTy() 8794 ?
nullptr 8795 : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr)); 8796 if (PHIRecipe) { 8797 Plan->removeVPValueFor(Instr); 8798 Plan->addVPValue(Instr, PHIRecipe); 8799 } 8800 auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); 8801 auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); 8802 VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true); 8803 8804 // Note: first set Entry as region entry and then connect successors starting 8805 // from it in order, to propagate the "parent" of each VPBasicBlock. 8806 VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry); 8807 VPBlockUtils::connectBlocks(Pred, Exit); 8808 8809 return Region; 8810 } 8811 8812 VPRecipeOrVPValueTy 8813 VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, 8814 ArrayRef<VPValue *> Operands, 8815 VFRange &Range, VPlanPtr &Plan) { 8816 // First, check for specific widening recipes that deal with calls, memory 8817 // operations, inductions and Phi nodes. 8818 if (auto *CI = dyn_cast<CallInst>(Instr)) 8819 return toVPRecipeResult(tryToWidenCall(CI, Operands, Range)); 8820 8821 if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr)) 8822 return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan)); 8823 8824 VPRecipeBase *Recipe; 8825 if (auto Phi = dyn_cast<PHINode>(Instr)) { 8826 if (Phi->getParent() != OrigLoop->getHeader()) 8827 return tryToBlend(Phi, Operands, Plan); 8828 if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, Range))) 8829 return toVPRecipeResult(Recipe); 8830 8831 VPHeaderPHIRecipe *PhiRecipe = nullptr; 8832 if (Legal->isReductionVariable(Phi) || Legal->isFirstOrderRecurrence(Phi)) { 8833 VPValue *StartV = Operands[0]; 8834 if (Legal->isReductionVariable(Phi)) { 8835 const RecurrenceDescriptor &RdxDesc = 8836 Legal->getReductionVars().find(Phi)->second; 8837 assert(RdxDesc.getRecurrenceStartValue() == 8838 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); 8839 PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV, 8840 CM.isInLoopReduction(Phi), 8841 CM.useOrderedReductions(RdxDesc)); 8842 } else { 8843 PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV); 8844 } 8845 8846 // Record the incoming value from the backedge, so we can add the incoming 8847 // value from the backedge after all recipes have been created. 8848 recordRecipeOf(cast<Instruction>( 8849 Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch()))); 8850 PhisToFix.push_back(PhiRecipe); 8851 } else { 8852 // TODO: record backedge value for remaining pointer induction phis. 
8853 assert(Phi->getType()->isPointerTy() && 8854 "only pointer phis should be handled here"); 8855 assert(Legal->getInductionVars().count(Phi) && 8856 "Not an induction variable"); 8857 InductionDescriptor II = Legal->getInductionVars().lookup(Phi); 8858 VPValue *Start = Plan->getOrAddVPValue(II.getStartValue()); 8859 PhiRecipe = new VPWidenPHIRecipe(Phi, Start); 8860 } 8861 8862 return toVPRecipeResult(PhiRecipe); 8863 } 8864 8865 if (isa<TruncInst>(Instr) && 8866 (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Operands, 8867 Range, *Plan))) 8868 return toVPRecipeResult(Recipe); 8869 8870 if (!shouldWiden(Instr, Range)) 8871 return nullptr; 8872 8873 if (auto GEP = dyn_cast<GetElementPtrInst>(Instr)) 8874 return toVPRecipeResult(new VPWidenGEPRecipe( 8875 GEP, make_range(Operands.begin(), Operands.end()), OrigLoop)); 8876 8877 if (auto *SI = dyn_cast<SelectInst>(Instr)) { 8878 bool InvariantCond = 8879 PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop); 8880 return toVPRecipeResult(new VPWidenSelectRecipe( 8881 *SI, make_range(Operands.begin(), Operands.end()), InvariantCond)); 8882 } 8883 8884 return toVPRecipeResult(tryToWiden(Instr, Operands)); 8885 } 8886 8887 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, 8888 ElementCount MaxVF) { 8889 assert(OrigLoop->isInnermost() && "Inner loop expected."); 8890 8891 // Collect instructions from the original loop that will become trivially dead 8892 // in the vectorized loop. We don't need to vectorize these instructions. For 8893 // example, original induction update instructions can become dead because we 8894 // separately emit induction "steps" when generating code for the new loop. 8895 // Similarly, we create a new latch condition when setting up the structure 8896 // of the new loop, so the old one can become dead. 8897 SmallPtrSet<Instruction *, 4> DeadInstructions; 8898 collectTriviallyDeadInstructions(DeadInstructions); 8899 8900 // Add assume instructions we need to drop to DeadInstructions, to prevent 8901 // them from being added to the VPlan. 8902 // TODO: We only need to drop assumes in blocks that get flattened. If the 8903 // control flow is preserved, we should keep them. 8904 auto &ConditionalAssumes = Legal->getConditionalAssumes(); 8905 DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end()); 8906 8907 MapVector<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter(); 8908 // Dead instructions do not need sinking. Remove them from SinkAfter. 8909 for (Instruction *I : DeadInstructions) 8910 SinkAfter.erase(I); 8911 8912 // Cannot sink instructions after dead instructions (there won't be any 8913 // recipes for them). Instead, find the first non-dead previous instruction.
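  // Illustrative example (names are invented): if SinkAfter maps %use -> %t
  // and the target %t was collected as trivially dead, the loop below walks
  // backwards from %t to the closest preceding live instruction and uses that
  // instruction as the new sink target instead.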
8914 for (auto &P : Legal->getSinkAfter()) { 8915 Instruction *SinkTarget = P.second; 8916 Instruction *FirstInst = &*SinkTarget->getParent()->begin(); 8917 (void)FirstInst; 8918 while (DeadInstructions.contains(SinkTarget)) { 8919 assert( 8920 SinkTarget != FirstInst && 8921 "Must find a live instruction (at least the one feeding the " 8922 "first-order recurrence PHI) before reaching beginning of the block"); 8923 SinkTarget = SinkTarget->getPrevNode(); 8924 assert(SinkTarget != P.first && 8925 "sink source equals target, no sinking required"); 8926 } 8927 P.second = SinkTarget; 8928 } 8929 8930 auto MaxVFPlusOne = MaxVF.getWithIncrement(1); 8931 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { 8932 VFRange SubRange = {VF, MaxVFPlusOne}; 8933 VPlans.push_back( 8934 buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter)); 8935 VF = SubRange.End; 8936 } 8937 } 8938 8939 // Add a VPCanonicalIVPHIRecipe starting at 0 to the header, a 8940 // CanonicalIVIncrement{NUW} VPInstruction to increment it by VF * UF and a 8941 // BranchOnCount VPInstruction to the latch. 8942 static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL, 8943 bool HasNUW, bool IsVPlanNative) { 8944 Value *StartIdx = ConstantInt::get(IdxTy, 0); 8945 auto *StartV = Plan.getOrAddVPValue(StartIdx); 8946 8947 auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL); 8948 VPRegionBlock *TopRegion = Plan.getVectorLoopRegion(); 8949 VPBasicBlock *Header = TopRegion->getEntryBasicBlock(); 8950 if (IsVPlanNative) 8951 Header = cast<VPBasicBlock>(Header->getSingleSuccessor()); 8952 Header->insert(CanonicalIVPHI, Header->begin()); 8953 8954 auto *CanonicalIVIncrement = 8955 new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementNUW 8956 : VPInstruction::CanonicalIVIncrement, 8957 {CanonicalIVPHI}, DL); 8958 CanonicalIVPHI->addOperand(CanonicalIVIncrement); 8959 8960 VPBasicBlock *EB = TopRegion->getExitBasicBlock(); 8961 if (IsVPlanNative) { 8962 EB = cast<VPBasicBlock>(EB->getSinglePredecessor()); 8963 EB->setCondBit(nullptr); 8964 } 8965 EB->appendRecipe(CanonicalIVIncrement); 8966 8967 auto *BranchOnCount = 8968 new VPInstruction(VPInstruction::BranchOnCount, 8969 {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL); 8970 EB->appendRecipe(BranchOnCount); 8971 } 8972 8973 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( 8974 VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions, 8975 const MapVector<Instruction *, Instruction *> &SinkAfter) { 8976 8977 SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups; 8978 8979 VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder); 8980 8981 // --------------------------------------------------------------------------- 8982 // Pre-construction: record ingredients whose recipes we'll need to further 8983 // process after constructing the initial VPlan. 8984 // --------------------------------------------------------------------------- 8985 8986 // Mark instructions we'll need to sink later and their targets as 8987 // ingredients whose recipe we'll need to record. 
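  // For example (illustrative), a SinkAfter entry {%use -> %target} causes both
  // instructions to be recorded here so their recipes can be looked up and
  // reordered once the initial VPlan has been constructed; the same mechanism
  // is used below for in-loop reduction chains and interleave-group members.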
8988 for (auto &Entry : SinkAfter) { 8989 RecipeBuilder.recordRecipeOf(Entry.first); 8990 RecipeBuilder.recordRecipeOf(Entry.second); 8991 } 8992 for (auto &Reduction : CM.getInLoopReductionChains()) { 8993 PHINode *Phi = Reduction.first; 8994 RecurKind Kind = 8995 Legal->getReductionVars().find(Phi)->second.getRecurrenceKind(); 8996 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second; 8997 8998 RecipeBuilder.recordRecipeOf(Phi); 8999 for (auto &R : ReductionOperations) { 9000 RecipeBuilder.recordRecipeOf(R); 9001 // For min/max reductions, where we have a pair of icmp/select, we also 9002 // need to record the ICmp recipe, so it can be removed later. 9003 assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) && 9004 "Only min/max recurrences allowed for inloop reductions"); 9005 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) 9006 RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0))); 9007 } 9008 } 9009 9010 // For each interleave group which is relevant for this (possibly trimmed) 9011 // Range, add it to the set of groups to be later applied to the VPlan and add 9012 // placeholders for its members' Recipes which we'll be replacing with a 9013 // single VPInterleaveRecipe. 9014 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) { 9015 auto applyIG = [IG, this](ElementCount VF) -> bool { 9016 return (VF.isVector() && // Query is illegal for VF == 1 9017 CM.getWideningDecision(IG->getInsertPos(), VF) == 9018 LoopVectorizationCostModel::CM_Interleave); 9019 }; 9020 if (!getDecisionAndClampRange(applyIG, Range)) 9021 continue; 9022 InterleaveGroups.insert(IG); 9023 for (unsigned i = 0; i < IG->getFactor(); i++) 9024 if (Instruction *Member = IG->getMember(i)) 9025 RecipeBuilder.recordRecipeOf(Member); 9026 }; 9027 9028 // --------------------------------------------------------------------------- 9029 // Build initial VPlan: Scan the body of the loop in a topological order to 9030 // visit each basic block after having visited its predecessor basic blocks. 9031 // --------------------------------------------------------------------------- 9032 9033 // Create initial VPlan skeleton, with separate header and latch blocks. 9034 VPBasicBlock *HeaderVPBB = new VPBasicBlock(); 9035 VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch"); 9036 VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB); 9037 auto *TopRegion = new VPRegionBlock(HeaderVPBB, LatchVPBB, "vector loop"); 9038 auto Plan = std::make_unique<VPlan>(TopRegion); 9039 9040 Instruction *DLInst = 9041 getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()); 9042 addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), 9043 DLInst ? DLInst->getDebugLoc() : DebugLoc(), 9044 !CM.foldTailByMasking(), false); 9045 9046 // Scan the body of the loop in a topological order to visit each basic block 9047 // after having visited its predecessor basic blocks. 9048 LoopBlocksDFS DFS(OrigLoop); 9049 DFS.perform(LI); 9050 9051 VPBasicBlock *VPBB = HeaderVPBB; 9052 SmallVector<VPWidenIntOrFpInductionRecipe *> InductionsToMove; 9053 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 9054 // Relevant instructions from basic block BB will be grouped into VPRecipe 9055 // ingredients and fill a new VPBasicBlock. 9056 unsigned VPBBsForBB = 0; 9057 VPBB->setName(BB->getName()); 9058 Builder.setInsertPoint(VPBB); 9059 9060 // Introduce each ingredient into VPlan. 9061 // TODO: Model and preserve debug intrinsics in VPlan.
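    // Illustrative example (value names are invented): for an instruction such
    // as '%add = add i32 %x, %y', Operands below becomes the VPValues mapped
    // from %x and %y; for a phi in the loop header, Operands instead holds only
    // the start value incoming from the preheader.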
9062 for (Instruction &I : BB->instructionsWithoutDebug()) { 9063 Instruction *Instr = &I; 9064 9065 // First filter out irrelevant instructions, to ensure no recipes are 9066 // built for them. 9067 if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr)) 9068 continue; 9069 9070 SmallVector<VPValue *, 4> Operands; 9071 auto *Phi = dyn_cast<PHINode>(Instr); 9072 if (Phi && Phi->getParent() == OrigLoop->getHeader()) { 9073 Operands.push_back(Plan->getOrAddVPValue( 9074 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()))); 9075 } else { 9076 auto OpRange = Plan->mapToVPValues(Instr->operands()); 9077 Operands = {OpRange.begin(), OpRange.end()}; 9078 } 9079 if (auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe( 9080 Instr, Operands, Range, Plan)) { 9081 // If Instr can be simplified to an existing VPValue, use it. 9082 if (RecipeOrValue.is<VPValue *>()) { 9083 auto *VPV = RecipeOrValue.get<VPValue *>(); 9084 Plan->addVPValue(Instr, VPV); 9085 // If the re-used value is a recipe, register the recipe for the 9086 // instruction, in case the recipe for Instr needs to be recorded. 9087 if (auto *R = dyn_cast_or_null<VPRecipeBase>(VPV->getDef())) 9088 RecipeBuilder.setRecipe(Instr, R); 9089 continue; 9090 } 9091 // Otherwise, add the new recipe. 9092 VPRecipeBase *Recipe = RecipeOrValue.get<VPRecipeBase *>(); 9093 for (auto *Def : Recipe->definedValues()) { 9094 auto *UV = Def->getUnderlyingValue(); 9095 Plan->addVPValue(UV, Def); 9096 } 9097 9098 if (isa<VPWidenIntOrFpInductionRecipe>(Recipe) && 9099 HeaderVPBB->getFirstNonPhi() != VPBB->end()) { 9100 // Keep track of VPWidenIntOrFpInductionRecipes not in the phi section 9101 // of the header block. That can happen for truncates of induction 9102 // variables. Those recipes are moved to the phi section of the header 9103 // block after applying SinkAfter, which relies on the original 9104 // position of the trunc. 9105 assert(isa<TruncInst>(Instr)); 9106 InductionsToMove.push_back( 9107 cast<VPWidenIntOrFpInductionRecipe>(Recipe)); 9108 } 9109 RecipeBuilder.setRecipe(Instr, Recipe); 9110 VPBB->appendRecipe(Recipe); 9111 continue; 9112 } 9113 9114 // Otherwise, if all widening options failed, Instruction is to be 9115 // replicated. This may create a successor for VPBB. 9116 VPBasicBlock *NextVPBB = 9117 RecipeBuilder.handleReplication(Instr, Range, VPBB, Plan); 9118 if (NextVPBB != VPBB) { 9119 VPBB = NextVPBB; 9120 VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++) 9121 : ""); 9122 } 9123 } 9124 9125 VPBlockUtils::insertBlockAfter(new VPBasicBlock(), VPBB); 9126 VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor()); 9127 } 9128 9129 // Fold the last, empty block into its predecessor. 9130 VPBB = VPBlockUtils::tryToMergeBlockIntoPredecessor(VPBB); 9131 assert(VPBB && "expected to fold last (empty) block"); 9132 // After here, VPBB should not be used. 9133 VPBB = nullptr; 9134 9135 assert(isa<VPRegionBlock>(Plan->getEntry()) && 9136 !Plan->getEntry()->getEntryBasicBlock()->empty() && 9137 "entry block must be set to a VPRegionBlock having a non-empty entry " 9138 "VPBasicBlock"); 9139 RecipeBuilder.fixHeaderPhis(); 9140 9141 // --------------------------------------------------------------------------- 9142 // Transform initial VPlan: Apply previously taken decisions, in order, to 9143 // bring the VPlan to its final state. 9144 // --------------------------------------------------------------------------- 9145 9146 // Apply Sink-After legal constraints. 
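  // Illustrative sketch (recipe names are invented): given a SinkAfter entry
  // {SinkRecipe -> TargetRecipe}, the code below either moves SinkRecipe right
  // after TargetRecipe, or, when one of the two sits inside a replicate region,
  // re-links whole regions so that the sink still ends up after its target.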
9147 auto GetReplicateRegion = [](VPRecipeBase *R) -> VPRegionBlock * { 9148 auto *Region = dyn_cast_or_null<VPRegionBlock>(R->getParent()->getParent()); 9149 if (Region && Region->isReplicator()) { 9150 assert(Region->getNumSuccessors() == 1 && 9151 Region->getNumPredecessors() == 1 && "Expected SESE region!"); 9152 assert(R->getParent()->size() == 1 && 9153 "A recipe in an original replicator region must be the only " 9154 "recipe in its block"); 9155 return Region; 9156 } 9157 return nullptr; 9158 }; 9159 for (auto &Entry : SinkAfter) { 9160 VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first); 9161 VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second); 9162 9163 auto *TargetRegion = GetReplicateRegion(Target); 9164 auto *SinkRegion = GetReplicateRegion(Sink); 9165 if (!SinkRegion) { 9166 // If the sink source is not a replicate region, sink the recipe directly. 9167 if (TargetRegion) { 9168 // The target is in a replication region, make sure to move Sink to 9169 // the block after it, not into the replication region itself. 9170 VPBasicBlock *NextBlock = 9171 cast<VPBasicBlock>(TargetRegion->getSuccessors().front()); 9172 Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi()); 9173 } else 9174 Sink->moveAfter(Target); 9175 continue; 9176 } 9177 9178 // The sink source is in a replicate region. Unhook the region from the CFG. 9179 auto *SinkPred = SinkRegion->getSinglePredecessor(); 9180 auto *SinkSucc = SinkRegion->getSingleSuccessor(); 9181 VPBlockUtils::disconnectBlocks(SinkPred, SinkRegion); 9182 VPBlockUtils::disconnectBlocks(SinkRegion, SinkSucc); 9183 VPBlockUtils::connectBlocks(SinkPred, SinkSucc); 9184 9185 if (TargetRegion) { 9186 // The target recipe is also in a replicate region, move the sink region 9187 // after the target region. 9188 auto *TargetSucc = TargetRegion->getSingleSuccessor(); 9189 VPBlockUtils::disconnectBlocks(TargetRegion, TargetSucc); 9190 VPBlockUtils::connectBlocks(TargetRegion, SinkRegion); 9191 VPBlockUtils::connectBlocks(SinkRegion, TargetSucc); 9192 } else { 9193 // The sink source is in a replicate region, we need to move the whole 9194 // replicate region, which should only contain a single recipe in the 9195 // main block. 9196 auto *SplitBlock = 9197 Target->getParent()->splitAt(std::next(Target->getIterator())); 9198 9199 auto *SplitPred = SplitBlock->getSinglePredecessor(); 9200 9201 VPBlockUtils::disconnectBlocks(SplitPred, SplitBlock); 9202 VPBlockUtils::connectBlocks(SplitPred, SinkRegion); 9203 VPBlockUtils::connectBlocks(SinkRegion, SplitBlock); 9204 } 9205 } 9206 9207 VPlanTransforms::removeRedundantCanonicalIVs(*Plan); 9208 VPlanTransforms::removeRedundantInductionCasts(*Plan); 9209 9210 // Now that sink-after is done, move induction recipes for optimized truncates 9211 // to the phi section of the header block. 9212 for (VPWidenIntOrFpInductionRecipe *Ind : InductionsToMove) 9213 Ind->moveBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi()); 9214 9215 // Adjust the recipes for any inloop reductions. 9216 adjustRecipesForReductions(cast<VPBasicBlock>(TopRegion->getExit()), Plan, 9217 RecipeBuilder, Range.Start); 9218 9219 // Introduce a recipe to combine the incoming and previous values of a 9220 // first-order recurrence. 
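  // For illustration (names are invented): a recurrence phi
  //   %for = phi [ %init, %preheader ], [ %next, %latch ]
  // is rewired below so that its users read a FirstOrderRecurrenceSplice of
  // {%for, %next}, i.e. a value combining the previous iteration's element(s)
  // with the current ones; how the splice is lowered is left to later
  // VPlan-to-IR code generation.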
9221 for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) { 9222 auto *RecurPhi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R); 9223 if (!RecurPhi) 9224 continue; 9225 9226 VPRecipeBase *PrevRecipe = RecurPhi->getBackedgeRecipe(); 9227 VPBasicBlock *InsertBlock = PrevRecipe->getParent(); 9228 auto *Region = GetReplicateRegion(PrevRecipe); 9229 if (Region) 9230 InsertBlock = cast<VPBasicBlock>(Region->getSingleSuccessor()); 9231 if (Region || PrevRecipe->isPhi()) 9232 Builder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi()); 9233 else 9234 Builder.setInsertPoint(InsertBlock, std::next(PrevRecipe->getIterator())); 9235 9236 auto *RecurSplice = cast<VPInstruction>( 9237 Builder.createNaryOp(VPInstruction::FirstOrderRecurrenceSplice, 9238 {RecurPhi, RecurPhi->getBackedgeValue()})); 9239 9240 RecurPhi->replaceAllUsesWith(RecurSplice); 9241 // Set the first operand of RecurSplice to RecurPhi again, after replacing 9242 // all users. 9243 RecurSplice->setOperand(0, RecurPhi); 9244 } 9245 9246 // Interleave memory: for each Interleave Group we marked earlier as relevant 9247 // for this VPlan, replace the Recipes widening its memory instructions with a 9248 // single VPInterleaveRecipe at its insertion point. 9249 for (auto IG : InterleaveGroups) { 9250 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( 9251 RecipeBuilder.getRecipe(IG->getInsertPos())); 9252 SmallVector<VPValue *, 4> StoredValues; 9253 for (unsigned i = 0; i < IG->getFactor(); ++i) 9254 if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) { 9255 auto *StoreR = 9256 cast<VPWidenMemoryInstructionRecipe>(RecipeBuilder.getRecipe(SI)); 9257 StoredValues.push_back(StoreR->getStoredValue()); 9258 } 9259 9260 auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues, 9261 Recipe->getMask()); 9262 VPIG->insertBefore(Recipe); 9263 unsigned J = 0; 9264 for (unsigned i = 0; i < IG->getFactor(); ++i) 9265 if (Instruction *Member = IG->getMember(i)) { 9266 if (!Member->getType()->isVoidTy()) { 9267 VPValue *OriginalV = Plan->getVPValue(Member); 9268 Plan->removeVPValueFor(Member); 9269 Plan->addVPValue(Member, VPIG->getVPValue(J)); 9270 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J)); 9271 J++; 9272 } 9273 RecipeBuilder.getRecipe(Member)->eraseFromParent(); 9274 } 9275 } 9276 9277 // From this point onwards, VPlan-to-VPlan transformations may change the plan 9278 // in ways that accessing values using original IR values is incorrect. 9279 Plan->disableValue2VPValue(); 9280 9281 VPlanTransforms::sinkScalarOperands(*Plan); 9282 VPlanTransforms::mergeReplicateRegions(*Plan); 9283 9284 std::string PlanName; 9285 raw_string_ostream RSO(PlanName); 9286 ElementCount VF = Range.Start; 9287 Plan->addVF(VF); 9288 RSO << "Initial VPlan for VF={" << VF; 9289 for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) { 9290 Plan->addVF(VF); 9291 RSO << "," << VF; 9292 } 9293 RSO << "},UF>=1"; 9294 RSO.flush(); 9295 Plan->setName(PlanName); 9296 9297 // Fold Exit block into its predecessor if possible. 9298 // TODO: Fold block earlier once all VPlan transforms properly maintain a 9299 // VPBasicBlock as exit. 
9300 VPBlockUtils::tryToMergeBlockIntoPredecessor(TopRegion->getExit()); 9301 9302 assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid"); 9303 return Plan; 9304 } 9305 9306 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { 9307 // Outer loop handling: Outer loops may require CFG and instruction level 9308 // transformations before even evaluating whether vectorization is profitable. 9309 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 9310 // the vectorization pipeline. 9311 assert(!OrigLoop->isInnermost()); 9312 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 9313 9314 // Create new empty VPlan 9315 auto Plan = std::make_unique<VPlan>(); 9316 9317 // Build hierarchical CFG 9318 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan); 9319 HCFGBuilder.buildHierarchicalCFG(); 9320 9321 for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End); 9322 VF *= 2) 9323 Plan->addVF(VF); 9324 9325 if (EnableVPlanPredication) { 9326 VPlanPredicator VPP(*Plan); 9327 VPP.predicate(); 9328 9329 // Avoid running transformation to recipes until masked code generation in 9330 // VPlan-native path is in place. 9331 return Plan; 9332 } 9333 9334 SmallPtrSet<Instruction *, 1> DeadInstructions; 9335 VPlanTransforms::VPInstructionsToVPRecipes( 9336 OrigLoop, Plan, 9337 [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); }, 9338 DeadInstructions, *PSE.getSE()); 9339 9340 addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DebugLoc(), 9341 true, true); 9342 return Plan; 9343 } 9344 9345 // Adjust the recipes for reductions. For in-loop reductions the chain of 9346 // instructions leading from the loop exit instr to the phi needs to be converted 9347 // to reductions, with one operand being vector and the other being the scalar 9348 // reduction chain. For other reductions, a select is introduced between the phi 9349 // and live-out recipes when folding the tail. 9350 void LoopVectorizationPlanner::adjustRecipesForReductions( 9351 VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, 9352 ElementCount MinVF) { 9353 for (auto &Reduction : CM.getInLoopReductionChains()) { 9354 PHINode *Phi = Reduction.first; 9355 const RecurrenceDescriptor &RdxDesc = 9356 Legal->getReductionVars().find(Phi)->second; 9357 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second; 9358 9359 if (MinVF.isScalar() && !CM.useOrderedReductions(RdxDesc)) 9360 continue; 9361 9362 // ReductionOperations are ordered top-down from the phi's use to the 9363 // LoopExitValue. We keep track of the previous item (the Chain) to tell 9364 // which of the two operands will remain scalar and which will be reduced. 9365 // For minmax the chain will be the select instructions. 9366 Instruction *Chain = Phi; 9367 for (Instruction *R : ReductionOperations) { 9368 VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R); 9369 RecurKind Kind = RdxDesc.getRecurrenceKind(); 9370 9371 VPValue *ChainOp = Plan->getVPValue(Chain); 9372 unsigned FirstOpId; 9373 assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) && 9374 "Only min/max recurrences allowed for inloop reductions"); 9375 // Recognize a call to the llvm.fmuladd intrinsic.
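      // Illustrative example (value names are invented): when the reduction
      // operation is
      //   %r = call fast float @llvm.fmuladd.f32(float %a, float %b, float %acc)
      // the code below emits a separate FMul VPInstruction for (%a * %b) and
      // uses its result as the vector operand of the fadd reduction recipe.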
9376 bool IsFMulAdd = (Kind == RecurKind::FMulAdd); 9377 assert((!IsFMulAdd || RecurrenceDescriptor::isFMulAddIntrinsic(R)) && 9378 "Expected instruction to be a call to the llvm.fmuladd intrinsic"); 9379 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9380 assert(isa<VPWidenSelectRecipe>(WidenRecipe) && 9381 "Expected to replace a VPWidenSelectSC"); 9382 FirstOpId = 1; 9383 } else { 9384 assert((MinVF.isScalar() || isa<VPWidenRecipe>(WidenRecipe) || 9385 (IsFMulAdd && isa<VPWidenCallRecipe>(WidenRecipe))) && 9386 "Expected to replace a VPWidenSC"); 9387 FirstOpId = 0; 9388 } 9389 unsigned VecOpId = 9390 R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId; 9391 VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId)); 9392 9393 auto *CondOp = CM.foldTailByMasking() 9394 ? RecipeBuilder.createBlockInMask(R->getParent(), Plan) 9395 : nullptr; 9396 9397 if (IsFMulAdd) { 9398 // If the instruction is a call to the llvm.fmuladd intrinsic then we 9399 // need to create an fmul recipe to use as the vector operand for the 9400 // fadd reduction. 9401 VPInstruction *FMulRecipe = new VPInstruction( 9402 Instruction::FMul, {VecOp, Plan->getVPValue(R->getOperand(1))}); 9403 FMulRecipe->setFastMathFlags(R->getFastMathFlags()); 9404 WidenRecipe->getParent()->insert(FMulRecipe, 9405 WidenRecipe->getIterator()); 9406 VecOp = FMulRecipe; 9407 } 9408 VPReductionRecipe *RedRecipe = 9409 new VPReductionRecipe(&RdxDesc, R, ChainOp, VecOp, CondOp, TTI); 9410 WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe); 9411 Plan->removeVPValueFor(R); 9412 Plan->addVPValue(R, RedRecipe); 9413 WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator()); 9414 WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe); 9415 WidenRecipe->eraseFromParent(); 9416 9417 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9418 VPRecipeBase *CompareRecipe = 9419 RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0))); 9420 assert(isa<VPWidenRecipe>(CompareRecipe) && 9421 "Expected to replace a VPWidenSC"); 9422 assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 && 9423 "Expected no remaining users"); 9424 CompareRecipe->eraseFromParent(); 9425 } 9426 Chain = R; 9427 } 9428 } 9429 9430 // If tail is folded by masking, introduce selects between the phi 9431 // and the live-out instruction of each reduction, at the beginning of the 9432 // dedicated latch block. 
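  // For illustration (names are invented): for a reduction phi %red.phi with
  // backedge value %red.next, tail folding adds a latch select
  //   %red.select = select <header mask>, %red.next, %red.phi
  // so that masked-off lanes keep the value from the previous iteration.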
9433 if (CM.foldTailByMasking()) { 9434 Builder.setInsertPoint(LatchVPBB, LatchVPBB->begin()); 9435 for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) { 9436 VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R); 9437 if (!PhiR || PhiR->isInLoop()) 9438 continue; 9439 VPValue *Cond = 9440 RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan); 9441 VPValue *Red = PhiR->getBackedgeValue(); 9442 assert(cast<VPRecipeBase>(Red->getDef())->getParent() != LatchVPBB && 9443 "reduction recipe must be defined before latch"); 9444 Builder.createNaryOp(Instruction::Select, {Cond, Red, PhiR}); 9445 } 9446 } 9447 } 9448 9449 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 9450 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, 9451 VPSlotTracker &SlotTracker) const { 9452 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; 9453 IG->getInsertPos()->printAsOperand(O, false); 9454 O << ", "; 9455 getAddr()->printAsOperand(O, SlotTracker); 9456 VPValue *Mask = getMask(); 9457 if (Mask) { 9458 O << ", "; 9459 Mask->printAsOperand(O, SlotTracker); 9460 } 9461 9462 unsigned OpIdx = 0; 9463 for (unsigned i = 0; i < IG->getFactor(); ++i) { 9464 if (!IG->getMember(i)) 9465 continue; 9466 if (getNumStoreOperands() > 0) { 9467 O << "\n" << Indent << " store "; 9468 getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker); 9469 O << " to index " << i; 9470 } else { 9471 O << "\n" << Indent << " "; 9472 getVPValue(OpIdx)->printAsOperand(O, SlotTracker); 9473 O << " = load from index " << i; 9474 } 9475 ++OpIdx; 9476 } 9477 } 9478 #endif 9479 9480 void VPWidenCallRecipe::execute(VPTransformState &State) { 9481 State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this, 9482 *this, State); 9483 } 9484 9485 void VPWidenSelectRecipe::execute(VPTransformState &State) { 9486 auto &I = *cast<SelectInst>(getUnderlyingInstr()); 9487 State.ILV->setDebugLocFromInst(&I); 9488 9489 // The condition can be loop invariant but still defined inside the 9490 // loop. This means that we can't just use the original 'cond' value. 9491 // We have to take the 'vectorized' value and pick the first lane. 9492 // Instcombine will make this a no-op. 9493 auto *InvarCond = 9494 InvariantCond ? State.get(getOperand(0), VPIteration(0, 0)) : nullptr; 9495 9496 for (unsigned Part = 0; Part < State.UF; ++Part) { 9497 Value *Cond = InvarCond ? 
InvarCond : State.get(getOperand(0), Part); 9498 Value *Op0 = State.get(getOperand(1), Part); 9499 Value *Op1 = State.get(getOperand(2), Part); 9500 Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1); 9501 State.set(this, Sel, Part); 9502 State.ILV->addMetadata(Sel, &I); 9503 } 9504 } 9505 9506 void VPWidenRecipe::execute(VPTransformState &State) { 9507 auto &I = *cast<Instruction>(getUnderlyingValue()); 9508 auto &Builder = State.Builder; 9509 switch (I.getOpcode()) { 9510 case Instruction::Call: 9511 case Instruction::Br: 9512 case Instruction::PHI: 9513 case Instruction::GetElementPtr: 9514 case Instruction::Select: 9515 llvm_unreachable("This instruction is handled by a different recipe."); 9516 case Instruction::UDiv: 9517 case Instruction::SDiv: 9518 case Instruction::SRem: 9519 case Instruction::URem: 9520 case Instruction::Add: 9521 case Instruction::FAdd: 9522 case Instruction::Sub: 9523 case Instruction::FSub: 9524 case Instruction::FNeg: 9525 case Instruction::Mul: 9526 case Instruction::FMul: 9527 case Instruction::FDiv: 9528 case Instruction::FRem: 9529 case Instruction::Shl: 9530 case Instruction::LShr: 9531 case Instruction::AShr: 9532 case Instruction::And: 9533 case Instruction::Or: 9534 case Instruction::Xor: { 9535 // Just widen unops and binops. 9536 State.ILV->setDebugLocFromInst(&I); 9537 9538 for (unsigned Part = 0; Part < State.UF; ++Part) { 9539 SmallVector<Value *, 2> Ops; 9540 for (VPValue *VPOp : operands()) 9541 Ops.push_back(State.get(VPOp, Part)); 9542 9543 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); 9544 9545 if (auto *VecOp = dyn_cast<Instruction>(V)) { 9546 VecOp->copyIRFlags(&I); 9547 9548 // If the instruction is vectorized and was in a basic block that needed 9549 // predication, we can't propagate poison-generating flags (nuw/nsw, 9550 // exact, etc.). The control flow has been linearized and the 9551 // instruction is no longer guarded by the predicate, which could make 9552 // the flag properties to no longer hold. 9553 if (State.MayGeneratePoisonRecipes.contains(this)) 9554 VecOp->dropPoisonGeneratingFlags(); 9555 } 9556 9557 // Use this vector value for all users of the original instruction. 9558 State.set(this, V, Part); 9559 State.ILV->addMetadata(V, &I); 9560 } 9561 9562 break; 9563 } 9564 case Instruction::ICmp: 9565 case Instruction::FCmp: { 9566 // Widen compares. Generate vector compares. 9567 bool FCmp = (I.getOpcode() == Instruction::FCmp); 9568 auto *Cmp = cast<CmpInst>(&I); 9569 State.ILV->setDebugLocFromInst(Cmp); 9570 for (unsigned Part = 0; Part < State.UF; ++Part) { 9571 Value *A = State.get(getOperand(0), Part); 9572 Value *B = State.get(getOperand(1), Part); 9573 Value *C = nullptr; 9574 if (FCmp) { 9575 // Propagate fast math flags. 
9576 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 9577 Builder.setFastMathFlags(Cmp->getFastMathFlags()); 9578 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); 9579 } else { 9580 C = Builder.CreateICmp(Cmp->getPredicate(), A, B); 9581 } 9582 State.set(this, C, Part); 9583 State.ILV->addMetadata(C, &I); 9584 } 9585 9586 break; 9587 } 9588 9589 case Instruction::ZExt: 9590 case Instruction::SExt: 9591 case Instruction::FPToUI: 9592 case Instruction::FPToSI: 9593 case Instruction::FPExt: 9594 case Instruction::PtrToInt: 9595 case Instruction::IntToPtr: 9596 case Instruction::SIToFP: 9597 case Instruction::UIToFP: 9598 case Instruction::Trunc: 9599 case Instruction::FPTrunc: 9600 case Instruction::BitCast: { 9601 auto *CI = cast<CastInst>(&I); 9602 State.ILV->setDebugLocFromInst(CI); 9603 9604 /// Vectorize casts. 9605 Type *DestTy = (State.VF.isScalar()) 9606 ? CI->getType() 9607 : VectorType::get(CI->getType(), State.VF); 9608 9609 for (unsigned Part = 0; Part < State.UF; ++Part) { 9610 Value *A = State.get(getOperand(0), Part); 9611 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); 9612 State.set(this, Cast, Part); 9613 State.ILV->addMetadata(Cast, &I); 9614 } 9615 break; 9616 } 9617 default: 9618 // This instruction is not vectorized by simple widening. 9619 LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); 9620 llvm_unreachable("Unhandled instruction!"); 9621 } // end of switch. 9622 } 9623 9624 void VPWidenGEPRecipe::execute(VPTransformState &State) { 9625 auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr()); 9626 // Construct a vector GEP by widening the operands of the scalar GEP as 9627 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP 9628 // results in a vector of pointers when at least one operand of the GEP 9629 // is vector-typed. Thus, to keep the representation compact, we only use 9630 // vector-typed operands for loop-varying values. 9631 9632 if (State.VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { 9633 // If we are vectorizing, but the GEP has only loop-invariant operands, 9634 // the GEP we build (by only using vector-typed operands for 9635 // loop-varying values) would be a scalar pointer. Thus, to ensure we 9636 // produce a vector of pointers, we need to either arbitrarily pick an 9637 // operand to broadcast, or broadcast a clone of the original GEP. 9638 // Here, we broadcast a clone of the original. 9639 // 9640 // TODO: If at some point we decide to scalarize instructions having 9641 // loop-invariant operands, this special case will no longer be 9642 // required. We would add the scalarization decision to 9643 // collectLoopScalars() and teach getVectorValue() to broadcast 9644 // the lane-zero scalar value. 9645 auto *Clone = State.Builder.Insert(GEP->clone()); 9646 for (unsigned Part = 0; Part < State.UF; ++Part) { 9647 Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, Clone); 9648 State.set(this, EntryPart, Part); 9649 State.ILV->addMetadata(EntryPart, GEP); 9650 } 9651 } else { 9652 // If the GEP has at least one loop-varying operand, we are sure to 9653 // produce a vector of pointers. But if we are only unrolling, we want 9654 // to produce a scalar GEP for each unroll part. Thus, the GEP we 9655 // produce with the code below will be scalar (if VF == 1) or vector 9656 // (otherwise). Note that for the unroll-only case, we still maintain 9657 // values in the vector mapping with initVector, as we do for other 9658 // instructions. 
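    // Illustrative example (names are invented): for
    //   %g = getelementptr inbounds float, float* %base, i64 %iv
    // with a loop-invariant %base and a widened %iv, the loop below keeps
    // %base scalar, uses the vector of indices, and emits one
    // vector-of-pointers GEP per unroll part.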
9659 for (unsigned Part = 0; Part < State.UF; ++Part) { 9660 // The pointer operand of the new GEP. If it's loop-invariant, we 9661 // won't broadcast it. 9662 auto *Ptr = IsPtrLoopInvariant 9663 ? State.get(getOperand(0), VPIteration(0, 0)) 9664 : State.get(getOperand(0), Part); 9665 9666 // Collect all the indices for the new GEP. If any index is 9667 // loop-invariant, we won't broadcast it. 9668 SmallVector<Value *, 4> Indices; 9669 for (unsigned I = 1, E = getNumOperands(); I < E; I++) { 9670 VPValue *Operand = getOperand(I); 9671 if (IsIndexLoopInvariant[I - 1]) 9672 Indices.push_back(State.get(Operand, VPIteration(0, 0))); 9673 else 9674 Indices.push_back(State.get(Operand, Part)); 9675 } 9676 9677 // If the GEP instruction is vectorized and was in a basic block that 9678 // needed predication, we can't propagate the poison-generating 'inbounds' 9679 // flag. The control flow has been linearized and the GEP is no longer 9680 // guarded by the predicate, which could make the 'inbounds' properties to 9681 // no longer hold. 9682 bool IsInBounds = 9683 GEP->isInBounds() && State.MayGeneratePoisonRecipes.count(this) == 0; 9684 9685 // Create the new GEP. Note that this GEP may be a scalar if VF == 1, 9686 // but it should be a vector, otherwise. 9687 auto *NewGEP = IsInBounds 9688 ? State.Builder.CreateInBoundsGEP( 9689 GEP->getSourceElementType(), Ptr, Indices) 9690 : State.Builder.CreateGEP(GEP->getSourceElementType(), 9691 Ptr, Indices); 9692 assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) && 9693 "NewGEP is not a pointer vector"); 9694 State.set(this, NewGEP, Part); 9695 State.ILV->addMetadata(NewGEP, GEP); 9696 } 9697 } 9698 } 9699 9700 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { 9701 assert(!State.Instance && "Int or FP induction being replicated."); 9702 auto *CanonicalIV = State.get(getParent()->getPlan()->getCanonicalIV(), 0); 9703 State.ILV->widenIntOrFpInduction(IV, this, State, CanonicalIV); 9704 } 9705 9706 void VPWidenPHIRecipe::execute(VPTransformState &State) { 9707 State.ILV->widenPHIInstruction(cast<PHINode>(getUnderlyingValue()), this, 9708 State); 9709 } 9710 9711 void VPBlendRecipe::execute(VPTransformState &State) { 9712 State.ILV->setDebugLocFromInst(Phi, &State.Builder); 9713 // We know that all PHIs in non-header blocks are converted into 9714 // selects, so we don't have to worry about the insertion order and we 9715 // can just use the builder. 9716 // At this point we generate the predication tree. There may be 9717 // duplications since this is a simple recursive scan, but future 9718 // optimizations will clean it up. 9719 9720 unsigned NumIncoming = getNumIncomingValues(); 9721 9722 // Generate a sequence of selects of the form: 9723 // SELECT(Mask3, In3, 9724 // SELECT(Mask2, In2, 9725 // SELECT(Mask1, In1, 9726 // In0))) 9727 // Note that Mask0 is never used: lanes for which no path reaches this phi and 9728 // are essentially undef are taken from In0. 9729 InnerLoopVectorizer::VectorParts Entry(State.UF); 9730 for (unsigned In = 0; In < NumIncoming; ++In) { 9731 for (unsigned Part = 0; Part < State.UF; ++Part) { 9732 // We might have single edge PHIs (blocks) - use an identity 9733 // 'select' for the first PHI operand. 9734 Value *In0 = State.get(getIncomingValue(In), Part); 9735 if (In == 0) 9736 Entry[Part] = In0; // Initialize with the first incoming value. 9737 else { 9738 // Select between the current value and the previous incoming edge 9739 // based on the incoming mask. 
9740 Value *Cond = State.get(getMask(In), Part); 9741 Entry[Part] = 9742 State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi"); 9743 } 9744 } 9745 } 9746 for (unsigned Part = 0; Part < State.UF; ++Part) 9747 State.set(this, Entry[Part], Part); 9748 } 9749 9750 void VPInterleaveRecipe::execute(VPTransformState &State) { 9751 assert(!State.Instance && "Interleave group being replicated."); 9752 State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(), 9753 getStoredValues(), getMask()); 9754 } 9755 9756 void VPReductionRecipe::execute(VPTransformState &State) { 9757 assert(!State.Instance && "Reduction being replicated."); 9758 Value *PrevInChain = State.get(getChainOp(), 0); 9759 RecurKind Kind = RdxDesc->getRecurrenceKind(); 9760 bool IsOrdered = State.ILV->useOrderedReductions(*RdxDesc); 9761 // Propagate the fast-math flags carried by the underlying instruction. 9762 IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder); 9763 State.Builder.setFastMathFlags(RdxDesc->getFastMathFlags()); 9764 for (unsigned Part = 0; Part < State.UF; ++Part) { 9765 Value *NewVecOp = State.get(getVecOp(), Part); 9766 if (VPValue *Cond = getCondOp()) { 9767 Value *NewCond = State.get(Cond, Part); 9768 VectorType *VecTy = cast<VectorType>(NewVecOp->getType()); 9769 Value *Iden = RdxDesc->getRecurrenceIdentity( 9770 Kind, VecTy->getElementType(), RdxDesc->getFastMathFlags()); 9771 Value *IdenVec = 9772 State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden); 9773 Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec); 9774 NewVecOp = Select; 9775 } 9776 Value *NewRed; 9777 Value *NextInChain; 9778 if (IsOrdered) { 9779 if (State.VF.isVector()) 9780 NewRed = createOrderedReduction(State.Builder, *RdxDesc, NewVecOp, 9781 PrevInChain); 9782 else 9783 NewRed = State.Builder.CreateBinOp( 9784 (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), PrevInChain, 9785 NewVecOp); 9786 PrevInChain = NewRed; 9787 } else { 9788 PrevInChain = State.get(getChainOp(), Part); 9789 NewRed = createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp); 9790 } 9791 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9792 NextInChain = 9793 createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(), 9794 NewRed, PrevInChain); 9795 } else if (IsOrdered) 9796 NextInChain = NewRed; 9797 else 9798 NextInChain = State.Builder.CreateBinOp( 9799 (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), NewRed, 9800 PrevInChain); 9801 State.set(this, NextInChain, Part); 9802 } 9803 } 9804 9805 void VPReplicateRecipe::execute(VPTransformState &State) { 9806 if (State.Instance) { // Generate a single instance. 9807 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector"); 9808 State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *State.Instance, 9809 IsPredicated, State); 9810 // Insert scalar instance packing it into a vector. 9811 if (AlsoPack && State.VF.isVector()) { 9812 // If we're constructing lane 0, initialize to start from poison. 
9813 if (State.Instance->Lane.isFirstLane()) {
9814 assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
9815 Value *Poison = PoisonValue::get(
9816 VectorType::get(getUnderlyingValue()->getType(), State.VF));
9817 State.set(this, Poison, State.Instance->Part);
9818 }
9819 State.ILV->packScalarIntoVectorValue(this, *State.Instance, State);
9820 }
9821 return;
9822 }
9823
9824 // Generate scalar instances for all VF lanes of all UF parts, unless the
9825 // instruction is uniform, in which case generate only the first lane for each
9826 // of the UF parts.
9827 unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();
9828 assert((!State.VF.isScalable() || IsUniform) &&
9829 "Can't scalarize a scalable vector");
9830 for (unsigned Part = 0; Part < State.UF; ++Part)
9831 for (unsigned Lane = 0; Lane < EndLane; ++Lane)
9832 State.ILV->scalarizeInstruction(getUnderlyingInstr(), this,
9833 VPIteration(Part, Lane), IsPredicated,
9834 State);
9835 }
9836
9837 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
9838 assert(State.Instance && "Branch on Mask works only on single instance.");
9839
9840 unsigned Part = State.Instance->Part;
9841 unsigned Lane = State.Instance->Lane.getKnownLane();
9842
9843 Value *ConditionBit = nullptr;
9844 VPValue *BlockInMask = getMask();
9845 if (BlockInMask) {
9846 ConditionBit = State.get(BlockInMask, Part);
9847 if (ConditionBit->getType()->isVectorTy())
9848 ConditionBit = State.Builder.CreateExtractElement(
9849 ConditionBit, State.Builder.getInt32(Lane));
9850 } else // Block in mask is all-one.
9851 ConditionBit = State.Builder.getTrue();
9852
9853 // Replace the temporary unreachable terminator with a new conditional branch,
9854 // whose two destinations will be set later when they are created.
9855 auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
9856 assert(isa<UnreachableInst>(CurrentTerminator) &&
9857 "Expected to replace unreachable terminator with conditional branch.");
9858 auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
9859 CondBr->setSuccessor(0, nullptr);
9860 ReplaceInstWithInst(CurrentTerminator, CondBr);
9861 }
9862
9863 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
9864 assert(State.Instance && "Predicated instruction PHI works per instance.");
9865 Instruction *ScalarPredInst =
9866 cast<Instruction>(State.get(getOperand(0), *State.Instance));
9867 BasicBlock *PredicatedBB = ScalarPredInst->getParent();
9868 BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
9869 assert(PredicatingBB && "Predicated block has no single predecessor.");
9870 assert(isa<VPReplicateRecipe>(getOperand(0)) &&
9871 "operand must be VPReplicateRecipe");
9872
9873 // By current pack/unpack logic we need to generate only a single phi node: if
9874 // a vector value for the predicated instruction exists at this point it means
9875 // the instruction has vector users only, and a phi for the vector value is
9876 // needed. In this case the recipe of the predicated instruction is marked to
9877 // also do that packing, thereby "hoisting" the insert-element sequence.
9878 // Otherwise, a phi node for the scalar value is needed.
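// For example (illustrative IR only): when a vector value already exists, the
// generated phi merges the not-yet-modified vector with the insert-element
// result,
//   %vec.phi = phi <VF x Ty> [ %vec, %predicating.bb ], [ %insert, %predicated.bb ]
// whereas in the scalar case it merges poison with the predicated instruction,
//   %phi = phi Ty [ poison, %predicating.bb ], [ %scalar.inst, %predicated.bb ]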
9879 unsigned Part = State.Instance->Part; 9880 if (State.hasVectorValue(getOperand(0), Part)) { 9881 Value *VectorValue = State.get(getOperand(0), Part); 9882 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); 9883 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); 9884 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. 9885 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. 9886 if (State.hasVectorValue(this, Part)) 9887 State.reset(this, VPhi, Part); 9888 else 9889 State.set(this, VPhi, Part); 9890 // NOTE: Currently we need to update the value of the operand, so the next 9891 // predicated iteration inserts its generated value in the correct vector. 9892 State.reset(getOperand(0), VPhi, Part); 9893 } else { 9894 Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType(); 9895 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); 9896 Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()), 9897 PredicatingBB); 9898 Phi->addIncoming(ScalarPredInst, PredicatedBB); 9899 if (State.hasScalarValue(this, *State.Instance)) 9900 State.reset(this, Phi, *State.Instance); 9901 else 9902 State.set(this, Phi, *State.Instance); 9903 // NOTE: Currently we need to update the value of the operand, so the next 9904 // predicated iteration inserts its generated value in the correct vector. 9905 State.reset(getOperand(0), Phi, *State.Instance); 9906 } 9907 } 9908 9909 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { 9910 VPValue *StoredValue = isStore() ? getStoredValue() : nullptr; 9911 9912 // Attempt to issue a wide load. 9913 LoadInst *LI = dyn_cast<LoadInst>(&Ingredient); 9914 StoreInst *SI = dyn_cast<StoreInst>(&Ingredient); 9915 9916 assert((LI || SI) && "Invalid Load/Store instruction"); 9917 assert((!SI || StoredValue) && "No stored value provided for widened store"); 9918 assert((!LI || !StoredValue) && "Stored value provided for widened load"); 9919 9920 Type *ScalarDataTy = getLoadStoreType(&Ingredient); 9921 9922 auto *DataTy = VectorType::get(ScalarDataTy, State.VF); 9923 const Align Alignment = getLoadStoreAlignment(&Ingredient); 9924 bool CreateGatherScatter = !Consecutive; 9925 9926 auto &Builder = State.Builder; 9927 InnerLoopVectorizer::VectorParts BlockInMaskParts(State.UF); 9928 bool isMaskRequired = getMask(); 9929 if (isMaskRequired) 9930 for (unsigned Part = 0; Part < State.UF; ++Part) 9931 BlockInMaskParts[Part] = State.get(getMask(), Part); 9932 9933 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { 9934 // Calculate the pointer for the specific unroll-part. 9935 GetElementPtrInst *PartPtr = nullptr; 9936 9937 bool InBounds = false; 9938 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) 9939 InBounds = gep->isInBounds(); 9940 if (Reverse) { 9941 // If the address is consecutive but reversed, then the 9942 // wide store needs to start at the last vector element. 
9943 // RunTimeVF = VScale * VF.getKnownMinValue() 9944 // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue() 9945 Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), State.VF); 9946 // NumElt = -Part * RunTimeVF 9947 Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF); 9948 // LastLane = 1 - RunTimeVF 9949 Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF); 9950 PartPtr = 9951 cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt)); 9952 PartPtr->setIsInBounds(InBounds); 9953 PartPtr = cast<GetElementPtrInst>( 9954 Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane)); 9955 PartPtr->setIsInBounds(InBounds); 9956 if (isMaskRequired) // Reverse of a null all-one mask is a null mask. 9957 BlockInMaskParts[Part] = 9958 Builder.CreateVectorReverse(BlockInMaskParts[Part], "reverse"); 9959 } else { 9960 Value *Increment = 9961 createStepForVF(Builder, Builder.getInt32Ty(), State.VF, Part); 9962 PartPtr = cast<GetElementPtrInst>( 9963 Builder.CreateGEP(ScalarDataTy, Ptr, Increment)); 9964 PartPtr->setIsInBounds(InBounds); 9965 } 9966 9967 unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); 9968 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); 9969 }; 9970 9971 // Handle Stores: 9972 if (SI) { 9973 State.ILV->setDebugLocFromInst(SI); 9974 9975 for (unsigned Part = 0; Part < State.UF; ++Part) { 9976 Instruction *NewSI = nullptr; 9977 Value *StoredVal = State.get(StoredValue, Part); 9978 if (CreateGatherScatter) { 9979 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 9980 Value *VectorGep = State.get(getAddr(), Part); 9981 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, 9982 MaskPart); 9983 } else { 9984 if (Reverse) { 9985 // If we store to reverse consecutive memory locations, then we need 9986 // to reverse the order of elements in the stored value. 9987 StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse"); 9988 // We don't want to update the value in the map as it might be used in 9989 // another expression. So don't call resetVectorValue(StoredVal). 9990 } 9991 auto *VecPtr = 9992 CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0))); 9993 if (isMaskRequired) 9994 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, 9995 BlockInMaskParts[Part]); 9996 else 9997 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); 9998 } 9999 State.ILV->addMetadata(NewSI, SI); 10000 } 10001 return; 10002 } 10003 10004 // Handle loads. 10005 assert(LI && "Must have a load instruction"); 10006 State.ILV->setDebugLocFromInst(LI); 10007 for (unsigned Part = 0; Part < State.UF; ++Part) { 10008 Value *NewLI; 10009 if (CreateGatherScatter) { 10010 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 10011 Value *VectorGep = State.get(getAddr(), Part); 10012 NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart, 10013 nullptr, "wide.masked.gather"); 10014 State.ILV->addMetadata(NewLI, LI); 10015 } else { 10016 auto *VecPtr = 10017 CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0))); 10018 if (isMaskRequired) 10019 NewLI = Builder.CreateMaskedLoad( 10020 DataTy, VecPtr, Alignment, BlockInMaskParts[Part], 10021 PoisonValue::get(DataTy), "wide.masked.load"); 10022 else 10023 NewLI = 10024 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); 10025 10026 // Add metadata to the load, but setVectorValue to the reverse shuffle. 
10027 State.ILV->addMetadata(NewLI, LI); 10028 if (Reverse) 10029 NewLI = Builder.CreateVectorReverse(NewLI, "reverse"); 10030 } 10031 10032 State.set(this, NewLI, Part); 10033 } 10034 } 10035 10036 // Determine how to lower the scalar epilogue, which depends on 1) optimising 10037 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing 10038 // predication, and 4) a TTI hook that analyses whether the loop is suitable 10039 // for predication. 10040 static ScalarEpilogueLowering getScalarEpilogueLowering( 10041 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, 10042 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, 10043 AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, 10044 LoopVectorizationLegality &LVL) { 10045 // 1) OptSize takes precedence over all other options, i.e. if this is set, 10046 // don't look at hints or options, and don't request a scalar epilogue. 10047 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from 10048 // LoopAccessInfo (due to code dependency and not being able to reliably get 10049 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection 10050 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without 10051 // versioning when the vectorization is forced, unlike hasOptSize. So revert 10052 // back to the old way and vectorize with versioning when forced. See D81345.) 10053 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, 10054 PGSOQueryType::IRPass) && 10055 Hints.getForce() != LoopVectorizeHints::FK_Enabled)) 10056 return CM_ScalarEpilogueNotAllowedOptSize; 10057 10058 // 2) If set, obey the directives 10059 if (PreferPredicateOverEpilogue.getNumOccurrences()) { 10060 switch (PreferPredicateOverEpilogue) { 10061 case PreferPredicateTy::ScalarEpilogue: 10062 return CM_ScalarEpilogueAllowed; 10063 case PreferPredicateTy::PredicateElseScalarEpilogue: 10064 return CM_ScalarEpilogueNotNeededUsePredicate; 10065 case PreferPredicateTy::PredicateOrDontVectorize: 10066 return CM_ScalarEpilogueNotAllowedUsePredicate; 10067 }; 10068 } 10069 10070 // 3) If set, obey the hints 10071 switch (Hints.getPredicate()) { 10072 case LoopVectorizeHints::FK_Enabled: 10073 return CM_ScalarEpilogueNotNeededUsePredicate; 10074 case LoopVectorizeHints::FK_Disabled: 10075 return CM_ScalarEpilogueAllowed; 10076 }; 10077 10078 // 4) if the TTI hook indicates this is profitable, request predication. 10079 if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, 10080 LVL.getLAI())) 10081 return CM_ScalarEpilogueNotNeededUsePredicate; 10082 10083 return CM_ScalarEpilogueAllowed; 10084 } 10085 10086 Value *VPTransformState::get(VPValue *Def, unsigned Part) { 10087 // If Values have been set for this Def return the one relevant for \p Part. 10088 if (hasVectorValue(Def, Part)) 10089 return Data.PerPartOutput[Def][Part]; 10090 10091 if (!hasScalarValue(Def, {Part, 0})) { 10092 Value *IRV = Def->getLiveInIRValue(); 10093 Value *B = ILV->getBroadcastInstrs(IRV); 10094 set(Def, B, Part); 10095 return B; 10096 } 10097 10098 Value *ScalarValue = get(Def, {Part, 0}); 10099 // If we aren't vectorizing, we can just copy the scalar map values over 10100 // to the vector map. 10101 if (VF.isScalar()) { 10102 set(Def, ScalarValue, Part); 10103 return ScalarValue; 10104 } 10105 10106 auto *RepR = dyn_cast<VPReplicateRecipe>(Def); 10107 bool IsUniform = RepR && RepR->isUniform(); 10108 10109 unsigned LastLane = IsUniform ? 
0 : VF.getKnownMinValue() - 1; 10110 // Check if there is a scalar value for the selected lane. 10111 if (!hasScalarValue(Def, {Part, LastLane})) { 10112 // At the moment, VPWidenIntOrFpInductionRecipes can also be uniform. 10113 assert(isa<VPWidenIntOrFpInductionRecipe>(Def->getDef()) && 10114 "unexpected recipe found to be invariant"); 10115 IsUniform = true; 10116 LastLane = 0; 10117 } 10118 10119 auto *LastInst = cast<Instruction>(get(Def, {Part, LastLane})); 10120 // Set the insert point after the last scalarized instruction or after the 10121 // last PHI, if LastInst is a PHI. This ensures the insertelement sequence 10122 // will directly follow the scalar definitions. 10123 auto OldIP = Builder.saveIP(); 10124 auto NewIP = 10125 isa<PHINode>(LastInst) 10126 ? BasicBlock::iterator(LastInst->getParent()->getFirstNonPHI()) 10127 : std::next(BasicBlock::iterator(LastInst)); 10128 Builder.SetInsertPoint(&*NewIP); 10129 10130 // However, if we are vectorizing, we need to construct the vector values. 10131 // If the value is known to be uniform after vectorization, we can just 10132 // broadcast the scalar value corresponding to lane zero for each unroll 10133 // iteration. Otherwise, we construct the vector values using 10134 // insertelement instructions. Since the resulting vectors are stored in 10135 // State, we will only generate the insertelements once. 10136 Value *VectorValue = nullptr; 10137 if (IsUniform) { 10138 VectorValue = ILV->getBroadcastInstrs(ScalarValue); 10139 set(Def, VectorValue, Part); 10140 } else { 10141 // Initialize packing with insertelements to start from undef. 10142 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 10143 Value *Undef = PoisonValue::get(VectorType::get(LastInst->getType(), VF)); 10144 set(Def, Undef, Part); 10145 for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane) 10146 ILV->packScalarIntoVectorValue(Def, {Part, Lane}, *this); 10147 VectorValue = get(Def, Part); 10148 } 10149 Builder.restoreIP(OldIP); 10150 return VectorValue; 10151 } 10152 10153 // Process the loop in the VPlan-native vectorization path. This path builds 10154 // VPlan upfront in the vectorization pipeline, which allows to apply 10155 // VPlan-to-VPlan transformations from the very beginning without modifying the 10156 // input LLVM IR. 10157 static bool processLoopInVPlanNativePath( 10158 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, 10159 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, 10160 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, 10161 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, 10162 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints, 10163 LoopVectorizationRequirements &Requirements) { 10164 10165 if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) { 10166 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n"); 10167 return false; 10168 } 10169 assert(EnableVPlanNativePath && "VPlan-native path is disabled."); 10170 Function *F = L->getHeader()->getParent(); 10171 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); 10172 10173 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 10174 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL); 10175 10176 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, 10177 &Hints, IAI); 10178 // Use the planner for outer loop vectorization. 10179 // TODO: CM is not used at this point inside the planner. 
Turn CM into an 10180 // optional argument if we don't need it in the future. 10181 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints, 10182 Requirements, ORE); 10183 10184 // Get user vectorization factor. 10185 ElementCount UserVF = Hints.getWidth(); 10186 10187 CM.collectElementTypesForWidening(); 10188 10189 // Plan how to best vectorize, return the best VF and its cost. 10190 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF); 10191 10192 // If we are stress testing VPlan builds, do not attempt to generate vector 10193 // code. Masked vector code generation support will follow soon. 10194 // Also, do not attempt to vectorize if no vector code will be produced. 10195 if (VPlanBuildStressTest || EnableVPlanPredication || 10196 VectorizationFactor::Disabled() == VF) 10197 return false; 10198 10199 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); 10200 10201 { 10202 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, 10203 F->getParent()->getDataLayout()); 10204 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL, 10205 &CM, BFI, PSI, Checks); 10206 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" 10207 << L->getHeader()->getParent()->getName() << "\"\n"); 10208 LVP.executePlan(VF.Width, 1, BestPlan, LB, DT); 10209 } 10210 10211 // Mark the loop as already vectorized to avoid vectorizing again. 10212 Hints.setAlreadyVectorized(); 10213 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 10214 return true; 10215 } 10216 10217 // Emit a remark if there are stores to floats that required a floating point 10218 // extension. If the vectorized loop was generated with floating point there 10219 // will be a performance penalty from the conversion overhead and the change in 10220 // the vector width. 10221 static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) { 10222 SmallVector<Instruction *, 4> Worklist; 10223 for (BasicBlock *BB : L->getBlocks()) { 10224 for (Instruction &Inst : *BB) { 10225 if (auto *S = dyn_cast<StoreInst>(&Inst)) { 10226 if (S->getValueOperand()->getType()->isFloatTy()) 10227 Worklist.push_back(S); 10228 } 10229 } 10230 } 10231 10232 // Traverse the floating point stores upwards searching, for floating point 10233 // conversions. 10234 SmallPtrSet<const Instruction *, 4> Visited; 10235 SmallPtrSet<const Instruction *, 4> EmittedRemark; 10236 while (!Worklist.empty()) { 10237 auto *I = Worklist.pop_back_val(); 10238 if (!L->contains(I)) 10239 continue; 10240 if (!Visited.insert(I).second) 10241 continue; 10242 10243 // Emit a remark if the floating point store required a floating 10244 // point conversion. 10245 // TODO: More work could be done to identify the root cause such as a 10246 // constant or a function return type and point the user to it. 10247 if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second) 10248 ORE->emit([&]() { 10249 return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision", 10250 I->getDebugLoc(), L->getHeader()) 10251 << "floating point conversion changes vector width. 
" 10252 << "Mixed floating point precision requires an up/down " 10253 << "cast that will negatively impact performance."; 10254 }); 10255 10256 for (Use &Op : I->operands()) 10257 if (auto *OpI = dyn_cast<Instruction>(Op)) 10258 Worklist.push_back(OpI); 10259 } 10260 } 10261 10262 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts) 10263 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced || 10264 !EnableLoopInterleaving), 10265 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced || 10266 !EnableLoopVectorization) {} 10267 10268 bool LoopVectorizePass::processLoop(Loop *L) { 10269 assert((EnableVPlanNativePath || L->isInnermost()) && 10270 "VPlan-native path is not enabled. Only process inner loops."); 10271 10272 #ifndef NDEBUG 10273 const std::string DebugLocStr = getDebugLocString(L); 10274 #endif /* NDEBUG */ 10275 10276 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \"" 10277 << L->getHeader()->getParent()->getName() << "\" from " 10278 << DebugLocStr << "\n"); 10279 10280 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI); 10281 10282 LLVM_DEBUG( 10283 dbgs() << "LV: Loop hints:" 10284 << " force=" 10285 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled 10286 ? "disabled" 10287 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled 10288 ? "enabled" 10289 : "?")) 10290 << " width=" << Hints.getWidth() 10291 << " interleave=" << Hints.getInterleave() << "\n"); 10292 10293 // Function containing loop 10294 Function *F = L->getHeader()->getParent(); 10295 10296 // Looking at the diagnostic output is the only way to determine if a loop 10297 // was vectorized (other than looking at the IR or machine code), so it 10298 // is important to generate an optimization remark for each loop. Most of 10299 // these messages are generated as OptimizationRemarkAnalysis. Remarks 10300 // generated as OptimizationRemark and OptimizationRemarkMissed are 10301 // less verbose reporting vectorized loops and unvectorized loops that may 10302 // benefit from vectorization, respectively. 10303 10304 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) { 10305 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); 10306 return false; 10307 } 10308 10309 PredicatedScalarEvolution PSE(*SE, *L); 10310 10311 // Check if it is legal to vectorize the loop. 10312 LoopVectorizationRequirements Requirements; 10313 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE, 10314 &Requirements, &Hints, DB, AC, BFI, PSI); 10315 if (!LVL.canVectorize(EnableVPlanNativePath)) { 10316 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); 10317 Hints.emitRemarkWithHints(); 10318 return false; 10319 } 10320 10321 // Check the function attributes and profiles to find out if this function 10322 // should be optimized for size. 10323 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 10324 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL); 10325 10326 // Entrance to the VPlan-native vectorization path. Outer loops are processed 10327 // here. They may require CFG and instruction level transformations before 10328 // even evaluating whether vectorization is profitable. Since we cannot modify 10329 // the incoming IR, we need to build VPlan upfront in the vectorization 10330 // pipeline. 
10331 if (!L->isInnermost()) 10332 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, 10333 ORE, BFI, PSI, Hints, Requirements); 10334 10335 assert(L->isInnermost() && "Inner loop expected."); 10336 10337 // Check the loop for a trip count threshold: vectorize loops with a tiny trip 10338 // count by optimizing for size, to minimize overheads. 10339 auto ExpectedTC = getSmallBestKnownTC(*SE, L); 10340 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) { 10341 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " 10342 << "This loop is worth vectorizing only if no scalar " 10343 << "iteration overheads are incurred."); 10344 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) 10345 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); 10346 else { 10347 LLVM_DEBUG(dbgs() << "\n"); 10348 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; 10349 } 10350 } 10351 10352 // Check the function attributes to see if implicit floats are allowed. 10353 // FIXME: This check doesn't seem possibly correct -- what if the loop is 10354 // an integer loop and the vector instructions selected are purely integer 10355 // vector instructions? 10356 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { 10357 reportVectorizationFailure( 10358 "Can't vectorize when the NoImplicitFloat attribute is used", 10359 "loop not vectorized due to NoImplicitFloat attribute", 10360 "NoImplicitFloat", ORE, L); 10361 Hints.emitRemarkWithHints(); 10362 return false; 10363 } 10364 10365 // Check if the target supports potentially unsafe FP vectorization. 10366 // FIXME: Add a check for the type of safety issue (denormal, signaling) 10367 // for the target we're vectorizing for, to make sure none of the 10368 // additional fp-math flags can help. 10369 if (Hints.isPotentiallyUnsafe() && 10370 TTI->isFPVectorizationPotentiallyUnsafe()) { 10371 reportVectorizationFailure( 10372 "Potentially unsafe FP op prevents vectorization", 10373 "loop not vectorized due to unsafe FP support.", 10374 "UnsafeFP", ORE, L); 10375 Hints.emitRemarkWithHints(); 10376 return false; 10377 } 10378 10379 bool AllowOrderedReductions; 10380 // If the flag is set, use that instead and override the TTI behaviour. 10381 if (ForceOrderedReductions.getNumOccurrences() > 0) 10382 AllowOrderedReductions = ForceOrderedReductions; 10383 else 10384 AllowOrderedReductions = TTI->enableOrderedReductions(); 10385 if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) { 10386 ORE->emit([&]() { 10387 auto *ExactFPMathInst = Requirements.getExactFPInst(); 10388 return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps", 10389 ExactFPMathInst->getDebugLoc(), 10390 ExactFPMathInst->getParent()) 10391 << "loop not vectorized: cannot prove it is safe to reorder " 10392 "floating-point operations"; 10393 }); 10394 LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to " 10395 "reorder floating-point operations\n"); 10396 Hints.emitRemarkWithHints(); 10397 return false; 10398 } 10399 10400 bool UseInterleaved = TTI->enableInterleavedAccessVectorization(); 10401 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI()); 10402 10403 // If an override option has been passed in for interleaved accesses, use it. 10404 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0) 10405 UseInterleaved = EnableInterleavedMemAccesses; 10406 10407 // Analyze interleaved memory accesses. 
10408 if (UseInterleaved) {
10409 IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
10410 }
10411
10412 // Use the cost model.
10413 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
10414 F, &Hints, IAI);
10415 CM.collectValuesToIgnore();
10416 CM.collectElementTypesForWidening();
10417
10418 // Use the planner for vectorization.
10419 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints,
10420 Requirements, ORE);
10421
10422 // Get user vectorization factor and interleave count.
10423 ElementCount UserVF = Hints.getWidth();
10424 unsigned UserIC = Hints.getInterleave();
10425
10426 // Plan how to best vectorize, return the best VF and its cost.
10427 Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);
10428
10429 VectorizationFactor VF = VectorizationFactor::Disabled();
10430 unsigned IC = 1;
10431
10432 if (MaybeVF) {
10433 VF = *MaybeVF;
10434 // Select the interleave count.
10435 IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue());
10436 }
10437
10438 // Identify the diagnostic messages that should be produced.
10439 std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
10440 bool VectorizeLoop = true, InterleaveLoop = true;
10441 if (VF.Width.isScalar()) {
10442 LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
10443 VecDiagMsg = std::make_pair(
10444 "VectorizationNotBeneficial",
10445 "the cost-model indicates that vectorization is not beneficial");
10446 VectorizeLoop = false;
10447 }
10448
10449 if (!MaybeVF && UserIC > 1) {
10450 // Tell the user interleaving was avoided up-front, despite being explicitly
10451 // requested.
10452 LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
10453 "interleaving should be avoided up front\n");
10454 IntDiagMsg = std::make_pair(
10455 "InterleavingAvoided",
10456 "Ignoring UserIC, because interleaving was avoided up front");
10457 InterleaveLoop = false;
10458 } else if (IC == 1 && UserIC <= 1) {
10459 // Tell the user interleaving is not beneficial.
10460 LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
10461 IntDiagMsg = std::make_pair(
10462 "InterleavingNotBeneficial",
10463 "the cost-model indicates that interleaving is not beneficial");
10464 InterleaveLoop = false;
10465 if (UserIC == 1) {
10466 IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
10467 IntDiagMsg.second +=
10468 " and is explicitly disabled or interleave count is set to 1";
10469 }
10470 } else if (IC > 1 && UserIC == 1) {
10471 // Tell the user interleaving is beneficial, but it is explicitly disabled.
10472 LLVM_DEBUG(
10473 dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
10474 IntDiagMsg = std::make_pair(
10475 "InterleavingBeneficialButDisabled",
10476 "the cost-model indicates that interleaving is beneficial "
10477 "but is explicitly disabled or interleave count is set to 1");
10478 InterleaveLoop = false;
10479 }
10480
10481 // Override IC if user provided an interleave count.
10482 IC = UserIC > 0 ? UserIC : IC;
10483
10484 // Emit diagnostic messages, if any.
10485 const char *VAPassName = Hints.vectorizeAnalysisPassName();
10486 if (!VectorizeLoop && !InterleaveLoop) {
10487 // Do not vectorize or interleave the loop.
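// Both missed-optimization remarks are emitted below so the user can see why
// neither vectorization nor interleaving was applied.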
10488 ORE->emit([&]() { 10489 return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first, 10490 L->getStartLoc(), L->getHeader()) 10491 << VecDiagMsg.second; 10492 }); 10493 ORE->emit([&]() { 10494 return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first, 10495 L->getStartLoc(), L->getHeader()) 10496 << IntDiagMsg.second; 10497 }); 10498 return false; 10499 } else if (!VectorizeLoop && InterleaveLoop) { 10500 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 10501 ORE->emit([&]() { 10502 return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first, 10503 L->getStartLoc(), L->getHeader()) 10504 << VecDiagMsg.second; 10505 }); 10506 } else if (VectorizeLoop && !InterleaveLoop) { 10507 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width 10508 << ") in " << DebugLocStr << '\n'); 10509 ORE->emit([&]() { 10510 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first, 10511 L->getStartLoc(), L->getHeader()) 10512 << IntDiagMsg.second; 10513 }); 10514 } else if (VectorizeLoop && InterleaveLoop) { 10515 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width 10516 << ") in " << DebugLocStr << '\n'); 10517 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 10518 } 10519 10520 bool DisableRuntimeUnroll = false; 10521 MDNode *OrigLoopID = L->getLoopID(); 10522 { 10523 // Optimistically generate runtime checks. Drop them if they turn out to not 10524 // be profitable. Limit the scope of Checks, so the cleanup happens 10525 // immediately after vector codegeneration is done. 10526 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, 10527 F->getParent()->getDataLayout()); 10528 if (!VF.Width.isScalar() || IC > 1) 10529 Checks.Create(L, *LVL.getLAI(), PSE.getUnionPredicate()); 10530 10531 using namespace ore; 10532 if (!VectorizeLoop) { 10533 assert(IC > 1 && "interleave count should not be 1 or 0"); 10534 // If we decided that it is not legal to vectorize the loop, then 10535 // interleave it. 10536 InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, 10537 &CM, BFI, PSI, Checks); 10538 10539 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); 10540 LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT); 10541 10542 ORE->emit([&]() { 10543 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(), 10544 L->getHeader()) 10545 << "interleaved loop (interleaved count: " 10546 << NV("InterleaveCount", IC) << ")"; 10547 }); 10548 } else { 10549 // If we decided that it is *legal* to vectorize the loop, then do it. 10550 10551 // Consider vectorizing the epilogue too if it's profitable. 10552 VectorizationFactor EpilogueVF = 10553 CM.selectEpilogueVectorizationFactor(VF.Width, LVP); 10554 if (EpilogueVF.Width.isVector()) { 10555 10556 // The first pass vectorizes the main loop and creates a scalar epilogue 10557 // to be vectorized by executing the plan (potentially with a different 10558 // factor) again shortly afterwards. 10559 EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1); 10560 EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, 10561 EPI, &LVL, &CM, BFI, PSI, Checks); 10562 10563 VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF); 10564 LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV, 10565 DT); 10566 ++LoopsVectorized; 10567 10568 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */); 10569 formLCSSARecursively(*L, *DT, LI, SE); 10570 10571 // Second pass vectorizes the epilogue and adjusts the control flow 10572 // edges from the first pass. 
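// Repurpose EPI's main-loop fields to describe the epilogue loop before
// constructing the epilogue vectorizer below.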
10573 EPI.MainLoopVF = EPI.EpilogueVF; 10574 EPI.MainLoopUF = EPI.EpilogueUF; 10575 EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC, 10576 ORE, EPI, &LVL, &CM, BFI, PSI, 10577 Checks); 10578 10579 VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF); 10580 10581 // Ensure that the start values for any VPReductionPHIRecipes are 10582 // updated before vectorising the epilogue loop. 10583 VPBasicBlock *Header = BestEpiPlan.getEntry()->getEntryBasicBlock(); 10584 for (VPRecipeBase &R : Header->phis()) { 10585 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) { 10586 if (auto *Resume = MainILV.getReductionResumeValue( 10587 ReductionPhi->getRecurrenceDescriptor())) { 10588 VPValue *StartVal = new VPValue(Resume); 10589 BestEpiPlan.addExternalDef(StartVal); 10590 ReductionPhi->setOperand(0, StartVal); 10591 } 10592 } 10593 } 10594 10595 LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV, 10596 DT); 10597 ++LoopsEpilogueVectorized; 10598 10599 if (!MainILV.areSafetyChecksAdded()) 10600 DisableRuntimeUnroll = true; 10601 } else { 10602 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC, 10603 &LVL, &CM, BFI, PSI, Checks); 10604 10605 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); 10606 LVP.executePlan(VF.Width, IC, BestPlan, LB, DT); 10607 ++LoopsVectorized; 10608 10609 // Add metadata to disable runtime unrolling a scalar loop when there 10610 // are no runtime checks about strides and memory. A scalar loop that is 10611 // rarely used is not worth unrolling. 10612 if (!LB.areSafetyChecksAdded()) 10613 DisableRuntimeUnroll = true; 10614 } 10615 // Report the vectorization decision. 10616 ORE->emit([&]() { 10617 return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(), 10618 L->getHeader()) 10619 << "vectorized loop (vectorization width: " 10620 << NV("VectorizationFactor", VF.Width) 10621 << ", interleaved count: " << NV("InterleaveCount", IC) << ")"; 10622 }); 10623 } 10624 10625 if (ORE->allowExtraAnalysis(LV_NAME)) 10626 checkMixedPrecision(L, ORE); 10627 } 10628 10629 Optional<MDNode *> RemainderLoopID = 10630 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 10631 LLVMLoopVectorizeFollowupEpilogue}); 10632 if (RemainderLoopID.hasValue()) { 10633 L->setLoopID(RemainderLoopID.getValue()); 10634 } else { 10635 if (DisableRuntimeUnroll) 10636 AddRuntimeUnrollDisableMetaData(L); 10637 10638 // Mark the loop as already vectorized to avoid vectorizing again. 10639 Hints.setAlreadyVectorized(); 10640 } 10641 10642 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 10643 return true; 10644 } 10645 10646 LoopVectorizeResult LoopVectorizePass::runImpl( 10647 Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_, 10648 DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_, 10649 DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_, 10650 std::function<const LoopAccessInfo &(Loop &)> &GetLAA_, 10651 OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) { 10652 SE = &SE_; 10653 LI = &LI_; 10654 TTI = &TTI_; 10655 DT = &DT_; 10656 BFI = &BFI_; 10657 TLI = TLI_; 10658 AA = &AA_; 10659 AC = &AC_; 10660 GetLAA = &GetLAA_; 10661 DB = &DB_; 10662 ORE = &ORE_; 10663 PSI = PSI_; 10664 10665 // Don't attempt if 10666 // 1. the target claims to have no vector registers, and 10667 // 2. interleaving won't help ILP. 
10668 //
10669 // The second condition is necessary because, even if the target has no
10670 // vector registers, loop vectorization may still enable scalar
10671 // interleaving.
10672 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
10673 TTI->getMaxInterleaveFactor(1) < 2)
10674 return LoopVectorizeResult(false, false);
10675
10676 bool Changed = false, CFGChanged = false;
10677
10678 // The vectorizer requires loops to be in simplified form.
10679 // Since simplification may add new inner loops, it has to run before the
10680 // legality and profitability checks. This means running the loop vectorizer
10681 // will simplify all loops, regardless of whether anything ends up being
10682 // vectorized.
10683 for (auto &L : *LI)
10684 Changed |= CFGChanged |=
10685 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
10686
10687 // Build up a worklist of inner-loops to vectorize. This is necessary as
10688 // the act of vectorizing or partially unrolling a loop creates new loops
10689 // and can invalidate iterators across the loops.
10690 SmallVector<Loop *, 8> Worklist;
10691
10692 for (Loop *L : *LI)
10693 collectSupportedLoops(*L, LI, ORE, Worklist);
10694
10695 LoopsAnalyzed += Worklist.size();
10696
10697 // Now walk the identified inner loops.
10698 while (!Worklist.empty()) {
10699 Loop *L = Worklist.pop_back_val();
10700
10701 // For the inner loops we actually process, form LCSSA to simplify the
10702 // transform.
10703 Changed |= formLCSSARecursively(*L, *DT, LI, SE);
10704
10705 Changed |= CFGChanged |= processLoop(L);
10706 }
10707
10708 // Process each loop nest in the function.
10709 return LoopVectorizeResult(Changed, CFGChanged);
10710 }
10711
10712 PreservedAnalyses LoopVectorizePass::run(Function &F,
10713 FunctionAnalysisManager &AM) {
10714 auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
10715 auto &LI = AM.getResult<LoopAnalysis>(F);
10716 auto &TTI = AM.getResult<TargetIRAnalysis>(F);
10717 auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
10718 auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
10719 auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
10720 auto &AA = AM.getResult<AAManager>(F);
10721 auto &AC = AM.getResult<AssumptionAnalysis>(F);
10722 auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
10723 auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
10724
10725 auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
10726 std::function<const LoopAccessInfo &(Loop &)> GetLAA =
10727 [&](Loop &L) -> const LoopAccessInfo & {
10728 LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE,
10729 TLI, TTI, nullptr, nullptr, nullptr};
10730 return LAM.getResult<LoopAccessAnalysis>(L, AR);
10731 };
10732 auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
10733 ProfileSummaryInfo *PSI =
10734 MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
10735 LoopVectorizeResult Result =
10736 runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
10737 if (!Result.MadeAnyChange)
10738 return PreservedAnalyses::all();
10739 PreservedAnalyses PA;
10740
10741 // We currently do not preserve loopinfo/dominator analyses with outer loop
10742 // vectorization. Until this is addressed, mark these analyses as preserved
10743 // only for the non-VPlan-native path.
10744 // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
10745 if (!EnableVPlanNativePath) {
10746 PA.preserve<LoopAnalysis>();
10747 PA.preserve<DominatorTreeAnalysis>();
10748 }
10749
10750 if (Result.MadeCFGChange) {
10751 // Making CFG changes likely means a loop got vectorized. Indicate that
10752 // extra simplification passes should be run.
10753 // TODO: MadeCFGChange is not a perfect proxy. Extra passes should only
10754 // be run if runtime checks have been added.
10755 AM.getResult<ShouldRunExtraVectorPasses>(F);
10756 PA.preserve<ShouldRunExtraVectorPasses>();
10757 } else {
10758 PA.preserveSet<CFGAnalyses>();
10759 }
10760 return PA;
10761 }
10762
10763 void LoopVectorizePass::printPipeline(
10764 raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
10765 static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
10766 OS, MapClassName2PassName);
10767
10768 OS << "<";
10769 OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
10770 OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
10771 OS << ">";
10772 }
10773