//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
//  Data for SIMD
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
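//
// To make the widening above concrete, here is a conceptual sketch in C-like
// pseudocode (illustrative only; the pass itself operates on LLVM IR). For a
// vectorization factor of 4, a loop such as
//
//   for (i = 0; i < n; i++)
//     c[i] = a[i] + b[i];
//
// is conceptually turned into a wide loop whose index advances by the vector
// width, plus a scalar remainder (epilogue) loop for the last n % 4 elements:
//
//   for (i = 0; i + 4 <= n; i += 4)
//     c[i:i+3] = a[i:i+3] + b[i:i+3];   // one 'wide' SIMD iteration
//   for (; i < n; i++)                  // scalar epilogue
//     c[i] = a[i] + b[i];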
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "LoopVectorizationPlanner.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanHCFGBuilder.h"
#include "VPlanPredicator.h"
#include "VPlanTransforms.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/InstructionCost.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
"llvm/Transforms/Vectorize/LoopVectorizationLegality.h" 144 #include <algorithm> 145 #include <cassert> 146 #include <cstdint> 147 #include <cstdlib> 148 #include <functional> 149 #include <iterator> 150 #include <limits> 151 #include <memory> 152 #include <string> 153 #include <tuple> 154 #include <utility> 155 156 using namespace llvm; 157 158 #define LV_NAME "loop-vectorize" 159 #define DEBUG_TYPE LV_NAME 160 161 #ifndef NDEBUG 162 const char VerboseDebug[] = DEBUG_TYPE "-verbose"; 163 #endif 164 165 /// @{ 166 /// Metadata attribute names 167 const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all"; 168 const char LLVMLoopVectorizeFollowupVectorized[] = 169 "llvm.loop.vectorize.followup_vectorized"; 170 const char LLVMLoopVectorizeFollowupEpilogue[] = 171 "llvm.loop.vectorize.followup_epilogue"; 172 /// @} 173 174 STATISTIC(LoopsVectorized, "Number of loops vectorized"); 175 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization"); 176 STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized"); 177 178 static cl::opt<bool> EnableEpilogueVectorization( 179 "enable-epilogue-vectorization", cl::init(true), cl::Hidden, 180 cl::desc("Enable vectorization of epilogue loops.")); 181 182 static cl::opt<unsigned> EpilogueVectorizationForceVF( 183 "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden, 184 cl::desc("When epilogue vectorization is enabled, and a value greater than " 185 "1 is specified, forces the given VF for all applicable epilogue " 186 "loops.")); 187 188 static cl::opt<unsigned> EpilogueVectorizationMinVF( 189 "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden, 190 cl::desc("Only loops with vectorization factor equal to or larger than " 191 "the specified value are considered for epilogue vectorization.")); 192 193 /// Loops with a known constant trip count below this number are vectorized only 194 /// if no scalar iteration overheads are incurred. 195 static cl::opt<unsigned> TinyTripCountVectorThreshold( 196 "vectorizer-min-trip-count", cl::init(16), cl::Hidden, 197 cl::desc("Loops with a constant trip count that is smaller than this " 198 "value are vectorized only if no scalar iteration overheads " 199 "are incurred.")); 200 201 static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold( 202 "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden, 203 cl::desc("The maximum allowed number of runtime memory checks with a " 204 "vectorize(enable) pragma.")); 205 206 // Option prefer-predicate-over-epilogue indicates that an epilogue is undesired, 207 // that predication is preferred, and this lists all options. I.e., the 208 // vectorizer will try to fold the tail-loop (epilogue) into the vector body 209 // and predicate the instructions accordingly. 
namespace PreferPredicateTy {
enum Option {
  ScalarEpilogue = 0,
  PredicateElseScalarEpilogue,
  PredicateOrDontVectorize
};
} // namespace PreferPredicateTy

static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
    "prefer-predicate-over-epilogue",
    cl::init(PreferPredicateTy::ScalarEpilogue),
    cl::Hidden,
    cl::desc("Tail-folding and predication preferences over creating a scalar "
             "epilogue loop."),
    cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
                          "scalar-epilogue",
                          "Don't tail-predicate loops, create scalar epilogue"),
               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
                          "predicate-else-scalar-epilogue",
                          "prefer tail-folding, create scalar epilogue if tail "
                          "folding fails."),
               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
                          "predicate-dont-vectorize",
                          "prefer tail-folding, don't attempt vectorization if "
                          "tail-folding fails.")));

static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in the loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));

static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
             "below this number"));

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
Mostly " 279 "useful for getting consistent testing.")); 280 281 static cl::opt<bool> ForceTargetSupportsScalableVectors( 282 "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden, 283 cl::desc( 284 "Pretend that scalable vectors are supported, even if the target does " 285 "not support them. This flag should only be used for testing.")); 286 287 static cl::opt<unsigned> SmallLoopCost( 288 "small-loop-cost", cl::init(20), cl::Hidden, 289 cl::desc( 290 "The cost of a loop that is considered 'small' by the interleaver.")); 291 292 static cl::opt<bool> LoopVectorizeWithBlockFrequency( 293 "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden, 294 cl::desc("Enable the use of the block frequency analysis to access PGO " 295 "heuristics minimizing code growth in cold regions and being more " 296 "aggressive in hot regions.")); 297 298 // Runtime interleave loops for load/store throughput. 299 static cl::opt<bool> EnableLoadStoreRuntimeInterleave( 300 "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden, 301 cl::desc( 302 "Enable runtime interleaving until load/store ports are saturated")); 303 304 /// Interleave small loops with scalar reductions. 305 static cl::opt<bool> InterleaveSmallLoopScalarReduction( 306 "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden, 307 cl::desc("Enable interleaving for loops with small iteration counts that " 308 "contain scalar reductions to expose ILP.")); 309 310 /// The number of stores in a loop that are allowed to need predication. 311 static cl::opt<unsigned> NumberOfStoresToPredicate( 312 "vectorize-num-stores-pred", cl::init(1), cl::Hidden, 313 cl::desc("Max number of stores to be predicated behind an if.")); 314 315 static cl::opt<bool> EnableIndVarRegisterHeur( 316 "enable-ind-var-reg-heur", cl::init(true), cl::Hidden, 317 cl::desc("Count the induction variable only once when interleaving")); 318 319 static cl::opt<bool> EnableCondStoresVectorization( 320 "enable-cond-stores-vec", cl::init(true), cl::Hidden, 321 cl::desc("Enable if predication of stores during vectorization.")); 322 323 static cl::opt<unsigned> MaxNestedScalarReductionIC( 324 "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden, 325 cl::desc("The maximum interleave count to use when interleaving a scalar " 326 "reduction in a nested loop.")); 327 328 static cl::opt<bool> 329 PreferInLoopReductions("prefer-inloop-reductions", cl::init(false), 330 cl::Hidden, 331 cl::desc("Prefer in-loop vector reductions, " 332 "overriding the targets preference.")); 333 334 static cl::opt<bool> ForceOrderedReductions( 335 "force-ordered-reductions", cl::init(false), cl::Hidden, 336 cl::desc("Enable the vectorisation of loops with in-order (strict) " 337 "FP reductions")); 338 339 static cl::opt<bool> PreferPredicatedReductionSelect( 340 "prefer-predicated-reduction-select", cl::init(false), cl::Hidden, 341 cl::desc( 342 "Prefer predicating a reduction operation over an after loop select.")); 343 344 cl::opt<bool> EnableVPlanNativePath( 345 "enable-vplan-native-path", cl::init(false), cl::Hidden, 346 cl::desc("Enable VPlan-native vectorization path with " 347 "support for outer loop vectorization.")); 348 349 // FIXME: Remove this switch once we have divergence analysis. Currently we 350 // assume divergent non-backedge branches when this switch is true. 
cl::opt<bool> EnableVPlanPredication(
    "enable-vplan-predication", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path predicator with "
             "support for outer loop vectorization."));

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

cl::opt<bool> PrintVPlansInDotFormat(
    "vplan-print-in-dot-format", cl::init(false), cl::Hidden,
    cl::desc("Use dot format instead of plain text when dumping VPlans"));

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type.
static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
  // Determine if an array of N elements of type Ty is "bitcast compatible"
  // with a <N x Ty> vector.
  // This is only true if there is no padding between the array elements.
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}
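
// For illustration (an assumption about a typical data layout, not a
// guarantee): i32 has a type size and an alloc size of 32 bits, so [N x i32]
// is layout-compatible with <N x i32>; x86_fp80 has a type size of 80 bits
// but an alloc size of 96 or 128 bits, so arrays of it contain padding and
// the type is considered irregular here.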

/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

/// Returns "best known" trip count for the specified loop \p L as defined by
/// the following procedure:
///   1) Returns exact trip count if it is known.
///   2) Returns expected trip count according to profile data if any.
///   3) Returns upper bound estimate if it is known.
///   4) Returns None if all of the above failed.
static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
  // Check if exact trip count is known.
  if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
    return ExpectedTC;

  // Check if there is an expected trip count available from profile data.
  if (LoopVectorizeWithBlockFrequency)
    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
      return EstimatedTC;

  // Check if upper bound estimate is known.
  if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
    return ExpectedTC;

  return None;
}

// Forward declare GeneratedRTChecks.
class GeneratedRTChecks;

namespace llvm {

AnalysisKey ShouldRunExtraVectorPasses::Key;

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found for a given vectorization factor.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                      ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
        PSI(PSI), RTChecks(RTChecks) {
    // Query this against the original loop and save it here because the
    // profile of the original loop header may change as the transformation
    // happens.
    OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
        OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
  }

  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop that will contain vectorized instructions later
  /// on, while the old loop will be used as the scalar remainder. Control flow
  /// is generated around the vectorized (and scalar epilogue) loops consisting
  /// of various checks and bypasses. Return the pre-header block of the new
  /// loop and the start value for the canonical induction, if it is != 0. The
  /// latter is the case when vectorizing the epilogue loop. In the case of
  /// epilogue vectorization, this function is overridden to handle the more
  /// complex control flow around the loops.
  virtual std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton();

  /// Widen a single call instruction within the innermost loop.
  void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
                            VPTransformState &State);

  /// Fix the vectorized code, taking care of header phis, live-outs, and more.
  void fixVectorizedLoop(VPTransformState &State);

  // Return true if any runtime check is added.
  bool areSafetyChecksAdded() { return AddedSafetyChecks; }

  /// A type for vectorized values in the new loop. Each value from the
  /// original loop, when vectorized, is represented by UF vector values in the
  /// new unrolled loop, where UF is the unroll factor.
  using VectorParts = SmallVector<Value *, 2>;
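
  // For illustration (a conceptual example, not generated verbatim): with
  // VF = 4 and UF = 2, a scalar value %a from the original loop corresponds
  // to two <4 x i32> values per unrolled vector iteration, e.g.
  //   %a.part0 = <4 x i32> ...   ; lanes 0..3
  //   %a.part1 = <4 x i32> ...   ; lanes 4..7
  // and VectorParts holds one Value* per such part.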

  /// Vectorize a single first-order recurrence or pointer induction PHINode in
  /// a block. This method handles the induction variable canonicalization. It
  /// supports both VF = 1 for unrolled loops and arbitrary length vectors.
  void widenPHIInstruction(Instruction *PN, VPWidenPHIRecipe *PhiR,
                           VPTransformState &State);

  /// A helper function to scalarize a single Instruction in the innermost
  /// loop. Generates a sequence of scalar instances for each lane between \p
  /// MinLane and \p MaxLane, times each part between \p MinPart and \p
  /// MaxPart, inclusive. Uses the VPValue operands from \p RepRecipe instead
  /// of \p Instr's operands.
  void scalarizeInstruction(Instruction *Instr, VPReplicateRecipe *RepRecipe,
                            const VPIteration &Instance, bool IfPredicateInstr,
                            VPTransformState &State);

  /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
  /// is provided, the integer induction variable will first be truncated to
  /// the corresponding type. \p CanonicalIV is the scalar value generated for
  /// the canonical induction variable.
  void widenIntOrFpInduction(PHINode *IV, VPWidenIntOrFpInductionRecipe *Def,
                             VPTransformState &State, Value *CanonicalIV);

  /// Construct the vector value of a scalarized value \p V one lane at a time.
  void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance,
                                 VPTransformState &State);

  /// Try to vectorize interleaved access group \p Group with the base address
  /// given in \p Addr, optionally masking the vector operations if \p
  /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
  /// values in the vectorized loop.
  void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
                                ArrayRef<VPValue *> VPDefs,
                                VPTransformState &State, VPValue *Addr,
                                ArrayRef<VPValue *> StoredValues,
                                VPValue *BlockInMask = nullptr);

  /// Set the debug location in the builder \p CustomBuilder using the debug
  /// location in \p V. If \p CustomBuilder is None then it uses the class
  /// member's Builder.
  void setDebugLocFromInst(const Value *V,
                           Optional<IRBuilderBase *> CustomBuilder = None);

  /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
  void fixNonInductionPHIs(VPTransformState &State);

  /// Returns true if the reordering of FP operations is not allowed, but we
  /// are able to vectorize with strict in-order reductions for the given
  /// RdxDesc.
  bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc);

  /// Create a broadcast instruction. This method generates a broadcast
  /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1,
  /// ...; this is needed because each iteration in the loop corresponds to a
  /// SIMD element.
  virtual Value *getBroadcastInstrs(Value *V);

  /// Add metadata from one instruction to another.
  ///
  /// This includes both the original MDs from \p From and additional ones
  /// (\see addNewMetadata). Use this for *newly created* instructions in the
  /// vector loop.
  void addMetadata(Instruction *To, Instruction *From);

  /// Similar to the previous function but it adds the metadata to a
  /// vector of instructions.
  void addMetadata(ArrayRef<Value *> To, Instruction *From);

  // Returns the resume value (bc.merge.rdx) for a reduction as
  // generated by fixReduction.
  PHINode *getReductionResumeValue(const RecurrenceDescriptor &RdxDesc);

protected:
  friend class LoopVectorizationPlanner;

  /// A small list of PHINodes.
  using PhiVector = SmallVector<PHINode *, 4>;

  /// A type for scalarized values in the new loop. Each value from the
  /// original loop, when scalarized, is represented by UF x VF scalar values
  /// in the new unrolled loop, where UF is the unroll factor and VF is the
  /// vectorization factor.
  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;

  /// Set up the values of the IVs correctly when exiting the vector loop.
  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                    Value *CountRoundDown, Value *EndValue,
                    BasicBlock *MiddleBlock);

  /// Introduce a conditional branch (on true, condition to be set later) at
  /// the end of the header=latch connecting it to itself (across the backedge)
  /// and to the exit block of \p L.
  void createHeaderBranch(Loop *L);

  /// Handle all cross-iteration phis in the header.
  void fixCrossIterationPHIs(VPTransformState &State);

  /// Create the exit value of first order recurrences in the middle block and
  /// update their users.
  void fixFirstOrderRecurrence(VPFirstOrderRecurrencePHIRecipe *PhiR,
                               VPTransformState &State);

  /// Create code for the loop exit value of the reduction.
  void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State);

  /// Clear NSW/NUW flags from reduction instructions if necessary.
  void clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc,
                               VPTransformState &State);

  /// Fixup the LCSSA phi nodes in the unique exit block. This simply
  /// means we need to add the appropriate incoming value from the middle
  /// block as exiting edges from the scalar epilogue loop (if present) are
  /// already in place, and we exit the vector loop exclusively to the middle
  /// block.
  void fixLCSSAPHIs(VPTransformState &State);

  /// Iteratively sink the scalarized operands of a predicated instruction into
  /// the block that was created for it.
  void sinkScalarOperands(Instruction *PredInst);

  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
  /// represented as.
  void truncateToMinimalBitwidths(VPTransformState &State);

  /// Create a vector induction phi node based on an existing scalar one. \p
  /// EntryVal is the value from the original loop that maps to the vector phi
  /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
  /// truncate instruction, instead of widening the original IV, we widen a
  /// version of the IV truncated to \p EntryVal's type.
  void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
                                       Value *Step, Value *Start,
                                       Instruction *EntryVal, VPValue *Def,
                                       VPTransformState &State);

  /// Returns (and creates if needed) the original loop trip count.
  Value *getOrCreateTripCount(Loop *NewLoop);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(Loop *NewLoop);
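
  // For illustration (a conceptual example; see also the VectorTripCount
  // member below): with an original trip count of 103, VF = 8 and UF = 2, the
  // widened loop covers 103 - (103 % 16) = 96 iterations' worth of work and
  // the remaining 7 iterations run in the scalar epilogue.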

  /// Returns a bitcasted value to the requested vector type.
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                const DataLayout &DL);

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct. Returns the block containing the checks or
  /// nullptr if no checks have been added.
  BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  /// Returns the block containing the checks or nullptr if no checks have been
  /// added.
  BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);

  /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
  /// vector loop preheader, middle block and scalar preheader. Also
  /// allocate a loop object for the new vector loop and return it.
  Loop *createVectorLoopSkeleton(StringRef Prefix);

  /// Create new phi nodes for the induction variables to resume iteration
  /// count in the scalar epilogue, from where the vectorized loop left off.
  /// In cases where the loop skeleton is more complicated (e.g. epilogue
  /// vectorization) and the resume values can come from an additional bypass
  /// block, the \p AdditionalBypass pair provides information about the bypass
  /// block and the end value on the edge from bypass to this loop.
  void createInductionResumeValues(
      Loop *L,
      std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});

  /// Complete the loop skeleton by adding debug MDs, creating appropriate
  /// conditional branches in the middle block, preparing the builder and
  /// running the verifier. Take in the vector loop \p L as argument, and
  /// return the preheader of the completed vector loop.
  BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);

  /// Add additional metadata to \p To that was not present on \p Orig.
  ///
  /// Currently this is used to add the noalias annotations based on the
  /// inserted memchecks. Use this for instructions that are *cloned* into the
  /// vector loop.
  void addNewMetadata(Instruction *To, const Instruction *Orig);

  /// Collect poison-generating recipes that may generate a poison value that
  /// is used after vectorization, even when their operands are not poison.
  /// Those recipes meet the following conditions:
  /// * Contribute to the address computation of a recipe generating a widen
  ///   memory load/store (VPWidenMemoryInstructionRecipe or
  ///   VPInterleaveRecipe).
  /// * Such a widen memory load/store has at least one underlying Instruction
  ///   that is in a basic block that needs predication and after vectorization
  ///   the generated instruction won't be predicated.
  void collectPoisonGeneratingRecipes(VPTransformState &State);
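
  // For illustration (a conceptual example, not generated verbatim): if the
  // original loop conditionally executes
  //   %gep = getelementptr inbounds float, float* %base, i64 %idx
  //   %v   = load float, float* %gep
  // and the load becomes a wide, non-predicated (or masked) access, the
  // 'inbounds' flag on %gep may no longer be justified for lanes whose guard
  // was false, so such recipes are collected here and their poison-generating
  // flags are dropped later.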

  /// Allow subclasses to override and print debug traces before/after vplan
  /// execution, when trace information is requested.
  virtual void printDebugTracesAtStart(){};
  virtual void printDebugTracesAtEnd(){};

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Alias Analysis.
  AAResults *AA;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// LoopVersioning. It's only set up (non-null) if memchecks were
  /// used.
  ///
  /// This is currently only used to add no-alias metadata based on the
  /// memchecks. The actual versioning is performed manually.
  std::unique_ptr<LoopVersioning> LVer;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  ElementCount VF;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use.
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;

  /// The unique ExitBlock of the scalar loop if one exists. Note that
  /// there can be multiple exiting edges reaching this block.
  BasicBlock *LoopExitBlock;

  /// The vector loop body.
  BasicBlock *LoopVectorBody;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
  // fixed up at the end of vector code generation.
  SmallVector<PHINode *, 8> OrigPHIsToFix;

  /// BFI and PSI are used to check for profile guided size optimizations.
  BlockFrequencyInfo *BFI;
  ProfileSummaryInfo *PSI;

  // Whether this loop should be optimized for size based on profile guided
  // size optimizations.
  bool OptForSizeBasedOnProfile;

  /// Structure to hold information about generated runtime checks, responsible
  /// for cleaning the checks, if vectorization turns out unprofitable.
  GeneratedRTChecks &RTChecks;

  // Holds the resume values for reductions in the loops, used to set the
  // correct start value of reduction PHIs when vectorizing the epilogue.
  SmallMapVector<const RecurrenceDescriptor *, PHINode *, 4>
      ReductionResumeValues;
};

class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                    ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            ElementCount::getFixed(1), UnrollFactor, LVL, CM,
                            BFI, PSI, Check) {}

private:
  Value *getBroadcastInstrs(Value *V) override;
};

/// Encapsulate information regarding vectorization of a loop and its epilogue.
/// This information is meant to be updated and used across two stages of
/// epilogue vectorization.
struct EpilogueLoopVectorizationInfo {
  ElementCount MainLoopVF = ElementCount::getFixed(0);
  unsigned MainLoopUF = 0;
  ElementCount EpilogueVF = ElementCount::getFixed(0);
  unsigned EpilogueUF = 0;
  BasicBlock *MainLoopIterationCountCheck = nullptr;
  BasicBlock *EpilogueIterationCountCheck = nullptr;
  BasicBlock *SCEVSafetyCheck = nullptr;
  BasicBlock *MemSafetyCheck = nullptr;
  Value *TripCount = nullptr;
  Value *VectorTripCount = nullptr;

  EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF,
                                ElementCount EVF, unsigned EUF)
      : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) {
    assert(EUF == 1 &&
           "A high UF for the epilogue loop is likely not beneficial.");
  }
};
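
// For illustration (a rough, simplified sketch of the intended structure):
//   - an iteration count check selects between the main vector loop and the
//     vectorized epilogue loop,
//   - the main vector loop runs MainLoopVF x MainLoopUF iterations at a time,
//   - leftover iterations are handled by the epilogue vector loop
//     (EpilogueVF x EpilogueUF at a time), and
//   - any final remainder runs in the scalar epilogue loop.
// The two vectorized loops are created in two passes over the same vplan, as
// described for InnerLoopAndEpilogueVectorizer below.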

/// An extension of the inner loop vectorizer that creates a skeleton for a
/// vectorized loop that has its epilogue (residual) also vectorized.
/// The idea is to run the vplan on a given loop twice, firstly to set up the
/// skeleton and vectorize the main loop, and secondly to complete the skeleton
/// from the first step and vectorize the epilogue. This is achieved by
/// deriving two concrete strategy classes from this base class and invoking
/// them in succession from the loop vectorizer planner.
class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
public:
  InnerLoopAndEpilogueVectorizer(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI,
                            Checks),
        EPI(EPI) {}

  // Override this function to handle the more complex control flow around the
  // three loops.
  std::pair<BasicBlock *, Value *>
  createVectorizedLoopSkeleton() final override {
    return createEpilogueVectorizedLoopSkeleton();
  }

  /// The interface for creating a vectorized skeleton using one of two
  /// different strategies, each corresponding to one execution of the vplan
  /// as described above.
  virtual std::pair<BasicBlock *, Value *>
  createEpilogueVectorizedLoopSkeleton() = 0;

  /// Holds and updates state information required to vectorize the main loop
  /// and its epilogue in two separate passes. This setup helps us avoid
  /// regenerating and recomputing runtime safety checks. It also helps us to
  /// shorten the iteration-count-check path length for the cases where the
  /// iteration count of the loop is so small that the main vector loop is
  /// completely skipped.
  EpilogueLoopVectorizationInfo &EPI;
};

/// A specialized derived class of inner loop vectorizer that performs
/// vectorization of *main* loops in the process of vectorizing loops and their
/// epilogues.
class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerMainLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Check)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Check) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *main loop* strategy (i.e. the first pass of vplan execution).
  std::pair<BasicBlock *, Value *>
  createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check once for the main loop (when \p
  /// ForEpilogue is false) and once for the epilogue loop (when \p
  /// ForEpilogue is true).
  BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass,
                                             bool ForEpilogue);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};

// A specialized derived class of inner loop vectorizer that performs
// vectorization of *epilogue* loops in the process of vectorizing loops and
// their epilogues.
class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerEpilogueLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Checks) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *epilogue loop* strategy (i.e. the second pass of vplan execution).
  std::pair<BasicBlock *, Value *>
  createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check after the main vector loop has
  /// finished to see if there are any iterations left to execute by either
  /// the vector epilogue or the scalar epilogue.
  BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L,
                                                      BasicBlock *Bypass,
                                                      BasicBlock *Insert);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};
} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its
/// operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return I;

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I;

  for (Use &Op : I->operands()) {
    if (Instruction *OpInst = dyn_cast<Instruction>(Op))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }

  return I;
}

void InnerLoopVectorizer::setDebugLocFromInst(
    const Value *V, Optional<IRBuilderBase *> CustomBuilder) {
  IRBuilderBase *B = (CustomBuilder == None) ? &Builder : *CustomBuilder;
  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(V)) {
    const DILocation *DIL = Inst->getDebugLoc();

    // When an FSDiscriminator is enabled, we don't need to add the multiply
    // factors to the discriminators.
    if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
        !isa<DbgInfoIntrinsic>(Inst) && !EnableFSDiscriminator) {
      // FIXME: For scalable vectors, assume vscale=1.
      auto NewDIL =
          DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
      if (NewDIL)
        B->SetCurrentDebugLocation(NewDIL.getValue());
      else
        LLVM_DEBUG(dbgs() << "Failed to create new discriminator: "
                          << DIL->getFilename() << " Line: " << DIL->getLine());
    } else
      B->SetCurrentDebugLocation(DIL);
  } else
    B->SetCurrentDebugLocation(DebugLoc());
}

/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
/// is passed, the message relates to that particular instruction.
#ifndef NDEBUG
static void debugVectorizationMessage(const StringRef Prefix,
                                      const StringRef DebugMsg,
                                      Instruction *I) {
  dbgs() << "LV: " << Prefix << DebugMsg;
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}
#endif

/// Create an analysis remark that explains why vectorization failed
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
/// RemarkName is the identifier for the remark. If \p I is passed it is an
/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
/// the location of the remark. \return the remark object that can be
/// streamed to.
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
                                                   StringRef RemarkName,
                                                   Loop *TheLoop,
                                                   Instruction *I) {
  Value *CodeRegion = TheLoop->getHeader();
  DebugLoc DL = TheLoop->getStartLoc();

  if (I) {
    CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, fall back to
    // using the loop's.
    if (I->getDebugLoc())
      DL = I->getDebugLoc();
  }

  return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
}

namespace llvm {

/// Return a value for Step multiplied by VF.
Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
                       int64_t Step) {
  assert(Ty->isIntegerTy() && "Expected an integer step");
  Constant *StepVal = ConstantInt::get(Ty, Step * VF.getKnownMinValue());
  return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
}
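
// For illustration (a conceptual example, not generated verbatim): with
// Ty = i64 and Step = 2, a fixed VF of 4 yields the constant i64 8, whereas a
// scalable VF of <vscale x 4> yields a runtime value equivalent to
// 8 * vscale (materialized via IRBuilder::CreateVScale).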

/// Return the runtime value for VF.
Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) {
  Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue());
  return VF.isScalable() ? B.CreateVScale(EC) : EC;
}

static Value *getRuntimeVFAsFloat(IRBuilderBase &B, Type *FTy,
                                  ElementCount VF) {
  assert(FTy->isFloatingPointTy() && "Expected floating point type!");
  Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits());
  Value *RuntimeVF = getRuntimeVF(B, IntTy, VF);
  return B.CreateUIToFP(RuntimeVF, FTy);
}

void reportVectorizationFailure(const StringRef DebugMsg,
                                const StringRef OREMsg, const StringRef ORETag,
                                OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                                Instruction *I) {
  LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(
      createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
      << "loop not vectorized: " << OREMsg);
}

void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
                             OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                             Instruction *I) {
  LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(
      createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
      << Msg);
}

} // end namespace llvm

#ifndef NDEBUG
/// \return string containing a file name and a line # for the given loop.
static std::string getDebugLocString(const Loop *L) {
  std::string Result;
  if (L) {
    raw_string_ostream OS(Result);
    if (const DebugLoc LoopDbgLoc = L->getStartLoc())
      LoopDbgLoc.print(OS);
    else
      // Just print the module name.
      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
    OS.flush();
  }
  return Result;
}
#endif

void InnerLoopVectorizer::addNewMetadata(Instruction *To,
                                         const Instruction *Orig) {
  // If the loop was versioned with memchecks, add the corresponding no-alias
  // metadata.
  if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
    LVer->annotateInstWithNoAlias(To, Orig);
}

void InnerLoopVectorizer::collectPoisonGeneratingRecipes(
    VPTransformState &State) {

  // Collect recipes in the backward slice of `Root` that may generate a poison
  // value that is used after vectorization.
  SmallPtrSet<VPRecipeBase *, 16> Visited;
  auto collectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
    SmallVector<VPRecipeBase *, 16> Worklist;
    Worklist.push_back(Root);

    // Traverse the backward slice of Root through its use-def chain.
    while (!Worklist.empty()) {
      VPRecipeBase *CurRec = Worklist.back();
      Worklist.pop_back();

      if (!Visited.insert(CurRec).second)
        continue;

      // Prune search if we find another recipe generating a widen memory
      // instruction. Widen memory instructions involved in address computation
      // will lead to gather/scatter instructions, which don't need to be
      // handled.
      if (isa<VPWidenMemoryInstructionRecipe>(CurRec) ||
          isa<VPInterleaveRecipe>(CurRec) ||
          isa<VPCanonicalIVPHIRecipe>(CurRec))
        continue;

      // This recipe contributes to the address computation of a widen
      // load/store. Collect recipe if its underlying instruction has
      // poison-generating flags.
      Instruction *Instr = CurRec->getUnderlyingInstr();
      if (Instr && Instr->hasPoisonGeneratingFlags())
        State.MayGeneratePoisonRecipes.insert(CurRec);

      // Add new definitions to the worklist.
      for (VPValue *operand : CurRec->operands())
        if (VPDef *OpDef = operand->getDef())
          Worklist.push_back(cast<VPRecipeBase>(OpDef));
    }
  });

  // Traverse all the recipes in the VPlan and collect the poison-generating
  // recipes in the backward slice starting at the address of a
  // VPWidenMemoryInstructionRecipe or VPInterleaveRecipe.
  auto Iter = depth_first(
      VPBlockRecursiveTraversalWrapper<VPBlockBase *>(State.Plan->getEntry()));
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
    for (VPRecipeBase &Recipe : *VPBB) {
      if (auto *WidenRec = dyn_cast<VPWidenMemoryInstructionRecipe>(&Recipe)) {
        Instruction *UnderlyingInstr = WidenRec->getUnderlyingInstr();
        VPDef *AddrDef = WidenRec->getAddr()->getDef();
        if (AddrDef && WidenRec->isConsecutive() && UnderlyingInstr &&
            Legal->blockNeedsPredication(UnderlyingInstr->getParent()))
          collectPoisonGeneratingInstrsInBackwardSlice(
              cast<VPRecipeBase>(AddrDef));
      } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
        VPDef *AddrDef = InterleaveRec->getAddr()->getDef();
        if (AddrDef) {
          // Check if any member of the interleave group needs predication.
          const InterleaveGroup<Instruction> *InterGroup =
              InterleaveRec->getInterleaveGroup();
          bool NeedPredication = false;
          for (int I = 0, NumMembers = InterGroup->getNumMembers();
               I < NumMembers; ++I) {
            Instruction *Member = InterGroup->getMember(I);
            if (Member)
              NeedPredication |=
                  Legal->blockNeedsPredication(Member->getParent());
          }

          if (NeedPredication)
            collectPoisonGeneratingInstrsInBackwardSlice(
                cast<VPRecipeBase>(AddrDef));
        }
      }
    }
  }
}

void InnerLoopVectorizer::addMetadata(Instruction *To,
                                      Instruction *From) {
  propagateMetadata(To, From);
  addNewMetadata(To, From);
}

void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
                                      Instruction *From) {
  for (Value *V : To) {
    if (Instruction *I = dyn_cast<Instruction>(V))
      addMetadata(I, From);
  }
}

PHINode *InnerLoopVectorizer::getReductionResumeValue(
    const RecurrenceDescriptor &RdxDesc) {
  auto It = ReductionResumeValues.find(&RdxDesc);
  assert(It != ReductionResumeValues.end() &&
         "Expected to find a resume value for the reduction.");
  return It->second;
}

namespace llvm {

// Loop vectorization cost-model hints for how the scalar epilogue loop should
// be lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorization with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate,

  // Directive indicating we must either tail fold or not vectorize
  CM_ScalarEpilogueNotAllowedUsePredicate
};

/// ElementCountComparator creates a total ordering for ElementCount
/// for the purposes of using it in a set structure.
struct ElementCountComparator {
  bool operator()(const ElementCount &LHS, const ElementCount &RHS) const {
    return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) <
           std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue());
  }
};
using ElementCountSet = SmallSet<ElementCount, 16, ElementCountComparator>;

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because of
/// a number of reasons. In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
class LoopVectorizationCostModel {
public:
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {}

  /// \return An upper bound for the vectorization factors (both fixed and
  /// scalable). If the factors are 0, vectorization and interleaving should be
  /// avoided up front.
  FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
  bool runtimeChecksRequired();

  /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every VF in \p CandidateVFs. If UserVF is not ZERO
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
  VectorizationFactor
  selectVectorizationFactor(const ElementCountSet &CandidateVFs);

  VectorizationFactor
  selectEpilogueVectorizationFactor(const ElementCount MaxVF,
                                    const LoopVectorizationPlanner &LVP);

  /// Setup cost-based decisions for user vectorization factor.
  /// \return true if the UserVF is a feasible VF to be chosen.
  bool selectUserVectorizationFactor(ElementCount UserVF) {
    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);
    return expectedCost(UserVF).first.isValid();
  }

  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64-bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// \return The desired interleave count.
  /// If interleave count has been specified by metadata it will be returned.
  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
  /// are the selected vectorization factor and the cost of the selected VF.
  unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);

  /// A memory access instruction may be vectorized in more than one way; the
  /// form of the instruction after vectorization depends on cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decision map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with the widening decision in order to
  /// avoid redundant calculations.
  void setCostBasedWideningDecision(ElementCount VF);

  /// A struct that represents some properties of the register usage
  /// of a loop.
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
    /// Holds the maximum number of concurrent live intervals in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
  };

  /// \return Returns information about the register usage of the loop for the
  /// given vectorization factors.
  SmallVector<RegisterUsage, 8>
  calculateRegisterUsage(ArrayRef<ElementCount> VFs);

  /// Collect values we want to ignore in the cost model.
  void collectValuesToIgnore();

  /// Collect all element types in the loop for which widening is needed.
  void collectElementTypesForWidening();

  /// Split reductions into those that happen in the loop, and those that
  /// happen outside. In-loop reductions are collected into
  /// InLoopReductionChains.
  void collectInLoopReductions();

  /// Returns true if we should use strict in-order reductions for the given
  /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
  /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
  /// of FP operations.
  bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) {
    return !Hints->allowReordering() && RdxDesc.isOrdered();
  }

  /// \returns The smallest bitwidth each instruction can be represented with.
  /// The vector equivalents of these instructions should be truncated to this
  /// type.
  const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
    return MinBWs;
  }

  /// \returns True if it is more profitable to scalarize instruction \p I for
  /// vectorization factor \p VF.
  bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
    assert(VF.isVector() &&
           "Profitable to scalarize relevant only for VF > 1.");

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto Scalars = InstsToScalarize.find(VF);
    assert(Scalars != InstsToScalarize.end() &&
           "VF not yet analyzed for scalarization profitability");
    return Scalars->second.find(I) != Scalars->second.end();
  }

  /// Returns true if \p I is known to be uniform after vectorization.
  bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
    if (VF.isScalar())
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto UniformsPerVF = Uniforms.find(VF);
    assert(UniformsPerVF != Uniforms.end() &&
           "VF not yet analyzed for uniformity");
    return UniformsPerVF->second.count(I);
  }

  /// Returns true if \p I is known to be scalar after vectorization.
  bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
    if (VF.isScalar())
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto ScalarsPerVF = Scalars.find(VF);
    assert(ScalarsPerVF != Scalars.end() &&
           "Scalar values are not calculated for VF");
    return ScalarsPerVF->second.count(I);
  }

  /// \returns True if instruction \p I can be truncated to a smaller bitwidth
  /// for vectorization factor \p VF.
  bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
    return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
           !isProfitableToScalarize(I, VF) &&
           !isScalarAfterVectorization(I, VF);
  }

  /// Decision that was taken during cost calculation for memory instruction.
  enum InstWidening {
    CM_Unknown,
    CM_Widen,         // For consecutive accesses with stride +1.
    CM_Widen_Reverse, // For consecutive accesses with stride -1.
    CM_Interleave,
    CM_GatherScatter,
    CM_Scalarize
  };

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// instruction \p I and vector width \p VF.
  void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
                           InstructionCost Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
  }

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// interleaving group \p Grp and vector width \p VF.
  void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
                           ElementCount VF, InstWidening W,
                           InstructionCost Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    /// Broadcast this decision to all instructions inside the group.
    /// But the cost will be assigned to one instruction only.
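    /// E.g. for a group with factor 3, only the member at the insert position
    /// records the real cost; the other members get the same decision with a
    /// cost of zero, so the group is not costed more than once.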
1441 for (unsigned i = 0; i < Grp->getFactor(); ++i) { 1442 if (auto *I = Grp->getMember(i)) { 1443 if (Grp->getInsertPos() == I) 1444 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost); 1445 else 1446 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0); 1447 } 1448 } 1449 } 1450 1451 /// Return the cost model decision for the given instruction \p I and vector 1452 /// width \p VF. Return CM_Unknown if this instruction did not pass 1453 /// through the cost modeling. 1454 InstWidening getWideningDecision(Instruction *I, ElementCount VF) const { 1455 assert(VF.isVector() && "Expected VF to be a vector VF"); 1456 // Cost model is not run in the VPlan-native path - return conservative 1457 // result until this changes. 1458 if (EnableVPlanNativePath) 1459 return CM_GatherScatter; 1460 1461 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF); 1462 auto Itr = WideningDecisions.find(InstOnVF); 1463 if (Itr == WideningDecisions.end()) 1464 return CM_Unknown; 1465 return Itr->second.first; 1466 } 1467 1468 /// Return the vectorization cost for the given instruction \p I and vector 1469 /// width \p VF. 1470 InstructionCost getWideningCost(Instruction *I, ElementCount VF) { 1471 assert(VF.isVector() && "Expected VF >=2"); 1472 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF); 1473 assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() && 1474 "The cost is not calculated"); 1475 return WideningDecisions[InstOnVF].second; 1476 } 1477 1478 /// Return True if instruction \p I is an optimizable truncate whose operand 1479 /// is an induction variable. Such a truncate will be removed by adding a new 1480 /// induction variable with the destination type. 1481 bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) { 1482 // If the instruction is not a truncate, return false. 1483 auto *Trunc = dyn_cast<TruncInst>(I); 1484 if (!Trunc) 1485 return false; 1486 1487 // Get the source and destination types of the truncate. 1488 Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF); 1489 Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF); 1490 1491 // If the truncate is free for the given types, return false. Replacing a 1492 // free truncate with an induction variable would add an induction variable 1493 // update instruction to each iteration of the loop. We exclude from this 1494 // check the primary induction variable since it will need an update 1495 // instruction regardless. 1496 Value *Op = Trunc->getOperand(0); 1497 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy)) 1498 return false; 1499 1500 // If the truncated value is not an induction variable, return false. 1501 return Legal->isInductionPhi(Op); 1502 } 1503 1504 /// Collects the instructions to scalarize for each predicated instruction in 1505 /// the loop. 1506 void collectInstsToScalarize(ElementCount VF); 1507 1508 /// Collect Uniform and Scalar values for the given \p VF. 1509 /// The sets depend on CM decision for Load/Store instructions 1510 /// that may be vectorized as interleave, gather-scatter or scalarized. 1511 void collectUniformsAndScalars(ElementCount VF) { 1512 // Do the analysis once. 
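    // Scalar VFs need no collection, and an existing entry in Uniforms means
    // the sets for this VF have already been computed.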
1513 if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end()) 1514 return; 1515 setCostBasedWideningDecision(VF); 1516 collectLoopUniforms(VF); 1517 collectLoopScalars(VF); 1518 } 1519 1520 /// Returns true if the target machine supports masked store operation 1521 /// for the given \p DataType and kind of access to \p Ptr. 1522 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const { 1523 return Legal->isConsecutivePtr(DataType, Ptr) && 1524 TTI.isLegalMaskedStore(DataType, Alignment); 1525 } 1526 1527 /// Returns true if the target machine supports masked load operation 1528 /// for the given \p DataType and kind of access to \p Ptr. 1529 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const { 1530 return Legal->isConsecutivePtr(DataType, Ptr) && 1531 TTI.isLegalMaskedLoad(DataType, Alignment); 1532 } 1533 1534 /// Returns true if the target machine can represent \p V as a masked gather 1535 /// or scatter operation. 1536 bool isLegalGatherOrScatter(Value *V, 1537 ElementCount VF = ElementCount::getFixed(1)) { 1538 bool LI = isa<LoadInst>(V); 1539 bool SI = isa<StoreInst>(V); 1540 if (!LI && !SI) 1541 return false; 1542 auto *Ty = getLoadStoreType(V); 1543 Align Align = getLoadStoreAlignment(V); 1544 if (VF.isVector()) 1545 Ty = VectorType::get(Ty, VF); 1546 return (LI && TTI.isLegalMaskedGather(Ty, Align)) || 1547 (SI && TTI.isLegalMaskedScatter(Ty, Align)); 1548 } 1549 1550 /// Returns true if the target machine supports all of the reduction 1551 /// variables found for the given VF. 1552 bool canVectorizeReductions(ElementCount VF) const { 1553 return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 1554 const RecurrenceDescriptor &RdxDesc = Reduction.second; 1555 return TTI.isLegalToVectorizeReduction(RdxDesc, VF); 1556 })); 1557 } 1558 1559 /// Returns true if \p I is an instruction that will be scalarized with 1560 /// predication when vectorizing \p I with vectorization factor \p VF. Such 1561 /// instructions include conditional stores and instructions that may divide 1562 /// by zero. 1563 bool isScalarWithPredication(Instruction *I, ElementCount VF) const; 1564 1565 // Returns true if \p I is an instruction that will be predicated either 1566 // through scalar predication or masked load/store or masked gather/scatter. 1567 // \p VF is the vectorization factor that will be used to vectorize \p I. 1568 // Superset of instructions that return true for isScalarWithPredication. 1569 bool isPredicatedInst(Instruction *I, ElementCount VF, 1570 bool IsKnownUniform = false) { 1571 // When we know the load is uniform and the original scalar loop was not 1572 // predicated we don't need to mark it as a predicated instruction. Any 1573 // vectorised blocks created when tail-folding are something artificial we 1574 // have introduced and we know there is always at least one active lane. 1575 // That's why we call Legal->blockNeedsPredication here because it doesn't 1576 // query tail-folding. 1577 if (IsKnownUniform && isa<LoadInst>(I) && 1578 !Legal->blockNeedsPredication(I->getParent())) 1579 return false; 1580 if (!blockNeedsPredicationForAnyReason(I->getParent())) 1581 return false; 1582 // Loads and stores that need some form of masked operation are predicated 1583 // instructions. 
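    // Any other instruction falls through to the generic
    // scalar-with-predication check below.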
    if (isa<LoadInst>(I) || isa<StoreInst>(I))
      return Legal->isMaskRequired(I);
    return isScalarWithPredication(I, VF);
  }

  /// Returns true if \p I is a memory instruction with consecutive memory
  /// access that can be widened.
  bool
  memoryInstructionCanBeWidened(Instruction *I,
                                ElementCount VF = ElementCount::getFixed(1));

  /// Returns true if \p I is a memory instruction in an interleaved-group
  /// of memory accesses that can be vectorized with wide vector loads/stores
  /// and shuffles.
  bool
  interleavedAccessCanBeWidened(Instruction *I,
                                ElementCount VF = ElementCount::getFixed(1));

  /// Check if \p Instr belongs to any interleaved access group.
  bool isAccessInterleaved(Instruction *Instr) {
    return InterleaveInfo.isInterleaved(Instr);
  }

  /// Get the interleaved access group that \p Instr belongs to.
  const InterleaveGroup<Instruction> *
  getInterleavedAccessGroup(Instruction *Instr) {
    return InterleaveInfo.getInterleaveGroup(Instr);
  }

  /// Returns true if we're required to use a scalar epilogue for at least
  /// the final iteration of the original loop.
  bool requiresScalarEpilogue(ElementCount VF) const {
    if (!isScalarEpilogueAllowed())
      return false;
    // If we might exit from anywhere but the latch, we must run the exiting
    // iteration in scalar form.
    if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
      return true;
    return VF.isVector() && InterleaveInfo.requiresScalarEpilogue();
  }

  /// Returns true if a scalar epilogue is allowed, i.e. it has not been ruled
  /// out by optsize or a loop hint annotation.
  bool isScalarEpilogueAllowed() const {
    return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
  }

  /// Returns true if all loop blocks should be masked to fold the loop tail.
  bool foldTailByMasking() const { return FoldTailByMasking; }

  /// Returns true if the instructions in this block require predication
  /// for any reason, e.g. because tail folding now requires a predicate
  /// or because the block in the original loop was predicated.
  bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const {
    return foldTailByMasking() || Legal->blockNeedsPredication(BB);
  }

  /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
  /// nodes to the chain of instructions representing the reductions. Uses a
  /// MapVector to ensure deterministic iteration order.
  using ReductionChainMap =
      SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;

  /// Return the chain of instructions representing an inloop reduction.
  const ReductionChainMap &getInLoopReductionChains() const {
    return InLoopReductionChains;
  }

  /// Returns true if the Phi is part of an inloop reduction.
  bool isInLoopReduction(PHINode *Phi) const {
    return InLoopReductionChains.count(Phi);
  }

  /// Estimate cost of an intrinsic call instruction CI if it were vectorized
  /// with factor VF. Return the cost of the instruction, including
  /// scalarization overhead if it's needed.
  InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;

  /// Estimate cost of a call instruction CI if it were vectorized with factor
  /// VF. Return the cost of the instruction, including scalarization overhead
  /// if it's needed.
  /// The flag NeedToScalarize shows whether the call needs to be scalarized,
  /// i.e. either a vector version isn't available, or it is too expensive.
  InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF,
                                    bool &NeedToScalarize) const;

  /// Returns true if the per-lane cost of VectorizationFactor A is lower than
  /// that of B.
  bool isMoreProfitable(const VectorizationFactor &A,
                        const VectorizationFactor &B) const;

  /// Invalidates decisions already taken by the cost model.
  void invalidateCostModelingDecisions() {
    WideningDecisions.clear();
    Uniforms.clear();
    Scalars.clear();
  }

private:
  unsigned NumPredStores = 0;

  /// Convenience function that returns the value of vscale_range if
  /// vscale_range.min == vscale_range.max, and otherwise returns the value
  /// returned by the corresponding TTI method.
  Optional<unsigned> getVScaleForTuning() const;

  /// \return An upper bound for the vectorization factors for both
  /// fixed and scalable vectorization, where the minimum-known number of
  /// elements is a power-of-2 larger than zero. If scalable vectorization is
  /// disabled or unsupported, then the scalable part will be equal to
  /// ElementCount::getScalable(0).
  FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount,
                                           ElementCount UserVF,
                                           bool FoldTailByMasking);

  /// \return the maximized element count based on the target's vector
  /// registers and the loop trip-count, but limited to a maximum safe VF.
  /// This is a helper function of computeFeasibleMaxVF.
  /// FIXME: MaxSafeVF is currently passed by reference to avoid some obscure
  /// issue that occurred on one of the buildbots which cannot be reproduced
  /// without having access to the proprietary compiler (see comments on
  /// D98509). The issue is currently under investigation and this workaround
  /// will be removed as soon as possible.
  ElementCount getMaximizedVFForTarget(unsigned ConstTripCount,
                                       unsigned SmallestType,
                                       unsigned WidestType,
                                       const ElementCount &MaxSafeVF,
                                       bool FoldTailByMasking);

  /// \return the maximum legal scalable VF, based on the safe max number
  /// of elements.
  ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);

  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
  using VectorizationCostTy = std::pair<InstructionCost, bool>;

  /// Returns the expected execution cost. The unit of the cost does
  /// not matter because we use the 'cost' units to compare different
  /// vector widths. The cost that is returned is *not* normalized by
  /// the factor width. If \p Invalid is not nullptr, this function
  /// will add a pair(Instruction*, ElementCount) to \p Invalid for
  /// each instruction that has an Invalid cost for the given VF.
  using InstructionVFPair = std::pair<Instruction *, ElementCount>;
  VectorizationCostTy
  expectedCost(ElementCount VF,
               SmallVectorImpl<InstructionVFPair> *Invalid = nullptr);

  /// Returns the execution time cost of an instruction for a given vector
  /// width.
  /// A vector width of one means scalar.
  VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);

  /// The cost-computation logic from getInstructionCost which provides
  /// the vector type as an output parameter.
  InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
                                     Type *&VectorTy);

  /// Return the cost of instructions in an inloop reduction pattern, if I is
  /// part of that pattern.
  Optional<InstructionCost>
  getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
                          TTI::TargetCostKind CostKind);

  /// Calculate vectorization cost of memory instruction \p I.
  InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);

  /// The cost computation for scalarized memory instruction.
  InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);

  /// The cost computation for interleaving group of memory instructions.
  InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);

  /// The cost computation for Gather/Scatter instruction.
  InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);

  /// The cost computation for widening instruction \p I with consecutive
  /// memory access.
  InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);

  /// The cost calculation for Load/Store instruction \p I with uniform pointer -
  /// Load: scalar load + broadcast.
  /// Store: scalar store + (loop invariant value stored? 0 : extract of last
  /// element)
  InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);

  /// Estimate the overhead of scalarizing an instruction. This is a
  /// convenience wrapper for the type-based getScalarizationOverhead API.
  InstructionCost getScalarizationOverhead(Instruction *I,
                                           ElementCount VF) const;

  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
  bool isConsecutiveLoadOrStore(Instruction *I);

  /// Returns true if an artificially high cost for emulated masked memrefs
  /// should be used.
  bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);

  /// Map of scalar integer values to the smallest bitwidth they can be legally
  /// represented as. The vector equivalents of these values should be truncated
  /// to this type.
  MapVector<Instruction *, uint64_t> MinBWs;

  /// A type representing the costs for instructions if they were to be
  /// scalarized rather than vectorized. The entries are Instruction-Cost
  /// pairs.
  using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;

  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as a predicated block.
  SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;

  /// Records whether it is allowed to have the original scalar loop execute at
  /// least once. This may be needed as a fallback loop in case runtime
  /// aliasing/dependence checks fail, or to handle the tail/remainder
  /// iterations when the trip count is unknown or doesn't divide by the VF,
  /// or as a peel-loop to handle gaps in interleave-groups.
  /// Under optsize and when the trip count is very small we don't allow any
  /// iterations to execute in the scalar loop.
1806 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 1807 1808 /// All blocks of loop are to be masked to fold tail of scalar iterations. 1809 bool FoldTailByMasking = false; 1810 1811 /// A map holding scalar costs for different vectorization factors. The 1812 /// presence of a cost for an instruction in the mapping indicates that the 1813 /// instruction will be scalarized when vectorizing with the associated 1814 /// vectorization factor. The entries are VF-ScalarCostTy pairs. 1815 DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize; 1816 1817 /// Holds the instructions known to be uniform after vectorization. 1818 /// The data is collected per VF. 1819 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms; 1820 1821 /// Holds the instructions known to be scalar after vectorization. 1822 /// The data is collected per VF. 1823 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars; 1824 1825 /// Holds the instructions (address computations) that are forced to be 1826 /// scalarized. 1827 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars; 1828 1829 /// PHINodes of the reductions that should be expanded in-loop along with 1830 /// their associated chains of reduction operations, in program order from top 1831 /// (PHI) to bottom 1832 ReductionChainMap InLoopReductionChains; 1833 1834 /// A Map of inloop reduction operations and their immediate chain operand. 1835 /// FIXME: This can be removed once reductions can be costed correctly in 1836 /// vplan. This was added to allow quick lookup to the inloop operations, 1837 /// without having to loop through InLoopReductionChains. 1838 DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains; 1839 1840 /// Returns the expected difference in cost from scalarizing the expression 1841 /// feeding a predicated instruction \p PredInst. The instructions to 1842 /// scalarize and their scalar costs are collected in \p ScalarCosts. A 1843 /// non-negative return value implies the expression will be scalarized. 1844 /// Currently, only single-use chains are considered for scalarization. 1845 int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts, 1846 ElementCount VF); 1847 1848 /// Collect the instructions that are uniform after vectorization. An 1849 /// instruction is uniform if we represent it with a single scalar value in 1850 /// the vectorized loop corresponding to each vector iteration. Examples of 1851 /// uniform instructions include pointer operands of consecutive or 1852 /// interleaved memory accesses. Note that although uniformity implies an 1853 /// instruction will be scalar, the reverse is not true. In general, a 1854 /// scalarized instruction will be represented by VF scalar values in the 1855 /// vectorized loop, each corresponding to an iteration of the original 1856 /// scalar loop. 1857 void collectLoopUniforms(ElementCount VF); 1858 1859 /// Collect the instructions that are scalar after vectorization. An 1860 /// instruction is scalar if it is known to be uniform or will be scalarized 1861 /// during vectorization. collectLoopScalars should only add non-uniform nodes 1862 /// to the list if they are used by a load/store instruction that is marked as 1863 /// CM_Scalarize. Non-uniform scalarized instructions will be represented by 1864 /// VF values in the vectorized loop, each corresponding to an iteration of 1865 /// the original scalar loop. 
1866 void collectLoopScalars(ElementCount VF); 1867 1868 /// Keeps cost model vectorization decision and cost for instructions. 1869 /// Right now it is used for memory instructions only. 1870 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>, 1871 std::pair<InstWidening, InstructionCost>>; 1872 1873 DecisionList WideningDecisions; 1874 1875 /// Returns true if \p V is expected to be vectorized and it needs to be 1876 /// extracted. 1877 bool needsExtract(Value *V, ElementCount VF) const { 1878 Instruction *I = dyn_cast<Instruction>(V); 1879 if (VF.isScalar() || !I || !TheLoop->contains(I) || 1880 TheLoop->isLoopInvariant(I)) 1881 return false; 1882 1883 // Assume we can vectorize V (and hence we need extraction) if the 1884 // scalars are not computed yet. This can happen, because it is called 1885 // via getScalarizationOverhead from setCostBasedWideningDecision, before 1886 // the scalars are collected. That should be a safe assumption in most 1887 // cases, because we check if the operands have vectorizable types 1888 // beforehand in LoopVectorizationLegality. 1889 return Scalars.find(VF) == Scalars.end() || 1890 !isScalarAfterVectorization(I, VF); 1891 }; 1892 1893 /// Returns a range containing only operands needing to be extracted. 1894 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, 1895 ElementCount VF) const { 1896 return SmallVector<Value *, 4>(make_filter_range( 1897 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); 1898 } 1899 1900 /// Determines if we have the infrastructure to vectorize loop \p L and its 1901 /// epilogue, assuming the main loop is vectorized by \p VF. 1902 bool isCandidateForEpilogueVectorization(const Loop &L, 1903 const ElementCount VF) const; 1904 1905 /// Returns true if epilogue vectorization is considered profitable, and 1906 /// false otherwise. 1907 /// \p VF is the vectorization factor chosen for the original loop. 1908 bool isEpilogueVectorizationProfitable(const ElementCount VF) const; 1909 1910 public: 1911 /// The loop that we evaluate. 1912 Loop *TheLoop; 1913 1914 /// Predicated scalar evolution analysis. 1915 PredicatedScalarEvolution &PSE; 1916 1917 /// Loop Info analysis. 1918 LoopInfo *LI; 1919 1920 /// Vectorization legality. 1921 LoopVectorizationLegality *Legal; 1922 1923 /// Vector target information. 1924 const TargetTransformInfo &TTI; 1925 1926 /// Target Library Info. 1927 const TargetLibraryInfo *TLI; 1928 1929 /// Demanded bits analysis. 1930 DemandedBits *DB; 1931 1932 /// Assumption cache. 1933 AssumptionCache *AC; 1934 1935 /// Interface to emit optimization remarks. 1936 OptimizationRemarkEmitter *ORE; 1937 1938 const Function *TheFunction; 1939 1940 /// Loop Vectorize Hint. 1941 const LoopVectorizeHints *Hints; 1942 1943 /// The interleave access information contains groups of interleaved accesses 1944 /// with the same stride and close to each other. 1945 InterleavedAccessInfo &InterleaveInfo; 1946 1947 /// Values to ignore in the cost model. 1948 SmallPtrSet<const Value *, 16> ValuesToIgnore; 1949 1950 /// Values to ignore in the cost model when VF > 1. 1951 SmallPtrSet<const Value *, 16> VecValuesToIgnore; 1952 1953 /// All element types found in the loop. 1954 SmallPtrSet<Type *, 16> ElementTypesInLoop; 1955 1956 /// Profitable vector factors. 1957 SmallVector<VectorizationFactor, 8> ProfitableVFs; 1958 }; 1959 } // end namespace llvm 1960 1961 /// Helper struct to manage generating runtime checks for vectorization. 
///
/// The runtime checks are created up-front in temporary blocks, un-linked from
/// the existing IR, to allow a more accurate cost estimate. After deciding to
/// vectorize, the checks are moved back. If we decide not to vectorize, the
/// temporary blocks are completely removed.
class GeneratedRTChecks {
  /// Basic block which contains the generated SCEV checks, if any.
  BasicBlock *SCEVCheckBlock = nullptr;

  /// The value representing the result of the generated SCEV checks. If it is
  /// nullptr, either no SCEV checks have been generated or they have been used.
  Value *SCEVCheckCond = nullptr;

  /// Basic block which contains the generated memory runtime checks, if any.
  BasicBlock *MemCheckBlock = nullptr;

  /// The value representing the result of the generated memory runtime checks.
  /// If it is nullptr, either no memory runtime checks have been generated or
  /// they have been used.
  Value *MemRuntimeCheckCond = nullptr;

  DominatorTree *DT;
  LoopInfo *LI;

  SCEVExpander SCEVExp;
  SCEVExpander MemCheckExp;

public:
  GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
                    const DataLayout &DL)
      : DT(DT), LI(LI), SCEVExp(SE, DL, "scev.check"),
        MemCheckExp(SE, DL, "scev.check") {}

  /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
  /// accurately estimate the cost of the runtime checks. The blocks are
  /// un-linked from the IR and added back during vector code generation. If
  /// there is no vector code generation, the check blocks are removed
  /// completely.
  void Create(Loop *L, const LoopAccessInfo &LAI,
              const SCEVPredicate &Pred) {

    BasicBlock *LoopHeader = L->getHeader();
    BasicBlock *Preheader = L->getLoopPreheader();

    // Use SplitBlock to create blocks for SCEV & memory runtime checks to
    // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
    // may be used by SCEVExpander. The blocks will be un-linked from their
    // predecessors and removed from LI & DT at the end of the function.
    if (!Pred.isAlwaysTrue()) {
      SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
                                  nullptr, "vector.scevcheck");

      SCEVCheckCond = SCEVExp.expandCodeForPredicate(
          &Pred, SCEVCheckBlock->getTerminator());
    }

    const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
    if (RtPtrChecking.Need) {
      auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
      MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
                                 "vector.memcheck");

      MemRuntimeCheckCond =
          addRuntimeChecks(MemCheckBlock->getTerminator(), L,
                           RtPtrChecking.getChecks(), MemCheckExp);
      assert(MemRuntimeCheckCond &&
             "no RT checks generated although RtPtrChecking "
             "claimed checks are required");
    }

    if (!MemCheckBlock && !SCEVCheckBlock)
      return;

    // Unhook the temporary blocks with the checks, update various places
    // accordingly.
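    // Each check block gets a temporary unreachable terminator and is dropped
    // from DT and LI below; emitSCEVChecks and emitMemRuntimeChecks hook the
    // blocks back into the CFG, DT and LI if we later decide to vectorize.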
2037 if (SCEVCheckBlock) 2038 SCEVCheckBlock->replaceAllUsesWith(Preheader); 2039 if (MemCheckBlock) 2040 MemCheckBlock->replaceAllUsesWith(Preheader); 2041 2042 if (SCEVCheckBlock) { 2043 SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 2044 new UnreachableInst(Preheader->getContext(), SCEVCheckBlock); 2045 Preheader->getTerminator()->eraseFromParent(); 2046 } 2047 if (MemCheckBlock) { 2048 MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 2049 new UnreachableInst(Preheader->getContext(), MemCheckBlock); 2050 Preheader->getTerminator()->eraseFromParent(); 2051 } 2052 2053 DT->changeImmediateDominator(LoopHeader, Preheader); 2054 if (MemCheckBlock) { 2055 DT->eraseNode(MemCheckBlock); 2056 LI->removeBlock(MemCheckBlock); 2057 } 2058 if (SCEVCheckBlock) { 2059 DT->eraseNode(SCEVCheckBlock); 2060 LI->removeBlock(SCEVCheckBlock); 2061 } 2062 } 2063 2064 /// Remove the created SCEV & memory runtime check blocks & instructions, if 2065 /// unused. 2066 ~GeneratedRTChecks() { 2067 SCEVExpanderCleaner SCEVCleaner(SCEVExp); 2068 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp); 2069 if (!SCEVCheckCond) 2070 SCEVCleaner.markResultUsed(); 2071 2072 if (!MemRuntimeCheckCond) 2073 MemCheckCleaner.markResultUsed(); 2074 2075 if (MemRuntimeCheckCond) { 2076 auto &SE = *MemCheckExp.getSE(); 2077 // Memory runtime check generation creates compares that use expanded 2078 // values. Remove them before running the SCEVExpanderCleaners. 2079 for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) { 2080 if (MemCheckExp.isInsertedInstruction(&I)) 2081 continue; 2082 SE.forgetValue(&I); 2083 I.eraseFromParent(); 2084 } 2085 } 2086 MemCheckCleaner.cleanup(); 2087 SCEVCleaner.cleanup(); 2088 2089 if (SCEVCheckCond) 2090 SCEVCheckBlock->eraseFromParent(); 2091 if (MemRuntimeCheckCond) 2092 MemCheckBlock->eraseFromParent(); 2093 } 2094 2095 /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and 2096 /// adjusts the branches to branch to the vector preheader or \p Bypass, 2097 /// depending on the generated condition. 2098 BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass, 2099 BasicBlock *LoopVectorPreHeader, 2100 BasicBlock *LoopExitBlock) { 2101 if (!SCEVCheckCond) 2102 return nullptr; 2103 if (auto *C = dyn_cast<ConstantInt>(SCEVCheckCond)) 2104 if (C->isZero()) 2105 return nullptr; 2106 2107 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2108 2109 BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock); 2110 // Create new preheader for vector loop. 2111 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 2112 PL->addBasicBlockToLoop(SCEVCheckBlock, *LI); 2113 2114 SCEVCheckBlock->getTerminator()->eraseFromParent(); 2115 SCEVCheckBlock->moveBefore(LoopVectorPreHeader); 2116 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2117 SCEVCheckBlock); 2118 2119 DT->addNewBlock(SCEVCheckBlock, Pred); 2120 DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock); 2121 2122 ReplaceInstWithInst( 2123 SCEVCheckBlock->getTerminator(), 2124 BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheckCond)); 2125 // Mark the check as used, to prevent it from being removed during cleanup. 2126 SCEVCheckCond = nullptr; 2127 return SCEVCheckBlock; 2128 } 2129 2130 /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts 2131 /// the branches to branch to the vector preheader or \p Bypass, depending on 2132 /// the generated condition. 
2133 BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass, 2134 BasicBlock *LoopVectorPreHeader) { 2135 // Check if we generated code that checks in runtime if arrays overlap. 2136 if (!MemRuntimeCheckCond) 2137 return nullptr; 2138 2139 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2140 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2141 MemCheckBlock); 2142 2143 DT->addNewBlock(MemCheckBlock, Pred); 2144 DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock); 2145 MemCheckBlock->moveBefore(LoopVectorPreHeader); 2146 2147 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 2148 PL->addBasicBlockToLoop(MemCheckBlock, *LI); 2149 2150 ReplaceInstWithInst( 2151 MemCheckBlock->getTerminator(), 2152 BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond)); 2153 MemCheckBlock->getTerminator()->setDebugLoc( 2154 Pred->getTerminator()->getDebugLoc()); 2155 2156 // Mark the check as used, to prevent it from being removed during cleanup. 2157 MemRuntimeCheckCond = nullptr; 2158 return MemCheckBlock; 2159 } 2160 }; 2161 2162 // Return true if \p OuterLp is an outer loop annotated with hints for explicit 2163 // vectorization. The loop needs to be annotated with #pragma omp simd 2164 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the 2165 // vector length information is not provided, vectorization is not considered 2166 // explicit. Interleave hints are not allowed either. These limitations will be 2167 // relaxed in the future. 2168 // Please, note that we are currently forced to abuse the pragma 'clang 2169 // vectorize' semantics. This pragma provides *auto-vectorization hints* 2170 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' 2171 // provides *explicit vectorization hints* (LV can bypass legal checks and 2172 // assume that vectorization is legal). However, both hints are implemented 2173 // using the same metadata (llvm.loop.vectorize, processed by 2174 // LoopVectorizeHints). This will be fixed in the future when the native IR 2175 // representation for pragma 'omp simd' is introduced. 2176 static bool isExplicitVecOuterLoop(Loop *OuterLp, 2177 OptimizationRemarkEmitter *ORE) { 2178 assert(!OuterLp->isInnermost() && "This is not an outer loop"); 2179 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); 2180 2181 // Only outer loops with an explicit vectorization hint are supported. 2182 // Unannotated outer loops are ignored. 2183 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) 2184 return false; 2185 2186 Function *Fn = OuterLp->getHeader()->getParent(); 2187 if (!Hints.allowVectorization(Fn, OuterLp, 2188 true /*VectorizeOnlyWhenForced*/)) { 2189 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); 2190 return false; 2191 } 2192 2193 if (Hints.getInterleave() > 1) { 2194 // TODO: Interleave support is future work. 2195 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " 2196 "outer loops.\n"); 2197 Hints.emitRemarkWithHints(); 2198 return false; 2199 } 2200 2201 return true; 2202 } 2203 2204 static void collectSupportedLoops(Loop &L, LoopInfo *LI, 2205 OptimizationRemarkEmitter *ORE, 2206 SmallVectorImpl<Loop *> &V) { 2207 // Collect inner loops and outer loops without irreducible control flow. For 2208 // now, only collect outer loops that have explicit vectorization hints. If we 2209 // are stress testing the VPlan H-CFG construction, we collect the outermost 2210 // loop of every loop nest. 
2211 if (L.isInnermost() || VPlanBuildStressTest || 2212 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { 2213 LoopBlocksRPO RPOT(&L); 2214 RPOT.perform(LI); 2215 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { 2216 V.push_back(&L); 2217 // TODO: Collect inner loops inside marked outer loops in case 2218 // vectorization fails for the outer loop. Do not invoke 2219 // 'containsIrreducibleCFG' again for inner loops when the outer loop is 2220 // already known to be reducible. We can use an inherited attribute for 2221 // that. 2222 return; 2223 } 2224 } 2225 for (Loop *InnerL : L) 2226 collectSupportedLoops(*InnerL, LI, ORE, V); 2227 } 2228 2229 namespace { 2230 2231 /// The LoopVectorize Pass. 2232 struct LoopVectorize : public FunctionPass { 2233 /// Pass identification, replacement for typeid 2234 static char ID; 2235 2236 LoopVectorizePass Impl; 2237 2238 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false, 2239 bool VectorizeOnlyWhenForced = false) 2240 : FunctionPass(ID), 2241 Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) { 2242 initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); 2243 } 2244 2245 bool runOnFunction(Function &F) override { 2246 if (skipFunction(F)) 2247 return false; 2248 2249 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 2250 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 2251 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 2252 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 2253 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); 2254 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); 2255 auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr; 2256 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); 2257 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 2258 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); 2259 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); 2260 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 2261 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 2262 2263 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 2264 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; 2265 2266 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC, 2267 GetLAA, *ORE, PSI).MadeAnyChange; 2268 } 2269 2270 void getAnalysisUsage(AnalysisUsage &AU) const override { 2271 AU.addRequired<AssumptionCacheTracker>(); 2272 AU.addRequired<BlockFrequencyInfoWrapperPass>(); 2273 AU.addRequired<DominatorTreeWrapperPass>(); 2274 AU.addRequired<LoopInfoWrapperPass>(); 2275 AU.addRequired<ScalarEvolutionWrapperPass>(); 2276 AU.addRequired<TargetTransformInfoWrapperPass>(); 2277 AU.addRequired<AAResultsWrapperPass>(); 2278 AU.addRequired<LoopAccessLegacyAnalysis>(); 2279 AU.addRequired<DemandedBitsWrapperPass>(); 2280 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 2281 AU.addRequired<InjectTLIMappingsLegacy>(); 2282 2283 // We currently do not preserve loopinfo/dominator analyses with outer loop 2284 // vectorization. Until this is addressed, mark these analyses as preserved 2285 // only for non-VPlan-native path. 2286 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 
2287 if (!EnableVPlanNativePath) { 2288 AU.addPreserved<LoopInfoWrapperPass>(); 2289 AU.addPreserved<DominatorTreeWrapperPass>(); 2290 } 2291 2292 AU.addPreserved<BasicAAWrapperPass>(); 2293 AU.addPreserved<GlobalsAAWrapperPass>(); 2294 AU.addRequired<ProfileSummaryInfoWrapperPass>(); 2295 } 2296 }; 2297 2298 } // end anonymous namespace 2299 2300 //===----------------------------------------------------------------------===// 2301 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 2302 // LoopVectorizationCostModel and LoopVectorizationPlanner. 2303 //===----------------------------------------------------------------------===// 2304 2305 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { 2306 // We need to place the broadcast of invariant variables outside the loop, 2307 // but only if it's proven safe to do so. Else, broadcast will be inside 2308 // vector loop body. 2309 Instruction *Instr = dyn_cast<Instruction>(V); 2310 bool SafeToHoist = OrigLoop->isLoopInvariant(V) && 2311 (!Instr || 2312 DT->dominates(Instr->getParent(), LoopVectorPreHeader)); 2313 // Place the code for broadcasting invariant variables in the new preheader. 2314 IRBuilder<>::InsertPointGuard Guard(Builder); 2315 if (SafeToHoist) 2316 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2317 2318 // Broadcast the scalar into all locations in the vector. 2319 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); 2320 2321 return Shuf; 2322 } 2323 2324 /// This function adds 2325 /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...) 2326 /// to each vector element of Val. The sequence starts at StartIndex. 2327 /// \p Opcode is relevant for FP induction variable. 2328 static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step, 2329 Instruction::BinaryOps BinOp, ElementCount VF, 2330 IRBuilderBase &Builder) { 2331 assert(VF.isVector() && "only vector VFs are supported"); 2332 2333 // Create and check the types. 2334 auto *ValVTy = cast<VectorType>(Val->getType()); 2335 ElementCount VLen = ValVTy->getElementCount(); 2336 2337 Type *STy = Val->getType()->getScalarType(); 2338 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && 2339 "Induction Step must be an integer or FP"); 2340 assert(Step->getType() == STy && "Step has wrong type"); 2341 2342 SmallVector<Constant *, 8> Indices; 2343 2344 // Create a vector of consecutive numbers from zero to VF. 2345 VectorType *InitVecValVTy = ValVTy; 2346 if (STy->isFloatingPointTy()) { 2347 Type *InitVecValSTy = 2348 IntegerType::get(STy->getContext(), STy->getScalarSizeInBits()); 2349 InitVecValVTy = VectorType::get(InitVecValSTy, VLen); 2350 } 2351 Value *InitVec = Builder.CreateStepVector(InitVecValVTy); 2352 2353 // Splat the StartIdx 2354 Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx); 2355 2356 if (STy->isIntegerTy()) { 2357 InitVec = Builder.CreateAdd(InitVec, StartIdxSplat); 2358 Step = Builder.CreateVectorSplat(VLen, Step); 2359 assert(Step->getType() == Val->getType() && "Invalid step vec"); 2360 // FIXME: The newly created binary instructions should contain nsw/nuw 2361 // flags, which can be found from the original scalar operations. 2362 Step = Builder.CreateMul(InitVec, Step); 2363 return Builder.CreateAdd(Val, Step, "induction"); 2364 } 2365 2366 // Floating point induction. 
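  // This mirrors the integer path above: convert the step vector to FP, add
  // the splatted StartIdx, scale by the splatted Step and combine the result
  // with Val using the induction's FAdd/FSub opcode.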
2367 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && 2368 "Binary Opcode should be specified for FP induction"); 2369 InitVec = Builder.CreateUIToFP(InitVec, ValVTy); 2370 InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat); 2371 2372 Step = Builder.CreateVectorSplat(VLen, Step); 2373 Value *MulOp = Builder.CreateFMul(InitVec, Step); 2374 return Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); 2375 } 2376 2377 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( 2378 const InductionDescriptor &II, Value *Step, Value *Start, 2379 Instruction *EntryVal, VPValue *Def, VPTransformState &State) { 2380 IRBuilderBase &Builder = State.Builder; 2381 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 2382 "Expected either an induction phi-node or a truncate of it!"); 2383 2384 // Construct the initial value of the vector IV in the vector loop preheader 2385 auto CurrIP = Builder.saveIP(); 2386 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2387 if (isa<TruncInst>(EntryVal)) { 2388 assert(Start->getType()->isIntegerTy() && 2389 "Truncation requires an integer type"); 2390 auto *TruncType = cast<IntegerType>(EntryVal->getType()); 2391 Step = Builder.CreateTrunc(Step, TruncType); 2392 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); 2393 } 2394 2395 Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0); 2396 Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start); 2397 Value *SteppedStart = getStepVector( 2398 SplatStart, Zero, Step, II.getInductionOpcode(), State.VF, State.Builder); 2399 2400 // We create vector phi nodes for both integer and floating-point induction 2401 // variables. Here, we determine the kind of arithmetic we will perform. 2402 Instruction::BinaryOps AddOp; 2403 Instruction::BinaryOps MulOp; 2404 if (Step->getType()->isIntegerTy()) { 2405 AddOp = Instruction::Add; 2406 MulOp = Instruction::Mul; 2407 } else { 2408 AddOp = II.getInductionOpcode(); 2409 MulOp = Instruction::FMul; 2410 } 2411 2412 // Multiply the vectorization factor by the step using integer or 2413 // floating-point arithmetic as appropriate. 2414 Type *StepType = Step->getType(); 2415 Value *RuntimeVF; 2416 if (Step->getType()->isFloatingPointTy()) 2417 RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF); 2418 else 2419 RuntimeVF = getRuntimeVF(Builder, StepType, State.VF); 2420 Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF); 2421 2422 // Create a vector splat to use in the induction update. 2423 // 2424 // FIXME: If the step is non-constant, we create the vector splat with 2425 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't 2426 // handle a constant vector splat. 2427 Value *SplatVF = isa<Constant>(Mul) 2428 ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul)) 2429 : Builder.CreateVectorSplat(State.VF, Mul); 2430 Builder.restoreIP(CurrIP); 2431 2432 // We may need to add the step a number of times, depending on the unroll 2433 // factor. The last of those goes into the PHI. 
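  // For example, with UF = 2: part 0 uses the phi %vec.ind directly, part 1
  // is %vec.ind plus a splat of RuntimeVF * Step ("step.add"), and the value
  // fed back into the phi is part 1 plus another such step ("vec.ind.next").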
2434 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", 2435 &*LoopVectorBody->getFirstInsertionPt()); 2436 VecInd->setDebugLoc(EntryVal->getDebugLoc()); 2437 Instruction *LastInduction = VecInd; 2438 for (unsigned Part = 0; Part < UF; ++Part) { 2439 State.set(Def, LastInduction, Part); 2440 2441 if (isa<TruncInst>(EntryVal)) 2442 addMetadata(LastInduction, EntryVal); 2443 2444 LastInduction = cast<Instruction>( 2445 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")); 2446 LastInduction->setDebugLoc(EntryVal->getDebugLoc()); 2447 } 2448 2449 // Move the last step to the end of the latch block. This ensures consistent 2450 // placement of all induction updates. 2451 auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 2452 auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator()); 2453 LastInduction->moveBefore(Br); 2454 LastInduction->setName("vec.ind.next"); 2455 2456 VecInd->addIncoming(SteppedStart, LoopVectorPreHeader); 2457 VecInd->addIncoming(LastInduction, LoopVectorLatch); 2458 } 2459 2460 /// Compute scalar induction steps. \p ScalarIV is the scalar induction 2461 /// variable on which to base the steps, \p Step is the size of the step, and 2462 /// \p EntryVal is the value from the original loop that maps to the steps. 2463 /// Note that \p EntryVal doesn't have to be an induction variable - it 2464 /// can also be a truncate instruction. 2465 static void buildScalarSteps(Value *ScalarIV, Value *Step, 2466 Instruction *EntryVal, 2467 const InductionDescriptor &ID, VPValue *Def, 2468 VPTransformState &State) { 2469 IRBuilderBase &Builder = State.Builder; 2470 // We shouldn't have to build scalar steps if we aren't vectorizing. 2471 assert(State.VF.isVector() && "VF should be greater than one"); 2472 // Get the value type and ensure it and the step have the same integer type. 2473 Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); 2474 assert(ScalarIVTy == Step->getType() && 2475 "Val and Step should have the same type"); 2476 2477 // We build scalar steps for both integer and floating-point induction 2478 // variables. Here, we determine the kind of arithmetic we will perform. 2479 Instruction::BinaryOps AddOp; 2480 Instruction::BinaryOps MulOp; 2481 if (ScalarIVTy->isIntegerTy()) { 2482 AddOp = Instruction::Add; 2483 MulOp = Instruction::Mul; 2484 } else { 2485 AddOp = ID.getInductionOpcode(); 2486 MulOp = Instruction::FMul; 2487 } 2488 2489 // Determine the number of scalars we need to generate for each unroll 2490 // iteration. 2491 bool FirstLaneOnly = vputils::onlyFirstLaneUsed(Def); 2492 unsigned Lanes = FirstLaneOnly ? 1 : State.VF.getKnownMinValue(); 2493 // Compute the scalar steps and save the results in State. 
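  // For scalable VFs (when more than the first lane is used) a full vector of
  // steps is also materialized for each part; in all cases scalar step values
  // are emitted below for the first 'Lanes' lanes of each part.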
2494 Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(), 2495 ScalarIVTy->getScalarSizeInBits()); 2496 Type *VecIVTy = nullptr; 2497 Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr; 2498 if (!FirstLaneOnly && State.VF.isScalable()) { 2499 VecIVTy = VectorType::get(ScalarIVTy, State.VF); 2500 UnitStepVec = 2501 Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF)); 2502 SplatStep = Builder.CreateVectorSplat(State.VF, Step); 2503 SplatIV = Builder.CreateVectorSplat(State.VF, ScalarIV); 2504 } 2505 2506 for (unsigned Part = 0; Part < State.UF; ++Part) { 2507 Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part); 2508 2509 if (!FirstLaneOnly && State.VF.isScalable()) { 2510 auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0); 2511 auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec); 2512 if (ScalarIVTy->isFloatingPointTy()) 2513 InitVec = Builder.CreateSIToFP(InitVec, VecIVTy); 2514 auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep); 2515 auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul); 2516 State.set(Def, Add, Part); 2517 // It's useful to record the lane values too for the known minimum number 2518 // of elements so we do those below. This improves the code quality when 2519 // trying to extract the first element, for example. 2520 } 2521 2522 if (ScalarIVTy->isFloatingPointTy()) 2523 StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy); 2524 2525 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 2526 Value *StartIdx = Builder.CreateBinOp( 2527 AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane)); 2528 // The step returned by `createStepForVF` is a runtime-evaluated value 2529 // when VF is scalable. Otherwise, it should be folded into a Constant. 2530 assert((State.VF.isScalable() || isa<Constant>(StartIdx)) && 2531 "Expected StartIdx to be folded to a constant when VF is not " 2532 "scalable"); 2533 auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step); 2534 auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul); 2535 State.set(Def, Add, VPIteration(Part, Lane)); 2536 } 2537 } 2538 } 2539 2540 // Generate code for the induction step. Note that induction steps are 2541 // required to be loop-invariant 2542 static Value *CreateStepValue(const SCEV *Step, ScalarEvolution &SE, 2543 Instruction *InsertBefore, 2544 Loop *OrigLoop = nullptr) { 2545 const DataLayout &DL = SE.getDataLayout(); 2546 assert((!OrigLoop || SE.isLoopInvariant(Step, OrigLoop)) && 2547 "Induction step should be loop invariant"); 2548 if (auto *E = dyn_cast<SCEVUnknown>(Step)) 2549 return E->getValue(); 2550 2551 SCEVExpander Exp(SE, DL, "induction"); 2552 return Exp.expandCodeFor(Step, Step->getType(), InsertBefore); 2553 } 2554 2555 /// Compute the transformed value of Index at offset StartValue using step 2556 /// StepValue. 2557 /// For integer induction, returns StartValue + Index * StepValue. 2558 /// For pointer induction, returns StartValue[Index * StepValue]. 2559 /// FIXME: The newly created binary instructions should contain nsw/nuw 2560 /// flags, which can be found from the original scalar operations. 2561 static Value *emitTransformedIndex(IRBuilderBase &B, Value *Index, 2562 Value *StartValue, Value *Step, 2563 const InductionDescriptor &ID) { 2564 assert(Index->getType()->getScalarType() == Step->getType() && 2565 "Index scalar type does not match StepValue type"); 2566 2567 // Note: the IR at this point is broken. 
We cannot use SE to create any new 2568 // SCEV and then expand it, hoping that SCEV's simplification will give us 2569 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may 2570 // lead to various SCEV crashes. So all we can do is to use builder and rely 2571 // on InstCombine for future simplifications. Here we handle some trivial 2572 // cases only. 2573 auto CreateAdd = [&B](Value *X, Value *Y) { 2574 assert(X->getType() == Y->getType() && "Types don't match!"); 2575 if (auto *CX = dyn_cast<ConstantInt>(X)) 2576 if (CX->isZero()) 2577 return Y; 2578 if (auto *CY = dyn_cast<ConstantInt>(Y)) 2579 if (CY->isZero()) 2580 return X; 2581 return B.CreateAdd(X, Y); 2582 }; 2583 2584 // We allow X to be a vector type, in which case Y will potentially be 2585 // splatted into a vector with the same element count. 2586 auto CreateMul = [&B](Value *X, Value *Y) { 2587 assert(X->getType()->getScalarType() == Y->getType() && 2588 "Types don't match!"); 2589 if (auto *CX = dyn_cast<ConstantInt>(X)) 2590 if (CX->isOne()) 2591 return Y; 2592 if (auto *CY = dyn_cast<ConstantInt>(Y)) 2593 if (CY->isOne()) 2594 return X; 2595 VectorType *XVTy = dyn_cast<VectorType>(X->getType()); 2596 if (XVTy && !isa<VectorType>(Y->getType())) 2597 Y = B.CreateVectorSplat(XVTy->getElementCount(), Y); 2598 return B.CreateMul(X, Y); 2599 }; 2600 2601 switch (ID.getKind()) { 2602 case InductionDescriptor::IK_IntInduction: { 2603 assert(!isa<VectorType>(Index->getType()) && 2604 "Vector indices not supported for integer inductions yet"); 2605 assert(Index->getType() == StartValue->getType() && 2606 "Index type does not match StartValue type"); 2607 if (isa<ConstantInt>(Step) && cast<ConstantInt>(Step)->isMinusOne()) 2608 return B.CreateSub(StartValue, Index); 2609 auto *Offset = CreateMul(Index, Step); 2610 return CreateAdd(StartValue, Offset); 2611 } 2612 case InductionDescriptor::IK_PtrInduction: { 2613 assert(isa<Constant>(Step) && 2614 "Expected constant step for pointer induction"); 2615 return B.CreateGEP(ID.getElementType(), StartValue, CreateMul(Index, Step)); 2616 } 2617 case InductionDescriptor::IK_FpInduction: { 2618 assert(!isa<VectorType>(Index->getType()) && 2619 "Vector indices not supported for FP inductions yet"); 2620 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); 2621 auto InductionBinOp = ID.getInductionBinOp(); 2622 assert(InductionBinOp && 2623 (InductionBinOp->getOpcode() == Instruction::FAdd || 2624 InductionBinOp->getOpcode() == Instruction::FSub) && 2625 "Original bin op should be defined for FP induction"); 2626 2627 Value *MulExp = B.CreateFMul(Step, Index); 2628 return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, 2629 "induction"); 2630 } 2631 case InductionDescriptor::IK_NoInduction: 2632 return nullptr; 2633 } 2634 llvm_unreachable("invalid enum"); 2635 } 2636 2637 void InnerLoopVectorizer::widenIntOrFpInduction( 2638 PHINode *IV, VPWidenIntOrFpInductionRecipe *Def, VPTransformState &State, 2639 Value *CanonicalIV) { 2640 Value *Start = Def->getStartValue()->getLiveInIRValue(); 2641 const InductionDescriptor &ID = Def->getInductionDescriptor(); 2642 TruncInst *Trunc = Def->getTruncInst(); 2643 IRBuilderBase &Builder = State.Builder; 2644 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); 2645 assert(!State.VF.isZero() && "VF must be non-zero"); 2646 2647 // The value from the original loop to which we are mapping the new induction 2648 // variable. 2649 Instruction *EntryVal = Trunc ? 
cast<Instruction>(Trunc) : IV; 2650 2651 auto &DL = EntryVal->getModule()->getDataLayout(); 2652 2653 // Generate code for the induction step. Note that induction steps are 2654 // required to be loop-invariant 2655 auto CreateStepValue = [&](const SCEV *Step) -> Value * { 2656 assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) && 2657 "Induction step should be loop invariant"); 2658 if (PSE.getSE()->isSCEVable(IV->getType())) { 2659 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 2660 return Exp.expandCodeFor(Step, Step->getType(), 2661 State.CFG.VectorPreHeader->getTerminator()); 2662 } 2663 return cast<SCEVUnknown>(Step)->getValue(); 2664 }; 2665 2666 // The scalar value to broadcast. This is derived from the canonical 2667 // induction variable. If a truncation type is given, truncate the canonical 2668 // induction variable and step. Otherwise, derive these values from the 2669 // induction descriptor. 2670 auto CreateScalarIV = [&](Value *&Step) -> Value * { 2671 Value *ScalarIV = CanonicalIV; 2672 Type *NeededType = IV->getType(); 2673 if (!Def->isCanonical() || ScalarIV->getType() != NeededType) { 2674 ScalarIV = 2675 NeededType->isIntegerTy() 2676 ? Builder.CreateSExtOrTrunc(ScalarIV, NeededType) 2677 : Builder.CreateCast(Instruction::SIToFP, ScalarIV, NeededType); 2678 ScalarIV = emitTransformedIndex(Builder, ScalarIV, Start, Step, ID); 2679 ScalarIV->setName("offset.idx"); 2680 } 2681 if (Trunc) { 2682 auto *TruncType = cast<IntegerType>(Trunc->getType()); 2683 assert(Step->getType()->isIntegerTy() && 2684 "Truncation requires an integer step"); 2685 ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType); 2686 Step = Builder.CreateTrunc(Step, TruncType); 2687 } 2688 return ScalarIV; 2689 }; 2690 2691 // Fast-math-flags propagate from the original induction instruction. 2692 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 2693 if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp())) 2694 Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags()); 2695 2696 // Now do the actual transformations, and start with creating the step value. 2697 Value *Step = CreateStepValue(ID.getStep()); 2698 if (State.VF.isScalar()) { 2699 Value *ScalarIV = CreateScalarIV(Step); 2700 Type *ScalarTy = IntegerType::get(ScalarIV->getContext(), 2701 Step->getType()->getScalarSizeInBits()); 2702 2703 Instruction::BinaryOps IncOp = ID.getInductionOpcode(); 2704 if (IncOp == Instruction::BinaryOpsEnd) 2705 IncOp = Instruction::Add; 2706 for (unsigned Part = 0; Part < UF; ++Part) { 2707 Value *StartIdx = ConstantInt::get(ScalarTy, Part); 2708 Instruction::BinaryOps MulOp = Instruction::Mul; 2709 if (Step->getType()->isFloatingPointTy()) { 2710 StartIdx = Builder.CreateUIToFP(StartIdx, Step->getType()); 2711 MulOp = Instruction::FMul; 2712 } 2713 2714 Value *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step); 2715 Value *EntryPart = Builder.CreateBinOp(IncOp, ScalarIV, Mul, "induction"); 2716 State.set(Def, EntryPart, Part); 2717 if (Trunc) { 2718 assert(!Step->getType()->isFloatingPointTy() && 2719 "fp inductions shouldn't be truncated"); 2720 addMetadata(EntryPart, Trunc); 2721 } 2722 } 2723 return; 2724 } 2725 2726 // Create a new independent vector induction variable, if one is needed. 2727 if (Def->needsVectorIV()) 2728 createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, State); 2729 2730 if (Def->needsScalarIV()) { 2731 // Create scalar steps that can be used by instructions we will later 2732 // scalarize. 
Note that the addition of the scalar steps will not increase 2733 // the number of instructions in the loop in the common case prior to 2734 // InstCombine. We will be trading one vector extract for each scalar step. 2735 Value *ScalarIV = CreateScalarIV(Step); 2736 buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, State); 2737 } 2738 } 2739 2740 void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def, 2741 const VPIteration &Instance, 2742 VPTransformState &State) { 2743 Value *ScalarInst = State.get(Def, Instance); 2744 Value *VectorValue = State.get(Def, Instance.Part); 2745 VectorValue = Builder.CreateInsertElement( 2746 VectorValue, ScalarInst, 2747 Instance.Lane.getAsRuntimeExpr(State.Builder, VF)); 2748 State.set(Def, VectorValue, Instance.Part); 2749 } 2750 2751 // Return whether we allow using masked interleave-groups (for dealing with 2752 // strided loads/stores that reside in predicated blocks, or for dealing 2753 // with gaps). 2754 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { 2755 // If an override option has been passed in for interleaved accesses, use it. 2756 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) 2757 return EnableMaskedInterleavedMemAccesses; 2758 2759 return TTI.enableMaskedInterleavedAccessVectorization(); 2760 } 2761 2762 // Try to vectorize the interleave group that \p Instr belongs to. 2763 // 2764 // E.g. Translate following interleaved load group (factor = 3): 2765 // for (i = 0; i < N; i+=3) { 2766 // R = Pic[i]; // Member of index 0 2767 // G = Pic[i+1]; // Member of index 1 2768 // B = Pic[i+2]; // Member of index 2 2769 // ... // do something to R, G, B 2770 // } 2771 // To: 2772 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2773 // %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements 2774 // %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements 2775 // %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements 2776 // 2777 // Or translate following interleaved store group (factor = 3): 2778 // for (i = 0; i < N; i+=3) { 2779 // ... do something to R, G, B 2780 // Pic[i] = R; // Member of index 0 2781 // Pic[i+1] = G; // Member of index 1 2782 // Pic[i+2] = B; // Member of index 2 2783 // } 2784 // To: 2785 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> 2786 // %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u> 2787 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, 2788 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements 2789 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B 2790 void InnerLoopVectorizer::vectorizeInterleaveGroup( 2791 const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs, 2792 VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues, 2793 VPValue *BlockInMask) { 2794 Instruction *Instr = Group->getInsertPos(); 2795 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2796 2797 // Prepare for the vector type of the interleaved load/store. 2798 Type *ScalarTy = getLoadStoreType(Instr); 2799 unsigned InterleaveFactor = Group->getFactor(); 2800 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2801 auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor); 2802 2803 // Prepare for the new pointers. 2804 SmallVector<Value *, 2> AddrParts; 2805 unsigned Index = Group->getIndex(Instr); 2806 2807 // TODO: extend the masked interleaved-group support to reversed access. 
2808 assert((!BlockInMask || !Group->isReverse()) && 2809 "Reversed masked interleave-group not supported."); 2810 2811 // If the group is reverse, adjust the index to refer to the last vector lane 2812 // instead of the first. We adjust the index from the first vector lane, 2813 // rather than directly getting the pointer for lane VF - 1, because the 2814 // pointer operand of the interleaved access is supposed to be uniform. For 2815 // uniform instructions, we're only required to generate a value for the 2816 // first vector lane in each unroll iteration. 2817 if (Group->isReverse()) 2818 Index += (VF.getKnownMinValue() - 1) * Group->getFactor(); 2819 2820 for (unsigned Part = 0; Part < UF; Part++) { 2821 Value *AddrPart = State.get(Addr, VPIteration(Part, 0)); 2822 setDebugLocFromInst(AddrPart); 2823 2824 // Notice current instruction could be any index. Need to adjust the address 2825 // to the member of index 0. 2826 // 2827 // E.g. a = A[i+1]; // Member of index 1 (Current instruction) 2828 // b = A[i]; // Member of index 0 2829 // Current pointer is pointed to A[i+1], adjust it to A[i]. 2830 // 2831 // E.g. A[i+1] = a; // Member of index 1 2832 // A[i] = b; // Member of index 0 2833 // A[i+2] = c; // Member of index 2 (Current instruction) 2834 // Current pointer is pointed to A[i+2], adjust it to A[i]. 2835 2836 bool InBounds = false; 2837 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts())) 2838 InBounds = gep->isInBounds(); 2839 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index)); 2840 cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds); 2841 2842 // Cast to the vector pointer type. 2843 unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace(); 2844 Type *PtrTy = VecTy->getPointerTo(AddressSpace); 2845 AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy)); 2846 } 2847 2848 setDebugLocFromInst(Instr); 2849 Value *PoisonVec = PoisonValue::get(VecTy); 2850 2851 Value *MaskForGaps = nullptr; 2852 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) { 2853 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); 2854 assert(MaskForGaps && "Mask for Gaps is required but it is null"); 2855 } 2856 2857 // Vectorize the interleaved load group. 2858 if (isa<LoadInst>(Instr)) { 2859 // For each unroll part, create a wide load for the group. 2860 SmallVector<Value *, 2> NewLoads; 2861 for (unsigned Part = 0; Part < UF; Part++) { 2862 Instruction *NewLoad; 2863 if (BlockInMask || MaskForGaps) { 2864 assert(useMaskedInterleavedAccesses(*TTI) && 2865 "masked interleaved groups are not allowed."); 2866 Value *GroupMask = MaskForGaps; 2867 if (BlockInMask) { 2868 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2869 Value *ShuffledMask = Builder.CreateShuffleVector( 2870 BlockInMaskPart, 2871 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2872 "interleaved.mask"); 2873 GroupMask = MaskForGaps 2874 ? Builder.CreateBinOp(Instruction::And, ShuffledMask, 2875 MaskForGaps) 2876 : ShuffledMask; 2877 } 2878 NewLoad = 2879 Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(), 2880 GroupMask, PoisonVec, "wide.masked.vec"); 2881 } 2882 else 2883 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], 2884 Group->getAlign(), "wide.vec"); 2885 Group->addMetadata(NewLoad); 2886 NewLoads.push_back(NewLoad); 2887 } 2888 2889 // For each member in the group, shuffle out the appropriate data from the 2890 // wide loads. 
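// For example (mirroring the R,G,B illustration above, with an assumed
// interleave factor of 3 and VF = 4): the member at index I == 1 is extracted
// from each wide load with the stride mask <1, 4, 7, 10> built below.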
2891 unsigned J = 0; 2892 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2893 Instruction *Member = Group->getMember(I); 2894 2895 // Skip the gaps in the group. 2896 if (!Member) 2897 continue; 2898 2899 auto StrideMask = 2900 createStrideMask(I, InterleaveFactor, VF.getKnownMinValue()); 2901 for (unsigned Part = 0; Part < UF; Part++) { 2902 Value *StridedVec = Builder.CreateShuffleVector( 2903 NewLoads[Part], StrideMask, "strided.vec"); 2904 2905 // If this member has different type, cast the result type. 2906 if (Member->getType() != ScalarTy) { 2907 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2908 VectorType *OtherVTy = VectorType::get(Member->getType(), VF); 2909 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); 2910 } 2911 2912 if (Group->isReverse()) 2913 StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse"); 2914 2915 State.set(VPDefs[J], StridedVec, Part); 2916 } 2917 ++J; 2918 } 2919 return; 2920 } 2921 2922 // The sub vector type for current instruction. 2923 auto *SubVT = VectorType::get(ScalarTy, VF); 2924 2925 // Vectorize the interleaved store group. 2926 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); 2927 assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) && 2928 "masked interleaved groups are not allowed."); 2929 assert((!MaskForGaps || !VF.isScalable()) && 2930 "masking gaps for scalable vectors is not yet supported."); 2931 for (unsigned Part = 0; Part < UF; Part++) { 2932 // Collect the stored vector from each member. 2933 SmallVector<Value *, 4> StoredVecs; 2934 for (unsigned i = 0; i < InterleaveFactor; i++) { 2935 assert((Group->getMember(i) || MaskForGaps) && 2936 "Fail to get a member from an interleaved store group"); 2937 Instruction *Member = Group->getMember(i); 2938 2939 // Skip the gaps in the group. 2940 if (!Member) { 2941 Value *Undef = PoisonValue::get(SubVT); 2942 StoredVecs.push_back(Undef); 2943 continue; 2944 } 2945 2946 Value *StoredVec = State.get(StoredValues[i], Part); 2947 2948 if (Group->isReverse()) 2949 StoredVec = Builder.CreateVectorReverse(StoredVec, "reverse"); 2950 2951 // If this member has different type, cast it to a unified type. 2952 2953 if (StoredVec->getType() != SubVT) 2954 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL); 2955 2956 StoredVecs.push_back(StoredVec); 2957 } 2958 2959 // Concatenate all vectors into a wide vector. 2960 Value *WideVec = concatenateVectors(Builder, StoredVecs); 2961 2962 // Interleave the elements in the wide vector. 2963 Value *IVec = Builder.CreateShuffleVector( 2964 WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor), 2965 "interleaved.vec"); 2966 2967 Instruction *NewStoreInstr; 2968 if (BlockInMask || MaskForGaps) { 2969 Value *GroupMask = MaskForGaps; 2970 if (BlockInMask) { 2971 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2972 Value *ShuffledMask = Builder.CreateShuffleVector( 2973 BlockInMaskPart, 2974 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2975 "interleaved.mask"); 2976 GroupMask = MaskForGaps ? 
Builder.CreateBinOp(Instruction::And, 2977 ShuffledMask, MaskForGaps) 2978 : ShuffledMask; 2979 } 2980 NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part], 2981 Group->getAlign(), GroupMask); 2982 } else 2983 NewStoreInstr = 2984 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign()); 2985 2986 Group->addMetadata(NewStoreInstr); 2987 } 2988 } 2989 2990 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, 2991 VPReplicateRecipe *RepRecipe, 2992 const VPIteration &Instance, 2993 bool IfPredicateInstr, 2994 VPTransformState &State) { 2995 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 2996 2997 // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for 2998 // the first lane and part. 2999 if (isa<NoAliasScopeDeclInst>(Instr)) 3000 if (!Instance.isFirstIteration()) 3001 return; 3002 3003 setDebugLocFromInst(Instr); 3004 3005 // Does this instruction return a value ? 3006 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 3007 3008 Instruction *Cloned = Instr->clone(); 3009 if (!IsVoidRetTy) 3010 Cloned->setName(Instr->getName() + ".cloned"); 3011 3012 // If the scalarized instruction contributes to the address computation of a 3013 // widen masked load/store which was in a basic block that needed predication 3014 // and is not predicated after vectorization, we can't propagate 3015 // poison-generating flags (nuw/nsw, exact, inbounds, etc.). The scalarized 3016 // instruction could feed a poison value to the base address of the widen 3017 // load/store. 3018 if (State.MayGeneratePoisonRecipes.contains(RepRecipe)) 3019 Cloned->dropPoisonGeneratingFlags(); 3020 3021 State.Builder.SetInsertPoint(Builder.GetInsertBlock(), 3022 Builder.GetInsertPoint()); 3023 // Replace the operands of the cloned instructions with their scalar 3024 // equivalents in the new loop. 3025 for (auto &I : enumerate(RepRecipe->operands())) { 3026 auto InputInstance = Instance; 3027 VPValue *Operand = I.value(); 3028 VPReplicateRecipe *OperandR = dyn_cast<VPReplicateRecipe>(Operand); 3029 if (OperandR && OperandR->isUniform()) 3030 InputInstance.Lane = VPLane::getFirstLane(); 3031 Cloned->setOperand(I.index(), State.get(Operand, InputInstance)); 3032 } 3033 addNewMetadata(Cloned, Instr); 3034 3035 // Place the cloned scalar in the new loop. 3036 Builder.Insert(Cloned); 3037 3038 State.set(RepRecipe, Cloned, Instance); 3039 3040 // If we just cloned a new assumption, add it the assumption cache. 3041 if (auto *II = dyn_cast<AssumeInst>(Cloned)) 3042 AC->registerAssumption(II); 3043 3044 // End if-block. 3045 if (IfPredicateInstr) 3046 PredicatedInstructions.push_back(Cloned); 3047 } 3048 3049 void InnerLoopVectorizer::createHeaderBranch(Loop *L) { 3050 BasicBlock *Header = L->getHeader(); 3051 assert(!L->getLoopLatch() && "loop should not have a latch at this point"); 3052 3053 IRBuilder<> B(Header->getTerminator()); 3054 Instruction *OldInst = 3055 getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()); 3056 setDebugLocFromInst(OldInst, &B); 3057 3058 // Connect the header to the exit and header blocks and replace the old 3059 // terminator. 3060 B.CreateCondBr(B.getTrue(), L->getUniqueExitBlock(), Header); 3061 3062 // Now we have two terminators. Remove the old one from the block. 
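// (The builder B was anchored at the old terminator, so the conditional
// branch created above sits immediately before it; the old terminator is
// still the block's last instruction and is what getTerminator() returns
// here.)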
3063 Header->getTerminator()->eraseFromParent(); 3064 } 3065 3066 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { 3067 if (TripCount) 3068 return TripCount; 3069 3070 assert(L && "Create Trip Count for null loop."); 3071 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 3072 // Find the loop boundaries. 3073 ScalarEvolution *SE = PSE.getSE(); 3074 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 3075 assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && 3076 "Invalid loop count"); 3077 3078 Type *IdxTy = Legal->getWidestInductionType(); 3079 assert(IdxTy && "No type for induction"); 3080 3081 // The exit count might have the type of i64 while the phi is i32. This can 3082 // happen if we have an induction variable that is sign extended before the 3083 // compare. The only way that we get a backedge taken count is that the 3084 // induction variable was signed and as such will not overflow. In such a case 3085 // truncation is legal. 3086 if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) > 3087 IdxTy->getPrimitiveSizeInBits()) 3088 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy); 3089 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); 3090 3091 // Get the total trip count from the count by adding 1. 3092 const SCEV *ExitCount = SE->getAddExpr( 3093 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 3094 3095 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); 3096 3097 // Expand the trip count and place the new instructions in the preheader. 3098 // Notice that the pre-header does not change, only the loop body. 3099 SCEVExpander Exp(*SE, DL, "induction"); 3100 3101 // Count holds the overall loop count (N). 3102 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), 3103 L->getLoopPreheader()->getTerminator()); 3104 3105 if (TripCount->getType()->isPointerTy()) 3106 TripCount = 3107 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", 3108 L->getLoopPreheader()->getTerminator()); 3109 3110 return TripCount; 3111 } 3112 3113 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { 3114 if (VectorTripCount) 3115 return VectorTripCount; 3116 3117 Value *TC = getOrCreateTripCount(L); 3118 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 3119 3120 Type *Ty = TC->getType(); 3121 // This is where we can make the step a runtime constant. 3122 Value *Step = createStepForVF(Builder, Ty, VF, UF); 3123 3124 // If the tail is to be folded by masking, round the number of iterations N 3125 // up to a multiple of Step instead of rounding down. This is done by first 3126 // adding Step-1 and then rounding down. Note that it's ok if this addition 3127 // overflows: the vector induction variable will eventually wrap to zero given 3128 // that it starts at zero and its Step is a power of two; the loop will then 3129 // exit, with the last early-exit vector comparison also producing all-true. 3130 if (Cost->foldTailByMasking()) { 3131 assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && 3132 "VF*UF must be a power of 2 when folding tail by masking"); 3133 Value *NumLanes = getRuntimeVF(Builder, Ty, VF * UF); 3134 TC = Builder.CreateAdd( 3135 TC, Builder.CreateSub(NumLanes, ConstantInt::get(Ty, 1)), "n.rnd.up"); 3136 } 3137 3138 // Now we need to generate the expression for the part of the loop that the 3139 // vectorized body will execute. 
This is equal to N - (N % Step) if scalar 3140 // iterations are not required for correctness, or N - Step, otherwise. Step 3141 // is equal to the vectorization factor (number of SIMD elements) times the 3142 // unroll factor (number of SIMD instructions). 3143 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); 3144 3145 // There are cases where we *must* run at least one iteration in the remainder 3146 // loop. See the cost model for when this can happen. If the step evenly 3147 // divides the trip count, we set the remainder to be equal to the step. If 3148 // the step does not evenly divide the trip count, no adjustment is necessary 3149 // since there will already be scalar iterations. Note that the minimum 3150 // iterations check ensures that N >= Step. 3151 if (Cost->requiresScalarEpilogue(VF)) { 3152 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); 3153 R = Builder.CreateSelect(IsZero, Step, R); 3154 } 3155 3156 VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); 3157 3158 return VectorTripCount; 3159 } 3160 3161 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, 3162 const DataLayout &DL) { 3163 // Verify that V is a vector type with same number of elements as DstVTy. 3164 auto *DstFVTy = cast<FixedVectorType>(DstVTy); 3165 unsigned VF = DstFVTy->getNumElements(); 3166 auto *SrcVecTy = cast<FixedVectorType>(V->getType()); 3167 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); 3168 Type *SrcElemTy = SrcVecTy->getElementType(); 3169 Type *DstElemTy = DstFVTy->getElementType(); 3170 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 3171 "Vector elements must have same size"); 3172 3173 // Do a direct cast if element types are castable. 3174 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 3175 return Builder.CreateBitOrPointerCast(V, DstFVTy); 3176 } 3177 // V cannot be directly casted to desired vector type. 3178 // May happen when V is a floating point vector but DstVTy is a vector of 3179 // pointers or vice-versa. Handle this using a two-step bitcast using an 3180 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 3181 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 3182 "Only one type should be a pointer type"); 3183 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 3184 "Only one type should be a floating point type"); 3185 Type *IntTy = 3186 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 3187 auto *VecIntTy = FixedVectorType::get(IntTy, VF); 3188 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 3189 return Builder.CreateBitOrPointerCast(CastVal, DstFVTy); 3190 } 3191 3192 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, 3193 BasicBlock *Bypass) { 3194 Value *Count = getOrCreateTripCount(L); 3195 // Reuse existing vector loop preheader for TC checks. 3196 // Note that new preheader block is generated for vector loop. 3197 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 3198 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 3199 3200 // Generate code to check if the loop's trip count is less than VF * UF, or 3201 // equal to it in case a scalar epilogue is required; this implies that the 3202 // vector trip count is zero. This check also covers the case where adding one 3203 // to the backedge-taken count overflowed leading to an incorrect trip count 3204 // of zero. 
In this case we will also jump to the scalar loop. 3205 auto P = Cost->requiresScalarEpilogue(VF) ? ICmpInst::ICMP_ULE 3206 : ICmpInst::ICMP_ULT; 3207 3208 // If tail is to be folded, vector loop takes care of all iterations. 3209 Value *CheckMinIters = Builder.getFalse(); 3210 if (!Cost->foldTailByMasking()) { 3211 Value *Step = createStepForVF(Builder, Count->getType(), VF, UF); 3212 CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check"); 3213 } 3214 // Create new preheader for vector loop. 3215 LoopVectorPreHeader = 3216 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, 3217 "vector.ph"); 3218 3219 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 3220 DT->getNode(Bypass)->getIDom()) && 3221 "TC check is expected to dominate Bypass"); 3222 3223 // Update dominator for Bypass & LoopExit (if needed). 3224 DT->changeImmediateDominator(Bypass, TCCheckBlock); 3225 if (!Cost->requiresScalarEpilogue(VF)) 3226 // If there is an epilogue which must run, there's no edge from the 3227 // middle block to exit blocks and thus no need to update the immediate 3228 // dominator of the exit blocks. 3229 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 3230 3231 ReplaceInstWithInst( 3232 TCCheckBlock->getTerminator(), 3233 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 3234 LoopBypassBlocks.push_back(TCCheckBlock); 3235 } 3236 3237 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { 3238 3239 BasicBlock *const SCEVCheckBlock = 3240 RTChecks.emitSCEVChecks(L, Bypass, LoopVectorPreHeader, LoopExitBlock); 3241 if (!SCEVCheckBlock) 3242 return nullptr; 3243 3244 assert(!(SCEVCheckBlock->getParent()->hasOptSize() || 3245 (OptForSizeBasedOnProfile && 3246 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) && 3247 "Cannot SCEV check stride or overflow when optimizing for size"); 3248 3249 3250 // Update dominator only if this is first RT check. 3251 if (LoopBypassBlocks.empty()) { 3252 DT->changeImmediateDominator(Bypass, SCEVCheckBlock); 3253 if (!Cost->requiresScalarEpilogue(VF)) 3254 // If there is an epilogue which must run, there's no edge from the 3255 // middle block to exit blocks and thus no need to update the immediate 3256 // dominator of the exit blocks. 3257 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock); 3258 } 3259 3260 LoopBypassBlocks.push_back(SCEVCheckBlock); 3261 AddedSafetyChecks = true; 3262 return SCEVCheckBlock; 3263 } 3264 3265 BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, 3266 BasicBlock *Bypass) { 3267 // VPlan-native path does not do any analysis for runtime checks currently. 3268 if (EnableVPlanNativePath) 3269 return nullptr; 3270 3271 BasicBlock *const MemCheckBlock = 3272 RTChecks.emitMemRuntimeChecks(L, Bypass, LoopVectorPreHeader); 3273 3274 // Check if we generated code that checks in runtime if arrays overlap. We put 3275 // the checks into a separate block to make the more common case of few 3276 // elements faster. 
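// A null result means the runtime-check generator found nothing to check, so
// there is no bypass block to record and no versioning metadata to prepare.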
3277 if (!MemCheckBlock) 3278 return nullptr; 3279 3280 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) { 3281 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && 3282 "Cannot emit memory checks when optimizing for size, unless forced " 3283 "to vectorize."); 3284 ORE->emit([&]() { 3285 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize", 3286 L->getStartLoc(), L->getHeader()) 3287 << "Code-size may be reduced by not forcing " 3288 "vectorization, or by source-code modifications " 3289 "eliminating the need for runtime checks " 3290 "(e.g., adding 'restrict')."; 3291 }); 3292 } 3293 3294 LoopBypassBlocks.push_back(MemCheckBlock); 3295 3296 AddedSafetyChecks = true; 3297 3298 // We currently don't use LoopVersioning for the actual loop cloning but we 3299 // still use it to add the noalias metadata. 3300 LVer = std::make_unique<LoopVersioning>( 3301 *Legal->getLAI(), 3302 Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, 3303 DT, PSE.getSE()); 3304 LVer->prepareNoAliasMetadata(); 3305 return MemCheckBlock; 3306 } 3307 3308 Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { 3309 LoopScalarBody = OrigLoop->getHeader(); 3310 LoopVectorPreHeader = OrigLoop->getLoopPreheader(); 3311 assert(LoopVectorPreHeader && "Invalid loop structure"); 3312 LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr 3313 assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF)) && 3314 "multiple exit loop without required epilogue?"); 3315 3316 LoopMiddleBlock = 3317 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 3318 LI, nullptr, Twine(Prefix) + "middle.block"); 3319 LoopScalarPreHeader = 3320 SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI, 3321 nullptr, Twine(Prefix) + "scalar.ph"); 3322 3323 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3324 3325 // Set up the middle block terminator. Two cases: 3326 // 1) If we know that we must execute the scalar epilogue, emit an 3327 // unconditional branch. 3328 // 2) Otherwise, we must have a single unique exit block (due to how we 3329 // implement the multiple exit case). In this case, set up a conditional 3330 // branch from the middle block to the loop scalar preheader, and the 3331 // exit block. completeLoopSkeleton will update the condition to use an 3332 // iteration check, if required, to decide whether to execute the remainder. 3333 BranchInst *BrInst = Cost->requiresScalarEpilogue(VF) ? 3334 BranchInst::Create(LoopScalarPreHeader) : 3335 BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, 3336 Builder.getTrue()); 3337 BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3338 ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst); 3339 3340 // We intentionally don't let SplitBlock update LoopInfo since 3341 // LoopVectorBody should belong to a different loop than LoopVectorPreHeader. 3342 // LoopVectorBody is explicitly added to the correct place a few lines later. 3343 LoopVectorBody = 3344 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 3345 nullptr, nullptr, Twine(Prefix) + "vector.body"); 3346 3347 // Update dominator for loop exit. 3348 if (!Cost->requiresScalarEpilogue(VF)) 3349 // If there is an epilogue which must run, there's no edge from the 3350 // middle block to exit blocks and thus no need to update the immediate 3351 // dominator of the exit blocks.
3352 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); 3353 3354 // Create and register the new vector loop. 3355 Loop *Lp = LI->AllocateLoop(); 3356 Loop *ParentLoop = OrigLoop->getParentLoop(); 3357 3358 // Insert the new loop into the loop nest and register the new basic blocks 3359 // before calling any utilities such as SCEV that require valid LoopInfo. 3360 if (ParentLoop) { 3361 ParentLoop->addChildLoop(Lp); 3362 } else { 3363 LI->addTopLevelLoop(Lp); 3364 } 3365 Lp->addBasicBlockToLoop(LoopVectorBody, *LI); 3366 return Lp; 3367 } 3368 3369 void InnerLoopVectorizer::createInductionResumeValues( 3370 Loop *L, std::pair<BasicBlock *, Value *> AdditionalBypass) { 3371 assert(((AdditionalBypass.first && AdditionalBypass.second) || 3372 (!AdditionalBypass.first && !AdditionalBypass.second)) && 3373 "Inconsistent information about additional bypass."); 3374 3375 Value *VectorTripCount = getOrCreateVectorTripCount(L); 3376 assert(VectorTripCount && L && "Expected valid arguments"); 3377 // We are going to resume the execution of the scalar loop. 3378 // Go over all of the induction variables that we found and fix the 3379 // PHIs that are left in the scalar version of the loop. 3380 // The starting values of PHI nodes depend on the counter of the last 3381 // iteration in the vectorized loop. 3382 // If we come from a bypass edge then we need to start from the original 3383 // start value. 3384 Instruction *OldInduction = Legal->getPrimaryInduction(); 3385 for (auto &InductionEntry : Legal->getInductionVars()) { 3386 PHINode *OrigPhi = InductionEntry.first; 3387 InductionDescriptor II = InductionEntry.second; 3388 3389 // Create phi nodes to merge from the backedge-taken check block. 3390 PHINode *BCResumeVal = 3391 PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", 3392 LoopScalarPreHeader->getTerminator()); 3393 // Copy original phi DL over to the new one. 3394 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); 3395 Value *&EndValue = IVEndValues[OrigPhi]; 3396 Value *EndValueFromAdditionalBypass = AdditionalBypass.second; 3397 if (OrigPhi == OldInduction) { 3398 // We know what the end value is. 3399 EndValue = VectorTripCount; 3400 } else { 3401 IRBuilder<> B(L->getLoopPreheader()->getTerminator()); 3402 3403 // Fast-math-flags propagate from the original induction instruction. 3404 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) 3405 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); 3406 3407 Type *StepType = II.getStep()->getType(); 3408 Instruction::CastOps CastOp = 3409 CastInst::getCastOpcode(VectorTripCount, true, StepType, true); 3410 Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd"); 3411 Value *Step = 3412 CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint()); 3413 EndValue = emitTransformedIndex(B, CRD, II.getStartValue(), Step, II); 3414 EndValue->setName("ind.end"); 3415 3416 // Compute the end value for the additional bypass (if applicable). 
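// (No additional bypass is passed in by the plain skeleton below; one is
// supplied when another block, e.g. an iteration-count check created during
// epilogue vectorization, can also branch to the scalar preheader and needs
// its own resume value.)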
3417 if (AdditionalBypass.first) { 3418 B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt())); 3419 CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true, 3420 StepType, true); 3421 Value *Step = 3422 CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint()); 3423 CRD = 3424 B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd"); 3425 EndValueFromAdditionalBypass = 3426 emitTransformedIndex(B, CRD, II.getStartValue(), Step, II); 3427 EndValueFromAdditionalBypass->setName("ind.end"); 3428 } 3429 } 3430 // The new PHI merges the original incoming value, in case of a bypass, 3431 // or the value at the end of the vectorized loop. 3432 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); 3433 3434 // Fix the scalar body counter (PHI node). 3435 // The old induction's phi node in the scalar body needs the truncated 3436 // value. 3437 for (BasicBlock *BB : LoopBypassBlocks) 3438 BCResumeVal->addIncoming(II.getStartValue(), BB); 3439 3440 if (AdditionalBypass.first) 3441 BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first, 3442 EndValueFromAdditionalBypass); 3443 3444 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); 3445 } 3446 } 3447 3448 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L, 3449 MDNode *OrigLoopID) { 3450 assert(L && "Expected valid loop."); 3451 3452 // The trip counts should be cached by now. 3453 Value *Count = getOrCreateTripCount(L); 3454 Value *VectorTripCount = getOrCreateVectorTripCount(L); 3455 3456 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3457 3458 // Add a check in the middle block to see if we have completed 3459 // all of the iterations in the first vector loop. Three cases: 3460 // 1) If we require a scalar epilogue, there is no conditional branch as 3461 // we unconditionally branch to the scalar preheader. Do nothing. 3462 // 2) If (N - N%VF) == N, then we *don't* need to run the remainder. 3463 // Thus if tail is to be folded, we know we don't need to run the 3464 // remainder and we can use the previous value for the condition (true). 3465 // 3) Otherwise, construct a runtime check. 3466 if (!Cost->requiresScalarEpilogue(VF) && !Cost->foldTailByMasking()) { 3467 Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, 3468 Count, VectorTripCount, "cmp.n", 3469 LoopMiddleBlock->getTerminator()); 3470 3471 // Here we use the same DebugLoc as the scalar loop latch terminator instead 3472 // of the corresponding compare because they may have ended up with 3473 // different line numbers and we want to avoid awkward line stepping while 3474 // debugging. Eg. if the compare has got a line number inside the loop. 3475 CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3476 cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN); 3477 } 3478 3479 // Get ready to start creating new instructions into the vectorized body. 3480 assert(LoopVectorPreHeader == L->getLoopPreheader() && 3481 "Inconsistent vector loop preheader"); 3482 Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); 3483 3484 #ifdef EXPENSIVE_CHECKS 3485 assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 3486 LI->verify(*DT); 3487 #endif 3488 3489 return LoopVectorPreHeader; 3490 } 3491 3492 std::pair<BasicBlock *, Value *> 3493 InnerLoopVectorizer::createVectorizedLoopSkeleton() { 3494 /* 3495 In this function we generate a new loop. 
The new loop will contain 3496 the vectorized instructions while the old loop will continue to run the 3497 scalar remainder. 3498 3499 [ ] <-- loop iteration number check. 3500 / | 3501 / v 3502 | [ ] <-- vector loop bypass (may consist of multiple blocks). 3503 | / | 3504 | / v 3505 || [ ] <-- vector pre header. 3506 |/ | 3507 | v 3508 | [ ] \ 3509 | [ ]_| <-- vector loop. 3510 | | 3511 | v 3512 \ -[ ] <--- middle-block. 3513 \/ | 3514 /\ v 3515 | ->[ ] <--- new preheader. 3516 | | 3517 (opt) v <-- edge from middle to exit iff epilogue is not required. 3518 | [ ] \ 3519 | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue). 3520 \ | 3521 \ v 3522 >[ ] <-- exit block(s). 3523 ... 3524 */ 3525 3526 // Get the metadata of the original loop before it gets modified. 3527 MDNode *OrigLoopID = OrigLoop->getLoopID(); 3528 3529 // Workaround! Compute the trip count of the original loop and cache it 3530 // before we start modifying the CFG. This code has a systemic problem 3531 // wherein it tries to run analysis over partially constructed IR; this is 3532 // wrong, and not simply for SCEV. The trip count of the original loop 3533 // simply happens to be prone to hitting this in practice. In theory, we 3534 // can hit the same issue for any SCEV, or ValueTracking query done during 3535 // mutation. See PR49900. 3536 getOrCreateTripCount(OrigLoop); 3537 3538 // Create an empty vector loop, and prepare basic blocks for the runtime 3539 // checks. 3540 Loop *Lp = createVectorLoopSkeleton(""); 3541 3542 // Now, compare the new count to zero. If it is zero skip the vector loop and 3543 // jump to the scalar loop. This check also covers the case where the 3544 // backedge-taken count is uint##_max: adding one to it will overflow leading 3545 // to an incorrect trip count of zero. In this (rare) case we will also jump 3546 // to the scalar loop. 3547 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader); 3548 3549 // Generate the code to check any assumptions that we've made for SCEV 3550 // expressions. 3551 emitSCEVChecks(Lp, LoopScalarPreHeader); 3552 3553 // Generate the code that checks in runtime if arrays overlap. We put the 3554 // checks into a separate block to make the more common case of few elements 3555 // faster. 3556 emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 3557 3558 createHeaderBranch(Lp); 3559 3560 // Emit phis for the new starting index of the scalar loop. 3561 createInductionResumeValues(Lp); 3562 3563 return {completeLoopSkeleton(Lp, OrigLoopID), nullptr}; 3564 } 3565 3566 // Fix up external users of the induction variable. At this point, we are 3567 // in LCSSA form, with all external PHIs that use the IV having one input value, 3568 // coming from the remainder loop. We need those PHIs to also have a correct 3569 // value for the IV when arriving directly from the middle block. 3570 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, 3571 const InductionDescriptor &II, 3572 Value *CountRoundDown, Value *EndValue, 3573 BasicBlock *MiddleBlock) { 3574 // There are two kinds of external IV usages - those that use the value 3575 // computed in the last iteration (the PHI) and those that use the penultimate 3576 // value (the value that feeds into the phi from the loop latch). 3577 // We allow both, but they, obviously, have different values. 
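// For example (illustrative): given `for (i = 0; i < n; ++i)` with an integer
// IV, an out-of-loop use of the incremented value i.next receives EndValue,
// while an out-of-loop use of the phi i itself receives the penultimate value
// EndValue - Step, recomputed below as Start + Step * (CRD - 1).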
3578 3579 assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block"); 3580 3581 DenseMap<Value *, Value *> MissingVals; 3582 3583 // An external user of the last iteration's value should see the value that 3584 // the remainder loop uses to initialize its own IV. 3585 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch()); 3586 for (User *U : PostInc->users()) { 3587 Instruction *UI = cast<Instruction>(U); 3588 if (!OrigLoop->contains(UI)) { 3589 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3590 MissingVals[UI] = EndValue; 3591 } 3592 } 3593 3594 // An external user of the penultimate value needs to see EndValue - Step. 3595 // The simplest way to get this is to recompute it from the constituent SCEVs, 3596 // that is Start + (Step * (CRD - 1)). 3597 for (User *U : OrigPhi->users()) { 3598 auto *UI = cast<Instruction>(U); 3599 if (!OrigLoop->contains(UI)) { 3600 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3601 3602 IRBuilder<> B(MiddleBlock->getTerminator()); 3603 3604 // Fast-math-flags propagate from the original induction instruction. 3605 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) 3606 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); 3607 3608 Value *CountMinusOne = B.CreateSub( 3609 CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1)); 3610 Value *CMO = 3611 !II.getStep()->getType()->isIntegerTy() 3612 ? B.CreateCast(Instruction::SIToFP, CountMinusOne, 3613 II.getStep()->getType()) 3614 : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType()); 3615 CMO->setName("cast.cmo"); 3616 3617 Value *Step = CreateStepValue(II.getStep(), *PSE.getSE(), 3618 LoopVectorBody->getTerminator()); 3619 Value *Escape = 3620 emitTransformedIndex(B, CMO, II.getStartValue(), Step, II); 3621 Escape->setName("ind.escape"); 3622 MissingVals[UI] = Escape; 3623 } 3624 } 3625 3626 for (auto &I : MissingVals) { 3627 PHINode *PHI = cast<PHINode>(I.first); 3628 // One corner case we have to handle is two IVs "chasing" each other, 3629 // that is %IV2 = phi [...], [ %IV1, %latch ] 3630 // In this case, if IV1 has an external use, we need to avoid adding both 3631 // "last value of IV1" and "penultimate value of IV2". So, verify that we 3632 // don't already have an incoming value for the middle block. 3633 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) 3634 PHI->addIncoming(I.second, MiddleBlock); 3635 } 3636 } 3637 3638 namespace { 3639 3640 struct CSEDenseMapInfo { 3641 static bool canHandle(const Instruction *I) { 3642 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || 3643 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); 3644 } 3645 3646 static inline Instruction *getEmptyKey() { 3647 return DenseMapInfo<Instruction *>::getEmptyKey(); 3648 } 3649 3650 static inline Instruction *getTombstoneKey() { 3651 return DenseMapInfo<Instruction *>::getTombstoneKey(); 3652 } 3653 3654 static unsigned getHashValue(const Instruction *I) { 3655 assert(canHandle(I) && "Unknown instruction!"); 3656 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), 3657 I->value_op_end())); 3658 } 3659 3660 static bool isEqual(const Instruction *LHS, const Instruction *RHS) { 3661 if (LHS == getEmptyKey() || RHS == getEmptyKey() || 3662 LHS == getTombstoneKey() || RHS == getTombstoneKey()) 3663 return LHS == RHS; 3664 return LHS->isIdenticalTo(RHS); 3665 } 3666 }; 3667 3668 } // end anonymous namespace 3669 3670 /// Perform CSE of induction variable instructions.
3671 static void cse(BasicBlock *BB) { 3672 // Perform simple cse. 3673 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 3674 for (Instruction &In : llvm::make_early_inc_range(*BB)) { 3675 if (!CSEDenseMapInfo::canHandle(&In)) 3676 continue; 3677 3678 // Check if we can replace this instruction with any of the 3679 // visited instructions. 3680 if (Instruction *V = CSEMap.lookup(&In)) { 3681 In.replaceAllUsesWith(V); 3682 In.eraseFromParent(); 3683 continue; 3684 } 3685 3686 CSEMap[&In] = &In; 3687 } 3688 } 3689 3690 InstructionCost 3691 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF, 3692 bool &NeedToScalarize) const { 3693 Function *F = CI->getCalledFunction(); 3694 Type *ScalarRetTy = CI->getType(); 3695 SmallVector<Type *, 4> Tys, ScalarTys; 3696 for (auto &ArgOp : CI->args()) 3697 ScalarTys.push_back(ArgOp->getType()); 3698 3699 // Estimate cost of scalarized vector call. The source operands are assumed 3700 // to be vectors, so we need to extract individual elements from there, 3701 // execute VF scalar calls, and then gather the result into the vector return 3702 // value. 3703 InstructionCost ScalarCallCost = 3704 TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput); 3705 if (VF.isScalar()) 3706 return ScalarCallCost; 3707 3708 // Compute corresponding vector type for return value and arguments. 3709 Type *RetTy = ToVectorTy(ScalarRetTy, VF); 3710 for (Type *ScalarTy : ScalarTys) 3711 Tys.push_back(ToVectorTy(ScalarTy, VF)); 3712 3713 // Compute costs of unpacking argument values for the scalar calls and 3714 // packing the return values to a vector. 3715 InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF); 3716 3717 InstructionCost Cost = 3718 ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost; 3719 3720 // If we can't emit a vector call for this function, then the currently found 3721 // cost is the cost we need to return. 3722 NeedToScalarize = true; 3723 VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 3724 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); 3725 3726 if (!TLI || CI->isNoBuiltin() || !VecFunc) 3727 return Cost; 3728 3729 // If the corresponding vector cost is cheaper, return its cost. 
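// For example (illustrative numbers only): with VF = 4, a scalar call cost of
// 10 and a scalarization overhead of 8, the scalarized estimate above is
// 4 * 10 + 8 = 48; if a vector library variant exists and costs less than
// that, we return its cost and clear NeedToScalarize.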
3730 InstructionCost VectorCallCost = 3731 TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput); 3732 if (VectorCallCost < Cost) { 3733 NeedToScalarize = false; 3734 Cost = VectorCallCost; 3735 } 3736 return Cost; 3737 } 3738 3739 static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) { 3740 if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy())) 3741 return Elt; 3742 return VectorType::get(Elt, VF); 3743 } 3744 3745 InstructionCost 3746 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, 3747 ElementCount VF) const { 3748 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 3749 assert(ID && "Expected intrinsic call!"); 3750 Type *RetTy = MaybeVectorizeType(CI->getType(), VF); 3751 FastMathFlags FMF; 3752 if (auto *FPMO = dyn_cast<FPMathOperator>(CI)) 3753 FMF = FPMO->getFastMathFlags(); 3754 3755 SmallVector<const Value *> Arguments(CI->args()); 3756 FunctionType *FTy = CI->getCalledFunction()->getFunctionType(); 3757 SmallVector<Type *> ParamTys; 3758 std::transform(FTy->param_begin(), FTy->param_end(), 3759 std::back_inserter(ParamTys), 3760 [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); }); 3761 3762 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF, 3763 dyn_cast<IntrinsicInst>(CI)); 3764 return TTI.getIntrinsicInstrCost(CostAttrs, 3765 TargetTransformInfo::TCK_RecipThroughput); 3766 } 3767 3768 static Type *smallestIntegerVectorType(Type *T1, Type *T2) { 3769 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3770 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3771 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; 3772 } 3773 3774 static Type *largestIntegerVectorType(Type *T1, Type *T2) { 3775 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3776 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3777 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; 3778 } 3779 3780 void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) { 3781 // For every instruction `I` in MinBWs, truncate the operands, create a 3782 // truncated version of `I` and reextend its result. InstCombine runs 3783 // later and will remove any ext/trunc pairs. 3784 SmallPtrSet<Value *, 4> Erased; 3785 for (const auto &KV : Cost->getMinimalBitwidths()) { 3786 // If the value wasn't vectorized, we must maintain the original scalar 3787 // type. The absence of the value from State indicates that it 3788 // wasn't vectorized. 3789 // FIXME: Should not rely on getVPValue at this point. 3790 VPValue *Def = State.Plan->getVPValue(KV.first, true); 3791 if (!State.hasAnyVectorValue(Def)) 3792 continue; 3793 for (unsigned Part = 0; Part < UF; ++Part) { 3794 Value *I = State.get(Def, Part); 3795 if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I)) 3796 continue; 3797 Type *OriginalTy = I->getType(); 3798 Type *ScalarTruncatedTy = 3799 IntegerType::get(OriginalTy->getContext(), KV.second); 3800 auto *TruncatedTy = VectorType::get( 3801 ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getElementCount()); 3802 if (TruncatedTy == OriginalTy) 3803 continue; 3804 3805 IRBuilder<> B(cast<Instruction>(I)); 3806 auto ShrinkOperand = [&](Value *V) -> Value * { 3807 if (auto *ZI = dyn_cast<ZExtInst>(V)) 3808 if (ZI->getSrcTy() == TruncatedTy) 3809 return ZI->getOperand(0); 3810 return B.CreateZExtOrTrunc(V, TruncatedTy); 3811 }; 3812 3813 // The actual instruction modification depends on the instruction type, 3814 // unfortunately. 
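// For example (an illustrative sketch, not drawn from a real test): an add
// whose result only needs 8 bits is rewritten as
//   %a8 = trunc <4 x i32> %a to <4 x i8>
//   %b8 = trunc <4 x i32> %b to <4 x i8>
//   %add8 = add <4 x i8> %a8, %b8
//   %res = zext <4 x i8> %add8 to <4 x i32>
// relying on InstCombine to remove any ext/trunc pairs this leaves behind.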
3815 Value *NewI = nullptr; 3816 if (auto *BO = dyn_cast<BinaryOperator>(I)) { 3817 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)), 3818 ShrinkOperand(BO->getOperand(1))); 3819 3820 // Any wrapping introduced by shrinking this operation shouldn't be 3821 // considered undefined behavior. So, we can't unconditionally copy 3822 // arithmetic wrapping flags to NewI. 3823 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false); 3824 } else if (auto *CI = dyn_cast<ICmpInst>(I)) { 3825 NewI = 3826 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)), 3827 ShrinkOperand(CI->getOperand(1))); 3828 } else if (auto *SI = dyn_cast<SelectInst>(I)) { 3829 NewI = B.CreateSelect(SI->getCondition(), 3830 ShrinkOperand(SI->getTrueValue()), 3831 ShrinkOperand(SI->getFalseValue())); 3832 } else if (auto *CI = dyn_cast<CastInst>(I)) { 3833 switch (CI->getOpcode()) { 3834 default: 3835 llvm_unreachable("Unhandled cast!"); 3836 case Instruction::Trunc: 3837 NewI = ShrinkOperand(CI->getOperand(0)); 3838 break; 3839 case Instruction::SExt: 3840 NewI = B.CreateSExtOrTrunc( 3841 CI->getOperand(0), 3842 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3843 break; 3844 case Instruction::ZExt: 3845 NewI = B.CreateZExtOrTrunc( 3846 CI->getOperand(0), 3847 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3848 break; 3849 } 3850 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { 3851 auto Elements0 = 3852 cast<VectorType>(SI->getOperand(0)->getType())->getElementCount(); 3853 auto *O0 = B.CreateZExtOrTrunc( 3854 SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0)); 3855 auto Elements1 = 3856 cast<VectorType>(SI->getOperand(1)->getType())->getElementCount(); 3857 auto *O1 = B.CreateZExtOrTrunc( 3858 SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1)); 3859 3860 NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask()); 3861 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) { 3862 // Don't do anything with the operands, just extend the result. 3863 continue; 3864 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { 3865 auto Elements = 3866 cast<VectorType>(IE->getOperand(0)->getType())->getElementCount(); 3867 auto *O0 = B.CreateZExtOrTrunc( 3868 IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 3869 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); 3870 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); 3871 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { 3872 auto Elements = 3873 cast<VectorType>(EE->getOperand(0)->getType())->getElementCount(); 3874 auto *O0 = B.CreateZExtOrTrunc( 3875 EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 3876 NewI = B.CreateExtractElement(O0, EE->getOperand(2)); 3877 } else { 3878 // If we don't know what to do, be conservative and don't do anything. 3879 continue; 3880 } 3881 3882 // Lastly, extend the result. 3883 NewI->takeName(cast<Instruction>(I)); 3884 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); 3885 I->replaceAllUsesWith(Res); 3886 cast<Instruction>(I)->eraseFromParent(); 3887 Erased.insert(I); 3888 State.reset(Def, Res, Part); 3889 } 3890 } 3891 3892 // We'll have created a bunch of ZExts that are now parentless. Clean up. 3893 for (const auto &KV : Cost->getMinimalBitwidths()) { 3894 // If the value wasn't vectorized, we must maintain the original scalar 3895 // type. The absence of the value from State indicates that it 3896 // wasn't vectorized. 3897 // FIXME: Should not rely on getVPValue at this point. 
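// (This second pass only strips zexts created above that ended up unused,
// remapping the VPValue to the zext's operand instead.)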
3898 VPValue *Def = State.Plan->getVPValue(KV.first, true); 3899 if (!State.hasAnyVectorValue(Def)) 3900 continue; 3901 for (unsigned Part = 0; Part < UF; ++Part) { 3902 Value *I = State.get(Def, Part); 3903 ZExtInst *Inst = dyn_cast<ZExtInst>(I); 3904 if (Inst && Inst->use_empty()) { 3905 Value *NewI = Inst->getOperand(0); 3906 Inst->eraseFromParent(); 3907 State.reset(Def, NewI, Part); 3908 } 3909 } 3910 } 3911 } 3912 3913 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) { 3914 // Insert truncates and extends for any truncated instructions as hints to 3915 // InstCombine. 3916 if (VF.isVector()) 3917 truncateToMinimalBitwidths(State); 3918 3919 // Fix widened non-induction PHIs by setting up the PHI operands. 3920 if (OrigPHIsToFix.size()) { 3921 assert(EnableVPlanNativePath && 3922 "Unexpected non-induction PHIs for fixup in non VPlan-native path"); 3923 fixNonInductionPHIs(State); 3924 } 3925 3926 // At this point every instruction in the original loop is widened to a 3927 // vector form. Now we need to fix the recurrences in the loop. These PHI 3928 // nodes are currently empty because we did not want to introduce cycles. 3929 // This is the second stage of vectorizing recurrences. 3930 fixCrossIterationPHIs(State); 3931 3932 // Forget the original basic block. 3933 PSE.getSE()->forgetLoop(OrigLoop); 3934 3935 // If we inserted an edge from the middle block to the unique exit block, 3936 // update uses outside the loop (phis) to account for the newly inserted 3937 // edge. 3938 if (!Cost->requiresScalarEpilogue(VF)) { 3939 // Fix-up external users of the induction variables. 3940 for (auto &Entry : Legal->getInductionVars()) 3941 fixupIVUsers(Entry.first, Entry.second, 3942 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)), 3943 IVEndValues[Entry.first], LoopMiddleBlock); 3944 3945 fixLCSSAPHIs(State); 3946 } 3947 3948 for (Instruction *PI : PredicatedInstructions) 3949 sinkScalarOperands(&*PI); 3950 3951 // Remove redundant induction instructions. 3952 cse(LoopVectorBody); 3953 3954 // Set/update profile weights for the vector and remainder loops as original 3955 // loop iterations are now distributed among them. Note that original loop 3956 // represented by LoopScalarBody becomes remainder loop after vectorization. 3957 // 3958 // For cases like foldTailByMasking() and requiresScalarEpiloque() we may 3959 // end up getting slightly roughened result but that should be OK since 3960 // profile is not inherently precise anyway. Note also possible bypass of 3961 // vector code caused by legality checks is ignored, assigning all the weight 3962 // to the vector loop, optimistically. 3963 // 3964 // For scalable vectorization we can't know at compile time how many iterations 3965 // of the loop are handled in one vector iteration, so instead assume a pessimistic 3966 // vscale of '1'. 3967 setProfileInfoAfterUnrolling( 3968 LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody), 3969 LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF); 3970 } 3971 3972 void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) { 3973 // In order to support recurrences we need to be able to vectorize Phi nodes. 3974 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 3975 // stage #2: We now need to fix the recurrences by adding incoming edges to 3976 // the currently empty PHI nodes. 
At this point every instruction in the 3977 // original loop is widened to a vector form so we can use them to construct 3978 // the incoming edges. 3979 VPBasicBlock *Header = State.Plan->getEntry()->getEntryBasicBlock(); 3980 for (VPRecipeBase &R : Header->phis()) { 3981 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) 3982 fixReduction(ReductionPhi, State); 3983 else if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R)) 3984 fixFirstOrderRecurrence(FOR, State); 3985 } 3986 } 3987 3988 void InnerLoopVectorizer::fixFirstOrderRecurrence( 3989 VPFirstOrderRecurrencePHIRecipe *PhiR, VPTransformState &State) { 3990 // This is the second phase of vectorizing first-order recurrences. An 3991 // overview of the transformation is described below. Suppose we have the 3992 // following loop. 3993 // 3994 // for (int i = 0; i < n; ++i) 3995 // b[i] = a[i] - a[i - 1]; 3996 // 3997 // There is a first-order recurrence on "a". For this loop, the shorthand 3998 // scalar IR looks like: 3999 // 4000 // scalar.ph: 4001 // s_init = a[-1] 4002 // br scalar.body 4003 // 4004 // scalar.body: 4005 // i = phi [0, scalar.ph], [i+1, scalar.body] 4006 // s1 = phi [s_init, scalar.ph], [s2, scalar.body] 4007 // s2 = a[i] 4008 // b[i] = s2 - s1 4009 // br cond, scalar.body, ... 4010 // 4011 // In this example, s1 is a recurrence because it's value depends on the 4012 // previous iteration. In the first phase of vectorization, we created a 4013 // vector phi v1 for s1. We now complete the vectorization and produce the 4014 // shorthand vector IR shown below (for VF = 4, UF = 1). 4015 // 4016 // vector.ph: 4017 // v_init = vector(..., ..., ..., a[-1]) 4018 // br vector.body 4019 // 4020 // vector.body 4021 // i = phi [0, vector.ph], [i+4, vector.body] 4022 // v1 = phi [v_init, vector.ph], [v2, vector.body] 4023 // v2 = a[i, i+1, i+2, i+3]; 4024 // v3 = vector(v1(3), v2(0, 1, 2)) 4025 // b[i, i+1, i+2, i+3] = v2 - v3 4026 // br cond, vector.body, middle.block 4027 // 4028 // middle.block: 4029 // x = v2(3) 4030 // br scalar.ph 4031 // 4032 // scalar.ph: 4033 // s_init = phi [x, middle.block], [a[-1], otherwise] 4034 // br scalar.body 4035 // 4036 // After execution completes the vector loop, we extract the next value of 4037 // the recurrence (x) to use as the initial value in the scalar loop. 4038 4039 // Extract the last vector element in the middle block. This will be the 4040 // initial value for the recurrence when jumping to the scalar loop. 4041 VPValue *PreviousDef = PhiR->getBackedgeValue(); 4042 Value *Incoming = State.get(PreviousDef, UF - 1); 4043 auto *ExtractForScalar = Incoming; 4044 auto *IdxTy = Builder.getInt32Ty(); 4045 if (VF.isVector()) { 4046 auto *One = ConstantInt::get(IdxTy, 1); 4047 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4048 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF); 4049 auto *LastIdx = Builder.CreateSub(RuntimeVF, One); 4050 ExtractForScalar = Builder.CreateExtractElement(ExtractForScalar, LastIdx, 4051 "vector.recur.extract"); 4052 } 4053 // Extract the second last element in the middle block if the 4054 // Phi is used outside the loop. We need to extract the phi itself 4055 // and not the last element (the phi update in the current iteration). This 4056 // will be the value when jumping to the exit block from the LoopMiddleBlock, 4057 // when the scalar loop is not run at all. 
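// For example (continuing the VF = 4 illustration above): ExtractForScalar is
// v2(3), the last value produced by the recurrence, while the extract below
// takes v2(2), the value the scalar phi s1 would have held during the final
// iteration of this vector chunk.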
4058 Value *ExtractForPhiUsedOutsideLoop = nullptr; 4059 if (VF.isVector()) { 4060 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF); 4061 auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2)); 4062 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement( 4063 Incoming, Idx, "vector.recur.extract.for.phi"); 4064 } else if (UF > 1) 4065 // When the loop is unrolled without vectorizing, initialize 4066 // ExtractForPhiUsedOutsideLoop with the unrolled value just prior to the final 4067 // value of `Incoming`. This is analogous to the vectorized case above: 4068 // extracting the second-last element when VF > 1. 4069 ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2); 4070 4071 // Fix the initial value of the original recurrence in the scalar loop. 4072 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin()); 4073 PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue()); 4074 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init"); 4075 auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue(); 4076 for (auto *BB : predecessors(LoopScalarPreHeader)) { 4077 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit; 4078 Start->addIncoming(Incoming, BB); 4079 } 4080 4081 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start); 4082 Phi->setName("scalar.recur"); 4083 4084 // Finally, fix users of the recurrence outside the loop. The users will need 4085 // either the last value of the scalar recurrence or the last value of the 4086 // vector recurrence we extracted in the middle block. Since the loop is in 4087 // LCSSA form, we just need to find all the phi nodes for the original scalar 4088 // recurrence in the exit block, and then add an edge for the middle block. 4089 // Note that LCSSA does not imply single entry when the original scalar loop 4090 // had multiple exiting edges (as we always run the last iteration in the 4091 // scalar epilogue); in that case, there is no edge from middle to exit 4092 // and thus no phis need to be updated. 4093 if (!Cost->requiresScalarEpilogue(VF)) 4094 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) 4095 if (llvm::is_contained(LCSSAPhi.incoming_values(), Phi)) 4096 LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock); 4097 } 4098 4099 void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR, 4100 VPTransformState &State) { 4101 PHINode *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue()); 4102 // Get its reduction variable descriptor. 4103 assert(Legal->isReductionVariable(OrigPhi) && 4104 "Unable to find the reduction variable"); 4105 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); 4106 4107 RecurKind RK = RdxDesc.getRecurrenceKind(); 4108 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue(); 4109 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr(); 4110 setDebugLocFromInst(ReductionStartValue); 4111 4112 VPValue *LoopExitInstDef = PhiR->getBackedgeValue(); 4113 // This is the vector-clone of the value that leaves the loop. 4114 Type *VecTy = State.get(LoopExitInstDef, 0)->getType(); 4115 4116 // Wrap flags are in general invalid after vectorization, clear them. 4117 clearReductionWrapFlags(RdxDesc, State); 4118 4119 // Before each round, move the insertion point right between 4120 // the PHIs and the values we are going to write. 4121 // This allows us to write both PHINodes and the extractelement 4122 // instructions.
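  // As an illustrative sketch only (assuming a simple integer add reduction
  // with UF = 2; names below are shorthand, not the exact IR emitted), the
  // code created from this insertion point onward has roughly the form:
  //
  //   middle.block:
  //     bin.rdx = rdx.part1 + rdx.part0      ; combine the unrolled parts
  //     rdx     = reduce.add(bin.rdx)        ; horizontal target reduction
  //
  // with the bc.merge.rdx phi then created in the scalar preheader below.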
4123 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 4124 4125 setDebugLocFromInst(LoopExitInst); 4126 4127 Type *PhiTy = OrigPhi->getType(); 4128 // If tail is folded by masking, the vector value to leave the loop should be 4129 // a Select choosing between the vectorized LoopExitInst and vectorized Phi, 4130 // instead of the former. For an inloop reduction the reduction will already 4131 // be predicated, and does not need to be handled here. 4132 if (Cost->foldTailByMasking() && !PhiR->isInLoop()) { 4133 for (unsigned Part = 0; Part < UF; ++Part) { 4134 Value *VecLoopExitInst = State.get(LoopExitInstDef, Part); 4135 Value *Sel = nullptr; 4136 for (User *U : VecLoopExitInst->users()) { 4137 if (isa<SelectInst>(U)) { 4138 assert(!Sel && "Reduction exit feeding two selects"); 4139 Sel = U; 4140 } else 4141 assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select"); 4142 } 4143 assert(Sel && "Reduction exit feeds no select"); 4144 State.reset(LoopExitInstDef, Sel, Part); 4145 4146 // If the target can create a predicated operator for the reduction at no 4147 // extra cost in the loop (for example a predicated vadd), it can be 4148 // cheaper for the select to remain in the loop than be sunk out of it, 4149 // and so use the select value for the phi instead of the old 4150 // LoopExitValue. 4151 if (PreferPredicatedReductionSelect || 4152 TTI->preferPredicatedReductionSelect( 4153 RdxDesc.getOpcode(), PhiTy, 4154 TargetTransformInfo::ReductionFlags())) { 4155 auto *VecRdxPhi = 4156 cast<PHINode>(State.get(PhiR, Part)); 4157 VecRdxPhi->setIncomingValueForBlock( 4158 LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel); 4159 } 4160 } 4161 } 4162 4163 // If the vector reduction can be performed in a smaller type, we truncate 4164 // then extend the loop exit value to enable InstCombine to evaluate the 4165 // entire expression in the smaller type. 4166 if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) { 4167 assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!"); 4168 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); 4169 Builder.SetInsertPoint( 4170 LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator()); 4171 VectorParts RdxParts(UF); 4172 for (unsigned Part = 0; Part < UF; ++Part) { 4173 RdxParts[Part] = State.get(LoopExitInstDef, Part); 4174 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4175 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy) 4176 : Builder.CreateZExt(Trunc, VecTy); 4177 for (User *U : llvm::make_early_inc_range(RdxParts[Part]->users())) 4178 if (U != Trunc) { 4179 U->replaceUsesOfWith(RdxParts[Part], Extnd); 4180 RdxParts[Part] = Extnd; 4181 } 4182 } 4183 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 4184 for (unsigned Part = 0; Part < UF; ++Part) { 4185 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4186 State.reset(LoopExitInstDef, RdxParts[Part], Part); 4187 } 4188 } 4189 4190 // Reduce all of the unrolled parts into a single vector. 4191 Value *ReducedPartRdx = State.get(LoopExitInstDef, 0); 4192 unsigned Op = RecurrenceDescriptor::getOpcode(RK); 4193 4194 // The middle block terminator has already been assigned a DebugLoc here (the 4195 // OrigLoop's single latch terminator). 
We want the whole middle block to 4196 // appear to execute on this line because: (a) it is all compiler generated, 4197 // (b) these instructions are always executed after evaluating the latch 4198 // conditional branch, and (c) other passes may add new predecessors which 4199 // terminate on this line. This is the easiest way to ensure we don't 4200 // accidentally cause an extra step back into the loop while debugging. 4201 setDebugLocFromInst(LoopMiddleBlock->getTerminator()); 4202 if (PhiR->isOrdered()) 4203 ReducedPartRdx = State.get(LoopExitInstDef, UF - 1); 4204 else { 4205 // Floating-point operations should have some FMF to enable the reduction. 4206 IRBuilderBase::FastMathFlagGuard FMFG(Builder); 4207 Builder.setFastMathFlags(RdxDesc.getFastMathFlags()); 4208 for (unsigned Part = 1; Part < UF; ++Part) { 4209 Value *RdxPart = State.get(LoopExitInstDef, Part); 4210 if (Op != Instruction::ICmp && Op != Instruction::FCmp) { 4211 ReducedPartRdx = Builder.CreateBinOp( 4212 (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx"); 4213 } else if (RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK)) 4214 ReducedPartRdx = createSelectCmpOp(Builder, ReductionStartValue, RK, 4215 ReducedPartRdx, RdxPart); 4216 else 4217 ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart); 4218 } 4219 } 4220 4221 // Create the reduction after the loop. Note that inloop reductions create the 4222 // target reduction in the loop using a Reduction recipe. 4223 if (VF.isVector() && !PhiR->isInLoop()) { 4224 ReducedPartRdx = 4225 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, OrigPhi); 4226 // If the reduction can be performed in a smaller type, we need to extend 4227 // the reduction to the wider type before we branch to the original loop. 4228 if (PhiTy != RdxDesc.getRecurrenceType()) 4229 ReducedPartRdx = RdxDesc.isSigned() 4230 ? Builder.CreateSExt(ReducedPartRdx, PhiTy) 4231 : Builder.CreateZExt(ReducedPartRdx, PhiTy); 4232 } 4233 4234 PHINode *ResumePhi = 4235 dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue()); 4236 4237 // Create a phi node that merges control-flow from the backedge-taken check 4238 // block and the middle block. 4239 PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx", 4240 LoopScalarPreHeader->getTerminator()); 4241 4242 // If we are fixing reductions in the epilogue loop then we should already 4243 // have created a bc.merge.rdx Phi after the main vector body. Ensure that 4244 // we carry over the incoming values correctly. 4245 for (auto *Incoming : predecessors(LoopScalarPreHeader)) { 4246 if (Incoming == LoopMiddleBlock) 4247 BCBlockPhi->addIncoming(ReducedPartRdx, Incoming); 4248 else if (ResumePhi && llvm::is_contained(ResumePhi->blocks(), Incoming)) 4249 BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming), 4250 Incoming); 4251 else 4252 BCBlockPhi->addIncoming(ReductionStartValue, Incoming); 4253 } 4254 4255 // Set the resume value for this reduction. 4256 ReductionResumeValues.insert({&RdxDesc, BCBlockPhi}); 4257 4258 // Now, we need to fix the users of the reduction variable 4259 // inside and outside of the scalar remainder loop. 4260 4261 // We know that the loop is in LCSSA form. We need to update the PHI nodes 4262 // in the exit blocks. See comment on analogous loop in 4263 // fixFirstOrderRecurrence for a more complete explanation of the logic.
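  // Illustrative shorthand (not the exact IR): an exit-block phi such as
  //   sum.lcssa = phi [ sum.next, loop.latch ]
  // simply gains the extra incoming value [ rdx, middle.block ] here.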
4264 if (!Cost->requiresScalarEpilogue(VF)) 4265 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) 4266 if (llvm::is_contained(LCSSAPhi.incoming_values(), LoopExitInst)) 4267 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock); 4268 4269 // Fix the scalar loop reduction variable with the incoming reduction sum 4270 // from the vector body and from the backedge value. 4271 int IncomingEdgeBlockIdx = 4272 OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch()); 4273 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); 4274 // Pick the other block. 4275 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); 4276 OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); 4277 OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); 4278 } 4279 4280 void InnerLoopVectorizer::clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc, 4281 VPTransformState &State) { 4282 RecurKind RK = RdxDesc.getRecurrenceKind(); 4283 if (RK != RecurKind::Add && RK != RecurKind::Mul) 4284 return; 4285 4286 Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr(); 4287 assert(LoopExitInstr && "null loop exit instruction"); 4288 SmallVector<Instruction *, 8> Worklist; 4289 SmallPtrSet<Instruction *, 8> Visited; 4290 Worklist.push_back(LoopExitInstr); 4291 Visited.insert(LoopExitInstr); 4292 4293 while (!Worklist.empty()) { 4294 Instruction *Cur = Worklist.pop_back_val(); 4295 if (isa<OverflowingBinaryOperator>(Cur)) 4296 for (unsigned Part = 0; Part < UF; ++Part) { 4297 // FIXME: Should not rely on getVPValue at this point. 4298 Value *V = State.get(State.Plan->getVPValue(Cur, true), Part); 4299 cast<Instruction>(V)->dropPoisonGeneratingFlags(); 4300 } 4301 4302 for (User *U : Cur->users()) { 4303 Instruction *UI = cast<Instruction>(U); 4304 if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) && 4305 Visited.insert(UI).second) 4306 Worklist.push_back(UI); 4307 } 4308 } 4309 } 4310 4311 void InnerLoopVectorizer::fixLCSSAPHIs(VPTransformState &State) { 4312 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 4313 if (LCSSAPhi.getBasicBlockIndex(LoopMiddleBlock) != -1) 4314 // Some phis were already hand updated by the reduction and recurrence 4315 // code above, leave them alone. 4316 continue; 4317 4318 auto *IncomingValue = LCSSAPhi.getIncomingValue(0); 4319 // Non-instruction incoming values will have only one value. 4320 4321 VPLane Lane = VPLane::getFirstLane(); 4322 if (isa<Instruction>(IncomingValue) && 4323 !Cost->isUniformAfterVectorization(cast<Instruction>(IncomingValue), 4324 VF)) 4325 Lane = VPLane::getLastLaneForVF(VF); 4326 4327 // Can be a loop invariant incoming value or the last scalar value to be 4328 // extracted from the vectorized loop. 4329 // FIXME: Should not rely on getVPValue at this point. 4330 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4331 Value *lastIncomingValue = 4332 OrigLoop->isLoopInvariant(IncomingValue) 4333 ? IncomingValue 4334 : State.get(State.Plan->getVPValue(IncomingValue, true), 4335 VPIteration(UF - 1, Lane)); 4336 LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock); 4337 } 4338 } 4339 4340 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { 4341 // The basic block and loop containing the predicated instruction. 4342 auto *PredBB = PredInst->getParent(); 4343 auto *VectorLoop = LI->getLoopFor(PredBB); 4344 4345 // Initialize a worklist with the operands of the predicated instruction. 
4346 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end()); 4347 4348 // Holds instructions that we need to analyze again. An instruction may be 4349 // reanalyzed if we don't yet know if we can sink it or not. 4350 SmallVector<Instruction *, 8> InstsToReanalyze; 4351 4352 // Returns true if a given use occurs in the predicated block. Phi nodes use 4353 // their operands in their corresponding predecessor blocks. 4354 auto isBlockOfUsePredicated = [&](Use &U) -> bool { 4355 auto *I = cast<Instruction>(U.getUser()); 4356 BasicBlock *BB = I->getParent(); 4357 if (auto *Phi = dyn_cast<PHINode>(I)) 4358 BB = Phi->getIncomingBlock( 4359 PHINode::getIncomingValueNumForOperand(U.getOperandNo())); 4360 return BB == PredBB; 4361 }; 4362 4363 // Iteratively sink the scalarized operands of the predicated instruction 4364 // into the block we created for it. When an instruction is sunk, it's 4365 // operands are then added to the worklist. The algorithm ends after one pass 4366 // through the worklist doesn't sink a single instruction. 4367 bool Changed; 4368 do { 4369 // Add the instructions that need to be reanalyzed to the worklist, and 4370 // reset the changed indicator. 4371 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end()); 4372 InstsToReanalyze.clear(); 4373 Changed = false; 4374 4375 while (!Worklist.empty()) { 4376 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val()); 4377 4378 // We can't sink an instruction if it is a phi node, is not in the loop, 4379 // or may have side effects. 4380 if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) || 4381 I->mayHaveSideEffects()) 4382 continue; 4383 4384 // If the instruction is already in PredBB, check if we can sink its 4385 // operands. In that case, VPlan's sinkScalarOperands() succeeded in 4386 // sinking the scalar instruction I, hence it appears in PredBB; but it 4387 // may have failed to sink I's operands (recursively), which we try 4388 // (again) here. 4389 if (I->getParent() == PredBB) { 4390 Worklist.insert(I->op_begin(), I->op_end()); 4391 continue; 4392 } 4393 4394 // It's legal to sink the instruction if all its uses occur in the 4395 // predicated block. Otherwise, there's nothing to do yet, and we may 4396 // need to reanalyze the instruction. 4397 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) { 4398 InstsToReanalyze.push_back(I); 4399 continue; 4400 } 4401 4402 // Move the instruction to the beginning of the predicated block, and add 4403 // it's operands to the worklist. 4404 I->moveBefore(&*PredBB->getFirstInsertionPt()); 4405 Worklist.insert(I->op_begin(), I->op_end()); 4406 4407 // The sinking may have enabled other instructions to be sunk, so we will 4408 // need to iterate. 4409 Changed = true; 4410 } 4411 } while (Changed); 4412 } 4413 4414 void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) { 4415 for (PHINode *OrigPhi : OrigPHIsToFix) { 4416 VPWidenPHIRecipe *VPPhi = 4417 cast<VPWidenPHIRecipe>(State.Plan->getVPValue(OrigPhi)); 4418 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0)); 4419 // Make sure the builder has a valid insert point. 
4420 Builder.SetInsertPoint(NewPhi); 4421 for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) { 4422 VPValue *Inc = VPPhi->getIncomingValue(i); 4423 VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i); 4424 NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]); 4425 } 4426 } 4427 } 4428 4429 bool InnerLoopVectorizer::useOrderedReductions( 4430 const RecurrenceDescriptor &RdxDesc) { 4431 return Cost->useOrderedReductions(RdxDesc); 4432 } 4433 4434 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, 4435 VPWidenPHIRecipe *PhiR, 4436 VPTransformState &State) { 4437 PHINode *P = cast<PHINode>(PN); 4438 if (EnableVPlanNativePath) { 4439 // Currently we enter here in the VPlan-native path for non-induction 4440 // PHIs where all control flow is uniform. We simply widen these PHIs. 4441 // Create a vector phi with no operands - the vector phi operands will be 4442 // set at the end of vector code generation. 4443 Type *VecTy = (State.VF.isScalar()) 4444 ? PN->getType() 4445 : VectorType::get(PN->getType(), State.VF); 4446 Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi"); 4447 State.set(PhiR, VecPhi, 0); 4448 OrigPHIsToFix.push_back(P); 4449 4450 return; 4451 } 4452 4453 assert(PN->getParent() == OrigLoop->getHeader() && 4454 "Non-header phis should have been handled elsewhere"); 4455 4456 // In order to support recurrences we need to be able to vectorize Phi nodes. 4457 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 4458 // stage #1: We create a new vector PHI node with no incoming edges. We'll use 4459 // this value when we vectorize all of the instructions that use the PHI. 4460 4461 assert(!Legal->isReductionVariable(P) && 4462 "reductions should be handled elsewhere"); 4463 4464 setDebugLocFromInst(P); 4465 4466 // This PHINode must be an induction variable. 4467 // Make sure that we know about it. 4468 assert(Legal->getInductionVars().count(P) && "Not an induction variable"); 4469 4470 InductionDescriptor II = Legal->getInductionVars().lookup(P); 4471 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 4472 4473 auto *IVR = PhiR->getParent()->getPlan()->getCanonicalIV(); 4474 PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0)); 4475 4476 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 4477 // which can be found from the original scalar operations. 4478 switch (II.getKind()) { 4479 case InductionDescriptor::IK_NoInduction: 4480 llvm_unreachable("Unknown induction"); 4481 case InductionDescriptor::IK_IntInduction: 4482 case InductionDescriptor::IK_FpInduction: 4483 llvm_unreachable("Integer/fp induction is handled elsewhere."); 4484 case InductionDescriptor::IK_PtrInduction: { 4485 // Handle the pointer induction variable case. 4486 assert(P->getType()->isPointerTy() && "Unexpected type."); 4487 4488 if (Cost->isScalarAfterVectorization(P, State.VF)) { 4489 // This is the normalized GEP that starts counting at zero. 4490 Value *PtrInd = 4491 Builder.CreateSExtOrTrunc(CanonicalIV, II.getStep()->getType()); 4492 // Determine the number of scalars we need to generate for each unroll 4493 // iteration. If the instruction is uniform, we only need to generate the 4494 // first lane. Otherwise, we generate all VF values. 4495 bool IsUniform = vputils::onlyFirstLaneUsed(PhiR); 4496 assert((IsUniform || !State.VF.isScalable()) && 4497 "Cannot scalarize a scalable VF"); 4498 unsigned Lanes = IsUniform ? 
1 : State.VF.getFixedValue(); 4499 4500 for (unsigned Part = 0; Part < UF; ++Part) { 4501 Value *PartStart = 4502 createStepForVF(Builder, PtrInd->getType(), VF, Part); 4503 4504 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 4505 Value *Idx = Builder.CreateAdd( 4506 PartStart, ConstantInt::get(PtrInd->getType(), Lane)); 4507 Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); 4508 4509 Value *Step = CreateStepValue(II.getStep(), *PSE.getSE(), 4510 State.CFG.PrevBB->getTerminator()); 4511 Value *SclrGep = emitTransformedIndex(Builder, GlobalIdx, 4512 II.getStartValue(), Step, II); 4513 SclrGep->setName("next.gep"); 4514 State.set(PhiR, SclrGep, VPIteration(Part, Lane)); 4515 } 4516 } 4517 return; 4518 } 4519 assert(isa<SCEVConstant>(II.getStep()) && 4520 "Induction step not a SCEV constant!"); 4521 Type *PhiType = II.getStep()->getType(); 4522 4523 // Build a pointer phi 4524 Value *ScalarStartValue = PhiR->getStartValue()->getLiveInIRValue(); 4525 Type *ScStValueType = ScalarStartValue->getType(); 4526 PHINode *NewPointerPhi = 4527 PHINode::Create(ScStValueType, 2, "pointer.phi", CanonicalIV); 4528 NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader); 4529 4530 // A pointer induction, performed by using a gep 4531 BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 4532 Instruction *InductionLoc = LoopLatch->getTerminator(); 4533 const SCEV *ScalarStep = II.getStep(); 4534 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 4535 Value *ScalarStepValue = 4536 Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc); 4537 Value *RuntimeVF = getRuntimeVF(Builder, PhiType, VF); 4538 Value *NumUnrolledElems = 4539 Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF)); 4540 Value *InductionGEP = GetElementPtrInst::Create( 4541 II.getElementType(), NewPointerPhi, 4542 Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind", 4543 InductionLoc); 4544 NewPointerPhi->addIncoming(InductionGEP, LoopLatch); 4545 4546 // Create UF many actual address geps that use the pointer 4547 // phi as base and a vectorized version of the step value 4548 // (<step*0, ..., step*N>) as offset. 4549 for (unsigned Part = 0; Part < State.UF; ++Part) { 4550 Type *VecPhiType = VectorType::get(PhiType, State.VF); 4551 Value *StartOffsetScalar = 4552 Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part)); 4553 Value *StartOffset = 4554 Builder.CreateVectorSplat(State.VF, StartOffsetScalar); 4555 // Create a vector of consecutive numbers from zero to VF. 4556 StartOffset = 4557 Builder.CreateAdd(StartOffset, Builder.CreateStepVector(VecPhiType)); 4558 4559 Value *GEP = Builder.CreateGEP( 4560 II.getElementType(), NewPointerPhi, 4561 Builder.CreateMul( 4562 StartOffset, Builder.CreateVectorSplat(State.VF, ScalarStepValue), 4563 "vector.gep")); 4564 State.set(PhiR, GEP, Part); 4565 } 4566 } 4567 } 4568 } 4569 4570 /// A helper function for checking whether an integer division-related 4571 /// instruction may divide by zero (in which case it must be predicated if 4572 /// executed conditionally in the scalar code). 4573 /// TODO: It may be worthwhile to generalize and check isKnownNonZero(). 4574 /// Non-zero divisors that are non compile-time constants will not be 4575 /// converted into multiplication, so we will still end up scalarizing 4576 /// the division, but can do so w/o predication. 
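/// For example (illustrative only): in 'if (c) q = a / b;' with a
/// loop-varying, non-constant divisor b, the scalarized division must remain
/// predicated because b may be zero on lanes where c is false, whereas
/// 'a / 7' can safely be executed unconditionally.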
4577 static bool mayDivideByZero(Instruction &I) { 4578 assert((I.getOpcode() == Instruction::UDiv || 4579 I.getOpcode() == Instruction::SDiv || 4580 I.getOpcode() == Instruction::URem || 4581 I.getOpcode() == Instruction::SRem) && 4582 "Unexpected instruction"); 4583 Value *Divisor = I.getOperand(1); 4584 auto *CInt = dyn_cast<ConstantInt>(Divisor); 4585 return !CInt || CInt->isZero(); 4586 } 4587 4588 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, 4589 VPUser &ArgOperands, 4590 VPTransformState &State) { 4591 assert(!isa<DbgInfoIntrinsic>(I) && 4592 "DbgInfoIntrinsic should have been dropped during VPlan construction"); 4593 setDebugLocFromInst(&I); 4594 4595 Module *M = I.getParent()->getParent()->getParent(); 4596 auto *CI = cast<CallInst>(&I); 4597 4598 SmallVector<Type *, 4> Tys; 4599 for (Value *ArgOperand : CI->args()) 4600 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue())); 4601 4602 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 4603 4604 // The flag shows whether we use Intrinsic or a usual Call for vectorized 4605 // version of the instruction. 4606 // Is it beneficial to perform intrinsic call compared to lib call? 4607 bool NeedToScalarize = false; 4608 InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); 4609 InstructionCost IntrinsicCost = ID ? Cost->getVectorIntrinsicCost(CI, VF) : 0; 4610 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 4611 assert((UseVectorIntrinsic || !NeedToScalarize) && 4612 "Instruction should be scalarized elsewhere."); 4613 assert((IntrinsicCost.isValid() || CallCost.isValid()) && 4614 "Either the intrinsic cost or vector call cost must be valid"); 4615 4616 for (unsigned Part = 0; Part < UF; ++Part) { 4617 SmallVector<Type *, 2> TysForDecl = {CI->getType()}; 4618 SmallVector<Value *, 4> Args; 4619 for (auto &I : enumerate(ArgOperands.operands())) { 4620 // Some intrinsics have a scalar argument - don't replace it with a 4621 // vector. 4622 Value *Arg; 4623 if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index())) 4624 Arg = State.get(I.value(), Part); 4625 else { 4626 Arg = State.get(I.value(), VPIteration(0, 0)); 4627 if (hasVectorInstrinsicOverloadedScalarOpd(ID, I.index())) 4628 TysForDecl.push_back(Arg->getType()); 4629 } 4630 Args.push_back(Arg); 4631 } 4632 4633 Function *VectorF; 4634 if (UseVectorIntrinsic) { 4635 // Use vector version of the intrinsic. 4636 if (VF.isVector()) 4637 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); 4638 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); 4639 assert(VectorF && "Can't retrieve vector intrinsic."); 4640 } else { 4641 // Use vector version of the function call. 4642 const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 4643 #ifndef NDEBUG 4644 assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr && 4645 "Can't create vector function."); 4646 #endif 4647 VectorF = VFDatabase(*CI).getVectorizedFunction(Shape); 4648 } 4649 SmallVector<OperandBundleDef, 1> OpBundles; 4650 CI->getOperandBundlesAsDefs(OpBundles); 4651 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles); 4652 4653 if (isa<FPMathOperator>(V)) 4654 V->copyFastMathFlags(CI); 4655 4656 State.set(Def, V, Part); 4657 addMetadata(V, &I); 4658 } 4659 } 4660 4661 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { 4662 // We should not collect Scalars more than once per VF. 
Right now, this 4663 // function is called from collectUniformsAndScalars(), which already does 4664 // this check. Collecting Scalars for VF=1 does not make any sense. 4665 assert(VF.isVector() && Scalars.find(VF) == Scalars.end() && 4666 "This function should not be visited twice for the same VF"); 4667 4668 SmallSetVector<Instruction *, 8> Worklist; 4669 4670 // These sets are used to seed the analysis with pointers used by memory 4671 // accesses that will remain scalar. 4672 SmallSetVector<Instruction *, 8> ScalarPtrs; 4673 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; 4674 auto *Latch = TheLoop->getLoopLatch(); 4675 4676 // A helper that returns true if the use of Ptr by MemAccess will be scalar. 4677 // The pointer operands of loads and stores will be scalar as long as the 4678 // memory access is not a gather or scatter operation. The value operand of a 4679 // store will remain scalar if the store is scalarized. 4680 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) { 4681 InstWidening WideningDecision = getWideningDecision(MemAccess, VF); 4682 assert(WideningDecision != CM_Unknown && 4683 "Widening decision should be ready at this moment"); 4684 if (auto *Store = dyn_cast<StoreInst>(MemAccess)) 4685 if (Ptr == Store->getValueOperand()) 4686 return WideningDecision == CM_Scalarize; 4687 assert(Ptr == getLoadStorePointerOperand(MemAccess) && 4688 "Ptr is neither a value nor a pointer operand"); 4689 return WideningDecision != CM_GatherScatter; 4690 }; 4691 4692 // A helper that returns true if the given value is a bitcast or 4693 // getelementptr instruction contained in the loop. 4694 auto isLoopVaryingBitCastOrGEP = [&](Value *V) { 4695 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) || 4696 isa<GetElementPtrInst>(V)) && 4697 !TheLoop->isLoopInvariant(V); 4698 }; 4699 4700 // A helper that evaluates a memory access's use of a pointer. If the use will 4701 // be a scalar use and the pointer is only used by memory accesses, we place 4702 // the pointer in ScalarPtrs. Otherwise, the pointer is placed in 4703 // PossibleNonScalarPtrs. 4704 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) { 4705 // We only care about bitcast and getelementptr instructions contained in 4706 // the loop. 4707 if (!isLoopVaryingBitCastOrGEP(Ptr)) 4708 return; 4709 4710 // If the pointer has already been identified as scalar (e.g., if it was 4711 // also identified as uniform), there's nothing to do. 4712 auto *I = cast<Instruction>(Ptr); 4713 if (Worklist.count(I)) 4714 return; 4715 4716 // If the use of the pointer will be a scalar use, and all users of the 4717 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise, 4718 // place the pointer in PossibleNonScalarPtrs. 4719 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) { 4720 return isa<LoadInst>(U) || isa<StoreInst>(U); 4721 })) 4722 ScalarPtrs.insert(I); 4723 else 4724 PossibleNonScalarPtrs.insert(I); 4725 }; 4726 4727 // We seed the scalars analysis with three classes of instructions: (1) 4728 // instructions marked uniform-after-vectorization, (2) bitcast, 4729 // getelementptr and (pointer) phi instructions used by memory accesses 4730 // requiring a scalar use, and (3) instructions that are forced to be scalar. 4731 // 4732 // (1) Add to the worklist all instructions that have been identified as 4733 // uniform-after-vectorization.
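  // (For instance, the compare feeding the latch branch, or an address
  // computation used only by consecutive widened memory accesses, typically
  // ends up in Uniforms[VF].)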
4734 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end()); 4735 4736 // (2) Add to the worklist all bitcast and getelementptr instructions used by 4737 // memory accesses requiring a scalar use. The pointer operands of loads and 4738 // stores will be scalar as long as the memory accesses is not a gather or 4739 // scatter operation. The value operand of a store will remain scalar if the 4740 // store is scalarized. 4741 for (auto *BB : TheLoop->blocks()) 4742 for (auto &I : *BB) { 4743 if (auto *Load = dyn_cast<LoadInst>(&I)) { 4744 evaluatePtrUse(Load, Load->getPointerOperand()); 4745 } else if (auto *Store = dyn_cast<StoreInst>(&I)) { 4746 evaluatePtrUse(Store, Store->getPointerOperand()); 4747 evaluatePtrUse(Store, Store->getValueOperand()); 4748 } 4749 } 4750 for (auto *I : ScalarPtrs) 4751 if (!PossibleNonScalarPtrs.count(I)) { 4752 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n"); 4753 Worklist.insert(I); 4754 } 4755 4756 // Insert the forced scalars. 4757 // FIXME: Currently widenPHIInstruction() often creates a dead vector 4758 // induction variable when the PHI user is scalarized. 4759 auto ForcedScalar = ForcedScalars.find(VF); 4760 if (ForcedScalar != ForcedScalars.end()) 4761 for (auto *I : ForcedScalar->second) 4762 Worklist.insert(I); 4763 4764 // Expand the worklist by looking through any bitcasts and getelementptr 4765 // instructions we've already identified as scalar. This is similar to the 4766 // expansion step in collectLoopUniforms(); however, here we're only 4767 // expanding to include additional bitcasts and getelementptr instructions. 4768 unsigned Idx = 0; 4769 while (Idx != Worklist.size()) { 4770 Instruction *Dst = Worklist[Idx++]; 4771 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0))) 4772 continue; 4773 auto *Src = cast<Instruction>(Dst->getOperand(0)); 4774 if (llvm::all_of(Src->users(), [&](User *U) -> bool { 4775 auto *J = cast<Instruction>(U); 4776 return !TheLoop->contains(J) || Worklist.count(J) || 4777 ((isa<LoadInst>(J) || isa<StoreInst>(J)) && 4778 isScalarUse(J, Src)); 4779 })) { 4780 Worklist.insert(Src); 4781 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); 4782 } 4783 } 4784 4785 // An induction variable will remain scalar if all users of the induction 4786 // variable and induction variable update remain scalar. 4787 for (auto &Induction : Legal->getInductionVars()) { 4788 auto *Ind = Induction.first; 4789 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4790 4791 // If tail-folding is applied, the primary induction variable will be used 4792 // to feed a vector compare. 4793 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking()) 4794 continue; 4795 4796 // Returns true if \p Indvar is a pointer induction that is used directly by 4797 // load/store instruction \p I. 4798 auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar, 4799 Instruction *I) { 4800 return Induction.second.getKind() == 4801 InductionDescriptor::IK_PtrInduction && 4802 (isa<LoadInst>(I) || isa<StoreInst>(I)) && 4803 Indvar == getLoadStorePointerOperand(I) && isScalarUse(I, Indvar); 4804 }; 4805 4806 // Determine if all users of the induction variable are scalar after 4807 // vectorization. 
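  // Illustrative example (hypothetical loop): in
  //   for (i = 0; i < n; ++i) p[i] = q[i] + 1;
  // the only users of i are the address computations of the consecutive
  // accesses and the update i + 1, so i and its update can remain scalar;
  // if i were also stored as data, it would need a widened form as well.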
4808 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 4809 auto *I = cast<Instruction>(U); 4810 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 4811 IsDirectLoadStoreFromPtrIndvar(Ind, I); 4812 }); 4813 if (!ScalarInd) 4814 continue; 4815 4816 // Determine if all users of the induction variable update instruction are 4817 // scalar after vectorization. 4818 auto ScalarIndUpdate = 4819 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 4820 auto *I = cast<Instruction>(U); 4821 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 4822 IsDirectLoadStoreFromPtrIndvar(IndUpdate, I); 4823 }); 4824 if (!ScalarIndUpdate) 4825 continue; 4826 4827 // The induction variable and its update instruction will remain scalar. 4828 Worklist.insert(Ind); 4829 Worklist.insert(IndUpdate); 4830 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 4831 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 4832 << "\n"); 4833 } 4834 4835 Scalars[VF].insert(Worklist.begin(), Worklist.end()); 4836 } 4837 4838 bool LoopVectorizationCostModel::isScalarWithPredication( 4839 Instruction *I, ElementCount VF) const { 4840 if (!blockNeedsPredicationForAnyReason(I->getParent())) 4841 return false; 4842 switch(I->getOpcode()) { 4843 default: 4844 break; 4845 case Instruction::Load: 4846 case Instruction::Store: { 4847 if (!Legal->isMaskRequired(I)) 4848 return false; 4849 auto *Ptr = getLoadStorePointerOperand(I); 4850 auto *Ty = getLoadStoreType(I); 4851 Type *VTy = Ty; 4852 if (VF.isVector()) 4853 VTy = VectorType::get(Ty, VF); 4854 const Align Alignment = getLoadStoreAlignment(I); 4855 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) || 4856 TTI.isLegalMaskedGather(VTy, Alignment)) 4857 : !(isLegalMaskedStore(Ty, Ptr, Alignment) || 4858 TTI.isLegalMaskedScatter(VTy, Alignment)); 4859 } 4860 case Instruction::UDiv: 4861 case Instruction::SDiv: 4862 case Instruction::SRem: 4863 case Instruction::URem: 4864 return mayDivideByZero(*I); 4865 } 4866 return false; 4867 } 4868 4869 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( 4870 Instruction *I, ElementCount VF) { 4871 assert(isAccessInterleaved(I) && "Expecting interleaved access."); 4872 assert(getWideningDecision(I, VF) == CM_Unknown && 4873 "Decision should not be set yet."); 4874 auto *Group = getInterleavedAccessGroup(I); 4875 assert(Group && "Must have a group."); 4876 4877 // If the instruction's allocated size doesn't equal it's type size, it 4878 // requires padding and will be scalarized. 4879 auto &DL = I->getModule()->getDataLayout(); 4880 auto *ScalarTy = getLoadStoreType(I); 4881 if (hasIrregularType(ScalarTy, DL)) 4882 return false; 4883 4884 // Check if masking is required. 4885 // A Group may need masking for one of two reasons: it resides in a block that 4886 // needs predication, or it was decided to use masking to deal with gaps 4887 // (either a gap at the end of a load-access that may result in a speculative 4888 // load, or any gaps in a store-access). 
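  // Illustrative example: for an interleaved pair { A[2*i], A[2*i+1] } where
  // only A[2*i] is actually stored, the wide store has a gap and therefore
  // requires masking, whereas a fully-populated group in an unpredicated
  // block needs no mask at all.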
4889 bool PredicatedAccessRequiresMasking = 4890 blockNeedsPredicationForAnyReason(I->getParent()) && 4891 Legal->isMaskRequired(I); 4892 bool LoadAccessWithGapsRequiresEpilogMasking = 4893 isa<LoadInst>(I) && Group->requiresScalarEpilogue() && 4894 !isScalarEpilogueAllowed(); 4895 bool StoreAccessWithGapsRequiresMasking = 4896 isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()); 4897 if (!PredicatedAccessRequiresMasking && 4898 !LoadAccessWithGapsRequiresEpilogMasking && 4899 !StoreAccessWithGapsRequiresMasking) 4900 return true; 4901 4902 // If masked interleaving is required, we expect that the user/target had 4903 // enabled it, because otherwise it either wouldn't have been created or 4904 // it should have been invalidated by the CostModel. 4905 assert(useMaskedInterleavedAccesses(TTI) && 4906 "Masked interleave-groups for predicated accesses are not enabled."); 4907 4908 if (Group->isReverse()) 4909 return false; 4910 4911 auto *Ty = getLoadStoreType(I); 4912 const Align Alignment = getLoadStoreAlignment(I); 4913 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment) 4914 : TTI.isLegalMaskedStore(Ty, Alignment); 4915 } 4916 4917 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( 4918 Instruction *I, ElementCount VF) { 4919 // Get and ensure we have a valid memory instruction. 4920 assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction"); 4921 4922 auto *Ptr = getLoadStorePointerOperand(I); 4923 auto *ScalarTy = getLoadStoreType(I); 4924 4925 // In order to be widened, the pointer should be consecutive, first of all. 4926 if (!Legal->isConsecutivePtr(ScalarTy, Ptr)) 4927 return false; 4928 4929 // If the instruction is a store located in a predicated block, it will be 4930 // scalarized. 4931 if (isScalarWithPredication(I, VF)) 4932 return false; 4933 4934 // If the instruction's allocated size doesn't equal it's type size, it 4935 // requires padding and will be scalarized. 4936 auto &DL = I->getModule()->getDataLayout(); 4937 if (hasIrregularType(ScalarTy, DL)) 4938 return false; 4939 4940 return true; 4941 } 4942 4943 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { 4944 // We should not collect Uniforms more than once per VF. Right now, 4945 // this function is called from collectUniformsAndScalars(), which 4946 // already does this check. Collecting Uniforms for VF=1 does not make any 4947 // sense. 4948 4949 assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() && 4950 "This function should not be visited twice for the same VF"); 4951 4952 // Visit the list of Uniforms. If we'll not find any uniform value, we'll 4953 // not analyze again. Uniforms.count(VF) will return 1. 4954 Uniforms[VF].clear(); 4955 4956 // We now know that the loop is vectorizable! 4957 // Collect instructions inside the loop that will remain uniform after 4958 // vectorization. 4959 4960 // Global values, params and instructions outside of current loop are out of 4961 // scope. 4962 auto isOutOfScope = [&](Value *V) -> bool { 4963 Instruction *I = dyn_cast<Instruction>(V); 4964 return (!I || !TheLoop->contains(I)); 4965 }; 4966 4967 // Worklist containing uniform instructions demanding lane 0. 4968 SetVector<Instruction *> Worklist; 4969 BasicBlock *Latch = TheLoop->getLoopLatch(); 4970 4971 // Add uniform instructions demanding lane 0 to the worklist. 
Instructions 4972 // that are scalar with predication must not be considered uniform after 4973 // vectorization, because that would create an erroneous replicating region 4974 // where only a single instance out of VF should be formed. 4975 // TODO: optimize such seldom cases if found important, see PR40816. 4976 auto addToWorklistIfAllowed = [&](Instruction *I) -> void { 4977 if (isOutOfScope(I)) { 4978 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: " 4979 << *I << "\n"); 4980 return; 4981 } 4982 if (isScalarWithPredication(I, VF)) { 4983 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " 4984 << *I << "\n"); 4985 return; 4986 } 4987 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n"); 4988 Worklist.insert(I); 4989 }; 4990 4991 // Start with the conditional branch. If the branch condition is an 4992 // instruction contained in the loop that is only used by the branch, it is 4993 // uniform. 4994 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 4995 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) 4996 addToWorklistIfAllowed(Cmp); 4997 4998 auto isUniformDecision = [&](Instruction *I, ElementCount VF) { 4999 InstWidening WideningDecision = getWideningDecision(I, VF); 5000 assert(WideningDecision != CM_Unknown && 5001 "Widening decision should be ready at this moment"); 5002 5003 // A uniform memory op is itself uniform. We exclude uniform stores 5004 // here as they demand the last lane, not the first one. 5005 if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) { 5006 assert(WideningDecision == CM_Scalarize); 5007 return true; 5008 } 5009 5010 return (WideningDecision == CM_Widen || 5011 WideningDecision == CM_Widen_Reverse || 5012 WideningDecision == CM_Interleave); 5013 }; 5014 5015 5016 // Returns true if Ptr is the pointer operand of a memory access instruction 5017 // I, and I is known to not require scalarization. 5018 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { 5019 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); 5020 }; 5021 5022 // Holds a list of values which are known to have at least one uniform use. 5023 // Note that there may be other uses which aren't uniform. A "uniform use" 5024 // here is something which only demands lane 0 of the unrolled iterations; 5025 // it does not imply that all lanes produce the same value (e.g. this is not 5026 // the usual meaning of uniform) 5027 SetVector<Value *> HasUniformUse; 5028 5029 // Scan the loop for instructions which are either a) known to have only 5030 // lane 0 demanded or b) are uses which demand only lane 0 of their operand. 5031 for (auto *BB : TheLoop->blocks()) 5032 for (auto &I : *BB) { 5033 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) { 5034 switch (II->getIntrinsicID()) { 5035 case Intrinsic::sideeffect: 5036 case Intrinsic::experimental_noalias_scope_decl: 5037 case Intrinsic::assume: 5038 case Intrinsic::lifetime_start: 5039 case Intrinsic::lifetime_end: 5040 if (TheLoop->hasLoopInvariantOperands(&I)) 5041 addToWorklistIfAllowed(&I); 5042 break; 5043 default: 5044 break; 5045 } 5046 } 5047 5048 // ExtractValue instructions must be uniform, because the operands are 5049 // known to be loop-invariant. 
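  // Illustrative shorthand (hypothetical names): with %agg defined before
  // the loop,
  //   %v = extractvalue { i32, i1 } %agg, 0
  // computes the same value for every lane, so it is added to the worklist.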
5050 if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) { 5051 assert(isOutOfScope(EVI->getAggregateOperand()) && 5052 "Expected aggregate value to be loop invariant"); 5053 addToWorklistIfAllowed(EVI); 5054 continue; 5055 } 5056 5057 // If there's no pointer operand, there's nothing to do. 5058 auto *Ptr = getLoadStorePointerOperand(&I); 5059 if (!Ptr) 5060 continue; 5061 5062 // A uniform memory op is itself uniform. We exclude uniform stores 5063 // here as they demand the last lane, not the first one. 5064 if (isa<LoadInst>(I) && Legal->isUniformMemOp(I)) 5065 addToWorklistIfAllowed(&I); 5066 5067 if (isUniformDecision(&I, VF)) { 5068 assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check"); 5069 HasUniformUse.insert(Ptr); 5070 } 5071 } 5072 5073 // Add to the worklist any operands which have *only* uniform (e.g. lane 0 5074 // demanding) users. Since loops are assumed to be in LCSSA form, this 5075 // disallows uses outside the loop as well. 5076 for (auto *V : HasUniformUse) { 5077 if (isOutOfScope(V)) 5078 continue; 5079 auto *I = cast<Instruction>(V); 5080 auto UsersAreMemAccesses = 5081 llvm::all_of(I->users(), [&](User *U) -> bool { 5082 return isVectorizedMemAccessUse(cast<Instruction>(U), V); 5083 }); 5084 if (UsersAreMemAccesses) 5085 addToWorklistIfAllowed(I); 5086 } 5087 5088 // Expand Worklist in topological order: whenever a new instruction 5089 // is added , its users should be already inside Worklist. It ensures 5090 // a uniform instruction will only be used by uniform instructions. 5091 unsigned idx = 0; 5092 while (idx != Worklist.size()) { 5093 Instruction *I = Worklist[idx++]; 5094 5095 for (auto OV : I->operand_values()) { 5096 // isOutOfScope operands cannot be uniform instructions. 5097 if (isOutOfScope(OV)) 5098 continue; 5099 // First order recurrence Phi's should typically be considered 5100 // non-uniform. 5101 auto *OP = dyn_cast<PHINode>(OV); 5102 if (OP && Legal->isFirstOrderRecurrence(OP)) 5103 continue; 5104 // If all the users of the operand are uniform, then add the 5105 // operand into the uniform worklist. 5106 auto *OI = cast<Instruction>(OV); 5107 if (llvm::all_of(OI->users(), [&](User *U) -> bool { 5108 auto *J = cast<Instruction>(U); 5109 return Worklist.count(J) || isVectorizedMemAccessUse(J, OI); 5110 })) 5111 addToWorklistIfAllowed(OI); 5112 } 5113 } 5114 5115 // For an instruction to be added into Worklist above, all its users inside 5116 // the loop should also be in Worklist. However, this condition cannot be 5117 // true for phi nodes that form a cyclic dependence. We must process phi 5118 // nodes separately. An induction variable will remain uniform if all users 5119 // of the induction variable and induction variable update remain uniform. 5120 // The code below handles both pointer and non-pointer induction variables. 5121 for (auto &Induction : Legal->getInductionVars()) { 5122 auto *Ind = Induction.first; 5123 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 5124 5125 // Determine if all users of the induction variable are uniform after 5126 // vectorization. 5127 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 5128 auto *I = cast<Instruction>(U); 5129 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 5130 isVectorizedMemAccessUse(I, Ind); 5131 }); 5132 if (!UniformInd) 5133 continue; 5134 5135 // Determine if all users of the induction variable update instruction are 5136 // uniform after vectorization. 
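  // Illustrative example: if i.next = i + 1 is used only by the loop-latch
  // compare and by the header phi for i, only lane 0 of i.next is ever
  // demanded and the update stays uniform; if i.next were also stored as
  // data, every lane would be demanded and the check below would fail.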
5137 auto UniformIndUpdate = 5138 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 5139 auto *I = cast<Instruction>(U); 5140 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 5141 isVectorizedMemAccessUse(I, IndUpdate); 5142 }); 5143 if (!UniformIndUpdate) 5144 continue; 5145 5146 // The induction variable and its update instruction will remain uniform. 5147 addToWorklistIfAllowed(Ind); 5148 addToWorklistIfAllowed(IndUpdate); 5149 } 5150 5151 Uniforms[VF].insert(Worklist.begin(), Worklist.end()); 5152 } 5153 5154 bool LoopVectorizationCostModel::runtimeChecksRequired() { 5155 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); 5156 5157 if (Legal->getRuntimePointerChecking()->Need) { 5158 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz", 5159 "runtime pointer checks needed. Enable vectorization of this " 5160 "loop with '#pragma clang loop vectorize(enable)' when " 5161 "compiling with -Os/-Oz", 5162 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5163 return true; 5164 } 5165 5166 if (!PSE.getPredicate().isAlwaysTrue()) { 5167 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz", 5168 "runtime SCEV checks needed. Enable vectorization of this " 5169 "loop with '#pragma clang loop vectorize(enable)' when " 5170 "compiling with -Os/-Oz", 5171 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5172 return true; 5173 } 5174 5175 // FIXME: Avoid specializing for stride==1 instead of bailing out. 5176 if (!Legal->getLAI()->getSymbolicStrides().empty()) { 5177 reportVectorizationFailure("Runtime stride check for small trip count", 5178 "runtime stride == 1 checks needed. Enable vectorization of " 5179 "this loop without such check by compiling with -Os/-Oz", 5180 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5181 return true; 5182 } 5183 5184 return false; 5185 } 5186 5187 ElementCount 5188 LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) { 5189 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) 5190 return ElementCount::getScalable(0); 5191 5192 if (Hints->isScalableVectorizationDisabled()) { 5193 reportVectorizationInfo("Scalable vectorization is explicitly disabled", 5194 "ScalableVectorizationDisabled", ORE, TheLoop); 5195 return ElementCount::getScalable(0); 5196 } 5197 5198 LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n"); 5199 5200 auto MaxScalableVF = ElementCount::getScalable( 5201 std::numeric_limits<ElementCount::ScalarTy>::max()); 5202 5203 // Test that the loop-vectorizer can legalize all operations for this MaxVF. 5204 // FIXME: While for scalable vectors this is currently sufficient, this should 5205 // be replaced by a more detailed mechanism that filters out specific VFs, 5206 // instead of invalidating vectorization for a whole set of VFs based on the 5207 // MaxVF. 5208 5209 // Disable scalable vectorization if the loop contains unsupported reductions. 5210 if (!canVectorizeReductions(MaxScalableVF)) { 5211 reportVectorizationInfo( 5212 "Scalable vectorization not supported for the reduction " 5213 "operations found in this loop.", 5214 "ScalableVFUnfeasible", ORE, TheLoop); 5215 return ElementCount::getScalable(0); 5216 } 5217 5218 // Disable scalable vectorization if the loop contains any instructions 5219 // with element types not supported for scalable vectors. 
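  // For example (illustrative and target-dependent): a loop operating on
  // fp128 values would typically fail this check, because common scalable
  // vector extensions do not provide fp128 element types.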
5220 if (any_of(ElementTypesInLoop, [&](Type *Ty) { 5221 return !Ty->isVoidTy() && 5222 !this->TTI.isElementTypeLegalForScalableVector(Ty); 5223 })) { 5224 reportVectorizationInfo("Scalable vectorization is not supported " 5225 "for all element types found in this loop.", 5226 "ScalableVFUnfeasible", ORE, TheLoop); 5227 return ElementCount::getScalable(0); 5228 } 5229 5230 if (Legal->isSafeForAnyVectorWidth()) 5231 return MaxScalableVF; 5232 5233 // Limit MaxScalableVF by the maximum safe dependence distance. 5234 Optional<unsigned> MaxVScale = TTI.getMaxVScale(); 5235 if (!MaxVScale && TheFunction->hasFnAttribute(Attribute::VScaleRange)) 5236 MaxVScale = 5237 TheFunction->getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax(); 5238 MaxScalableVF = ElementCount::getScalable( 5239 MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0); 5240 if (!MaxScalableVF) 5241 reportVectorizationInfo( 5242 "Max legal vector width too small, scalable vectorization " 5243 "unfeasible.", 5244 "ScalableVFUnfeasible", ORE, TheLoop); 5245 5246 return MaxScalableVF; 5247 } 5248 5249 FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF( 5250 unsigned ConstTripCount, ElementCount UserVF, bool FoldTailByMasking) { 5251 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); 5252 unsigned SmallestType, WidestType; 5253 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); 5254 5255 // Get the maximum safe dependence distance in bits computed by LAA. 5256 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from 5257 // the memory accesses that is most restrictive (involved in the smallest 5258 // dependence distance). 5259 unsigned MaxSafeElements = 5260 PowerOf2Floor(Legal->getMaxSafeVectorWidthInBits() / WidestType); 5261 5262 auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements); 5263 auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements); 5264 5265 LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF 5266 << ".\n"); 5267 LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF 5268 << ".\n"); 5269 5270 // First analyze the UserVF, fall back if the UserVF should be ignored. 5271 if (UserVF) { 5272 auto MaxSafeUserVF = 5273 UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF; 5274 5275 if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) { 5276 // If `VF=vscale x N` is safe, then so is `VF=N` 5277 if (UserVF.isScalable()) 5278 return FixedScalableVFPair( 5279 ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF); 5280 else 5281 return UserVF; 5282 } 5283 5284 assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF)); 5285 5286 // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it 5287 // is better to ignore the hint and let the compiler choose a suitable VF. 
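  // Illustrative example: with a maximum safe fixed VF of 8, a user hint of
  // VF=16 is clamped to 8 below, whereas a scalable hint such as
  // VF=vscale x 16 is dropped entirely and the compiler re-picks a VF.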
5288 if (!UserVF.isScalable()) { 5289 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 5290 << " is unsafe, clamping to max safe VF=" 5291 << MaxSafeFixedVF << ".\n"); 5292 ORE->emit([&]() { 5293 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 5294 TheLoop->getStartLoc(), 5295 TheLoop->getHeader()) 5296 << "User-specified vectorization factor " 5297 << ore::NV("UserVectorizationFactor", UserVF) 5298 << " is unsafe, clamping to maximum safe vectorization factor " 5299 << ore::NV("VectorizationFactor", MaxSafeFixedVF); 5300 }); 5301 return MaxSafeFixedVF; 5302 } 5303 5304 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) { 5305 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 5306 << " is ignored because scalable vectors are not " 5307 "available.\n"); 5308 ORE->emit([&]() { 5309 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 5310 TheLoop->getStartLoc(), 5311 TheLoop->getHeader()) 5312 << "User-specified vectorization factor " 5313 << ore::NV("UserVectorizationFactor", UserVF) 5314 << " is ignored because the target does not support scalable " 5315 "vectors. The compiler will pick a more suitable value."; 5316 }); 5317 } else { 5318 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 5319 << " is unsafe. Ignoring scalable UserVF.\n"); 5320 ORE->emit([&]() { 5321 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 5322 TheLoop->getStartLoc(), 5323 TheLoop->getHeader()) 5324 << "User-specified vectorization factor " 5325 << ore::NV("UserVectorizationFactor", UserVF) 5326 << " is unsafe. Ignoring the hint to let the compiler pick a " 5327 "more suitable value."; 5328 }); 5329 } 5330 } 5331 5332 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType 5333 << " / " << WidestType << " bits.\n"); 5334 5335 FixedScalableVFPair Result(ElementCount::getFixed(1), 5336 ElementCount::getScalable(0)); 5337 if (auto MaxVF = 5338 getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType, 5339 MaxSafeFixedVF, FoldTailByMasking)) 5340 Result.FixedVF = MaxVF; 5341 5342 if (auto MaxVF = 5343 getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType, 5344 MaxSafeScalableVF, FoldTailByMasking)) 5345 if (MaxVF.isScalable()) { 5346 Result.ScalableVF = MaxVF; 5347 LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF 5348 << "\n"); 5349 } 5350 5351 return Result; 5352 } 5353 5354 FixedScalableVFPair 5355 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { 5356 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { 5357 // TODO: It may be useful to do this anyway, since the check is still likely 5358 // to be dynamically uniform if the target can skip it. 5359 reportVectorizationFailure( 5360 "Not inserting runtime ptr check for divergent target", 5361 "runtime pointer checks needed.
Not enabled for divergent target", 5362 "CantVersionLoopWithDivergentTarget", ORE, TheLoop); 5363 return FixedScalableVFPair::getNone(); 5364 } 5365 5366 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); 5367 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); 5368 if (TC == 1) { 5369 reportVectorizationFailure("Single iteration (non) loop", 5370 "loop trip count is one, irrelevant for vectorization", 5371 "SingleIterationLoop", ORE, TheLoop); 5372 return FixedScalableVFPair::getNone(); 5373 } 5374 5375 switch (ScalarEpilogueStatus) { 5376 case CM_ScalarEpilogueAllowed: 5377 return computeFeasibleMaxVF(TC, UserVF, false); 5378 case CM_ScalarEpilogueNotAllowedUsePredicate: 5379 LLVM_FALLTHROUGH; 5380 case CM_ScalarEpilogueNotNeededUsePredicate: 5381 LLVM_DEBUG( 5382 dbgs() << "LV: vector predicate hint/switch found.\n" 5383 << "LV: Not allowing scalar epilogue, creating predicated " 5384 << "vector loop.\n"); 5385 break; 5386 case CM_ScalarEpilogueNotAllowedLowTripLoop: 5387 // fallthrough as a special case of OptForSize 5388 case CM_ScalarEpilogueNotAllowedOptSize: 5389 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize) 5390 LLVM_DEBUG( 5391 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n"); 5392 else 5393 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip " 5394 << "count.\n"); 5395 5396 // Bail if runtime checks are required, which are not good when optimising 5397 // for size. 5398 if (runtimeChecksRequired()) 5399 return FixedScalableVFPair::getNone(); 5400 5401 break; 5402 } 5403 5404 // The only loops we can vectorize without a scalar epilogue are loops with 5405 // a bottom-test and a single exiting block. We'd have to handle the fact 5406 // that not every instruction executes on the last iteration. This will 5407 // require a lane mask which varies through the vector loop body. (TODO) 5408 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) { 5409 // If there was a tail-folding hint/switch, but we can't fold the tail by 5410 // masking, fallback to a vectorization with a scalar epilogue. 5411 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { 5412 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " 5413 "scalar epilogue instead.\n"); 5414 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 5415 return computeFeasibleMaxVF(TC, UserVF, false); 5416 } 5417 return FixedScalableVFPair::getNone(); 5418 } 5419 5420 // Now try tail folding. 5421 5422 // Invalidate interleave groups that require an epilogue if we can't mask 5423 // the interleave-group. 5424 if (!useMaskedInterleavedAccesses(TTI)) { 5425 assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() && 5426 "No decisions should have been taken at this point"); 5427 // Note: There is no need to invalidate any cost modeling decisions here, as 5428 // none were taken so far. 5429 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue(); 5430 } 5431 5432 FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF, true); 5433 // Avoid tail folding if the trip count is known to be a multiple of any VF 5434 // we chose. 5435 // FIXME: The condition below pessimises the case for fixed-width vectors, 5436 // when scalable VFs are also candidates for vectorization.
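  // Illustrative example (assuming no scalable VF candidate): with a known
  // trip count of 128, MaxFixedVF = 8 and no user-specified interleave
  // count, 128 urem 8 == 0, so no tail remains and MaxFactors is returned
  // below without tail folding.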
5437 if (MaxFactors.FixedVF.isVector() && !MaxFactors.ScalableVF) { 5438 ElementCount MaxFixedVF = MaxFactors.FixedVF; 5439 assert((UserVF.isNonZero() || isPowerOf2_32(MaxFixedVF.getFixedValue())) && 5440 "MaxFixedVF must be a power of 2"); 5441 unsigned MaxVFtimesIC = UserIC ? MaxFixedVF.getFixedValue() * UserIC 5442 : MaxFixedVF.getFixedValue(); 5443 ScalarEvolution *SE = PSE.getSE(); 5444 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 5445 const SCEV *ExitCount = SE->getAddExpr( 5446 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 5447 const SCEV *Rem = SE->getURemExpr( 5448 SE->applyLoopGuards(ExitCount, TheLoop), 5449 SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC)); 5450 if (Rem->isZero()) { 5451 // Accept MaxFixedVF if we do not have a tail. 5452 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); 5453 return MaxFactors; 5454 } 5455 } 5456 5457 // For scalable vectors don't use tail folding for low trip counts or 5458 // optimizing for code size. We only permit this if the user has explicitly 5459 // requested it. 5460 if (ScalarEpilogueStatus != CM_ScalarEpilogueNotNeededUsePredicate && 5461 ScalarEpilogueStatus != CM_ScalarEpilogueNotAllowedUsePredicate && 5462 MaxFactors.ScalableVF.isVector()) 5463 MaxFactors.ScalableVF = ElementCount::getScalable(0); 5464 5465 // If we don't know the precise trip count, or if the trip count that we 5466 // found modulo the vectorization factor is not zero, try to fold the tail 5467 // by masking. 5468 // FIXME: look for a smaller MaxVF that does divide TC rather than masking. 5469 if (Legal->prepareToFoldTailByMasking()) { 5470 FoldTailByMasking = true; 5471 return MaxFactors; 5472 } 5473 5474 // If there was a tail-folding hint/switch, but we can't fold the tail by 5475 // masking, fallback to a vectorization with a scalar epilogue. 5476 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { 5477 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " 5478 "scalar epilogue instead.\n"); 5479 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 5480 return MaxFactors; 5481 } 5482 5483 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) { 5484 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n"); 5485 return FixedScalableVFPair::getNone(); 5486 } 5487 5488 if (TC == 0) { 5489 reportVectorizationFailure( 5490 "Unable to calculate the loop count due to complex control flow", 5491 "unable to calculate the loop count due to complex control flow", 5492 "UnknownLoopCountComplexCFG", ORE, TheLoop); 5493 return FixedScalableVFPair::getNone(); 5494 } 5495 5496 reportVectorizationFailure( 5497 "Cannot optimize for size and vectorize at the same time.", 5498 "cannot optimize for size and vectorize at the same time. " 5499 "Enable vectorization of this loop with '#pragma clang loop " 5500 "vectorize(enable)' when compiling with -Os/-Oz", 5501 "NoTailLoopWithOptForSize", ORE, TheLoop); 5502 return FixedScalableVFPair::getNone(); 5503 } 5504 5505 ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( 5506 unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType, 5507 const ElementCount &MaxSafeVF, bool FoldTailByMasking) { 5508 bool ComputeScalableMaxVF = MaxSafeVF.isScalable(); 5509 TypeSize WidestRegister = TTI.getRegisterBitWidth( 5510 ComputeScalableMaxVF ? 
TargetTransformInfo::RGK_ScalableVector 5511 : TargetTransformInfo::RGK_FixedWidthVector); 5512 5513 // Convenience function to return the minimum of two ElementCounts. 5514 auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) { 5515 assert((LHS.isScalable() == RHS.isScalable()) && 5516 "Scalable flags must match"); 5517 return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS; 5518 }; 5519 5520 // Ensure MaxVF is a power of 2; the dependence distance bound may not be. 5521 // Note that both WidestRegister and WidestType may not be powers of 2. 5522 auto MaxVectorElementCount = ElementCount::get( 5523 PowerOf2Floor(WidestRegister.getKnownMinSize() / WidestType), 5524 ComputeScalableMaxVF); 5525 MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF); 5526 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " 5527 << (MaxVectorElementCount * WidestType) << " bits.\n"); 5528 5529 if (!MaxVectorElementCount) { 5530 LLVM_DEBUG(dbgs() << "LV: The target has no " 5531 << (ComputeScalableMaxVF ? "scalable" : "fixed") 5532 << " vector registers.\n"); 5533 return ElementCount::getFixed(1); 5534 } 5535 5536 const auto TripCountEC = ElementCount::getFixed(ConstTripCount); 5537 if (ConstTripCount && 5538 ElementCount::isKnownLE(TripCountEC, MaxVectorElementCount) && 5539 (!FoldTailByMasking || isPowerOf2_32(ConstTripCount))) { 5540 // If the loop trip count (TC) is known at compile time, there is no point 5541 // in choosing a VF greater than TC (as done in the loop below). Select the 5542 // maximum power of two which doesn't exceed TC. 5543 // If MaxVectorElementCount is scalable, we only fall back on a fixed VF 5544 // when the TC is less than or equal to the known number of lanes. 5545 auto ClampedConstTripCount = PowerOf2Floor(ConstTripCount); 5546 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not " 5547 "exceeding the constant trip count: " 5548 << ClampedConstTripCount << "\n"); 5549 return ElementCount::getFixed(ClampedConstTripCount); 5550 } 5551 5552 ElementCount MaxVF = MaxVectorElementCount; 5553 if (TTI.shouldMaximizeVectorBandwidth() || 5554 (MaximizeBandwidth && isScalarEpilogueAllowed())) { 5555 auto MaxVectorElementCountMaxBW = ElementCount::get( 5556 PowerOf2Floor(WidestRegister.getKnownMinSize() / SmallestType), 5557 ComputeScalableMaxVF); 5558 MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF); 5559 5560 // Collect all viable vectorization factors larger than the default MaxVF 5561 // (i.e. MaxVectorElementCount). 5562 SmallVector<ElementCount, 8> VFs; 5563 for (ElementCount VS = MaxVectorElementCount * 2; 5564 ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2) 5565 VFs.push_back(VS); 5566 5567 // For each VF calculate its register usage. 5568 auto RUs = calculateRegisterUsage(VFs); 5569 5570 // Select the largest VF which doesn't require more registers than existing 5571 // ones. 
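    // Illustrative sketch (register counts invented): suppose VFs = {16, 32}
    // and calculateRegisterUsage reports at most 20 live values of the vector
    // register class for VF=32 but only 12 for VF=16, while
    // TTI.getNumberOfRegisters() returns 16 for that class. The loop below
    // scans from the largest VF downwards, rejects VF=32 (20 > 16) and
    // settles on MaxVF = 16 (12 <= 16).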
5572 for (int i = RUs.size() - 1; i >= 0; --i) { 5573 bool Selected = true; 5574 for (auto &pair : RUs[i].MaxLocalUsers) { 5575 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5576 if (pair.second > TargetNumRegisters) 5577 Selected = false; 5578 } 5579 if (Selected) { 5580 MaxVF = VFs[i]; 5581 break; 5582 } 5583 } 5584 if (ElementCount MinVF = 5585 TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) { 5586 if (ElementCount::isKnownLT(MaxVF, MinVF)) { 5587 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF 5588 << ") with target's minimum: " << MinVF << '\n'); 5589 MaxVF = MinVF; 5590 } 5591 } 5592 } 5593 return MaxVF; 5594 } 5595 5596 Optional<unsigned> LoopVectorizationCostModel::getVScaleForTuning() const { 5597 if (TheFunction->hasFnAttribute(Attribute::VScaleRange)) { 5598 auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange); 5599 auto Min = Attr.getVScaleRangeMin(); 5600 auto Max = Attr.getVScaleRangeMax(); 5601 if (Max && Min == Max) 5602 return Max; 5603 } 5604 5605 return TTI.getVScaleForTuning(); 5606 } 5607 5608 bool LoopVectorizationCostModel::isMoreProfitable( 5609 const VectorizationFactor &A, const VectorizationFactor &B) const { 5610 InstructionCost CostA = A.Cost; 5611 InstructionCost CostB = B.Cost; 5612 5613 unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop); 5614 5615 if (!A.Width.isScalable() && !B.Width.isScalable() && FoldTailByMasking && 5616 MaxTripCount) { 5617 // If we are folding the tail and the trip count is a known (possibly small) 5618 // constant, the trip count will be rounded up to an integer number of 5619 // iterations. The total cost will be PerIterationCost*ceil(TripCount/VF), 5620 // which we compare directly. When not folding the tail, the total cost will 5621 // be PerIterationCost*floor(TC/VF) + Scalar remainder cost, and so is 5622 // approximated with the per-lane cost below instead of using the tripcount 5623 // as here. 5624 auto RTCostA = CostA * divideCeil(MaxTripCount, A.Width.getFixedValue()); 5625 auto RTCostB = CostB * divideCeil(MaxTripCount, B.Width.getFixedValue()); 5626 return RTCostA < RTCostB; 5627 } 5628 5629 // Improve estimate for the vector width if it is scalable. 5630 unsigned EstimatedWidthA = A.Width.getKnownMinValue(); 5631 unsigned EstimatedWidthB = B.Width.getKnownMinValue(); 5632 if (Optional<unsigned> VScale = getVScaleForTuning()) { 5633 if (A.Width.isScalable()) 5634 EstimatedWidthA *= VScale.getValue(); 5635 if (B.Width.isScalable()) 5636 EstimatedWidthB *= VScale.getValue(); 5637 } 5638 5639 // Assume vscale may be larger than 1 (or the value being tuned for), 5640 // so that scalable vectorization is slightly favorable over fixed-width 5641 // vectorization. 
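  // Illustrative example (costs invented): comparing A = vscale x 4 with
  // CostA = 8 against fixed-width B = 8 with CostB = 8, and a tuning vscale
  // of 2, gives EstimatedWidthA = 4 * 2 = 8. The scalable-vs-fixed check
  // below then tests CostA * B.Width <= CostB * EstimatedWidthA, i.e.
  // 8 * 8 <= 8 * 8, and the tie is resolved in favour of the scalable
  // candidate A.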
5642 if (A.Width.isScalable() && !B.Width.isScalable()) 5643 return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA); 5644 5645 // To avoid the need for FP division: 5646 // (CostA / A.Width) < (CostB / B.Width) 5647 // <=> (CostA * B.Width) < (CostB * A.Width) 5648 return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA); 5649 } 5650 5651 VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor( 5652 const ElementCountSet &VFCandidates) { 5653 InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first; 5654 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n"); 5655 assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop"); 5656 assert(VFCandidates.count(ElementCount::getFixed(1)) && 5657 "Expected Scalar VF to be a candidate"); 5658 5659 const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost); 5660 VectorizationFactor ChosenFactor = ScalarCost; 5661 5662 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; 5663 if (ForceVectorization && VFCandidates.size() > 1) { 5664 // Ignore scalar width, because the user explicitly wants vectorization. 5665 // Initialize cost to max so that VF = 2 is, at least, chosen during cost 5666 // evaluation. 5667 ChosenFactor.Cost = InstructionCost::getMax(); 5668 } 5669 5670 SmallVector<InstructionVFPair> InvalidCosts; 5671 for (const auto &i : VFCandidates) { 5672 // The cost for scalar VF=1 is already calculated, so ignore it. 5673 if (i.isScalar()) 5674 continue; 5675 5676 VectorizationCostTy C = expectedCost(i, &InvalidCosts); 5677 VectorizationFactor Candidate(i, C.first); 5678 5679 #ifndef NDEBUG 5680 unsigned AssumedMinimumVscale = 1; 5681 if (Optional<unsigned> VScale = getVScaleForTuning()) 5682 AssumedMinimumVscale = VScale.getValue(); 5683 unsigned Width = 5684 Candidate.Width.isScalable() 5685 ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale 5686 : Candidate.Width.getFixedValue(); 5687 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i 5688 << " costs: " << (Candidate.Cost / Width)); 5689 if (i.isScalable()) 5690 LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of " 5691 << AssumedMinimumVscale << ")"); 5692 LLVM_DEBUG(dbgs() << ".\n"); 5693 #endif 5694 5695 if (!C.second && !ForceVectorization) { 5696 LLVM_DEBUG( 5697 dbgs() << "LV: Not considering vector loop of width " << i 5698 << " because it will not generate any vector instructions.\n"); 5699 continue; 5700 } 5701 5702 // If profitable add it to ProfitableVF list. 5703 if (isMoreProfitable(Candidate, ScalarCost)) 5704 ProfitableVFs.push_back(Candidate); 5705 5706 if (isMoreProfitable(Candidate, ChosenFactor)) 5707 ChosenFactor = Candidate; 5708 } 5709 5710 // Emit a report of VFs with invalid costs in the loop. 5711 if (!InvalidCosts.empty()) { 5712 // Group the remarks per instruction, keeping the instruction order from 5713 // InvalidCosts. 5714 std::map<Instruction *, unsigned> Numbering; 5715 unsigned I = 0; 5716 for (auto &Pair : InvalidCosts) 5717 if (!Numbering.count(Pair.first)) 5718 Numbering[Pair.first] = I++; 5719 5720 // Sort the list, first on instruction(number) then on VF. 
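    // E.g. (hypothetical values): with Numbering = {load: 0, store: 1}, the
    // pairs [(store, VF=4), (load, VF=8), (load, VF=4)] sort below to
    // [(load, VF=4), (load, VF=8), (store, VF=4)], which is the order the
    // grouping loop that follows relies on.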
llvm::sort(InvalidCosts, 5722 [&Numbering](InstructionVFPair &A, InstructionVFPair &B) { 5723 if (Numbering[A.first] != Numbering[B.first]) 5724 return Numbering[A.first] < Numbering[B.first]; 5725 ElementCountComparator ECC; 5726 return ECC(A.second, B.second); 5727 }); 5728 5729 // For a list of ordered instruction-vf pairs: 5730 // [(load, vf1), (load, vf2), (store, vf1)] 5731 // Group the instructions together to emit separate remarks for: 5732 // load (vf1, vf2) 5733 // store (vf1) 5734 auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts); 5735 auto Subset = ArrayRef<InstructionVFPair>(); 5736 do { 5737 if (Subset.empty()) 5738 Subset = Tail.take_front(1); 5739 5740 Instruction *I = Subset.front().first; 5741 5742 // If the next instruction is different, or if there are no other pairs, 5743 // emit a remark for the collated subset. e.g. 5744 // [(load, vf1), (load, vf2)] 5745 // to emit: 5746 // remark: invalid costs for 'load' at VF=(vf1, vf2) 5747 if (Subset == Tail || Tail[Subset.size()].first != I) { 5748 std::string OutString; 5749 raw_string_ostream OS(OutString); 5750 assert(!Subset.empty() && "Unexpected empty range"); 5751 OS << "Instruction with invalid costs prevented vectorization at VF=("; 5752 for (auto &Pair : Subset) 5753 OS << (Pair.second == Subset.front().second ? "" : ", ") 5754 << Pair.second; 5755 OS << "):"; 5756 if (auto *CI = dyn_cast<CallInst>(I)) 5757 OS << " call to " << CI->getCalledFunction()->getName(); 5758 else 5759 OS << " " << I->getOpcodeName(); 5760 OS.flush(); 5761 reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I); 5762 Tail = Tail.drop_front(Subset.size()); 5763 Subset = {}; 5764 } else 5765 // Grow the subset by one element 5766 Subset = Tail.take_front(Subset.size() + 1); 5767 } while (!Tail.empty()); 5768 } 5769 5770 if (!EnableCondStoresVectorization && NumPredStores) { 5771 reportVectorizationFailure("There are conditional stores.", 5772 "store that is conditionally executed prevents vectorization", 5773 "ConditionalStore", ORE, TheLoop); 5774 ChosenFactor = ScalarCost; 5775 } 5776 5777 LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() && 5778 ChosenFactor.Cost >= ScalarCost.Cost) dbgs() 5779 << "LV: Vectorization seems to be not beneficial, " 5780 << "but was forced by a user.\n"); 5781 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n"); 5782 return ChosenFactor; 5783 } 5784 5785 bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization( 5786 const Loop &L, ElementCount VF) const { 5787 // Cross iteration phis such as reductions need special handling and are 5788 // currently unsupported. 5789 if (any_of(L.getHeader()->phis(), 5790 [&](PHINode &Phi) { return Legal->isFirstOrderRecurrence(&Phi); })) 5791 return false; 5792 5793 // Phis with uses outside of the loop require special handling and are 5794 // currently unsupported. 5795 for (auto &Entry : Legal->getInductionVars()) { 5796 // Look for uses of the value of the induction at the last iteration. 5797 Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch()); 5798 for (User *U : PostInc->users()) 5799 if (!L.contains(cast<Instruction>(U))) 5800 return false; 5801 // Look for uses of the penultimate value of the induction. 5802 for (User *U : Entry.first->users()) 5803 if (!L.contains(cast<Instruction>(U))) 5804 return false; 5805 } 5806 5807 // Induction variables that are widened require special handling that is 5808 // currently not supported. 
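  // For illustration (hypothetical loops): an induction `i` used only to
  // address consecutive memory, as in `A[i] = B[i] + 42`, stays scalar after
  // vectorization and passes the check below; a loop such as `A[i] = i`,
  // where a widened vector of induction values must be kept live, is rejected
  // as an epilogue-vectorization candidate.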
5809 if (any_of(Legal->getInductionVars(), [&](auto &Entry) { 5810 return !(this->isScalarAfterVectorization(Entry.first, VF) || 5811 this->isProfitableToScalarize(Entry.first, VF)); 5812 })) 5813 return false; 5814 5815 // Epilogue vectorization code has not been audited to ensure it handles 5816 // non-latch exits properly. It may be fine, but it needs to be audited and 5817 // tested. 5818 if (L.getExitingBlock() != L.getLoopLatch()) 5819 return false; 5820 5821 return true; 5822 } 5823 5824 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable( 5825 const ElementCount VF) const { 5826 // FIXME: We need a much better cost-model to take different parameters such 5827 // as register pressure, code size increase and cost of extra branches into 5828 // account. For now we apply a very crude heuristic and only consider loops 5829 // with vectorization factors larger than a certain value. 5830 // We also consider epilogue vectorization unprofitable for targets that don't 5831 // consider interleaving beneficial (e.g. MVE). 5832 if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1) 5833 return false; 5834 // FIXME: We should consider changing the threshold for scalable 5835 // vectors to take VScaleForTuning into account. 5836 if (VF.getKnownMinValue() >= EpilogueVectorizationMinVF) 5837 return true; 5838 return false; 5839 } 5840 5841 VectorizationFactor 5842 LoopVectorizationCostModel::selectEpilogueVectorizationFactor( 5843 const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) { 5844 VectorizationFactor Result = VectorizationFactor::Disabled(); 5845 if (!EnableEpilogueVectorization) { 5846 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";); 5847 return Result; 5848 } 5849 5850 if (!isScalarEpilogueAllowed()) { 5851 LLVM_DEBUG( 5852 dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is " 5853 "allowed.\n";); 5854 return Result; 5855 } 5856 5857 // Not really a cost consideration, but check for unsupported cases here to 5858 // simplify the logic. 5859 if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) { 5860 LLVM_DEBUG( 5861 dbgs() << "LEV: Unable to vectorize epilogue because the loop is " 5862 "not a supported candidate.\n";); 5863 return Result; 5864 } 5865 5866 if (EpilogueVectorizationForceVF > 1) { 5867 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";); 5868 ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF); 5869 if (LVP.hasPlanWithVF(ForcedEC)) 5870 return {ForcedEC, 0}; 5871 else { 5872 LLVM_DEBUG( 5873 dbgs() 5874 << "LEV: Epilogue vectorization forced factor is not viable.\n";); 5875 return Result; 5876 } 5877 } 5878 5879 if (TheLoop->getHeader()->getParent()->hasOptSize() || 5880 TheLoop->getHeader()->getParent()->hasMinSize()) { 5881 LLVM_DEBUG( 5882 dbgs() 5883 << "LEV: Epilogue vectorization skipped due to opt for size.\n";); 5884 return Result; 5885 } 5886 5887 if (!isEpilogueVectorizationProfitable(MainLoopVF)) { 5888 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for " 5889 "this loop\n"); 5890 return Result; 5891 } 5892 5893 // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know 5894 // the main loop handles 8 lanes per iteration. We could still benefit from 5895 // vectorizing the epilogue loop with VF=4. 
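  // For illustration, reusing the numbers from the comment above: with
  // MainLoopVF = vscale x 2 and getVScaleForTuning() == 4, the code below
  // computes EstimatedRuntimeVF = getFixed(2) * 4 = 8, so fixed-width
  // candidates narrower than 8 lanes (e.g. VF=4) remain eligible epilogue
  // VFs in the loop over ProfitableVFs.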
5896 ElementCount EstimatedRuntimeVF = MainLoopVF; 5897 if (MainLoopVF.isScalable()) { 5898 EstimatedRuntimeVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue()); 5899 if (Optional<unsigned> VScale = getVScaleForTuning()) 5900 EstimatedRuntimeVF *= VScale.getValue(); 5901 } 5902 5903 for (auto &NextVF : ProfitableVFs) 5904 if (((!NextVF.Width.isScalable() && MainLoopVF.isScalable() && 5905 ElementCount::isKnownLT(NextVF.Width, EstimatedRuntimeVF)) || 5906 ElementCount::isKnownLT(NextVF.Width, MainLoopVF)) && 5907 (Result.Width.isScalar() || isMoreProfitable(NextVF, Result)) && 5908 LVP.hasPlanWithVF(NextVF.Width)) 5909 Result = NextVF; 5910 5911 if (Result != VectorizationFactor::Disabled()) 5912 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = " 5913 << Result.Width << "\n";); 5914 return Result; 5915 } 5916 5917 std::pair<unsigned, unsigned> 5918 LoopVectorizationCostModel::getSmallestAndWidestTypes() { 5919 unsigned MinWidth = -1U; 5920 unsigned MaxWidth = 8; 5921 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 5922 // For in-loop reductions, no element types are added to ElementTypesInLoop 5923 // if there are no loads/stores in the loop. In this case, check through the 5924 // reduction variables to determine the maximum width. 5925 if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) { 5926 // Reset MaxWidth so that we can find the smallest type used by recurrences 5927 // in the loop. 5928 MaxWidth = -1U; 5929 for (auto &PhiDescriptorPair : Legal->getReductionVars()) { 5930 const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second; 5931 // When finding the min width used by the recurrence we need to account 5932 // for casts on the input operands of the recurrence. 5933 MaxWidth = std::min<unsigned>( 5934 MaxWidth, std::min<unsigned>( 5935 RdxDesc.getMinWidthCastToRecurrenceTypeInBits(), 5936 RdxDesc.getRecurrenceType()->getScalarSizeInBits())); 5937 } 5938 } else { 5939 for (Type *T : ElementTypesInLoop) { 5940 MinWidth = std::min<unsigned>( 5941 MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize()); 5942 MaxWidth = std::max<unsigned>( 5943 MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize()); 5944 } 5945 } 5946 return {MinWidth, MaxWidth}; 5947 } 5948 5949 void LoopVectorizationCostModel::collectElementTypesForWidening() { 5950 ElementTypesInLoop.clear(); 5951 // For each block. 5952 for (BasicBlock *BB : TheLoop->blocks()) { 5953 // For each instruction in the loop. 5954 for (Instruction &I : BB->instructionsWithoutDebug()) { 5955 Type *T = I.getType(); 5956 5957 // Skip ignored values. 5958 if (ValuesToIgnore.count(&I)) 5959 continue; 5960 5961 // Only examine Loads, Stores and PHINodes. 5962 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I)) 5963 continue; 5964 5965 // Examine PHI nodes that are reduction variables. Update the type to 5966 // account for the recurrence type. 5967 if (auto *PN = dyn_cast<PHINode>(&I)) { 5968 if (!Legal->isReductionVariable(PN)) 5969 continue; 5970 const RecurrenceDescriptor &RdxDesc = 5971 Legal->getReductionVars().find(PN)->second; 5972 if (PreferInLoopReductions || useOrderedReductions(RdxDesc) || 5973 TTI.preferInLoopReduction(RdxDesc.getOpcode(), 5974 RdxDesc.getRecurrenceType(), 5975 TargetTransformInfo::ReductionFlags())) 5976 continue; 5977 T = RdxDesc.getRecurrenceType(); 5978 } 5979 5980 // Examine the stored values. 
5981 if (auto *ST = dyn_cast<StoreInst>(&I)) 5982 T = ST->getValueOperand()->getType(); 5983 5984 assert(T->isSized() && 5985 "Expected the load/store/recurrence type to be sized"); 5986 5987 ElementTypesInLoop.insert(T); 5988 } 5989 } 5990 } 5991 5992 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, 5993 unsigned LoopCost) { 5994 // -- The interleave heuristics -- 5995 // We interleave the loop in order to expose ILP and reduce the loop overhead. 5996 // There are many micro-architectural considerations that we can't predict 5997 // at this level. For example, frontend pressure (on decode or fetch) due to 5998 // code size, or the number and capabilities of the execution ports. 5999 // 6000 // We use the following heuristics to select the interleave count: 6001 // 1. If the code has reductions, then we interleave to break the cross 6002 // iteration dependency. 6003 // 2. If the loop is really small, then we interleave to reduce the loop 6004 // overhead. 6005 // 3. We don't interleave if we think that we will spill registers to memory 6006 // due to the increased register pressure. 6007 6008 if (!isScalarEpilogueAllowed()) 6009 return 1; 6010 6011 // We used the distance for the interleave count. 6012 if (Legal->getMaxSafeDepDistBytes() != -1U) 6013 return 1; 6014 6015 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); 6016 const bool HasReductions = !Legal->getReductionVars().empty(); 6017 // Do not interleave loops with a relatively small known or estimated trip 6018 // count. But we will interleave when InterleaveSmallLoopScalarReduction is 6019 // enabled, and the code has scalar reductions(HasReductions && VF = 1), 6020 // because with the above conditions interleaving can expose ILP and break 6021 // cross iteration dependences for reductions. 6022 if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) && 6023 !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar())) 6024 return 1; 6025 6026 RegisterUsage R = calculateRegisterUsage({VF})[0]; 6027 // We divide by these constants so assume that we have at least one 6028 // instruction that uses at least one register. 6029 for (auto& pair : R.MaxLocalUsers) { 6030 pair.second = std::max(pair.second, 1U); 6031 } 6032 6033 // We calculate the interleave count using the following formula. 6034 // Subtract the number of loop invariants from the number of available 6035 // registers. These registers are used by all of the interleaved instances. 6036 // Next, divide the remaining registers by the number of registers that is 6037 // required by the loop, in order to estimate how many parallel instances 6038 // fit without causing spills. All of this is rounded down if necessary to be 6039 // a power of two. We want power of two interleave count to simplify any 6040 // addressing operations or alignment considerations. 6041 // We also want power of two interleave counts to ensure that the induction 6042 // variable of the vector loop wraps to zero, when tail is folded by masking; 6043 // this currently happens when OptForSize, in which case IC is set to 1 above. 
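  // Worked example of the formula above (numbers invented): for a register
  // class with TargetNumRegisters = 32, LoopInvariantRegs = 2 and
  // MaxLocalUsers = 5, the estimate is PowerOf2Floor((32 - 2) / 5) =
  // PowerOf2Floor(6) = 4; with the induction-variable heuristic enabled it is
  // PowerOf2Floor((32 - 2 - 1) / max(1, 5 - 1)) = PowerOf2Floor(7) = 4.
  // The minimum of these per-class estimates becomes the candidate IC.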
6044 unsigned IC = UINT_MAX; 6045 6046 for (auto& pair : R.MaxLocalUsers) { 6047 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 6048 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters 6049 << " registers of " 6050 << TTI.getRegisterClassName(pair.first) << " register class\n"); 6051 if (VF.isScalar()) { 6052 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) 6053 TargetNumRegisters = ForceTargetNumScalarRegs; 6054 } else { 6055 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) 6056 TargetNumRegisters = ForceTargetNumVectorRegs; 6057 } 6058 unsigned MaxLocalUsers = pair.second; 6059 unsigned LoopInvariantRegs = 0; 6060 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) 6061 LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; 6062 6063 unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers); 6064 // Don't count the induction variable as interleaved. 6065 if (EnableIndVarRegisterHeur) { 6066 TmpIC = 6067 PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) / 6068 std::max(1U, (MaxLocalUsers - 1))); 6069 } 6070 6071 IC = std::min(IC, TmpIC); 6072 } 6073 6074 // Clamp the interleave ranges to reasonable counts. 6075 unsigned MaxInterleaveCount = 6076 TTI.getMaxInterleaveFactor(VF.getKnownMinValue()); 6077 6078 // Check if the user has overridden the max. 6079 if (VF.isScalar()) { 6080 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) 6081 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; 6082 } else { 6083 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) 6084 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; 6085 } 6086 6087 // If the trip count is a known or estimated compile-time constant, limit the 6088 // interleave count to be less than the trip count divided by VF, provided it 6089 // is at least 1. 6090 // 6091 // For scalable vectors we can't know if interleaving is beneficial. It may 6092 // not be beneficial for small loops if none of the lanes in the second vector 6093 // iteration are enabled. However, for larger loops, there is likely to be a 6094 // similar benefit as for fixed-width vectors. For now, we choose to leave 6095 // the InterleaveCount as if vscale is '1', although if some information about 6096 // the vector is known (e.g. min vector size), we can make a better decision. 6097 if (BestKnownTC) { 6098 MaxInterleaveCount = 6099 std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount); 6100 // Make sure MaxInterleaveCount is greater than 0. 6101 MaxInterleaveCount = std::max(1u, MaxInterleaveCount); 6102 } 6103 6104 assert(MaxInterleaveCount > 0 && 6105 "Maximum interleave count must be greater than 0"); 6106 6107 // Clamp the calculated IC to be between 1 and the max interleave count 6108 // that the target and trip count allow. 6109 if (IC > MaxInterleaveCount) 6110 IC = MaxInterleaveCount; 6111 else 6112 // Make sure IC is greater than 0. 6113 IC = std::max(1u, IC); 6114 6115 assert(IC > 0 && "Interleave count must be greater than 0."); 6116 6117 // If we did not calculate the cost for VF (because the user selected the VF) 6118 // then we calculate the cost of VF here. 
6119 if (LoopCost == 0) { 6120 InstructionCost C = expectedCost(VF).first; 6121 assert(C.isValid() && "Expected to have chosen a VF with valid cost"); 6122 LoopCost = *C.getValue(); 6123 } 6124 6125 assert(LoopCost && "Non-zero loop cost expected"); 6126 6127 // Interleave if we vectorized this loop and there is a reduction that could 6128 // benefit from interleaving. 6129 if (VF.isVector() && HasReductions) { 6130 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); 6131 return IC; 6132 } 6133 6134 // For any scalar loop that either requires runtime checks or predication we 6135 // are better off leaving this to the unroller. Note that if we've already 6136 // vectorized the loop we will have done the runtime check and so interleaving 6137 // won't require further checks. 6138 bool ScalarInterleavingRequiresPredication = 6139 (VF.isScalar() && any_of(TheLoop->blocks(), [this](BasicBlock *BB) { 6140 return Legal->blockNeedsPredication(BB); 6141 })); 6142 bool ScalarInterleavingRequiresRuntimePointerCheck = 6143 (VF.isScalar() && Legal->getRuntimePointerChecking()->Need); 6144 6145 // We want to interleave small loops in order to reduce the loop overhead and 6146 // potentially expose ILP opportunities. 6147 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n' 6148 << "LV: IC is " << IC << '\n' 6149 << "LV: VF is " << VF << '\n'); 6150 const bool AggressivelyInterleaveReductions = 6151 TTI.enableAggressiveInterleaving(HasReductions); 6152 if (!ScalarInterleavingRequiresRuntimePointerCheck && 6153 !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) { 6154 // We assume that the cost overhead is 1 and we use the cost model 6155 // to estimate the cost of the loop and interleave until the cost of the 6156 // loop overhead is about 5% of the cost of the loop. 6157 unsigned SmallIC = 6158 std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost)); 6159 6160 // Interleave until store/load ports (estimated by max interleave count) are 6161 // saturated. 6162 unsigned NumStores = Legal->getNumStores(); 6163 unsigned NumLoads = Legal->getNumLoads(); 6164 unsigned StoresIC = IC / (NumStores ? NumStores : 1); 6165 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1); 6166 6167 // There is little point in interleaving for reductions containing selects 6168 // and compares when VF=1 since it may just create more overhead than it's 6169 // worth for loops with small trip counts. This is because we still have to 6170 // do the final reduction after the loop. 6171 bool HasSelectCmpReductions = 6172 HasReductions && 6173 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 6174 const RecurrenceDescriptor &RdxDesc = Reduction.second; 6175 return RecurrenceDescriptor::isSelectCmpRecurrenceKind( 6176 RdxDesc.getRecurrenceKind()); 6177 }); 6178 if (HasSelectCmpReductions) { 6179 LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n"); 6180 return 1; 6181 } 6182 6183 // If we have a scalar reduction (vector reductions are already dealt with 6184 // by this point), we can increase the critical path length if the loop 6185 // we're interleaving is inside another loop. For tree-wise reductions 6186 // set the limit to 2, and for ordered reductions it's best to disable 6187 // interleaving entirely. 
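    // For illustration: if the register heuristic above produced SmallIC = 8
    // for a scalar tree-wise reduction nested inside another loop, the cap
    // below (MaxNestedScalarReductionIC, i.e. the limit of 2 mentioned above)
    // reduces SmallIC, StoresIC and LoadsIC to 2 so interleaving does not
    // over-lengthen the reduction's critical path.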
6188 if (HasReductions && TheLoop->getLoopDepth() > 1) { 6189 bool HasOrderedReductions = 6190 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 6191 const RecurrenceDescriptor &RdxDesc = Reduction.second; 6192 return RdxDesc.isOrdered(); 6193 }); 6194 if (HasOrderedReductions) { 6195 LLVM_DEBUG( 6196 dbgs() << "LV: Not interleaving scalar ordered reductions.\n"); 6197 return 1; 6198 } 6199 6200 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC); 6201 SmallIC = std::min(SmallIC, F); 6202 StoresIC = std::min(StoresIC, F); 6203 LoadsIC = std::min(LoadsIC, F); 6204 } 6205 6206 if (EnableLoadStoreRuntimeInterleave && 6207 std::max(StoresIC, LoadsIC) > SmallIC) { 6208 LLVM_DEBUG( 6209 dbgs() << "LV: Interleaving to saturate store or load ports.\n"); 6210 return std::max(StoresIC, LoadsIC); 6211 } 6212 6213 // If there are scalar reductions and TTI has enabled aggressive 6214 // interleaving for reductions, we will interleave to expose ILP. 6215 if (InterleaveSmallLoopScalarReduction && VF.isScalar() && 6216 AggressivelyInterleaveReductions) { 6217 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 6218 // Interleave no less than SmallIC but not as aggressive as the normal IC 6219 // to satisfy the rare situation when resources are too limited. 6220 return std::max(IC / 2, SmallIC); 6221 } else { 6222 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n"); 6223 return SmallIC; 6224 } 6225 } 6226 6227 // Interleave if this is a large loop (small loops are already dealt with by 6228 // this point) that could benefit from interleaving. 6229 if (AggressivelyInterleaveReductions) { 6230 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 6231 return IC; 6232 } 6233 6234 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n"); 6235 return 1; 6236 } 6237 6238 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8> 6239 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) { 6240 // This function calculates the register usage by measuring the highest number 6241 // of values that are alive at a single location. Obviously, this is a very 6242 // rough estimation. We scan the loop in topological order and 6243 // assign a number to each instruction. We use RPO to ensure that defs are 6244 // met before their users. We assume that each instruction that has in-loop 6245 // users starts an interval. We record every time that an in-loop value is 6246 // used, so we have a list of the first and last occurrences of each 6247 // instruction. Next, we transpose this data structure into a multi-map that 6248 // holds the list of intervals that *end* at a specific location. This multi 6249 // map allows us to perform a linear search. We scan the instructions linearly 6250 // and record each time that a new interval starts, by placing it in a set. 6251 // If we find this value in the multi-map then we remove it from the set. 6252 // The max register usage is the maximum size of the set. 6253 // We also search for instructions that are defined outside the loop, but are 6254 // used inside the loop. We need this number separately from the max-interval 6255 // usage number because when we unroll, loop-invariant values do not take 6256 // more registers. 6257 LoopBlocksDFS DFS(TheLoop); 6258 DFS.perform(LI); 6259 6260 RegisterUsage RU; 6261 6262 // Each 'key' in the map opens a new interval. The values 6263 // of the map are the index of the 'last seen' usage of the 6264 // instruction that is the key. 
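  // Toy example (hypothetical IR): for in-loop instructions with indices
  // 0:%a, 1:%b = f(%a), 2:%c = g(%a, %b), the recorded end points are
  // EndPoint[%a] = EndPoint[%b] = 3 (the size of IdxToInstr when the use is
  // seen), so both intervals are still open while visiting %c and at most two
  // in-loop values are live at once.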
6265 using IntervalMap = DenseMap<Instruction *, unsigned>; 6266 6267 // Maps instruction to its index. 6268 SmallVector<Instruction *, 64> IdxToInstr; 6269 // Marks the end of each interval. 6270 IntervalMap EndPoint; 6271 // Saves the list of instruction indices that are used in the loop. 6272 SmallPtrSet<Instruction *, 8> Ends; 6273 // Saves the list of values that are used in the loop but are 6274 // defined outside the loop, such as arguments and constants. 6275 SmallPtrSet<Value *, 8> LoopInvariants; 6276 6277 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 6278 for (Instruction &I : BB->instructionsWithoutDebug()) { 6279 IdxToInstr.push_back(&I); 6280 6281 // Save the end location of each USE. 6282 for (Value *U : I.operands()) { 6283 auto *Instr = dyn_cast<Instruction>(U); 6284 6285 // Ignore non-instruction values such as arguments, constants, etc. 6286 if (!Instr) 6287 continue; 6288 6289 // If this instruction is outside the loop then record it and continue. 6290 if (!TheLoop->contains(Instr)) { 6291 LoopInvariants.insert(Instr); 6292 continue; 6293 } 6294 6295 // Overwrite previous end points. 6296 EndPoint[Instr] = IdxToInstr.size(); 6297 Ends.insert(Instr); 6298 } 6299 } 6300 } 6301 6302 // Saves the list of intervals that end with the index in 'key'. 6303 using InstrList = SmallVector<Instruction *, 2>; 6304 DenseMap<unsigned, InstrList> TransposeEnds; 6305 6306 // Transpose the EndPoints to a list of values that end at each index. 6307 for (auto &Interval : EndPoint) 6308 TransposeEnds[Interval.second].push_back(Interval.first); 6309 6310 SmallPtrSet<Instruction *, 8> OpenIntervals; 6311 SmallVector<RegisterUsage, 8> RUs(VFs.size()); 6312 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size()); 6313 6314 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); 6315 6316 // A lambda that gets the register usage for the given type and VF. 6317 const auto &TTICapture = TTI; 6318 auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned { 6319 if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty)) 6320 return 0; 6321 InstructionCost::CostType RegUsage = 6322 *TTICapture.getRegUsageForType(VectorType::get(Ty, VF)).getValue(); 6323 assert(RegUsage >= 0 && RegUsage <= std::numeric_limits<unsigned>::max() && 6324 "Nonsensical values for register usage."); 6325 return RegUsage; 6326 }; 6327 6328 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { 6329 Instruction *I = IdxToInstr[i]; 6330 6331 // Remove all of the instructions that end at this location. 6332 InstrList &List = TransposeEnds[i]; 6333 for (Instruction *ToRemove : List) 6334 OpenIntervals.erase(ToRemove); 6335 6336 // Ignore instructions that are never used within the loop. 6337 if (!Ends.count(I)) 6338 continue; 6339 6340 // Skip ignored values. 6341 if (ValuesToIgnore.count(I)) 6342 continue; 6343 6344 // For each VF find the maximum usage of registers. 6345 for (unsigned j = 0, e = VFs.size(); j < e; ++j) { 6346 // Count the number of live intervals. 6347 SmallMapVector<unsigned, unsigned, 4> RegUsage; 6348 6349 if (VFs[j].isScalar()) { 6350 for (auto Inst : OpenIntervals) { 6351 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6352 if (RegUsage.find(ClassID) == RegUsage.end()) 6353 RegUsage[ClassID] = 1; 6354 else 6355 RegUsage[ClassID] += 1; 6356 } 6357 } else { 6358 collectUniformsAndScalars(VFs[j]); 6359 for (auto Inst : OpenIntervals) { 6360 // Skip ignored values for VF > 1. 
6361 if (VecValuesToIgnore.count(Inst)) 6362 continue; 6363 if (isScalarAfterVectorization(Inst, VFs[j])) { 6364 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6365 if (RegUsage.find(ClassID) == RegUsage.end()) 6366 RegUsage[ClassID] = 1; 6367 else 6368 RegUsage[ClassID] += 1; 6369 } else { 6370 unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType()); 6371 if (RegUsage.find(ClassID) == RegUsage.end()) 6372 RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]); 6373 else 6374 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); 6375 } 6376 } 6377 } 6378 6379 for (auto& pair : RegUsage) { 6380 if (MaxUsages[j].find(pair.first) != MaxUsages[j].end()) 6381 MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second); 6382 else 6383 MaxUsages[j][pair.first] = pair.second; 6384 } 6385 } 6386 6387 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " 6388 << OpenIntervals.size() << '\n'); 6389 6390 // Add the current instruction to the list of open intervals. 6391 OpenIntervals.insert(I); 6392 } 6393 6394 for (unsigned i = 0, e = VFs.size(); i < e; ++i) { 6395 SmallMapVector<unsigned, unsigned, 4> Invariant; 6396 6397 for (auto Inst : LoopInvariants) { 6398 unsigned Usage = 6399 VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]); 6400 unsigned ClassID = 6401 TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType()); 6402 if (Invariant.find(ClassID) == Invariant.end()) 6403 Invariant[ClassID] = Usage; 6404 else 6405 Invariant[ClassID] += Usage; 6406 } 6407 6408 LLVM_DEBUG({ 6409 dbgs() << "LV(REG): VF = " << VFs[i] << '\n'; 6410 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size() 6411 << " item\n"; 6412 for (const auto &pair : MaxUsages[i]) { 6413 dbgs() << "LV(REG): RegisterClass: " 6414 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6415 << " registers\n"; 6416 } 6417 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size() 6418 << " item\n"; 6419 for (const auto &pair : Invariant) { 6420 dbgs() << "LV(REG): RegisterClass: " 6421 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6422 << " registers\n"; 6423 } 6424 }); 6425 6426 RU.LoopInvariantRegs = Invariant; 6427 RU.MaxLocalUsers = MaxUsages[i]; 6428 RUs[i] = RU; 6429 } 6430 6431 return RUs; 6432 } 6433 6434 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I, 6435 ElementCount VF) { 6436 // TODO: Cost model for emulated masked load/store is completely 6437 // broken. This hack guides the cost model to use an artificially 6438 // high enough value to practically disable vectorization with such 6439 // operations, except where previously deployed legality hack allowed 6440 // using very low cost values. This is to avoid regressions coming simply 6441 // from moving "masked load/store" check from legality to cost model. 6442 // Masked Load/Gather emulation was previously never allowed. 6443 // Limited number of Masked Store/Scatter emulation was allowed. 6444 assert(isPredicatedInst(I, VF) && "Expecting a scalar emulated instruction"); 6445 return isa<LoadInst>(I) || 6446 (isa<StoreInst>(I) && 6447 NumPredStores > NumberOfStoresToPredicate); 6448 } 6449 6450 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) { 6451 // If we aren't vectorizing the loop, or if we've already collected the 6452 // instructions to scalarize, there's nothing to do. 
Collection may already 6453 // have occurred if we have a user-selected VF and are now computing the 6454 // expected cost for interleaving. 6455 if (VF.isScalar() || VF.isZero() || 6456 InstsToScalarize.find(VF) != InstsToScalarize.end()) 6457 return; 6458 6459 // Initialize a mapping for VF in InstsToScalarize. If we find that it's 6460 // not profitable to scalarize any instructions, the presence of VF in the 6461 // map will indicate that we've analyzed it already. 6462 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF]; 6463 6464 // Find all the instructions that are scalar with predication in the loop and 6465 // determine if it would be better to not if-convert the blocks they are in. 6466 // If so, we also record the instructions to scalarize. 6467 for (BasicBlock *BB : TheLoop->blocks()) { 6468 if (!blockNeedsPredicationForAnyReason(BB)) 6469 continue; 6470 for (Instruction &I : *BB) 6471 if (isScalarWithPredication(&I, VF)) { 6472 ScalarCostsTy ScalarCosts; 6473 // Do not apply discount if scalable, because that would lead to 6474 // invalid scalarization costs. 6475 // Do not apply discount logic if hacked cost is needed 6476 // for emulated masked memrefs. 6477 if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I, VF) && 6478 computePredInstDiscount(&I, ScalarCosts, VF) >= 0) 6479 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end()); 6480 // Remember that BB will remain after vectorization. 6481 PredicatedBBsAfterVectorization.insert(BB); 6482 } 6483 } 6484 } 6485 6486 int LoopVectorizationCostModel::computePredInstDiscount( 6487 Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) { 6488 assert(!isUniformAfterVectorization(PredInst, VF) && 6489 "Instruction marked uniform-after-vectorization will be predicated"); 6490 6491 // Initialize the discount to zero, meaning that the scalar version and the 6492 // vector version cost the same. 6493 InstructionCost Discount = 0; 6494 6495 // Holds instructions to analyze. The instructions we visit are mapped in 6496 // ScalarCosts. Those instructions are the ones that would be scalarized if 6497 // we find that the scalar version costs less. 6498 SmallVector<Instruction *, 8> Worklist; 6499 6500 // Returns true if the given instruction can be scalarized. 6501 auto canBeScalarized = [&](Instruction *I) -> bool { 6502 // We only attempt to scalarize instructions forming a single-use chain 6503 // from the original predicated block that would otherwise be vectorized. 6504 // Although not strictly necessary, we give up on instructions we know will 6505 // already be scalar to avoid traversing chains that are unlikely to be 6506 // beneficial. 6507 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() || 6508 isScalarAfterVectorization(I, VF)) 6509 return false; 6510 6511 // If the instruction is scalar with predication, it will be analyzed 6512 // separately. We ignore it within the context of PredInst. 6513 if (isScalarWithPredication(I, VF)) 6514 return false; 6515 6516 // If any of the instruction's operands are uniform after vectorization, 6517 // the instruction cannot be scalarized. This prevents, for example, a 6518 // masked load from being scalarized. 6519 // 6520 // We assume we will only emit a value for lane zero of an instruction 6521 // marked uniform after vectorization, rather than VF identical values. 6522 // Thus, if we scalarize an instruction that uses a uniform, we would 6523 // create uses of values corresponding to the lanes we aren't emitting code 6524 // for. 
This behavior can be changed by allowing getScalarValue to clone 6525 // the lane zero values for uniforms rather than asserting. 6526 for (Use &U : I->operands()) 6527 if (auto *J = dyn_cast<Instruction>(U.get())) 6528 if (isUniformAfterVectorization(J, VF)) 6529 return false; 6530 6531 // Otherwise, we can scalarize the instruction. 6532 return true; 6533 }; 6534 6535 // Compute the expected cost discount from scalarizing the entire expression 6536 // feeding the predicated instruction. We currently only consider expressions 6537 // that are single-use instruction chains. 6538 Worklist.push_back(PredInst); 6539 while (!Worklist.empty()) { 6540 Instruction *I = Worklist.pop_back_val(); 6541 6542 // If we've already analyzed the instruction, there's nothing to do. 6543 if (ScalarCosts.find(I) != ScalarCosts.end()) 6544 continue; 6545 6546 // Compute the cost of the vector instruction. Note that this cost already 6547 // includes the scalarization overhead of the predicated instruction. 6548 InstructionCost VectorCost = getInstructionCost(I, VF).first; 6549 6550 // Compute the cost of the scalarized instruction. This cost is the cost of 6551 // the instruction as if it wasn't if-converted and instead remained in the 6552 // predicated block. We will scale this cost by block probability after 6553 // computing the scalarization overhead. 6554 InstructionCost ScalarCost = 6555 VF.getFixedValue() * 6556 getInstructionCost(I, ElementCount::getFixed(1)).first; 6557 6558 // Compute the scalarization overhead of needed insertelement instructions 6559 // and phi nodes. 6560 if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) { 6561 ScalarCost += TTI.getScalarizationOverhead( 6562 cast<VectorType>(ToVectorTy(I->getType(), VF)), 6563 APInt::getAllOnes(VF.getFixedValue()), true, false); 6564 ScalarCost += 6565 VF.getFixedValue() * 6566 TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput); 6567 } 6568 6569 // Compute the scalarization overhead of needed extractelement 6570 // instructions. For each of the instruction's operands, if the operand can 6571 // be scalarized, add it to the worklist; otherwise, account for the 6572 // overhead. 6573 for (Use &U : I->operands()) 6574 if (auto *J = dyn_cast<Instruction>(U.get())) { 6575 assert(VectorType::isValidElementType(J->getType()) && 6576 "Instruction has non-scalar type"); 6577 if (canBeScalarized(J)) 6578 Worklist.push_back(J); 6579 else if (needsExtract(J, VF)) { 6580 ScalarCost += TTI.getScalarizationOverhead( 6581 cast<VectorType>(ToVectorTy(J->getType(), VF)), 6582 APInt::getAllOnes(VF.getFixedValue()), false, true); 6583 } 6584 } 6585 6586 // Scale the total scalar cost by block probability. 6587 ScalarCost /= getReciprocalPredBlockProb(); 6588 6589 // Compute the discount. A non-negative discount means the vector version 6590 // of the instruction costs more, and scalarizing would be beneficial. 6591 Discount += VectorCost - ScalarCost; 6592 ScalarCosts[I] = ScalarCost; 6593 } 6594 6595 return *Discount.getValue(); 6596 } 6597 6598 LoopVectorizationCostModel::VectorizationCostTy 6599 LoopVectorizationCostModel::expectedCost( 6600 ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) { 6601 VectorizationCostTy Cost; 6602 6603 // For each block. 6604 for (BasicBlock *BB : TheLoop->blocks()) { 6605 VectorizationCostTy BlockCost; 6606 6607 // For each instruction in the old loop. 6608 for (Instruction &I : BB->instructionsWithoutDebug()) { 6609 // Skip ignored values. 
6610 if (ValuesToIgnore.count(&I) || 6611 (VF.isVector() && VecValuesToIgnore.count(&I))) 6612 continue; 6613 6614 VectorizationCostTy C = getInstructionCost(&I, VF); 6615 6616 // Check if we should override the cost. 6617 if (C.first.isValid() && 6618 ForceTargetInstructionCost.getNumOccurrences() > 0) 6619 C.first = InstructionCost(ForceTargetInstructionCost); 6620 6621 // Keep a list of instructions with invalid costs. 6622 if (Invalid && !C.first.isValid()) 6623 Invalid->emplace_back(&I, VF); 6624 6625 BlockCost.first += C.first; 6626 BlockCost.second |= C.second; 6627 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first 6628 << " for VF " << VF << " For instruction: " << I 6629 << '\n'); 6630 } 6631 6632 // If we are vectorizing a predicated block, it will have been 6633 // if-converted. This means that the block's instructions (aside from 6634 // stores and instructions that may divide by zero) will now be 6635 // unconditionally executed. For the scalar case, we may not always execute 6636 // the predicated block, if it is an if-else block. Thus, scale the block's 6637 // cost by the probability of executing it. blockNeedsPredication from 6638 // Legal is used so as to not include all blocks in tail folded loops. 6639 if (VF.isScalar() && Legal->blockNeedsPredication(BB)) 6640 BlockCost.first /= getReciprocalPredBlockProb(); 6641 6642 Cost.first += BlockCost.first; 6643 Cost.second |= BlockCost.second; 6644 } 6645 6646 return Cost; 6647 } 6648 6649 /// Gets Address Access SCEV after verifying that the access pattern 6650 /// is loop invariant except the induction variable dependence. 6651 /// 6652 /// This SCEV can be sent to the Target in order to estimate the address 6653 /// calculation cost. 6654 static const SCEV *getAddressAccessSCEV( 6655 Value *Ptr, 6656 LoopVectorizationLegality *Legal, 6657 PredicatedScalarEvolution &PSE, 6658 const Loop *TheLoop) { 6659 6660 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); 6661 if (!Gep) 6662 return nullptr; 6663 6664 // We are looking for a gep with all loop invariant indices except for one 6665 // which should be an induction variable. 6666 auto SE = PSE.getSE(); 6667 unsigned NumOperands = Gep->getNumOperands(); 6668 for (unsigned i = 1; i < NumOperands; ++i) { 6669 Value *Opd = Gep->getOperand(i); 6670 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 6671 !Legal->isInductionVariable(Opd)) 6672 return nullptr; 6673 } 6674 6675 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. 6676 return PSE.getSCEV(Ptr); 6677 } 6678 6679 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { 6680 return Legal->hasStride(I->getOperand(0)) || 6681 Legal->hasStride(I->getOperand(1)); 6682 } 6683 6684 InstructionCost 6685 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, 6686 ElementCount VF) { 6687 assert(VF.isVector() && 6688 "Scalarization cost of instruction implies vectorization."); 6689 if (VF.isScalable()) 6690 return InstructionCost::getInvalid(); 6691 6692 Type *ValTy = getLoadStoreType(I); 6693 auto SE = PSE.getSE(); 6694 6695 unsigned AS = getLoadStoreAddressSpace(I); 6696 Value *Ptr = getLoadStorePointerOperand(I); 6697 Type *PtrTy = ToVectorTy(Ptr->getType(), VF); 6698 // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost` 6699 // that it is being called from this specific place. 
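  // Rough shape of the cost computed below (symbolic, no target data):
  //   Cost = VF * getAddressComputationCost(PtrTy, SE, PtrSCEV)
  //        + VF * getMemoryOpCost(opcode, scalar type, alignment, AS)
  //        + scalarization overhead (lane inserts/extracts);
  // for predicated accesses the sum is further scaled by the probability of
  // executing the block and extended with i1 lane extracts plus a branch
  // cost.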
6700 6701 // Figure out whether the access is strided and get the stride value 6702 // if it's known in compile time 6703 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); 6704 6705 // Get the cost of the scalar memory instruction and address computation. 6706 InstructionCost Cost = 6707 VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); 6708 6709 // Don't pass *I here, since it is scalar but will actually be part of a 6710 // vectorized loop where the user of it is a vectorized instruction. 6711 const Align Alignment = getLoadStoreAlignment(I); 6712 Cost += VF.getKnownMinValue() * 6713 TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, 6714 AS, TTI::TCK_RecipThroughput); 6715 6716 // Get the overhead of the extractelement and insertelement instructions 6717 // we might create due to scalarization. 6718 Cost += getScalarizationOverhead(I, VF); 6719 6720 // If we have a predicated load/store, it will need extra i1 extracts and 6721 // conditional branches, but may not be executed for each vector lane. Scale 6722 // the cost by the probability of executing the predicated block. 6723 if (isPredicatedInst(I, VF)) { 6724 Cost /= getReciprocalPredBlockProb(); 6725 6726 // Add the cost of an i1 extract and a branch 6727 auto *Vec_i1Ty = 6728 VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF); 6729 Cost += TTI.getScalarizationOverhead( 6730 Vec_i1Ty, APInt::getAllOnes(VF.getKnownMinValue()), 6731 /*Insert=*/false, /*Extract=*/true); 6732 Cost += TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput); 6733 6734 if (useEmulatedMaskMemRefHack(I, VF)) 6735 // Artificially setting to a high enough value to practically disable 6736 // vectorization with such operations. 6737 Cost = 3000000; 6738 } 6739 6740 return Cost; 6741 } 6742 6743 InstructionCost 6744 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, 6745 ElementCount VF) { 6746 Type *ValTy = getLoadStoreType(I); 6747 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6748 Value *Ptr = getLoadStorePointerOperand(I); 6749 unsigned AS = getLoadStoreAddressSpace(I); 6750 int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr); 6751 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6752 6753 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6754 "Stride should be 1 or -1 for consecutive memory access"); 6755 const Align Alignment = getLoadStoreAlignment(I); 6756 InstructionCost Cost = 0; 6757 if (Legal->isMaskRequired(I)) 6758 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6759 CostKind); 6760 else 6761 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6762 CostKind, I); 6763 6764 bool Reverse = ConsecutiveStride < 0; 6765 if (Reverse) 6766 Cost += 6767 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0); 6768 return Cost; 6769 } 6770 6771 InstructionCost 6772 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, 6773 ElementCount VF) { 6774 assert(Legal->isUniformMemOp(*I)); 6775 6776 Type *ValTy = getLoadStoreType(I); 6777 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6778 const Align Alignment = getLoadStoreAlignment(I); 6779 unsigned AS = getLoadStoreAddressSpace(I); 6780 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6781 if (isa<LoadInst>(I)) { 6782 return TTI.getAddressComputationCost(ValTy) + 6783 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS, 6784 CostKind) + 6785 
TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); 6786 } 6787 StoreInst *SI = cast<StoreInst>(I); 6788 6789 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); 6790 return TTI.getAddressComputationCost(ValTy) + 6791 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, 6792 CostKind) + 6793 (isLoopInvariantStoreValue 6794 ? 0 6795 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, 6796 VF.getKnownMinValue() - 1)); 6797 } 6798 6799 InstructionCost 6800 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, 6801 ElementCount VF) { 6802 Type *ValTy = getLoadStoreType(I); 6803 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6804 const Align Alignment = getLoadStoreAlignment(I); 6805 const Value *Ptr = getLoadStorePointerOperand(I); 6806 6807 return TTI.getAddressComputationCost(VectorTy) + 6808 TTI.getGatherScatterOpCost( 6809 I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment, 6810 TargetTransformInfo::TCK_RecipThroughput, I); 6811 } 6812 6813 InstructionCost 6814 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, 6815 ElementCount VF) { 6816 // TODO: Once we have support for interleaving with scalable vectors 6817 // we can calculate the cost properly here. 6818 if (VF.isScalable()) 6819 return InstructionCost::getInvalid(); 6820 6821 Type *ValTy = getLoadStoreType(I); 6822 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6823 unsigned AS = getLoadStoreAddressSpace(I); 6824 6825 auto Group = getInterleavedAccessGroup(I); 6826 assert(Group && "Fail to get an interleaved access group."); 6827 6828 unsigned InterleaveFactor = Group->getFactor(); 6829 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); 6830 6831 // Holds the indices of existing members in the interleaved group. 6832 SmallVector<unsigned, 4> Indices; 6833 for (unsigned IF = 0; IF < InterleaveFactor; IF++) 6834 if (Group->getMember(IF)) 6835 Indices.push_back(IF); 6836 6837 // Calculate the cost of the whole interleaved group. 6838 bool UseMaskForGaps = 6839 (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) || 6840 (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor())); 6841 InstructionCost Cost = TTI.getInterleavedMemoryOpCost( 6842 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(), 6843 AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps); 6844 6845 if (Group->isReverse()) { 6846 // TODO: Add support for reversed masked interleaved access. 6847 assert(!Legal->isMaskRequired(I) && 6848 "Reverse masked interleaved access not supported."); 6849 Cost += 6850 Group->getNumMembers() * 6851 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0); 6852 } 6853 return Cost; 6854 } 6855 6856 Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost( 6857 Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) { 6858 using namespace llvm::PatternMatch; 6859 // Early exit for no inloop reductions 6860 if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty)) 6861 return None; 6862 auto *VectorTy = cast<VectorType>(Ty); 6863 6864 // We are looking for a pattern of, and finding the minimal acceptable cost: 6865 // reduce(mul(ext(A), ext(B))) or 6866 // reduce(mul(A, B)) or 6867 // reduce(ext(A)) or 6868 // reduce(A). 6869 // The basic idea is that we walk down the tree to do that, finding the root 6870 // reduction instruction in InLoopReductionImmediateChains. 
  // From there we find the pattern of mul/ext and test the cost of the entire
  // pattern vs the cost of the components. If the reduction cost is lower,
  // then we return it for the reduction instruction and 0 for the other
  // instructions in the pattern. If it is not, we return None, indicating
  // that the original cost modelling should be used.
  Instruction *RetI = I;
  if (match(RetI, m_ZExtOrSExt(m_Value()))) {
    if (!RetI->hasOneUser())
      return None;
    RetI = RetI->user_back();
  }
  if (match(RetI, m_Mul(m_Value(), m_Value())) &&
      RetI->user_back()->getOpcode() == Instruction::Add) {
    if (!RetI->hasOneUser())
      return None;
    RetI = RetI->user_back();
  }

  // Test if the found instruction is a reduction, and if not return None to
  // tell the parent to use the original cost modelling.
  if (!InLoopReductionImmediateChains.count(RetI))
    return None;

  // Find the reduction this chain is a part of and calculate the basic cost of
  // the reduction on its own.
  Instruction *LastChain = InLoopReductionImmediateChains[RetI];
  Instruction *ReductionPhi = LastChain;
  while (!isa<PHINode>(ReductionPhi))
    ReductionPhi = InLoopReductionImmediateChains[ReductionPhi];

  const RecurrenceDescriptor &RdxDesc =
      Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second;

  InstructionCost BaseCost = TTI.getArithmeticReductionCost(
      RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind);

  // For a call to the llvm.fmuladd intrinsic we need to add the cost of a
  // normal fmul instruction to the cost of the fadd reduction.
  if (RdxDesc.getRecurrenceKind() == RecurKind::FMulAdd)
    BaseCost +=
        TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind);

  // If we're using ordered reductions then we can just return the base cost
  // here, since getArithmeticReductionCost calculates the full ordered
  // reduction cost when FP reassociation is not allowed.
  if (useOrderedReductions(RdxDesc))
    return BaseCost;

  // Get the operand that was not the reduction chain and match it to one of
  // the patterns, returning the better cost if it is found.
  Instruction *RedOp = RetI->getOperand(1) == LastChain
                           ? dyn_cast<Instruction>(RetI->getOperand(0))
                           : dyn_cast<Instruction>(RetI->getOperand(1));

  VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy);

  Instruction *Op0, *Op1;
  if (RedOp &&
      match(RedOp,
            m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) &&
      match(Op0, m_ZExtOrSExt(m_Value())) &&
      Op0->getOpcode() == Op1->getOpcode() &&
      Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
      !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) &&
      (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {

    // Matched reduce(ext(mul(ext(A), ext(B)))).
    // Note that the extend opcodes need to all match, or if A==B they will
    // have been converted to zext(mul(sext(A), sext(A))) as it is known
    // positive, which is equally fine.
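    // Purely illustrative example (hypothetical IR, not from a particular
    // test): with an i32 accumulator fed from i8 inputs, the matched shape
    // could look like
    //   %a.ext = sext i8 %a to i16
    //   %b.ext = sext i8 %b to i16
    //   %mul   = mul i16 %a.ext, %b.ext
    //   %m.ext = sext i16 %mul to i32
    //   %add   = add i32 %m.ext, %acc
    // where %add is the in-loop reduction. If the target reports a cheaper
    // extended-add reduction (e.g. a dot-product style operation) than the
    // sum of the separate ext/mul costs computed below, that cost is returned
    // for the reduction instruction and 0 for the rest of the pattern.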
6941 bool IsUnsigned = isa<ZExtInst>(Op0); 6942 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy); 6943 auto *MulType = VectorType::get(Op0->getType(), VectorTy); 6944 6945 InstructionCost ExtCost = 6946 TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType, 6947 TTI::CastContextHint::None, CostKind, Op0); 6948 InstructionCost MulCost = 6949 TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind); 6950 InstructionCost Ext2Cost = 6951 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType, 6952 TTI::CastContextHint::None, CostKind, RedOp); 6953 6954 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 6955 /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 6956 CostKind); 6957 6958 if (RedCost.isValid() && 6959 RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost) 6960 return I == RetI ? RedCost : 0; 6961 } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) && 6962 !TheLoop->isLoopInvariant(RedOp)) { 6963 // Matched reduce(ext(A)) 6964 bool IsUnsigned = isa<ZExtInst>(RedOp); 6965 auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy); 6966 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 6967 /*IsMLA=*/false, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 6968 CostKind); 6969 6970 InstructionCost ExtCost = 6971 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType, 6972 TTI::CastContextHint::None, CostKind, RedOp); 6973 if (RedCost.isValid() && RedCost < BaseCost + ExtCost) 6974 return I == RetI ? RedCost : 0; 6975 } else if (RedOp && 6976 match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) { 6977 if (match(Op0, m_ZExtOrSExt(m_Value())) && 6978 Op0->getOpcode() == Op1->getOpcode() && 6979 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) { 6980 bool IsUnsigned = isa<ZExtInst>(Op0); 6981 Type *Op0Ty = Op0->getOperand(0)->getType(); 6982 Type *Op1Ty = Op1->getOperand(0)->getType(); 6983 Type *LargestOpTy = 6984 Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty 6985 : Op0Ty; 6986 auto *ExtType = VectorType::get(LargestOpTy, VectorTy); 6987 6988 // Matched reduce(mul(ext(A), ext(B))), where the two ext may be of 6989 // different sizes. We take the largest type as the ext to reduce, and add 6990 // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))). 6991 InstructionCost ExtCost0 = TTI.getCastInstrCost( 6992 Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy), 6993 TTI::CastContextHint::None, CostKind, Op0); 6994 InstructionCost ExtCost1 = TTI.getCastInstrCost( 6995 Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy), 6996 TTI::CastContextHint::None, CostKind, Op1); 6997 InstructionCost MulCost = 6998 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 6999 7000 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 7001 /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 7002 CostKind); 7003 InstructionCost ExtraExtCost = 0; 7004 if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) { 7005 Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1; 7006 ExtraExtCost = TTI.getCastInstrCost( 7007 ExtraExtOp->getOpcode(), ExtType, 7008 VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy), 7009 TTI::CastContextHint::None, CostKind, ExtraExtOp); 7010 } 7011 7012 if (RedCost.isValid() && 7013 (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost)) 7014 return I == RetI ? 
RedCost : 0; 7015 } else if (!match(I, m_ZExtOrSExt(m_Value()))) { 7016 // Matched reduce(mul()) 7017 InstructionCost MulCost = 7018 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 7019 7020 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 7021 /*IsMLA=*/true, true, RdxDesc.getRecurrenceType(), VectorTy, 7022 CostKind); 7023 7024 if (RedCost.isValid() && RedCost < MulCost + BaseCost) 7025 return I == RetI ? RedCost : 0; 7026 } 7027 } 7028 7029 return I == RetI ? Optional<InstructionCost>(BaseCost) : None; 7030 } 7031 7032 InstructionCost 7033 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, 7034 ElementCount VF) { 7035 // Calculate scalar cost only. Vectorization cost should be ready at this 7036 // moment. 7037 if (VF.isScalar()) { 7038 Type *ValTy = getLoadStoreType(I); 7039 const Align Alignment = getLoadStoreAlignment(I); 7040 unsigned AS = getLoadStoreAddressSpace(I); 7041 7042 return TTI.getAddressComputationCost(ValTy) + 7043 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, 7044 TTI::TCK_RecipThroughput, I); 7045 } 7046 return getWideningCost(I, VF); 7047 } 7048 7049 LoopVectorizationCostModel::VectorizationCostTy 7050 LoopVectorizationCostModel::getInstructionCost(Instruction *I, 7051 ElementCount VF) { 7052 // If we know that this instruction will remain uniform, check the cost of 7053 // the scalar version. 7054 if (isUniformAfterVectorization(I, VF)) 7055 VF = ElementCount::getFixed(1); 7056 7057 if (VF.isVector() && isProfitableToScalarize(I, VF)) 7058 return VectorizationCostTy(InstsToScalarize[VF][I], false); 7059 7060 // Forced scalars do not have any scalarization overhead. 7061 auto ForcedScalar = ForcedScalars.find(VF); 7062 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) { 7063 auto InstSet = ForcedScalar->second; 7064 if (InstSet.count(I)) 7065 return VectorizationCostTy( 7066 (getInstructionCost(I, ElementCount::getFixed(1)).first * 7067 VF.getKnownMinValue()), 7068 false); 7069 } 7070 7071 Type *VectorTy; 7072 InstructionCost C = getInstructionCost(I, VF, VectorTy); 7073 7074 bool TypeNotScalarized = false; 7075 if (VF.isVector() && VectorTy->isVectorTy()) { 7076 unsigned NumParts = TTI.getNumberOfParts(VectorTy); 7077 if (NumParts) 7078 TypeNotScalarized = NumParts < VF.getKnownMinValue(); 7079 else 7080 C = InstructionCost::getInvalid(); 7081 } 7082 return VectorizationCostTy(C, TypeNotScalarized); 7083 } 7084 7085 InstructionCost 7086 LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, 7087 ElementCount VF) const { 7088 7089 // There is no mechanism yet to create a scalable scalarization loop, 7090 // so this is currently Invalid. 7091 if (VF.isScalable()) 7092 return InstructionCost::getInvalid(); 7093 7094 if (VF.isScalar()) 7095 return 0; 7096 7097 InstructionCost Cost = 0; 7098 Type *RetTy = ToVectorTy(I->getType(), VF); 7099 if (!RetTy->isVoidTy() && 7100 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) 7101 Cost += TTI.getScalarizationOverhead( 7102 cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()), true, 7103 false); 7104 7105 // Some targets keep addresses scalar. 7106 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) 7107 return Cost; 7108 7109 // Some targets support efficient element stores. 7110 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore()) 7111 return Cost; 7112 7113 // Collect operands to consider. 7114 CallInst *CI = dyn_cast<CallInst>(I); 7115 Instruction::op_range Ops = CI ? 
CI->args() : I->operands(); 7116 7117 // Skip operands that do not require extraction/scalarization and do not incur 7118 // any overhead. 7119 SmallVector<Type *> Tys; 7120 for (auto *V : filterExtractingOperands(Ops, VF)) 7121 Tys.push_back(MaybeVectorizeType(V->getType(), VF)); 7122 return Cost + TTI.getOperandsScalarizationOverhead( 7123 filterExtractingOperands(Ops, VF), Tys); 7124 } 7125 7126 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { 7127 if (VF.isScalar()) 7128 return; 7129 NumPredStores = 0; 7130 for (BasicBlock *BB : TheLoop->blocks()) { 7131 // For each instruction in the old loop. 7132 for (Instruction &I : *BB) { 7133 Value *Ptr = getLoadStorePointerOperand(&I); 7134 if (!Ptr) 7135 continue; 7136 7137 // TODO: We should generate better code and update the cost model for 7138 // predicated uniform stores. Today they are treated as any other 7139 // predicated store (see added test cases in 7140 // invariant-store-vectorization.ll). 7141 if (isa<StoreInst>(&I) && isScalarWithPredication(&I, VF)) 7142 NumPredStores++; 7143 7144 if (Legal->isUniformMemOp(I)) { 7145 // TODO: Avoid replicating loads and stores instead of 7146 // relying on instcombine to remove them. 7147 // Load: Scalar load + broadcast 7148 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract 7149 InstructionCost Cost; 7150 if (isa<StoreInst>(&I) && VF.isScalable() && 7151 isLegalGatherOrScatter(&I, VF)) { 7152 Cost = getGatherScatterCost(&I, VF); 7153 setWideningDecision(&I, VF, CM_GatherScatter, Cost); 7154 } else { 7155 assert((isa<LoadInst>(&I) || !VF.isScalable()) && 7156 "Cannot yet scalarize uniform stores"); 7157 Cost = getUniformMemOpCost(&I, VF); 7158 setWideningDecision(&I, VF, CM_Scalarize, Cost); 7159 } 7160 continue; 7161 } 7162 7163 // We assume that widening is the best solution when possible. 7164 if (memoryInstructionCanBeWidened(&I, VF)) { 7165 InstructionCost Cost = getConsecutiveMemOpCost(&I, VF); 7166 int ConsecutiveStride = Legal->isConsecutivePtr( 7167 getLoadStoreType(&I), getLoadStorePointerOperand(&I)); 7168 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 7169 "Expected consecutive stride."); 7170 InstWidening Decision = 7171 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; 7172 setWideningDecision(&I, VF, Decision, Cost); 7173 continue; 7174 } 7175 7176 // Choose between Interleaving, Gather/Scatter or Scalarization. 7177 InstructionCost InterleaveCost = InstructionCost::getInvalid(); 7178 unsigned NumAccesses = 1; 7179 if (isAccessInterleaved(&I)) { 7180 auto Group = getInterleavedAccessGroup(&I); 7181 assert(Group && "Fail to get an interleaved access group."); 7182 7183 // Make one decision for the whole group. 7184 if (getWideningDecision(&I, VF) != CM_Unknown) 7185 continue; 7186 7187 NumAccesses = Group->getNumMembers(); 7188 if (interleavedAccessCanBeWidened(&I, VF)) 7189 InterleaveCost = getInterleaveGroupCost(&I, VF); 7190 } 7191 7192 InstructionCost GatherScatterCost = 7193 isLegalGatherOrScatter(&I, VF) 7194 ? getGatherScatterCost(&I, VF) * NumAccesses 7195 : InstructionCost::getInvalid(); 7196 7197 InstructionCost ScalarizationCost = 7198 getMemInstScalarizationCost(&I, VF) * NumAccesses; 7199 7200 // Choose better solution for the current VF, 7201 // write down this decision and use it during vectorization. 
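      // Illustrative numbers only: if for this VF an interleave group of two
      // loads costs 6 in total, gather/scatter costs 16 for the two accesses
      // and scalarization costs 20, the group is assigned CM_Interleave with
      // cost 6. Ties between interleaving and gather/scatter favour
      // interleaving, and scalarization is the fallback when neither
      // alternative is cheaper.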
      InstructionCost Cost;
      InstWidening Decision;
      if (InterleaveCost <= GatherScatterCost &&
          InterleaveCost < ScalarizationCost) {
        Decision = CM_Interleave;
        Cost = InterleaveCost;
      } else if (GatherScatterCost < ScalarizationCost) {
        Decision = CM_GatherScatter;
        Cost = GatherScatterCost;
      } else {
        Decision = CM_Scalarize;
        Cost = ScalarizationCost;
      }
      // If the instruction belongs to an interleave group, the whole group
      // receives the same decision. The whole group receives the cost, but
      // the cost will actually be assigned to one instruction.
      if (auto Group = getInterleavedAccessGroup(&I))
        setWideningDecision(Group, VF, Decision, Cost);
      else
        setWideningDecision(&I, VF, Decision, Cost);
    }
  }

  // Make sure that any load of address and any other address computation
  // remains scalar unless there is gather/scatter support. This avoids
  // inevitable extracts into address registers, and also has the benefit of
  // activating LSR more, since that pass can't optimize vectorized
  // addresses.
  if (TTI.prefersVectorizedAddressing())
    return;

  // Start with all scalar pointer uses.
  SmallPtrSet<Instruction *, 8> AddrDefs;
  for (BasicBlock *BB : TheLoop->blocks())
    for (Instruction &I : *BB) {
      Instruction *PtrDef =
          dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
      if (PtrDef && TheLoop->contains(PtrDef) &&
          getWideningDecision(&I, VF) != CM_GatherScatter)
        AddrDefs.insert(PtrDef);
    }

  // Add all instructions used to generate the addresses.
  SmallVector<Instruction *, 4> Worklist;
  append_range(Worklist, AddrDefs);
  while (!Worklist.empty()) {
    Instruction *I = Worklist.pop_back_val();
    for (auto &Op : I->operands())
      if (auto *InstOp = dyn_cast<Instruction>(Op))
        if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
            AddrDefs.insert(InstOp).second)
          Worklist.push_back(InstOp);
  }

  for (auto *I : AddrDefs) {
    if (isa<LoadInst>(I)) {
      // Setting the desired widening decision should ideally be handled by
      // cost functions, but since this involves the task of finding out if
      // the loaded register is involved in an address computation, it is
      // instead changed here when we know this is the case.
      InstWidening Decision = getWideningDecision(I, VF);
      if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
        // Scalarize a widened load of address.
        setWideningDecision(
            I, VF, CM_Scalarize,
            (VF.getKnownMinValue() *
             getMemoryInstructionCost(I, ElementCount::getFixed(1))));
      else if (auto Group = getInterleavedAccessGroup(I)) {
        // Scalarize an interleave group of address loads.
        for (unsigned I = 0; I < Group->getFactor(); ++I) {
          if (Instruction *Member = Group->getMember(I))
            setWideningDecision(
                Member, VF, CM_Scalarize,
                (VF.getKnownMinValue() *
                 getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
        }
      }
    } else
      // Make sure I gets scalarized and is given a cost estimate without
      // scalarization overhead.
7282 ForcedScalars[VF].insert(I); 7283 } 7284 } 7285 7286 InstructionCost 7287 LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, 7288 Type *&VectorTy) { 7289 Type *RetTy = I->getType(); 7290 if (canTruncateToMinimalBitwidth(I, VF)) 7291 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); 7292 auto SE = PSE.getSE(); 7293 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 7294 7295 auto hasSingleCopyAfterVectorization = [this](Instruction *I, 7296 ElementCount VF) -> bool { 7297 if (VF.isScalar()) 7298 return true; 7299 7300 auto Scalarized = InstsToScalarize.find(VF); 7301 assert(Scalarized != InstsToScalarize.end() && 7302 "VF not yet analyzed for scalarization profitability"); 7303 return !Scalarized->second.count(I) && 7304 llvm::all_of(I->users(), [&](User *U) { 7305 auto *UI = cast<Instruction>(U); 7306 return !Scalarized->second.count(UI); 7307 }); 7308 }; 7309 (void) hasSingleCopyAfterVectorization; 7310 7311 if (isScalarAfterVectorization(I, VF)) { 7312 // With the exception of GEPs and PHIs, after scalarization there should 7313 // only be one copy of the instruction generated in the loop. This is 7314 // because the VF is either 1, or any instructions that need scalarizing 7315 // have already been dealt with by the the time we get here. As a result, 7316 // it means we don't have to multiply the instruction cost by VF. 7317 assert(I->getOpcode() == Instruction::GetElementPtr || 7318 I->getOpcode() == Instruction::PHI || 7319 (I->getOpcode() == Instruction::BitCast && 7320 I->getType()->isPointerTy()) || 7321 hasSingleCopyAfterVectorization(I, VF)); 7322 VectorTy = RetTy; 7323 } else 7324 VectorTy = ToVectorTy(RetTy, VF); 7325 7326 // TODO: We need to estimate the cost of intrinsic calls. 7327 switch (I->getOpcode()) { 7328 case Instruction::GetElementPtr: 7329 // We mark this instruction as zero-cost because the cost of GEPs in 7330 // vectorized code depends on whether the corresponding memory instruction 7331 // is scalarized or not. Therefore, we handle GEPs with the memory 7332 // instruction cost. 7333 return 0; 7334 case Instruction::Br: { 7335 // In cases of scalarized and predicated instructions, there will be VF 7336 // predicated blocks in the vectorized loop. Each branch around these 7337 // blocks requires also an extract of its vector compare i1 element. 7338 bool ScalarPredicatedBB = false; 7339 BranchInst *BI = cast<BranchInst>(I); 7340 if (VF.isVector() && BI->isConditional() && 7341 (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) || 7342 PredicatedBBsAfterVectorization.count(BI->getSuccessor(1)))) 7343 ScalarPredicatedBB = true; 7344 7345 if (ScalarPredicatedBB) { 7346 // Not possible to scalarize scalable vector with predicated instructions. 7347 if (VF.isScalable()) 7348 return InstructionCost::getInvalid(); 7349 // Return cost for branches around scalarized and predicated blocks. 7350 auto *Vec_i1Ty = 7351 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); 7352 return ( 7353 TTI.getScalarizationOverhead( 7354 Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()), false, true) + 7355 (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue())); 7356 } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar()) 7357 // The back-edge branch will remain, as will all scalar branches. 7358 return TTI.getCFInstrCost(Instruction::Br, CostKind); 7359 else 7360 // This branch will be eliminated by if-conversion. 
7361 return 0; 7362 // Note: We currently assume zero cost for an unconditional branch inside 7363 // a predicated block since it will become a fall-through, although we 7364 // may decide in the future to call TTI for all branches. 7365 } 7366 case Instruction::PHI: { 7367 auto *Phi = cast<PHINode>(I); 7368 7369 // First-order recurrences are replaced by vector shuffles inside the loop. 7370 // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. 7371 if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi)) 7372 return TTI.getShuffleCost( 7373 TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy), 7374 None, VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1)); 7375 7376 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 7377 // converted into select instructions. We require N - 1 selects per phi 7378 // node, where N is the number of incoming values. 7379 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) 7380 return (Phi->getNumIncomingValues() - 1) * 7381 TTI.getCmpSelInstrCost( 7382 Instruction::Select, ToVectorTy(Phi->getType(), VF), 7383 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF), 7384 CmpInst::BAD_ICMP_PREDICATE, CostKind); 7385 7386 return TTI.getCFInstrCost(Instruction::PHI, CostKind); 7387 } 7388 case Instruction::UDiv: 7389 case Instruction::SDiv: 7390 case Instruction::URem: 7391 case Instruction::SRem: 7392 // If we have a predicated instruction, it may not be executed for each 7393 // vector lane. Get the scalarization cost and scale this amount by the 7394 // probability of executing the predicated block. If the instruction is not 7395 // predicated, we fall through to the next case. 7396 if (VF.isVector() && isScalarWithPredication(I, VF)) { 7397 InstructionCost Cost = 0; 7398 7399 // These instructions have a non-void type, so account for the phi nodes 7400 // that we will create. This cost is likely to be zero. The phi node 7401 // cost, if any, should be scaled by the block probability because it 7402 // models a copy at the end of each predicated block. 7403 Cost += VF.getKnownMinValue() * 7404 TTI.getCFInstrCost(Instruction::PHI, CostKind); 7405 7406 // The cost of the non-predicated instruction. 7407 Cost += VF.getKnownMinValue() * 7408 TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind); 7409 7410 // The cost of insertelement and extractelement instructions needed for 7411 // scalarization. 7412 Cost += getScalarizationOverhead(I, VF); 7413 7414 // Scale the cost by the probability of executing the predicated blocks. 7415 // This assumes the predicated block for each vector lane is equally 7416 // likely. 7417 return Cost / getReciprocalPredBlockProb(); 7418 } 7419 LLVM_FALLTHROUGH; 7420 case Instruction::Add: 7421 case Instruction::FAdd: 7422 case Instruction::Sub: 7423 case Instruction::FSub: 7424 case Instruction::Mul: 7425 case Instruction::FMul: 7426 case Instruction::FDiv: 7427 case Instruction::FRem: 7428 case Instruction::Shl: 7429 case Instruction::LShr: 7430 case Instruction::AShr: 7431 case Instruction::And: 7432 case Instruction::Or: 7433 case Instruction::Xor: { 7434 // Since we will replace the stride by 1 the multiplication should go away. 
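    // For instance (illustrative only), a loop indexing A[i * Stride] with a
    // symbolic stride that has been versioned to the constant 1 no longer
    // needs the 'mul' feeding the GEP in the vectorized loop, so charging it
    // a cost here would be pessimistic.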
7435 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) 7436 return 0; 7437 7438 // Detect reduction patterns 7439 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7440 return *RedCost; 7441 7442 // Certain instructions can be cheaper to vectorize if they have a constant 7443 // second vector operand. One example of this are shifts on x86. 7444 Value *Op2 = I->getOperand(1); 7445 TargetTransformInfo::OperandValueProperties Op2VP; 7446 TargetTransformInfo::OperandValueKind Op2VK = 7447 TTI.getOperandInfo(Op2, Op2VP); 7448 if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) 7449 Op2VK = TargetTransformInfo::OK_UniformValue; 7450 7451 SmallVector<const Value *, 4> Operands(I->operand_values()); 7452 return TTI.getArithmeticInstrCost( 7453 I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, 7454 Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); 7455 } 7456 case Instruction::FNeg: { 7457 return TTI.getArithmeticInstrCost( 7458 I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, 7459 TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None, 7460 TargetTransformInfo::OP_None, I->getOperand(0), I); 7461 } 7462 case Instruction::Select: { 7463 SelectInst *SI = cast<SelectInst>(I); 7464 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 7465 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 7466 7467 const Value *Op0, *Op1; 7468 using namespace llvm::PatternMatch; 7469 if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) || 7470 match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) { 7471 // select x, y, false --> x & y 7472 // select x, true, y --> x | y 7473 TTI::OperandValueProperties Op1VP = TTI::OP_None; 7474 TTI::OperandValueProperties Op2VP = TTI::OP_None; 7475 TTI::OperandValueKind Op1VK = TTI::getOperandInfo(Op0, Op1VP); 7476 TTI::OperandValueKind Op2VK = TTI::getOperandInfo(Op1, Op2VP); 7477 assert(Op0->getType()->getScalarSizeInBits() == 1 && 7478 Op1->getType()->getScalarSizeInBits() == 1); 7479 7480 SmallVector<const Value *, 2> Operands{Op0, Op1}; 7481 return TTI.getArithmeticInstrCost( 7482 match(I, m_LogicalOr()) ? 
Instruction::Or : Instruction::And, VectorTy, 7483 CostKind, Op1VK, Op2VK, Op1VP, Op2VP, Operands, I); 7484 } 7485 7486 Type *CondTy = SI->getCondition()->getType(); 7487 if (!ScalarCond) 7488 CondTy = VectorType::get(CondTy, VF); 7489 7490 CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE; 7491 if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition())) 7492 Pred = Cmp->getPredicate(); 7493 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred, 7494 CostKind, I); 7495 } 7496 case Instruction::ICmp: 7497 case Instruction::FCmp: { 7498 Type *ValTy = I->getOperand(0)->getType(); 7499 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 7500 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 7501 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); 7502 VectorTy = ToVectorTy(ValTy, VF); 7503 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, 7504 cast<CmpInst>(I)->getPredicate(), CostKind, 7505 I); 7506 } 7507 case Instruction::Store: 7508 case Instruction::Load: { 7509 ElementCount Width = VF; 7510 if (Width.isVector()) { 7511 InstWidening Decision = getWideningDecision(I, Width); 7512 assert(Decision != CM_Unknown && 7513 "CM decision should be taken at this point"); 7514 if (Decision == CM_Scalarize) 7515 Width = ElementCount::getFixed(1); 7516 } 7517 VectorTy = ToVectorTy(getLoadStoreType(I), Width); 7518 return getMemoryInstructionCost(I, VF); 7519 } 7520 case Instruction::BitCast: 7521 if (I->getType()->isPointerTy()) 7522 return 0; 7523 LLVM_FALLTHROUGH; 7524 case Instruction::ZExt: 7525 case Instruction::SExt: 7526 case Instruction::FPToUI: 7527 case Instruction::FPToSI: 7528 case Instruction::FPExt: 7529 case Instruction::PtrToInt: 7530 case Instruction::IntToPtr: 7531 case Instruction::SIToFP: 7532 case Instruction::UIToFP: 7533 case Instruction::Trunc: 7534 case Instruction::FPTrunc: { 7535 // Computes the CastContextHint from a Load/Store instruction. 7536 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint { 7537 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 7538 "Expected a load or a store!"); 7539 7540 if (VF.isScalar() || !TheLoop->contains(I)) 7541 return TTI::CastContextHint::Normal; 7542 7543 switch (getWideningDecision(I, VF)) { 7544 case LoopVectorizationCostModel::CM_GatherScatter: 7545 return TTI::CastContextHint::GatherScatter; 7546 case LoopVectorizationCostModel::CM_Interleave: 7547 return TTI::CastContextHint::Interleave; 7548 case LoopVectorizationCostModel::CM_Scalarize: 7549 case LoopVectorizationCostModel::CM_Widen: 7550 return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked 7551 : TTI::CastContextHint::Normal; 7552 case LoopVectorizationCostModel::CM_Widen_Reverse: 7553 return TTI::CastContextHint::Reversed; 7554 case LoopVectorizationCostModel::CM_Unknown: 7555 llvm_unreachable("Instr did not go through cost modelling?"); 7556 } 7557 7558 llvm_unreachable("Unhandled case!"); 7559 }; 7560 7561 unsigned Opcode = I->getOpcode(); 7562 TTI::CastContextHint CCH = TTI::CastContextHint::None; 7563 // For Trunc, the context is the only user, which must be a StoreInst. 7564 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) { 7565 if (I->hasOneUse()) 7566 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin())) 7567 CCH = ComputeCCH(Store); 7568 } 7569 // For Z/Sext, the context is the operand, which must be a LoadInst. 
7570 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || 7571 Opcode == Instruction::FPExt) { 7572 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0))) 7573 CCH = ComputeCCH(Load); 7574 } 7575 7576 // We optimize the truncation of induction variables having constant 7577 // integer steps. The cost of these truncations is the same as the scalar 7578 // operation. 7579 if (isOptimizableIVTruncate(I, VF)) { 7580 auto *Trunc = cast<TruncInst>(I); 7581 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 7582 Trunc->getSrcTy(), CCH, CostKind, Trunc); 7583 } 7584 7585 // Detect reduction patterns 7586 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7587 return *RedCost; 7588 7589 Type *SrcScalarTy = I->getOperand(0)->getType(); 7590 Type *SrcVecTy = 7591 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; 7592 if (canTruncateToMinimalBitwidth(I, VF)) { 7593 // This cast is going to be shrunk. This may remove the cast or it might 7594 // turn it into slightly different cast. For example, if MinBW == 16, 7595 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". 7596 // 7597 // Calculate the modified src and dest types. 7598 Type *MinVecTy = VectorTy; 7599 if (Opcode == Instruction::Trunc) { 7600 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); 7601 VectorTy = 7602 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7603 } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) { 7604 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); 7605 VectorTy = 7606 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7607 } 7608 } 7609 7610 return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); 7611 } 7612 case Instruction::Call: { 7613 if (RecurrenceDescriptor::isFMulAddIntrinsic(I)) 7614 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7615 return *RedCost; 7616 bool NeedToScalarize; 7617 CallInst *CI = cast<CallInst>(I); 7618 InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize); 7619 if (getVectorIntrinsicIDForCall(CI, TLI)) { 7620 InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF); 7621 return std::min(CallCost, IntrinsicCost); 7622 } 7623 return CallCost; 7624 } 7625 case Instruction::ExtractValue: 7626 return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput); 7627 case Instruction::Alloca: 7628 // We cannot easily widen alloca to a scalable alloca, as 7629 // the result would need to be a vector of pointers. 7630 if (VF.isScalable()) 7631 return InstructionCost::getInvalid(); 7632 LLVM_FALLTHROUGH; 7633 default: 7634 // This opcode is unknown. Assume that it is the same as 'mul'. 7635 return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 7636 } // end of switch. 
7637 } 7638 7639 char LoopVectorize::ID = 0; 7640 7641 static const char lv_name[] = "Loop Vectorization"; 7642 7643 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 7644 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 7645 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) 7646 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 7647 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) 7648 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 7649 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 7650 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 7651 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 7652 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 7653 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) 7654 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) 7655 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 7656 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 7657 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy) 7658 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 7659 7660 namespace llvm { 7661 7662 Pass *createLoopVectorizePass() { return new LoopVectorize(); } 7663 7664 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced, 7665 bool VectorizeOnlyWhenForced) { 7666 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced); 7667 } 7668 7669 } // end namespace llvm 7670 7671 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { 7672 // Check if the pointer operand of a load or store instruction is 7673 // consecutive. 7674 if (auto *Ptr = getLoadStorePointerOperand(Inst)) 7675 return Legal->isConsecutivePtr(getLoadStoreType(Inst), Ptr); 7676 return false; 7677 } 7678 7679 void LoopVectorizationCostModel::collectValuesToIgnore() { 7680 // Ignore ephemeral values. 7681 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); 7682 7683 // Ignore type-promoting instructions we identified during reduction 7684 // detection. 7685 for (auto &Reduction : Legal->getReductionVars()) { 7686 const RecurrenceDescriptor &RedDes = Reduction.second; 7687 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); 7688 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7689 } 7690 // Ignore type-casting instructions we identified during induction 7691 // detection. 7692 for (auto &Induction : Legal->getInductionVars()) { 7693 const InductionDescriptor &IndDes = Induction.second; 7694 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 7695 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7696 } 7697 } 7698 7699 void LoopVectorizationCostModel::collectInLoopReductions() { 7700 for (auto &Reduction : Legal->getReductionVars()) { 7701 PHINode *Phi = Reduction.first; 7702 const RecurrenceDescriptor &RdxDesc = Reduction.second; 7703 7704 // We don't collect reductions that are type promoted (yet). 7705 if (RdxDesc.getRecurrenceType() != Phi->getType()) 7706 continue; 7707 7708 // If the target would prefer this reduction to happen "in-loop", then we 7709 // want to record it as such. 7710 unsigned Opcode = RdxDesc.getOpcode(); 7711 if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) && 7712 !TTI.preferInLoopReduction(Opcode, Phi->getType(), 7713 TargetTransformInfo::ReductionFlags())) 7714 continue; 7715 7716 // Check that we can correctly put the reductions into the loop, by 7717 // finding the chain of operations that leads from the phi to the loop 7718 // exit value. 
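    // For a plain add reduction this chain is short; e.g. (hypothetical IR)
    //   %sum      = phi i32 [ 0, %preheader ], [ %sum.next, %loop ]
    //   %sum.next = add i32 %sum, %val
    // yields the chain {%sum.next}, and the loop below records
    // InLoopReductionImmediateChains[%sum.next] = %sum, so the cost model can
    // later walk from any link of a chain back to its recurrence phi.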
7719 SmallVector<Instruction *, 4> ReductionOperations = 7720 RdxDesc.getReductionOpChain(Phi, TheLoop); 7721 bool InLoop = !ReductionOperations.empty(); 7722 if (InLoop) { 7723 InLoopReductionChains[Phi] = ReductionOperations; 7724 // Add the elements to InLoopReductionImmediateChains for cost modelling. 7725 Instruction *LastChain = Phi; 7726 for (auto *I : ReductionOperations) { 7727 InLoopReductionImmediateChains[I] = LastChain; 7728 LastChain = I; 7729 } 7730 } 7731 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop") 7732 << " reduction for phi: " << *Phi << "\n"); 7733 } 7734 } 7735 7736 // TODO: we could return a pair of values that specify the max VF and 7737 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of 7738 // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment 7739 // doesn't have a cost model that can choose which plan to execute if 7740 // more than one is generated. 7741 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits, 7742 LoopVectorizationCostModel &CM) { 7743 unsigned WidestType; 7744 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes(); 7745 return WidestVectorRegBits / WidestType; 7746 } 7747 7748 VectorizationFactor 7749 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { 7750 assert(!UserVF.isScalable() && "scalable vectors not yet supported"); 7751 ElementCount VF = UserVF; 7752 // Outer loop handling: They may require CFG and instruction level 7753 // transformations before even evaluating whether vectorization is profitable. 7754 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 7755 // the vectorization pipeline. 7756 if (!OrigLoop->isInnermost()) { 7757 // If the user doesn't provide a vectorization factor, determine a 7758 // reasonable one. 7759 if (UserVF.isZero()) { 7760 VF = ElementCount::getFixed(determineVPlanVF( 7761 TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector) 7762 .getFixedSize(), 7763 CM)); 7764 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n"); 7765 7766 // Make sure we have a VF > 1 for stress testing. 7767 if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) { 7768 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: " 7769 << "overriding computed VF.\n"); 7770 VF = ElementCount::getFixed(4); 7771 } 7772 } 7773 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 7774 assert(isPowerOf2_32(VF.getKnownMinValue()) && 7775 "VF needs to be a power of two"); 7776 LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "") 7777 << "VF " << VF << " to build VPlans.\n"); 7778 buildVPlans(VF, VF); 7779 7780 // For VPlan build stress testing, we bail out after VPlan construction. 7781 if (VPlanBuildStressTest) 7782 return VectorizationFactor::Disabled(); 7783 7784 return {VF, 0 /*Cost*/}; 7785 } 7786 7787 LLVM_DEBUG( 7788 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the " 7789 "VPlan-native path.\n"); 7790 return VectorizationFactor::Disabled(); 7791 } 7792 7793 Optional<VectorizationFactor> 7794 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { 7795 assert(OrigLoop->isInnermost() && "Inner loop expected."); 7796 FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC); 7797 if (!MaxFactors) // Cases that should not to be vectorized nor interleaved. 7798 return None; 7799 7800 // Invalidate interleave groups if all blocks of loop will be predicated. 
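  // For example, when the tail is folded by masking, a group of two adjacent
  // loads that would normally become one wide load plus shuffles now needs a
  // masked wide load; if the target lacks masked interleaved accesses the
  // groups cannot be honoured, so they (and every decision derived from them)
  // are dropped up front.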
7801 if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) && 7802 !useMaskedInterleavedAccesses(*TTI)) { 7803 LLVM_DEBUG( 7804 dbgs() 7805 << "LV: Invalidate all interleaved groups due to fold-tail by masking " 7806 "which requires masked-interleaved support.\n"); 7807 if (CM.InterleaveInfo.invalidateGroups()) 7808 // Invalidating interleave groups also requires invalidating all decisions 7809 // based on them, which includes widening decisions and uniform and scalar 7810 // values. 7811 CM.invalidateCostModelingDecisions(); 7812 } 7813 7814 ElementCount MaxUserVF = 7815 UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF; 7816 bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF); 7817 if (!UserVF.isZero() && UserVFIsLegal) { 7818 assert(isPowerOf2_32(UserVF.getKnownMinValue()) && 7819 "VF needs to be a power of two"); 7820 // Collect the instructions (and their associated costs) that will be more 7821 // profitable to scalarize. 7822 if (CM.selectUserVectorizationFactor(UserVF)) { 7823 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); 7824 CM.collectInLoopReductions(); 7825 buildVPlansWithVPRecipes(UserVF, UserVF); 7826 LLVM_DEBUG(printPlans(dbgs())); 7827 return {{UserVF, 0}}; 7828 } else 7829 reportVectorizationInfo("UserVF ignored because of invalid costs.", 7830 "InvalidCost", ORE, OrigLoop); 7831 } 7832 7833 // Populate the set of Vectorization Factor Candidates. 7834 ElementCountSet VFCandidates; 7835 for (auto VF = ElementCount::getFixed(1); 7836 ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2) 7837 VFCandidates.insert(VF); 7838 for (auto VF = ElementCount::getScalable(1); 7839 ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2) 7840 VFCandidates.insert(VF); 7841 7842 for (const auto &VF : VFCandidates) { 7843 // Collect Uniform and Scalar instructions after vectorization with VF. 7844 CM.collectUniformsAndScalars(VF); 7845 7846 // Collect the instructions (and their associated costs) that will be more 7847 // profitable to scalarize. 7848 if (VF.isVector()) 7849 CM.collectInstsToScalarize(VF); 7850 } 7851 7852 CM.collectInLoopReductions(); 7853 buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF); 7854 buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF); 7855 7856 LLVM_DEBUG(printPlans(dbgs())); 7857 if (!MaxFactors.hasVector()) 7858 return VectorizationFactor::Disabled(); 7859 7860 // Select the optimal vectorization factor. 7861 auto SelectedVF = CM.selectVectorizationFactor(VFCandidates); 7862 7863 // Check if it is profitable to vectorize with runtime checks. 
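  // Note the asymmetry between the two limits checked below: exceeding
  // PragmaVectorizeMemoryCheckThreshold always disables vectorization, while
  // exceeding VectorizerParams::RuntimeMemoryCheckThreshold only does so when
  // the hints do not allow reordering of memory operations. E.g. a loop whose
  // check count lies between the two thresholds is still vectorized if
  // Hints.allowReordering() is true.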
7864 unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks(); 7865 if (SelectedVF.Width.getKnownMinValue() > 1 && NumRuntimePointerChecks) { 7866 bool PragmaThresholdReached = 7867 NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold; 7868 bool ThresholdReached = 7869 NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold; 7870 if ((ThresholdReached && !Hints.allowReordering()) || 7871 PragmaThresholdReached) { 7872 ORE->emit([&]() { 7873 return OptimizationRemarkAnalysisAliasing( 7874 DEBUG_TYPE, "CantReorderMemOps", OrigLoop->getStartLoc(), 7875 OrigLoop->getHeader()) 7876 << "loop not vectorized: cannot prove it is safe to reorder " 7877 "memory operations"; 7878 }); 7879 LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n"); 7880 Hints.emitRemarkWithHints(); 7881 return VectorizationFactor::Disabled(); 7882 } 7883 } 7884 return SelectedVF; 7885 } 7886 7887 VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const { 7888 assert(count_if(VPlans, 7889 [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) == 7890 1 && 7891 "Best VF has not a single VPlan."); 7892 7893 for (const VPlanPtr &Plan : VPlans) { 7894 if (Plan->hasVF(VF)) 7895 return *Plan.get(); 7896 } 7897 llvm_unreachable("No plan found!"); 7898 } 7899 7900 static void AddRuntimeUnrollDisableMetaData(Loop *L) { 7901 SmallVector<Metadata *, 4> MDs; 7902 // Reserve first location for self reference to the LoopID metadata node. 7903 MDs.push_back(nullptr); 7904 bool IsUnrollMetadata = false; 7905 MDNode *LoopID = L->getLoopID(); 7906 if (LoopID) { 7907 // First find existing loop unrolling disable metadata. 7908 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { 7909 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); 7910 if (MD) { 7911 const auto *S = dyn_cast<MDString>(MD->getOperand(0)); 7912 IsUnrollMetadata = 7913 S && S->getString().startswith("llvm.loop.unroll.disable"); 7914 } 7915 MDs.push_back(LoopID->getOperand(i)); 7916 } 7917 } 7918 7919 if (!IsUnrollMetadata) { 7920 // Add runtime unroll disable metadata. 7921 LLVMContext &Context = L->getHeader()->getContext(); 7922 SmallVector<Metadata *, 1> DisableOperands; 7923 DisableOperands.push_back( 7924 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 7925 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 7926 MDs.push_back(DisableNode); 7927 MDNode *NewLoopID = MDNode::get(Context, MDs); 7928 // Set operand 0 to refer to the loop id itself. 7929 NewLoopID->replaceOperandWith(0, NewLoopID); 7930 L->setLoopID(NewLoopID); 7931 } 7932 } 7933 7934 void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF, 7935 VPlan &BestVPlan, 7936 InnerLoopVectorizer &ILV, 7937 DominatorTree *DT) { 7938 LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF << ", UF=" << BestUF 7939 << '\n'); 7940 7941 // Perform the actual loop transformation. 7942 7943 // 1. Create a new empty loop. Unlink the old loop and connect the new one. 7944 VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan}; 7945 Value *CanonicalIVStartValue; 7946 std::tie(State.CFG.PrevBB, CanonicalIVStartValue) = 7947 ILV.createVectorizedLoopSkeleton(); 7948 ILV.collectPoisonGeneratingRecipes(State); 7949 7950 ILV.printDebugTracesAtStart(); 7951 7952 //===------------------------------------------------===// 7953 // 7954 // Notice: any optimization or new instruction that go 7955 // into the code below should also be implemented in 7956 // the cost-model. 
7957 // 7958 //===------------------------------------------------===// 7959 7960 // 2. Copy and widen instructions from the old loop into the new loop. 7961 BestVPlan.prepareToExecute(ILV.getOrCreateTripCount(nullptr), 7962 ILV.getOrCreateVectorTripCount(nullptr), 7963 CanonicalIVStartValue, State); 7964 BestVPlan.execute(&State); 7965 7966 // Keep all loop hints from the original loop on the vector loop (we'll 7967 // replace the vectorizer-specific hints below). 7968 MDNode *OrigLoopID = OrigLoop->getLoopID(); 7969 7970 Optional<MDNode *> VectorizedLoopID = 7971 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 7972 LLVMLoopVectorizeFollowupVectorized}); 7973 7974 Loop *L = LI->getLoopFor(State.CFG.PrevBB); 7975 if (VectorizedLoopID.hasValue()) 7976 L->setLoopID(VectorizedLoopID.getValue()); 7977 else { 7978 // Keep all loop hints from the original loop on the vector loop (we'll 7979 // replace the vectorizer-specific hints below). 7980 if (MDNode *LID = OrigLoop->getLoopID()) 7981 L->setLoopID(LID); 7982 7983 LoopVectorizeHints Hints(L, true, *ORE); 7984 Hints.setAlreadyVectorized(); 7985 } 7986 // Disable runtime unrolling when vectorizing the epilogue loop. 7987 if (CanonicalIVStartValue) 7988 AddRuntimeUnrollDisableMetaData(L); 7989 7990 // 3. Fix the vectorized code: take care of header phi's, live-outs, 7991 // predication, updating analyses. 7992 ILV.fixVectorizedLoop(State); 7993 7994 ILV.printDebugTracesAtEnd(); 7995 } 7996 7997 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 7998 void LoopVectorizationPlanner::printPlans(raw_ostream &O) { 7999 for (const auto &Plan : VPlans) 8000 if (PrintVPlansInDotFormat) 8001 Plan->printDOT(O); 8002 else 8003 Plan->print(O); 8004 } 8005 #endif 8006 8007 void LoopVectorizationPlanner::collectTriviallyDeadInstructions( 8008 SmallPtrSetImpl<Instruction *> &DeadInstructions) { 8009 8010 // We create new control-flow for the vectorized loop, so the original exit 8011 // conditions will be dead after vectorization if it's only used by the 8012 // terminator 8013 SmallVector<BasicBlock*> ExitingBlocks; 8014 OrigLoop->getExitingBlocks(ExitingBlocks); 8015 for (auto *BB : ExitingBlocks) { 8016 auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0)); 8017 if (!Cmp || !Cmp->hasOneUse()) 8018 continue; 8019 8020 // TODO: we should introduce a getUniqueExitingBlocks on Loop 8021 if (!DeadInstructions.insert(Cmp).second) 8022 continue; 8023 8024 // The operands of the icmp is often a dead trunc, used by IndUpdate. 8025 // TODO: can recurse through operands in general 8026 for (Value *Op : Cmp->operands()) { 8027 if (isa<TruncInst>(Op) && Op->hasOneUse()) 8028 DeadInstructions.insert(cast<Instruction>(Op)); 8029 } 8030 } 8031 8032 // We create new "steps" for induction variable updates to which the original 8033 // induction variables map. An original update instruction will be dead if 8034 // all its users except the induction variable are dead. 8035 auto *Latch = OrigLoop->getLoopLatch(); 8036 for (auto &Induction : Legal->getInductionVars()) { 8037 PHINode *Ind = Induction.first; 8038 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 8039 8040 // If the tail is to be folded by masking, the primary induction variable, 8041 // if exists, isn't dead: it will be used for masking. Don't kill it. 
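    // E.g. for a canonical exit test (hypothetical IR)
    //   %iv.next = add nuw i64 %iv, 1
    //   %exit    = icmp eq i64 %iv.next, %n
    // the icmp (whose only user is the branch) was already collected above,
    // and %iv.next is added here once all of its users other than the phi are
    // themselves dead, unless the tail is folded by masking and this is the
    // primary induction, which is still needed to compute the mask.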
8042 if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction()) 8043 continue; 8044 8045 if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 8046 return U == Ind || DeadInstructions.count(cast<Instruction>(U)); 8047 })) 8048 DeadInstructions.insert(IndUpdate); 8049 } 8050 } 8051 8052 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } 8053 8054 //===--------------------------------------------------------------------===// 8055 // EpilogueVectorizerMainLoop 8056 //===--------------------------------------------------------------------===// 8057 8058 /// This function is partially responsible for generating the control flow 8059 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 8060 std::pair<BasicBlock *, Value *> 8061 EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { 8062 MDNode *OrigLoopID = OrigLoop->getLoopID(); 8063 Loop *Lp = createVectorLoopSkeleton(""); 8064 8065 // Generate the code to check the minimum iteration count of the vector 8066 // epilogue (see below). 8067 EPI.EpilogueIterationCountCheck = 8068 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, true); 8069 EPI.EpilogueIterationCountCheck->setName("iter.check"); 8070 8071 // Generate the code to check any assumptions that we've made for SCEV 8072 // expressions. 8073 EPI.SCEVSafetyCheck = emitSCEVChecks(Lp, LoopScalarPreHeader); 8074 8075 // Generate the code that checks at runtime if arrays overlap. We put the 8076 // checks into a separate block to make the more common case of few elements 8077 // faster. 8078 EPI.MemSafetyCheck = emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 8079 8080 // Generate the iteration count check for the main loop, *after* the check 8081 // for the epilogue loop, so that the path-length is shorter for the case 8082 // that goes directly through the vector epilogue. The longer-path length for 8083 // the main loop is compensated for, by the gain from vectorizing the larger 8084 // trip count. Note: the branch will get updated later on when we vectorize 8085 // the epilogue. 8086 EPI.MainLoopIterationCountCheck = 8087 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false); 8088 8089 // Generate the induction variable. 8090 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 8091 EPI.VectorTripCount = CountRoundDown; 8092 createHeaderBranch(Lp); 8093 8094 // Skip induction resume value creation here because they will be created in 8095 // the second pass. If we created them here, they wouldn't be used anyway, 8096 // because the vplan in the second pass still contains the inductions from the 8097 // original loop. 
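  // As a consequence, the resume-value slot of the pair returned below is
  // null for this first pass; compare the second pass
  // (EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton),
  // which creates and returns the actual 'vec.epilog.resume.val' phi.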
8098 8099 return {completeLoopSkeleton(Lp, OrigLoopID), nullptr}; 8100 } 8101 8102 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { 8103 LLVM_DEBUG({ 8104 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n" 8105 << "Main Loop VF:" << EPI.MainLoopVF 8106 << ", Main Loop UF:" << EPI.MainLoopUF 8107 << ", Epilogue Loop VF:" << EPI.EpilogueVF 8108 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 8109 }); 8110 } 8111 8112 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() { 8113 DEBUG_WITH_TYPE(VerboseDebug, { 8114 dbgs() << "intermediate fn:\n" 8115 << *OrigLoop->getHeader()->getParent() << "\n"; 8116 }); 8117 } 8118 8119 BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck( 8120 Loop *L, BasicBlock *Bypass, bool ForEpilogue) { 8121 assert(L && "Expected valid Loop."); 8122 assert(Bypass && "Expected valid bypass basic block."); 8123 ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF; 8124 unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF; 8125 Value *Count = getOrCreateTripCount(L); 8126 // Reuse existing vector loop preheader for TC checks. 8127 // Note that new preheader block is generated for vector loop. 8128 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 8129 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 8130 8131 // Generate code to check if the loop's trip count is less than VF * UF of the 8132 // main vector loop. 8133 auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF : VF) ? 8134 ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 8135 8136 Value *CheckMinIters = Builder.CreateICmp( 8137 P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor), 8138 "min.iters.check"); 8139 8140 if (!ForEpilogue) 8141 TCCheckBlock->setName("vector.main.loop.iter.check"); 8142 8143 // Create new preheader for vector loop. 8144 LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), 8145 DT, LI, nullptr, "vector.ph"); 8146 8147 if (ForEpilogue) { 8148 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 8149 DT->getNode(Bypass)->getIDom()) && 8150 "TC check is expected to dominate Bypass"); 8151 8152 // Update dominator for Bypass & LoopExit. 8153 DT->changeImmediateDominator(Bypass, TCCheckBlock); 8154 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF)) 8155 // For loops with multiple exits, there's no edge from the middle block 8156 // to exit blocks (as the epilogue must run) and thus no need to update 8157 // the immediate dominator of the exit blocks. 8158 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 8159 8160 LoopBypassBlocks.push_back(TCCheckBlock); 8161 8162 // Save the trip count so we don't have to regenerate it in the 8163 // vec.epilog.iter.check. This is safe to do because the trip count 8164 // generated here dominates the vector epilog iter check. 8165 EPI.TripCount = Count; 8166 } 8167 8168 ReplaceInstWithInst( 8169 TCCheckBlock->getTerminator(), 8170 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 8171 8172 return TCCheckBlock; 8173 } 8174 8175 //===--------------------------------------------------------------------===// 8176 // EpilogueVectorizerEpilogueLoop 8177 //===--------------------------------------------------------------------===// 8178 8179 /// This function is partially responsible for generating the control flow 8180 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 
8181 std::pair<BasicBlock *, Value *> 8182 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { 8183 MDNode *OrigLoopID = OrigLoop->getLoopID(); 8184 Loop *Lp = createVectorLoopSkeleton("vec.epilog."); 8185 8186 // Now, compare the remaining count and if there aren't enough iterations to 8187 // execute the vectorized epilogue skip to the scalar part. 8188 BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader; 8189 VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check"); 8190 LoopVectorPreHeader = 8191 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 8192 LI, nullptr, "vec.epilog.ph"); 8193 emitMinimumVectorEpilogueIterCountCheck(Lp, LoopScalarPreHeader, 8194 VecEpilogueIterationCountCheck); 8195 8196 // Adjust the control flow taking the state info from the main loop 8197 // vectorization into account. 8198 assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck && 8199 "expected this to be saved from the previous pass."); 8200 EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith( 8201 VecEpilogueIterationCountCheck, LoopVectorPreHeader); 8202 8203 DT->changeImmediateDominator(LoopVectorPreHeader, 8204 EPI.MainLoopIterationCountCheck); 8205 8206 EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith( 8207 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8208 8209 if (EPI.SCEVSafetyCheck) 8210 EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith( 8211 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8212 if (EPI.MemSafetyCheck) 8213 EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith( 8214 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8215 8216 DT->changeImmediateDominator( 8217 VecEpilogueIterationCountCheck, 8218 VecEpilogueIterationCountCheck->getSinglePredecessor()); 8219 8220 DT->changeImmediateDominator(LoopScalarPreHeader, 8221 EPI.EpilogueIterationCountCheck); 8222 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF)) 8223 // If there is an epilogue which must run, there's no edge from the 8224 // middle block to exit blocks and thus no need to update the immediate 8225 // dominator of the exit blocks. 8226 DT->changeImmediateDominator(LoopExitBlock, 8227 EPI.EpilogueIterationCountCheck); 8228 8229 // Keep track of bypass blocks, as they feed start values to the induction 8230 // phis in the scalar loop preheader. 8231 if (EPI.SCEVSafetyCheck) 8232 LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck); 8233 if (EPI.MemSafetyCheck) 8234 LoopBypassBlocks.push_back(EPI.MemSafetyCheck); 8235 LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck); 8236 8237 // The vec.epilog.iter.check block may contain Phi nodes from reductions which 8238 // merge control-flow from the latch block and the middle block. Update the 8239 // incoming values here and move the Phi into the preheader. 
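  // E.g. a reduction merge phi sitting in this block, with incoming values
  // from the main loop's middle block and from the earlier bypass checks
  // (iteration count, SCEV and memory checks), loses the bypass incomings in
  // the loop below and is then moved into vec.epilog.ph, where those extra
  // predecessors no longer exist.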
8240 SmallVector<PHINode *, 4> PhisInBlock;
8241 for (PHINode &Phi : VecEpilogueIterationCountCheck->phis())
8242 PhisInBlock.push_back(&Phi);
8243
8244 for (PHINode *Phi : PhisInBlock) {
8245 Phi->replaceIncomingBlockWith(
8246 VecEpilogueIterationCountCheck->getSinglePredecessor(),
8247 VecEpilogueIterationCountCheck);
8248 Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck);
8249 if (EPI.SCEVSafetyCheck)
8250 Phi->removeIncomingValue(EPI.SCEVSafetyCheck);
8251 if (EPI.MemSafetyCheck)
8252 Phi->removeIncomingValue(EPI.MemSafetyCheck);
8253 Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHI());
8254 }
8255
8256 // Generate a resume induction for the vector epilogue and put it in the
8257 // vector epilogue preheader.
8258 Type *IdxTy = Legal->getWidestInductionType();
8259 PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val",
8260 LoopVectorPreHeader->getFirstNonPHI());
8261 EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
8262 EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
8263 EPI.MainLoopIterationCountCheck);
8264
8265 // Generate the induction variable.
8266 createHeaderBranch(Lp);
8267
8268 // Generate induction resume values. These variables save the new starting
8269 // indexes for the scalar loop. They are used to test if there are any tail
8270 // iterations left once the vector loop has completed.
8271 // Note that when the vectorized epilogue is skipped due to the iteration count
8272 // check, then the resume value for the induction variable comes from
8273 // the trip count of the main vector loop, hence passing the AdditionalBypass
8274 // argument.
8275 createInductionResumeValues(Lp, {VecEpilogueIterationCountCheck,
8276 EPI.VectorTripCount} /* AdditionalBypass */);
8277
8278 return {completeLoopSkeleton(Lp, OrigLoopID), EPResumeVal};
8279 }
8280
8281 BasicBlock *
8282 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
8283 Loop *L, BasicBlock *Bypass, BasicBlock *Insert) {
8284
8285 assert(EPI.TripCount &&
8286 "Expected trip count to have been saved in the first pass.");
8287 assert(
8288 (!isa<Instruction>(EPI.TripCount) ||
8289 DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
8290 "saved trip count does not dominate insertion point.");
8291 Value *TC = EPI.TripCount;
8292 IRBuilder<> Builder(Insert->getTerminator());
8293 Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");
8294
8295 // Generate code to check if the loop's trip count is less than VF * UF of the
8296 // vector epilogue loop.
8297 auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF) ?
8298 ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 8299 8300 Value *CheckMinIters = 8301 Builder.CreateICmp(P, Count, 8302 createStepForVF(Builder, Count->getType(), 8303 EPI.EpilogueVF, EPI.EpilogueUF), 8304 "min.epilog.iters.check"); 8305 8306 ReplaceInstWithInst( 8307 Insert->getTerminator(), 8308 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 8309 8310 LoopBypassBlocks.push_back(Insert); 8311 return Insert; 8312 } 8313 8314 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() { 8315 LLVM_DEBUG({ 8316 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n" 8317 << "Epilogue Loop VF:" << EPI.EpilogueVF 8318 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 8319 }); 8320 } 8321 8322 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() { 8323 DEBUG_WITH_TYPE(VerboseDebug, { 8324 dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n"; 8325 }); 8326 } 8327 8328 bool LoopVectorizationPlanner::getDecisionAndClampRange( 8329 const std::function<bool(ElementCount)> &Predicate, VFRange &Range) { 8330 assert(!Range.isEmpty() && "Trying to test an empty VF range."); 8331 bool PredicateAtRangeStart = Predicate(Range.Start); 8332 8333 for (ElementCount TmpVF = Range.Start * 2; 8334 ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2) 8335 if (Predicate(TmpVF) != PredicateAtRangeStart) { 8336 Range.End = TmpVF; 8337 break; 8338 } 8339 8340 return PredicateAtRangeStart; 8341 } 8342 8343 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, 8344 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range 8345 /// of VF's starting at a given VF and extending it as much as possible. Each 8346 /// vectorization decision can potentially shorten this sub-range during 8347 /// buildVPlan(). 8348 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF, 8349 ElementCount MaxVF) { 8350 auto MaxVFPlusOne = MaxVF.getWithIncrement(1); 8351 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { 8352 VFRange SubRange = {VF, MaxVFPlusOne}; 8353 VPlans.push_back(buildVPlan(SubRange)); 8354 VF = SubRange.End; 8355 } 8356 } 8357 8358 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, 8359 VPlanPtr &Plan) { 8360 assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); 8361 8362 // Look for cached value. 8363 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 8364 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); 8365 if (ECEntryIt != EdgeMaskCache.end()) 8366 return ECEntryIt->second; 8367 8368 VPValue *SrcMask = createBlockInMask(Src, Plan); 8369 8370 // The terminator has to be a branch inst! 8371 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 8372 assert(BI && "Unexpected terminator found"); 8373 8374 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) 8375 return EdgeMaskCache[Edge] = SrcMask; 8376 8377 // If source is an exiting block, we know the exit edge is dynamically dead 8378 // in the vector loop, and thus we don't need to restrict the mask. Avoid 8379 // adding uses of an otherwise potentially dead instruction. 
8380 if (OrigLoop->isLoopExiting(Src)) 8381 return EdgeMaskCache[Edge] = SrcMask; 8382 8383 VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition()); 8384 assert(EdgeMask && "No Edge Mask found for condition"); 8385 8386 if (BI->getSuccessor(0) != Dst) 8387 EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc()); 8388 8389 if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND. 8390 // The condition is 'SrcMask && EdgeMask', which is equivalent to 8391 // 'select i1 SrcMask, i1 EdgeMask, i1 false'. 8392 // The select version does not introduce new UB if SrcMask is false and 8393 // EdgeMask is poison. Using 'and' here introduces undefined behavior. 8394 VPValue *False = Plan->getOrAddVPValue( 8395 ConstantInt::getFalse(BI->getCondition()->getType())); 8396 EdgeMask = 8397 Builder.createSelect(SrcMask, EdgeMask, False, BI->getDebugLoc()); 8398 } 8399 8400 return EdgeMaskCache[Edge] = EdgeMask; 8401 } 8402 8403 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { 8404 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 8405 8406 // Look for cached value. 8407 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); 8408 if (BCEntryIt != BlockMaskCache.end()) 8409 return BCEntryIt->second; 8410 8411 // All-one mask is modelled as no-mask following the convention for masked 8412 // load/store/gather/scatter. Initialize BlockMask to no-mask. 8413 VPValue *BlockMask = nullptr; 8414 8415 if (OrigLoop->getHeader() == BB) { 8416 if (!CM.blockNeedsPredicationForAnyReason(BB)) 8417 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. 8418 8419 // Introduce the early-exit compare IV <= BTC to form header block mask. 8420 // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by 8421 // constructing the desired canonical IV in the header block as its first 8422 // non-phi instructions. 8423 assert(CM.foldTailByMasking() && "must fold the tail"); 8424 VPBasicBlock *HeaderVPBB = Plan->getEntry()->getEntryBasicBlock(); 8425 auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi(); 8426 auto *IV = new VPWidenCanonicalIVRecipe(Plan->getCanonicalIV()); 8427 HeaderVPBB->insert(IV, HeaderVPBB->getFirstNonPhi()); 8428 8429 VPBuilder::InsertPointGuard Guard(Builder); 8430 Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint); 8431 if (CM.TTI.emitGetActiveLaneMask()) { 8432 VPValue *TC = Plan->getOrCreateTripCount(); 8433 BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC}); 8434 } else { 8435 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); 8436 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); 8437 } 8438 return BlockMaskCache[BB] = BlockMask; 8439 } 8440 8441 // This is the block mask. We OR all incoming edges. 8442 for (auto *Predecessor : predecessors(BB)) { 8443 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); 8444 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too. 8445 return BlockMaskCache[BB] = EdgeMask; 8446 8447 if (!BlockMask) { // BlockMask has its initialized nullptr value. 
8448 BlockMask = EdgeMask; 8449 continue; 8450 } 8451 8452 BlockMask = Builder.createOr(BlockMask, EdgeMask, {}); 8453 } 8454 8455 return BlockMaskCache[BB] = BlockMask; 8456 } 8457 8458 VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, 8459 ArrayRef<VPValue *> Operands, 8460 VFRange &Range, 8461 VPlanPtr &Plan) { 8462 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 8463 "Must be called with either a load or store"); 8464 8465 auto willWiden = [&](ElementCount VF) -> bool { 8466 if (VF.isScalar()) 8467 return false; 8468 LoopVectorizationCostModel::InstWidening Decision = 8469 CM.getWideningDecision(I, VF); 8470 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 8471 "CM decision should be taken at this point."); 8472 if (Decision == LoopVectorizationCostModel::CM_Interleave) 8473 return true; 8474 if (CM.isScalarAfterVectorization(I, VF) || 8475 CM.isProfitableToScalarize(I, VF)) 8476 return false; 8477 return Decision != LoopVectorizationCostModel::CM_Scalarize; 8478 }; 8479 8480 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8481 return nullptr; 8482 8483 VPValue *Mask = nullptr; 8484 if (Legal->isMaskRequired(I)) 8485 Mask = createBlockInMask(I->getParent(), Plan); 8486 8487 // Determine if the pointer operand of the access is either consecutive or 8488 // reverse consecutive. 8489 LoopVectorizationCostModel::InstWidening Decision = 8490 CM.getWideningDecision(I, Range.Start); 8491 bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse; 8492 bool Consecutive = 8493 Reverse || Decision == LoopVectorizationCostModel::CM_Widen; 8494 8495 if (LoadInst *Load = dyn_cast<LoadInst>(I)) 8496 return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask, 8497 Consecutive, Reverse); 8498 8499 StoreInst *Store = cast<StoreInst>(I); 8500 return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0], 8501 Mask, Consecutive, Reverse); 8502 } 8503 8504 static VPWidenIntOrFpInductionRecipe * 8505 createWidenInductionRecipe(PHINode *Phi, Instruction *PhiOrTrunc, 8506 VPValue *Start, const InductionDescriptor &IndDesc, 8507 LoopVectorizationCostModel &CM, Loop &OrigLoop, 8508 VFRange &Range) { 8509 // Returns true if an instruction \p I should be scalarized instead of 8510 // vectorized for the chosen vectorization factor. 8511 auto ShouldScalarizeInstruction = [&CM](Instruction *I, ElementCount VF) { 8512 return CM.isScalarAfterVectorization(I, VF) || 8513 CM.isProfitableToScalarize(I, VF); 8514 }; 8515 8516 bool NeedsScalarIV = LoopVectorizationPlanner::getDecisionAndClampRange( 8517 [&](ElementCount VF) { 8518 // Returns true if we should generate a scalar version of \p IV. 
8519 if (ShouldScalarizeInstruction(PhiOrTrunc, VF)) 8520 return true; 8521 auto isScalarInst = [&](User *U) -> bool { 8522 auto *I = cast<Instruction>(U); 8523 return OrigLoop.contains(I) && ShouldScalarizeInstruction(I, VF); 8524 }; 8525 return any_of(PhiOrTrunc->users(), isScalarInst); 8526 }, 8527 Range); 8528 bool NeedsScalarIVOnly = LoopVectorizationPlanner::getDecisionAndClampRange( 8529 [&](ElementCount VF) { 8530 return ShouldScalarizeInstruction(PhiOrTrunc, VF); 8531 }, 8532 Range); 8533 assert(IndDesc.getStartValue() == 8534 Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader())); 8535 if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) { 8536 return new VPWidenIntOrFpInductionRecipe(Phi, Start, IndDesc, TruncI, 8537 NeedsScalarIV, !NeedsScalarIVOnly); 8538 } 8539 assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here"); 8540 return new VPWidenIntOrFpInductionRecipe(Phi, Start, IndDesc, NeedsScalarIV, 8541 !NeedsScalarIVOnly); 8542 } 8543 8544 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionPHI( 8545 PHINode *Phi, ArrayRef<VPValue *> Operands, VFRange &Range) const { 8546 8547 // Check if this is an integer or fp induction. If so, build the recipe that 8548 // produces its scalar and vector values. 8549 if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi)) 8550 return createWidenInductionRecipe(Phi, Phi, Operands[0], *II, CM, *OrigLoop, 8551 Range); 8552 8553 return nullptr; 8554 } 8555 8556 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate( 8557 TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range, 8558 VPlan &Plan) const { 8559 // Optimize the special case where the source is a constant integer 8560 // induction variable. Notice that we can only optimize the 'trunc' case 8561 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and 8562 // (c) other casts depend on pointer size. 8563 8564 // Determine whether \p K is a truncation based on an induction variable that 8565 // can be optimized. 8566 auto isOptimizableIVTruncate = 8567 [&](Instruction *K) -> std::function<bool(ElementCount)> { 8568 return [=](ElementCount VF) -> bool { 8569 return CM.isOptimizableIVTruncate(K, VF); 8570 }; 8571 }; 8572 8573 if (LoopVectorizationPlanner::getDecisionAndClampRange( 8574 isOptimizableIVTruncate(I), Range)) { 8575 8576 auto *Phi = cast<PHINode>(I->getOperand(0)); 8577 const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi); 8578 VPValue *Start = Plan.getOrAddVPValue(II.getStartValue()); 8579 return createWidenInductionRecipe(Phi, I, Start, II, CM, *OrigLoop, Range); 8580 } 8581 return nullptr; 8582 } 8583 8584 VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi, 8585 ArrayRef<VPValue *> Operands, 8586 VPlanPtr &Plan) { 8587 // If all incoming values are equal, the incoming VPValue can be used directly 8588 // instead of creating a new VPBlendRecipe. 8589 VPValue *FirstIncoming = Operands[0]; 8590 if (all_of(Operands, [FirstIncoming](const VPValue *Inc) { 8591 return FirstIncoming == Inc; 8592 })) { 8593 return Operands[0]; 8594 } 8595 8596 unsigned NumIncoming = Phi->getNumIncomingValues(); 8597 // For in-loop reductions, we do not need to create an additional select. 
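// Sketch (names are hypothetical): for a predicated in-loop reduction such as
//   %merge = phi i32 [ %rdx.phi, %for.body ], [ %rdx.next, %if.then ]
// the VPReductionRecipe created later is already guarded by the block mask,
// so no extra blend/select is needed and %rdx.next (the operand that is not
// the reduction phi) is used directly below.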
8598 VPValue *InLoopVal = nullptr; 8599 for (unsigned In = 0; In < NumIncoming; In++) { 8600 PHINode *PhiOp = 8601 dyn_cast_or_null<PHINode>(Operands[In]->getUnderlyingValue()); 8602 if (PhiOp && CM.isInLoopReduction(PhiOp)) { 8603 assert(!InLoopVal && "Found more than one in-loop reduction!"); 8604 InLoopVal = Operands[In]; 8605 } 8606 } 8607 8608 assert((!InLoopVal || NumIncoming == 2) && 8609 "Found an in-loop reduction for PHI with unexpected number of " 8610 "incoming values"); 8611 if (InLoopVal) 8612 return Operands[Operands[0] == InLoopVal ? 1 : 0]; 8613 8614 // We know that all PHIs in non-header blocks are converted into selects, so 8615 // we don't have to worry about the insertion order and we can just use the 8616 // builder. At this point we generate the predication tree. There may be 8617 // duplications since this is a simple recursive scan, but future 8618 // optimizations will clean it up. 8619 SmallVector<VPValue *, 2> OperandsWithMask; 8620 8621 for (unsigned In = 0; In < NumIncoming; In++) { 8622 VPValue *EdgeMask = 8623 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan); 8624 assert((EdgeMask || NumIncoming == 1) && 8625 "Multiple predecessors with one having a full mask"); 8626 OperandsWithMask.push_back(Operands[In]); 8627 if (EdgeMask) 8628 OperandsWithMask.push_back(EdgeMask); 8629 } 8630 return toVPRecipeResult(new VPBlendRecipe(Phi, OperandsWithMask)); 8631 } 8632 8633 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, 8634 ArrayRef<VPValue *> Operands, 8635 VFRange &Range) const { 8636 8637 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8638 [this, CI](ElementCount VF) { 8639 return CM.isScalarWithPredication(CI, VF); 8640 }, 8641 Range); 8642 8643 if (IsPredicated) 8644 return nullptr; 8645 8646 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8647 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || 8648 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect || 8649 ID == Intrinsic::pseudoprobe || 8650 ID == Intrinsic::experimental_noalias_scope_decl)) 8651 return nullptr; 8652 8653 auto willWiden = [&](ElementCount VF) -> bool { 8654 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8655 // The following case may be scalarized depending on the VF. 8656 // The flag shows whether we use Intrinsic or a usual Call for vectorized 8657 // version of the instruction. 8658 // Is it beneficial to perform intrinsic call compared to lib call? 8659 bool NeedToScalarize = false; 8660 InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); 8661 InstructionCost IntrinsicCost = ID ? CM.getVectorIntrinsicCost(CI, VF) : 0; 8662 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 8663 return UseVectorIntrinsic || !NeedToScalarize; 8664 }; 8665 8666 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8667 return nullptr; 8668 8669 ArrayRef<VPValue *> Ops = Operands.take_front(CI->arg_size()); 8670 return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end())); 8671 } 8672 8673 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { 8674 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) && 8675 !isa<StoreInst>(I) && "Instruction should have been handled earlier"); 8676 // Instruction should be widened, unless it is scalar after vectorization, 8677 // scalarization is profitable or it is predicated. 
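// Worked example (numbers purely illustrative): for Range = [2, 32) and a
// WillScalarize predicate that is false for VF=2/4/8 but true for VF=16,
// getDecisionAndClampRange clamps the range to [2, 16) and returns false, so
// shouldWiden answers true for every VF remaining in the sub-range.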
8678 auto WillScalarize = [this, I](ElementCount VF) -> bool { 8679 return CM.isScalarAfterVectorization(I, VF) || 8680 CM.isProfitableToScalarize(I, VF) || 8681 CM.isScalarWithPredication(I, VF); 8682 }; 8683 return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize, 8684 Range); 8685 } 8686 8687 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, 8688 ArrayRef<VPValue *> Operands) const { 8689 auto IsVectorizableOpcode = [](unsigned Opcode) { 8690 switch (Opcode) { 8691 case Instruction::Add: 8692 case Instruction::And: 8693 case Instruction::AShr: 8694 case Instruction::BitCast: 8695 case Instruction::FAdd: 8696 case Instruction::FCmp: 8697 case Instruction::FDiv: 8698 case Instruction::FMul: 8699 case Instruction::FNeg: 8700 case Instruction::FPExt: 8701 case Instruction::FPToSI: 8702 case Instruction::FPToUI: 8703 case Instruction::FPTrunc: 8704 case Instruction::FRem: 8705 case Instruction::FSub: 8706 case Instruction::ICmp: 8707 case Instruction::IntToPtr: 8708 case Instruction::LShr: 8709 case Instruction::Mul: 8710 case Instruction::Or: 8711 case Instruction::PtrToInt: 8712 case Instruction::SDiv: 8713 case Instruction::Select: 8714 case Instruction::SExt: 8715 case Instruction::Shl: 8716 case Instruction::SIToFP: 8717 case Instruction::SRem: 8718 case Instruction::Sub: 8719 case Instruction::Trunc: 8720 case Instruction::UDiv: 8721 case Instruction::UIToFP: 8722 case Instruction::URem: 8723 case Instruction::Xor: 8724 case Instruction::ZExt: 8725 return true; 8726 } 8727 return false; 8728 }; 8729 8730 if (!IsVectorizableOpcode(I->getOpcode())) 8731 return nullptr; 8732 8733 // Success: widen this instruction. 8734 return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end())); 8735 } 8736 8737 void VPRecipeBuilder::fixHeaderPhis() { 8738 BasicBlock *OrigLatch = OrigLoop->getLoopLatch(); 8739 for (VPHeaderPHIRecipe *R : PhisToFix) { 8740 auto *PN = cast<PHINode>(R->getUnderlyingValue()); 8741 VPRecipeBase *IncR = 8742 getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch))); 8743 R->addOperand(IncR->getVPSingleValue()); 8744 } 8745 } 8746 8747 VPBasicBlock *VPRecipeBuilder::handleReplication( 8748 Instruction *I, VFRange &Range, VPBasicBlock *VPBB, 8749 VPlanPtr &Plan) { 8750 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( 8751 [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); }, 8752 Range); 8753 8754 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8755 [&](ElementCount VF) { return CM.isPredicatedInst(I, VF, IsUniform); }, 8756 Range); 8757 8758 // Even if the instruction is not marked as uniform, there are certain 8759 // intrinsic calls that can be effectively treated as such, so we check for 8760 // them here. Conservatively, we only do this for scalable vectors, since 8761 // for fixed-width VFs we can always fall back on full scalarization. 8762 if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) { 8763 switch (cast<IntrinsicInst>(I)->getIntrinsicID()) { 8764 case Intrinsic::assume: 8765 case Intrinsic::lifetime_start: 8766 case Intrinsic::lifetime_end: 8767 // For scalable vectors if one of the operands is variant then we still 8768 // want to mark as uniform, which will generate one instruction for just 8769 // the first lane of the vector. We can't scalarize the call in the same 8770 // way as for fixed-width vectors because we don't know how many lanes 8771 // there are. 
8772 //
8773 // The reasons for doing it this way for scalable vectors are:
8774 // 1. For the assume intrinsic generating the instruction for the first
8775 // lane is still better than not generating any at all. For
8776 // example, the input may be a splat across all lanes.
8777 // 2. For the lifetime start/end intrinsics the pointer operand only
8778 // does anything useful when the input comes from a stack object,
8779 // which suggests it should always be uniform. For non-stack objects
8780 // the effect is to poison the object, which still allows us to
8781 // remove the call.
8782 IsUniform = true;
8783 break;
8784 default:
8785 break;
8786 }
8787 }
8788
8789 auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
8790 IsUniform, IsPredicated);
8791 setRecipe(I, Recipe);
8792 Plan->addVPValue(I, Recipe);
8793
8794 // Find if I uses a predicated instruction. If so, it will use its scalar
8795 // value. Avoid hoisting the insert-element which packs the scalar value into
8796 // a vector value, as that happens iff all users use the vector value.
8797 for (VPValue *Op : Recipe->operands()) {
8798 auto *PredR = dyn_cast_or_null<VPPredInstPHIRecipe>(Op->getDef());
8799 if (!PredR)
8800 continue;
8801 auto *RepR =
8802 cast_or_null<VPReplicateRecipe>(PredR->getOperand(0)->getDef());
8803 assert(RepR->isPredicated() &&
8804 "expected Replicate recipe to be predicated");
8805 RepR->setAlsoPack(false);
8806 }
8807
8808 // Finalize the recipe for Instr, first if it is not predicated.
8809 if (!IsPredicated) {
8810 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
8811 VPBB->appendRecipe(Recipe);
8812 return VPBB;
8813 }
8814 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
8815
8816 VPBlockBase *SingleSucc = VPBB->getSingleSuccessor();
8817 assert(SingleSucc && "VPBB must have a single successor when handling "
8818 "predicated replication.");
8819 VPBlockUtils::disconnectBlocks(VPBB, SingleSucc);
8820 // Record predicated instructions for above packing optimizations.
8821 VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
8822 VPBlockUtils::insertBlockAfter(Region, VPBB);
8823 auto *RegSucc = new VPBasicBlock();
8824 VPBlockUtils::insertBlockAfter(RegSucc, Region);
8825 VPBlockUtils::connectBlocks(RegSucc, SingleSucc);
8826 return RegSucc;
8827 }
8828
8829 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
8830 VPRecipeBase *PredRecipe,
8831 VPlanPtr &Plan) {
8832 // Instructions marked for predication are replicated and placed under an
8833 // if-then construct to prevent side-effects.
8834
8835 // Generate recipes to compute the block mask for this region.
8836 VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
8837
8838 // Build the triangular if-then region.
8839 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
8840 assert(Instr->getParent() && "Predicated instruction not in any basic block");
8841 auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
8842 auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
8843 auto *PHIRecipe = Instr->getType()->isVoidTy()
8844 ?
nullptr 8845 : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr)); 8846 if (PHIRecipe) { 8847 Plan->removeVPValueFor(Instr); 8848 Plan->addVPValue(Instr, PHIRecipe); 8849 } 8850 auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); 8851 auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); 8852 VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true); 8853 8854 // Note: first set Entry as region entry and then connect successors starting 8855 // from it in order, to propagate the "parent" of each VPBasicBlock. 8856 VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry); 8857 VPBlockUtils::connectBlocks(Pred, Exit); 8858 8859 return Region; 8860 } 8861 8862 VPRecipeOrVPValueTy 8863 VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, 8864 ArrayRef<VPValue *> Operands, 8865 VFRange &Range, VPlanPtr &Plan) { 8866 // First, check for specific widening recipes that deal with calls, memory 8867 // operations, inductions and Phi nodes. 8868 if (auto *CI = dyn_cast<CallInst>(Instr)) 8869 return toVPRecipeResult(tryToWidenCall(CI, Operands, Range)); 8870 8871 if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr)) 8872 return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan)); 8873 8874 VPRecipeBase *Recipe; 8875 if (auto Phi = dyn_cast<PHINode>(Instr)) { 8876 if (Phi->getParent() != OrigLoop->getHeader()) 8877 return tryToBlend(Phi, Operands, Plan); 8878 if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, Range))) 8879 return toVPRecipeResult(Recipe); 8880 8881 VPHeaderPHIRecipe *PhiRecipe = nullptr; 8882 if (Legal->isReductionVariable(Phi) || Legal->isFirstOrderRecurrence(Phi)) { 8883 VPValue *StartV = Operands[0]; 8884 if (Legal->isReductionVariable(Phi)) { 8885 const RecurrenceDescriptor &RdxDesc = 8886 Legal->getReductionVars().find(Phi)->second; 8887 assert(RdxDesc.getRecurrenceStartValue() == 8888 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); 8889 PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV, 8890 CM.isInLoopReduction(Phi), 8891 CM.useOrderedReductions(RdxDesc)); 8892 } else { 8893 PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV); 8894 } 8895 8896 // Record the incoming value from the backedge, so we can add the incoming 8897 // value from the backedge after all recipes have been created. 8898 recordRecipeOf(cast<Instruction>( 8899 Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch()))); 8900 PhisToFix.push_back(PhiRecipe); 8901 } else { 8902 // TODO: record backedge value for remaining pointer induction phis. 
8903 assert(Phi->getType()->isPointerTy() &&
8904 "only pointer phis should be handled here");
8905 assert(Legal->getInductionVars().count(Phi) &&
8906 "Not an induction variable");
8907 InductionDescriptor II = Legal->getInductionVars().lookup(Phi);
8908 VPValue *Start = Plan->getOrAddVPValue(II.getStartValue());
8909 PhiRecipe = new VPWidenPHIRecipe(Phi, Start);
8910 }
8911
8912 return toVPRecipeResult(PhiRecipe);
8913 }
8914
8915 if (isa<TruncInst>(Instr) &&
8916 (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Operands,
8917 Range, *Plan)))
8918 return toVPRecipeResult(Recipe);
8919
8920 if (!shouldWiden(Instr, Range))
8921 return nullptr;
8922
8923 if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
8924 return toVPRecipeResult(new VPWidenGEPRecipe(
8925 GEP, make_range(Operands.begin(), Operands.end()), OrigLoop));
8926
8927 if (auto *SI = dyn_cast<SelectInst>(Instr)) {
8928 bool InvariantCond =
8929 PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
8930 return toVPRecipeResult(new VPWidenSelectRecipe(
8931 *SI, make_range(Operands.begin(), Operands.end()), InvariantCond));
8932 }
8933
8934 return toVPRecipeResult(tryToWiden(Instr, Operands));
8935 }
8936
8937 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
8938 ElementCount MaxVF) {
8939 assert(OrigLoop->isInnermost() && "Inner loop expected.");
8940
8941 // Collect instructions from the original loop that will become trivially dead
8942 // in the vectorized loop. We don't need to vectorize these instructions. For
8943 // example, original induction update instructions can become dead because we
8944 // separately emit induction "steps" when generating code for the new loop.
8945 // Similarly, we create a new latch condition when setting up the structure
8946 // of the new loop, so the old one can become dead.
8947 SmallPtrSet<Instruction *, 4> DeadInstructions;
8948 collectTriviallyDeadInstructions(DeadInstructions);
8949
8950 // Add assume instructions we need to drop to DeadInstructions, to prevent
8951 // them from being added to the VPlan.
8952 // TODO: We only need to drop assumes in blocks that get flattened. If the
8953 // control flow is preserved, we should keep them.
8954 auto &ConditionalAssumes = Legal->getConditionalAssumes();
8955 DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
8956
8957 MapVector<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
8958 // Dead instructions do not need sinking. Remove them from SinkAfter.
8959 for (Instruction *I : DeadInstructions)
8960 SinkAfter.erase(I);
8961
8962 // Cannot sink instructions after dead instructions (there won't be any
8963 // recipes for them). Instead, find the first non-dead previous instruction.
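// Illustrative sketch (names are hypothetical): if SinkAfter maps
// %use -> %iv.next and %iv.next ended up in DeadInstructions (e.g. because the
// induction update is re-emitted separately), the loop below walks backwards
// from %iv.next to the closest preceding live instruction and uses that as the
// new sink target.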
8964 for (auto &P : Legal->getSinkAfter()) { 8965 Instruction *SinkTarget = P.second; 8966 Instruction *FirstInst = &*SinkTarget->getParent()->begin(); 8967 (void)FirstInst; 8968 while (DeadInstructions.contains(SinkTarget)) { 8969 assert( 8970 SinkTarget != FirstInst && 8971 "Must find a live instruction (at least the one feeding the " 8972 "first-order recurrence PHI) before reaching beginning of the block"); 8973 SinkTarget = SinkTarget->getPrevNode(); 8974 assert(SinkTarget != P.first && 8975 "sink source equals target, no sinking required"); 8976 } 8977 P.second = SinkTarget; 8978 } 8979 8980 auto MaxVFPlusOne = MaxVF.getWithIncrement(1); 8981 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { 8982 VFRange SubRange = {VF, MaxVFPlusOne}; 8983 VPlans.push_back( 8984 buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter)); 8985 VF = SubRange.End; 8986 } 8987 } 8988 8989 // Add a VPCanonicalIVPHIRecipe starting at 0 to the header, a 8990 // CanonicalIVIncrement{NUW} VPInstruction to increment it by VF * UF and a 8991 // BranchOnCount VPInstruction to the latch. 8992 static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL, 8993 bool HasNUW, bool IsVPlanNative) { 8994 Value *StartIdx = ConstantInt::get(IdxTy, 0); 8995 auto *StartV = Plan.getOrAddVPValue(StartIdx); 8996 8997 auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL); 8998 VPRegionBlock *TopRegion = Plan.getVectorLoopRegion(); 8999 VPBasicBlock *Header = TopRegion->getEntryBasicBlock(); 9000 if (IsVPlanNative) 9001 Header = cast<VPBasicBlock>(Header->getSingleSuccessor()); 9002 Header->insert(CanonicalIVPHI, Header->begin()); 9003 9004 auto *CanonicalIVIncrement = 9005 new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementNUW 9006 : VPInstruction::CanonicalIVIncrement, 9007 {CanonicalIVPHI}, DL); 9008 CanonicalIVPHI->addOperand(CanonicalIVIncrement); 9009 9010 VPBasicBlock *EB = TopRegion->getExitBasicBlock(); 9011 if (IsVPlanNative) { 9012 EB = cast<VPBasicBlock>(EB->getSinglePredecessor()); 9013 EB->setCondBit(nullptr); 9014 } 9015 EB->appendRecipe(CanonicalIVIncrement); 9016 9017 auto *BranchOnCount = 9018 new VPInstruction(VPInstruction::BranchOnCount, 9019 {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL); 9020 EB->appendRecipe(BranchOnCount); 9021 } 9022 9023 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( 9024 VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions, 9025 const MapVector<Instruction *, Instruction *> &SinkAfter) { 9026 9027 SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups; 9028 9029 VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder); 9030 9031 // --------------------------------------------------------------------------- 9032 // Pre-construction: record ingredients whose recipes we'll need to further 9033 // process after constructing the initial VPlan. 9034 // --------------------------------------------------------------------------- 9035 9036 // Mark instructions we'll need to sink later and their targets as 9037 // ingredients whose recipe we'll need to record. 
9038 for (auto &Entry : SinkAfter) {
9039 RecipeBuilder.recordRecipeOf(Entry.first);
9040 RecipeBuilder.recordRecipeOf(Entry.second);
9041 }
9042 for (auto &Reduction : CM.getInLoopReductionChains()) {
9043 PHINode *Phi = Reduction.first;
9044 RecurKind Kind =
9045 Legal->getReductionVars().find(Phi)->second.getRecurrenceKind();
9046 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
9047
9048 RecipeBuilder.recordRecipeOf(Phi);
9049 for (auto &R : ReductionOperations) {
9050 RecipeBuilder.recordRecipeOf(R);
9051 // For min/max reductions, where we have a pair of icmp/select, we also
9052 // need to record the ICmp recipe, so it can be removed later.
9053 assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
9054 "Only min/max recurrences allowed for inloop reductions");
9055 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
9056 RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0)));
9057 }
9058 }
9059
9060 // For each interleave group which is relevant for this (possibly trimmed)
9061 // Range, add it to the set of groups to be later applied to the VPlan and add
9062 // placeholders for its members' Recipes which we'll be replacing with a
9063 // single VPInterleaveRecipe.
9064 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
9065 auto applyIG = [IG, this](ElementCount VF) -> bool {
9066 return (VF.isVector() && // Query is illegal for VF == 1
9067 CM.getWideningDecision(IG->getInsertPos(), VF) ==
9068 LoopVectorizationCostModel::CM_Interleave);
9069 };
9070 if (!getDecisionAndClampRange(applyIG, Range))
9071 continue;
9072 InterleaveGroups.insert(IG);
9073 for (unsigned i = 0; i < IG->getFactor(); i++)
9074 if (Instruction *Member = IG->getMember(i))
9075 RecipeBuilder.recordRecipeOf(Member);
9076 };
9077
9078 // ---------------------------------------------------------------------------
9079 // Build initial VPlan: Scan the body of the loop in a topological order to
9080 // visit each basic block after having visited its predecessor basic blocks.
9081 // ---------------------------------------------------------------------------
9082
9083 // Create initial VPlan skeleton, with separate header and latch blocks.
9084 VPBasicBlock *HeaderVPBB = new VPBasicBlock();
9085 VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch");
9086 VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB);
9087 auto *TopRegion = new VPRegionBlock(HeaderVPBB, LatchVPBB, "vector loop");
9088 auto Plan = std::make_unique<VPlan>(TopRegion);
9089
9090 Instruction *DLInst =
9091 getDebugLocFromInstOrOperands(Legal->getPrimaryInduction());
9092 addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(),
9093 DLInst ? DLInst->getDebugLoc() : DebugLoc(),
9094 !CM.foldTailByMasking(), false);
9095
9096 // Scan the body of the loop in a topological order to visit each basic block
9097 // after having visited its predecessor basic blocks.
9098 LoopBlocksDFS DFS(OrigLoop);
9099 DFS.perform(LI);
9100
9101 VPBasicBlock *VPBB = HeaderVPBB;
9102 SmallVector<VPWidenIntOrFpInductionRecipe *> InductionsToMove;
9103 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
9104 // Relevant instructions from basic block BB will be grouped into VPRecipe
9105 // ingredients and fill a new VPBasicBlock.
9106 unsigned VPBBsForBB = 0;
9107 VPBB->setName(BB->getName());
9108 Builder.setInsertPoint(VPBB);
9109
9110 // Introduce each ingredient into VPlan.
9111 // TODO: Model and preserve debug intrinsics in VPlan.
9112 for (Instruction &I : BB->instructionsWithoutDebug()) { 9113 Instruction *Instr = &I; 9114 9115 // First filter out irrelevant instructions, to ensure no recipes are 9116 // built for them. 9117 if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr)) 9118 continue; 9119 9120 SmallVector<VPValue *, 4> Operands; 9121 auto *Phi = dyn_cast<PHINode>(Instr); 9122 if (Phi && Phi->getParent() == OrigLoop->getHeader()) { 9123 Operands.push_back(Plan->getOrAddVPValue( 9124 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()))); 9125 } else { 9126 auto OpRange = Plan->mapToVPValues(Instr->operands()); 9127 Operands = {OpRange.begin(), OpRange.end()}; 9128 } 9129 if (auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe( 9130 Instr, Operands, Range, Plan)) { 9131 // If Instr can be simplified to an existing VPValue, use it. 9132 if (RecipeOrValue.is<VPValue *>()) { 9133 auto *VPV = RecipeOrValue.get<VPValue *>(); 9134 Plan->addVPValue(Instr, VPV); 9135 // If the re-used value is a recipe, register the recipe for the 9136 // instruction, in case the recipe for Instr needs to be recorded. 9137 if (auto *R = dyn_cast_or_null<VPRecipeBase>(VPV->getDef())) 9138 RecipeBuilder.setRecipe(Instr, R); 9139 continue; 9140 } 9141 // Otherwise, add the new recipe. 9142 VPRecipeBase *Recipe = RecipeOrValue.get<VPRecipeBase *>(); 9143 for (auto *Def : Recipe->definedValues()) { 9144 auto *UV = Def->getUnderlyingValue(); 9145 Plan->addVPValue(UV, Def); 9146 } 9147 9148 if (isa<VPWidenIntOrFpInductionRecipe>(Recipe) && 9149 HeaderVPBB->getFirstNonPhi() != VPBB->end()) { 9150 // Keep track of VPWidenIntOrFpInductionRecipes not in the phi section 9151 // of the header block. That can happen for truncates of induction 9152 // variables. Those recipes are moved to the phi section of the header 9153 // block after applying SinkAfter, which relies on the original 9154 // position of the trunc. 9155 assert(isa<TruncInst>(Instr)); 9156 InductionsToMove.push_back( 9157 cast<VPWidenIntOrFpInductionRecipe>(Recipe)); 9158 } 9159 RecipeBuilder.setRecipe(Instr, Recipe); 9160 VPBB->appendRecipe(Recipe); 9161 continue; 9162 } 9163 9164 // Otherwise, if all widening options failed, Instruction is to be 9165 // replicated. This may create a successor for VPBB. 9166 VPBasicBlock *NextVPBB = 9167 RecipeBuilder.handleReplication(Instr, Range, VPBB, Plan); 9168 if (NextVPBB != VPBB) { 9169 VPBB = NextVPBB; 9170 VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++) 9171 : ""); 9172 } 9173 } 9174 9175 VPBlockUtils::insertBlockAfter(new VPBasicBlock(), VPBB); 9176 VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor()); 9177 } 9178 9179 // Fold the last, empty block into its predecessor. 9180 VPBB = VPBlockUtils::tryToMergeBlockIntoPredecessor(VPBB); 9181 assert(VPBB && "expected to fold last (empty) block"); 9182 // After here, VPBB should not be used. 9183 VPBB = nullptr; 9184 9185 assert(isa<VPRegionBlock>(Plan->getEntry()) && 9186 !Plan->getEntry()->getEntryBasicBlock()->empty() && 9187 "entry block must be set to a VPRegionBlock having a non-empty entry " 9188 "VPBasicBlock"); 9189 RecipeBuilder.fixHeaderPhis(); 9190 9191 // --------------------------------------------------------------------------- 9192 // Transform initial VPlan: Apply previously taken decisions, in order, to 9193 // bring the VPlan to its final state. 9194 // --------------------------------------------------------------------------- 9195 9196 // Apply Sink-After legal constraints. 
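// Rough example (names are hypothetical): given a first-order recurrence
//   for (i) { b[i] = f(prev); cur = a[i]; prev = cur; }
// the user of 'prev' appears before the definition of 'cur', the value feeding
// the recurrence on the backedge. Legality records a SinkAfter entry so that
// the recipe for 'f(prev)' is moved after the recipe for the load of a[i];
// the code below applies that reordering to the VPlan, handling replicate
// regions specially.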
9197 auto GetReplicateRegion = [](VPRecipeBase *R) -> VPRegionBlock * { 9198 auto *Region = dyn_cast_or_null<VPRegionBlock>(R->getParent()->getParent()); 9199 if (Region && Region->isReplicator()) { 9200 assert(Region->getNumSuccessors() == 1 && 9201 Region->getNumPredecessors() == 1 && "Expected SESE region!"); 9202 assert(R->getParent()->size() == 1 && 9203 "A recipe in an original replicator region must be the only " 9204 "recipe in its block"); 9205 return Region; 9206 } 9207 return nullptr; 9208 }; 9209 for (auto &Entry : SinkAfter) { 9210 VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first); 9211 VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second); 9212 9213 auto *TargetRegion = GetReplicateRegion(Target); 9214 auto *SinkRegion = GetReplicateRegion(Sink); 9215 if (!SinkRegion) { 9216 // If the sink source is not a replicate region, sink the recipe directly. 9217 if (TargetRegion) { 9218 // The target is in a replication region, make sure to move Sink to 9219 // the block after it, not into the replication region itself. 9220 VPBasicBlock *NextBlock = 9221 cast<VPBasicBlock>(TargetRegion->getSuccessors().front()); 9222 Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi()); 9223 } else 9224 Sink->moveAfter(Target); 9225 continue; 9226 } 9227 9228 // The sink source is in a replicate region. Unhook the region from the CFG. 9229 auto *SinkPred = SinkRegion->getSinglePredecessor(); 9230 auto *SinkSucc = SinkRegion->getSingleSuccessor(); 9231 VPBlockUtils::disconnectBlocks(SinkPred, SinkRegion); 9232 VPBlockUtils::disconnectBlocks(SinkRegion, SinkSucc); 9233 VPBlockUtils::connectBlocks(SinkPred, SinkSucc); 9234 9235 if (TargetRegion) { 9236 // The target recipe is also in a replicate region, move the sink region 9237 // after the target region. 9238 auto *TargetSucc = TargetRegion->getSingleSuccessor(); 9239 VPBlockUtils::disconnectBlocks(TargetRegion, TargetSucc); 9240 VPBlockUtils::connectBlocks(TargetRegion, SinkRegion); 9241 VPBlockUtils::connectBlocks(SinkRegion, TargetSucc); 9242 } else { 9243 // The sink source is in a replicate region, we need to move the whole 9244 // replicate region, which should only contain a single recipe in the 9245 // main block. 9246 auto *SplitBlock = 9247 Target->getParent()->splitAt(std::next(Target->getIterator())); 9248 9249 auto *SplitPred = SplitBlock->getSinglePredecessor(); 9250 9251 VPBlockUtils::disconnectBlocks(SplitPred, SplitBlock); 9252 VPBlockUtils::connectBlocks(SplitPred, SinkRegion); 9253 VPBlockUtils::connectBlocks(SinkRegion, SplitBlock); 9254 } 9255 } 9256 9257 VPlanTransforms::removeRedundantCanonicalIVs(*Plan); 9258 VPlanTransforms::removeRedundantInductionCasts(*Plan); 9259 9260 // Now that sink-after is done, move induction recipes for optimized truncates 9261 // to the phi section of the header block. 9262 for (VPWidenIntOrFpInductionRecipe *Ind : InductionsToMove) 9263 Ind->moveBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi()); 9264 9265 // Adjust the recipes for any inloop reductions. 9266 adjustRecipesForReductions(cast<VPBasicBlock>(TopRegion->getExit()), Plan, 9267 RecipeBuilder, Range.Start); 9268 9269 // Introduce a recipe to combine the incoming and previous values of a 9270 // first-order recurrence. 
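// Conceptually, the splice concatenates the previous iteration's vector with
// the current one and takes VF lanes starting at position VF-1. For VF=4 this
// lowers to IR along the lines of (illustrative only):
//   %splice = shufflevector <4 x i32> %prev, <4 x i32> %cur,
//                           <4 x i32> <i32 3, i32 4, i32 5, i32 6>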
9271 for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) { 9272 auto *RecurPhi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R); 9273 if (!RecurPhi) 9274 continue; 9275 9276 VPRecipeBase *PrevRecipe = RecurPhi->getBackedgeRecipe(); 9277 VPBasicBlock *InsertBlock = PrevRecipe->getParent(); 9278 auto *Region = GetReplicateRegion(PrevRecipe); 9279 if (Region) 9280 InsertBlock = cast<VPBasicBlock>(Region->getSingleSuccessor()); 9281 if (Region || PrevRecipe->isPhi()) 9282 Builder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi()); 9283 else 9284 Builder.setInsertPoint(InsertBlock, std::next(PrevRecipe->getIterator())); 9285 9286 auto *RecurSplice = cast<VPInstruction>( 9287 Builder.createNaryOp(VPInstruction::FirstOrderRecurrenceSplice, 9288 {RecurPhi, RecurPhi->getBackedgeValue()})); 9289 9290 RecurPhi->replaceAllUsesWith(RecurSplice); 9291 // Set the first operand of RecurSplice to RecurPhi again, after replacing 9292 // all users. 9293 RecurSplice->setOperand(0, RecurPhi); 9294 } 9295 9296 // Interleave memory: for each Interleave Group we marked earlier as relevant 9297 // for this VPlan, replace the Recipes widening its memory instructions with a 9298 // single VPInterleaveRecipe at its insertion point. 9299 for (auto IG : InterleaveGroups) { 9300 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( 9301 RecipeBuilder.getRecipe(IG->getInsertPos())); 9302 SmallVector<VPValue *, 4> StoredValues; 9303 for (unsigned i = 0; i < IG->getFactor(); ++i) 9304 if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) { 9305 auto *StoreR = 9306 cast<VPWidenMemoryInstructionRecipe>(RecipeBuilder.getRecipe(SI)); 9307 StoredValues.push_back(StoreR->getStoredValue()); 9308 } 9309 9310 auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues, 9311 Recipe->getMask()); 9312 VPIG->insertBefore(Recipe); 9313 unsigned J = 0; 9314 for (unsigned i = 0; i < IG->getFactor(); ++i) 9315 if (Instruction *Member = IG->getMember(i)) { 9316 if (!Member->getType()->isVoidTy()) { 9317 VPValue *OriginalV = Plan->getVPValue(Member); 9318 Plan->removeVPValueFor(Member); 9319 Plan->addVPValue(Member, VPIG->getVPValue(J)); 9320 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J)); 9321 J++; 9322 } 9323 RecipeBuilder.getRecipe(Member)->eraseFromParent(); 9324 } 9325 } 9326 9327 // From this point onwards, VPlan-to-VPlan transformations may change the plan 9328 // in ways that accessing values using original IR values is incorrect. 9329 Plan->disableValue2VPValue(); 9330 9331 VPlanTransforms::sinkScalarOperands(*Plan); 9332 VPlanTransforms::mergeReplicateRegions(*Plan); 9333 9334 std::string PlanName; 9335 raw_string_ostream RSO(PlanName); 9336 ElementCount VF = Range.Start; 9337 Plan->addVF(VF); 9338 RSO << "Initial VPlan for VF={" << VF; 9339 for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) { 9340 Plan->addVF(VF); 9341 RSO << "," << VF; 9342 } 9343 RSO << "},UF>=1"; 9344 RSO.flush(); 9345 Plan->setName(PlanName); 9346 9347 // Fold Exit block into its predecessor if possible. 9348 // TODO: Fold block earlier once all VPlan transforms properly maintain a 9349 // VPBasicBlock as exit. 
9350 VPBlockUtils::tryToMergeBlockIntoPredecessor(TopRegion->getExit());
9351
9352 assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid");
9353 return Plan;
9354 }
9355
9356 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
9357 // Outer loop handling: They may require CFG and instruction level
9358 // transformations before even evaluating whether vectorization is profitable.
9359 // Since we cannot modify the incoming IR, we need to build VPlan upfront in
9360 // the vectorization pipeline.
9361 assert(!OrigLoop->isInnermost());
9362 assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
9363
9364 // Create new empty VPlan
9365 auto Plan = std::make_unique<VPlan>();
9366
9367 // Build hierarchical CFG
9368 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
9369 HCFGBuilder.buildHierarchicalCFG();
9370
9371 for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End);
9372 VF *= 2)
9373 Plan->addVF(VF);
9374
9375 if (EnableVPlanPredication) {
9376 VPlanPredicator VPP(*Plan);
9377 VPP.predicate();
9378
9379 // Avoid running transformation to recipes until masked code generation in
9380 // VPlan-native path is in place.
9381 return Plan;
9382 }
9383
9384 SmallPtrSet<Instruction *, 1> DeadInstructions;
9385 VPlanTransforms::VPInstructionsToVPRecipes(
9386 OrigLoop, Plan,
9387 [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); },
9388 DeadInstructions, *PSE.getSE());
9389
9390 addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DebugLoc(),
9391 true, true);
9392 return Plan;
9393 }
9394
9395 // Adjust the recipes for reductions. For in-loop reductions the chain of
9396 // instructions leading from the loop exit instr to the phi needs to be converted
9397 // to reductions, with one operand being vector and the other being the scalar
9398 // reduction chain. For other reductions, a select is introduced between the phi
9399 // and live-out recipes when folding the tail.
9400 void LoopVectorizationPlanner::adjustRecipesForReductions(
9401 VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder,
9402 ElementCount MinVF) {
9403 for (auto &Reduction : CM.getInLoopReductionChains()) {
9404 PHINode *Phi = Reduction.first;
9405 const RecurrenceDescriptor &RdxDesc =
9406 Legal->getReductionVars().find(Phi)->second;
9407 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
9408
9409 if (MinVF.isScalar() && !CM.useOrderedReductions(RdxDesc))
9410 continue;
9411
9412 // ReductionOperations are ordered top-down from the phi's use to the
9413 // LoopExitValue. We keep track of the previous item (the Chain) to tell
9414 // which of the two operands will remain scalar and which will be reduced.
9415 // For minmax the chain will be the select instructions.
9416 Instruction *Chain = Phi;
9417 for (Instruction *R : ReductionOperations) {
9418 VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
9419 RecurKind Kind = RdxDesc.getRecurrenceKind();
9420
9421 VPValue *ChainOp = Plan->getVPValue(Chain);
9422 unsigned FirstOpId;
9423 assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
9424 "Only min/max recurrences allowed for inloop reductions");
9425 // Recognize a call to the llvm.fmuladd intrinsic.
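// For illustration, a reduction step written as
//   %sum.next = call float @llvm.fmuladd.f32(float %a, float %b, float %sum)
// is handled below by emitting a separate FMul recipe for %a * %b and feeding
// its result into the (fadd) reduction recipe as the vector operand.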
9426 bool IsFMulAdd = (Kind == RecurKind::FMulAdd); 9427 assert((!IsFMulAdd || RecurrenceDescriptor::isFMulAddIntrinsic(R)) && 9428 "Expected instruction to be a call to the llvm.fmuladd intrinsic"); 9429 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9430 assert(isa<VPWidenSelectRecipe>(WidenRecipe) && 9431 "Expected to replace a VPWidenSelectSC"); 9432 FirstOpId = 1; 9433 } else { 9434 assert((MinVF.isScalar() || isa<VPWidenRecipe>(WidenRecipe) || 9435 (IsFMulAdd && isa<VPWidenCallRecipe>(WidenRecipe))) && 9436 "Expected to replace a VPWidenSC"); 9437 FirstOpId = 0; 9438 } 9439 unsigned VecOpId = 9440 R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId; 9441 VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId)); 9442 9443 auto *CondOp = CM.blockNeedsPredicationForAnyReason(R->getParent()) 9444 ? RecipeBuilder.createBlockInMask(R->getParent(), Plan) 9445 : nullptr; 9446 9447 if (IsFMulAdd) { 9448 // If the instruction is a call to the llvm.fmuladd intrinsic then we 9449 // need to create an fmul recipe to use as the vector operand for the 9450 // fadd reduction. 9451 VPInstruction *FMulRecipe = new VPInstruction( 9452 Instruction::FMul, {VecOp, Plan->getVPValue(R->getOperand(1))}); 9453 FMulRecipe->setFastMathFlags(R->getFastMathFlags()); 9454 WidenRecipe->getParent()->insert(FMulRecipe, 9455 WidenRecipe->getIterator()); 9456 VecOp = FMulRecipe; 9457 } 9458 VPReductionRecipe *RedRecipe = 9459 new VPReductionRecipe(&RdxDesc, R, ChainOp, VecOp, CondOp, TTI); 9460 WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe); 9461 Plan->removeVPValueFor(R); 9462 Plan->addVPValue(R, RedRecipe); 9463 WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator()); 9464 WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe); 9465 WidenRecipe->eraseFromParent(); 9466 9467 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9468 VPRecipeBase *CompareRecipe = 9469 RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0))); 9470 assert(isa<VPWidenRecipe>(CompareRecipe) && 9471 "Expected to replace a VPWidenSC"); 9472 assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 && 9473 "Expected no remaining users"); 9474 CompareRecipe->eraseFromParent(); 9475 } 9476 Chain = R; 9477 } 9478 } 9479 9480 // If tail is folded by masking, introduce selects between the phi 9481 // and the live-out instruction of each reduction, at the beginning of the 9482 // dedicated latch block. 
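// Sketch of the select introduced per unrolled part (types and names are
// illustrative):
//   %rdx.select = select <4 x i1> %header.mask, <4 x i32> %rdx.next, <4 x i32> %rdx.phi
// Lanes masked off by the header block mask keep the phi's previous value, so
// only real iterations contribute to the final reduction.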
9483 if (CM.foldTailByMasking()) { 9484 Builder.setInsertPoint(LatchVPBB, LatchVPBB->begin()); 9485 for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) { 9486 VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R); 9487 if (!PhiR || PhiR->isInLoop()) 9488 continue; 9489 VPValue *Cond = 9490 RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan); 9491 VPValue *Red = PhiR->getBackedgeValue(); 9492 assert(cast<VPRecipeBase>(Red->getDef())->getParent() != LatchVPBB && 9493 "reduction recipe must be defined before latch"); 9494 Builder.createNaryOp(Instruction::Select, {Cond, Red, PhiR}); 9495 } 9496 } 9497 } 9498 9499 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 9500 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, 9501 VPSlotTracker &SlotTracker) const { 9502 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; 9503 IG->getInsertPos()->printAsOperand(O, false); 9504 O << ", "; 9505 getAddr()->printAsOperand(O, SlotTracker); 9506 VPValue *Mask = getMask(); 9507 if (Mask) { 9508 O << ", "; 9509 Mask->printAsOperand(O, SlotTracker); 9510 } 9511 9512 unsigned OpIdx = 0; 9513 for (unsigned i = 0; i < IG->getFactor(); ++i) { 9514 if (!IG->getMember(i)) 9515 continue; 9516 if (getNumStoreOperands() > 0) { 9517 O << "\n" << Indent << " store "; 9518 getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker); 9519 O << " to index " << i; 9520 } else { 9521 O << "\n" << Indent << " "; 9522 getVPValue(OpIdx)->printAsOperand(O, SlotTracker); 9523 O << " = load from index " << i; 9524 } 9525 ++OpIdx; 9526 } 9527 } 9528 #endif 9529 9530 void VPWidenCallRecipe::execute(VPTransformState &State) { 9531 State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this, 9532 *this, State); 9533 } 9534 9535 void VPWidenSelectRecipe::execute(VPTransformState &State) { 9536 auto &I = *cast<SelectInst>(getUnderlyingInstr()); 9537 State.ILV->setDebugLocFromInst(&I); 9538 9539 // The condition can be loop invariant but still defined inside the 9540 // loop. This means that we can't just use the original 'cond' value. 9541 // We have to take the 'vectorized' value and pick the first lane. 9542 // Instcombine will make this a no-op. 9543 auto *InvarCond = 9544 InvariantCond ? State.get(getOperand(0), VPIteration(0, 0)) : nullptr; 9545 9546 for (unsigned Part = 0; Part < State.UF; ++Part) { 9547 Value *Cond = InvarCond ? 
InvarCond : State.get(getOperand(0), Part); 9548 Value *Op0 = State.get(getOperand(1), Part); 9549 Value *Op1 = State.get(getOperand(2), Part); 9550 Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1); 9551 State.set(this, Sel, Part); 9552 State.ILV->addMetadata(Sel, &I); 9553 } 9554 } 9555 9556 void VPWidenRecipe::execute(VPTransformState &State) { 9557 auto &I = *cast<Instruction>(getUnderlyingValue()); 9558 auto &Builder = State.Builder; 9559 switch (I.getOpcode()) { 9560 case Instruction::Call: 9561 case Instruction::Br: 9562 case Instruction::PHI: 9563 case Instruction::GetElementPtr: 9564 case Instruction::Select: 9565 llvm_unreachable("This instruction is handled by a different recipe."); 9566 case Instruction::UDiv: 9567 case Instruction::SDiv: 9568 case Instruction::SRem: 9569 case Instruction::URem: 9570 case Instruction::Add: 9571 case Instruction::FAdd: 9572 case Instruction::Sub: 9573 case Instruction::FSub: 9574 case Instruction::FNeg: 9575 case Instruction::Mul: 9576 case Instruction::FMul: 9577 case Instruction::FDiv: 9578 case Instruction::FRem: 9579 case Instruction::Shl: 9580 case Instruction::LShr: 9581 case Instruction::AShr: 9582 case Instruction::And: 9583 case Instruction::Or: 9584 case Instruction::Xor: { 9585 // Just widen unops and binops. 9586 State.ILV->setDebugLocFromInst(&I); 9587 9588 for (unsigned Part = 0; Part < State.UF; ++Part) { 9589 SmallVector<Value *, 2> Ops; 9590 for (VPValue *VPOp : operands()) 9591 Ops.push_back(State.get(VPOp, Part)); 9592 9593 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); 9594 9595 if (auto *VecOp = dyn_cast<Instruction>(V)) { 9596 VecOp->copyIRFlags(&I); 9597 9598 // If the instruction is vectorized and was in a basic block that needed 9599 // predication, we can't propagate poison-generating flags (nuw/nsw, 9600 // exact, etc.). The control flow has been linearized and the 9601 // instruction is no longer guarded by the predicate, which could make 9602 // the flag properties to no longer hold. 9603 if (State.MayGeneratePoisonRecipes.contains(this)) 9604 VecOp->dropPoisonGeneratingFlags(); 9605 } 9606 9607 // Use this vector value for all users of the original instruction. 9608 State.set(this, V, Part); 9609 State.ILV->addMetadata(V, &I); 9610 } 9611 9612 break; 9613 } 9614 case Instruction::ICmp: 9615 case Instruction::FCmp: { 9616 // Widen compares. Generate vector compares. 9617 bool FCmp = (I.getOpcode() == Instruction::FCmp); 9618 auto *Cmp = cast<CmpInst>(&I); 9619 State.ILV->setDebugLocFromInst(Cmp); 9620 for (unsigned Part = 0; Part < State.UF; ++Part) { 9621 Value *A = State.get(getOperand(0), Part); 9622 Value *B = State.get(getOperand(1), Part); 9623 Value *C = nullptr; 9624 if (FCmp) { 9625 // Propagate fast math flags. 
9626 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 9627 Builder.setFastMathFlags(Cmp->getFastMathFlags()); 9628 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); 9629 } else { 9630 C = Builder.CreateICmp(Cmp->getPredicate(), A, B); 9631 } 9632 State.set(this, C, Part); 9633 State.ILV->addMetadata(C, &I); 9634 } 9635 9636 break; 9637 } 9638 9639 case Instruction::ZExt: 9640 case Instruction::SExt: 9641 case Instruction::FPToUI: 9642 case Instruction::FPToSI: 9643 case Instruction::FPExt: 9644 case Instruction::PtrToInt: 9645 case Instruction::IntToPtr: 9646 case Instruction::SIToFP: 9647 case Instruction::UIToFP: 9648 case Instruction::Trunc: 9649 case Instruction::FPTrunc: 9650 case Instruction::BitCast: { 9651 auto *CI = cast<CastInst>(&I); 9652 State.ILV->setDebugLocFromInst(CI); 9653 9654 /// Vectorize casts. 9655 Type *DestTy = (State.VF.isScalar()) 9656 ? CI->getType() 9657 : VectorType::get(CI->getType(), State.VF); 9658 9659 for (unsigned Part = 0; Part < State.UF; ++Part) { 9660 Value *A = State.get(getOperand(0), Part); 9661 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); 9662 State.set(this, Cast, Part); 9663 State.ILV->addMetadata(Cast, &I); 9664 } 9665 break; 9666 } 9667 default: 9668 // This instruction is not vectorized by simple widening. 9669 LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); 9670 llvm_unreachable("Unhandled instruction!"); 9671 } // end of switch. 9672 } 9673 9674 void VPWidenGEPRecipe::execute(VPTransformState &State) { 9675 auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr()); 9676 // Construct a vector GEP by widening the operands of the scalar GEP as 9677 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP 9678 // results in a vector of pointers when at least one operand of the GEP 9679 // is vector-typed. Thus, to keep the representation compact, we only use 9680 // vector-typed operands for loop-varying values. 9681 9682 if (State.VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { 9683 // If we are vectorizing, but the GEP has only loop-invariant operands, 9684 // the GEP we build (by only using vector-typed operands for 9685 // loop-varying values) would be a scalar pointer. Thus, to ensure we 9686 // produce a vector of pointers, we need to either arbitrarily pick an 9687 // operand to broadcast, or broadcast a clone of the original GEP. 9688 // Here, we broadcast a clone of the original. 9689 // 9690 // TODO: If at some point we decide to scalarize instructions having 9691 // loop-invariant operands, this special case will no longer be 9692 // required. We would add the scalarization decision to 9693 // collectLoopScalars() and teach getVectorValue() to broadcast 9694 // the lane-zero scalar value. 9695 auto *Clone = State.Builder.Insert(GEP->clone()); 9696 for (unsigned Part = 0; Part < State.UF; ++Part) { 9697 Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, Clone); 9698 State.set(this, EntryPart, Part); 9699 State.ILV->addMetadata(EntryPart, GEP); 9700 } 9701 } else { 9702 // If the GEP has at least one loop-varying operand, we are sure to 9703 // produce a vector of pointers. But if we are only unrolling, we want 9704 // to produce a scalar GEP for each unroll part. Thus, the GEP we 9705 // produce with the code below will be scalar (if VF == 1) or vector 9706 // (otherwise). Note that for the unroll-only case, we still maintain 9707 // values in the vector mapping with initVector, as we do for other 9708 // instructions. 
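    // Illustrative sketch only (types and value names below are invented, not
    // taken from the surrounding code): for a scalar GEP such as
    //   %p = getelementptr inbounds i32, i32* %base, i64 %i
    // with a loop-varying index %i and VF = 4, the GEP built below is roughly
    //   %p.vec = getelementptr inbounds i32, i32* %base, <4 x i64> %i.vec
    // i.e. only the loop-varying operand is widened, and the result is a
    // <4 x i32*> vector of pointers.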
9709 for (unsigned Part = 0; Part < State.UF; ++Part) { 9710 // The pointer operand of the new GEP. If it's loop-invariant, we 9711 // won't broadcast it. 9712 auto *Ptr = IsPtrLoopInvariant 9713 ? State.get(getOperand(0), VPIteration(0, 0)) 9714 : State.get(getOperand(0), Part); 9715 9716 // Collect all the indices for the new GEP. If any index is 9717 // loop-invariant, we won't broadcast it. 9718 SmallVector<Value *, 4> Indices; 9719 for (unsigned I = 1, E = getNumOperands(); I < E; I++) { 9720 VPValue *Operand = getOperand(I); 9721 if (IsIndexLoopInvariant[I - 1]) 9722 Indices.push_back(State.get(Operand, VPIteration(0, 0))); 9723 else 9724 Indices.push_back(State.get(Operand, Part)); 9725 } 9726 9727 // If the GEP instruction is vectorized and was in a basic block that 9728 // needed predication, we can't propagate the poison-generating 'inbounds' 9729 // flag. The control flow has been linearized and the GEP is no longer 9730 // guarded by the predicate, which could make the 'inbounds' properties to 9731 // no longer hold. 9732 bool IsInBounds = 9733 GEP->isInBounds() && State.MayGeneratePoisonRecipes.count(this) == 0; 9734 9735 // Create the new GEP. Note that this GEP may be a scalar if VF == 1, 9736 // but it should be a vector, otherwise. 9737 auto *NewGEP = IsInBounds 9738 ? State.Builder.CreateInBoundsGEP( 9739 GEP->getSourceElementType(), Ptr, Indices) 9740 : State.Builder.CreateGEP(GEP->getSourceElementType(), 9741 Ptr, Indices); 9742 assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) && 9743 "NewGEP is not a pointer vector"); 9744 State.set(this, NewGEP, Part); 9745 State.ILV->addMetadata(NewGEP, GEP); 9746 } 9747 } 9748 } 9749 9750 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { 9751 assert(!State.Instance && "Int or FP induction being replicated."); 9752 auto *CanonicalIV = State.get(getParent()->getPlan()->getCanonicalIV(), 0); 9753 State.ILV->widenIntOrFpInduction(IV, this, State, CanonicalIV); 9754 } 9755 9756 void VPWidenPHIRecipe::execute(VPTransformState &State) { 9757 State.ILV->widenPHIInstruction(cast<PHINode>(getUnderlyingValue()), this, 9758 State); 9759 } 9760 9761 void VPBlendRecipe::execute(VPTransformState &State) { 9762 State.ILV->setDebugLocFromInst(Phi, &State.Builder); 9763 // We know that all PHIs in non-header blocks are converted into 9764 // selects, so we don't have to worry about the insertion order and we 9765 // can just use the builder. 9766 // At this point we generate the predication tree. There may be 9767 // duplications since this is a simple recursive scan, but future 9768 // optimizations will clean it up. 9769 9770 unsigned NumIncoming = getNumIncomingValues(); 9771 9772 // Generate a sequence of selects of the form: 9773 // SELECT(Mask3, In3, 9774 // SELECT(Mask2, In2, 9775 // SELECT(Mask1, In1, 9776 // In0))) 9777 // Note that Mask0 is never used: lanes for which no path reaches this phi and 9778 // are essentially undef are taken from In0. 9779 InnerLoopVectorizer::VectorParts Entry(State.UF); 9780 for (unsigned In = 0; In < NumIncoming; ++In) { 9781 for (unsigned Part = 0; Part < State.UF; ++Part) { 9782 // We might have single edge PHIs (blocks) - use an identity 9783 // 'select' for the first PHI operand. 9784 Value *In0 = State.get(getIncomingValue(In), Part); 9785 if (In == 0) 9786 Entry[Part] = In0; // Initialize with the first incoming value. 9787 else { 9788 // Select between the current value and the previous incoming edge 9789 // based on the incoming mask. 
9790 Value *Cond = State.get(getMask(In), Part); 9791 Entry[Part] = 9792 State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi"); 9793 } 9794 } 9795 } 9796 for (unsigned Part = 0; Part < State.UF; ++Part) 9797 State.set(this, Entry[Part], Part); 9798 } 9799 9800 void VPInterleaveRecipe::execute(VPTransformState &State) { 9801 assert(!State.Instance && "Interleave group being replicated."); 9802 State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(), 9803 getStoredValues(), getMask()); 9804 } 9805 9806 void VPReductionRecipe::execute(VPTransformState &State) { 9807 assert(!State.Instance && "Reduction being replicated."); 9808 Value *PrevInChain = State.get(getChainOp(), 0); 9809 RecurKind Kind = RdxDesc->getRecurrenceKind(); 9810 bool IsOrdered = State.ILV->useOrderedReductions(*RdxDesc); 9811 // Propagate the fast-math flags carried by the underlying instruction. 9812 IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder); 9813 State.Builder.setFastMathFlags(RdxDesc->getFastMathFlags()); 9814 for (unsigned Part = 0; Part < State.UF; ++Part) { 9815 Value *NewVecOp = State.get(getVecOp(), Part); 9816 if (VPValue *Cond = getCondOp()) { 9817 Value *NewCond = State.get(Cond, Part); 9818 VectorType *VecTy = cast<VectorType>(NewVecOp->getType()); 9819 Value *Iden = RdxDesc->getRecurrenceIdentity( 9820 Kind, VecTy->getElementType(), RdxDesc->getFastMathFlags()); 9821 Value *IdenVec = 9822 State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden); 9823 Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec); 9824 NewVecOp = Select; 9825 } 9826 Value *NewRed; 9827 Value *NextInChain; 9828 if (IsOrdered) { 9829 if (State.VF.isVector()) 9830 NewRed = createOrderedReduction(State.Builder, *RdxDesc, NewVecOp, 9831 PrevInChain); 9832 else 9833 NewRed = State.Builder.CreateBinOp( 9834 (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), PrevInChain, 9835 NewVecOp); 9836 PrevInChain = NewRed; 9837 } else { 9838 PrevInChain = State.get(getChainOp(), Part); 9839 NewRed = createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp); 9840 } 9841 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9842 NextInChain = 9843 createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(), 9844 NewRed, PrevInChain); 9845 } else if (IsOrdered) 9846 NextInChain = NewRed; 9847 else 9848 NextInChain = State.Builder.CreateBinOp( 9849 (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), NewRed, 9850 PrevInChain); 9851 State.set(this, NextInChain, Part); 9852 } 9853 } 9854 9855 void VPReplicateRecipe::execute(VPTransformState &State) { 9856 if (State.Instance) { // Generate a single instance. 9857 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector"); 9858 State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *State.Instance, 9859 IsPredicated, State); 9860 // Insert scalar instance packing it into a vector. 9861 if (AlsoPack && State.VF.isVector()) { 9862 // If we're constructing lane 0, initialize to start from poison. 
      if (State.Instance->Lane.isFirstLane()) {
        assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
        Value *Poison = PoisonValue::get(
            VectorType::get(getUnderlyingValue()->getType(), State.VF));
        State.set(this, Poison, State.Instance->Part);
      }
      State.ILV->packScalarIntoVectorValue(this, *State.Instance, State);
    }
    return;
  }

  // Generate scalar instances for all VF lanes of all UF parts, unless the
  // instruction is uniform, in which case generate only the first lane for
  // each of the UF parts.
  unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();
  assert((!State.VF.isScalable() || IsUniform) &&
         "Can't scalarize a scalable vector");
  for (unsigned Part = 0; Part < State.UF; ++Part)
    for (unsigned Lane = 0; Lane < EndLane; ++Lane)
      State.ILV->scalarizeInstruction(getUnderlyingInstr(), this,
                                      VPIteration(Part, Lane), IsPredicated,
                                      State);
}

void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Branch on Mask works only on single instance.");

  unsigned Part = State.Instance->Part;
  unsigned Lane = State.Instance->Lane.getKnownLane();

  Value *ConditionBit = nullptr;
  VPValue *BlockInMask = getMask();
  if (BlockInMask) {
    ConditionBit = State.get(BlockInMask, Part);
    if (ConditionBit->getType()->isVectorTy())
      ConditionBit = State.Builder.CreateExtractElement(
          ConditionBit, State.Builder.getInt32(Lane));
  } else // Block in mask is all-one.
    ConditionBit = State.Builder.getTrue();

  // Replace the temporary unreachable terminator with a new conditional
  // branch, whose two destinations will be set later when they are created.
  auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
  assert(isa<UnreachableInst>(CurrentTerminator) &&
         "Expected to replace unreachable terminator with conditional branch.");
  auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
  CondBr->setSuccessor(0, nullptr);
  ReplaceInstWithInst(CurrentTerminator, CondBr);
}

void VPPredInstPHIRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Predicated instruction PHI works per instance.");
  Instruction *ScalarPredInst =
      cast<Instruction>(State.get(getOperand(0), *State.Instance));
  BasicBlock *PredicatedBB = ScalarPredInst->getParent();
  BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
  assert(PredicatingBB && "Predicated block has no single predecessor.");
  assert(isa<VPReplicateRecipe>(getOperand(0)) &&
         "operand must be VPReplicateRecipe");

  // By current pack/unpack logic we need to generate only a single phi node: if
  // a vector value for the predicated instruction exists at this point it means
  // the instruction has vector users only, and a phi for the vector value is
  // needed. In this case the recipe of the predicated instruction is marked to
  // also do that packing, thereby "hoisting" the insert-element sequence.
  // Otherwise, a phi node for the scalar value is needed.
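  // Illustrative sketch only (block and value names below are invented): in
  // the vector-value case the phi created here looks roughly like
  //   merge.bb:
  //     %v.phi = phi <4 x i32> [ %v.prev, %predicating.bb ],
  //                            [ %v.new,  %pred.bb ]
  // where %v.new is the insertelement that packed the newly generated scalar
  // lane into %v.prev.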
9929 unsigned Part = State.Instance->Part; 9930 if (State.hasVectorValue(getOperand(0), Part)) { 9931 Value *VectorValue = State.get(getOperand(0), Part); 9932 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); 9933 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); 9934 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. 9935 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. 9936 if (State.hasVectorValue(this, Part)) 9937 State.reset(this, VPhi, Part); 9938 else 9939 State.set(this, VPhi, Part); 9940 // NOTE: Currently we need to update the value of the operand, so the next 9941 // predicated iteration inserts its generated value in the correct vector. 9942 State.reset(getOperand(0), VPhi, Part); 9943 } else { 9944 Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType(); 9945 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); 9946 Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()), 9947 PredicatingBB); 9948 Phi->addIncoming(ScalarPredInst, PredicatedBB); 9949 if (State.hasScalarValue(this, *State.Instance)) 9950 State.reset(this, Phi, *State.Instance); 9951 else 9952 State.set(this, Phi, *State.Instance); 9953 // NOTE: Currently we need to update the value of the operand, so the next 9954 // predicated iteration inserts its generated value in the correct vector. 9955 State.reset(getOperand(0), Phi, *State.Instance); 9956 } 9957 } 9958 9959 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { 9960 VPValue *StoredValue = isStore() ? getStoredValue() : nullptr; 9961 9962 // Attempt to issue a wide load. 9963 LoadInst *LI = dyn_cast<LoadInst>(&Ingredient); 9964 StoreInst *SI = dyn_cast<StoreInst>(&Ingredient); 9965 9966 assert((LI || SI) && "Invalid Load/Store instruction"); 9967 assert((!SI || StoredValue) && "No stored value provided for widened store"); 9968 assert((!LI || !StoredValue) && "Stored value provided for widened load"); 9969 9970 Type *ScalarDataTy = getLoadStoreType(&Ingredient); 9971 9972 auto *DataTy = VectorType::get(ScalarDataTy, State.VF); 9973 const Align Alignment = getLoadStoreAlignment(&Ingredient); 9974 bool CreateGatherScatter = !Consecutive; 9975 9976 auto &Builder = State.Builder; 9977 InnerLoopVectorizer::VectorParts BlockInMaskParts(State.UF); 9978 bool isMaskRequired = getMask(); 9979 if (isMaskRequired) 9980 for (unsigned Part = 0; Part < State.UF; ++Part) 9981 BlockInMaskParts[Part] = State.get(getMask(), Part); 9982 9983 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { 9984 // Calculate the pointer for the specific unroll-part. 9985 GetElementPtrInst *PartPtr = nullptr; 9986 9987 bool InBounds = false; 9988 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) 9989 InBounds = gep->isInBounds(); 9990 if (Reverse) { 9991 // If the address is consecutive but reversed, then the 9992 // wide store needs to start at the last vector element. 
9993 // RunTimeVF = VScale * VF.getKnownMinValue() 9994 // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue() 9995 Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), State.VF); 9996 // NumElt = -Part * RunTimeVF 9997 Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF); 9998 // LastLane = 1 - RunTimeVF 9999 Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF); 10000 PartPtr = 10001 cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt)); 10002 PartPtr->setIsInBounds(InBounds); 10003 PartPtr = cast<GetElementPtrInst>( 10004 Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane)); 10005 PartPtr->setIsInBounds(InBounds); 10006 if (isMaskRequired) // Reverse of a null all-one mask is a null mask. 10007 BlockInMaskParts[Part] = 10008 Builder.CreateVectorReverse(BlockInMaskParts[Part], "reverse"); 10009 } else { 10010 Value *Increment = 10011 createStepForVF(Builder, Builder.getInt32Ty(), State.VF, Part); 10012 PartPtr = cast<GetElementPtrInst>( 10013 Builder.CreateGEP(ScalarDataTy, Ptr, Increment)); 10014 PartPtr->setIsInBounds(InBounds); 10015 } 10016 10017 unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); 10018 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); 10019 }; 10020 10021 // Handle Stores: 10022 if (SI) { 10023 State.ILV->setDebugLocFromInst(SI); 10024 10025 for (unsigned Part = 0; Part < State.UF; ++Part) { 10026 Instruction *NewSI = nullptr; 10027 Value *StoredVal = State.get(StoredValue, Part); 10028 if (CreateGatherScatter) { 10029 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 10030 Value *VectorGep = State.get(getAddr(), Part); 10031 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, 10032 MaskPart); 10033 } else { 10034 if (Reverse) { 10035 // If we store to reverse consecutive memory locations, then we need 10036 // to reverse the order of elements in the stored value. 10037 StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse"); 10038 // We don't want to update the value in the map as it might be used in 10039 // another expression. So don't call resetVectorValue(StoredVal). 10040 } 10041 auto *VecPtr = 10042 CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0))); 10043 if (isMaskRequired) 10044 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, 10045 BlockInMaskParts[Part]); 10046 else 10047 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); 10048 } 10049 State.ILV->addMetadata(NewSI, SI); 10050 } 10051 return; 10052 } 10053 10054 // Handle loads. 10055 assert(LI && "Must have a load instruction"); 10056 State.ILV->setDebugLocFromInst(LI); 10057 for (unsigned Part = 0; Part < State.UF; ++Part) { 10058 Value *NewLI; 10059 if (CreateGatherScatter) { 10060 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 10061 Value *VectorGep = State.get(getAddr(), Part); 10062 NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart, 10063 nullptr, "wide.masked.gather"); 10064 State.ILV->addMetadata(NewLI, LI); 10065 } else { 10066 auto *VecPtr = 10067 CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0))); 10068 if (isMaskRequired) 10069 NewLI = Builder.CreateMaskedLoad( 10070 DataTy, VecPtr, Alignment, BlockInMaskParts[Part], 10071 PoisonValue::get(DataTy), "wide.masked.load"); 10072 else 10073 NewLI = 10074 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); 10075 10076 // Add metadata to the load, but setVectorValue to the reverse shuffle. 
10077 State.ILV->addMetadata(NewLI, LI); 10078 if (Reverse) 10079 NewLI = Builder.CreateVectorReverse(NewLI, "reverse"); 10080 } 10081 10082 State.set(this, NewLI, Part); 10083 } 10084 } 10085 10086 // Determine how to lower the scalar epilogue, which depends on 1) optimising 10087 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing 10088 // predication, and 4) a TTI hook that analyses whether the loop is suitable 10089 // for predication. 10090 static ScalarEpilogueLowering getScalarEpilogueLowering( 10091 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, 10092 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, 10093 AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, 10094 LoopVectorizationLegality &LVL) { 10095 // 1) OptSize takes precedence over all other options, i.e. if this is set, 10096 // don't look at hints or options, and don't request a scalar epilogue. 10097 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from 10098 // LoopAccessInfo (due to code dependency and not being able to reliably get 10099 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection 10100 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without 10101 // versioning when the vectorization is forced, unlike hasOptSize. So revert 10102 // back to the old way and vectorize with versioning when forced. See D81345.) 10103 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, 10104 PGSOQueryType::IRPass) && 10105 Hints.getForce() != LoopVectorizeHints::FK_Enabled)) 10106 return CM_ScalarEpilogueNotAllowedOptSize; 10107 10108 // 2) If set, obey the directives 10109 if (PreferPredicateOverEpilogue.getNumOccurrences()) { 10110 switch (PreferPredicateOverEpilogue) { 10111 case PreferPredicateTy::ScalarEpilogue: 10112 return CM_ScalarEpilogueAllowed; 10113 case PreferPredicateTy::PredicateElseScalarEpilogue: 10114 return CM_ScalarEpilogueNotNeededUsePredicate; 10115 case PreferPredicateTy::PredicateOrDontVectorize: 10116 return CM_ScalarEpilogueNotAllowedUsePredicate; 10117 }; 10118 } 10119 10120 // 3) If set, obey the hints 10121 switch (Hints.getPredicate()) { 10122 case LoopVectorizeHints::FK_Enabled: 10123 return CM_ScalarEpilogueNotNeededUsePredicate; 10124 case LoopVectorizeHints::FK_Disabled: 10125 return CM_ScalarEpilogueAllowed; 10126 }; 10127 10128 // 4) if the TTI hook indicates this is profitable, request predication. 10129 if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, 10130 LVL.getLAI())) 10131 return CM_ScalarEpilogueNotNeededUsePredicate; 10132 10133 return CM_ScalarEpilogueAllowed; 10134 } 10135 10136 Value *VPTransformState::get(VPValue *Def, unsigned Part) { 10137 // If Values have been set for this Def return the one relevant for \p Part. 10138 if (hasVectorValue(Def, Part)) 10139 return Data.PerPartOutput[Def][Part]; 10140 10141 if (!hasScalarValue(Def, {Part, 0})) { 10142 Value *IRV = Def->getLiveInIRValue(); 10143 Value *B = ILV->getBroadcastInstrs(IRV); 10144 set(Def, B, Part); 10145 return B; 10146 } 10147 10148 Value *ScalarValue = get(Def, {Part, 0}); 10149 // If we aren't vectorizing, we can just copy the scalar map values over 10150 // to the vector map. 10151 if (VF.isScalar()) { 10152 set(Def, ScalarValue, Part); 10153 return ScalarValue; 10154 } 10155 10156 auto *RepR = dyn_cast<VPReplicateRecipe>(Def); 10157 bool IsUniform = RepR && RepR->isUniform(); 10158 10159 unsigned LastLane = IsUniform ? 
0 : VF.getKnownMinValue() - 1; 10160 // Check if there is a scalar value for the selected lane. 10161 if (!hasScalarValue(Def, {Part, LastLane})) { 10162 // At the moment, VPWidenIntOrFpInductionRecipes can also be uniform. 10163 assert(isa<VPWidenIntOrFpInductionRecipe>(Def->getDef()) && 10164 "unexpected recipe found to be invariant"); 10165 IsUniform = true; 10166 LastLane = 0; 10167 } 10168 10169 auto *LastInst = cast<Instruction>(get(Def, {Part, LastLane})); 10170 // Set the insert point after the last scalarized instruction or after the 10171 // last PHI, if LastInst is a PHI. This ensures the insertelement sequence 10172 // will directly follow the scalar definitions. 10173 auto OldIP = Builder.saveIP(); 10174 auto NewIP = 10175 isa<PHINode>(LastInst) 10176 ? BasicBlock::iterator(LastInst->getParent()->getFirstNonPHI()) 10177 : std::next(BasicBlock::iterator(LastInst)); 10178 Builder.SetInsertPoint(&*NewIP); 10179 10180 // However, if we are vectorizing, we need to construct the vector values. 10181 // If the value is known to be uniform after vectorization, we can just 10182 // broadcast the scalar value corresponding to lane zero for each unroll 10183 // iteration. Otherwise, we construct the vector values using 10184 // insertelement instructions. Since the resulting vectors are stored in 10185 // State, we will only generate the insertelements once. 10186 Value *VectorValue = nullptr; 10187 if (IsUniform) { 10188 VectorValue = ILV->getBroadcastInstrs(ScalarValue); 10189 set(Def, VectorValue, Part); 10190 } else { 10191 // Initialize packing with insertelements to start from undef. 10192 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 10193 Value *Undef = PoisonValue::get(VectorType::get(LastInst->getType(), VF)); 10194 set(Def, Undef, Part); 10195 for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane) 10196 ILV->packScalarIntoVectorValue(Def, {Part, Lane}, *this); 10197 VectorValue = get(Def, Part); 10198 } 10199 Builder.restoreIP(OldIP); 10200 return VectorValue; 10201 } 10202 10203 // Process the loop in the VPlan-native vectorization path. This path builds 10204 // VPlan upfront in the vectorization pipeline, which allows to apply 10205 // VPlan-to-VPlan transformations from the very beginning without modifying the 10206 // input LLVM IR. 10207 static bool processLoopInVPlanNativePath( 10208 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, 10209 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, 10210 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, 10211 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, 10212 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints, 10213 LoopVectorizationRequirements &Requirements) { 10214 10215 if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) { 10216 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n"); 10217 return false; 10218 } 10219 assert(EnableVPlanNativePath && "VPlan-native path is disabled."); 10220 Function *F = L->getHeader()->getParent(); 10221 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); 10222 10223 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 10224 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL); 10225 10226 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, 10227 &Hints, IAI); 10228 // Use the planner for outer loop vectorization. 10229 // TODO: CM is not used at this point inside the planner. 
Turn CM into an 10230 // optional argument if we don't need it in the future. 10231 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints, 10232 Requirements, ORE); 10233 10234 // Get user vectorization factor. 10235 ElementCount UserVF = Hints.getWidth(); 10236 10237 CM.collectElementTypesForWidening(); 10238 10239 // Plan how to best vectorize, return the best VF and its cost. 10240 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF); 10241 10242 // If we are stress testing VPlan builds, do not attempt to generate vector 10243 // code. Masked vector code generation support will follow soon. 10244 // Also, do not attempt to vectorize if no vector code will be produced. 10245 if (VPlanBuildStressTest || EnableVPlanPredication || 10246 VectorizationFactor::Disabled() == VF) 10247 return false; 10248 10249 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); 10250 10251 { 10252 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, 10253 F->getParent()->getDataLayout()); 10254 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL, 10255 &CM, BFI, PSI, Checks); 10256 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" 10257 << L->getHeader()->getParent()->getName() << "\"\n"); 10258 LVP.executePlan(VF.Width, 1, BestPlan, LB, DT); 10259 } 10260 10261 // Mark the loop as already vectorized to avoid vectorizing again. 10262 Hints.setAlreadyVectorized(); 10263 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 10264 return true; 10265 } 10266 10267 // Emit a remark if there are stores to floats that required a floating point 10268 // extension. If the vectorized loop was generated with floating point there 10269 // will be a performance penalty from the conversion overhead and the change in 10270 // the vector width. 10271 static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) { 10272 SmallVector<Instruction *, 4> Worklist; 10273 for (BasicBlock *BB : L->getBlocks()) { 10274 for (Instruction &Inst : *BB) { 10275 if (auto *S = dyn_cast<StoreInst>(&Inst)) { 10276 if (S->getValueOperand()->getType()->isFloatTy()) 10277 Worklist.push_back(S); 10278 } 10279 } 10280 } 10281 10282 // Traverse the floating point stores upwards searching, for floating point 10283 // conversions. 10284 SmallPtrSet<const Instruction *, 4> Visited; 10285 SmallPtrSet<const Instruction *, 4> EmittedRemark; 10286 while (!Worklist.empty()) { 10287 auto *I = Worklist.pop_back_val(); 10288 if (!L->contains(I)) 10289 continue; 10290 if (!Visited.insert(I).second) 10291 continue; 10292 10293 // Emit a remark if the floating point store required a floating 10294 // point conversion. 10295 // TODO: More work could be done to identify the root cause such as a 10296 // constant or a function return type and point the user to it. 10297 if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second) 10298 ORE->emit([&]() { 10299 return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision", 10300 I->getDebugLoc(), L->getHeader()) 10301 << "floating point conversion changes vector width. 
" 10302 << "Mixed floating point precision requires an up/down " 10303 << "cast that will negatively impact performance."; 10304 }); 10305 10306 for (Use &Op : I->operands()) 10307 if (auto *OpI = dyn_cast<Instruction>(Op)) 10308 Worklist.push_back(OpI); 10309 } 10310 } 10311 10312 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts) 10313 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced || 10314 !EnableLoopInterleaving), 10315 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced || 10316 !EnableLoopVectorization) {} 10317 10318 bool LoopVectorizePass::processLoop(Loop *L) { 10319 assert((EnableVPlanNativePath || L->isInnermost()) && 10320 "VPlan-native path is not enabled. Only process inner loops."); 10321 10322 #ifndef NDEBUG 10323 const std::string DebugLocStr = getDebugLocString(L); 10324 #endif /* NDEBUG */ 10325 10326 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \"" 10327 << L->getHeader()->getParent()->getName() << "\" from " 10328 << DebugLocStr << "\n"); 10329 10330 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI); 10331 10332 LLVM_DEBUG( 10333 dbgs() << "LV: Loop hints:" 10334 << " force=" 10335 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled 10336 ? "disabled" 10337 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled 10338 ? "enabled" 10339 : "?")) 10340 << " width=" << Hints.getWidth() 10341 << " interleave=" << Hints.getInterleave() << "\n"); 10342 10343 // Function containing loop 10344 Function *F = L->getHeader()->getParent(); 10345 10346 // Looking at the diagnostic output is the only way to determine if a loop 10347 // was vectorized (other than looking at the IR or machine code), so it 10348 // is important to generate an optimization remark for each loop. Most of 10349 // these messages are generated as OptimizationRemarkAnalysis. Remarks 10350 // generated as OptimizationRemark and OptimizationRemarkMissed are 10351 // less verbose reporting vectorized loops and unvectorized loops that may 10352 // benefit from vectorization, respectively. 10353 10354 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) { 10355 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); 10356 return false; 10357 } 10358 10359 PredicatedScalarEvolution PSE(*SE, *L); 10360 10361 // Check if it is legal to vectorize the loop. 10362 LoopVectorizationRequirements Requirements; 10363 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE, 10364 &Requirements, &Hints, DB, AC, BFI, PSI); 10365 if (!LVL.canVectorize(EnableVPlanNativePath)) { 10366 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); 10367 Hints.emitRemarkWithHints(); 10368 return false; 10369 } 10370 10371 // Check the function attributes and profiles to find out if this function 10372 // should be optimized for size. 10373 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 10374 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL); 10375 10376 // Entrance to the VPlan-native vectorization path. Outer loops are processed 10377 // here. They may require CFG and instruction level transformations before 10378 // even evaluating whether vectorization is profitable. Since we cannot modify 10379 // the incoming IR, we need to build VPlan upfront in the vectorization 10380 // pipeline. 
10381 if (!L->isInnermost()) 10382 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, 10383 ORE, BFI, PSI, Hints, Requirements); 10384 10385 assert(L->isInnermost() && "Inner loop expected."); 10386 10387 // Check the loop for a trip count threshold: vectorize loops with a tiny trip 10388 // count by optimizing for size, to minimize overheads. 10389 auto ExpectedTC = getSmallBestKnownTC(*SE, L); 10390 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) { 10391 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " 10392 << "This loop is worth vectorizing only if no scalar " 10393 << "iteration overheads are incurred."); 10394 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) 10395 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); 10396 else { 10397 LLVM_DEBUG(dbgs() << "\n"); 10398 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; 10399 } 10400 } 10401 10402 // Check the function attributes to see if implicit floats are allowed. 10403 // FIXME: This check doesn't seem possibly correct -- what if the loop is 10404 // an integer loop and the vector instructions selected are purely integer 10405 // vector instructions? 10406 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { 10407 reportVectorizationFailure( 10408 "Can't vectorize when the NoImplicitFloat attribute is used", 10409 "loop not vectorized due to NoImplicitFloat attribute", 10410 "NoImplicitFloat", ORE, L); 10411 Hints.emitRemarkWithHints(); 10412 return false; 10413 } 10414 10415 // Check if the target supports potentially unsafe FP vectorization. 10416 // FIXME: Add a check for the type of safety issue (denormal, signaling) 10417 // for the target we're vectorizing for, to make sure none of the 10418 // additional fp-math flags can help. 10419 if (Hints.isPotentiallyUnsafe() && 10420 TTI->isFPVectorizationPotentiallyUnsafe()) { 10421 reportVectorizationFailure( 10422 "Potentially unsafe FP op prevents vectorization", 10423 "loop not vectorized due to unsafe FP support.", 10424 "UnsafeFP", ORE, L); 10425 Hints.emitRemarkWithHints(); 10426 return false; 10427 } 10428 10429 bool AllowOrderedReductions; 10430 // If the flag is set, use that instead and override the TTI behaviour. 10431 if (ForceOrderedReductions.getNumOccurrences() > 0) 10432 AllowOrderedReductions = ForceOrderedReductions; 10433 else 10434 AllowOrderedReductions = TTI->enableOrderedReductions(); 10435 if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) { 10436 ORE->emit([&]() { 10437 auto *ExactFPMathInst = Requirements.getExactFPInst(); 10438 return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps", 10439 ExactFPMathInst->getDebugLoc(), 10440 ExactFPMathInst->getParent()) 10441 << "loop not vectorized: cannot prove it is safe to reorder " 10442 "floating-point operations"; 10443 }); 10444 LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to " 10445 "reorder floating-point operations\n"); 10446 Hints.emitRemarkWithHints(); 10447 return false; 10448 } 10449 10450 bool UseInterleaved = TTI->enableInterleavedAccessVectorization(); 10451 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI()); 10452 10453 // If an override option has been passed in for interleaved accesses, use it. 10454 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0) 10455 UseInterleaved = EnableInterleavedMemAccesses; 10456 10457 // Analyze interleaved memory accesses. 
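  // Illustrative example (not tied to this particular loop): loads of A[2*i]
  // and A[2*i+1] performed in the same iteration form an interleave group with
  // factor 2; the analysis below identifies such groups so they can later be
  // widened into a single wide memory access plus shuffles.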
  if (UseInterleaved) {
    IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
  }

  // Use the cost model.
  LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
                                F, &Hints, IAI);
  CM.collectValuesToIgnore();
  CM.collectElementTypesForWidening();

  // Use the planner for vectorization.
  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints,
                               Requirements, ORE);

  // Get user vectorization factor and interleave count.
  ElementCount UserVF = Hints.getWidth();
  unsigned UserIC = Hints.getInterleave();

  // Plan how to best vectorize, return the best VF and its cost.
  Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);

  VectorizationFactor VF = VectorizationFactor::Disabled();
  unsigned IC = 1;

  if (MaybeVF) {
    VF = *MaybeVF;
    // Select the interleave count.
    IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue());
  }

  // Identify the diagnostic messages that should be produced.
  std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
  bool VectorizeLoop = true, InterleaveLoop = true;
  if (VF.Width.isScalar()) {
    LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
    VecDiagMsg = std::make_pair(
        "VectorizationNotBeneficial",
        "the cost-model indicates that vectorization is not beneficial");
    VectorizeLoop = false;
  }

  if (!MaybeVF && UserIC > 1) {
    // Tell the user interleaving was avoided up-front, despite being explicitly
    // requested.
    LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
                         "interleaving should be avoided up front\n");
    IntDiagMsg = std::make_pair(
        "InterleavingAvoided",
        "Ignoring UserIC, because interleaving was avoided up front");
    InterleaveLoop = false;
  } else if (IC == 1 && UserIC <= 1) {
    // Tell the user interleaving is not beneficial.
    LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
    IntDiagMsg = std::make_pair(
        "InterleavingNotBeneficial",
        "the cost-model indicates that interleaving is not beneficial");
    InterleaveLoop = false;
    if (UserIC == 1) {
      IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
      IntDiagMsg.second +=
          " and is explicitly disabled or interleave count is set to 1";
    }
  } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it is explicitly disabled.
    LLVM_DEBUG(
        dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
    IntDiagMsg = std::make_pair(
        "InterleavingBeneficialButDisabled",
        "the cost-model indicates that interleaving is beneficial "
        "but is explicitly disabled or interleave count is set to 1");
    InterleaveLoop = false;
  }

  // Override IC if user provided an interleave count.
  IC = UserIC > 0 ? UserIC : IC;

  // Emit diagnostic messages, if any.
  const char *VAPassName = Hints.vectorizeAnalysisPassName();
  if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
10538 ORE->emit([&]() { 10539 return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first, 10540 L->getStartLoc(), L->getHeader()) 10541 << VecDiagMsg.second; 10542 }); 10543 ORE->emit([&]() { 10544 return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first, 10545 L->getStartLoc(), L->getHeader()) 10546 << IntDiagMsg.second; 10547 }); 10548 return false; 10549 } else if (!VectorizeLoop && InterleaveLoop) { 10550 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 10551 ORE->emit([&]() { 10552 return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first, 10553 L->getStartLoc(), L->getHeader()) 10554 << VecDiagMsg.second; 10555 }); 10556 } else if (VectorizeLoop && !InterleaveLoop) { 10557 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width 10558 << ") in " << DebugLocStr << '\n'); 10559 ORE->emit([&]() { 10560 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first, 10561 L->getStartLoc(), L->getHeader()) 10562 << IntDiagMsg.second; 10563 }); 10564 } else if (VectorizeLoop && InterleaveLoop) { 10565 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width 10566 << ") in " << DebugLocStr << '\n'); 10567 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 10568 } 10569 10570 bool DisableRuntimeUnroll = false; 10571 MDNode *OrigLoopID = L->getLoopID(); 10572 { 10573 // Optimistically generate runtime checks. Drop them if they turn out to not 10574 // be profitable. Limit the scope of Checks, so the cleanup happens 10575 // immediately after vector codegeneration is done. 10576 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, 10577 F->getParent()->getDataLayout()); 10578 if (!VF.Width.isScalar() || IC > 1) 10579 Checks.Create(L, *LVL.getLAI(), PSE.getPredicate()); 10580 10581 using namespace ore; 10582 if (!VectorizeLoop) { 10583 assert(IC > 1 && "interleave count should not be 1 or 0"); 10584 // If we decided that it is not legal to vectorize the loop, then 10585 // interleave it. 10586 InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, 10587 &CM, BFI, PSI, Checks); 10588 10589 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); 10590 LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT); 10591 10592 ORE->emit([&]() { 10593 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(), 10594 L->getHeader()) 10595 << "interleaved loop (interleaved count: " 10596 << NV("InterleaveCount", IC) << ")"; 10597 }); 10598 } else { 10599 // If we decided that it is *legal* to vectorize the loop, then do it. 10600 10601 // Consider vectorizing the epilogue too if it's profitable. 10602 VectorizationFactor EpilogueVF = 10603 CM.selectEpilogueVectorizationFactor(VF.Width, LVP); 10604 if (EpilogueVF.Width.isVector()) { 10605 10606 // The first pass vectorizes the main loop and creates a scalar epilogue 10607 // to be vectorized by executing the plan (potentially with a different 10608 // factor) again shortly afterwards. 10609 EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1); 10610 EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, 10611 EPI, &LVL, &CM, BFI, PSI, Checks); 10612 10613 VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF); 10614 LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV, 10615 DT); 10616 ++LoopsVectorized; 10617 10618 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */); 10619 formLCSSARecursively(*L, *DT, LI, SE); 10620 10621 // Second pass vectorizes the epilogue and adjusts the control flow 10622 // edges from the first pass. 
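        // (Illustrative note: if, say, the main loop was vectorized with VF=8
        // and the epilogue with VF=4, the epilogue vector loop set up below
        // handles most of the iterations left over by the main vector loop
        // before any remaining scalar iterations run.)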
10623 EPI.MainLoopVF = EPI.EpilogueVF; 10624 EPI.MainLoopUF = EPI.EpilogueUF; 10625 EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC, 10626 ORE, EPI, &LVL, &CM, BFI, PSI, 10627 Checks); 10628 10629 VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF); 10630 10631 // Ensure that the start values for any VPReductionPHIRecipes are 10632 // updated before vectorising the epilogue loop. 10633 VPBasicBlock *Header = BestEpiPlan.getEntry()->getEntryBasicBlock(); 10634 for (VPRecipeBase &R : Header->phis()) { 10635 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) { 10636 if (auto *Resume = MainILV.getReductionResumeValue( 10637 ReductionPhi->getRecurrenceDescriptor())) { 10638 VPValue *StartVal = new VPValue(Resume); 10639 BestEpiPlan.addExternalDef(StartVal); 10640 ReductionPhi->setOperand(0, StartVal); 10641 } 10642 } 10643 } 10644 10645 LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV, 10646 DT); 10647 ++LoopsEpilogueVectorized; 10648 10649 if (!MainILV.areSafetyChecksAdded()) 10650 DisableRuntimeUnroll = true; 10651 } else { 10652 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC, 10653 &LVL, &CM, BFI, PSI, Checks); 10654 10655 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); 10656 LVP.executePlan(VF.Width, IC, BestPlan, LB, DT); 10657 ++LoopsVectorized; 10658 10659 // Add metadata to disable runtime unrolling a scalar loop when there 10660 // are no runtime checks about strides and memory. A scalar loop that is 10661 // rarely used is not worth unrolling. 10662 if (!LB.areSafetyChecksAdded()) 10663 DisableRuntimeUnroll = true; 10664 } 10665 // Report the vectorization decision. 10666 ORE->emit([&]() { 10667 return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(), 10668 L->getHeader()) 10669 << "vectorized loop (vectorization width: " 10670 << NV("VectorizationFactor", VF.Width) 10671 << ", interleaved count: " << NV("InterleaveCount", IC) << ")"; 10672 }); 10673 } 10674 10675 if (ORE->allowExtraAnalysis(LV_NAME)) 10676 checkMixedPrecision(L, ORE); 10677 } 10678 10679 Optional<MDNode *> RemainderLoopID = 10680 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 10681 LLVMLoopVectorizeFollowupEpilogue}); 10682 if (RemainderLoopID.hasValue()) { 10683 L->setLoopID(RemainderLoopID.getValue()); 10684 } else { 10685 if (DisableRuntimeUnroll) 10686 AddRuntimeUnrollDisableMetaData(L); 10687 10688 // Mark the loop as already vectorized to avoid vectorizing again. 10689 Hints.setAlreadyVectorized(); 10690 } 10691 10692 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 10693 return true; 10694 } 10695 10696 LoopVectorizeResult LoopVectorizePass::runImpl( 10697 Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_, 10698 DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_, 10699 DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_, 10700 std::function<const LoopAccessInfo &(Loop &)> &GetLAA_, 10701 OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) { 10702 SE = &SE_; 10703 LI = &LI_; 10704 TTI = &TTI_; 10705 DT = &DT_; 10706 BFI = &BFI_; 10707 TLI = TLI_; 10708 AA = &AA_; 10709 AC = &AC_; 10710 GetLAA = &GetLAA_; 10711 DB = &DB_; 10712 ORE = &ORE_; 10713 PSI = PSI_; 10714 10715 // Don't attempt if 10716 // 1. the target claims to have no vector registers, and 10717 // 2. interleaving won't help ILP. 
  //
  // The second condition is necessary because, even if the target has no
  // vector registers, loop vectorization may still enable scalar
  // interleaving.
  if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
      TTI->getMaxInterleaveFactor(1) < 2)
    return LoopVectorizeResult(false, false);

  bool Changed = false, CFGChanged = false;

  // The vectorizer requires loops to be in simplified form.
  // Since simplification may add new inner loops, it has to run before the
  // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
  // vectorized.
  for (auto &L : *LI)
    Changed |= CFGChanged |=
        simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);

  // Build up a worklist of inner-loops to vectorize. This is necessary as
  // the act of vectorizing or partially unrolling a loop creates new loops
  // and can invalidate iterators across the loops.
  SmallVector<Loop *, 8> Worklist;

  for (Loop *L : *LI)
    collectSupportedLoops(*L, LI, ORE, Worklist);

  LoopsAnalyzed += Worklist.size();

  // Now walk the identified inner loops.
  while (!Worklist.empty()) {
    Loop *L = Worklist.pop_back_val();

    // For the inner loops we actually process, form LCSSA to simplify the
    // transform.
    Changed |= formLCSSARecursively(*L, *DT, LI, SE);

    Changed |= CFGChanged |= processLoop(L);
  }

  // Process each loop nest in the function.
  return LoopVectorizeResult(Changed, CFGChanged);
}

PreservedAnalyses LoopVectorizePass::run(Function &F,
                                         FunctionAnalysisManager &AM) {
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &LI = AM.getResult<LoopAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &AA = AM.getResult<AAManager>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);

  auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
      [&](Loop &L) -> const LoopAccessInfo & {
    LoopStandardAnalysisResults AR = {AA,  AC,  DT,      LI,      SE,
                                      TLI, TTI, nullptr, nullptr, nullptr};
    return LAM.getResult<LoopAccessAnalysis>(L, AR);
  };
  auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
  ProfileSummaryInfo *PSI =
      MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  LoopVectorizeResult Result =
      runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
  if (!Result.MadeAnyChange)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;

  // We currently do not preserve loopinfo/dominator analyses with outer loop
  // vectorization. Until this is addressed, mark these analyses as preserved
  // only for non-VPlan-native path.
  // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
  if (!EnableVPlanNativePath) {
    PA.preserve<LoopAnalysis>();
    PA.preserve<DominatorTreeAnalysis>();
  }

  if (Result.MadeCFGChange) {
    // Making CFG changes likely means a loop got vectorized. Indicate that
    // extra simplification passes should be run.
    // TODO: MadeCFGChange is not a perfect proxy. Extra passes should only
    // be run if runtime checks have been added.
    AM.getResult<ShouldRunExtraVectorPasses>(F);
    PA.preserve<ShouldRunExtraVectorPasses>();
  } else {
    PA.preserveSet<CFGAnalyses>();
  }
  return PA;
}

void LoopVectorizePass::printPipeline(
    raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
  static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
      OS, MapClassName2PassName);

  OS << "<";
  OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
  OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
  OS << ">";
}
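// Illustrative, assumed output format: with both options at their defaults,
// the pipeline element printed above would look roughly like
//   loop-vectorize<no-interleave-forced-only;no-vectorize-forced-only>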