1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops 10 // and generates target-independent LLVM-IR. 11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs 12 // of instructions in order to estimate the profitability of vectorization. 13 // 14 // The loop vectorizer combines consecutive loop iterations into a single 15 // 'wide' iteration. After this transformation the index is incremented 16 // by the SIMD vector width, and not by one. 17 // 18 // This pass has four parts: 19 // 1. The main loop pass that drives the different parts. 20 // 2. LoopVectorizationLegality - A unit that checks for the legality 21 // of the vectorization. 22 // 3. InnerLoopVectorizer - A unit that performs the actual 23 // widening of instructions. 24 // 4. LoopVectorizationCostModel - A unit that checks for the profitability 25 // of vectorization. It decides on the optimal vector width, which 26 // can be one, if vectorization is not profitable. 27 // 28 // There is a development effort going on to migrate loop vectorizer to the 29 // VPlan infrastructure and to introduce outer loop vectorization support (see 30 // docs/Proposal/VectorizationPlan.rst and 31 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this 32 // purpose, we temporarily introduced the VPlan-native vectorization path: an 33 // alternative vectorization path that is natively implemented on top of the 34 // VPlan infrastructure. See EnableVPlanNativePath for enabling.
35 // 36 //===----------------------------------------------------------------------===// 37 // 38 // The reduction-variable vectorization is based on the paper: 39 // D. Nuzman and R. Henderson. Multi-platform Auto-vectorization. 40 // 41 // Variable uniformity checks are inspired by: 42 // Karrenberg, R. and Hack, S. Whole Function Vectorization. 43 // 44 // The interleaved access vectorization is based on the paper: 45 // Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved 46 // Data for SIMD 47 // 48 // Other ideas/concepts are from: 49 // A. Zaks and D. Nuzman. Autovectorization in GCC-two years later. 50 // 51 // S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of 52 // Vectorizing Compilers. 53 // 54 //===----------------------------------------------------------------------===// 55 56 #include "llvm/Transforms/Vectorize/LoopVectorize.h" 57 #include "LoopVectorizationPlanner.h" 58 #include "VPRecipeBuilder.h" 59 #include "VPlan.h" 60 #include "VPlanHCFGBuilder.h" 61 #include "VPlanPredicator.h" 62 #include "VPlanTransforms.h" 63 #include "llvm/ADT/APInt.h" 64 #include "llvm/ADT/ArrayRef.h" 65 #include "llvm/ADT/DenseMap.h" 66 #include "llvm/ADT/DenseMapInfo.h" 67 #include "llvm/ADT/Hashing.h" 68 #include "llvm/ADT/MapVector.h" 69 #include "llvm/ADT/None.h" 70 #include "llvm/ADT/Optional.h" 71 #include "llvm/ADT/STLExtras.h" 72 #include "llvm/ADT/SmallPtrSet.h" 73 #include "llvm/ADT/SmallSet.h" 74 #include "llvm/ADT/SmallVector.h" 75 #include "llvm/ADT/Statistic.h" 76 #include "llvm/ADT/StringRef.h" 77 #include "llvm/ADT/Twine.h" 78 #include "llvm/ADT/iterator_range.h" 79 #include "llvm/Analysis/AssumptionCache.h" 80 #include "llvm/Analysis/BasicAliasAnalysis.h" 81 #include "llvm/Analysis/BlockFrequencyInfo.h" 82 #include "llvm/Analysis/CFG.h" 83 #include "llvm/Analysis/CodeMetrics.h" 84 #include "llvm/Analysis/DemandedBits.h" 85 #include "llvm/Analysis/GlobalsModRef.h" 86 #include "llvm/Analysis/LoopAccessAnalysis.h" 
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/InstructionCost.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <functional>
#include <iterator>
#include <limits>
#include <memory>
#include <string>
#include <tuple>
#include <utility>

using namespace llvm;

// Name string of this pass. DEBUG_TYPE is defined to the same string.
#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

// The string "loop-vectorize-verbose" (DEBUG_TYPE with a "-verbose" suffix).
// It is only defined when NDEBUG is not defined.
#ifndef NDEBUG
const char VerboseDebug[] = DEBUG_TYPE "-verbose";
#endif

/// @{
/// Metadata attribute names
const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
const char LLVMLoopVectorizeFollowupVectorized[] =
    "llvm.loop.vectorize.followup_vectorized";
const char LLVMLoopVectorizeFollowupEpilogue[] =
    "llvm.loop.vectorize.followup_epilogue";
/// @}

// Statistics counters declared by this file, each with its description string.
STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");

/// Hidden flag controlling vectorization of epilogue loops. Defaults to true.
static cl::opt<bool> EnableEpilogueVectorization(
    "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
    cl::desc("Enable vectorization of epilogue loops."));

/// Hidden option that forces the VF used for epilogue loops when set to a
/// value greater than 1. Defaults to 1, i.e. nothing is forced.
static cl::opt<unsigned> EpilogueVectorizationForceVF(
    "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
    cl::desc("When epilogue vectorization is enabled, and a value greater than "
             "1 is specified, forces the given VF for all applicable epilogue "
             "loops."));

/// Hidden option giving the smallest vectorization factor for which a loop is
/// considered for epilogue vectorization. Defaults to 16.
static cl::opt<unsigned> EpilogueVectorizationMinVF(
    "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
    cl::desc("Only loops with vectorization factor equal to or larger than "
             "the specified value are considered for epilogue vectorization."));

/// Loops with a known constant trip count below this number are vectorized only
/// if no scalar iteration overheads are incurred.
static cl::opt<unsigned> TinyTripCountVectorThreshold(
    "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
    cl::desc("Loops with a constant trip count that is smaller than this "
             "value are vectorized only if no scalar iteration overheads "
             "are incurred."));

/// Hidden option bounding the number of runtime memory checks allowed with a
/// vectorize(enable) pragma. Defaults to 128.
static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold(
    "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
    cl::desc("The maximum allowed number of runtime memory checks with a "
             "vectorize(enable) pragma."));

// Option prefer-predicate-over-epilogue indicates that an epilogue is undesired,
// that predication is preferred, and this lists all options. I.e., the
// vectorizer will try to fold the tail-loop (epilogue) into the vector body
// and predicate the instructions accordingly. If tail-folding fails, there are
// different fallback strategies depending on these values:
namespace PreferPredicateTy {
  enum Option {
    // Don't tail-predicate loops; create a scalar epilogue.
    ScalarEpilogue = 0,
    // Prefer tail-folding; create a scalar epilogue if tail-folding fails.
    PredicateElseScalarEpilogue,
    // Prefer tail-folding; don't attempt vectorization if tail-folding fails.
    PredicateOrDontVectorize
  };
} // namespace PreferPredicateTy

/// Hidden option selecting one of the PreferPredicateTy strategies by name.
/// Defaults to PreferPredicateTy::ScalarEpilogue.
static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
    "prefer-predicate-over-epilogue",
    cl::init(PreferPredicateTy::ScalarEpilogue),
    cl::Hidden,
    cl::desc("Tail-folding and predication preferences over creating a scalar "
             "epilogue loop."),
    cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
                         "scalar-epilogue",
                         "Don't tail-predicate loops, create scalar epilogue"),
              clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
                         "predicate-else-scalar-epilogue",
                         "prefer tail-folding, create scalar epilogue if tail "
                         "folding fails."),
              clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
                         "predicate-dont-vectorize",
                         "prefers tail-folding, don't attempt vectorization if "
                         "tail-folding fails.")));

/// Hidden flag asking VF selection to maximize bandwidth, with the VF
/// determined by the smallest type in the loop. Defaults to false.
static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in loop."));

/// Hidden flag enabling vectorization of interleaved memory accesses.
/// Defaults to false.
static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));

/// Hidden option giving the estimated constant trip count below which loops
/// are not interleaved. Defaults to 128.
static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with a estimated constant trip count "
             "below this number"));

/// Hidden override for the target's number of scalar registers. Defaults to 0.
static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

/// Hidden override for the target's number of vector registers. Defaults to 0.
static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

/// Hidden override for the target's maximum interleave factor for scalar
/// loops. Defaults to 0.
static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

/// Hidden override for the target's maximum interleave factor for vectorized
/// loops. Defaults to 0.
static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

/// Hidden override replacing the target's expected per-instruction cost with
/// a single constant value; described as mostly useful for testing.
/// Defaults to 0.
static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

/// Hidden testing-only flag that pretends scalable vectors are supported even
/// if the target does not support them. Defaults to false.
static cl::opt<bool> ForceTargetSupportsScalableVectors(
    "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
    cl::desc(
        "Pretend that scalable vectors are supported, even if the target does "
        "not support them. This flag should only be used for testing."));

/// Hidden option giving the cost of a loop that the interleaver considers
/// 'small'. Defaults to 20.
static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

/// Hidden flag enabling the use of block frequency analysis for PGO
/// heuristics. Defaults to true.
static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// Interleave small loops with scalar reductions.
static cl::opt<bool> InterleaveSmallLoopScalarReduction(
    "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
    cl::desc("Enable interleaving for loops with small iteration counts that "
             "contain scalar reductions to expose ILP."));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

/// Hidden flag: count the induction variable only once when interleaving.
/// Defaults to true.
static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

/// Hidden flag enabling if-predication of stores during vectorization.
/// Defaults to true.
static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if predication of stores during vectorization."));

/// Hidden option capping the interleave count used when interleaving a scalar
/// reduction in a nested loop. Defaults to 2.
static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

/// Hidden flag preferring in-loop vector reductions, overriding the target's
/// preference. Defaults to false.
static cl::opt<bool>
    PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
                           cl::Hidden,
                           cl::desc("Prefer in-loop vector reductions, "
                                    "overriding the targets preference."));

/// Hidden flag enabling vectorization of loops with in-order (strict) FP
/// reductions. Defaults to false.
static cl::opt<bool> ForceOrderedReductions(
    "force-ordered-reductions", cl::init(false), cl::Hidden,
    cl::desc("Enable the vectorisation of loops with in-order (strict) "
             "FP reductions"));

/// Hidden flag preferring predication of the reduction operation over a
/// select after the loop. Defaults to false.
static cl::opt<bool> PreferPredicatedReductionSelect(
    "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
    cl::desc(
        "Prefer predicating a reduction operation over an after loop select."));

/// Hidden flag enabling the VPlan-native vectorization path, which supports
/// outer loop vectorization. Defaults to false. Unlike most options here it
/// is not declared static.
cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));

// FIXME: Remove this switch once we have divergence analysis. Currently we
// assume divergent non-backedge branches when this switch is true.
351 cl::opt<bool> EnableVPlanPredication( 352 "enable-vplan-predication", cl::init(false), cl::Hidden, 353 cl::desc("Enable VPlan-native vectorization path predicator with " 354 "support for outer loop vectorization.")); 355 356 // This flag enables the stress testing of the VPlan H-CFG construction in the 357 // VPlan-native vectorization path. It must be used in conjuction with 358 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the 359 // verification of the H-CFGs built. 360 static cl::opt<bool> VPlanBuildStressTest( 361 "vplan-build-stress-test", cl::init(false), cl::Hidden, 362 cl::desc( 363 "Build VPlan for every supported loop nest in the function and bail " 364 "out right after the build (stress test the VPlan H-CFG construction " 365 "in the VPlan-native vectorization path).")); 366 367 cl::opt<bool> llvm::EnableLoopInterleaving( 368 "interleave-loops", cl::init(true), cl::Hidden, 369 cl::desc("Enable loop interleaving in Loop vectorization passes")); 370 cl::opt<bool> llvm::EnableLoopVectorization( 371 "vectorize-loops", cl::init(true), cl::Hidden, 372 cl::desc("Run the Loop vectorization passes")); 373 374 cl::opt<bool> PrintVPlansInDotFormat( 375 "vplan-print-in-dot-format", cl::init(false), cl::Hidden, 376 cl::desc("Use dot format instead of plain text when dumping VPlans")); 377 378 /// A helper function that returns true if the given type is irregular. The 379 /// type is irregular if its allocated size doesn't equal the store size of an 380 /// element of the corresponding vector type. 381 static bool hasIrregularType(Type *Ty, const DataLayout &DL) { 382 // Determine if an array of N elements of type Ty is "bitcast compatible" 383 // with a <N x Ty> vector. 384 // This is only true if there is no padding between the array elements. 385 return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty); 386 } 387 388 /// A helper function that returns the reciprocal of the block probability of 389 /// predicated blocks. 
If we return X, we are assuming the predicated block 390 /// will execute once for every X iterations of the loop header. 391 /// 392 /// TODO: We should use actual block probability here, if available. Currently, 393 /// we always assume predicated blocks have a 50% chance of executing. 394 static unsigned getReciprocalPredBlockProb() { return 2; } 395 396 /// A helper function that returns an integer or floating-point constant with 397 /// value C. 398 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) { 399 return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C) 400 : ConstantFP::get(Ty, C); 401 } 402 403 /// Returns "best known" trip count for the specified loop \p L as defined by 404 /// the following procedure: 405 /// 1) Returns exact trip count if it is known. 406 /// 2) Returns expected trip count according to profile data if any. 407 /// 3) Returns upper bound estimate if it is known. 408 /// 4) Returns None if all of the above failed. 409 static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) { 410 // Check if exact trip count is known. 411 if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L)) 412 return ExpectedTC; 413 414 // Check if there is an expected trip count available from profile data. 415 if (LoopVectorizeWithBlockFrequency) 416 if (auto EstimatedTC = getLoopEstimatedTripCount(L)) 417 return EstimatedTC; 418 419 // Check if upper bound estimate is known. 420 if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L)) 421 return ExpectedTC; 422 423 return None; 424 } 425 426 // Forward declare GeneratedRTChecks. 427 class GeneratedRTChecks; 428 429 namespace llvm { 430 431 /// InnerLoopVectorizer vectorizes loops which contain only one basic 432 /// block to a specified vectorization factor (VF). 433 /// This class performs the widening of scalars into vectors, or multiple 434 /// scalars. 
This class also implements the following features: 435 /// * It inserts an epilogue loop for handling loops that don't have iteration 436 /// counts that are known to be a multiple of the vectorization factor. 437 /// * It handles the code generation for reduction variables. 438 /// * Scalarization (implementation using scalars) of un-vectorizable 439 /// instructions. 440 /// InnerLoopVectorizer does not perform any vectorization-legality 441 /// checks, and relies on the caller to check for the different legality 442 /// aspects. The InnerLoopVectorizer relies on the 443 /// LoopVectorizationLegality class to provide information about the induction 444 /// and reduction variables that were found to a given vectorization factor. 445 class InnerLoopVectorizer { 446 public: 447 InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE, 448 LoopInfo *LI, DominatorTree *DT, 449 const TargetLibraryInfo *TLI, 450 const TargetTransformInfo *TTI, AssumptionCache *AC, 451 OptimizationRemarkEmitter *ORE, ElementCount VecWidth, 452 unsigned UnrollFactor, LoopVectorizationLegality *LVL, 453 LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, 454 ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks) 455 : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI), 456 AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor), 457 Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI), 458 PSI(PSI), RTChecks(RTChecks) { 459 // Query this against the original loop and save it here because the profile 460 // of the original loop header may change as the transformation happens. 461 OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize( 462 OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass); 463 } 464 465 virtual ~InnerLoopVectorizer() = default; 466 467 /// Create a new empty loop that will contain vectorized instructions later 468 /// on, while the old loop will be used as the scalar remainder. 
Control flow 469 /// is generated around the vectorized (and scalar epilogue) loops consisting 470 /// of various checks and bypasses. Return the pre-header block of the new 471 /// loop. 472 /// In the case of epilogue vectorization, this function is overriden to 473 /// handle the more complex control flow around the loops. 474 virtual BasicBlock *createVectorizedLoopSkeleton(); 475 476 /// Widen a single call instruction within the innermost loop. 477 void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands, 478 VPTransformState &State); 479 480 /// Widen a single select instruction within the innermost loop. 481 void widenSelectInstruction(SelectInst &I, VPValue *VPDef, VPUser &Operands, 482 bool InvariantCond, VPTransformState &State); 483 484 /// Fix the vectorized code, taking care of header phi's, live-outs, and more. 485 void fixVectorizedLoop(VPTransformState &State); 486 487 // Return true if any runtime check is added. 488 bool areSafetyChecksAdded() { return AddedSafetyChecks; } 489 490 /// A type for vectorized values in the new loop. Each value from the 491 /// original loop, when vectorized, is represented by UF vector values in the 492 /// new unrolled loop, where UF is the unroll factor. 493 using VectorParts = SmallVector<Value *, 2>; 494 495 /// Vectorize a single first-order recurrence or pointer induction PHINode in 496 /// a block. This method handles the induction variable canonicalization. It 497 /// supports both VF = 1 for unrolled loops and arbitrary length vectors. 498 void widenPHIInstruction(Instruction *PN, VPWidenPHIRecipe *PhiR, 499 VPTransformState &State); 500 501 /// A helper function to scalarize a single Instruction in the innermost loop. 502 /// Generates a sequence of scalar instances for each lane between \p MinLane 503 /// and \p MaxLane, times each part between \p MinPart and \p MaxPart, 504 /// inclusive. Uses the VPValue operands from \p RepRecipe instead of \p 505 /// Instr's operands. 
506 void scalarizeInstruction(Instruction *Instr, VPReplicateRecipe *RepRecipe, 507 const VPIteration &Instance, bool IfPredicateInstr, 508 VPTransformState &State); 509 510 /// Widen an integer or floating-point induction variable \p IV. If \p Trunc 511 /// is provided, the integer induction variable will first be truncated to 512 /// the corresponding type. 513 void widenIntOrFpInduction(PHINode *IV, Value *Start, TruncInst *Trunc, 514 VPValue *Def, VPValue *CastDef, 515 VPTransformState &State); 516 517 /// Construct the vector value of a scalarized value \p V one lane at a time. 518 void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance, 519 VPTransformState &State); 520 521 /// Try to vectorize interleaved access group \p Group with the base address 522 /// given in \p Addr, optionally masking the vector operations if \p 523 /// BlockInMask is non-null. Use \p State to translate given VPValues to IR 524 /// values in the vectorized loop. 525 void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group, 526 ArrayRef<VPValue *> VPDefs, 527 VPTransformState &State, VPValue *Addr, 528 ArrayRef<VPValue *> StoredValues, 529 VPValue *BlockInMask = nullptr); 530 531 /// Vectorize Load and Store instructions with the base address given in \p 532 /// Addr, optionally masking the vector operations if \p BlockInMask is 533 /// non-null. Use \p State to translate given VPValues to IR values in the 534 /// vectorized loop. 535 void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State, 536 VPValue *Def, VPValue *Addr, 537 VPValue *StoredValue, VPValue *BlockInMask, 538 bool ConsecutiveStride, bool Reverse); 539 540 /// Set the debug location in the builder \p Ptr using the debug location in 541 /// \p V. If \p Ptr is None then it uses the class member's Builder. 542 void setDebugLocFromInst(const Value *V, 543 Optional<IRBuilder<> *> CustomBuilder = None); 544 545 /// Fix the non-induction PHIs in the OrigPHIsToFix vector. 
546 void fixNonInductionPHIs(VPTransformState &State); 547 548 /// Returns true if the reordering of FP operations is not allowed, but we are 549 /// able to vectorize with strict in-order reductions for the given RdxDesc. 550 bool useOrderedReductions(RecurrenceDescriptor &RdxDesc); 551 552 /// Create a broadcast instruction. This method generates a broadcast 553 /// instruction (shuffle) for loop invariant values and for the induction 554 /// value. If this is the induction variable then we extend it to N, N+1, ... 555 /// this is needed because each iteration in the loop corresponds to a SIMD 556 /// element. 557 virtual Value *getBroadcastInstrs(Value *V); 558 559 /// Add metadata from one instruction to another. 560 /// 561 /// This includes both the original MDs from \p From and additional ones (\see 562 /// addNewMetadata). Use this for *newly created* instructions in the vector 563 /// loop. 564 void addMetadata(Instruction *To, Instruction *From); 565 566 /// Similar to the previous function but it adds the metadata to a 567 /// vector of instructions. 568 void addMetadata(ArrayRef<Value *> To, Instruction *From); 569 570 protected: 571 friend class LoopVectorizationPlanner; 572 573 /// A small list of PHINodes. 574 using PhiVector = SmallVector<PHINode *, 4>; 575 576 /// A type for scalarized values in the new loop. Each value from the 577 /// original loop, when scalarized, is represented by UF x VF scalar values 578 /// in the new unrolled loop, where UF is the unroll factor and VF is the 579 /// vectorization factor. 580 using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>; 581 582 /// Set up the values of the IVs correctly when exiting the vector loop. 583 void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II, 584 Value *CountRoundDown, Value *EndValue, 585 BasicBlock *MiddleBlock); 586 587 /// Create a new induction variable inside L. 
588 PHINode *createInductionVariable(Loop *L, Value *Start, Value *End, 589 Value *Step, Instruction *DL); 590 591 /// Handle all cross-iteration phis in the header. 592 void fixCrossIterationPHIs(VPTransformState &State); 593 594 /// Create the exit value of first order recurrences in the middle block and 595 /// update their users. 596 void fixFirstOrderRecurrence(VPWidenPHIRecipe *PhiR, VPTransformState &State); 597 598 /// Create code for the loop exit value of the reduction. 599 void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State); 600 601 /// Clear NSW/NUW flags from reduction instructions if necessary. 602 void clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc, 603 VPTransformState &State); 604 605 /// Fixup the LCSSA phi nodes in the unique exit block. This simply 606 /// means we need to add the appropriate incoming value from the middle 607 /// block as exiting edges from the scalar epilogue loop (if present) are 608 /// already in place, and we exit the vector loop exclusively to the middle 609 /// block. 610 void fixLCSSAPHIs(VPTransformState &State); 611 612 /// Iteratively sink the scalarized operands of a predicated instruction into 613 /// the block that was created for it. 614 void sinkScalarOperands(Instruction *PredInst); 615 616 /// Shrinks vector element sizes to the smallest bitwidth they can be legally 617 /// represented as. 618 void truncateToMinimalBitwidths(VPTransformState &State); 619 620 /// This function adds 621 /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...) 622 /// to each vector element of Val. The sequence starts at StartIndex. 623 /// \p Opcode is relevant for FP induction variable. 624 virtual Value * 625 getStepVector(Value *Val, Value *StartIdx, Value *Step, 626 Instruction::BinaryOps Opcode = Instruction::BinaryOpsEnd); 627 628 /// Compute scalar induction steps. 
\p ScalarIV is the scalar induction 629 /// variable on which to base the steps, \p Step is the size of the step, and 630 /// \p EntryVal is the value from the original loop that maps to the steps. 631 /// Note that \p EntryVal doesn't have to be an induction variable - it 632 /// can also be a truncate instruction. 633 void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal, 634 const InductionDescriptor &ID, VPValue *Def, 635 VPValue *CastDef, VPTransformState &State); 636 637 /// Create a vector induction phi node based on an existing scalar one. \p 638 /// EntryVal is the value from the original loop that maps to the vector phi 639 /// node, and \p Step is the loop-invariant step. If \p EntryVal is a 640 /// truncate instruction, instead of widening the original IV, we widen a 641 /// version of the IV truncated to \p EntryVal's type. 642 void createVectorIntOrFpInductionPHI(const InductionDescriptor &II, 643 Value *Step, Value *Start, 644 Instruction *EntryVal, VPValue *Def, 645 VPValue *CastDef, 646 VPTransformState &State); 647 648 /// Returns true if an instruction \p I should be scalarized instead of 649 /// vectorized for the chosen vectorization factor. 650 bool shouldScalarizeInstruction(Instruction *I) const; 651 652 /// Returns true if we should generate a scalar version of \p IV. 653 bool needsScalarInduction(Instruction *IV) const; 654 655 /// If there is a cast involved in the induction variable \p ID, which should 656 /// be ignored in the vectorized loop body, this function records the 657 /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the 658 /// cast. We had already proved that the casted Phi is equal to the uncasted 659 /// Phi in the vectorized loop (under a runtime guard), and therefore 660 /// there is no need to vectorize the cast - the same value can be used in the 661 /// vector loop for both the Phi and the cast. 
662 /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified, 663 /// Otherwise, \p VectorLoopValue is a widened/vectorized value. 664 /// 665 /// \p EntryVal is the value from the original loop that maps to the vector 666 /// phi node and is used to distinguish what is the IV currently being 667 /// processed - original one (if \p EntryVal is a phi corresponding to the 668 /// original IV) or the "newly-created" one based on the proof mentioned above 669 /// (see also buildScalarSteps() and createVectorIntOrFPInductionPHI()). In the 670 /// latter case \p EntryVal is a TruncInst and we must not record anything for 671 /// that IV, but it's error-prone to expect callers of this routine to care 672 /// about that, hence this explicit parameter. 673 void recordVectorLoopValueForInductionCast( 674 const InductionDescriptor &ID, const Instruction *EntryVal, 675 Value *VectorLoopValue, VPValue *CastDef, VPTransformState &State, 676 unsigned Part, unsigned Lane = UINT_MAX); 677 678 /// Generate a shuffle sequence that will reverse the vector Vec. 679 virtual Value *reverseVector(Value *Vec); 680 681 /// Returns (and creates if needed) the original loop trip count. 682 Value *getOrCreateTripCount(Loop *NewLoop); 683 684 /// Returns (and creates if needed) the trip count of the widened loop. 685 Value *getOrCreateVectorTripCount(Loop *NewLoop); 686 687 /// Returns a bitcasted value to the requested vector type. 688 /// Also handles bitcasts of vector<float> <-> vector<pointer> types. 689 Value *createBitOrPointerCast(Value *V, VectorType *DstVTy, 690 const DataLayout &DL); 691 692 /// Emit a bypass check to see if the vector trip count is zero, including if 693 /// it overflows. 694 void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass); 695 696 /// Emit a bypass check to see if all of the SCEV assumptions we've 697 /// had to make are correct. Returns the block containing the checks or 698 /// nullptr if no checks have been added. 
699 BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass); 700 701 /// Emit bypass checks to check any memory assumptions we may have made. 702 /// Returns the block containing the checks or nullptr if no checks have been 703 /// added. 704 BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass); 705 706 /// Compute the transformed value of Index at offset StartValue using step 707 /// StepValue. 708 /// For integer induction, returns StartValue + Index * StepValue. 709 /// For pointer induction, returns StartValue[Index * StepValue]. 710 /// FIXME: The newly created binary instructions should contain nsw/nuw 711 /// flags, which can be found from the original scalar operations. 712 Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE, 713 const DataLayout &DL, 714 const InductionDescriptor &ID) const; 715 716 /// Emit basic blocks (prefixed with \p Prefix) for the iteration check, 717 /// vector loop preheader, middle block and scalar preheader. Also 718 /// allocate a loop object for the new vector loop and return it. 719 Loop *createVectorLoopSkeleton(StringRef Prefix); 720 721 /// Create new phi nodes for the induction variables to resume iteration count 722 /// in the scalar epilogue, from where the vectorized loop left off (given by 723 /// \p VectorTripCount). 724 /// In cases where the loop skeleton is more complicated (eg. epilogue 725 /// vectorization) and the resume values can come from an additional bypass 726 /// block, the \p AdditionalBypass pair provides information about the bypass 727 /// block and the end value on the edge from bypass to this loop. 728 void createInductionResumeValues( 729 Loop *L, Value *VectorTripCount, 730 std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr}); 731 732 /// Complete the loop skeleton by adding debug MDs, creating appropriate 733 /// conditional branches in the middle block, preparing the builder and 734 /// running the verifier. 
Take in the vector loop \p L as argument, and return 735 /// the preheader of the completed vector loop. 736 BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID); 737 738 /// Add additional metadata to \p To that was not present on \p Orig. 739 /// 740 /// Currently this is used to add the noalias annotations based on the 741 /// inserted memchecks. Use this for instructions that are *cloned* into the 742 /// vector loop. 743 void addNewMetadata(Instruction *To, const Instruction *Orig); 744 745 /// Collect poison-generating recipes that may generate a poison value that is 746 /// used after vectorization, even when their operands are not poison. Those 747 /// recipes meet the following conditions: 748 /// * Contribute to the address computation of a recipe generating a widen 749 /// memory load/store (VPWidenMemoryInstructionRecipe or 750 /// VPInterleaveRecipe). 751 /// * Such a widen memory load/store has at least one underlying Instruction 752 /// that is in a basic block that needs predication and after vectorization 753 /// the generated instruction won't be predicated. 754 void collectPoisonGeneratingRecipes(VPTransformState &State); 755 756 /// Allow subclasses to override and print debug traces before/after vplan 757 /// execution, when trace information is requested. 758 virtual void printDebugTracesAtStart(){}; 759 virtual void printDebugTracesAtEnd(){}; 760 761 /// The original loop. 762 Loop *OrigLoop; 763 764 /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies 765 /// dynamic knowledge to simplify SCEV expressions and converts them to a 766 /// more usable form. 767 PredicatedScalarEvolution &PSE; 768 769 /// Loop Info. 770 LoopInfo *LI; 771 772 /// Dominator Tree. 773 DominatorTree *DT; 774 775 /// Alias Analysis. 776 AAResults *AA; 777 778 /// Target Library Info. 779 const TargetLibraryInfo *TLI; 780 781 /// Target Transform Info. 782 const TargetTransformInfo *TTI; 783 784 /// Assumption Cache. 
785 AssumptionCache *AC; 786 787 /// Interface to emit optimization remarks. 788 OptimizationRemarkEmitter *ORE; 789 790 /// LoopVersioning. It's only set up (non-null) if memchecks were 791 /// used. 792 /// 793 /// This is currently only used to add no-alias metadata based on the 794 /// memchecks. The actual versioning is performed manually. 795 std::unique_ptr<LoopVersioning> LVer; 796 797 /// The vectorization SIMD factor to use. Each vector will have this many 798 /// vector elements. 799 ElementCount VF; 800 801 /// The vectorization unroll factor to use. Each scalar is vectorized to this 802 /// many different vector instructions. 803 unsigned UF; 804 805 /// The builder that we use 806 IRBuilder<> Builder; 807 808 // --- Vectorization state --- 809 810 /// The vector-loop preheader. 811 BasicBlock *LoopVectorPreHeader; 812 813 /// The scalar-loop preheader. 814 BasicBlock *LoopScalarPreHeader; 815 816 /// Middle Block between the vector and the scalar. 817 BasicBlock *LoopMiddleBlock; 818 819 /// The unique ExitBlock of the scalar loop if one exists. Note that 820 /// there can be multiple exiting edges reaching this block. 821 BasicBlock *LoopExitBlock; 822 823 /// The vector loop body. 824 BasicBlock *LoopVectorBody; 825 826 /// The scalar loop body. 827 BasicBlock *LoopScalarBody; 828 829 /// A list of all bypass blocks. The first block is the entry of the loop. 830 SmallVector<BasicBlock *, 4> LoopBypassBlocks; 831 832 /// The new Induction variable which was added to the new block. 833 PHINode *Induction = nullptr; 834 835 /// The induction variable of the old basic block. 836 PHINode *OldInduction = nullptr; 837 838 /// Store instructions that were predicated. 839 SmallVector<Instruction *, 4> PredicatedInstructions; 840 841 /// Trip count of the original loop. 842 Value *TripCount = nullptr; 843 844 /// Trip count of the widened loop (TripCount - TripCount % (VF*UF)) 845 Value *VectorTripCount = nullptr; 846 847 /// The legality analysis.
848 LoopVectorizationLegality *Legal; 849 850 /// The profitability analysis. 851 LoopVectorizationCostModel *Cost; 852 853 // Record whether runtime checks are added. 854 bool AddedSafetyChecks = false; 855 856 // Holds the end values for each induction variable. We save the end values 857 // so we can later fix-up the external users of the induction variables. 858 DenseMap<PHINode *, Value *> IVEndValues; 859 860 // Vector of original scalar PHIs whose corresponding widened PHIs need to be 861 // fixed up at the end of vector code generation. 862 SmallVector<PHINode *, 8> OrigPHIsToFix; 863 864 /// BFI and PSI are used to check for profile guided size optimizations. 865 BlockFrequencyInfo *BFI; 866 ProfileSummaryInfo *PSI; 867 868 // Whether this loop should be optimized for size based on profile guided size 869 // optimizations. 870 bool OptForSizeBasedOnProfile; 871 872 /// Structure to hold information about generated runtime checks, responsible 873 /// for cleaning the checks, if vectorization turns out unprofitable.
874 GeneratedRTChecks &RTChecks; 875 }; 876 877 class InnerLoopUnroller : public InnerLoopVectorizer { 878 public: 879 InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE, 880 LoopInfo *LI, DominatorTree *DT, 881 const TargetLibraryInfo *TLI, 882 const TargetTransformInfo *TTI, AssumptionCache *AC, 883 OptimizationRemarkEmitter *ORE, unsigned UnrollFactor, 884 LoopVectorizationLegality *LVL, 885 LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, 886 ProfileSummaryInfo *PSI, GeneratedRTChecks &Check) 887 : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 888 ElementCount::getFixed(1), UnrollFactor, LVL, CM, 889 BFI, PSI, Check) {} 890 891 private: 892 Value *getBroadcastInstrs(Value *V) override; 893 Value *getStepVector( 894 Value *Val, Value *StartIdx, Value *Step, 895 Instruction::BinaryOps Opcode = Instruction::BinaryOpsEnd) override; 896 Value *reverseVector(Value *Vec) override; 897 }; 898 899 /// Encapsulate information regarding vectorization of a loop and its epilogue. 900 /// This information is meant to be updated and used across two stages of 901 /// epilogue vectorization. 
902 struct EpilogueLoopVectorizationInfo { 903 ElementCount MainLoopVF = ElementCount::getFixed(0); 904 unsigned MainLoopUF = 0; 905 ElementCount EpilogueVF = ElementCount::getFixed(0); 906 unsigned EpilogueUF = 0; 907 BasicBlock *MainLoopIterationCountCheck = nullptr; 908 BasicBlock *EpilogueIterationCountCheck = nullptr; 909 BasicBlock *SCEVSafetyCheck = nullptr; 910 BasicBlock *MemSafetyCheck = nullptr; 911 Value *TripCount = nullptr; 912 Value *VectorTripCount = nullptr; 913 914 EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF, 915 ElementCount EVF, unsigned EUF) 916 : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) { 917 assert(EUF == 1 && 918 "A high UF for the epilogue loop is likely not beneficial."); 919 } 920 }; 921 922 /// An extension of the inner loop vectorizer that creates a skeleton for a 923 /// vectorized loop that has its epilogue (residual) also vectorized. 924 /// The idea is to run the vplan on a given loop twice, firstly to setup the 925 /// skeleton and vectorize the main loop, and secondly to complete the skeleton 926 /// from the first step and vectorize the epilogue. This is achieved by 927 /// deriving two concrete strategy classes from this base class and invoking 928 /// them in succession from the loop vectorizer planner. 
929 class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer { 930 public: 931 InnerLoopAndEpilogueVectorizer( 932 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, 933 DominatorTree *DT, const TargetLibraryInfo *TLI, 934 const TargetTransformInfo *TTI, AssumptionCache *AC, 935 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, 936 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, 937 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, 938 GeneratedRTChecks &Checks) 939 : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 940 EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI, 941 Checks), 942 EPI(EPI) {} 943 944 // Override this function to handle the more complex control flow around the 945 // three loops. 946 BasicBlock *createVectorizedLoopSkeleton() final override { 947 return createEpilogueVectorizedLoopSkeleton(); 948 } 949 950 /// The interface for creating a vectorized skeleton using one of two 951 /// different strategies, each corresponding to one execution of the vplan 952 /// as described above. 953 virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0; 954 955 /// Holds and updates state information required to vectorize the main loop 956 /// and its epilogue in two separate passes. This setup helps us avoid 957 /// regenerating and recomputing runtime safety checks. It also helps us to 958 /// shorten the iteration-count-check path length for the cases where the 959 /// iteration count of the loop is so small that the main vector loop is 960 /// completely skipped. 961 EpilogueLoopVectorizationInfo &EPI; 962 }; 963 964 /// A specialized derived class of inner loop vectorizer that performs 965 /// vectorization of *main* loops in the process of vectorizing loops and their 966 /// epilogues. 
967 class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer { 968 public: 969 EpilogueVectorizerMainLoop( 970 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, 971 DominatorTree *DT, const TargetLibraryInfo *TLI, 972 const TargetTransformInfo *TTI, AssumptionCache *AC, 973 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, 974 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, 975 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, 976 GeneratedRTChecks &Check) 977 : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 978 EPI, LVL, CM, BFI, PSI, Check) {} 979 /// Implements the interface for creating a vectorized skeleton using the 980 /// *main loop* strategy (ie the first pass of vplan execution). 981 BasicBlock *createEpilogueVectorizedLoopSkeleton() final override; 982 983 protected: 984 /// Emits an iteration count bypass check once for the main loop (when \p 985 /// ForEpilogue is false) and once for the epilogue loop (when \p 986 /// ForEpilogue is true). 987 BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass, 988 bool ForEpilogue); 989 void printDebugTracesAtStart() override; 990 void printDebugTracesAtEnd() override; 991 }; 992 993 // A specialized derived class of inner loop vectorizer that performs 994 // vectorization of *epilogue* loops in the process of vectorizing loops and 995 // their epilogues. 
996 class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer { 997 public: 998 EpilogueVectorizerEpilogueLoop( 999 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, 1000 DominatorTree *DT, const TargetLibraryInfo *TLI, 1001 const TargetTransformInfo *TTI, AssumptionCache *AC, 1002 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, 1003 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, 1004 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, 1005 GeneratedRTChecks &Checks) 1006 : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1007 EPI, LVL, CM, BFI, PSI, Checks) {} 1008 /// Implements the interface for creating a vectorized skeleton using the 1009 /// *epilogue loop* strategy (ie the second pass of vplan execution). 1010 BasicBlock *createEpilogueVectorizedLoopSkeleton() final override; 1011 1012 protected: 1013 /// Emits an iteration count bypass check after the main vector loop has 1014 /// finished to see if there are any iterations left to execute by either 1015 /// the vector epilogue or the scalar epilogue. 1016 BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L, 1017 BasicBlock *Bypass, 1018 BasicBlock *Insert); 1019 void printDebugTracesAtStart() override; 1020 void printDebugTracesAtEnd() override; 1021 }; 1022 } // end namespace llvm 1023 1024 /// Look for a meaningful debug location on the instruction or it's 1025 /// operands. 
1026 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) { 1027 if (!I) 1028 return I; 1029 1030 DebugLoc Empty; 1031 if (I->getDebugLoc() != Empty) 1032 return I; 1033 1034 for (Use &Op : I->operands()) { 1035 if (Instruction *OpInst = dyn_cast<Instruction>(Op)) 1036 if (OpInst->getDebugLoc() != Empty) 1037 return OpInst; 1038 } 1039 1040 return I; 1041 } 1042 1043 void InnerLoopVectorizer::setDebugLocFromInst( 1044 const Value *V, Optional<IRBuilder<> *> CustomBuilder) { 1045 IRBuilder<> *B = (CustomBuilder == None) ? &Builder : *CustomBuilder; 1046 if (const Instruction *Inst = dyn_cast_or_null<Instruction>(V)) { 1047 const DILocation *DIL = Inst->getDebugLoc(); 1048 1049 // When a FSDiscriminator is enabled, we don't need to add the multiply 1050 // factors to the discriminators. 1051 if (DIL && Inst->getFunction()->isDebugInfoForProfiling() && 1052 !isa<DbgInfoIntrinsic>(Inst) && !EnableFSDiscriminator) { 1053 // FIXME: For scalable vectors, assume vscale=1. 1054 auto NewDIL = 1055 DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue()); 1056 if (NewDIL) 1057 B->SetCurrentDebugLocation(NewDIL.getValue()); 1058 else 1059 LLVM_DEBUG(dbgs() 1060 << "Failed to create new discriminator: " 1061 << DIL->getFilename() << " Line: " << DIL->getLine()); 1062 } else 1063 B->SetCurrentDebugLocation(DIL); 1064 } else 1065 B->SetCurrentDebugLocation(DebugLoc()); 1066 } 1067 1068 /// Write a \p DebugMsg about vectorization to the debug output stream. If \p I 1069 /// is passed, the message relates to that particular instruction. 
1070 #ifndef NDEBUG 1071 static void debugVectorizationMessage(const StringRef Prefix, 1072 const StringRef DebugMsg, 1073 Instruction *I) { 1074 dbgs() << "LV: " << Prefix << DebugMsg; 1075 if (I != nullptr) 1076 dbgs() << " " << *I; 1077 else 1078 dbgs() << '.'; 1079 dbgs() << '\n'; 1080 } 1081 #endif 1082 1083 /// Create an analysis remark that explains why vectorization failed 1084 /// 1085 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p 1086 /// RemarkName is the identifier for the remark. If \p I is passed it is an 1087 /// instruction that prevents vectorization. Otherwise \p TheLoop is used for 1088 /// the location of the remark. \return the remark object that can be 1089 /// streamed to. 1090 static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName, 1091 StringRef RemarkName, Loop *TheLoop, Instruction *I) { 1092 Value *CodeRegion = TheLoop->getHeader(); 1093 DebugLoc DL = TheLoop->getStartLoc(); 1094 1095 if (I) { 1096 CodeRegion = I->getParent(); 1097 // If there is no debug location attached to the instruction, revert back to 1098 // using the loop's. 1099 if (I->getDebugLoc()) 1100 DL = I->getDebugLoc(); 1101 } 1102 1103 return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion); 1104 } 1105 1106 /// Return a value for Step multiplied by VF. 1107 static Value *createStepForVF(IRBuilder<> &B, Type *Ty, ElementCount VF, 1108 int64_t Step) { 1109 assert(Ty->isIntegerTy() && "Expected an integer step"); 1110 Constant *StepVal = ConstantInt::get(Ty, Step * VF.getKnownMinValue()); 1111 return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal; 1112 } 1113 1114 namespace llvm { 1115 1116 /// Return the runtime value for VF. 1117 Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF) { 1118 Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue()); 1119 return VF.isScalable() ? 
B.CreateVScale(EC) : EC; 1120 } 1121 1122 static Value *getRuntimeVFAsFloat(IRBuilder<> &B, Type *FTy, ElementCount VF) { 1123 assert(FTy->isFloatingPointTy() && "Expected floating point type!"); 1124 Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits()); 1125 Value *RuntimeVF = getRuntimeVF(B, IntTy, VF); 1126 return B.CreateUIToFP(RuntimeVF, FTy); 1127 } 1128 1129 void reportVectorizationFailure(const StringRef DebugMsg, 1130 const StringRef OREMsg, const StringRef ORETag, 1131 OptimizationRemarkEmitter *ORE, Loop *TheLoop, 1132 Instruction *I) { 1133 LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I)); 1134 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE); 1135 ORE->emit( 1136 createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I) 1137 << "loop not vectorized: " << OREMsg); 1138 } 1139 1140 void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag, 1141 OptimizationRemarkEmitter *ORE, Loop *TheLoop, 1142 Instruction *I) { 1143 LLVM_DEBUG(debugVectorizationMessage("", Msg, I)); 1144 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE); 1145 ORE->emit( 1146 createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I) 1147 << Msg); 1148 } 1149 1150 } // end namespace llvm 1151 1152 #ifndef NDEBUG 1153 /// \return string containing a file name and a line # for the given loop. 1154 static std::string getDebugLocString(const Loop *L) { 1155 std::string Result; 1156 if (L) { 1157 raw_string_ostream OS(Result); 1158 if (const DebugLoc LoopDbgLoc = L->getStartLoc()) 1159 LoopDbgLoc.print(OS); 1160 else 1161 // Just print the module name. 
1162 OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier(); 1163 OS.flush(); 1164 } 1165 return Result; 1166 } 1167 #endif 1168 1169 void InnerLoopVectorizer::addNewMetadata(Instruction *To, 1170 const Instruction *Orig) { 1171 // If the loop was versioned with memchecks, add the corresponding no-alias 1172 // metadata. 1173 if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig))) 1174 LVer->annotateInstWithNoAlias(To, Orig); 1175 } 1176 1177 void InnerLoopVectorizer::collectPoisonGeneratingRecipes( 1178 VPTransformState &State) { 1179 1180 // Collect recipes in the backward slice of `Root` that may generate a poison 1181 // value that is used after vectorization. 1182 SmallPtrSet<VPRecipeBase *, 16> Visited; 1183 auto collectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) { 1184 SmallVector<VPRecipeBase *, 16> Worklist; 1185 Worklist.push_back(Root); 1186 1187 // Traverse the backward slice of Root through its use-def chain. 1188 while (!Worklist.empty()) { 1189 VPRecipeBase *CurRec = Worklist.back(); 1190 Worklist.pop_back(); 1191 1192 if (!Visited.insert(CurRec).second) 1193 continue; 1194 1195 // Prune search if we find another recipe generating a widen memory 1196 // instruction. Widen memory instructions involved in address computation 1197 // will lead to gather/scatter instructions, which don't need to be 1198 // handled. 1199 if (isa<VPWidenMemoryInstructionRecipe>(CurRec) || 1200 isa<VPInterleaveRecipe>(CurRec)) 1201 continue; 1202 1203 // This recipe contributes to the address computation of a widen 1204 // load/store. Collect recipe if its underlying instruction has 1205 // poison-generating flags. 1206 Instruction *Instr = CurRec->getUnderlyingInstr(); 1207 if (Instr && cast<Operator>(Instr)->hasPoisonGeneratingFlags()) 1208 State.MayGeneratePoisonRecipes.insert(CurRec); 1209 1210 // Add new definitions to the worklist. 
1211 for (VPValue *operand : CurRec->operands()) 1212 if (VPDef *OpDef = operand->getDef()) 1213 Worklist.push_back(cast<VPRecipeBase>(OpDef)); 1214 } 1215 }); 1216 1217 // Traverse all the recipes in the VPlan and collect the poison-generating 1218 // recipes in the backward slice starting at the address of a VPWidenRecipe or 1219 // VPInterleaveRecipe. 1220 auto Iter = depth_first( 1221 VPBlockRecursiveTraversalWrapper<VPBlockBase *>(State.Plan->getEntry())); 1222 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) { 1223 for (VPRecipeBase &Recipe : *VPBB) { 1224 if (auto *WidenRec = dyn_cast<VPWidenMemoryInstructionRecipe>(&Recipe)) { 1225 Instruction *UnderlyingInstr = WidenRec->getUnderlyingInstr(); 1226 VPDef *AddrDef = WidenRec->getAddr()->getDef(); 1227 if (AddrDef && WidenRec->isConsecutive() && UnderlyingInstr && 1228 Legal->blockNeedsPredication(UnderlyingInstr->getParent())) 1229 collectPoisonGeneratingInstrsInBackwardSlice( 1230 cast<VPRecipeBase>(AddrDef)); 1231 } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) { 1232 VPDef *AddrDef = InterleaveRec->getAddr()->getDef(); 1233 if (AddrDef) { 1234 // Check if any member of the interleave group needs predication. 
1235 const InterleaveGroup<Instruction> *InterGroup = 1236 InterleaveRec->getInterleaveGroup(); 1237 bool NeedPredication = false; 1238 for (int I = 0, NumMembers = InterGroup->getNumMembers(); 1239 I < NumMembers; ++I) { 1240 Instruction *Member = InterGroup->getMember(I); 1241 if (Member) 1242 NeedPredication |= 1243 Legal->blockNeedsPredication(Member->getParent()); 1244 } 1245 1246 if (NeedPredication) 1247 collectPoisonGeneratingInstrsInBackwardSlice( 1248 cast<VPRecipeBase>(AddrDef)); 1249 } 1250 } 1251 } 1252 } 1253 } 1254 1255 void InnerLoopVectorizer::addMetadata(Instruction *To, 1256 Instruction *From) { 1257 propagateMetadata(To, From); 1258 addNewMetadata(To, From); 1259 } 1260 1261 void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To, 1262 Instruction *From) { 1263 for (Value *V : To) { 1264 if (Instruction *I = dyn_cast<Instruction>(V)) 1265 addMetadata(I, From); 1266 } 1267 } 1268 1269 namespace llvm { 1270 1271 // Loop vectorization cost-model hints how the scalar epilogue loop should be 1272 // lowered. 1273 enum ScalarEpilogueLowering { 1274 1275 // The default: allowing scalar epilogues. 1276 CM_ScalarEpilogueAllowed, 1277 1278 // Vectorization with OptForSize: don't allow epilogues. 1279 CM_ScalarEpilogueNotAllowedOptSize, 1280 1281 // A special case of vectorisation with OptForSize: loops with a very small 1282 // trip count are considered for vectorization under OptForSize, thereby 1283 // making sure the cost of their loop body is dominant, free of runtime 1284 // guards and scalar iteration overheads. 1285 CM_ScalarEpilogueNotAllowedLowTripLoop, 1286 1287 // Loop hint predicate indicating an epilogue is undesired. 1288 CM_ScalarEpilogueNotNeededUsePredicate, 1289 1290 // Directive indicating we must either tail fold or not vectorize 1291 CM_ScalarEpilogueNotAllowedUsePredicate 1292 }; 1293 1294 /// ElementCountComparator creates a total ordering for ElementCount 1295 /// for the purposes of using it in a set structure. 
1296 struct ElementCountComparator { 1297 bool operator()(const ElementCount &LHS, const ElementCount &RHS) const { 1298 return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) < 1299 std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue()); 1300 } 1301 }; 1302 using ElementCountSet = SmallSet<ElementCount, 16, ElementCountComparator>; 1303 1304 /// LoopVectorizationCostModel - estimates the expected speedups due to 1305 /// vectorization. 1306 /// In many cases vectorization is not profitable. This can happen because of 1307 /// a number of reasons. In this class we mainly attempt to predict the 1308 /// expected speedup/slowdowns due to the supported instruction set. We use the 1309 /// TargetTransformInfo to query the different backends for the cost of 1310 /// different operations. 1311 class LoopVectorizationCostModel { 1312 public: 1313 LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L, 1314 PredicatedScalarEvolution &PSE, LoopInfo *LI, 1315 LoopVectorizationLegality *Legal, 1316 const TargetTransformInfo &TTI, 1317 const TargetLibraryInfo *TLI, DemandedBits *DB, 1318 AssumptionCache *AC, 1319 OptimizationRemarkEmitter *ORE, const Function *F, 1320 const LoopVectorizeHints *Hints, 1321 InterleavedAccessInfo &IAI) 1322 : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), 1323 TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F), 1324 Hints(Hints), InterleaveInfo(IAI) {} 1325 1326 /// \return An upper bound for the vectorization factors (both fixed and 1327 /// scalable). If the factors are 0, vectorization and interleaving should be 1328 /// avoided up front. 1329 FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC); 1330 1331 /// \return True if runtime checks are required for vectorization, and false 1332 /// otherwise. 1333 bool runtimeChecksRequired(); 1334 1335 /// \return The most profitable vectorization factor and the cost of that VF. 
1336 /// This method checks every VF in \p CandidateVFs. If UserVF is not ZERO 1337 /// then this vectorization factor will be selected if vectorization is 1338 /// possible. 1339 VectorizationFactor 1340 selectVectorizationFactor(const ElementCountSet &CandidateVFs); 1341 1342 VectorizationFactor 1343 selectEpilogueVectorizationFactor(const ElementCount MaxVF, 1344 const LoopVectorizationPlanner &LVP); 1345 1346 /// Setup cost-based decisions for user vectorization factor. 1347 /// \return true if the UserVF is a feasible VF to be chosen. 1348 bool selectUserVectorizationFactor(ElementCount UserVF) { 1349 collectUniformsAndScalars(UserVF); 1350 collectInstsToScalarize(UserVF); 1351 return expectedCost(UserVF).first.isValid(); 1352 } 1353 1354 /// \return The size (in bits) of the smallest and widest types in the code 1355 /// that needs to be vectorized. We ignore values that remain scalar such as 1356 /// 64 bit loop indices. 1357 std::pair<unsigned, unsigned> getSmallestAndWidestTypes(); 1358 1359 /// \return The desired interleave count. 1360 /// If interleave count has been specified by metadata it will be returned. 1361 /// Otherwise, the interleave count is computed and returned. VF and LoopCost 1362 /// are the selected vectorization factor and the cost of the selected VF. 1363 unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost); 1364 1365 /// Memory access instruction may be vectorized in more than one way. 1366 /// Form of instruction after vectorization depends on cost. 1367 /// This function takes cost-based decisions for Load/Store instructions 1368 /// and collects them in a map. This decisions map is used for building 1369 /// the lists of loop-uniform and loop-scalar instructions. 1370 /// The calculated cost is saved with widening decision in order to 1371 /// avoid redundant calculations. 
1372 void setCostBasedWideningDecision(ElementCount VF); 1373 1374 /// A struct that represents some properties of the register usage 1375 /// of a loop. 1376 struct RegisterUsage { 1377 /// Holds the number of loop invariant values that are used in the loop. 1378 /// The key is ClassID of target-provided register class. 1379 SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs; 1380 /// Holds the maximum number of concurrent live intervals in the loop. 1381 /// The key is ClassID of target-provided register class. 1382 SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers; 1383 }; 1384 1385 /// \return Returns information about the register usages of the loop for the 1386 /// given vectorization factors. 1387 SmallVector<RegisterUsage, 8> 1388 calculateRegisterUsage(ArrayRef<ElementCount> VFs); 1389 1390 /// Collect values we want to ignore in the cost model. 1391 void collectValuesToIgnore(); 1392 1393 /// Collect all element types in the loop for which widening is needed. 1394 void collectElementTypesForWidening(); 1395 1396 /// Split reductions into those that happen in the loop, and those that happen 1397 /// outside. In loop reductions are collected into InLoopReductionChains. 1398 void collectInLoopReductions(); 1399 1400 /// Returns true if we should use strict in-order reductions for the given 1401 /// RdxDesc. This is true if the -enable-strict-reductions flag is passed, 1402 /// the IsOrdered flag of RdxDesc is set and we do not allow reordering 1403 /// of FP operations. 1404 bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) { 1405 return !Hints->allowReordering() && RdxDesc.isOrdered(); 1406 } 1407 1408 /// \returns The smallest bitwidth each instruction can be represented with. 1409 /// The vector equivalents of these instructions should be truncated to this 1410 /// type. 
1411 const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const { 1412 return MinBWs; 1413 } 1414 1415 /// \returns True if it is more profitable to scalarize instruction \p I for 1416 /// vectorization factor \p VF. 1417 bool isProfitableToScalarize(Instruction *I, ElementCount VF) const { 1418 assert(VF.isVector() && 1419 "Profitable to scalarize relevant only for VF > 1."); 1420 1421 // Cost model is not run in the VPlan-native path - return conservative 1422 // result until this changes. 1423 if (EnableVPlanNativePath) 1424 return false; 1425 1426 auto Scalars = InstsToScalarize.find(VF); 1427 assert(Scalars != InstsToScalarize.end() && 1428 "VF not yet analyzed for scalarization profitability"); 1429 return Scalars->second.find(I) != Scalars->second.end(); 1430 } 1431 1432 /// Returns true if \p I is known to be uniform after vectorization. 1433 bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const { 1434 if (VF.isScalar()) 1435 return true; 1436 1437 // Cost model is not run in the VPlan-native path - return conservative 1438 // result until this changes. 1439 if (EnableVPlanNativePath) 1440 return false; 1441 1442 auto UniformsPerVF = Uniforms.find(VF); 1443 assert(UniformsPerVF != Uniforms.end() && 1444 "VF not yet analyzed for uniformity"); 1445 return UniformsPerVF->second.count(I); 1446 } 1447 1448 /// Returns true if \p I is known to be scalar after vectorization. 1449 bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const { 1450 if (VF.isScalar()) 1451 return true; 1452 1453 // Cost model is not run in the VPlan-native path - return conservative 1454 // result until this changes. 
1455 if (EnableVPlanNativePath) 1456 return false; 1457 1458 auto ScalarsPerVF = Scalars.find(VF); 1459 assert(ScalarsPerVF != Scalars.end() && 1460 "Scalar values are not calculated for VF"); 1461 return ScalarsPerVF->second.count(I); 1462 } 1463 1464 /// \returns True if instruction \p I can be truncated to a smaller bitwidth 1465 /// for vectorization factor \p VF. 1466 bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const { 1467 return VF.isVector() && MinBWs.find(I) != MinBWs.end() && 1468 !isProfitableToScalarize(I, VF) && 1469 !isScalarAfterVectorization(I, VF); 1470 } 1471 1472 /// Decision that was taken during cost calculation for memory instruction. 1473 enum InstWidening { 1474 CM_Unknown, 1475 CM_Widen, // For consecutive accesses with stride +1. 1476 CM_Widen_Reverse, // For consecutive accesses with stride -1. 1477 CM_Interleave, 1478 CM_GatherScatter, 1479 CM_Scalarize 1480 }; 1481 1482 /// Save vectorization decision \p W and \p Cost taken by the cost model for 1483 /// instruction \p I and vector width \p VF. 1484 void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W, 1485 InstructionCost Cost) { 1486 assert(VF.isVector() && "Expected VF >=2"); 1487 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost); 1488 } 1489 1490 /// Save vectorization decision \p W and \p Cost taken by the cost model for 1491 /// interleaving group \p Grp and vector width \p VF. 1492 void setWideningDecision(const InterleaveGroup<Instruction> *Grp, 1493 ElementCount VF, InstWidening W, 1494 InstructionCost Cost) { 1495 assert(VF.isVector() && "Expected VF >=2"); 1496 /// Broadcast this decicion to all instructions inside the group. 1497 /// But the cost will be assigned to one instruction only. 
1498 for (unsigned i = 0; i < Grp->getFactor(); ++i) { 1499 if (auto *I = Grp->getMember(i)) { 1500 if (Grp->getInsertPos() == I) 1501 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost); 1502 else 1503 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0); 1504 } 1505 } 1506 } 1507 1508 /// Return the cost model decision for the given instruction \p I and vector 1509 /// width \p VF. Return CM_Unknown if this instruction did not pass 1510 /// through the cost modeling. 1511 InstWidening getWideningDecision(Instruction *I, ElementCount VF) const { 1512 assert(VF.isVector() && "Expected VF to be a vector VF"); 1513 // Cost model is not run in the VPlan-native path - return conservative 1514 // result until this changes. 1515 if (EnableVPlanNativePath) 1516 return CM_GatherScatter; 1517 1518 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF); 1519 auto Itr = WideningDecisions.find(InstOnVF); 1520 if (Itr == WideningDecisions.end()) 1521 return CM_Unknown; 1522 return Itr->second.first; 1523 } 1524 1525 /// Return the vectorization cost for the given instruction \p I and vector 1526 /// width \p VF. 1527 InstructionCost getWideningCost(Instruction *I, ElementCount VF) { 1528 assert(VF.isVector() && "Expected VF >=2"); 1529 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF); 1530 assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() && 1531 "The cost is not calculated"); 1532 return WideningDecisions[InstOnVF].second; 1533 } 1534 1535 /// Return True if instruction \p I is an optimizable truncate whose operand 1536 /// is an induction variable. Such a truncate will be removed by adding a new 1537 /// induction variable with the destination type. 1538 bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) { 1539 // If the instruction is not a truncate, return false. 
1540 auto *Trunc = dyn_cast<TruncInst>(I); 1541 if (!Trunc) 1542 return false; 1543 1544 // Get the source and destination types of the truncate. 1545 Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF); 1546 Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF); 1547 1548 // If the truncate is free for the given types, return false. Replacing a 1549 // free truncate with an induction variable would add an induction variable 1550 // update instruction to each iteration of the loop. We exclude from this 1551 // check the primary induction variable since it will need an update 1552 // instruction regardless. 1553 Value *Op = Trunc->getOperand(0); 1554 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy)) 1555 return false; 1556 1557 // If the truncated value is not an induction variable, return false. 1558 return Legal->isInductionPhi(Op); 1559 } 1560 1561 /// Collects the instructions to scalarize for each predicated instruction in 1562 /// the loop. 1563 void collectInstsToScalarize(ElementCount VF); 1564 1565 /// Collect Uniform and Scalar values for the given \p VF. 1566 /// The sets depend on CM decision for Load/Store instructions 1567 /// that may be vectorized as interleave, gather-scatter or scalarized. 1568 void collectUniformsAndScalars(ElementCount VF) { 1569 // Do the analysis once. 1570 if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end()) 1571 return; 1572 setCostBasedWideningDecision(VF); 1573 collectLoopUniforms(VF); 1574 collectLoopScalars(VF); 1575 } 1576 1577 /// Returns true if the target machine supports masked store operation 1578 /// for the given \p DataType and kind of access to \p Ptr. 
1579 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const { 1580 return Legal->isConsecutivePtr(DataType, Ptr) && 1581 TTI.isLegalMaskedStore(DataType, Alignment); 1582 } 1583 1584 /// Returns true if the target machine supports masked load operation 1585 /// for the given \p DataType and kind of access to \p Ptr. 1586 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const { 1587 return Legal->isConsecutivePtr(DataType, Ptr) && 1588 TTI.isLegalMaskedLoad(DataType, Alignment); 1589 } 1590 1591 /// Returns true if the target machine can represent \p V as a masked gather 1592 /// or scatter operation. 1593 bool isLegalGatherOrScatter(Value *V) { 1594 bool LI = isa<LoadInst>(V); 1595 bool SI = isa<StoreInst>(V); 1596 if (!LI && !SI) 1597 return false; 1598 auto *Ty = getLoadStoreType(V); 1599 Align Align = getLoadStoreAlignment(V); 1600 return (LI && TTI.isLegalMaskedGather(Ty, Align)) || 1601 (SI && TTI.isLegalMaskedScatter(Ty, Align)); 1602 } 1603 1604 /// Returns true if the target machine supports all of the reduction 1605 /// variables found for the given VF. 1606 bool canVectorizeReductions(ElementCount VF) const { 1607 return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 1608 const RecurrenceDescriptor &RdxDesc = Reduction.second; 1609 return TTI.isLegalToVectorizeReduction(RdxDesc, VF); 1610 })); 1611 } 1612 1613 /// Returns true if \p I is an instruction that will be scalarized with 1614 /// predication. Such instructions include conditional stores and 1615 /// instructions that may divide by zero. 1616 /// If a non-zero VF has been calculated, we check if I will be scalarized 1617 /// predication for that VF. 1618 bool isScalarWithPredication(Instruction *I) const; 1619 1620 // Returns true if \p I is an instruction that will be predicated either 1621 // through scalar predication or masked load/store or masked gather/scatter. 
1622 // Superset of instructions that return true for isScalarWithPredication. 1623 bool isPredicatedInst(Instruction *I, bool IsKnownUniform = false) { 1624 // When we know the load is uniform and the original scalar loop was not 1625 // predicated we don't need to mark it as a predicated instruction. Any 1626 // vectorised blocks created when tail-folding are something artificial we 1627 // have introduced and we know there is always at least one active lane. 1628 // That's why we call Legal->blockNeedsPredication here because it doesn't 1629 // query tail-folding. 1630 if (IsKnownUniform && isa<LoadInst>(I) && 1631 !Legal->blockNeedsPredication(I->getParent())) 1632 return false; 1633 if (!blockNeedsPredicationForAnyReason(I->getParent())) 1634 return false; 1635 // Loads and stores that need some form of masked operation are predicated 1636 // instructions. 1637 if (isa<LoadInst>(I) || isa<StoreInst>(I)) 1638 return Legal->isMaskRequired(I); 1639 return isScalarWithPredication(I); 1640 } 1641 1642 /// Returns true if \p I is a memory instruction with consecutive memory 1643 /// access that can be widened. 1644 bool 1645 memoryInstructionCanBeWidened(Instruction *I, 1646 ElementCount VF = ElementCount::getFixed(1)); 1647 1648 /// Returns true if \p I is a memory instruction in an interleaved-group 1649 /// of memory accesses that can be vectorized with wide vector loads/stores 1650 /// and shuffles. 1651 bool 1652 interleavedAccessCanBeWidened(Instruction *I, 1653 ElementCount VF = ElementCount::getFixed(1)); 1654 1655 /// Check if \p Instr belongs to any interleaved access group. 1656 bool isAccessInterleaved(Instruction *Instr) { 1657 return InterleaveInfo.isInterleaved(Instr); 1658 } 1659 1660 /// Get the interleaved access group that \p Instr belongs to. 
1661 const InterleaveGroup<Instruction> * 1662 getInterleavedAccessGroup(Instruction *Instr) { 1663 return InterleaveInfo.getInterleaveGroup(Instr); 1664 } 1665 1666 /// Returns true if we're required to use a scalar epilogue for at least 1667 /// the final iteration of the original loop. 1668 bool requiresScalarEpilogue(ElementCount VF) const { 1669 if (!isScalarEpilogueAllowed()) 1670 return false; 1671 // If we might exit from anywhere but the latch, must run the exiting 1672 // iteration in scalar form. 1673 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) 1674 return true; 1675 return VF.isVector() && InterleaveInfo.requiresScalarEpilogue(); 1676 } 1677 1678 /// Returns true if a scalar epilogue is not allowed due to optsize or a 1679 /// loop hint annotation. 1680 bool isScalarEpilogueAllowed() const { 1681 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed; 1682 } 1683 1684 /// Returns true if all loop blocks should be masked to fold tail loop. 1685 bool foldTailByMasking() const { return FoldTailByMasking; } 1686 1687 /// Returns true if the instructions in this block requires predication 1688 /// for any reason, e.g. because tail folding now requires a predicate 1689 /// or because the block in the original loop was predicated. 1690 bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const { 1691 return foldTailByMasking() || Legal->blockNeedsPredication(BB); 1692 } 1693 1694 /// A SmallMapVector to store the InLoop reduction op chains, mapping phi 1695 /// nodes to the chain of instructions representing the reductions. Uses a 1696 /// MapVector to ensure deterministic iteration order. 1697 using ReductionChainMap = 1698 SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>; 1699 1700 /// Return the chain of instructions representing an inloop reduction. 1701 const ReductionChainMap &getInLoopReductionChains() const { 1702 return InLoopReductionChains; 1703 } 1704 1705 /// Returns true if the Phi is part of an inloop reduction. 
bool isInLoopReduction(PHINode *Phi) const {
  // The phi is an in-loop reduction iff it is a key of the chain map.
  return InLoopReductionChains.count(Phi);
}

/// Estimate cost of an intrinsic call instruction \p CI if it were vectorized
/// with factor \p VF. Return the cost of the instruction, including
/// scalarization overhead if it's needed.
InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;

/// Estimate cost of a call instruction \p CI if it were vectorized with
/// factor \p VF. Return the cost of the instruction, including scalarization
/// overhead if it's needed. The flag \p NeedToScalarize shows if the call
/// needs to be scalarized - i.e. either vector version isn't available, or is
/// too expensive.
InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF,
                                  bool &NeedToScalarize) const;

/// Returns true if the per-lane cost of VectorizationFactor \p A is lower
/// than that of \p B.
bool isMoreProfitable(const VectorizationFactor &A,
                      const VectorizationFactor &B) const;

/// Invalidates decisions already taken by the cost model: clears the
/// recorded widening decisions and the per-VF uniform and scalar sets.
void invalidateCostModelingDecisions() {
  WideningDecisions.clear();
  Uniforms.clear();
  Scalars.clear();
}

private:
unsigned NumPredStores = 0;

/// \return An upper bound for the vectorization factors for both
/// fixed and scalable vectorization, where the minimum-known number of
/// elements is a power-of-2 larger than zero. If scalable vectorization is
/// disabled or unsupported, then the scalable part will be equal to
/// ElementCount::getScalable(0).
FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount,
                                         ElementCount UserVF);

/// \return the maximized element count based on the targets vector
/// registers and the loop trip-count, but limited to a maximum safe VF.
/// This is a helper function of computeFeasibleMaxVF.
/// FIXME: MaxSafeVF is currently passed by reference to avoid some obscure
/// issue that occurred on one of the buildbots which cannot be reproduced
/// without having access to the proprietary compiler (see comments on
/// D98509). The issue is currently under investigation and this workaround
/// will be removed as soon as possible.
ElementCount getMaximizedVFForTarget(unsigned ConstTripCount,
                                     unsigned SmallestType,
                                     unsigned WidestType,
                                     const ElementCount &MaxSafeVF);

/// \return the maximum legal scalable VF, based on the safe max number
/// of elements.
ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);

/// The vectorization cost is a combination of the cost itself and a boolean
/// indicating whether any of the contributing operations will actually
/// operate on vector values after type legalization in the backend. If this
/// latter value is false, then all operations will be scalarized (i.e. no
/// vectorization has actually taken place).
using VectorizationCostTy = std::pair<InstructionCost, bool>;

/// An (instruction, VF) pair; the element type of the \p Invalid list that
/// expectedCost can fill in.
using InstructionVFPair = std::pair<Instruction *, ElementCount>;
/// Returns the expected execution cost. The unit of the cost does
/// not matter because we use the 'cost' units to compare different
/// vector widths. The cost that is returned is *not* normalized by
/// the factor width. If \p Invalid is not nullptr, this function
/// will add a pair(Instruction*, ElementCount) to \p Invalid for
/// each instruction that has an Invalid cost for the given VF.
VectorizationCostTy
expectedCost(ElementCount VF,
             SmallVectorImpl<InstructionVFPair> *Invalid = nullptr);

/// Returns the execution time cost of an instruction for a given vector
/// width. Vector width of one means scalar.
VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);

/// The cost-computation logic from getInstructionCost which provides
/// the vector type as an output parameter (\p VectorTy).
InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
                                   Type *&VectorTy);

/// Return the cost of instructions in an inloop reduction pattern, if \p I is
/// part of that pattern.
Optional<InstructionCost>
getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
                        TTI::TargetCostKind CostKind);

/// Calculate vectorization cost of memory instruction \p I.
InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);

/// The cost computation for a scalarized memory instruction.
InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);

/// The cost computation for an interleaving group of memory instructions.
InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);

/// The cost computation for a Gather/Scatter instruction.
InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);

/// The cost computation for widening instruction \p I with consecutive
/// memory access.
InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);

/// The cost calculation for Load/Store instruction \p I with uniform pointer -
/// Load: scalar load + broadcast.
/// Store: scalar store + (loop invariant value stored? 0 : extract of last
/// element)
InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);

/// Estimate the overhead of scalarizing an instruction. This is a
/// convenience wrapper for the type-based getScalarizationOverhead API.
InstructionCost getScalarizationOverhead(Instruction *I,
                                         ElementCount VF) const;

/// Returns whether the instruction is a load or store and will be emitted
/// as a vector operation.
bool isConsecutiveLoadOrStore(Instruction *I);

/// Returns true if an artificially high cost for emulated masked memrefs
/// should be used.
bool useEmulatedMaskMemRefHack(Instruction *I);

/// Map of scalar integer values to the smallest bitwidth they can be legally
/// represented as. The vector equivalents of these values should be truncated
/// to this type.
MapVector<Instruction *, uint64_t> MinBWs;

/// A type representing the costs for instructions if they were to be
/// scalarized rather than vectorized. The entries are Instruction-Cost
/// pairs.
using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;

/// A set containing all BasicBlocks that are known to be present after
/// vectorization as a predicated block.
SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;

/// Records whether it is allowed to have the original scalar loop execute at
/// least once. This may be needed as a fallback loop in case runtime
/// aliasing/dependence checks fail, or to handle the tail/remainder
/// iterations when the trip count is unknown or doesn't divide by the VF,
/// or as a peel-loop to handle gaps in interleave-groups.
/// Under optsize and when the trip count is very small we don't allow any
/// iterations to execute in the scalar loop.
ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;

/// All blocks of loop are to be masked to fold tail of scalar iterations.
bool FoldTailByMasking = false;

/// A map holding scalar costs for different vectorization factors.
/// The presence of a cost for an instruction in the mapping indicates that the
/// instruction will be scalarized when vectorizing with the associated
/// vectorization factor. The entries are VF-ScalarCostsTy pairs.
DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;

/// Holds the instructions known to be uniform after vectorization.
/// The data is collected per VF.
DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;

/// Holds the instructions known to be scalar after vectorization.
/// The data is collected per VF.
DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;

/// Holds the instructions (address computations) that are forced to be
/// scalarized. The data is keyed by VF.
DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;

/// PHINodes of the reductions that should be expanded in-loop along with
/// their associated chains of reduction operations, in program order from top
/// (PHI) to bottom.
ReductionChainMap InLoopReductionChains;

/// A Map of inloop reduction operations and their immediate chain operand.
/// FIXME: This can be removed once reductions can be costed correctly in
/// vplan. This was added to allow quick lookup to the inloop operations,
/// without having to loop through InLoopReductionChains.
DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;

/// Returns the expected difference in cost from scalarizing the expression
/// feeding a predicated instruction \p PredInst. The instructions to
/// scalarize and their scalar costs are collected in \p ScalarCosts. A
/// non-negative return value implies the expression will be scalarized.
/// Currently, only single-use chains are considered for scalarization.
int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
                            ElementCount VF);

/// Collect the instructions that are uniform after vectorization. An
/// instruction is uniform if we represent it with a single scalar value in
/// the vectorized loop corresponding to each vector iteration. Examples of
/// uniform instructions include pointer operands of consecutive or
/// interleaved memory accesses. Note that although uniformity implies an
/// instruction will be scalar, the reverse is not true. In general, a
/// scalarized instruction will be represented by VF scalar values in the
/// vectorized loop, each corresponding to an iteration of the original
/// scalar loop.
void collectLoopUniforms(ElementCount VF);

/// Collect the instructions that are scalar after vectorization. An
/// instruction is scalar if it is known to be uniform or will be scalarized
/// during vectorization. collectLoopScalars should only add non-uniform nodes
/// to the list if they are used by a load/store instruction that is marked as
/// CM_Scalarize. Non-uniform scalarized instructions will be represented by
/// VF values in the vectorized loop, each corresponding to an iteration of
/// the original scalar loop.
void collectLoopScalars(ElementCount VF);

/// Keeps cost model vectorization decision and cost for instructions.
/// Right now it is used for memory instructions only.
/// The key is an (instruction, VF) pair; the value is the chosen
/// InstWidening together with its InstructionCost.
using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
                              std::pair<InstWidening, InstructionCost>>;

/// The widening decisions recorded so far.
DecisionList WideningDecisions;

/// Returns true if \p V is expected to be vectorized and it needs to be
/// extracted.
1923 bool needsExtract(Value *V, ElementCount VF) const { 1924 Instruction *I = dyn_cast<Instruction>(V); 1925 if (VF.isScalar() || !I || !TheLoop->contains(I) || 1926 TheLoop->isLoopInvariant(I)) 1927 return false; 1928 1929 // Assume we can vectorize V (and hence we need extraction) if the 1930 // scalars are not computed yet. This can happen, because it is called 1931 // via getScalarizationOverhead from setCostBasedWideningDecision, before 1932 // the scalars are collected. That should be a safe assumption in most 1933 // cases, because we check if the operands have vectorizable types 1934 // beforehand in LoopVectorizationLegality. 1935 return Scalars.find(VF) == Scalars.end() || 1936 !isScalarAfterVectorization(I, VF); 1937 }; 1938 1939 /// Returns a range containing only operands needing to be extracted. 1940 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, 1941 ElementCount VF) const { 1942 return SmallVector<Value *, 4>(make_filter_range( 1943 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); 1944 } 1945 1946 /// Determines if we have the infrastructure to vectorize loop \p L and its 1947 /// epilogue, assuming the main loop is vectorized by \p VF. 1948 bool isCandidateForEpilogueVectorization(const Loop &L, 1949 const ElementCount VF) const; 1950 1951 /// Returns true if epilogue vectorization is considered profitable, and 1952 /// false otherwise. 1953 /// \p VF is the vectorization factor chosen for the original loop. 1954 bool isEpilogueVectorizationProfitable(const ElementCount VF) const; 1955 1956 public: 1957 /// The loop that we evaluate. 1958 Loop *TheLoop; 1959 1960 /// Predicated scalar evolution analysis. 1961 PredicatedScalarEvolution &PSE; 1962 1963 /// Loop Info analysis. 1964 LoopInfo *LI; 1965 1966 /// Vectorization legality. 1967 LoopVectorizationLegality *Legal; 1968 1969 /// Vector target information. 1970 const TargetTransformInfo &TTI; 1971 1972 /// Target Library Info. 
const TargetLibraryInfo *TLI;

/// Demanded bits analysis.
DemandedBits *DB;

/// Assumption cache.
AssumptionCache *AC;

/// Interface to emit optimization remarks.
OptimizationRemarkEmitter *ORE;

/// The function being processed.
/// NOTE(review): presumably the function containing TheLoop - confirm.
const Function *TheFunction;

/// Loop Vectorize Hint.
const LoopVectorizeHints *Hints;

/// The interleave access information contains groups of interleaved accesses
/// with the same stride and close to each other.
InterleavedAccessInfo &InterleaveInfo;

/// Values to ignore in the cost model.
SmallPtrSet<const Value *, 16> ValuesToIgnore;

/// Values to ignore in the cost model when VF > 1.
SmallPtrSet<const Value *, 16> VecValuesToIgnore;

/// All element types found in the loop.
SmallPtrSet<Type *, 16> ElementTypesInLoop;

/// Profitable vector factors.
SmallVector<VectorizationFactor, 8> ProfitableVFs;
};
} // end namespace llvm

/// Helper struct to manage generating runtime checks for vectorization.
///
/// The runtime checks are created up-front in temporary blocks to allow better
/// estimating the cost and un-linked from the existing IR. After deciding to
/// vectorize, the checks are moved back. If deciding not to vectorize, the
/// temporary blocks are completely removed.
class GeneratedRTChecks {
  /// Basic block which contains the generated SCEV checks, if any.
  BasicBlock *SCEVCheckBlock = nullptr;

  /// The value representing the result of the generated SCEV checks. If it is
  /// nullptr, either no SCEV checks have been generated or they have been used.
  Value *SCEVCheckCond = nullptr;

  /// Basic block which contains the generated memory runtime checks, if any.
  BasicBlock *MemCheckBlock = nullptr;

  /// The value representing the result of the generated memory runtime checks.
  /// If it is nullptr, either no memory runtime checks have been generated or
  /// they have been used.
  Value *MemRuntimeCheckCond = nullptr;

  /// Dominator tree and loop info that are kept up to date while the check
  /// blocks are created, unhooked and re-inserted.
  DominatorTree *DT;
  LoopInfo *LI;

  /// Expanders used for the SCEV predicate checks and for the memory runtime
  /// checks, respectively.
  SCEVExpander SCEVExp;
  SCEVExpander MemCheckExp;

public:
  /// NOTE(review): both expanders are constructed with the same "scev.check"
  /// name - confirm this is intended for MemCheckExp.
  GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
                    const DataLayout &DL)
      : DT(DT), LI(LI), SCEVExp(SE, DL, "scev.check"),
        MemCheckExp(SE, DL, "scev.check") {}

  /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
  /// accurately estimate the cost of the runtime checks. The blocks are
  /// un-linked from the IR and are added back during vector code generation.
  /// If there is no vector code generation, the check blocks are removed
  /// completely.
  ///
  /// \param L         the loop the checks are generated for; its preheader is
  ///                  split to host the temporary check blocks.
  /// \param LAI       supplies the runtime pointer checks.
  /// \param UnionPred the SCEV predicate to expand; nothing is generated for
  ///                  it when it is always true.
  void Create(Loop *L, const LoopAccessInfo &LAI,
              const SCEVUnionPredicate &UnionPred) {

    BasicBlock *LoopHeader = L->getHeader();
    BasicBlock *Preheader = L->getLoopPreheader();

    // Use SplitBlock to create blocks for SCEV & memory runtime checks to
    // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
    // may be used by SCEVExpander. The blocks will be un-linked from their
    // predecessors and removed from LI & DT at the end of the function.
    if (!UnionPred.isAlwaysTrue()) {
      SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
                                  nullptr, "vector.scevcheck");

      SCEVCheckCond = SCEVExp.expandCodeForPredicate(
          &UnionPred, SCEVCheckBlock->getTerminator());
    }

    const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
    if (RtPtrChecking.Need) {
      // Split at the SCEV check block if one was created, otherwise at the
      // preheader.
      auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
      MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
                                 "vector.memcheck");

      MemRuntimeCheckCond =
          addRuntimeChecks(MemCheckBlock->getTerminator(), L,
                           RtPtrChecking.getChecks(), MemCheckExp);
      assert(MemRuntimeCheckCond &&
             "no RT checks generated although RtPtrChecking "
             "claimed checks are required");
    }

    // Nothing was generated, so there is nothing to unhook.
    if (!MemCheckBlock && !SCEVCheckBlock)
      return;

    // Unhook the temporary block with the checks, update various places
    // accordingly.
    if (SCEVCheckBlock)
      SCEVCheckBlock->replaceAllUsesWith(Preheader);
    if (MemCheckBlock)
      MemCheckBlock->replaceAllUsesWith(Preheader);

    // For each check block: move its terminator into the preheader, terminate
    // the check block with an unreachable instead, and erase the preheader's
    // old terminator.
    if (SCEVCheckBlock) {
      SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
      new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
      Preheader->getTerminator()->eraseFromParent();
    }
    if (MemCheckBlock) {
      MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
      new UnreachableInst(Preheader->getContext(), MemCheckBlock);
      Preheader->getTerminator()->eraseFromParent();
    }

    // The preheader dominates the header directly again; drop the temporary
    // blocks from the dominator tree and loop info.
    DT->changeImmediateDominator(LoopHeader, Preheader);
    if (MemCheckBlock) {
      DT->eraseNode(MemCheckBlock);
      LI->removeBlock(MemCheckBlock);
    }
    if (SCEVCheckBlock) {
      DT->eraseNode(SCEVCheckBlock);
      LI->removeBlock(SCEVCheckBlock);
    }
  }

  /// Remove the created SCEV & memory runtime check blocks & instructions, if
  /// unused.
  ///
  /// A check counts as unused while its condition member is still non-null;
  /// emitSCEVChecks / emitMemRuntimeChecks reset it to nullptr once the check
  /// has been inserted into the IR.
  ~GeneratedRTChecks() {
    SCEVExpanderCleaner SCEVCleaner(SCEVExp, *DT);
    SCEVExpanderCleaner MemCheckCleaner(MemCheckExp, *DT);
    // A null condition means the check was either never generated or has been
    // used; in both cases the cleaner is told the result is used.
    if (!SCEVCheckCond)
      SCEVCleaner.markResultUsed();

    if (!MemRuntimeCheckCond)
      MemCheckCleaner.markResultUsed();

    if (MemRuntimeCheckCond) {
      auto &SE = *MemCheckExp.getSE();
      // Memory runtime check generation creates compares that use expanded
      // values. Remove them before running the SCEVExpanderCleaners.
      // Walk the block bottom-up and skip what the expander itself inserted.
      for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) {
        if (MemCheckExp.isInsertedInstruction(&I))
          continue;
        SE.forgetValue(&I);
        I.eraseFromParent();
      }
    }
    MemCheckCleaner.cleanup();
    SCEVCleaner.cleanup();

    // Finally delete the still-unused check blocks themselves.
    if (SCEVCheckCond)
      SCEVCheckBlock->eraseFromParent();
    if (MemRuntimeCheckCond)
      MemCheckBlock->eraseFromParent();
  }

  /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and
  /// adjusts the branches to branch to the vector preheader or \p Bypass,
  /// depending on the generated condition.
  ///
  /// \returns the inserted SCEVCheckBlock, or nullptr if there is no unused
  /// SCEV check or its condition is the constant zero.
  /// Note: \p L and \p LoopExitBlock are not referenced in the body.
  BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass,
                             BasicBlock *LoopVectorPreHeader,
                             BasicBlock *LoopExitBlock) {
    if (!SCEVCheckCond)
      return nullptr;
    // A condition that is the constant zero needs no check block.
    if (auto *C = dyn_cast<ConstantInt>(SCEVCheckCond))
      if (C->isZero())
        return nullptr;

    auto *Pred = LoopVectorPreHeader->getSinglePredecessor();

    BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock);
    // If the vector preheader belongs to a loop, register the check block
    // with that loop as well.
    if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
      PL->addBasicBlockToLoop(SCEVCheckBlock, *LI);

    SCEVCheckBlock->getTerminator()->eraseFromParent();
    SCEVCheckBlock->moveBefore(LoopVectorPreHeader);
    // Route the predecessor's edge through the check block.
    Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
                                                SCEVCheckBlock);

    DT->addNewBlock(SCEVCheckBlock, Pred);
    DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock);

    // Terminate the check block with a conditional branch on the generated
    // condition, targeting Bypass or the vector preheader.
    ReplaceInstWithInst(
        SCEVCheckBlock->getTerminator(),
        BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheckCond));
    // Mark the check as used, to prevent it from being removed during cleanup.
    SCEVCheckCond = nullptr;
    return SCEVCheckBlock;
  }

  /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts
  /// the branches to branch to the vector preheader or \p Bypass, depending on
  /// the generated condition.
  ///
  /// \returns the inserted MemCheckBlock, or nullptr if there is no unused
  /// memory runtime check.
  /// Note: \p L is not referenced in the body.
  BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass,
                                   BasicBlock *LoopVectorPreHeader) {
    // Check if we generated code that checks in runtime if arrays overlap.
    if (!MemRuntimeCheckCond)
      return nullptr;

    // Route the predecessor's edge through the check block.
    auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
    Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
                                                MemCheckBlock);

    DT->addNewBlock(MemCheckBlock, Pred);
    DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock);
    MemCheckBlock->moveBefore(LoopVectorPreHeader);

    // If the vector preheader belongs to a loop, register the check block
    // with that loop as well.
    if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
      PL->addBasicBlockToLoop(MemCheckBlock, *LI);

    // Terminate the check block with a conditional branch on the generated
    // condition and give it the predecessor terminator's debug location.
    ReplaceInstWithInst(
        MemCheckBlock->getTerminator(),
        BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond));
    MemCheckBlock->getTerminator()->setDebugLoc(
        Pred->getTerminator()->getDebugLoc());

    // Mark the check as used, to prevent it from being removed during cleanup.
    MemRuntimeCheckCond = nullptr;
    return MemCheckBlock;
  }
};

// Return true if \p OuterLp is an outer loop annotated with hints for explicit
// vectorization. The loop needs to be annotated with #pragma omp simd
// simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
// vector length information is not provided, vectorization is not considered
// explicit. Interleave hints are not allowed either. These limitations will be
// relaxed in the future.
// Please, note that we are currently forced to abuse the pragma 'clang
// vectorize' semantics. This pragma provides *auto-vectorization hints*
// (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
// provides *explicit vectorization hints* (LV can bypass legal checks and
// assume that vectorization is legal). However, both hints are implemented
// using the same metadata (llvm.loop.vectorize, processed by
// LoopVectorizeHints). This will be fixed in the future when the native IR
// representation for pragma 'omp simd' is introduced.
static bool isExplicitVecOuterLoop(Loop *OuterLp,
                                   OptimizationRemarkEmitter *ORE) {
  assert(!OuterLp->isInnermost() && "This is not an outer loop");
  LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);

  // Only outer loops with an explicit vectorization hint are supported.
  // Unannotated outer loops are ignored.
  if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
    return false;

  // The hints themselves may forbid vectorizing this loop.
  Function *Fn = OuterLp->getHeader()->getParent();
  if (!Hints.allowVectorization(Fn, OuterLp,
                                true /*VectorizeOnlyWhenForced*/)) {
    LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
    return false;
  }

  if (Hints.getInterleave() > 1) {
    // TODO: Interleave support is future work.
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
                         "outer loops.\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  return true;
}

// Append to \p V the loops in the nest rooted at \p L that are candidates for
// vectorization. Once a loop is collected, the loops nested inside it are not
// visited; otherwise the search recurses into its sub-loops.
static void collectSupportedLoops(Loop &L, LoopInfo *LI,
                                  OptimizationRemarkEmitter *ORE,
                                  SmallVectorImpl<Loop *> &V) {
  // Collect inner loops and outer loops without irreducible control flow. For
  // now, only collect outer loops that have explicit vectorization hints. If we
  // are stress testing the VPlan H-CFG construction, we collect the outermost
  // loop of every loop nest.
  if (L.isInnermost() || VPlanBuildStressTest ||
      (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
    LoopBlocksRPO RPOT(&L);
    RPOT.perform(LI);
    if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
      V.push_back(&L);
      // TODO: Collect inner loops inside marked outer loops in case
      // vectorization fails for the outer loop. Do not invoke
      // 'containsIrreducibleCFG' again for inner loops when the outer loop is
      // already known to be reducible. We can use an inherited attribute for
      // that.
      return;
    }
  }
  // L itself was not collected: look at its sub-loops instead.
  for (Loop *InnerL : L)
    collectSupportedLoops(*InnerL, LI, ORE, V);
}

namespace {

/// The LoopVectorize Pass.
2278 struct LoopVectorize : public FunctionPass { 2279 /// Pass identification, replacement for typeid 2280 static char ID; 2281 2282 LoopVectorizePass Impl; 2283 2284 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false, 2285 bool VectorizeOnlyWhenForced = false) 2286 : FunctionPass(ID), 2287 Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) { 2288 initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); 2289 } 2290 2291 bool runOnFunction(Function &F) override { 2292 if (skipFunction(F)) 2293 return false; 2294 2295 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 2296 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 2297 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 2298 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 2299 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); 2300 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); 2301 auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr; 2302 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); 2303 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 2304 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); 2305 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); 2306 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 2307 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 2308 2309 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 2310 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; 2311 2312 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC, 2313 GetLAA, *ORE, PSI).MadeAnyChange; 2314 } 2315 2316 void getAnalysisUsage(AnalysisUsage &AU) const override { 2317 AU.addRequired<AssumptionCacheTracker>(); 2318 AU.addRequired<BlockFrequencyInfoWrapperPass>(); 2319 AU.addRequired<DominatorTreeWrapperPass>(); 2320 AU.addRequired<LoopInfoWrapperPass>(); 2321 
AU.addRequired<ScalarEvolutionWrapperPass>(); 2322 AU.addRequired<TargetTransformInfoWrapperPass>(); 2323 AU.addRequired<AAResultsWrapperPass>(); 2324 AU.addRequired<LoopAccessLegacyAnalysis>(); 2325 AU.addRequired<DemandedBitsWrapperPass>(); 2326 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 2327 AU.addRequired<InjectTLIMappingsLegacy>(); 2328 2329 // We currently do not preserve loopinfo/dominator analyses with outer loop 2330 // vectorization. Until this is addressed, mark these analyses as preserved 2331 // only for non-VPlan-native path. 2332 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 2333 if (!EnableVPlanNativePath) { 2334 AU.addPreserved<LoopInfoWrapperPass>(); 2335 AU.addPreserved<DominatorTreeWrapperPass>(); 2336 } 2337 2338 AU.addPreserved<BasicAAWrapperPass>(); 2339 AU.addPreserved<GlobalsAAWrapperPass>(); 2340 AU.addRequired<ProfileSummaryInfoWrapperPass>(); 2341 } 2342 }; 2343 2344 } // end anonymous namespace 2345 2346 //===----------------------------------------------------------------------===// 2347 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 2348 // LoopVectorizationCostModel and LoopVectorizationPlanner. 2349 //===----------------------------------------------------------------------===// 2350 2351 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { 2352 // We need to place the broadcast of invariant variables outside the loop, 2353 // but only if it's proven safe to do so. Else, broadcast will be inside 2354 // vector loop body. 2355 Instruction *Instr = dyn_cast<Instruction>(V); 2356 bool SafeToHoist = OrigLoop->isLoopInvariant(V) && 2357 (!Instr || 2358 DT->dominates(Instr->getParent(), LoopVectorPreHeader)); 2359 // Place the code for broadcasting invariant variables in the new preheader. 
2360 IRBuilder<>::InsertPointGuard Guard(Builder); 2361 if (SafeToHoist) 2362 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2363 2364 // Broadcast the scalar into all locations in the vector. 2365 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); 2366 2367 return Shuf; 2368 } 2369 2370 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( 2371 const InductionDescriptor &II, Value *Step, Value *Start, 2372 Instruction *EntryVal, VPValue *Def, VPValue *CastDef, 2373 VPTransformState &State) { 2374 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 2375 "Expected either an induction phi-node or a truncate of it!"); 2376 2377 // Construct the initial value of the vector IV in the vector loop preheader 2378 auto CurrIP = Builder.saveIP(); 2379 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2380 if (isa<TruncInst>(EntryVal)) { 2381 assert(Start->getType()->isIntegerTy() && 2382 "Truncation requires an integer type"); 2383 auto *TruncType = cast<IntegerType>(EntryVal->getType()); 2384 Step = Builder.CreateTrunc(Step, TruncType); 2385 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); 2386 } 2387 2388 Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0); 2389 Value *SplatStart = Builder.CreateVectorSplat(VF, Start); 2390 Value *SteppedStart = 2391 getStepVector(SplatStart, Zero, Step, II.getInductionOpcode()); 2392 2393 // We create vector phi nodes for both integer and floating-point induction 2394 // variables. Here, we determine the kind of arithmetic we will perform. 2395 Instruction::BinaryOps AddOp; 2396 Instruction::BinaryOps MulOp; 2397 if (Step->getType()->isIntegerTy()) { 2398 AddOp = Instruction::Add; 2399 MulOp = Instruction::Mul; 2400 } else { 2401 AddOp = II.getInductionOpcode(); 2402 MulOp = Instruction::FMul; 2403 } 2404 2405 // Multiply the vectorization factor by the step using integer or 2406 // floating-point arithmetic as appropriate. 
2407 Type *StepType = Step->getType(); 2408 Value *RuntimeVF; 2409 if (Step->getType()->isFloatingPointTy()) 2410 RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, VF); 2411 else 2412 RuntimeVF = getRuntimeVF(Builder, StepType, VF); 2413 Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF); 2414 2415 // Create a vector splat to use in the induction update. 2416 // 2417 // FIXME: If the step is non-constant, we create the vector splat with 2418 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't 2419 // handle a constant vector splat. 2420 Value *SplatVF = isa<Constant>(Mul) 2421 ? ConstantVector::getSplat(VF, cast<Constant>(Mul)) 2422 : Builder.CreateVectorSplat(VF, Mul); 2423 Builder.restoreIP(CurrIP); 2424 2425 // We may need to add the step a number of times, depending on the unroll 2426 // factor. The last of those goes into the PHI. 2427 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", 2428 &*LoopVectorBody->getFirstInsertionPt()); 2429 VecInd->setDebugLoc(EntryVal->getDebugLoc()); 2430 Instruction *LastInduction = VecInd; 2431 for (unsigned Part = 0; Part < UF; ++Part) { 2432 State.set(Def, LastInduction, Part); 2433 2434 if (isa<TruncInst>(EntryVal)) 2435 addMetadata(LastInduction, EntryVal); 2436 recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, CastDef, 2437 State, Part); 2438 2439 LastInduction = cast<Instruction>( 2440 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")); 2441 LastInduction->setDebugLoc(EntryVal->getDebugLoc()); 2442 } 2443 2444 // Move the last step to the end of the latch block. This ensures consistent 2445 // placement of all induction updates. 
2446 auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 2447 auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator()); 2448 auto *ICmp = cast<Instruction>(Br->getCondition()); 2449 LastInduction->moveBefore(ICmp); 2450 LastInduction->setName("vec.ind.next"); 2451 2452 VecInd->addIncoming(SteppedStart, LoopVectorPreHeader); 2453 VecInd->addIncoming(LastInduction, LoopVectorLatch); 2454 } 2455 2456 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const { 2457 return Cost->isScalarAfterVectorization(I, VF) || 2458 Cost->isProfitableToScalarize(I, VF); 2459 } 2460 2461 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const { 2462 if (shouldScalarizeInstruction(IV)) 2463 return true; 2464 auto isScalarInst = [&](User *U) -> bool { 2465 auto *I = cast<Instruction>(U); 2466 return (OrigLoop->contains(I) && shouldScalarizeInstruction(I)); 2467 }; 2468 return llvm::any_of(IV->users(), isScalarInst); 2469 } 2470 2471 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast( 2472 const InductionDescriptor &ID, const Instruction *EntryVal, 2473 Value *VectorLoopVal, VPValue *CastDef, VPTransformState &State, 2474 unsigned Part, unsigned Lane) { 2475 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 2476 "Expected either an induction phi-node or a truncate of it!"); 2477 2478 // This induction variable is not the phi from the original loop but the 2479 // newly-created IV based on the proof that casted Phi is equal to the 2480 // uncasted Phi in the vectorized loop (under a runtime guard possibly). It 2481 // re-uses the same InductionDescriptor that original IV uses but we don't 2482 // have to do any recording in this case - that is done when original IV is 2483 // processed. 
2484 if (isa<TruncInst>(EntryVal)) 2485 return; 2486 2487 if (!CastDef) { 2488 assert(ID.getCastInsts().empty() && 2489 "there are casts for ID, but no CastDef"); 2490 return; 2491 } 2492 assert(!ID.getCastInsts().empty() && 2493 "there is a CastDef, but no casts for ID"); 2494 // Only the first Cast instruction in the Casts vector is of interest. 2495 // The rest of the Casts (if exist) have no uses outside the 2496 // induction update chain itself. 2497 if (Lane < UINT_MAX) 2498 State.set(CastDef, VectorLoopVal, VPIteration(Part, Lane)); 2499 else 2500 State.set(CastDef, VectorLoopVal, Part); 2501 } 2502 2503 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start, 2504 TruncInst *Trunc, VPValue *Def, 2505 VPValue *CastDef, 2506 VPTransformState &State) { 2507 assert((IV->getType()->isIntegerTy() || IV != OldInduction) && 2508 "Primary induction variable must have an integer type"); 2509 2510 auto II = Legal->getInductionVars().find(IV); 2511 assert(II != Legal->getInductionVars().end() && "IV is not an induction"); 2512 2513 auto ID = II->second; 2514 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); 2515 2516 // The value from the original loop to which we are mapping the new induction 2517 // variable. 2518 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; 2519 2520 auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 2521 2522 // Generate code for the induction step. 
Note that induction steps are 2523 // required to be loop-invariant 2524 auto CreateStepValue = [&](const SCEV *Step) -> Value * { 2525 assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) && 2526 "Induction step should be loop invariant"); 2527 if (PSE.getSE()->isSCEVable(IV->getType())) { 2528 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 2529 return Exp.expandCodeFor(Step, Step->getType(), 2530 LoopVectorPreHeader->getTerminator()); 2531 } 2532 return cast<SCEVUnknown>(Step)->getValue(); 2533 }; 2534 2535 // The scalar value to broadcast. This is derived from the canonical 2536 // induction variable. If a truncation type is given, truncate the canonical 2537 // induction variable and step. Otherwise, derive these values from the 2538 // induction descriptor. 2539 auto CreateScalarIV = [&](Value *&Step) -> Value * { 2540 Value *ScalarIV = Induction; 2541 if (IV != OldInduction) { 2542 ScalarIV = IV->getType()->isIntegerTy() 2543 ? Builder.CreateSExtOrTrunc(Induction, IV->getType()) 2544 : Builder.CreateCast(Instruction::SIToFP, Induction, 2545 IV->getType()); 2546 ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID); 2547 ScalarIV->setName("offset.idx"); 2548 } 2549 if (Trunc) { 2550 auto *TruncType = cast<IntegerType>(Trunc->getType()); 2551 assert(Step->getType()->isIntegerTy() && 2552 "Truncation requires an integer step"); 2553 ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType); 2554 Step = Builder.CreateTrunc(Step, TruncType); 2555 } 2556 return ScalarIV; 2557 }; 2558 2559 // Create the vector values from the scalar IV, in the absence of creating a 2560 // vector IV. 
2561 auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) { 2562 Value *Broadcasted = getBroadcastInstrs(ScalarIV); 2563 for (unsigned Part = 0; Part < UF; ++Part) { 2564 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2565 Value *StartIdx; 2566 if (Step->getType()->isFloatingPointTy()) 2567 StartIdx = getRuntimeVFAsFloat(Builder, Step->getType(), VF * Part); 2568 else 2569 StartIdx = getRuntimeVF(Builder, Step->getType(), VF * Part); 2570 2571 Value *EntryPart = 2572 getStepVector(Broadcasted, StartIdx, Step, ID.getInductionOpcode()); 2573 State.set(Def, EntryPart, Part); 2574 if (Trunc) 2575 addMetadata(EntryPart, Trunc); 2576 recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, CastDef, 2577 State, Part); 2578 } 2579 }; 2580 2581 // Fast-math-flags propagate from the original induction instruction. 2582 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 2583 if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp())) 2584 Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags()); 2585 2586 // Now do the actual transformations, and start with creating the step value. 2587 Value *Step = CreateStepValue(ID.getStep()); 2588 if (VF.isZero() || VF.isScalar()) { 2589 Value *ScalarIV = CreateScalarIV(Step); 2590 CreateSplatIV(ScalarIV, Step); 2591 return; 2592 } 2593 2594 // Determine if we want a scalar version of the induction variable. This is 2595 // true if the induction variable itself is not widened, or if it has at 2596 // least one user in the loop that is not widened. 2597 auto NeedsScalarIV = needsScalarInduction(EntryVal); 2598 if (!NeedsScalarIV) { 2599 createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef, 2600 State); 2601 return; 2602 } 2603 2604 // Try to create a new independent vector induction variable. If we can't 2605 // create the phi node, we will splat the scalar induction variable in each 2606 // loop iteration. 
2607 if (!shouldScalarizeInstruction(EntryVal)) { 2608 createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef, 2609 State); 2610 Value *ScalarIV = CreateScalarIV(Step); 2611 // Create scalar steps that can be used by instructions we will later 2612 // scalarize. Note that the addition of the scalar steps will not increase 2613 // the number of instructions in the loop in the common case prior to 2614 // InstCombine. We will be trading one vector extract for each scalar step. 2615 buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State); 2616 return; 2617 } 2618 2619 // All IV users are scalar instructions, so only emit a scalar IV, not a 2620 // vectorised IV. Except when we tail-fold, then the splat IV feeds the 2621 // predicate used by the masked loads/stores. 2622 Value *ScalarIV = CreateScalarIV(Step); 2623 if (!Cost->isScalarEpilogueAllowed()) 2624 CreateSplatIV(ScalarIV, Step); 2625 buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State); 2626 } 2627 2628 Value *InnerLoopVectorizer::getStepVector(Value *Val, Value *StartIdx, 2629 Value *Step, 2630 Instruction::BinaryOps BinOp) { 2631 // Create and check the types. 2632 auto *ValVTy = cast<VectorType>(Val->getType()); 2633 ElementCount VLen = ValVTy->getElementCount(); 2634 2635 Type *STy = Val->getType()->getScalarType(); 2636 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && 2637 "Induction Step must be an integer or FP"); 2638 assert(Step->getType() == STy && "Step has wrong type"); 2639 2640 SmallVector<Constant *, 8> Indices; 2641 2642 // Create a vector of consecutive numbers from zero to VF. 
2643 VectorType *InitVecValVTy = ValVTy; 2644 Type *InitVecValSTy = STy; 2645 if (STy->isFloatingPointTy()) { 2646 InitVecValSTy = 2647 IntegerType::get(STy->getContext(), STy->getScalarSizeInBits()); 2648 InitVecValVTy = VectorType::get(InitVecValSTy, VLen); 2649 } 2650 Value *InitVec = Builder.CreateStepVector(InitVecValVTy); 2651 2652 // Splat the StartIdx 2653 Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx); 2654 2655 if (STy->isIntegerTy()) { 2656 InitVec = Builder.CreateAdd(InitVec, StartIdxSplat); 2657 Step = Builder.CreateVectorSplat(VLen, Step); 2658 assert(Step->getType() == Val->getType() && "Invalid step vec"); 2659 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 2660 // which can be found from the original scalar operations. 2661 Step = Builder.CreateMul(InitVec, Step); 2662 return Builder.CreateAdd(Val, Step, "induction"); 2663 } 2664 2665 // Floating point induction. 2666 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && 2667 "Binary Opcode should be specified for FP induction"); 2668 InitVec = Builder.CreateUIToFP(InitVec, ValVTy); 2669 InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat); 2670 2671 Step = Builder.CreateVectorSplat(VLen, Step); 2672 Value *MulOp = Builder.CreateFMul(InitVec, Step); 2673 return Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); 2674 } 2675 2676 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, 2677 Instruction *EntryVal, 2678 const InductionDescriptor &ID, 2679 VPValue *Def, VPValue *CastDef, 2680 VPTransformState &State) { 2681 // We shouldn't have to build scalar steps if we aren't vectorizing. 2682 assert(VF.isVector() && "VF should be greater than one"); 2683 // Get the value type and ensure it and the step have the same integer type. 
2684 Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); 2685 assert(ScalarIVTy == Step->getType() && 2686 "Val and Step should have the same type"); 2687 2688 // We build scalar steps for both integer and floating-point induction 2689 // variables. Here, we determine the kind of arithmetic we will perform. 2690 Instruction::BinaryOps AddOp; 2691 Instruction::BinaryOps MulOp; 2692 if (ScalarIVTy->isIntegerTy()) { 2693 AddOp = Instruction::Add; 2694 MulOp = Instruction::Mul; 2695 } else { 2696 AddOp = ID.getInductionOpcode(); 2697 MulOp = Instruction::FMul; 2698 } 2699 2700 // Determine the number of scalars we need to generate for each unroll 2701 // iteration. If EntryVal is uniform, we only need to generate the first 2702 // lane. Otherwise, we generate all VF values. 2703 bool IsUniform = 2704 Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF); 2705 unsigned Lanes = IsUniform ? 1 : VF.getKnownMinValue(); 2706 // Compute the scalar steps and save the results in State. 
2707 Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(), 2708 ScalarIVTy->getScalarSizeInBits()); 2709 Type *VecIVTy = nullptr; 2710 Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr; 2711 if (!IsUniform && VF.isScalable()) { 2712 VecIVTy = VectorType::get(ScalarIVTy, VF); 2713 UnitStepVec = Builder.CreateStepVector(VectorType::get(IntStepTy, VF)); 2714 SplatStep = Builder.CreateVectorSplat(VF, Step); 2715 SplatIV = Builder.CreateVectorSplat(VF, ScalarIV); 2716 } 2717 2718 for (unsigned Part = 0; Part < UF; ++Part) { 2719 Value *StartIdx0 = createStepForVF(Builder, IntStepTy, VF, Part); 2720 2721 if (!IsUniform && VF.isScalable()) { 2722 auto *SplatStartIdx = Builder.CreateVectorSplat(VF, StartIdx0); 2723 auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec); 2724 if (ScalarIVTy->isFloatingPointTy()) 2725 InitVec = Builder.CreateSIToFP(InitVec, VecIVTy); 2726 auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep); 2727 auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul); 2728 State.set(Def, Add, Part); 2729 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State, 2730 Part); 2731 // It's useful to record the lane values too for the known minimum number 2732 // of elements so we do those below. This improves the code quality when 2733 // trying to extract the first element, for example. 2734 } 2735 2736 if (ScalarIVTy->isFloatingPointTy()) 2737 StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy); 2738 2739 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 2740 Value *StartIdx = Builder.CreateBinOp( 2741 AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane)); 2742 // The step returned by `createStepForVF` is a runtime-evaluated value 2743 // when VF is scalable. Otherwise, it should be folded into a Constant. 
2744 assert((VF.isScalable() || isa<Constant>(StartIdx)) && 2745 "Expected StartIdx to be folded to a constant when VF is not " 2746 "scalable"); 2747 auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step); 2748 auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul); 2749 State.set(Def, Add, VPIteration(Part, Lane)); 2750 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State, 2751 Part, Lane); 2752 } 2753 } 2754 } 2755 2756 void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def, 2757 const VPIteration &Instance, 2758 VPTransformState &State) { 2759 Value *ScalarInst = State.get(Def, Instance); 2760 Value *VectorValue = State.get(Def, Instance.Part); 2761 VectorValue = Builder.CreateInsertElement( 2762 VectorValue, ScalarInst, 2763 Instance.Lane.getAsRuntimeExpr(State.Builder, VF)); 2764 State.set(Def, VectorValue, Instance.Part); 2765 } 2766 2767 Value *InnerLoopVectorizer::reverseVector(Value *Vec) { 2768 assert(Vec->getType()->isVectorTy() && "Invalid type"); 2769 return Builder.CreateVectorReverse(Vec, "reverse"); 2770 } 2771 2772 // Return whether we allow using masked interleave-groups (for dealing with 2773 // strided loads/stores that reside in predicated blocks, or for dealing 2774 // with gaps). 2775 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { 2776 // If an override option has been passed in for interleaved accesses, use it. 2777 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) 2778 return EnableMaskedInterleavedMemAccesses; 2779 2780 return TTI.enableMaskedInterleavedAccessVectorization(); 2781 } 2782 2783 // Try to vectorize the interleave group that \p Instr belongs to. 2784 // 2785 // E.g. Translate following interleaved load group (factor = 3): 2786 // for (i = 0; i < N; i+=3) { 2787 // R = Pic[i]; // Member of index 0 2788 // G = Pic[i+1]; // Member of index 1 2789 // B = Pic[i+2]; // Member of index 2 2790 // ... 
//     // do something to R, G, B
//   }
// To:
//   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
//   %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9>   ; R elements
//   %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10>  ; G elements
//   %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11>  ; B elements
//
// Or translate following interleaved store group (factor = 3):
//   for (i = 0; i < N; i+=3) {
//     ... do something to R, G, B
//     Pic[i]   = R;           // Member of index 0
//     Pic[i+1] = G;           // Member of index 1
//     Pic[i+2] = B;           // Member of index 2
//   }
// To:
//   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
//   %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
//   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
//        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
//   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
//
// Parameters:
//   Group        - the interleave group; whether a load or a store group is
//                  emitted is decided by the group's insert position.
//   VPDefs       - for load groups, one definition per non-gap member, in
//                  increasing member-index order; each receives the strided
//                  vector of its member for every unroll part.
//   Addr         - address of the insert-position member; only lane 0 of each
//                  unroll part is read.
//   StoredValues - for store groups, the stored value of each member, indexed
//                  by member index.
//   BlockInMask  - optional block mask; null when the access is not masked by
//                  its block.
void InnerLoopVectorizer::vectorizeInterleaveGroup(
    const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs,
    VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues,
    VPValue *BlockInMask) {
  Instruction *Instr = Group->getInsertPos();
  const DataLayout &DL = Instr->getModule()->getDataLayout();

  // Prepare for the vector type of the interleaved load/store.
  Type *ScalarTy = getLoadStoreType(Instr);
  unsigned InterleaveFactor = Group->getFactor();
  assert(!VF.isScalable() && "scalable vectors not yet supported.");
  auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);

  // Prepare for the new pointers.
  SmallVector<Value *, 2> AddrParts;
  unsigned Index = Group->getIndex(Instr);

  // TODO: extend the masked interleaved-group support to reversed access.
  assert((!BlockInMask || !Group->isReverse()) &&
         "Reversed masked interleave-group not supported.");

  // If the group is reverse, adjust the index to refer to the last vector lane
  // instead of the first. We adjust the index from the first vector lane,
  // rather than directly getting the pointer for lane VF - 1, because the
  // pointer operand of the interleaved access is supposed to be uniform. For
  // uniform instructions, we're only required to generate a value for the
  // first vector lane in each unroll iteration.
  if (Group->isReverse())
    Index += (VF.getKnownMinValue() - 1) * Group->getFactor();

  for (unsigned Part = 0; Part < UF; Part++) {
    Value *AddrPart = State.get(Addr, VPIteration(Part, 0));
    setDebugLocFromInst(AddrPart);

    // Notice current instruction could be any index. Need to adjust the address
    // to the member of index 0.
    //
    // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
    //       b = A[i];       // Member of index 0
    // Current pointer is pointed to A[i+1], adjust it to A[i].
    //
    // E.g.  A[i+1] = a;     // Member of index 1
    //       A[i]   = b;     // Member of index 0
    //       A[i+2] = c;     // Member of index 2 (Current instruction)
    // Current pointer is pointed to A[i+2], adjust it to A[i].

    // The adjusted GEP is marked inbounds only if the address it is derived
    // from is itself an inbounds GEP.
    bool InBounds = false;
    if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
      InBounds = gep->isInBounds();
    // Index is unsigned, so -Index wraps; as a 32-bit GEP index this steps
    // back by Index elements.
    AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
    cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);

    // Cast to the vector pointer type.
    unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
    Type *PtrTy = VecTy->getPointerTo(AddressSpace);
    AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
  }

  setDebugLocFromInst(Instr);
  Value *PoisonVec = PoisonValue::get(VecTy);

  // A gap mask is needed when the group would require a scalar epilogue but
  // the cost model does not allow one.
  Value *MaskForGaps = nullptr;
  if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
    MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
    assert(MaskForGaps && "Mask for Gaps is required but it is null");
  }

  // Vectorize the interleaved load group.
  if (isa<LoadInst>(Instr)) {
    // For each unroll part, create a wide load for the group.
    SmallVector<Value *, 2> NewLoads;
    for (unsigned Part = 0; Part < UF; Part++) {
      Instruction *NewLoad;
      if (BlockInMask || MaskForGaps) {
        assert(useMaskedInterleavedAccesses(*TTI) &&
               "masked interleaved groups are not allowed.");
        // The group mask is the block mask replicated once per member, the
        // gap mask, or the AND of both when both are present.
        Value *GroupMask = MaskForGaps;
        if (BlockInMask) {
          Value *BlockInMaskPart = State.get(BlockInMask, Part);
          Value *ShuffledMask = Builder.CreateShuffleVector(
              BlockInMaskPart,
              createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
              "interleaved.mask");
          GroupMask = MaskForGaps
                          ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
                                                MaskForGaps)
                          : ShuffledMask;
        }
        NewLoad =
            Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(),
                                     GroupMask, PoisonVec, "wide.masked.vec");
      }
      else
        NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
                                            Group->getAlign(), "wide.vec");
      Group->addMetadata(NewLoad);
      NewLoads.push_back(NewLoad);
    }

    // For each member in the group, shuffle out the appropriate data from the
    // wide loads.
    // J counts the members actually present, i.e. it indexes VPDefs, while I
    // is the member's position inside the group (gaps included).
    unsigned J = 0;
    for (unsigned I = 0; I < InterleaveFactor; ++I) {
      Instruction *Member = Group->getMember(I);

      // Skip the gaps in the group.
      if (!Member)
        continue;

      auto StrideMask =
          createStrideMask(I, InterleaveFactor, VF.getKnownMinValue());
      for (unsigned Part = 0; Part < UF; Part++) {
        Value *StridedVec = Builder.CreateShuffleVector(
            NewLoads[Part], StrideMask, "strided.vec");

        // If this member has different type, cast the result type.
        if (Member->getType() != ScalarTy) {
          assert(!VF.isScalable() && "VF is assumed to be non scalable.");
          VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
          StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
        }

        if (Group->isReverse())
          StridedVec = reverseVector(StridedVec);

        State.set(VPDefs[J], StridedVec, Part);
      }
      ++J;
    }
    return;
  }

  // The sub vector type for current instruction.
  auto *SubVT = VectorType::get(ScalarTy, VF);

  // Vectorize the interleaved store group.
  // For stores the gap mask is recomputed unconditionally; the helper's result
  // may be null, which the asserts and checks below account for.
  MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
  assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) &&
         "masked interleaved groups are not allowed.");
  assert((!MaskForGaps || !VF.isScalable()) &&
         "masking gaps for scalable vectors is not yet supported.");
  for (unsigned Part = 0; Part < UF; Part++) {
    // Collect the stored vector from each member.
    SmallVector<Value *, 4> StoredVecs;
    for (unsigned i = 0; i < InterleaveFactor; i++) {
      assert((Group->getMember(i) || MaskForGaps) &&
             "Fail to get a member from an interleaved store group");
      Instruction *Member = Group->getMember(i);

      // Skip the gaps in the group.
      // A poison sub-vector keeps the gap's slot in the concatenation.
      if (!Member) {
        Value *Undef = PoisonValue::get(SubVT);
        StoredVecs.push_back(Undef);
        continue;
      }

      Value *StoredVec = State.get(StoredValues[i], Part);

      if (Group->isReverse())
        StoredVec = reverseVector(StoredVec);

      // If this member has different type, cast it to a unified type.

      if (StoredVec->getType() != SubVT)
        StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);

      StoredVecs.push_back(StoredVec);
    }

    // Concatenate all vectors into a wide vector.
    Value *WideVec = concatenateVectors(Builder, StoredVecs);

    // Interleave the elements in the wide vector.
    Value *IVec = Builder.CreateShuffleVector(
        WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor),
        "interleaved.vec");

    Instruction *NewStoreInstr;
    if (BlockInMask || MaskForGaps) {
      // Same mask composition as for masked load groups above.
      Value *GroupMask = MaskForGaps;
      if (BlockInMask) {
        Value *BlockInMaskPart = State.get(BlockInMask, Part);
        Value *ShuffledMask = Builder.CreateShuffleVector(
            BlockInMaskPart,
            createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
            "interleaved.mask");
        GroupMask = MaskForGaps ? Builder.CreateBinOp(Instruction::And,
                                                      ShuffledMask, MaskForGaps)
                                : ShuffledMask;
      }
      NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part],
                                                Group->getAlign(), GroupMask);
    } else
      NewStoreInstr =
          Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());

    Group->addMetadata(NewStoreInstr);
  }
}

void InnerLoopVectorizer::vectorizeMemoryInstruction(
    Instruction *Instr, VPTransformState &State, VPValue *Def, VPValue *Addr,
    VPValue *StoredValue, VPValue *BlockInMask, bool ConsecutiveStride,
    bool Reverse) {
  // Attempt to issue a wide load.
3016 LoadInst *LI = dyn_cast<LoadInst>(Instr); 3017 StoreInst *SI = dyn_cast<StoreInst>(Instr); 3018 3019 assert((LI || SI) && "Invalid Load/Store instruction"); 3020 assert((!SI || StoredValue) && "No stored value provided for widened store"); 3021 assert((!LI || !StoredValue) && "Stored value provided for widened load"); 3022 3023 Type *ScalarDataTy = getLoadStoreType(Instr); 3024 3025 auto *DataTy = VectorType::get(ScalarDataTy, VF); 3026 const Align Alignment = getLoadStoreAlignment(Instr); 3027 bool CreateGatherScatter = !ConsecutiveStride; 3028 3029 VectorParts BlockInMaskParts(UF); 3030 bool isMaskRequired = BlockInMask; 3031 if (isMaskRequired) 3032 for (unsigned Part = 0; Part < UF; ++Part) 3033 BlockInMaskParts[Part] = State.get(BlockInMask, Part); 3034 3035 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { 3036 // Calculate the pointer for the specific unroll-part. 3037 GetElementPtrInst *PartPtr = nullptr; 3038 3039 bool InBounds = false; 3040 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) 3041 InBounds = gep->isInBounds(); 3042 if (Reverse) { 3043 // If the address is consecutive but reversed, then the 3044 // wide store needs to start at the last vector element. 3045 // RunTimeVF = VScale * VF.getKnownMinValue() 3046 // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue() 3047 Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), VF); 3048 // NumElt = -Part * RunTimeVF 3049 Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF); 3050 // LastLane = 1 - RunTimeVF 3051 Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF); 3052 PartPtr = 3053 cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt)); 3054 PartPtr->setIsInBounds(InBounds); 3055 PartPtr = cast<GetElementPtrInst>( 3056 Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane)); 3057 PartPtr->setIsInBounds(InBounds); 3058 if (isMaskRequired) // Reverse of a null all-one mask is a null mask. 
3059 BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]); 3060 } else { 3061 Value *Increment = 3062 createStepForVF(Builder, Builder.getInt32Ty(), VF, Part); 3063 PartPtr = cast<GetElementPtrInst>( 3064 Builder.CreateGEP(ScalarDataTy, Ptr, Increment)); 3065 PartPtr->setIsInBounds(InBounds); 3066 } 3067 3068 unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); 3069 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); 3070 }; 3071 3072 // Handle Stores: 3073 if (SI) { 3074 setDebugLocFromInst(SI); 3075 3076 for (unsigned Part = 0; Part < UF; ++Part) { 3077 Instruction *NewSI = nullptr; 3078 Value *StoredVal = State.get(StoredValue, Part); 3079 if (CreateGatherScatter) { 3080 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 3081 Value *VectorGep = State.get(Addr, Part); 3082 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, 3083 MaskPart); 3084 } else { 3085 if (Reverse) { 3086 // If we store to reverse consecutive memory locations, then we need 3087 // to reverse the order of elements in the stored value. 3088 StoredVal = reverseVector(StoredVal); 3089 // We don't want to update the value in the map as it might be used in 3090 // another expression. So don't call resetVectorValue(StoredVal). 3091 } 3092 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0))); 3093 if (isMaskRequired) 3094 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, 3095 BlockInMaskParts[Part]); 3096 else 3097 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); 3098 } 3099 addMetadata(NewSI, SI); 3100 } 3101 return; 3102 } 3103 3104 // Handle loads. 3105 assert(LI && "Must have a load instruction"); 3106 setDebugLocFromInst(LI); 3107 for (unsigned Part = 0; Part < UF; ++Part) { 3108 Value *NewLI; 3109 if (CreateGatherScatter) { 3110 Value *MaskPart = isMaskRequired ? 
BlockInMaskParts[Part] : nullptr; 3111 Value *VectorGep = State.get(Addr, Part); 3112 NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart, 3113 nullptr, "wide.masked.gather"); 3114 addMetadata(NewLI, LI); 3115 } else { 3116 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0))); 3117 if (isMaskRequired) 3118 NewLI = Builder.CreateMaskedLoad( 3119 DataTy, VecPtr, Alignment, BlockInMaskParts[Part], 3120 PoisonValue::get(DataTy), "wide.masked.load"); 3121 else 3122 NewLI = 3123 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); 3124 3125 // Add metadata to the load, but setVectorValue to the reverse shuffle. 3126 addMetadata(NewLI, LI); 3127 if (Reverse) 3128 NewLI = reverseVector(NewLI); 3129 } 3130 3131 State.set(Def, NewLI, Part); 3132 } 3133 } 3134 3135 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, 3136 VPReplicateRecipe *RepRecipe, 3137 const VPIteration &Instance, 3138 bool IfPredicateInstr, 3139 VPTransformState &State) { 3140 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 3141 3142 // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for 3143 // the first lane and part. 3144 if (isa<NoAliasScopeDeclInst>(Instr)) 3145 if (!Instance.isFirstIteration()) 3146 return; 3147 3148 setDebugLocFromInst(Instr); 3149 3150 // Does this instruction return a value ? 3151 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 3152 3153 Instruction *Cloned = Instr->clone(); 3154 if (!IsVoidRetTy) 3155 Cloned->setName(Instr->getName() + ".cloned"); 3156 3157 // If the scalarized instruction contributes to the address computation of a 3158 // widen masked load/store which was in a basic block that needed predication 3159 // and is not predicated after vectorization, we can't propagate 3160 // poison-generating flags (nuw/nsw, exact, inbounds, etc.). The scalarized 3161 // instruction could feed a poison value to the base address of the widen 3162 // load/store. 
3163 if (State.MayGeneratePoisonRecipes.count(RepRecipe) > 0) 3164 Cloned->dropPoisonGeneratingFlags(); 3165 3166 State.Builder.SetInsertPoint(Builder.GetInsertBlock(), 3167 Builder.GetInsertPoint()); 3168 // Replace the operands of the cloned instructions with their scalar 3169 // equivalents in the new loop. 3170 for (unsigned op = 0, e = RepRecipe->getNumOperands(); op != e; ++op) { 3171 auto *Operand = dyn_cast<Instruction>(Instr->getOperand(op)); 3172 auto InputInstance = Instance; 3173 if (!Operand || !OrigLoop->contains(Operand) || 3174 (Cost->isUniformAfterVectorization(Operand, State.VF))) 3175 InputInstance.Lane = VPLane::getFirstLane(); 3176 auto *NewOp = State.get(RepRecipe->getOperand(op), InputInstance); 3177 Cloned->setOperand(op, NewOp); 3178 } 3179 addNewMetadata(Cloned, Instr); 3180 3181 // Place the cloned scalar in the new loop. 3182 Builder.Insert(Cloned); 3183 3184 State.set(RepRecipe, Cloned, Instance); 3185 3186 // If we just cloned a new assumption, add it the assumption cache. 3187 if (auto *II = dyn_cast<AssumeInst>(Cloned)) 3188 AC->registerAssumption(II); 3189 3190 // End if-block. 3191 if (IfPredicateInstr) 3192 PredicatedInstructions.push_back(Cloned); 3193 } 3194 3195 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start, 3196 Value *End, Value *Step, 3197 Instruction *DL) { 3198 BasicBlock *Header = L->getHeader(); 3199 BasicBlock *Latch = L->getLoopLatch(); 3200 // As we're just creating this loop, it's possible no latch exists 3201 // yet. If so, use the header as this will be a single block loop. 3202 if (!Latch) 3203 Latch = Header; 3204 3205 IRBuilder<> B(&*Header->getFirstInsertionPt()); 3206 Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction); 3207 setDebugLocFromInst(OldInst, &B); 3208 auto *Induction = B.CreatePHI(Start->getType(), 2, "index"); 3209 3210 B.SetInsertPoint(Latch->getTerminator()); 3211 setDebugLocFromInst(OldInst, &B); 3212 3213 // Create i+1 and fill the PHINode. 
3214 // 3215 // If the tail is not folded, we know that End - Start >= Step (either 3216 // statically or through the minimum iteration checks). We also know that both 3217 // Start % Step == 0 and End % Step == 0. We exit the vector loop if %IV + 3218 // %Step == %End. Hence we must exit the loop before %IV + %Step unsigned 3219 // overflows and we can mark the induction increment as NUW. 3220 Value *Next = B.CreateAdd(Induction, Step, "index.next", 3221 /*NUW=*/!Cost->foldTailByMasking(), /*NSW=*/false); 3222 Induction->addIncoming(Start, L->getLoopPreheader()); 3223 Induction->addIncoming(Next, Latch); 3224 // Create the compare. 3225 Value *ICmp = B.CreateICmpEQ(Next, End); 3226 B.CreateCondBr(ICmp, L->getUniqueExitBlock(), Header); 3227 3228 // Now we have two terminators. Remove the old one from the block. 3229 Latch->getTerminator()->eraseFromParent(); 3230 3231 return Induction; 3232 } 3233 3234 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { 3235 if (TripCount) 3236 return TripCount; 3237 3238 assert(L && "Create Trip Count for null loop."); 3239 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 3240 // Find the loop boundaries. 3241 ScalarEvolution *SE = PSE.getSE(); 3242 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 3243 assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && 3244 "Invalid loop count"); 3245 3246 Type *IdxTy = Legal->getWidestInductionType(); 3247 assert(IdxTy && "No type for induction"); 3248 3249 // The exit count might have the type of i64 while the phi is i32. This can 3250 // happen if we have an induction variable that is sign extended before the 3251 // compare. The only way that we get a backedge taken count is that the 3252 // induction variable was signed and as such will not overflow. In such a case 3253 // truncation is legal. 
3254 if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) > 3255 IdxTy->getPrimitiveSizeInBits()) 3256 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy); 3257 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); 3258 3259 // Get the total trip count from the count by adding 1. 3260 const SCEV *ExitCount = SE->getAddExpr( 3261 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 3262 3263 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); 3264 3265 // Expand the trip count and place the new instructions in the preheader. 3266 // Notice that the pre-header does not change, only the loop body. 3267 SCEVExpander Exp(*SE, DL, "induction"); 3268 3269 // Count holds the overall loop count (N). 3270 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), 3271 L->getLoopPreheader()->getTerminator()); 3272 3273 if (TripCount->getType()->isPointerTy()) 3274 TripCount = 3275 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", 3276 L->getLoopPreheader()->getTerminator()); 3277 3278 return TripCount; 3279 } 3280 3281 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { 3282 if (VectorTripCount) 3283 return VectorTripCount; 3284 3285 Value *TC = getOrCreateTripCount(L); 3286 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 3287 3288 Type *Ty = TC->getType(); 3289 // This is where we can make the step a runtime constant. 3290 Value *Step = createStepForVF(Builder, Ty, VF, UF); 3291 3292 // If the tail is to be folded by masking, round the number of iterations N 3293 // up to a multiple of Step instead of rounding down. This is done by first 3294 // adding Step-1 and then rounding down. 
Note that it's ok if this addition 3295 // overflows: the vector induction variable will eventually wrap to zero given 3296 // that it starts at zero and its Step is a power of two; the loop will then 3297 // exit, with the last early-exit vector comparison also producing all-true. 3298 if (Cost->foldTailByMasking()) { 3299 assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && 3300 "VF*UF must be a power of 2 when folding tail by masking"); 3301 assert(!VF.isScalable() && 3302 "Tail folding not yet supported for scalable vectors"); 3303 TC = Builder.CreateAdd( 3304 TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up"); 3305 } 3306 3307 // Now we need to generate the expression for the part of the loop that the 3308 // vectorized body will execute. This is equal to N - (N % Step) if scalar 3309 // iterations are not required for correctness, or N - Step, otherwise. Step 3310 // is equal to the vectorization factor (number of SIMD elements) times the 3311 // unroll factor (number of SIMD instructions). 3312 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); 3313 3314 // There are cases where we *must* run at least one iteration in the remainder 3315 // loop. See the cost model for when this can happen. If the step evenly 3316 // divides the trip count, we set the remainder to be equal to the step. If 3317 // the step does not evenly divide the trip count, no adjustment is necessary 3318 // since there will already be scalar iterations. Note that the minimum 3319 // iterations check ensures that N >= Step. 
3320 if (Cost->requiresScalarEpilogue(VF)) { 3321 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); 3322 R = Builder.CreateSelect(IsZero, Step, R); 3323 } 3324 3325 VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); 3326 3327 return VectorTripCount; 3328 } 3329 3330 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, 3331 const DataLayout &DL) { 3332 // Verify that V is a vector type with same number of elements as DstVTy. 3333 auto *DstFVTy = cast<FixedVectorType>(DstVTy); 3334 unsigned VF = DstFVTy->getNumElements(); 3335 auto *SrcVecTy = cast<FixedVectorType>(V->getType()); 3336 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); 3337 Type *SrcElemTy = SrcVecTy->getElementType(); 3338 Type *DstElemTy = DstFVTy->getElementType(); 3339 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 3340 "Vector elements must have same size"); 3341 3342 // Do a direct cast if element types are castable. 3343 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 3344 return Builder.CreateBitOrPointerCast(V, DstFVTy); 3345 } 3346 // V cannot be directly casted to desired vector type. 3347 // May happen when V is a floating point vector but DstVTy is a vector of 3348 // pointers or vice-versa. Handle this using a two-step bitcast using an 3349 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 
3350 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 3351 "Only one type should be a pointer type"); 3352 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 3353 "Only one type should be a floating point type"); 3354 Type *IntTy = 3355 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 3356 auto *VecIntTy = FixedVectorType::get(IntTy, VF); 3357 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 3358 return Builder.CreateBitOrPointerCast(CastVal, DstFVTy); 3359 } 3360 3361 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, 3362 BasicBlock *Bypass) { 3363 Value *Count = getOrCreateTripCount(L); 3364 // Reuse existing vector loop preheader for TC checks. 3365 // Note that new preheader block is generated for vector loop. 3366 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 3367 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 3368 3369 // Generate code to check if the loop's trip count is less than VF * UF, or 3370 // equal to it in case a scalar epilogue is required; this implies that the 3371 // vector trip count is zero. This check also covers the case where adding one 3372 // to the backedge-taken count overflowed leading to an incorrect trip count 3373 // of zero. In this case we will also jump to the scalar loop. 3374 auto P = Cost->requiresScalarEpilogue(VF) ? ICmpInst::ICMP_ULE 3375 : ICmpInst::ICMP_ULT; 3376 3377 // If tail is to be folded, vector loop takes care of all iterations. 3378 Value *CheckMinIters = Builder.getFalse(); 3379 if (!Cost->foldTailByMasking()) { 3380 Value *Step = createStepForVF(Builder, Count->getType(), VF, UF); 3381 CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check"); 3382 } 3383 // Create new preheader for vector loop. 
3384 LoopVectorPreHeader = 3385 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, 3386 "vector.ph"); 3387 3388 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 3389 DT->getNode(Bypass)->getIDom()) && 3390 "TC check is expected to dominate Bypass"); 3391 3392 // Update dominator for Bypass & LoopExit (if needed). 3393 DT->changeImmediateDominator(Bypass, TCCheckBlock); 3394 if (!Cost->requiresScalarEpilogue(VF)) 3395 // If there is an epilogue which must run, there's no edge from the 3396 // middle block to exit blocks and thus no need to update the immediate 3397 // dominator of the exit blocks. 3398 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 3399 3400 ReplaceInstWithInst( 3401 TCCheckBlock->getTerminator(), 3402 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 3403 LoopBypassBlocks.push_back(TCCheckBlock); 3404 } 3405 3406 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { 3407 3408 BasicBlock *const SCEVCheckBlock = 3409 RTChecks.emitSCEVChecks(L, Bypass, LoopVectorPreHeader, LoopExitBlock); 3410 if (!SCEVCheckBlock) 3411 return nullptr; 3412 3413 assert(!(SCEVCheckBlock->getParent()->hasOptSize() || 3414 (OptForSizeBasedOnProfile && 3415 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) && 3416 "Cannot SCEV check stride or overflow when optimizing for size"); 3417 3418 3419 // Update dominator only if this is first RT check. 3420 if (LoopBypassBlocks.empty()) { 3421 DT->changeImmediateDominator(Bypass, SCEVCheckBlock); 3422 if (!Cost->requiresScalarEpilogue(VF)) 3423 // If there is an epilogue which must run, there's no edge from the 3424 // middle block to exit blocks and thus no need to update the immediate 3425 // dominator of the exit blocks. 
3426 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock); 3427 } 3428 3429 LoopBypassBlocks.push_back(SCEVCheckBlock); 3430 AddedSafetyChecks = true; 3431 return SCEVCheckBlock; 3432 } 3433 3434 BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, 3435 BasicBlock *Bypass) { 3436 // VPlan-native path does not do any analysis for runtime checks currently. 3437 if (EnableVPlanNativePath) 3438 return nullptr; 3439 3440 BasicBlock *const MemCheckBlock = 3441 RTChecks.emitMemRuntimeChecks(L, Bypass, LoopVectorPreHeader); 3442 3443 // Check if we generated code that checks in runtime if arrays overlap. We put 3444 // the checks into a separate block to make the more common case of few 3445 // elements faster. 3446 if (!MemCheckBlock) 3447 return nullptr; 3448 3449 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) { 3450 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && 3451 "Cannot emit memory checks when optimizing for size, unless forced " 3452 "to vectorize."); 3453 ORE->emit([&]() { 3454 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize", 3455 L->getStartLoc(), L->getHeader()) 3456 << "Code-size may be reduced by not forcing " 3457 "vectorization, or by source-code modifications " 3458 "eliminating the need for runtime checks " 3459 "(e.g., adding 'restrict')."; 3460 }); 3461 } 3462 3463 LoopBypassBlocks.push_back(MemCheckBlock); 3464 3465 AddedSafetyChecks = true; 3466 3467 // We currently don't use LoopVersioning for the actual loop cloning but we 3468 // still use it to add the noalias metadata. 
3469 LVer = std::make_unique<LoopVersioning>( 3470 *Legal->getLAI(), 3471 Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, 3472 DT, PSE.getSE()); 3473 LVer->prepareNoAliasMetadata(); 3474 return MemCheckBlock; 3475 } 3476 3477 Value *InnerLoopVectorizer::emitTransformedIndex( 3478 IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL, 3479 const InductionDescriptor &ID) const { 3480 3481 SCEVExpander Exp(*SE, DL, "induction"); 3482 auto Step = ID.getStep(); 3483 auto StartValue = ID.getStartValue(); 3484 assert(Index->getType()->getScalarType() == Step->getType() && 3485 "Index scalar type does not match StepValue type"); 3486 3487 // Note: the IR at this point is broken. We cannot use SE to create any new 3488 // SCEV and then expand it, hoping that SCEV's simplification will give us 3489 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may 3490 // lead to various SCEV crashes. So all we can do is to use builder and rely 3491 // on InstCombine for future simplifications. Here we handle some trivial 3492 // cases only. 3493 auto CreateAdd = [&B](Value *X, Value *Y) { 3494 assert(X->getType() == Y->getType() && "Types don't match!"); 3495 if (auto *CX = dyn_cast<ConstantInt>(X)) 3496 if (CX->isZero()) 3497 return Y; 3498 if (auto *CY = dyn_cast<ConstantInt>(Y)) 3499 if (CY->isZero()) 3500 return X; 3501 return B.CreateAdd(X, Y); 3502 }; 3503 3504 // We allow X to be a vector type, in which case Y will potentially be 3505 // splatted into a vector with the same element count. 
3506 auto CreateMul = [&B](Value *X, Value *Y) { 3507 assert(X->getType()->getScalarType() == Y->getType() && 3508 "Types don't match!"); 3509 if (auto *CX = dyn_cast<ConstantInt>(X)) 3510 if (CX->isOne()) 3511 return Y; 3512 if (auto *CY = dyn_cast<ConstantInt>(Y)) 3513 if (CY->isOne()) 3514 return X; 3515 VectorType *XVTy = dyn_cast<VectorType>(X->getType()); 3516 if (XVTy && !isa<VectorType>(Y->getType())) 3517 Y = B.CreateVectorSplat(XVTy->getElementCount(), Y); 3518 return B.CreateMul(X, Y); 3519 }; 3520 3521 // Get a suitable insert point for SCEV expansion. For blocks in the vector 3522 // loop, choose the end of the vector loop header (=LoopVectorBody), because 3523 // the DomTree is not kept up-to-date for additional blocks generated in the 3524 // vector loop. By using the header as insertion point, we guarantee that the 3525 // expanded instructions dominate all their uses. 3526 auto GetInsertPoint = [this, &B]() { 3527 BasicBlock *InsertBB = B.GetInsertPoint()->getParent(); 3528 if (InsertBB != LoopVectorBody && 3529 LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB)) 3530 return LoopVectorBody->getTerminator(); 3531 return &*B.GetInsertPoint(); 3532 }; 3533 3534 switch (ID.getKind()) { 3535 case InductionDescriptor::IK_IntInduction: { 3536 assert(!isa<VectorType>(Index->getType()) && 3537 "Vector indices not supported for integer inductions yet"); 3538 assert(Index->getType() == StartValue->getType() && 3539 "Index type does not match StartValue type"); 3540 if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne()) 3541 return B.CreateSub(StartValue, Index); 3542 auto *Offset = CreateMul( 3543 Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint())); 3544 return CreateAdd(StartValue, Offset); 3545 } 3546 case InductionDescriptor::IK_PtrInduction: { 3547 assert(isa<SCEVConstant>(Step) && 3548 "Expected constant step for pointer induction"); 3549 return B.CreateGEP( 3550 ID.getElementType(), StartValue, 3551 
CreateMul(Index, 3552 Exp.expandCodeFor(Step, Index->getType()->getScalarType(), 3553 GetInsertPoint()))); 3554 } 3555 case InductionDescriptor::IK_FpInduction: { 3556 assert(!isa<VectorType>(Index->getType()) && 3557 "Vector indices not supported for FP inductions yet"); 3558 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); 3559 auto InductionBinOp = ID.getInductionBinOp(); 3560 assert(InductionBinOp && 3561 (InductionBinOp->getOpcode() == Instruction::FAdd || 3562 InductionBinOp->getOpcode() == Instruction::FSub) && 3563 "Original bin op should be defined for FP induction"); 3564 3565 Value *StepValue = cast<SCEVUnknown>(Step)->getValue(); 3566 Value *MulExp = B.CreateFMul(StepValue, Index); 3567 return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, 3568 "induction"); 3569 } 3570 case InductionDescriptor::IK_NoInduction: 3571 return nullptr; 3572 } 3573 llvm_unreachable("invalid enum"); 3574 } 3575 3576 Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { 3577 LoopScalarBody = OrigLoop->getHeader(); 3578 LoopVectorPreHeader = OrigLoop->getLoopPreheader(); 3579 assert(LoopVectorPreHeader && "Invalid loop structure"); 3580 LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr 3581 assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF)) && 3582 "multiple exit loop without required epilogue?"); 3583 3584 LoopMiddleBlock = 3585 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 3586 LI, nullptr, Twine(Prefix) + "middle.block"); 3587 LoopScalarPreHeader = 3588 SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI, 3589 nullptr, Twine(Prefix) + "scalar.ph"); 3590 3591 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3592 3593 // Set up the middle block terminator. Two cases: 3594 // 1) If we know that we must execute the scalar epilogue, emit an 3595 // unconditional branch. 
3596 // 2) Otherwise, we must have a single unique exit block (due to how we 3597 // implement the multiple exit case). In this case, set up a conditonal 3598 // branch from the middle block to the loop scalar preheader, and the 3599 // exit block. completeLoopSkeleton will update the condition to use an 3600 // iteration check, if required to decide whether to execute the remainder. 3601 BranchInst *BrInst = Cost->requiresScalarEpilogue(VF) ? 3602 BranchInst::Create(LoopScalarPreHeader) : 3603 BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, 3604 Builder.getTrue()); 3605 BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3606 ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst); 3607 3608 // We intentionally don't let SplitBlock to update LoopInfo since 3609 // LoopVectorBody should belong to another loop than LoopVectorPreHeader. 3610 // LoopVectorBody is explicitly added to the correct place few lines later. 3611 LoopVectorBody = 3612 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 3613 nullptr, nullptr, Twine(Prefix) + "vector.body"); 3614 3615 // Update dominator for loop exit. 3616 if (!Cost->requiresScalarEpilogue(VF)) 3617 // If there is an epilogue which must run, there's no edge from the 3618 // middle block to exit blocks and thus no need to update the immediate 3619 // dominator of the exit blocks. 3620 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); 3621 3622 // Create and register the new vector loop. 3623 Loop *Lp = LI->AllocateLoop(); 3624 Loop *ParentLoop = OrigLoop->getParentLoop(); 3625 3626 // Insert the new loop into the loop nest and register the new basic blocks 3627 // before calling any utilities such as SCEV that require valid LoopInfo. 
3628 if (ParentLoop) { 3629 ParentLoop->addChildLoop(Lp); 3630 } else { 3631 LI->addTopLevelLoop(Lp); 3632 } 3633 Lp->addBasicBlockToLoop(LoopVectorBody, *LI); 3634 return Lp; 3635 } 3636 3637 void InnerLoopVectorizer::createInductionResumeValues( 3638 Loop *L, Value *VectorTripCount, 3639 std::pair<BasicBlock *, Value *> AdditionalBypass) { 3640 assert(VectorTripCount && L && "Expected valid arguments"); 3641 assert(((AdditionalBypass.first && AdditionalBypass.second) || 3642 (!AdditionalBypass.first && !AdditionalBypass.second)) && 3643 "Inconsistent information about additional bypass."); 3644 // We are going to resume the execution of the scalar loop. 3645 // Go over all of the induction variables that we found and fix the 3646 // PHIs that are left in the scalar version of the loop. 3647 // The starting values of PHI nodes depend on the counter of the last 3648 // iteration in the vectorized loop. 3649 // If we come from a bypass edge then we need to start from the original 3650 // start value. 3651 for (auto &InductionEntry : Legal->getInductionVars()) { 3652 PHINode *OrigPhi = InductionEntry.first; 3653 InductionDescriptor II = InductionEntry.second; 3654 3655 // Create phi nodes to merge from the backedge-taken check block. 3656 PHINode *BCResumeVal = 3657 PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", 3658 LoopScalarPreHeader->getTerminator()); 3659 // Copy original phi DL over to the new one. 3660 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); 3661 Value *&EndValue = IVEndValues[OrigPhi]; 3662 Value *EndValueFromAdditionalBypass = AdditionalBypass.second; 3663 if (OrigPhi == OldInduction) { 3664 // We know what the end value is. 3665 EndValue = VectorTripCount; 3666 } else { 3667 IRBuilder<> B(L->getLoopPreheader()->getTerminator()); 3668 3669 // Fast-math-flags propagate from the original induction instruction. 
// Fast-math flags of a floating-point induction update are installed on the
// builder before the end value is computed.
      if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
        B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());

      // Cast the vector trip count to the step type and transform it into the
      // induction's value at the end of the vector loop.
      Type *StepType = II.getStep()->getType();
      Instruction::CastOps CastOp =
          CastInst::getCastOpcode(VectorTripCount, true, StepType, true);
      Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd");
      const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout();
      EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
      EndValue->setName("ind.end");

      // Compute the end value for the additional bypass (if applicable).
      if (AdditionalBypass.first) {
        B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt()));
        CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true,
                                         StepType, true);
        CRD =
            B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd");
        EndValueFromAdditionalBypass =
            emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
        EndValueFromAdditionalBypass->setName("ind.end");
      }
    }
    // The new PHI merges the original incoming value, in case of a bypass,
    // or the value at the end of the vectorized loop.
    BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);

    // Fix the scalar body counter (PHI node).
    // The old induction's phi node in the scalar body needs the truncated
    // value.
    for (BasicBlock *BB : LoopBypassBlocks)
      BCResumeVal->addIncoming(II.getStartValue(), BB);

    // The additional bypass block, when present, resumes from its own end
    // value rather than from the induction's start value.
    if (AdditionalBypass.first)
      BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first,
                                            EndValueFromAdditionalBypass);

    OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
  }
}

/// Finish the loop skeleton for the vector loop \p L.
///
/// Installs the "cmp.n" exit compare in the middle block when neither a scalar
/// epilogue is required nor the tail is folded by masking, moves the builder's
/// insertion point to the start of LoopVectorBody, and sets the loop ID of
/// \p L (either the follow-up ID derived from \p OrigLoopID, or the original
/// loop's ID marked as already vectorized).
///
/// \returns LoopVectorPreHeader.
BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L,
                                                      MDNode *OrigLoopID) {
  assert(L && "Expected valid loop.");

  // The trip counts should be cached by now.
  Value *Count = getOrCreateTripCount(L);
  Value *VectorTripCount = getOrCreateVectorTripCount(L);

  auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();

  // Add a check in the middle block to see if we have completed
  // all of the iterations in the first vector loop.  Three cases:
  // 1) If we require a scalar epilogue, there is no conditional branch as
  //    we unconditionally branch to the scalar preheader.  Do nothing.
  // 2) If (N - N%VF) == N, then we *don't* need to run the remainder.
  //    Thus if tail is to be folded, we know we don't need to run the
  //    remainder and we can use the previous value for the condition (true).
  // 3) Otherwise, construct a runtime check.
  if (!Cost->requiresScalarEpilogue(VF) && !Cost->foldTailByMasking()) {
    Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
                                        Count, VectorTripCount, "cmp.n",
                                        LoopMiddleBlock->getTerminator());

    // Here we use the same DebugLoc as the scalar loop latch terminator instead
    // of the corresponding compare because they may have ended up with
    // different line numbers and we want to avoid awkward line stepping while
    // debugging. Eg. if the compare has got a line number inside the loop.
    CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc());
    cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN);
  }

  // Get ready to start creating new instructions into the vectorized body.
  assert(LoopVectorPreHeader == L->getLoopPreheader() &&
         "Inconsistent vector loop preheader");
  Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());

  Optional<MDNode *> VectorizedLoopID =
      makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
                                      LLVMLoopVectorizeFollowupVectorized});
  if (VectorizedLoopID.hasValue()) {
    L->setLoopID(VectorizedLoopID.getValue());

    // Do not setAlreadyVectorized if loop attributes have been defined
    // explicitly.
    return LoopVectorPreHeader;
  }

  // Keep all loop hints from the original loop on the vector loop (we'll
  // replace the vectorizer-specific hints below).
  if (MDNode *LID = OrigLoop->getLoopID())
    L->setLoopID(LID);

  LoopVectorizeHints Hints(L, true, *ORE);
  Hints.setAlreadyVectorized();

#ifdef EXPENSIVE_CHECKS
  assert(DT->verify(DominatorTree::VerificationLevel::Fast));
  LI->verify(*DT);
#endif

  return LoopVectorPreHeader;
}

/// Build the skeleton of the vectorized loop: the vector loop itself, the
/// runtime-check blocks in front of it, the canonical induction variable and
/// the resume values for the scalar loop.
///
/// \returns the block returned by completeLoopSkeleton().
BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
  /*
   In this function we generate a new loop. The new loop will contain
   the vectorized instructions while the old loop will continue to run the
   scalar remainder.

       [ ] <-- loop iteration number check.
    /   |
   /    v
  |    [ ] <-- vector loop bypass (may consist of multiple blocks).
  |  /  |
  | /   v
  ||   [ ]     <-- vector pre header.
  |/    |
  |     v
  |    [  ] \
  |    [  ]_|   <-- vector loop.
  |     |
  |     v
  \   -[ ]   <--- middle-block.
   \/   |
   /\   v
   | ->[ ]     <--- new preheader.
   |    |
 (opt)  v      <-- edge from middle to exit iff epilogue is not required.
   |   [ ] \
   |   [ ]_|   <-- old scalar loop to handle remainder (scalar epilogue).
    \   |
     \  v
      >[ ]     <-- exit block(s).
   ...
   */

  // Get the metadata of the original loop before it gets modified.
  MDNode *OrigLoopID = OrigLoop->getLoopID();

  // Workaround!  Compute the trip count of the original loop and cache it
  // before we start modifying the CFG.  This code has a systemic problem
  // wherein it tries to run analysis over partially constructed IR; this is
  // wrong, and not simply for SCEV.  The trip count of the original loop
  // simply happens to be prone to hitting this in practice.  In theory, we
  // can hit the same issue for any SCEV, or ValueTracking query done during
  // mutation.  See PR49900.
  getOrCreateTripCount(OrigLoop);

  // Create an empty vector loop, and prepare basic blocks for the runtime
  // checks.
  Loop *Lp = createVectorLoopSkeleton("");

  // Now, compare the new count to zero. If it is zero skip the vector loop and
  // jump to the scalar loop. This check also covers the case where the
  // backedge-taken count is uint##_max: adding one to it will overflow leading
  // to an incorrect trip count of zero. In this (rare) case we will also jump
  // to the scalar loop.
  emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader);

  // Generate the code to check any assumptions that we've made for SCEV
  // expressions.
  emitSCEVChecks(Lp, LoopScalarPreHeader);

  // Generate the code that checks in runtime if arrays overlap. We put the
  // checks into a separate block to make the more common case of few elements
  // faster.
  emitMemRuntimeChecks(Lp, LoopScalarPreHeader);

  // Some loops have a single integer induction variable, while other loops
  // don't. One example is c++ iterators that often have multiple pointer
  // induction variables. In the code below we also support a case where we
  // don't have a single induction variable.
  //
  // We try to obtain an induction variable from the original loop as hard
  // as possible. However if we don't find one that:
  //   - is an integer
  //   - counts from zero, stepping by one
  //   - is the size of the widest induction variable type
  // then we create a new one.
  OldInduction = Legal->getPrimaryInduction();
  Type *IdxTy = Legal->getWidestInductionType();
  Value *StartIdx = ConstantInt::get(IdxTy, 0);
  // The loop step is equal to the vectorization factor (num of SIMD elements)
  // times the unroll factor (num of SIMD instructions).
  Builder.SetInsertPoint(&*Lp->getHeader()->getFirstInsertionPt());
  Value *Step = createStepForVF(Builder, IdxTy, VF, UF);
  Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
  Induction =
      createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
                              getDebugLocFromInstOrOperands(OldInduction));

  // Emit phis for the new starting index of the scalar loop.
  createInductionResumeValues(Lp, CountRoundDown);

  return completeLoopSkeleton(Lp, OrigLoopID);
}

// Fix up external users of the induction variable. At this point, we are
// in LCSSA form, with all external PHIs that use the IV having one input value,
// coming from the remainder loop. We need those PHIs to also have a correct
// value for the IV when arriving directly from the middle block.
//
// \p OrigPhi is the induction phi of the original loop and \p II its
// descriptor. \p CountRoundDown is the vector trip count, \p EndValue the value
// handed to users of the post-increment, and \p MiddleBlock the block for
// which the new incoming values are added.
void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
                                       const InductionDescriptor &II,
                                       Value *CountRoundDown, Value *EndValue,
                                       BasicBlock *MiddleBlock) {
  // There are two kinds of external IV usages - those that use the value
  // computed in the last iteration (the PHI) and those that use the penultimate
  // value (the value that feeds into the phi from the loop latch).
  // We allow both, but they, obviously, have different values.

  assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");

  // Maps each external LCSSA phi to the value it must receive from the middle
  // block.
  DenseMap<Value *, Value *> MissingVals;

  // An external user of the last iteration's value should see the value that
  // the remainder loop uses to initialize its own IV.
  Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
  for (User *U : PostInc->users()) {
    Instruction *UI = cast<Instruction>(U);
    if (!OrigLoop->contains(UI)) {
      assert(isa<PHINode>(UI) && "Expected LCSSA form");
      MissingVals[UI] = EndValue;
    }
  }

  // An external user of the penultimate value needs to see EndValue - Step.
  // The simplest way to get this is to recompute it from the constituent SCEVs,
  // that is Start + (Step * (CRD - 1)).
  for (User *U : OrigPhi->users()) {
    auto *UI = cast<Instruction>(U);
    if (!OrigLoop->contains(UI)) {
      const DataLayout &DL =
          OrigLoop->getHeader()->getModule()->getDataLayout();
      assert(isa<PHINode>(UI) && "Expected LCSSA form");

      IRBuilder<> B(MiddleBlock->getTerminator());

      // Fast-math-flags propagate from the original induction instruction.
      if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
        B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());

      Value *CountMinusOne = B.CreateSub(
          CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
      // Bring (CRD - 1) into the step type: SIToFP when the step is not an
      // integer, sign-extend or truncate otherwise.
      Value *CMO =
          !II.getStep()->getType()->isIntegerTy()
              ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
                             II.getStep()->getType())
              : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
      CMO->setName("cast.cmo");
      Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
      Escape->setName("ind.escape");
      MissingVals[UI] = Escape;
    }
  }

  for (auto &I : MissingVals) {
    PHINode *PHI = cast<PHINode>(I.first);
    // One corner case we have to handle is two IVs "chasing" each-other,
    // that is %IV2 = phi [...], [ %IV1, %latch ]
    // In this case, if IV1 has an external use, we need to avoid adding both
    // "last value of IV1" and "penultimate value of IV2". So, verify that we
    // don't already have an incoming value for the middle block.
    if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
      PHI->addIncoming(I.second, MiddleBlock);
  }
}

namespace {

/// DenseMap traits that key instructions by structure (opcode plus operands)
/// instead of by address, so that identical instructions collide in the map.
struct CSEDenseMapInfo {
  /// Only insertelement, extractelement, shufflevector and getelementptr
  /// instructions are supported.
  static bool canHandle(const Instruction *I) {
    return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
           isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
  }

  static inline Instruction *getEmptyKey() {
    return DenseMapInfo<Instruction *>::getEmptyKey();
  }

  static inline Instruction *getTombstoneKey() {
    return DenseMapInfo<Instruction *>::getTombstoneKey();
  }

  /// Hash the opcode together with the operand values.
  static unsigned getHashValue(const Instruction *I) {
    assert(canHandle(I) && "Unknown instruction!");
    return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
                                                           I->value_op_end()));
  }

  static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
    // The sentinel keys are not real instructions and must only be compared
    // by address.
    if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
        LHS == getTombstoneKey() || RHS == getTombstoneKey())
      return LHS == RHS;
    return LHS->isIdenticalTo(RHS);
  }
};

} // end anonymous namespace

/// Perform cse of induction variable instructions.
3971 static void cse(BasicBlock *BB) { 3972 // Perform simple cse. 3973 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 3974 for (Instruction &In : llvm::make_early_inc_range(*BB)) { 3975 if (!CSEDenseMapInfo::canHandle(&In)) 3976 continue; 3977 3978 // Check if we can replace this instruction with any of the 3979 // visited instructions. 3980 if (Instruction *V = CSEMap.lookup(&In)) { 3981 In.replaceAllUsesWith(V); 3982 In.eraseFromParent(); 3983 continue; 3984 } 3985 3986 CSEMap[&In] = &In; 3987 } 3988 } 3989 3990 InstructionCost 3991 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF, 3992 bool &NeedToScalarize) const { 3993 Function *F = CI->getCalledFunction(); 3994 Type *ScalarRetTy = CI->getType(); 3995 SmallVector<Type *, 4> Tys, ScalarTys; 3996 for (auto &ArgOp : CI->args()) 3997 ScalarTys.push_back(ArgOp->getType()); 3998 3999 // Estimate cost of scalarized vector call. The source operands are assumed 4000 // to be vectors, so we need to extract individual elements from there, 4001 // execute VF scalar calls, and then gather the result into the vector return 4002 // value. 4003 InstructionCost ScalarCallCost = 4004 TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput); 4005 if (VF.isScalar()) 4006 return ScalarCallCost; 4007 4008 // Compute corresponding vector type for return value and arguments. 4009 Type *RetTy = ToVectorTy(ScalarRetTy, VF); 4010 for (Type *ScalarTy : ScalarTys) 4011 Tys.push_back(ToVectorTy(ScalarTy, VF)); 4012 4013 // Compute costs of unpacking argument values for the scalar calls and 4014 // packing the return values to a vector. 4015 InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF); 4016 4017 InstructionCost Cost = 4018 ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost; 4019 4020 // If we can't emit a vector call for this function, then the currently found 4021 // cost is the cost we need to return. 
4022 NeedToScalarize = true; 4023 VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 4024 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); 4025 4026 if (!TLI || CI->isNoBuiltin() || !VecFunc) 4027 return Cost; 4028 4029 // If the corresponding vector cost is cheaper, return its cost. 4030 InstructionCost VectorCallCost = 4031 TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput); 4032 if (VectorCallCost < Cost) { 4033 NeedToScalarize = false; 4034 Cost = VectorCallCost; 4035 } 4036 return Cost; 4037 } 4038 4039 static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) { 4040 if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy())) 4041 return Elt; 4042 return VectorType::get(Elt, VF); 4043 } 4044 4045 InstructionCost 4046 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, 4047 ElementCount VF) const { 4048 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 4049 assert(ID && "Expected intrinsic call!"); 4050 Type *RetTy = MaybeVectorizeType(CI->getType(), VF); 4051 FastMathFlags FMF; 4052 if (auto *FPMO = dyn_cast<FPMathOperator>(CI)) 4053 FMF = FPMO->getFastMathFlags(); 4054 4055 SmallVector<const Value *> Arguments(CI->args()); 4056 FunctionType *FTy = CI->getCalledFunction()->getFunctionType(); 4057 SmallVector<Type *> ParamTys; 4058 std::transform(FTy->param_begin(), FTy->param_end(), 4059 std::back_inserter(ParamTys), 4060 [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); }); 4061 4062 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF, 4063 dyn_cast<IntrinsicInst>(CI)); 4064 return TTI.getIntrinsicInstrCost(CostAttrs, 4065 TargetTransformInfo::TCK_RecipThroughput); 4066 } 4067 4068 static Type *smallestIntegerVectorType(Type *T1, Type *T2) { 4069 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 4070 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 4071 return I1->getBitWidth() < I2->getBitWidth() ? 
T1 : T2; 4072 } 4073 4074 static Type *largestIntegerVectorType(Type *T1, Type *T2) { 4075 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 4076 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 4077 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; 4078 } 4079 4080 void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) { 4081 // For every instruction `I` in MinBWs, truncate the operands, create a 4082 // truncated version of `I` and reextend its result. InstCombine runs 4083 // later and will remove any ext/trunc pairs. 4084 SmallPtrSet<Value *, 4> Erased; 4085 for (const auto &KV : Cost->getMinimalBitwidths()) { 4086 // If the value wasn't vectorized, we must maintain the original scalar 4087 // type. The absence of the value from State indicates that it 4088 // wasn't vectorized. 4089 // FIXME: Should not rely on getVPValue at this point. 4090 VPValue *Def = State.Plan->getVPValue(KV.first, true); 4091 if (!State.hasAnyVectorValue(Def)) 4092 continue; 4093 for (unsigned Part = 0; Part < UF; ++Part) { 4094 Value *I = State.get(Def, Part); 4095 if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I)) 4096 continue; 4097 Type *OriginalTy = I->getType(); 4098 Type *ScalarTruncatedTy = 4099 IntegerType::get(OriginalTy->getContext(), KV.second); 4100 auto *TruncatedTy = VectorType::get( 4101 ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getElementCount()); 4102 if (TruncatedTy == OriginalTy) 4103 continue; 4104 4105 IRBuilder<> B(cast<Instruction>(I)); 4106 auto ShrinkOperand = [&](Value *V) -> Value * { 4107 if (auto *ZI = dyn_cast<ZExtInst>(V)) 4108 if (ZI->getSrcTy() == TruncatedTy) 4109 return ZI->getOperand(0); 4110 return B.CreateZExtOrTrunc(V, TruncatedTy); 4111 }; 4112 4113 // The actual instruction modification depends on the instruction type, 4114 // unfortunately. 
4115 Value *NewI = nullptr; 4116 if (auto *BO = dyn_cast<BinaryOperator>(I)) { 4117 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)), 4118 ShrinkOperand(BO->getOperand(1))); 4119 4120 // Any wrapping introduced by shrinking this operation shouldn't be 4121 // considered undefined behavior. So, we can't unconditionally copy 4122 // arithmetic wrapping flags to NewI. 4123 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false); 4124 } else if (auto *CI = dyn_cast<ICmpInst>(I)) { 4125 NewI = 4126 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)), 4127 ShrinkOperand(CI->getOperand(1))); 4128 } else if (auto *SI = dyn_cast<SelectInst>(I)) { 4129 NewI = B.CreateSelect(SI->getCondition(), 4130 ShrinkOperand(SI->getTrueValue()), 4131 ShrinkOperand(SI->getFalseValue())); 4132 } else if (auto *CI = dyn_cast<CastInst>(I)) { 4133 switch (CI->getOpcode()) { 4134 default: 4135 llvm_unreachable("Unhandled cast!"); 4136 case Instruction::Trunc: 4137 NewI = ShrinkOperand(CI->getOperand(0)); 4138 break; 4139 case Instruction::SExt: 4140 NewI = B.CreateSExtOrTrunc( 4141 CI->getOperand(0), 4142 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 4143 break; 4144 case Instruction::ZExt: 4145 NewI = B.CreateZExtOrTrunc( 4146 CI->getOperand(0), 4147 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 4148 break; 4149 } 4150 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { 4151 auto Elements0 = 4152 cast<VectorType>(SI->getOperand(0)->getType())->getElementCount(); 4153 auto *O0 = B.CreateZExtOrTrunc( 4154 SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0)); 4155 auto Elements1 = 4156 cast<VectorType>(SI->getOperand(1)->getType())->getElementCount(); 4157 auto *O1 = B.CreateZExtOrTrunc( 4158 SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1)); 4159 4160 NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask()); 4161 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) { 4162 // Don't do anything with the 
operands, just extend the result. 4163 continue; 4164 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { 4165 auto Elements = 4166 cast<VectorType>(IE->getOperand(0)->getType())->getElementCount(); 4167 auto *O0 = B.CreateZExtOrTrunc( 4168 IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 4169 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); 4170 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); 4171 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { 4172 auto Elements = 4173 cast<VectorType>(EE->getOperand(0)->getType())->getElementCount(); 4174 auto *O0 = B.CreateZExtOrTrunc( 4175 EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 4176 NewI = B.CreateExtractElement(O0, EE->getOperand(2)); 4177 } else { 4178 // If we don't know what to do, be conservative and don't do anything. 4179 continue; 4180 } 4181 4182 // Lastly, extend the result. 4183 NewI->takeName(cast<Instruction>(I)); 4184 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); 4185 I->replaceAllUsesWith(Res); 4186 cast<Instruction>(I)->eraseFromParent(); 4187 Erased.insert(I); 4188 State.reset(Def, Res, Part); 4189 } 4190 } 4191 4192 // We'll have created a bunch of ZExts that are now parentless. Clean up. 4193 for (const auto &KV : Cost->getMinimalBitwidths()) { 4194 // If the value wasn't vectorized, we must maintain the original scalar 4195 // type. The absence of the value from State indicates that it 4196 // wasn't vectorized. 4197 // FIXME: Should not rely on getVPValue at this point. 
4198 VPValue *Def = State.Plan->getVPValue(KV.first, true); 4199 if (!State.hasAnyVectorValue(Def)) 4200 continue; 4201 for (unsigned Part = 0; Part < UF; ++Part) { 4202 Value *I = State.get(Def, Part); 4203 ZExtInst *Inst = dyn_cast<ZExtInst>(I); 4204 if (Inst && Inst->use_empty()) { 4205 Value *NewI = Inst->getOperand(0); 4206 Inst->eraseFromParent(); 4207 State.reset(Def, NewI, Part); 4208 } 4209 } 4210 } 4211 } 4212 4213 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) { 4214 // Insert truncates and extends for any truncated instructions as hints to 4215 // InstCombine. 4216 if (VF.isVector()) 4217 truncateToMinimalBitwidths(State); 4218 4219 // Fix widened non-induction PHIs by setting up the PHI operands. 4220 if (OrigPHIsToFix.size()) { 4221 assert(EnableVPlanNativePath && 4222 "Unexpected non-induction PHIs for fixup in non VPlan-native path"); 4223 fixNonInductionPHIs(State); 4224 } 4225 4226 // At this point every instruction in the original loop is widened to a 4227 // vector form. Now we need to fix the recurrences in the loop. These PHI 4228 // nodes are currently empty because we did not want to introduce cycles. 4229 // This is the second stage of vectorizing recurrences. 4230 fixCrossIterationPHIs(State); 4231 4232 // Forget the original basic block. 4233 PSE.getSE()->forgetLoop(OrigLoop); 4234 4235 // If we inserted an edge from the middle block to the unique exit block, 4236 // update uses outside the loop (phis) to account for the newly inserted 4237 // edge. 4238 if (!Cost->requiresScalarEpilogue(VF)) { 4239 // Fix-up external users of the induction variables. 
4240 for (auto &Entry : Legal->getInductionVars()) 4241 fixupIVUsers(Entry.first, Entry.second, 4242 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)), 4243 IVEndValues[Entry.first], LoopMiddleBlock); 4244 4245 fixLCSSAPHIs(State); 4246 } 4247 4248 for (Instruction *PI : PredicatedInstructions) 4249 sinkScalarOperands(&*PI); 4250 4251 // Remove redundant induction instructions. 4252 cse(LoopVectorBody); 4253 4254 // Set/update profile weights for the vector and remainder loops as original 4255 // loop iterations are now distributed among them. Note that original loop 4256 // represented by LoopScalarBody becomes remainder loop after vectorization. 4257 // 4258 // For cases like foldTailByMasking() and requiresScalarEpiloque() we may 4259 // end up getting slightly roughened result but that should be OK since 4260 // profile is not inherently precise anyway. Note also possible bypass of 4261 // vector code caused by legality checks is ignored, assigning all the weight 4262 // to the vector loop, optimistically. 4263 // 4264 // For scalable vectorization we can't know at compile time how many iterations 4265 // of the loop are handled in one vector iteration, so instead assume a pessimistic 4266 // vscale of '1'. 4267 setProfileInfoAfterUnrolling( 4268 LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody), 4269 LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF); 4270 } 4271 4272 void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) { 4273 // In order to support recurrences we need to be able to vectorize Phi nodes. 4274 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 4275 // stage #2: We now need to fix the recurrences by adding incoming edges to 4276 // the currently empty PHI nodes. At this point every instruction in the 4277 // original loop is widened to a vector form so we can use them to construct 4278 // the incoming edges. 
4279 VPBasicBlock *Header = State.Plan->getEntry()->getEntryBasicBlock(); 4280 for (VPRecipeBase &R : Header->phis()) { 4281 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) 4282 fixReduction(ReductionPhi, State); 4283 else if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R)) 4284 fixFirstOrderRecurrence(FOR, State); 4285 } 4286 } 4287 4288 void InnerLoopVectorizer::fixFirstOrderRecurrence(VPWidenPHIRecipe *PhiR, 4289 VPTransformState &State) { 4290 // This is the second phase of vectorizing first-order recurrences. An 4291 // overview of the transformation is described below. Suppose we have the 4292 // following loop. 4293 // 4294 // for (int i = 0; i < n; ++i) 4295 // b[i] = a[i] - a[i - 1]; 4296 // 4297 // There is a first-order recurrence on "a". For this loop, the shorthand 4298 // scalar IR looks like: 4299 // 4300 // scalar.ph: 4301 // s_init = a[-1] 4302 // br scalar.body 4303 // 4304 // scalar.body: 4305 // i = phi [0, scalar.ph], [i+1, scalar.body] 4306 // s1 = phi [s_init, scalar.ph], [s2, scalar.body] 4307 // s2 = a[i] 4308 // b[i] = s2 - s1 4309 // br cond, scalar.body, ... 4310 // 4311 // In this example, s1 is a recurrence because it's value depends on the 4312 // previous iteration. In the first phase of vectorization, we created a 4313 // vector phi v1 for s1. We now complete the vectorization and produce the 4314 // shorthand vector IR shown below (for VF = 4, UF = 1). 
4315 // 4316 // vector.ph: 4317 // v_init = vector(..., ..., ..., a[-1]) 4318 // br vector.body 4319 // 4320 // vector.body 4321 // i = phi [0, vector.ph], [i+4, vector.body] 4322 // v1 = phi [v_init, vector.ph], [v2, vector.body] 4323 // v2 = a[i, i+1, i+2, i+3]; 4324 // v3 = vector(v1(3), v2(0, 1, 2)) 4325 // b[i, i+1, i+2, i+3] = v2 - v3 4326 // br cond, vector.body, middle.block 4327 // 4328 // middle.block: 4329 // x = v2(3) 4330 // br scalar.ph 4331 // 4332 // scalar.ph: 4333 // s_init = phi [x, middle.block], [a[-1], otherwise] 4334 // br scalar.body 4335 // 4336 // After execution completes the vector loop, we extract the next value of 4337 // the recurrence (x) to use as the initial value in the scalar loop. 4338 4339 // Extract the last vector element in the middle block. This will be the 4340 // initial value for the recurrence when jumping to the scalar loop. 4341 VPValue *PreviousDef = PhiR->getBackedgeValue(); 4342 Value *Incoming = State.get(PreviousDef, UF - 1); 4343 auto *ExtractForScalar = Incoming; 4344 auto *IdxTy = Builder.getInt32Ty(); 4345 if (VF.isVector()) { 4346 auto *One = ConstantInt::get(IdxTy, 1); 4347 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4348 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF); 4349 auto *LastIdx = Builder.CreateSub(RuntimeVF, One); 4350 ExtractForScalar = Builder.CreateExtractElement(ExtractForScalar, LastIdx, 4351 "vector.recur.extract"); 4352 } 4353 // Extract the second last element in the middle block if the 4354 // Phi is used outside the loop. We need to extract the phi itself 4355 // and not the last element (the phi update in the current iteration). This 4356 // will be the value when jumping to the exit block from the LoopMiddleBlock, 4357 // when the scalar loop is not run at all. 
4358 Value *ExtractForPhiUsedOutsideLoop = nullptr; 4359 if (VF.isVector()) { 4360 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF); 4361 auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2)); 4362 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement( 4363 Incoming, Idx, "vector.recur.extract.for.phi"); 4364 } else if (UF > 1) 4365 // When loop is unrolled without vectorizing, initialize 4366 // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value 4367 // of `Incoming`. This is analogous to the vectorized case above: extracting 4368 // the second last element when VF > 1. 4369 ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2); 4370 4371 // Fix the initial value of the original recurrence in the scalar loop. 4372 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin()); 4373 PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue()); 4374 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init"); 4375 auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue(); 4376 for (auto *BB : predecessors(LoopScalarPreHeader)) { 4377 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit; 4378 Start->addIncoming(Incoming, BB); 4379 } 4380 4381 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start); 4382 Phi->setName("scalar.recur"); 4383 4384 // Finally, fix users of the recurrence outside the loop. The users will need 4385 // either the last value of the scalar recurrence or the last value of the 4386 // vector recurrence we extracted in the middle block. Since the loop is in 4387 // LCSSA form, we just need to find all the phi nodes for the original scalar 4388 // recurrence in the exit block, and then add an edge for the middle block. 
4389 // Note that LCSSA does not imply single entry when the original scalar loop 4390 // had multiple exiting edges (as we always run the last iteration in the 4391 // scalar epilogue); in that case, there is no edge from middle to exit and 4392 // and thus no phis which needed updated. 4393 if (!Cost->requiresScalarEpilogue(VF)) 4394 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) 4395 if (llvm::is_contained(LCSSAPhi.incoming_values(), Phi)) 4396 LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock); 4397 } 4398 4399 void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR, 4400 VPTransformState &State) { 4401 PHINode *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue()); 4402 // Get it's reduction variable descriptor. 4403 assert(Legal->isReductionVariable(OrigPhi) && 4404 "Unable to find the reduction variable"); 4405 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); 4406 4407 RecurKind RK = RdxDesc.getRecurrenceKind(); 4408 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue(); 4409 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr(); 4410 setDebugLocFromInst(ReductionStartValue); 4411 4412 VPValue *LoopExitInstDef = PhiR->getBackedgeValue(); 4413 // This is the vector-clone of the value that leaves the loop. 4414 Type *VecTy = State.get(LoopExitInstDef, 0)->getType(); 4415 4416 // Wrap flags are in general invalid after vectorization, clear them. 4417 clearReductionWrapFlags(RdxDesc, State); 4418 4419 // Before each round, move the insertion point right between 4420 // the PHIs and the values we are going to write. 4421 // This allows us to write both PHINodes and the extractelement 4422 // instructions. 
4423 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 4424 4425 setDebugLocFromInst(LoopExitInst); 4426 4427 Type *PhiTy = OrigPhi->getType(); 4428 // If tail is folded by masking, the vector value to leave the loop should be 4429 // a Select choosing between the vectorized LoopExitInst and vectorized Phi, 4430 // instead of the former. For an inloop reduction the reduction will already 4431 // be predicated, and does not need to be handled here. 4432 if (Cost->foldTailByMasking() && !PhiR->isInLoop()) { 4433 for (unsigned Part = 0; Part < UF; ++Part) { 4434 Value *VecLoopExitInst = State.get(LoopExitInstDef, Part); 4435 Value *Sel = nullptr; 4436 for (User *U : VecLoopExitInst->users()) { 4437 if (isa<SelectInst>(U)) { 4438 assert(!Sel && "Reduction exit feeding two selects"); 4439 Sel = U; 4440 } else 4441 assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select"); 4442 } 4443 assert(Sel && "Reduction exit feeds no select"); 4444 State.reset(LoopExitInstDef, Sel, Part); 4445 4446 // If the target can create a predicated operator for the reduction at no 4447 // extra cost in the loop (for example a predicated vadd), it can be 4448 // cheaper for the select to remain in the loop than be sunk out of it, 4449 // and so use the select value for the phi instead of the old 4450 // LoopExitValue. 4451 if (PreferPredicatedReductionSelect || 4452 TTI->preferPredicatedReductionSelect( 4453 RdxDesc.getOpcode(), PhiTy, 4454 TargetTransformInfo::ReductionFlags())) { 4455 auto *VecRdxPhi = 4456 cast<PHINode>(State.get(PhiR, Part)); 4457 VecRdxPhi->setIncomingValueForBlock( 4458 LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel); 4459 } 4460 } 4461 } 4462 4463 // If the vector reduction can be performed in a smaller type, we truncate 4464 // then extend the loop exit value to enable InstCombine to evaluate the 4465 // entire expression in the smaller type. 
4466 if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) { 4467 assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!"); 4468 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); 4469 Builder.SetInsertPoint( 4470 LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator()); 4471 VectorParts RdxParts(UF); 4472 for (unsigned Part = 0; Part < UF; ++Part) { 4473 RdxParts[Part] = State.get(LoopExitInstDef, Part); 4474 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4475 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy) 4476 : Builder.CreateZExt(Trunc, VecTy); 4477 for (User *U : llvm::make_early_inc_range(RdxParts[Part]->users())) 4478 if (U != Trunc) { 4479 U->replaceUsesOfWith(RdxParts[Part], Extnd); 4480 RdxParts[Part] = Extnd; 4481 } 4482 } 4483 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 4484 for (unsigned Part = 0; Part < UF; ++Part) { 4485 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4486 State.reset(LoopExitInstDef, RdxParts[Part], Part); 4487 } 4488 } 4489 4490 // Reduce all of the unrolled parts into a single vector. 4491 Value *ReducedPartRdx = State.get(LoopExitInstDef, 0); 4492 unsigned Op = RecurrenceDescriptor::getOpcode(RK); 4493 4494 // The middle block terminator has already been assigned a DebugLoc here (the 4495 // OrigLoop's single latch terminator). We want the whole middle block to 4496 // appear to execute on this line because: (a) it is all compiler generated, 4497 // (b) these instructions are always executed after evaluating the latch 4498 // conditional branch, and (c) other passes may add new predecessors which 4499 // terminate on this line. This is the easiest way to ensure we don't 4500 // accidentally cause an extra step back into the loop while debugging. 
4501 setDebugLocFromInst(LoopMiddleBlock->getTerminator()); 4502 if (PhiR->isOrdered()) 4503 ReducedPartRdx = State.get(LoopExitInstDef, UF - 1); 4504 else { 4505 // Floating-point operations should have some FMF to enable the reduction. 4506 IRBuilderBase::FastMathFlagGuard FMFG(Builder); 4507 Builder.setFastMathFlags(RdxDesc.getFastMathFlags()); 4508 for (unsigned Part = 1; Part < UF; ++Part) { 4509 Value *RdxPart = State.get(LoopExitInstDef, Part); 4510 if (Op != Instruction::ICmp && Op != Instruction::FCmp) { 4511 ReducedPartRdx = Builder.CreateBinOp( 4512 (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx"); 4513 } else if (RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK)) 4514 ReducedPartRdx = createSelectCmpOp(Builder, ReductionStartValue, RK, 4515 ReducedPartRdx, RdxPart); 4516 else 4517 ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart); 4518 } 4519 } 4520 4521 // Create the reduction after the loop. Note that inloop reductions create the 4522 // target reduction in the loop using a Reduction recipe. 4523 if (VF.isVector() && !PhiR->isInLoop()) { 4524 ReducedPartRdx = 4525 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, OrigPhi); 4526 // If the reduction can be performed in a smaller type, we need to extend 4527 // the reduction to the wider type before we branch to the original loop. 4528 if (PhiTy != RdxDesc.getRecurrenceType()) 4529 ReducedPartRdx = RdxDesc.isSigned() 4530 ? Builder.CreateSExt(ReducedPartRdx, PhiTy) 4531 : Builder.CreateZExt(ReducedPartRdx, PhiTy); 4532 } 4533 4534 // Create a phi node that merges control-flow from the backedge-taken check 4535 // block and the middle block. 
4536 PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx", 4537 LoopScalarPreHeader->getTerminator()); 4538 for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) 4539 BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]); 4540 BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); 4541 4542 // Now, we need to fix the users of the reduction variable 4543 // inside and outside of the scalar remainder loop. 4544 4545 // We know that the loop is in LCSSA form. We need to update the PHI nodes 4546 // in the exit blocks. See comment on analogous loop in 4547 // fixFirstOrderRecurrence for a more complete explaination of the logic. 4548 if (!Cost->requiresScalarEpilogue(VF)) 4549 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) 4550 if (llvm::is_contained(LCSSAPhi.incoming_values(), LoopExitInst)) 4551 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock); 4552 4553 // Fix the scalar loop reduction variable with the incoming reduction sum 4554 // from the vector body and from the backedge value. 4555 int IncomingEdgeBlockIdx = 4556 OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch()); 4557 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); 4558 // Pick the other block. 4559 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 
0 : 1); 4560 OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); 4561 OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); 4562 } 4563 4564 void InnerLoopVectorizer::clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc, 4565 VPTransformState &State) { 4566 RecurKind RK = RdxDesc.getRecurrenceKind(); 4567 if (RK != RecurKind::Add && RK != RecurKind::Mul) 4568 return; 4569 4570 Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr(); 4571 assert(LoopExitInstr && "null loop exit instruction"); 4572 SmallVector<Instruction *, 8> Worklist; 4573 SmallPtrSet<Instruction *, 8> Visited; 4574 Worklist.push_back(LoopExitInstr); 4575 Visited.insert(LoopExitInstr); 4576 4577 while (!Worklist.empty()) { 4578 Instruction *Cur = Worklist.pop_back_val(); 4579 if (isa<OverflowingBinaryOperator>(Cur)) 4580 for (unsigned Part = 0; Part < UF; ++Part) { 4581 // FIXME: Should not rely on getVPValue at this point. 4582 Value *V = State.get(State.Plan->getVPValue(Cur, true), Part); 4583 cast<Instruction>(V)->dropPoisonGeneratingFlags(); 4584 } 4585 4586 for (User *U : Cur->users()) { 4587 Instruction *UI = cast<Instruction>(U); 4588 if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) && 4589 Visited.insert(UI).second) 4590 Worklist.push_back(UI); 4591 } 4592 } 4593 } 4594 4595 void InnerLoopVectorizer::fixLCSSAPHIs(VPTransformState &State) { 4596 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 4597 if (LCSSAPhi.getBasicBlockIndex(LoopMiddleBlock) != -1) 4598 // Some phis were already hand updated by the reduction and recurrence 4599 // code above, leave them alone. 4600 continue; 4601 4602 auto *IncomingValue = LCSSAPhi.getIncomingValue(0); 4603 // Non-instruction incoming values will have only one value. 
4604 4605 VPLane Lane = VPLane::getFirstLane(); 4606 if (isa<Instruction>(IncomingValue) && 4607 !Cost->isUniformAfterVectorization(cast<Instruction>(IncomingValue), 4608 VF)) 4609 Lane = VPLane::getLastLaneForVF(VF); 4610 4611 // Can be a loop invariant incoming value or the last scalar value to be 4612 // extracted from the vectorized loop. 4613 // FIXME: Should not rely on getVPValue at this point. 4614 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4615 Value *lastIncomingValue = 4616 OrigLoop->isLoopInvariant(IncomingValue) 4617 ? IncomingValue 4618 : State.get(State.Plan->getVPValue(IncomingValue, true), 4619 VPIteration(UF - 1, Lane)); 4620 LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock); 4621 } 4622 } 4623 4624 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { 4625 // The basic block and loop containing the predicated instruction. 4626 auto *PredBB = PredInst->getParent(); 4627 auto *VectorLoop = LI->getLoopFor(PredBB); 4628 4629 // Initialize a worklist with the operands of the predicated instruction. 4630 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end()); 4631 4632 // Holds instructions that we need to analyze again. An instruction may be 4633 // reanalyzed if we don't yet know if we can sink it or not. 4634 SmallVector<Instruction *, 8> InstsToReanalyze; 4635 4636 // Returns true if a given use occurs in the predicated block. Phi nodes use 4637 // their operands in their corresponding predecessor blocks. 4638 auto isBlockOfUsePredicated = [&](Use &U) -> bool { 4639 auto *I = cast<Instruction>(U.getUser()); 4640 BasicBlock *BB = I->getParent(); 4641 if (auto *Phi = dyn_cast<PHINode>(I)) 4642 BB = Phi->getIncomingBlock( 4643 PHINode::getIncomingValueNumForOperand(U.getOperandNo())); 4644 return BB == PredBB; 4645 }; 4646 4647 // Iteratively sink the scalarized operands of the predicated instruction 4648 // into the block we created for it. 
When an instruction is sunk, it's 4649 // operands are then added to the worklist. The algorithm ends after one pass 4650 // through the worklist doesn't sink a single instruction. 4651 bool Changed; 4652 do { 4653 // Add the instructions that need to be reanalyzed to the worklist, and 4654 // reset the changed indicator. 4655 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end()); 4656 InstsToReanalyze.clear(); 4657 Changed = false; 4658 4659 while (!Worklist.empty()) { 4660 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val()); 4661 4662 // We can't sink an instruction if it is a phi node, is not in the loop, 4663 // or may have side effects. 4664 if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) || 4665 I->mayHaveSideEffects()) 4666 continue; 4667 4668 // If the instruction is already in PredBB, check if we can sink its 4669 // operands. In that case, VPlan's sinkScalarOperands() succeeded in 4670 // sinking the scalar instruction I, hence it appears in PredBB; but it 4671 // may have failed to sink I's operands (recursively), which we try 4672 // (again) here. 4673 if (I->getParent() == PredBB) { 4674 Worklist.insert(I->op_begin(), I->op_end()); 4675 continue; 4676 } 4677 4678 // It's legal to sink the instruction if all its uses occur in the 4679 // predicated block. Otherwise, there's nothing to do yet, and we may 4680 // need to reanalyze the instruction. 4681 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) { 4682 InstsToReanalyze.push_back(I); 4683 continue; 4684 } 4685 4686 // Move the instruction to the beginning of the predicated block, and add 4687 // it's operands to the worklist. 4688 I->moveBefore(&*PredBB->getFirstInsertionPt()); 4689 Worklist.insert(I->op_begin(), I->op_end()); 4690 4691 // The sinking may have enabled other instructions to be sunk, so we will 4692 // need to iterate. 
4693 Changed = true; 4694 } 4695 } while (Changed); 4696 } 4697 4698 void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) { 4699 for (PHINode *OrigPhi : OrigPHIsToFix) { 4700 VPWidenPHIRecipe *VPPhi = 4701 cast<VPWidenPHIRecipe>(State.Plan->getVPValue(OrigPhi)); 4702 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0)); 4703 // Make sure the builder has a valid insert point. 4704 Builder.SetInsertPoint(NewPhi); 4705 for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) { 4706 VPValue *Inc = VPPhi->getIncomingValue(i); 4707 VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i); 4708 NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]); 4709 } 4710 } 4711 } 4712 4713 bool InnerLoopVectorizer::useOrderedReductions(RecurrenceDescriptor &RdxDesc) { 4714 return Cost->useOrderedReductions(RdxDesc); 4715 } 4716 4717 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, 4718 VPWidenPHIRecipe *PhiR, 4719 VPTransformState &State) { 4720 PHINode *P = cast<PHINode>(PN); 4721 if (EnableVPlanNativePath) { 4722 // Currently we enter here in the VPlan-native path for non-induction 4723 // PHIs where all control flow is uniform. We simply widen these PHIs. 4724 // Create a vector phi with no operands - the vector phi operands will be 4725 // set at the end of vector code generation. 4726 Type *VecTy = (State.VF.isScalar()) 4727 ? PN->getType() 4728 : VectorType::get(PN->getType(), State.VF); 4729 Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi"); 4730 State.set(PhiR, VecPhi, 0); 4731 OrigPHIsToFix.push_back(P); 4732 4733 return; 4734 } 4735 4736 assert(PN->getParent() == OrigLoop->getHeader() && 4737 "Non-header phis should have been handled elsewhere"); 4738 4739 // In order to support recurrences we need to be able to vectorize Phi nodes. 4740 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 4741 // stage #1: We create a new vector PHI node with no incoming edges. 
We'll use 4742 // this value when we vectorize all of the instructions that use the PHI. 4743 4744 assert(!Legal->isReductionVariable(P) && 4745 "reductions should be handled elsewhere"); 4746 4747 setDebugLocFromInst(P); 4748 4749 // This PHINode must be an induction variable. 4750 // Make sure that we know about it. 4751 assert(Legal->getInductionVars().count(P) && "Not an induction variable"); 4752 4753 InductionDescriptor II = Legal->getInductionVars().lookup(P); 4754 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 4755 4756 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 4757 // which can be found from the original scalar operations. 4758 switch (II.getKind()) { 4759 case InductionDescriptor::IK_NoInduction: 4760 llvm_unreachable("Unknown induction"); 4761 case InductionDescriptor::IK_IntInduction: 4762 case InductionDescriptor::IK_FpInduction: 4763 llvm_unreachable("Integer/fp induction is handled elsewhere."); 4764 case InductionDescriptor::IK_PtrInduction: { 4765 // Handle the pointer induction variable case. 4766 assert(P->getType()->isPointerTy() && "Unexpected type."); 4767 4768 if (Cost->isScalarAfterVectorization(P, State.VF)) { 4769 // This is the normalized GEP that starts counting at zero. 4770 Value *PtrInd = 4771 Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType()); 4772 // Determine the number of scalars we need to generate for each unroll 4773 // iteration. If the instruction is uniform, we only need to generate the 4774 // first lane. Otherwise, we generate all VF values. 4775 bool IsUniform = Cost->isUniformAfterVectorization(P, State.VF); 4776 assert((IsUniform || !State.VF.isScalable()) && 4777 "Cannot scalarize a scalable VF"); 4778 unsigned Lanes = IsUniform ? 
1 : State.VF.getFixedValue(); 4779 4780 for (unsigned Part = 0; Part < UF; ++Part) { 4781 Value *PartStart = 4782 createStepForVF(Builder, PtrInd->getType(), VF, Part); 4783 4784 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 4785 Value *Idx = Builder.CreateAdd( 4786 PartStart, ConstantInt::get(PtrInd->getType(), Lane)); 4787 Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); 4788 Value *SclrGep = 4789 emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II); 4790 SclrGep->setName("next.gep"); 4791 State.set(PhiR, SclrGep, VPIteration(Part, Lane)); 4792 } 4793 } 4794 return; 4795 } 4796 assert(isa<SCEVConstant>(II.getStep()) && 4797 "Induction step not a SCEV constant!"); 4798 Type *PhiType = II.getStep()->getType(); 4799 4800 // Build a pointer phi 4801 Value *ScalarStartValue = II.getStartValue(); 4802 Type *ScStValueType = ScalarStartValue->getType(); 4803 PHINode *NewPointerPhi = 4804 PHINode::Create(ScStValueType, 2, "pointer.phi", Induction); 4805 NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader); 4806 4807 // A pointer induction, performed by using a gep 4808 BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 4809 Instruction *InductionLoc = LoopLatch->getTerminator(); 4810 const SCEV *ScalarStep = II.getStep(); 4811 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 4812 Value *ScalarStepValue = 4813 Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc); 4814 Value *RuntimeVF = getRuntimeVF(Builder, PhiType, VF); 4815 Value *NumUnrolledElems = 4816 Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF)); 4817 Value *InductionGEP = GetElementPtrInst::Create( 4818 II.getElementType(), NewPointerPhi, 4819 Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind", 4820 InductionLoc); 4821 NewPointerPhi->addIncoming(InductionGEP, LoopLatch); 4822 4823 // Create UF many actual address geps that use the pointer 4824 // phi as base and a vectorized version of the step value 4825 // (<step*0, ..., step*N>) as 
offset. 4826 for (unsigned Part = 0; Part < State.UF; ++Part) { 4827 Type *VecPhiType = VectorType::get(PhiType, State.VF); 4828 Value *StartOffsetScalar = 4829 Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part)); 4830 Value *StartOffset = 4831 Builder.CreateVectorSplat(State.VF, StartOffsetScalar); 4832 // Create a vector of consecutive numbers from zero to VF. 4833 StartOffset = 4834 Builder.CreateAdd(StartOffset, Builder.CreateStepVector(VecPhiType)); 4835 4836 Value *GEP = Builder.CreateGEP( 4837 II.getElementType(), NewPointerPhi, 4838 Builder.CreateMul( 4839 StartOffset, Builder.CreateVectorSplat(State.VF, ScalarStepValue), 4840 "vector.gep")); 4841 State.set(PhiR, GEP, Part); 4842 } 4843 } 4844 } 4845 } 4846 4847 /// A helper function for checking whether an integer division-related 4848 /// instruction may divide by zero (in which case it must be predicated if 4849 /// executed conditionally in the scalar code). 4850 /// TODO: It may be worthwhile to generalize and check isKnownNonZero(). 4851 /// Non-zero divisors that are non compile-time constants will not be 4852 /// converted into multiplication, so we will still end up scalarizing 4853 /// the division, but can do so w/o predication. 
4854 static bool mayDivideByZero(Instruction &I) { 4855 assert((I.getOpcode() == Instruction::UDiv || 4856 I.getOpcode() == Instruction::SDiv || 4857 I.getOpcode() == Instruction::URem || 4858 I.getOpcode() == Instruction::SRem) && 4859 "Unexpected instruction"); 4860 Value *Divisor = I.getOperand(1); 4861 auto *CInt = dyn_cast<ConstantInt>(Divisor); 4862 return !CInt || CInt->isZero(); 4863 } 4864 4865 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, 4866 VPUser &ArgOperands, 4867 VPTransformState &State) { 4868 assert(!isa<DbgInfoIntrinsic>(I) && 4869 "DbgInfoIntrinsic should have been dropped during VPlan construction"); 4870 setDebugLocFromInst(&I); 4871 4872 Module *M = I.getParent()->getParent()->getParent(); 4873 auto *CI = cast<CallInst>(&I); 4874 4875 SmallVector<Type *, 4> Tys; 4876 for (Value *ArgOperand : CI->args()) 4877 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue())); 4878 4879 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 4880 4881 // The flag shows whether we use Intrinsic or a usual Call for vectorized 4882 // version of the instruction. 4883 // Is it beneficial to perform intrinsic call compared to lib call? 4884 bool NeedToScalarize = false; 4885 InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); 4886 InstructionCost IntrinsicCost = ID ? 
Cost->getVectorIntrinsicCost(CI, VF) : 0; 4887 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 4888 assert((UseVectorIntrinsic || !NeedToScalarize) && 4889 "Instruction should be scalarized elsewhere."); 4890 assert((IntrinsicCost.isValid() || CallCost.isValid()) && 4891 "Either the intrinsic cost or vector call cost must be valid"); 4892 4893 for (unsigned Part = 0; Part < UF; ++Part) { 4894 SmallVector<Type *, 2> TysForDecl = {CI->getType()}; 4895 SmallVector<Value *, 4> Args; 4896 for (auto &I : enumerate(ArgOperands.operands())) { 4897 // Some intrinsics have a scalar argument - don't replace it with a 4898 // vector. 4899 Value *Arg; 4900 if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index())) 4901 Arg = State.get(I.value(), Part); 4902 else { 4903 Arg = State.get(I.value(), VPIteration(0, 0)); 4904 if (hasVectorInstrinsicOverloadedScalarOpd(ID, I.index())) 4905 TysForDecl.push_back(Arg->getType()); 4906 } 4907 Args.push_back(Arg); 4908 } 4909 4910 Function *VectorF; 4911 if (UseVectorIntrinsic) { 4912 // Use vector version of the intrinsic. 4913 if (VF.isVector()) 4914 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); 4915 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); 4916 assert(VectorF && "Can't retrieve vector intrinsic."); 4917 } else { 4918 // Use vector version of the function call. 
4919 const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 4920 #ifndef NDEBUG 4921 assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr && 4922 "Can't create vector function."); 4923 #endif 4924 VectorF = VFDatabase(*CI).getVectorizedFunction(Shape); 4925 } 4926 SmallVector<OperandBundleDef, 1> OpBundles; 4927 CI->getOperandBundlesAsDefs(OpBundles); 4928 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles); 4929 4930 if (isa<FPMathOperator>(V)) 4931 V->copyFastMathFlags(CI); 4932 4933 State.set(Def, V, Part); 4934 addMetadata(V, &I); 4935 } 4936 } 4937 4938 void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, VPValue *VPDef, 4939 VPUser &Operands, 4940 bool InvariantCond, 4941 VPTransformState &State) { 4942 setDebugLocFromInst(&I); 4943 4944 // The condition can be loop invariant but still defined inside the 4945 // loop. This means that we can't just use the original 'cond' value. 4946 // We have to take the 'vectorized' value and pick the first lane. 4947 // Instcombine will make this a no-op. 4948 auto *InvarCond = InvariantCond 4949 ? State.get(Operands.getOperand(0), VPIteration(0, 0)) 4950 : nullptr; 4951 4952 for (unsigned Part = 0; Part < UF; ++Part) { 4953 Value *Cond = 4954 InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part); 4955 Value *Op0 = State.get(Operands.getOperand(1), Part); 4956 Value *Op1 = State.get(Operands.getOperand(2), Part); 4957 Value *Sel = Builder.CreateSelect(Cond, Op0, Op1); 4958 State.set(VPDef, Sel, Part); 4959 addMetadata(Sel, &I); 4960 } 4961 } 4962 4963 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { 4964 // We should not collect Scalars more than once per VF. Right now, this 4965 // function is called from collectUniformsAndScalars(), which already does 4966 // this check. Collecting Scalars for VF=1 does not make any sense. 
4967 assert(VF.isVector() && Scalars.find(VF) == Scalars.end() && 4968 "This function should not be visited twice for the same VF"); 4969 4970 SmallSetVector<Instruction *, 8> Worklist; 4971 4972 // These sets are used to seed the analysis with pointers used by memory 4973 // accesses that will remain scalar. 4974 SmallSetVector<Instruction *, 8> ScalarPtrs; 4975 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; 4976 auto *Latch = TheLoop->getLoopLatch(); 4977 4978 // A helper that returns true if the use of Ptr by MemAccess will be scalar. 4979 // The pointer operands of loads and stores will be scalar as long as the 4980 // memory access is not a gather or scatter operation. The value operand of a 4981 // store will remain scalar if the store is scalarized. 4982 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) { 4983 InstWidening WideningDecision = getWideningDecision(MemAccess, VF); 4984 assert(WideningDecision != CM_Unknown && 4985 "Widening decision should be ready at this moment"); 4986 if (auto *Store = dyn_cast<StoreInst>(MemAccess)) 4987 if (Ptr == Store->getValueOperand()) 4988 return WideningDecision == CM_Scalarize; 4989 assert(Ptr == getLoadStorePointerOperand(MemAccess) && 4990 "Ptr is neither a value or pointer operand"); 4991 return WideningDecision != CM_GatherScatter; 4992 }; 4993 4994 // A helper that returns true if the given value is a bitcast or 4995 // getelementptr instruction contained in the loop. 4996 auto isLoopVaryingBitCastOrGEP = [&](Value *V) { 4997 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) || 4998 isa<GetElementPtrInst>(V)) && 4999 !TheLoop->isLoopInvariant(V); 5000 }; 5001 5002 // A helper that evaluates a memory access's use of a pointer. If the use will 5003 // be a scalar use and the pointer is only used by memory accesses, we place 5004 // the pointer in ScalarPtrs. Otherwise, the pointer is placed in 5005 // PossibleNonScalarPtrs. 
5006 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) { 5007 // We only care about bitcast and getelementptr instructions contained in 5008 // the loop. 5009 if (!isLoopVaryingBitCastOrGEP(Ptr)) 5010 return; 5011 5012 // If the pointer has already been identified as scalar (e.g., if it was 5013 // also identified as uniform), there's nothing to do. 5014 auto *I = cast<Instruction>(Ptr); 5015 if (Worklist.count(I)) 5016 return; 5017 5018 // If the use of the pointer will be a scalar use, and all users of the 5019 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise, 5020 // place the pointer in PossibleNonScalarPtrs. 5021 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) { 5022 return isa<LoadInst>(U) || isa<StoreInst>(U); 5023 })) 5024 ScalarPtrs.insert(I); 5025 else 5026 PossibleNonScalarPtrs.insert(I); 5027 }; 5028 5029 // We seed the scalars analysis with three classes of instructions: (1) 5030 // instructions marked uniform-after-vectorization and (2) bitcast, 5031 // getelementptr and (pointer) phi instructions used by memory accesses 5032 // requiring a scalar use. 5033 // 5034 // (1) Add to the worklist all instructions that have been identified as 5035 // uniform-after-vectorization. 5036 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end()); 5037 5038 // (2) Add to the worklist all bitcast and getelementptr instructions used by 5039 // memory accesses requiring a scalar use. The pointer operands of loads and 5040 // stores will be scalar as long as the memory accesses is not a gather or 5041 // scatter operation. The value operand of a store will remain scalar if the 5042 // store is scalarized. 
5043 for (auto *BB : TheLoop->blocks()) 5044 for (auto &I : *BB) { 5045 if (auto *Load = dyn_cast<LoadInst>(&I)) { 5046 evaluatePtrUse(Load, Load->getPointerOperand()); 5047 } else if (auto *Store = dyn_cast<StoreInst>(&I)) { 5048 evaluatePtrUse(Store, Store->getPointerOperand()); 5049 evaluatePtrUse(Store, Store->getValueOperand()); 5050 } 5051 } 5052 for (auto *I : ScalarPtrs) 5053 if (!PossibleNonScalarPtrs.count(I)) { 5054 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n"); 5055 Worklist.insert(I); 5056 } 5057 5058 // Insert the forced scalars. 5059 // FIXME: Currently widenPHIInstruction() often creates a dead vector 5060 // induction variable when the PHI user is scalarized. 5061 auto ForcedScalar = ForcedScalars.find(VF); 5062 if (ForcedScalar != ForcedScalars.end()) 5063 for (auto *I : ForcedScalar->second) 5064 Worklist.insert(I); 5065 5066 // Expand the worklist by looking through any bitcasts and getelementptr 5067 // instructions we've already identified as scalar. This is similar to the 5068 // expansion step in collectLoopUniforms(); however, here we're only 5069 // expanding to include additional bitcasts and getelementptr instructions. 5070 unsigned Idx = 0; 5071 while (Idx != Worklist.size()) { 5072 Instruction *Dst = Worklist[Idx++]; 5073 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0))) 5074 continue; 5075 auto *Src = cast<Instruction>(Dst->getOperand(0)); 5076 if (llvm::all_of(Src->users(), [&](User *U) -> bool { 5077 auto *J = cast<Instruction>(U); 5078 return !TheLoop->contains(J) || Worklist.count(J) || 5079 ((isa<LoadInst>(J) || isa<StoreInst>(J)) && 5080 isScalarUse(J, Src)); 5081 })) { 5082 Worklist.insert(Src); 5083 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); 5084 } 5085 } 5086 5087 // An induction variable will remain scalar if all users of the induction 5088 // variable and induction variable update remain scalar. 
5089 for (auto &Induction : Legal->getInductionVars()) { 5090 auto *Ind = Induction.first; 5091 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 5092 5093 // If tail-folding is applied, the primary induction variable will be used 5094 // to feed a vector compare. 5095 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking()) 5096 continue; 5097 5098 // Returns true if \p Indvar is a pointer induction that is used directly by 5099 // load/store instruction \p I. 5100 auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar, 5101 Instruction *I) { 5102 return Induction.second.getKind() == 5103 InductionDescriptor::IK_PtrInduction && 5104 (isa<LoadInst>(I) || isa<StoreInst>(I)) && 5105 Indvar == getLoadStorePointerOperand(I) && isScalarUse(I, Indvar); 5106 }; 5107 5108 // Determine if all users of the induction variable are scalar after 5109 // vectorization. 5110 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 5111 auto *I = cast<Instruction>(U); 5112 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 5113 IsDirectLoadStoreFromPtrIndvar(Ind, I); 5114 }); 5115 if (!ScalarInd) 5116 continue; 5117 5118 // Determine if all users of the induction variable update instruction are 5119 // scalar after vectorization. 5120 auto ScalarIndUpdate = 5121 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 5122 auto *I = cast<Instruction>(U); 5123 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 5124 IsDirectLoadStoreFromPtrIndvar(IndUpdate, I); 5125 }); 5126 if (!ScalarIndUpdate) 5127 continue; 5128 5129 // The induction variable and its update instruction will remain scalar. 
5130 Worklist.insert(Ind); 5131 Worklist.insert(IndUpdate); 5132 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 5133 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 5134 << "\n"); 5135 } 5136 5137 Scalars[VF].insert(Worklist.begin(), Worklist.end()); 5138 } 5139 5140 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I) const { 5141 if (!blockNeedsPredicationForAnyReason(I->getParent())) 5142 return false; 5143 switch(I->getOpcode()) { 5144 default: 5145 break; 5146 case Instruction::Load: 5147 case Instruction::Store: { 5148 if (!Legal->isMaskRequired(I)) 5149 return false; 5150 auto *Ptr = getLoadStorePointerOperand(I); 5151 auto *Ty = getLoadStoreType(I); 5152 const Align Alignment = getLoadStoreAlignment(I); 5153 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) || 5154 TTI.isLegalMaskedGather(Ty, Alignment)) 5155 : !(isLegalMaskedStore(Ty, Ptr, Alignment) || 5156 TTI.isLegalMaskedScatter(Ty, Alignment)); 5157 } 5158 case Instruction::UDiv: 5159 case Instruction::SDiv: 5160 case Instruction::SRem: 5161 case Instruction::URem: 5162 return mayDivideByZero(*I); 5163 } 5164 return false; 5165 } 5166 5167 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( 5168 Instruction *I, ElementCount VF) { 5169 assert(isAccessInterleaved(I) && "Expecting interleaved access."); 5170 assert(getWideningDecision(I, VF) == CM_Unknown && 5171 "Decision should not be set yet."); 5172 auto *Group = getInterleavedAccessGroup(I); 5173 assert(Group && "Must have a group."); 5174 5175 // If the instruction's allocated size doesn't equal it's type size, it 5176 // requires padding and will be scalarized. 5177 auto &DL = I->getModule()->getDataLayout(); 5178 auto *ScalarTy = getLoadStoreType(I); 5179 if (hasIrregularType(ScalarTy, DL)) 5180 return false; 5181 5182 // Check if masking is required. 
5183 // A Group may need masking for one of two reasons: it resides in a block that 5184 // needs predication, or it was decided to use masking to deal with gaps 5185 // (either a gap at the end of a load-access that may result in a speculative 5186 // load, or any gaps in a store-access). 5187 bool PredicatedAccessRequiresMasking = 5188 blockNeedsPredicationForAnyReason(I->getParent()) && 5189 Legal->isMaskRequired(I); 5190 bool LoadAccessWithGapsRequiresEpilogMasking = 5191 isa<LoadInst>(I) && Group->requiresScalarEpilogue() && 5192 !isScalarEpilogueAllowed(); 5193 bool StoreAccessWithGapsRequiresMasking = 5194 isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()); 5195 if (!PredicatedAccessRequiresMasking && 5196 !LoadAccessWithGapsRequiresEpilogMasking && 5197 !StoreAccessWithGapsRequiresMasking) 5198 return true; 5199 5200 // If masked interleaving is required, we expect that the user/target had 5201 // enabled it, because otherwise it either wouldn't have been created or 5202 // it should have been invalidated by the CostModel. 5203 assert(useMaskedInterleavedAccesses(TTI) && 5204 "Masked interleave-groups for predicated accesses are not enabled."); 5205 5206 if (Group->isReverse()) 5207 return false; 5208 5209 auto *Ty = getLoadStoreType(I); 5210 const Align Alignment = getLoadStoreAlignment(I); 5211 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment) 5212 : TTI.isLegalMaskedStore(Ty, Alignment); 5213 } 5214 5215 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( 5216 Instruction *I, ElementCount VF) { 5217 // Get and ensure we have a valid memory instruction. 5218 assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction"); 5219 5220 auto *Ptr = getLoadStorePointerOperand(I); 5221 auto *ScalarTy = getLoadStoreType(I); 5222 5223 // In order to be widened, the pointer should be consecutive, first of all. 
5224 if (!Legal->isConsecutivePtr(ScalarTy, Ptr)) 5225 return false; 5226 5227 // If the instruction is a store located in a predicated block, it will be 5228 // scalarized. 5229 if (isScalarWithPredication(I)) 5230 return false; 5231 5232 // If the instruction's allocated size doesn't equal it's type size, it 5233 // requires padding and will be scalarized. 5234 auto &DL = I->getModule()->getDataLayout(); 5235 if (hasIrregularType(ScalarTy, DL)) 5236 return false; 5237 5238 return true; 5239 } 5240 5241 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { 5242 // We should not collect Uniforms more than once per VF. Right now, 5243 // this function is called from collectUniformsAndScalars(), which 5244 // already does this check. Collecting Uniforms for VF=1 does not make any 5245 // sense. 5246 5247 assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() && 5248 "This function should not be visited twice for the same VF"); 5249 5250 // Visit the list of Uniforms. If we'll not find any uniform value, we'll 5251 // not analyze again. Uniforms.count(VF) will return 1. 5252 Uniforms[VF].clear(); 5253 5254 // We now know that the loop is vectorizable! 5255 // Collect instructions inside the loop that will remain uniform after 5256 // vectorization. 5257 5258 // Global values, params and instructions outside of current loop are out of 5259 // scope. 5260 auto isOutOfScope = [&](Value *V) -> bool { 5261 Instruction *I = dyn_cast<Instruction>(V); 5262 return (!I || !TheLoop->contains(I)); 5263 }; 5264 5265 // Worklist containing uniform instructions demanding lane 0. 5266 SetVector<Instruction *> Worklist; 5267 BasicBlock *Latch = TheLoop->getLoopLatch(); 5268 5269 // Add uniform instructions demanding lane 0 to the worklist. 
Instructions 5270 // that are scalar with predication must not be considered uniform after 5271 // vectorization, because that would create an erroneous replicating region 5272 // where only a single instance out of VF should be formed. 5273 // TODO: optimize such seldom cases if found important, see PR40816. 5274 auto addToWorklistIfAllowed = [&](Instruction *I) -> void { 5275 if (isOutOfScope(I)) { 5276 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: " 5277 << *I << "\n"); 5278 return; 5279 } 5280 if (isScalarWithPredication(I)) { 5281 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " 5282 << *I << "\n"); 5283 return; 5284 } 5285 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n"); 5286 Worklist.insert(I); 5287 }; 5288 5289 // Start with the conditional branch. If the branch condition is an 5290 // instruction contained in the loop that is only used by the branch, it is 5291 // uniform. 5292 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 5293 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) 5294 addToWorklistIfAllowed(Cmp); 5295 5296 auto isUniformDecision = [&](Instruction *I, ElementCount VF) { 5297 InstWidening WideningDecision = getWideningDecision(I, VF); 5298 assert(WideningDecision != CM_Unknown && 5299 "Widening decision should be ready at this moment"); 5300 5301 // A uniform memory op is itself uniform. We exclude uniform stores 5302 // here as they demand the last lane, not the first one. 5303 if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) { 5304 assert(WideningDecision == CM_Scalarize); 5305 return true; 5306 } 5307 5308 return (WideningDecision == CM_Widen || 5309 WideningDecision == CM_Widen_Reverse || 5310 WideningDecision == CM_Interleave); 5311 }; 5312 5313 5314 // Returns true if Ptr is the pointer operand of a memory access instruction 5315 // I, and I is known to not require scalarization. 
5316 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { 5317 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); 5318 }; 5319 5320 // Holds a list of values which are known to have at least one uniform use. 5321 // Note that there may be other uses which aren't uniform. A "uniform use" 5322 // here is something which only demands lane 0 of the unrolled iterations; 5323 // it does not imply that all lanes produce the same value (e.g. this is not 5324 // the usual meaning of uniform) 5325 SetVector<Value *> HasUniformUse; 5326 5327 // Scan the loop for instructions which are either a) known to have only 5328 // lane 0 demanded or b) are uses which demand only lane 0 of their operand. 5329 for (auto *BB : TheLoop->blocks()) 5330 for (auto &I : *BB) { 5331 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) { 5332 switch (II->getIntrinsicID()) { 5333 case Intrinsic::sideeffect: 5334 case Intrinsic::experimental_noalias_scope_decl: 5335 case Intrinsic::assume: 5336 case Intrinsic::lifetime_start: 5337 case Intrinsic::lifetime_end: 5338 if (TheLoop->hasLoopInvariantOperands(&I)) 5339 addToWorklistIfAllowed(&I); 5340 break; 5341 default: 5342 break; 5343 } 5344 } 5345 5346 // ExtractValue instructions must be uniform, because the operands are 5347 // known to be loop-invariant. 5348 if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) { 5349 assert(isOutOfScope(EVI->getAggregateOperand()) && 5350 "Expected aggregate value to be loop invariant"); 5351 addToWorklistIfAllowed(EVI); 5352 continue; 5353 } 5354 5355 // If there's no pointer operand, there's nothing to do. 5356 auto *Ptr = getLoadStorePointerOperand(&I); 5357 if (!Ptr) 5358 continue; 5359 5360 // A uniform memory op is itself uniform. We exclude uniform stores 5361 // here as they demand the last lane, not the first one. 
5362 if (isa<LoadInst>(I) && Legal->isUniformMemOp(I)) 5363 addToWorklistIfAllowed(&I); 5364 5365 if (isUniformDecision(&I, VF)) { 5366 assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check"); 5367 HasUniformUse.insert(Ptr); 5368 } 5369 } 5370 5371 // Add to the worklist any operands which have *only* uniform (e.g. lane 0 5372 // demanding) users. Since loops are assumed to be in LCSSA form, this 5373 // disallows uses outside the loop as well. 5374 for (auto *V : HasUniformUse) { 5375 if (isOutOfScope(V)) 5376 continue; 5377 auto *I = cast<Instruction>(V); 5378 auto UsersAreMemAccesses = 5379 llvm::all_of(I->users(), [&](User *U) -> bool { 5380 return isVectorizedMemAccessUse(cast<Instruction>(U), V); 5381 }); 5382 if (UsersAreMemAccesses) 5383 addToWorklistIfAllowed(I); 5384 } 5385 5386 // Expand Worklist in topological order: whenever a new instruction 5387 // is added , its users should be already inside Worklist. It ensures 5388 // a uniform instruction will only be used by uniform instructions. 5389 unsigned idx = 0; 5390 while (idx != Worklist.size()) { 5391 Instruction *I = Worklist[idx++]; 5392 5393 for (auto OV : I->operand_values()) { 5394 // isOutOfScope operands cannot be uniform instructions. 5395 if (isOutOfScope(OV)) 5396 continue; 5397 // First order recurrence Phi's should typically be considered 5398 // non-uniform. 5399 auto *OP = dyn_cast<PHINode>(OV); 5400 if (OP && Legal->isFirstOrderRecurrence(OP)) 5401 continue; 5402 // If all the users of the operand are uniform, then add the 5403 // operand into the uniform worklist. 5404 auto *OI = cast<Instruction>(OV); 5405 if (llvm::all_of(OI->users(), [&](User *U) -> bool { 5406 auto *J = cast<Instruction>(U); 5407 return Worklist.count(J) || isVectorizedMemAccessUse(J, OI); 5408 })) 5409 addToWorklistIfAllowed(OI); 5410 } 5411 } 5412 5413 // For an instruction to be added into Worklist above, all its users inside 5414 // the loop should also be in Worklist. 
However, this condition cannot be 5415 // true for phi nodes that form a cyclic dependence. We must process phi 5416 // nodes separately. An induction variable will remain uniform if all users 5417 // of the induction variable and induction variable update remain uniform. 5418 // The code below handles both pointer and non-pointer induction variables. 5419 for (auto &Induction : Legal->getInductionVars()) { 5420 auto *Ind = Induction.first; 5421 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 5422 5423 // Determine if all users of the induction variable are uniform after 5424 // vectorization. 5425 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 5426 auto *I = cast<Instruction>(U); 5427 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 5428 isVectorizedMemAccessUse(I, Ind); 5429 }); 5430 if (!UniformInd) 5431 continue; 5432 5433 // Determine if all users of the induction variable update instruction are 5434 // uniform after vectorization. 5435 auto UniformIndUpdate = 5436 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 5437 auto *I = cast<Instruction>(U); 5438 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 5439 isVectorizedMemAccessUse(I, IndUpdate); 5440 }); 5441 if (!UniformIndUpdate) 5442 continue; 5443 5444 // The induction variable and its update instruction will remain uniform. 5445 addToWorklistIfAllowed(Ind); 5446 addToWorklistIfAllowed(IndUpdate); 5447 } 5448 5449 Uniforms[VF].insert(Worklist.begin(), Worklist.end()); 5450 } 5451 5452 bool LoopVectorizationCostModel::runtimeChecksRequired() { 5453 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); 5454 5455 if (Legal->getRuntimePointerChecking()->Need) { 5456 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz", 5457 "runtime pointer checks needed. 
Enable vectorization of this " 5458 "loop with '#pragma clang loop vectorize(enable)' when " 5459 "compiling with -Os/-Oz", 5460 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5461 return true; 5462 } 5463 5464 if (!PSE.getUnionPredicate().getPredicates().empty()) { 5465 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz", 5466 "runtime SCEV checks needed. Enable vectorization of this " 5467 "loop with '#pragma clang loop vectorize(enable)' when " 5468 "compiling with -Os/-Oz", 5469 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5470 return true; 5471 } 5472 5473 // FIXME: Avoid specializing for stride==1 instead of bailing out. 5474 if (!Legal->getLAI()->getSymbolicStrides().empty()) { 5475 reportVectorizationFailure("Runtime stride check for small trip count", 5476 "runtime stride == 1 checks needed. Enable vectorization of " 5477 "this loop without such check by compiling with -Os/-Oz", 5478 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5479 return true; 5480 } 5481 5482 return false; 5483 } 5484 5485 ElementCount 5486 LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) { 5487 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) 5488 return ElementCount::getScalable(0); 5489 5490 if (Hints->isScalableVectorizationDisabled()) { 5491 reportVectorizationInfo("Scalable vectorization is explicitly disabled", 5492 "ScalableVectorizationDisabled", ORE, TheLoop); 5493 return ElementCount::getScalable(0); 5494 } 5495 5496 LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n"); 5497 5498 auto MaxScalableVF = ElementCount::getScalable( 5499 std::numeric_limits<ElementCount::ScalarTy>::max()); 5500 5501 // Test that the loop-vectorizer can legalize all operations for this MaxVF. 
5502 // FIXME: While for scalable vectors this is currently sufficient, this should 5503 // be replaced by a more detailed mechanism that filters out specific VFs, 5504 // instead of invalidating vectorization for a whole set of VFs based on the 5505 // MaxVF. 5506 5507 // Disable scalable vectorization if the loop contains unsupported reductions. 5508 if (!canVectorizeReductions(MaxScalableVF)) { 5509 reportVectorizationInfo( 5510 "Scalable vectorization not supported for the reduction " 5511 "operations found in this loop.", 5512 "ScalableVFUnfeasible", ORE, TheLoop); 5513 return ElementCount::getScalable(0); 5514 } 5515 5516 // Disable scalable vectorization if the loop contains any instructions 5517 // with element types not supported for scalable vectors. 5518 if (any_of(ElementTypesInLoop, [&](Type *Ty) { 5519 return !Ty->isVoidTy() && 5520 !this->TTI.isElementTypeLegalForScalableVector(Ty); 5521 })) { 5522 reportVectorizationInfo("Scalable vectorization is not supported " 5523 "for all element types found in this loop.", 5524 "ScalableVFUnfeasible", ORE, TheLoop); 5525 return ElementCount::getScalable(0); 5526 } 5527 5528 if (Legal->isSafeForAnyVectorWidth()) 5529 return MaxScalableVF; 5530 5531 // Limit MaxScalableVF by the maximum safe dependence distance. 5532 Optional<unsigned> MaxVScale = TTI.getMaxVScale(); 5533 if (!MaxVScale && TheFunction->hasFnAttribute(Attribute::VScaleRange)) { 5534 unsigned VScaleMax = TheFunction->getFnAttribute(Attribute::VScaleRange) 5535 .getVScaleRangeArgs() 5536 .second; 5537 if (VScaleMax > 0) 5538 MaxVScale = VScaleMax; 5539 } 5540 MaxScalableVF = ElementCount::getScalable( 5541 MaxVScale ? 
(MaxSafeElements / MaxVScale.getValue()) : 0); 5542 if (!MaxScalableVF) 5543 reportVectorizationInfo( 5544 "Max legal vector width too small, scalable vectorization " 5545 "unfeasible.", 5546 "ScalableVFUnfeasible", ORE, TheLoop); 5547 5548 return MaxScalableVF; 5549 } 5550 5551 FixedScalableVFPair 5552 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount, 5553 ElementCount UserVF) { 5554 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); 5555 unsigned SmallestType, WidestType; 5556 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); 5557 5558 // Get the maximum safe dependence distance in bits computed by LAA. 5559 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from 5560 // the memory accesses that is most restrictive (involved in the smallest 5561 // dependence distance). 5562 unsigned MaxSafeElements = 5563 PowerOf2Floor(Legal->getMaxSafeVectorWidthInBits() / WidestType); 5564 5565 auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements); 5566 auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements); 5567 5568 LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF 5569 << ".\n"); 5570 LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF 5571 << ".\n"); 5572 5573 // First analyze the UserVF, fall back if the UserVF should be ignored. 5574 if (UserVF) { 5575 auto MaxSafeUserVF = 5576 UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF; 5577 5578 if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) { 5579 // If `VF=vscale x N` is safe, then so is `VF=N` 5580 if (UserVF.isScalable()) 5581 return FixedScalableVFPair( 5582 ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF); 5583 else 5584 return UserVF; 5585 } 5586 5587 assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF)); 5588 5589 // Only clamp if the UserVF is not scalable. 
If the UserVF is scalable, it 5590 // is better to ignore the hint and let the compiler choose a suitable VF. 5591 if (!UserVF.isScalable()) { 5592 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 5593 << " is unsafe, clamping to max safe VF=" 5594 << MaxSafeFixedVF << ".\n"); 5595 ORE->emit([&]() { 5596 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 5597 TheLoop->getStartLoc(), 5598 TheLoop->getHeader()) 5599 << "User-specified vectorization factor " 5600 << ore::NV("UserVectorizationFactor", UserVF) 5601 << " is unsafe, clamping to maximum safe vectorization factor " 5602 << ore::NV("VectorizationFactor", MaxSafeFixedVF); 5603 }); 5604 return MaxSafeFixedVF; 5605 } 5606 5607 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) { 5608 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 5609 << " is ignored because scalable vectors are not " 5610 "available.\n"); 5611 ORE->emit([&]() { 5612 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 5613 TheLoop->getStartLoc(), 5614 TheLoop->getHeader()) 5615 << "User-specified vectorization factor " 5616 << ore::NV("UserVectorizationFactor", UserVF) 5617 << " is ignored because the target does not support scalable " 5618 "vectors. The compiler will pick a more suitable value."; 5619 }); 5620 } else { 5621 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 5622 << " is unsafe. Ignoring scalable UserVF.\n"); 5623 ORE->emit([&]() { 5624 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 5625 TheLoop->getStartLoc(), 5626 TheLoop->getHeader()) 5627 << "User-specified vectorization factor " 5628 << ore::NV("UserVectorizationFactor", UserVF) 5629 << " is unsafe. 
Ignoring the hint to let the compiler pick a " 5630 "more suitable value."; 5631 }); 5632 } 5633 } 5634 5635 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType 5636 << " / " << WidestType << " bits.\n"); 5637 5638 FixedScalableVFPair Result(ElementCount::getFixed(1), 5639 ElementCount::getScalable(0)); 5640 if (auto MaxVF = getMaximizedVFForTarget(ConstTripCount, SmallestType, 5641 WidestType, MaxSafeFixedVF)) 5642 Result.FixedVF = MaxVF; 5643 5644 if (auto MaxVF = getMaximizedVFForTarget(ConstTripCount, SmallestType, 5645 WidestType, MaxSafeScalableVF)) 5646 if (MaxVF.isScalable()) { 5647 Result.ScalableVF = MaxVF; 5648 LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF 5649 << "\n"); 5650 } 5651 5652 return Result; 5653 } 5654 5655 FixedScalableVFPair 5656 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { 5657 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { 5658 // TODO: It may by useful to do since it's still likely to be dynamically 5659 // uniform if the target can skip. 5660 reportVectorizationFailure( 5661 "Not inserting runtime ptr check for divergent target", 5662 "runtime pointer checks needed. 
Not enabled for divergent target", 5663 "CantVersionLoopWithDivergentTarget", ORE, TheLoop); 5664 return FixedScalableVFPair::getNone(); 5665 } 5666 5667 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); 5668 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); 5669 if (TC == 1) { 5670 reportVectorizationFailure("Single iteration (non) loop", 5671 "loop trip count is one, irrelevant for vectorization", 5672 "SingleIterationLoop", ORE, TheLoop); 5673 return FixedScalableVFPair::getNone(); 5674 } 5675 5676 switch (ScalarEpilogueStatus) { 5677 case CM_ScalarEpilogueAllowed: 5678 return computeFeasibleMaxVF(TC, UserVF); 5679 case CM_ScalarEpilogueNotAllowedUsePredicate: 5680 LLVM_FALLTHROUGH; 5681 case CM_ScalarEpilogueNotNeededUsePredicate: 5682 LLVM_DEBUG( 5683 dbgs() << "LV: vector predicate hint/switch found.\n" 5684 << "LV: Not allowing scalar epilogue, creating predicated " 5685 << "vector loop.\n"); 5686 break; 5687 case CM_ScalarEpilogueNotAllowedLowTripLoop: 5688 // fallthrough as a special case of OptForSize 5689 case CM_ScalarEpilogueNotAllowedOptSize: 5690 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize) 5691 LLVM_DEBUG( 5692 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n"); 5693 else 5694 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip " 5695 << "count.\n"); 5696 5697 // Bail if runtime checks are required, which are not good when optimising 5698 // for size. 5699 if (runtimeChecksRequired()) 5700 return FixedScalableVFPair::getNone(); 5701 5702 break; 5703 } 5704 5705 // The only loops we can vectorize without a scalar epilogue, are loops with 5706 // a bottom-test and a single exiting block. We'd have to handle the fact 5707 // that not every instruction executes on the last iteration. This will 5708 // require a lane mask which varies through the vector loop body. 
(TODO) 5709 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) { 5710 // If there was a tail-folding hint/switch, but we can't fold the tail by 5711 // masking, fallback to a vectorization with a scalar epilogue. 5712 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { 5713 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " 5714 "scalar epilogue instead.\n"); 5715 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 5716 return computeFeasibleMaxVF(TC, UserVF); 5717 } 5718 return FixedScalableVFPair::getNone(); 5719 } 5720 5721 // Now try the tail folding 5722 5723 // Invalidate interleave groups that require an epilogue if we can't mask 5724 // the interleave-group. 5725 if (!useMaskedInterleavedAccesses(TTI)) { 5726 assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() && 5727 "No decisions should have been taken at this point"); 5728 // Note: There is no need to invalidate any cost modeling decisions here, as 5729 // non where taken so far. 5730 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue(); 5731 } 5732 5733 FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF); 5734 // Avoid tail folding if the trip count is known to be a multiple of any VF 5735 // we chose. 5736 // FIXME: The condition below pessimises the case for fixed-width vectors, 5737 // when scalable VFs are also candidates for vectorization. 5738 if (MaxFactors.FixedVF.isVector() && !MaxFactors.ScalableVF) { 5739 ElementCount MaxFixedVF = MaxFactors.FixedVF; 5740 assert((UserVF.isNonZero() || isPowerOf2_32(MaxFixedVF.getFixedValue())) && 5741 "MaxFixedVF must be a power of 2"); 5742 unsigned MaxVFtimesIC = UserIC ? 
MaxFixedVF.getFixedValue() * UserIC 5743 : MaxFixedVF.getFixedValue(); 5744 ScalarEvolution *SE = PSE.getSE(); 5745 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 5746 const SCEV *ExitCount = SE->getAddExpr( 5747 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 5748 const SCEV *Rem = SE->getURemExpr( 5749 SE->applyLoopGuards(ExitCount, TheLoop), 5750 SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC)); 5751 if (Rem->isZero()) { 5752 // Accept MaxFixedVF if we do not have a tail. 5753 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); 5754 return MaxFactors; 5755 } 5756 } 5757 5758 // For scalable vectors, don't use tail folding as this is currently not yet 5759 // supported. The code is likely to have ended up here if the tripcount is 5760 // low, in which case it makes sense not to use scalable vectors. 5761 if (MaxFactors.ScalableVF.isVector()) 5762 MaxFactors.ScalableVF = ElementCount::getScalable(0); 5763 5764 // If we don't know the precise trip count, or if the trip count that we 5765 // found modulo the vectorization factor is not zero, try to fold the tail 5766 // by masking. 5767 // FIXME: look for a smaller MaxVF that does divide TC rather than masking. 5768 if (Legal->prepareToFoldTailByMasking()) { 5769 FoldTailByMasking = true; 5770 return MaxFactors; 5771 } 5772 5773 // If there was a tail-folding hint/switch, but we can't fold the tail by 5774 // masking, fallback to a vectorization with a scalar epilogue. 
5775 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { 5776 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " 5777 "scalar epilogue instead.\n"); 5778 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 5779 return MaxFactors; 5780 } 5781 5782 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) { 5783 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n"); 5784 return FixedScalableVFPair::getNone(); 5785 } 5786 5787 if (TC == 0) { 5788 reportVectorizationFailure( 5789 "Unable to calculate the loop count due to complex control flow", 5790 "unable to calculate the loop count due to complex control flow", 5791 "UnknownLoopCountComplexCFG", ORE, TheLoop); 5792 return FixedScalableVFPair::getNone(); 5793 } 5794 5795 reportVectorizationFailure( 5796 "Cannot optimize for size and vectorize at the same time.", 5797 "cannot optimize for size and vectorize at the same time. " 5798 "Enable vectorization of this loop with '#pragma clang loop " 5799 "vectorize(enable)' when compiling with -Os/-Oz", 5800 "NoTailLoopWithOptForSize", ORE, TheLoop); 5801 return FixedScalableVFPair::getNone(); 5802 } 5803 5804 ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( 5805 unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType, 5806 const ElementCount &MaxSafeVF) { 5807 bool ComputeScalableMaxVF = MaxSafeVF.isScalable(); 5808 TypeSize WidestRegister = TTI.getRegisterBitWidth( 5809 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector 5810 : TargetTransformInfo::RGK_FixedWidthVector); 5811 5812 // Convenience function to return the minimum of two ElementCounts. 5813 auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) { 5814 assert((LHS.isScalable() == RHS.isScalable()) && 5815 "Scalable flags must match"); 5816 return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS; 5817 }; 5818 5819 // Ensure MaxVF is a power of 2; the dependence distance bound may not be. 
5820 // Note that both WidestRegister and WidestType may not be a powers of 2. 5821 auto MaxVectorElementCount = ElementCount::get( 5822 PowerOf2Floor(WidestRegister.getKnownMinSize() / WidestType), 5823 ComputeScalableMaxVF); 5824 MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF); 5825 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " 5826 << (MaxVectorElementCount * WidestType) << " bits.\n"); 5827 5828 if (!MaxVectorElementCount) { 5829 LLVM_DEBUG(dbgs() << "LV: The target has no " 5830 << (ComputeScalableMaxVF ? "scalable" : "fixed") 5831 << " vector registers.\n"); 5832 return ElementCount::getFixed(1); 5833 } 5834 5835 const auto TripCountEC = ElementCount::getFixed(ConstTripCount); 5836 if (ConstTripCount && 5837 ElementCount::isKnownLE(TripCountEC, MaxVectorElementCount) && 5838 isPowerOf2_32(ConstTripCount)) { 5839 // We need to clamp the VF to be the ConstTripCount. There is no point in 5840 // choosing a higher viable VF as done in the loop below. If 5841 // MaxVectorElementCount is scalable, we only fall back on a fixed VF when 5842 // the TC is less than or equal to the known number of lanes. 5843 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: " 5844 << ConstTripCount << "\n"); 5845 return TripCountEC; 5846 } 5847 5848 ElementCount MaxVF = MaxVectorElementCount; 5849 if (TTI.shouldMaximizeVectorBandwidth() || 5850 (MaximizeBandwidth && isScalarEpilogueAllowed())) { 5851 auto MaxVectorElementCountMaxBW = ElementCount::get( 5852 PowerOf2Floor(WidestRegister.getKnownMinSize() / SmallestType), 5853 ComputeScalableMaxVF); 5854 MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF); 5855 5856 // Collect all viable vectorization factors larger than the default MaxVF 5857 // (i.e. MaxVectorElementCount). 
5858 SmallVector<ElementCount, 8> VFs; 5859 for (ElementCount VS = MaxVectorElementCount * 2; 5860 ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2) 5861 VFs.push_back(VS); 5862 5863 // For each VF calculate its register usage. 5864 auto RUs = calculateRegisterUsage(VFs); 5865 5866 // Select the largest VF which doesn't require more registers than existing 5867 // ones. 5868 for (int i = RUs.size() - 1; i >= 0; --i) { 5869 bool Selected = true; 5870 for (auto &pair : RUs[i].MaxLocalUsers) { 5871 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5872 if (pair.second > TargetNumRegisters) 5873 Selected = false; 5874 } 5875 if (Selected) { 5876 MaxVF = VFs[i]; 5877 break; 5878 } 5879 } 5880 if (ElementCount MinVF = 5881 TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) { 5882 if (ElementCount::isKnownLT(MaxVF, MinVF)) { 5883 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF 5884 << ") with target's minimum: " << MinVF << '\n'); 5885 MaxVF = MinVF; 5886 } 5887 } 5888 } 5889 return MaxVF; 5890 } 5891 5892 bool LoopVectorizationCostModel::isMoreProfitable( 5893 const VectorizationFactor &A, const VectorizationFactor &B) const { 5894 InstructionCost CostA = A.Cost; 5895 InstructionCost CostB = B.Cost; 5896 5897 unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop); 5898 5899 if (!A.Width.isScalable() && !B.Width.isScalable() && FoldTailByMasking && 5900 MaxTripCount) { 5901 // If we are folding the tail and the trip count is a known (possibly small) 5902 // constant, the trip count will be rounded up to an integer number of 5903 // iterations. The total cost will be PerIterationCost*ceil(TripCount/VF), 5904 // which we compare directly. When not folding the tail, the total cost will 5905 // be PerIterationCost*floor(TC/VF) + Scalar remainder cost, and so is 5906 // approximated with the per-lane cost below instead of using the tripcount 5907 // as here. 
5908 auto RTCostA = CostA * divideCeil(MaxTripCount, A.Width.getFixedValue()); 5909 auto RTCostB = CostB * divideCeil(MaxTripCount, B.Width.getFixedValue()); 5910 return RTCostA < RTCostB; 5911 } 5912 5913 // Improve estimate for the vector width if it is scalable. 5914 unsigned EstimatedWidthA = A.Width.getKnownMinValue(); 5915 unsigned EstimatedWidthB = B.Width.getKnownMinValue(); 5916 if (Optional<unsigned> VScale = TTI.getVScaleForTuning()) { 5917 if (A.Width.isScalable()) 5918 EstimatedWidthA *= VScale.getValue(); 5919 if (B.Width.isScalable()) 5920 EstimatedWidthB *= VScale.getValue(); 5921 } 5922 5923 // When set to preferred, for now assume vscale may be larger than 1 (or the 5924 // one being tuned for), so that scalable vectorization is slightly favorable 5925 // over fixed-width vectorization. 5926 if (Hints->isScalableVectorizationPreferred()) 5927 if (A.Width.isScalable() && !B.Width.isScalable()) 5928 return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA); 5929 5930 // To avoid the need for FP division: 5931 // (CostA / A.Width) < (CostB / B.Width) 5932 // <=> (CostA * B.Width) < (CostB * A.Width) 5933 return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA); 5934 } 5935 5936 VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor( 5937 const ElementCountSet &VFCandidates) { 5938 InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first; 5939 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n"); 5940 assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop"); 5941 assert(VFCandidates.count(ElementCount::getFixed(1)) && 5942 "Expected Scalar VF to be a candidate"); 5943 5944 const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost); 5945 VectorizationFactor ChosenFactor = ScalarCost; 5946 5947 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; 5948 if (ForceVectorization && VFCandidates.size() > 1) { 5949 // 
Ignore scalar width, because the user explicitly wants vectorization. 5950 // Initialize cost to max so that VF = 2 is, at least, chosen during cost 5951 // evaluation. 5952 ChosenFactor.Cost = InstructionCost::getMax(); 5953 } 5954 5955 SmallVector<InstructionVFPair> InvalidCosts; 5956 for (const auto &i : VFCandidates) { 5957 // The cost for scalar VF=1 is already calculated, so ignore it. 5958 if (i.isScalar()) 5959 continue; 5960 5961 VectorizationCostTy C = expectedCost(i, &InvalidCosts); 5962 VectorizationFactor Candidate(i, C.first); 5963 5964 #ifndef NDEBUG 5965 unsigned AssumedMinimumVscale = 1; 5966 if (Optional<unsigned> VScale = TTI.getVScaleForTuning()) 5967 AssumedMinimumVscale = VScale.getValue(); 5968 unsigned Width = 5969 Candidate.Width.isScalable() 5970 ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale 5971 : Candidate.Width.getFixedValue(); 5972 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i 5973 << " costs: " << (Candidate.Cost / Width)); 5974 if (i.isScalable()) 5975 LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of " 5976 << AssumedMinimumVscale << ")"); 5977 LLVM_DEBUG(dbgs() << ".\n"); 5978 #endif 5979 5980 if (!C.second && !ForceVectorization) { 5981 LLVM_DEBUG( 5982 dbgs() << "LV: Not considering vector loop of width " << i 5983 << " because it will not generate any vector instructions.\n"); 5984 continue; 5985 } 5986 5987 // If profitable add it to ProfitableVF list. 5988 if (isMoreProfitable(Candidate, ScalarCost)) 5989 ProfitableVFs.push_back(Candidate); 5990 5991 if (isMoreProfitable(Candidate, ChosenFactor)) 5992 ChosenFactor = Candidate; 5993 } 5994 5995 // Emit a report of VFs with invalid costs in the loop. 5996 if (!InvalidCosts.empty()) { 5997 // Group the remarks per instruction, keeping the instruction order from 5998 // InvalidCosts. 
5999 std::map<Instruction *, unsigned> Numbering; 6000 unsigned I = 0; 6001 for (auto &Pair : InvalidCosts) 6002 if (!Numbering.count(Pair.first)) 6003 Numbering[Pair.first] = I++; 6004 6005 // Sort the list, first on instruction(number) then on VF. 6006 llvm::sort(InvalidCosts, 6007 [&Numbering](InstructionVFPair &A, InstructionVFPair &B) { 6008 if (Numbering[A.first] != Numbering[B.first]) 6009 return Numbering[A.first] < Numbering[B.first]; 6010 ElementCountComparator ECC; 6011 return ECC(A.second, B.second); 6012 }); 6013 6014 // For a list of ordered instruction-vf pairs: 6015 // [(load, vf1), (load, vf2), (store, vf1)] 6016 // Group the instructions together to emit separate remarks for: 6017 // load (vf1, vf2) 6018 // store (vf1) 6019 auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts); 6020 auto Subset = ArrayRef<InstructionVFPair>(); 6021 do { 6022 if (Subset.empty()) 6023 Subset = Tail.take_front(1); 6024 6025 Instruction *I = Subset.front().first; 6026 6027 // If the next instruction is different, or if there are no other pairs, 6028 // emit a remark for the collated subset. e.g. 6029 // [(load, vf1), (load, vf2))] 6030 // to emit: 6031 // remark: invalid costs for 'load' at VF=(vf, vf2) 6032 if (Subset == Tail || Tail[Subset.size()].first != I) { 6033 std::string OutString; 6034 raw_string_ostream OS(OutString); 6035 assert(!Subset.empty() && "Unexpected empty range"); 6036 OS << "Instruction with invalid costs prevented vectorization at VF=("; 6037 for (auto &Pair : Subset) 6038 OS << (Pair.second == Subset.front().second ? 
"" : ", ") 6039 << Pair.second; 6040 OS << "):"; 6041 if (auto *CI = dyn_cast<CallInst>(I)) 6042 OS << " call to " << CI->getCalledFunction()->getName(); 6043 else 6044 OS << " " << I->getOpcodeName(); 6045 OS.flush(); 6046 reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I); 6047 Tail = Tail.drop_front(Subset.size()); 6048 Subset = {}; 6049 } else 6050 // Grow the subset by one element 6051 Subset = Tail.take_front(Subset.size() + 1); 6052 } while (!Tail.empty()); 6053 } 6054 6055 if (!EnableCondStoresVectorization && NumPredStores) { 6056 reportVectorizationFailure("There are conditional stores.", 6057 "store that is conditionally executed prevents vectorization", 6058 "ConditionalStore", ORE, TheLoop); 6059 ChosenFactor = ScalarCost; 6060 } 6061 6062 LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() && 6063 ChosenFactor.Cost >= ScalarCost.Cost) dbgs() 6064 << "LV: Vectorization seems to be not beneficial, " 6065 << "but was forced by a user.\n"); 6066 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n"); 6067 return ChosenFactor; 6068 } 6069 6070 bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization( 6071 const Loop &L, ElementCount VF) const { 6072 // Cross iteration phis such as reductions need special handling and are 6073 // currently unsupported. 6074 if (any_of(L.getHeader()->phis(), [&](PHINode &Phi) { 6075 return Legal->isFirstOrderRecurrence(&Phi) || 6076 Legal->isReductionVariable(&Phi); 6077 })) 6078 return false; 6079 6080 // Phis with uses outside of the loop require special handling and are 6081 // currently unsupported. 6082 for (auto &Entry : Legal->getInductionVars()) { 6083 // Look for uses of the value of the induction at the last iteration. 
6084 Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch()); 6085 for (User *U : PostInc->users()) 6086 if (!L.contains(cast<Instruction>(U))) 6087 return false; 6088 // Look for uses of penultimate value of the induction. 6089 for (User *U : Entry.first->users()) 6090 if (!L.contains(cast<Instruction>(U))) 6091 return false; 6092 } 6093 6094 // Induction variables that are widened require special handling that is 6095 // currently not supported. 6096 if (any_of(Legal->getInductionVars(), [&](auto &Entry) { 6097 return !(this->isScalarAfterVectorization(Entry.first, VF) || 6098 this->isProfitableToScalarize(Entry.first, VF)); 6099 })) 6100 return false; 6101 6102 // Epilogue vectorization code has not been auditted to ensure it handles 6103 // non-latch exits properly. It may be fine, but it needs auditted and 6104 // tested. 6105 if (L.getExitingBlock() != L.getLoopLatch()) 6106 return false; 6107 6108 return true; 6109 } 6110 6111 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable( 6112 const ElementCount VF) const { 6113 // FIXME: We need a much better cost-model to take different parameters such 6114 // as register pressure, code size increase and cost of extra branches into 6115 // account. For now we apply a very crude heuristic and only consider loops 6116 // with vectorization factors larger than a certain value. 6117 // We also consider epilogue vectorization unprofitable for targets that don't 6118 // consider interleaving beneficial (eg. MVE). 
6119 if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1) 6120 return false; 6121 if (VF.getFixedValue() >= EpilogueVectorizationMinVF) 6122 return true; 6123 return false; 6124 } 6125 6126 VectorizationFactor 6127 LoopVectorizationCostModel::selectEpilogueVectorizationFactor( 6128 const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) { 6129 VectorizationFactor Result = VectorizationFactor::Disabled(); 6130 if (!EnableEpilogueVectorization) { 6131 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";); 6132 return Result; 6133 } 6134 6135 if (!isScalarEpilogueAllowed()) { 6136 LLVM_DEBUG( 6137 dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is " 6138 "allowed.\n";); 6139 return Result; 6140 } 6141 6142 // Not really a cost consideration, but check for unsupported cases here to 6143 // simplify the logic. 6144 if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) { 6145 LLVM_DEBUG( 6146 dbgs() << "LEV: Unable to vectorize epilogue because the loop is " 6147 "not a supported candidate.\n";); 6148 return Result; 6149 } 6150 6151 if (EpilogueVectorizationForceVF > 1) { 6152 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";); 6153 ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF); 6154 if (LVP.hasPlanWithVF(ForcedEC)) 6155 return {ForcedEC, 0}; 6156 else { 6157 LLVM_DEBUG( 6158 dbgs() 6159 << "LEV: Epilogue vectorization forced factor is not viable.\n";); 6160 return Result; 6161 } 6162 } 6163 6164 if (TheLoop->getHeader()->getParent()->hasOptSize() || 6165 TheLoop->getHeader()->getParent()->hasMinSize()) { 6166 LLVM_DEBUG( 6167 dbgs() 6168 << "LEV: Epilogue vectorization skipped due to opt for size.\n";); 6169 return Result; 6170 } 6171 6172 auto FixedMainLoopVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue()); 6173 if (MainLoopVF.isScalable()) 6174 LLVM_DEBUG( 6175 dbgs() << "LEV: Epilogue vectorization using scalable vectors not " 6176 "yet supported. 
Converting to fixed-width (VF=" 6177 << FixedMainLoopVF << ") instead\n"); 6178 6179 if (!isEpilogueVectorizationProfitable(FixedMainLoopVF)) { 6180 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for " 6181 "this loop\n"); 6182 return Result; 6183 } 6184 6185 for (auto &NextVF : ProfitableVFs) 6186 if (ElementCount::isKnownLT(NextVF.Width, FixedMainLoopVF) && 6187 (Result.Width.getFixedValue() == 1 || 6188 isMoreProfitable(NextVF, Result)) && 6189 LVP.hasPlanWithVF(NextVF.Width)) 6190 Result = NextVF; 6191 6192 if (Result != VectorizationFactor::Disabled()) 6193 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = " 6194 << Result.Width.getFixedValue() << "\n";); 6195 return Result; 6196 } 6197 6198 std::pair<unsigned, unsigned> 6199 LoopVectorizationCostModel::getSmallestAndWidestTypes() { 6200 unsigned MinWidth = -1U; 6201 unsigned MaxWidth = 8; 6202 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 6203 for (Type *T : ElementTypesInLoop) { 6204 MinWidth = std::min<unsigned>( 6205 MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize()); 6206 MaxWidth = std::max<unsigned>( 6207 MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize()); 6208 } 6209 return {MinWidth, MaxWidth}; 6210 } 6211 6212 void LoopVectorizationCostModel::collectElementTypesForWidening() { 6213 ElementTypesInLoop.clear(); 6214 // For each block. 6215 for (BasicBlock *BB : TheLoop->blocks()) { 6216 // For each instruction in the loop. 6217 for (Instruction &I : BB->instructionsWithoutDebug()) { 6218 Type *T = I.getType(); 6219 6220 // Skip ignored values. 6221 if (ValuesToIgnore.count(&I)) 6222 continue; 6223 6224 // Only examine Loads, Stores and PHINodes. 6225 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I)) 6226 continue; 6227 6228 // Examine PHI nodes that are reduction variables. Update the type to 6229 // account for the recurrence type. 
if (auto *PN = dyn_cast<PHINode>(&I)) {
        // Non-reduction phis do not contribute an element type.
        if (!Legal->isReductionVariable(PN))
          continue;
        const RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[PN];
        // Reductions that will be performed in-loop (by option, because they
        // are ordered, or because the target prefers it) are skipped as well.
        if (PreferInLoopReductions || useOrderedReductions(RdxDesc) ||
            TTI.preferInLoopReduction(RdxDesc.getOpcode(),
                                      RdxDesc.getRecurrenceType(),
                                      TargetTransformInfo::ReductionFlags()))
          continue;
        T = RdxDesc.getRecurrenceType();
      }

      // Examine the stored values.
      if (auto *ST = dyn_cast<StoreInst>(&I))
        T = ST->getValueOperand()->getType();

      // Ignore loaded pointer types and stored pointer types that are not
      // vectorizable.
      //
      // FIXME: The check here attempts to predict whether a load or store will
      //        be vectorized. We only know this for certain after a VF has
      //        been selected. Here, we assume that if an access can be
      //        vectorized, it will be. We should also look at extending this
      //        optimization to non-pointer types.
      //
      if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
          !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
        continue;

      ElementTypesInLoop.insert(T);
    }
  }
}

// Selects the interleave count for the loop when it is vectorized with \p VF
// (a scalar VF means the loop is only interleaved). \p LoopCost is the
// expected cost of the loop at \p VF; if the caller passes 0 the cost is
// computed here with expectedCost(VF). A result of 1 means "do not
// interleave".
unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
                                                           unsigned LoopCost) {
  // -- The interleave heuristics --
  // We interleave the loop in order to expose ILP and reduce the loop overhead.
  // There are many micro-architectural considerations that we can't predict
  // at this level. For example, frontend pressure (on decode or fetch) due to
  // code size, or the number and capabilities of the execution ports.
  //
  // We use the following heuristics to select the interleave count:
  // 1. If the code has reductions, then we interleave to break the cross
  // iteration dependency.
  // 2. If the loop is really small, then we interleave to reduce the loop
  // overhead.
  // 3. We don't interleave if we think that we will spill registers to memory
  // due to the increased register pressure.

  if (!isScalarEpilogueAllowed())
    return 1;

  // We used the distance for the interleave count.
  // (Any value other than -1U means a maximum safe dependence distance was
  // recorded; in that case we do not interleave.)
  if (Legal->getMaxSafeDepDistBytes() != -1U)
    return 1;

  auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
  const bool HasReductions = !Legal->getReductionVars().empty();
  // Do not interleave loops with a relatively small known or estimated trip
  // count. But we will interleave when InterleaveSmallLoopScalarReduction is
  // enabled, and the code has scalar reductions(HasReductions && VF = 1),
  // because with the above conditions interleaving can expose ILP and break
  // cross iteration dependences for reductions.
  if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) &&
      !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar()))
    return 1;

  RegisterUsage R = calculateRegisterUsage({VF})[0];
  // We divide by these constants so assume that we have at least one
  // instruction that uses at least one register.
  for (auto& pair : R.MaxLocalUsers) {
    pair.second = std::max(pair.second, 1U);
  }

  // We calculate the interleave count using the following formula.
  // Subtract the number of loop invariants from the number of available
  // registers. These registers are used by all of the interleaved instances.
  // Next, divide the remaining registers by the number of registers that is
  // required by the loop, in order to estimate how many parallel instances
  // fit without causing spills. All of this is rounded down if necessary to be
  // a power of two. We want power of two interleave count to simplify any
  // addressing operations or alignment considerations.
  // We also want power of two interleave counts to ensure that the induction
  // variable of the vector loop wraps to zero, when tail is folded by masking;
  // this currently happens when OptForSize, in which case IC is set to 1 above.
  unsigned IC = UINT_MAX;

  // IC ends up as the minimum over all register classes that the loop uses.
  for (auto& pair : R.MaxLocalUsers) {
    unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
    LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
                      << " registers of "
                      << TTI.getRegisterClassName(pair.first) << " register class\n");
    // A command-line override, when given, replaces the target's answer for
    // every register class.
    if (VF.isScalar()) {
      if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
        TargetNumRegisters = ForceTargetNumScalarRegs;
    } else {
      if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
        TargetNumRegisters = ForceTargetNumVectorRegs;
    }
    unsigned MaxLocalUsers = pair.second;
    unsigned LoopInvariantRegs = 0;
    if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
      LoopInvariantRegs = R.LoopInvariantRegs[pair.first];

    // NOTE(review): both operands are unsigned. If LoopInvariantRegs exceeds
    // TargetNumRegisters (or equals it in the "- 1" variant below) the
    // subtraction wraps to a huge value, TmpIC becomes very large, and this
    // register class no longer constrains IC. Confirm whether that is
    // intended or whether the difference should saturate at zero.
    unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers);
    // Don't count the induction variable as interleaved.
    if (EnableIndVarRegisterHeur) {
      TmpIC =
          PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
                        std::max(1U, (MaxLocalUsers - 1)));
    }

    IC = std::min(IC, TmpIC);
  }

  // Clamp the interleave ranges to reasonable counts.
  unsigned MaxInterleaveCount =
      TTI.getMaxInterleaveFactor(VF.getKnownMinValue());

  // Check if the user has overridden the max.
  if (VF.isScalar()) {
    if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
      MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
  } else {
    if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
      MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
  }

  // If trip count is known or estimated compile time constant, limit the
  // interleave count to be less than the trip count divided by VF, provided it
  // is at least 1.
  //
  // For scalable vectors we can't know if interleaving is beneficial. It may
  // not be beneficial for small loops if none of the lanes in the second vector
  // iterations is enabled. However, for larger loops, there is likely to be a
  // similar benefit as for fixed-width vectors. For now, we choose to leave
  // the InterleaveCount as if vscale is '1', although if some information about
  // the vector is known (e.g. min vector size), we can make a better decision.
  if (BestKnownTC) {
    MaxInterleaveCount =
        std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount);
    // Make sure MaxInterleaveCount is greater than 0.
    MaxInterleaveCount = std::max(1u, MaxInterleaveCount);
  }

  assert(MaxInterleaveCount > 0 &&
         "Maximum interleave count must be greater than 0");

  // Clamp the calculated IC to be between the 1 and the max interleave count
  // that the target and trip count allows.
  if (IC > MaxInterleaveCount)
    IC = MaxInterleaveCount;
  else
    // Make sure IC is greater than 0.
    IC = std::max(1u, IC);

  assert(IC > 0 && "Interleave count must be greater than 0.");

  // If we did not calculate the cost for VF (because the user selected the VF)
  // then we calculate the cost of VF here.
  if (LoopCost == 0) {
    InstructionCost C = expectedCost(VF).first;
    assert(C.isValid() && "Expected to have chosen a VF with valid cost");
    LoopCost = *C.getValue();
  }

  assert(LoopCost && "Non-zero loop cost expected");

  // Interleave if we vectorized this loop and there is a reduction that could
  // benefit from interleaving.
  if (VF.isVector() && HasReductions) {
    LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
    return IC;
  }

  // Note that if we've already vectorized the loop we will have done the
  // runtime check and so interleaving won't require further checks.
  bool InterleavingRequiresRuntimePointerCheck =
      (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);

  // We want to interleave small loops in order to reduce the loop overhead and
  // potentially expose ILP opportunities.
  LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
                    << "LV: IC is " << IC << '\n'
                    << "LV: VF is " << VF << '\n');
  const bool AggressivelyInterleaveReductions =
      TTI.enableAggressiveInterleaving(HasReductions);
  if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
    // We assume that the cost overhead is 1 and we use the cost model
    // to estimate the cost of the loop and interleave until the cost of the
    // loop overhead is about 5% of the cost of the loop.
    unsigned SmallIC =
        std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));

    // Interleave until store/load ports (estimated by max interleave count) are
    // saturated.
    unsigned NumStores = Legal->getNumStores();
    unsigned NumLoads = Legal->getNumLoads();
    // A count of zero is treated as one to avoid dividing by zero.
    unsigned StoresIC = IC / (NumStores ? NumStores : 1);
    unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);

    // There is little point in interleaving for reductions containing selects
    // and compares when VF=1 since it may just create more overhead than it's
    // worth for loops with small trip counts. This is because we still have to
    // do the final reduction after the loop.
    bool HasSelectCmpReductions =
        HasReductions &&
        any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
          const RecurrenceDescriptor &RdxDesc = Reduction.second;
          return RecurrenceDescriptor::isSelectCmpRecurrenceKind(
              RdxDesc.getRecurrenceKind());
        });
    if (HasSelectCmpReductions) {
      LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n");
      return 1;
    }

    // If we have a scalar reduction (vector reductions are already dealt with
    // by this point), we can increase the critical path length if the loop
    // we're interleaving is inside another loop. For tree-wise reductions
    // set the limit to 2, and for ordered reductions it's best to disable
    // interleaving entirely.
    if (HasReductions && TheLoop->getLoopDepth() > 1) {
      bool HasOrderedReductions =
          any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
            const RecurrenceDescriptor &RdxDesc = Reduction.second;
            return RdxDesc.isOrdered();
          });
      if (HasOrderedReductions) {
        LLVM_DEBUG(
            dbgs() << "LV: Not interleaving scalar ordered reductions.\n");
        return 1;
      }

      // The limit actually applied is the MaxNestedScalarReductionIC option.
      unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
      SmallIC = std::min(SmallIC, F);
      StoresIC = std::min(StoresIC, F);
      LoadsIC = std::min(LoadsIC, F);
    }

    if (EnableLoadStoreRuntimeInterleave &&
        std::max(StoresIC, LoadsIC) > SmallIC) {
      LLVM_DEBUG(
          dbgs() << "LV: Interleaving to saturate store or load ports.\n");
      return std::max(StoresIC, LoadsIC);
    }

    // If there are scalar reductions and TTI has enabled aggressive
    // interleaving for reductions, we will interleave to expose ILP.
    if (InterleaveSmallLoopScalarReduction && VF.isScalar() &&
        AggressivelyInterleaveReductions) {
      LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
      // Interleave no less than SmallIC but not as aggressive as the normal IC
      // to satisfy the rare situation when resources are too limited.
      return std::max(IC / 2, SmallIC);
    } else {
      LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
      return SmallIC;
    }
  }

  // Interleave if this is a large loop (small loops are already dealt with by
  // this point) that could benefit from interleaving.
  if (AggressivelyInterleaveReductions) {
    LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
    return IC;
  }

  LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
  return 1;
}

SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
  // This function calculates the register usage by measuring the highest number
  // of values that are alive at a single location. Obviously, this is a very
  // rough estimation. We scan the loop in topological order and
  // assign a number to each instruction. We use RPO to ensure that defs are
  // met before their users. We assume that each instruction that has in-loop
  // users starts an interval. We record every time that an in-loop value is
  // used, so we have a list of the first and last occurrences of each
  // instruction. Next, we transpose this data structure into a multi map that
  // holds the list of intervals that *end* at a specific location. This multi
  // map allows us to perform a linear search. We scan the instructions linearly
  // and record each time that a new interval starts, by placing it in a set.
  // If we find this value in the multi-map then we remove it from the set.
  // The max register usage is the maximum size of the set.
  // We also search for instructions that are defined outside the loop, but are
  // used inside the loop. We need this number separately from the max-interval
  // usage number because when we unroll, loop-invariant values do not take
  // more registers.
  LoopBlocksDFS DFS(TheLoop);
  DFS.perform(LI);

  RegisterUsage RU;

  // Each 'key' in the map opens a new interval. The values
  // of the map are the index of the 'last seen' usage of the
  // instruction that is the key.
6530 using IntervalMap = DenseMap<Instruction *, unsigned>; 6531 6532 // Maps instruction to its index. 6533 SmallVector<Instruction *, 64> IdxToInstr; 6534 // Marks the end of each interval. 6535 IntervalMap EndPoint; 6536 // Saves the list of instruction indices that are used in the loop. 6537 SmallPtrSet<Instruction *, 8> Ends; 6538 // Saves the list of values that are used in the loop but are 6539 // defined outside the loop, such as arguments and constants. 6540 SmallPtrSet<Value *, 8> LoopInvariants; 6541 6542 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 6543 for (Instruction &I : BB->instructionsWithoutDebug()) { 6544 IdxToInstr.push_back(&I); 6545 6546 // Save the end location of each USE. 6547 for (Value *U : I.operands()) { 6548 auto *Instr = dyn_cast<Instruction>(U); 6549 6550 // Ignore non-instruction values such as arguments, constants, etc. 6551 if (!Instr) 6552 continue; 6553 6554 // If this instruction is outside the loop then record it and continue. 6555 if (!TheLoop->contains(Instr)) { 6556 LoopInvariants.insert(Instr); 6557 continue; 6558 } 6559 6560 // Overwrite previous end points. 6561 EndPoint[Instr] = IdxToInstr.size(); 6562 Ends.insert(Instr); 6563 } 6564 } 6565 } 6566 6567 // Saves the list of intervals that end with the index in 'key'. 6568 using InstrList = SmallVector<Instruction *, 2>; 6569 DenseMap<unsigned, InstrList> TransposeEnds; 6570 6571 // Transpose the EndPoints to a list of values that end at each index. 6572 for (auto &Interval : EndPoint) 6573 TransposeEnds[Interval.second].push_back(Interval.first); 6574 6575 SmallPtrSet<Instruction *, 8> OpenIntervals; 6576 SmallVector<RegisterUsage, 8> RUs(VFs.size()); 6577 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size()); 6578 6579 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); 6580 6581 // A lambda that gets the register usage for the given type and VF. 
6582 const auto &TTICapture = TTI; 6583 auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned { 6584 if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty)) 6585 return 0; 6586 InstructionCost::CostType RegUsage = 6587 *TTICapture.getRegUsageForType(VectorType::get(Ty, VF)).getValue(); 6588 assert(RegUsage >= 0 && RegUsage <= std::numeric_limits<unsigned>::max() && 6589 "Nonsensical values for register usage."); 6590 return RegUsage; 6591 }; 6592 6593 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { 6594 Instruction *I = IdxToInstr[i]; 6595 6596 // Remove all of the instructions that end at this location. 6597 InstrList &List = TransposeEnds[i]; 6598 for (Instruction *ToRemove : List) 6599 OpenIntervals.erase(ToRemove); 6600 6601 // Ignore instructions that are never used within the loop. 6602 if (!Ends.count(I)) 6603 continue; 6604 6605 // Skip ignored values. 6606 if (ValuesToIgnore.count(I)) 6607 continue; 6608 6609 // For each VF find the maximum usage of registers. 6610 for (unsigned j = 0, e = VFs.size(); j < e; ++j) { 6611 // Count the number of live intervals. 6612 SmallMapVector<unsigned, unsigned, 4> RegUsage; 6613 6614 if (VFs[j].isScalar()) { 6615 for (auto Inst : OpenIntervals) { 6616 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6617 if (RegUsage.find(ClassID) == RegUsage.end()) 6618 RegUsage[ClassID] = 1; 6619 else 6620 RegUsage[ClassID] += 1; 6621 } 6622 } else { 6623 collectUniformsAndScalars(VFs[j]); 6624 for (auto Inst : OpenIntervals) { 6625 // Skip ignored values for VF > 1. 
6626 if (VecValuesToIgnore.count(Inst)) 6627 continue; 6628 if (isScalarAfterVectorization(Inst, VFs[j])) { 6629 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6630 if (RegUsage.find(ClassID) == RegUsage.end()) 6631 RegUsage[ClassID] = 1; 6632 else 6633 RegUsage[ClassID] += 1; 6634 } else { 6635 unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType()); 6636 if (RegUsage.find(ClassID) == RegUsage.end()) 6637 RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]); 6638 else 6639 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); 6640 } 6641 } 6642 } 6643 6644 for (auto& pair : RegUsage) { 6645 if (MaxUsages[j].find(pair.first) != MaxUsages[j].end()) 6646 MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second); 6647 else 6648 MaxUsages[j][pair.first] = pair.second; 6649 } 6650 } 6651 6652 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " 6653 << OpenIntervals.size() << '\n'); 6654 6655 // Add the current instruction to the list of open intervals. 6656 OpenIntervals.insert(I); 6657 } 6658 6659 for (unsigned i = 0, e = VFs.size(); i < e; ++i) { 6660 SmallMapVector<unsigned, unsigned, 4> Invariant; 6661 6662 for (auto Inst : LoopInvariants) { 6663 unsigned Usage = 6664 VFs[i].isScalar() ? 
1 : GetRegUsage(Inst->getType(), VFs[i]); 6665 unsigned ClassID = 6666 TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType()); 6667 if (Invariant.find(ClassID) == Invariant.end()) 6668 Invariant[ClassID] = Usage; 6669 else 6670 Invariant[ClassID] += Usage; 6671 } 6672 6673 LLVM_DEBUG({ 6674 dbgs() << "LV(REG): VF = " << VFs[i] << '\n'; 6675 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size() 6676 << " item\n"; 6677 for (const auto &pair : MaxUsages[i]) { 6678 dbgs() << "LV(REG): RegisterClass: " 6679 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6680 << " registers\n"; 6681 } 6682 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size() 6683 << " item\n"; 6684 for (const auto &pair : Invariant) { 6685 dbgs() << "LV(REG): RegisterClass: " 6686 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6687 << " registers\n"; 6688 } 6689 }); 6690 6691 RU.LoopInvariantRegs = Invariant; 6692 RU.MaxLocalUsers = MaxUsages[i]; 6693 RUs[i] = RU; 6694 } 6695 6696 return RUs; 6697 } 6698 6699 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){ 6700 // TODO: Cost model for emulated masked load/store is completely 6701 // broken. This hack guides the cost model to use an artificially 6702 // high enough value to practically disable vectorization with such 6703 // operations, except where previously deployed legality hack allowed 6704 // using very low cost values. This is to avoid regressions coming simply 6705 // from moving "masked load/store" check from legality to cost model. 6706 // Masked Load/Gather emulation was previously never allowed. 6707 // Limited number of Masked Store/Scatter emulation was allowed. 
6708 assert(isPredicatedInst(I) && 6709 "Expecting a scalar emulated instruction"); 6710 return isa<LoadInst>(I) || 6711 (isa<StoreInst>(I) && 6712 NumPredStores > NumberOfStoresToPredicate); 6713 } 6714 6715 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) { 6716 // If we aren't vectorizing the loop, or if we've already collected the 6717 // instructions to scalarize, there's nothing to do. Collection may already 6718 // have occurred if we have a user-selected VF and are now computing the 6719 // expected cost for interleaving. 6720 if (VF.isScalar() || VF.isZero() || 6721 InstsToScalarize.find(VF) != InstsToScalarize.end()) 6722 return; 6723 6724 // Initialize a mapping for VF in InstsToScalalarize. If we find that it's 6725 // not profitable to scalarize any instructions, the presence of VF in the 6726 // map will indicate that we've analyzed it already. 6727 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF]; 6728 6729 // Find all the instructions that are scalar with predication in the loop and 6730 // determine if it would be better to not if-convert the blocks they are in. 6731 // If so, we also record the instructions to scalarize. 6732 for (BasicBlock *BB : TheLoop->blocks()) { 6733 if (!blockNeedsPredicationForAnyReason(BB)) 6734 continue; 6735 for (Instruction &I : *BB) 6736 if (isScalarWithPredication(&I)) { 6737 ScalarCostsTy ScalarCosts; 6738 // Do not apply discount if scalable, because that would lead to 6739 // invalid scalarization costs. 6740 // Do not apply discount logic if hacked cost is needed 6741 // for emulated masked memrefs. 6742 if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I) && 6743 computePredInstDiscount(&I, ScalarCosts, VF) >= 0) 6744 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end()); 6745 // Remember that BB will remain after vectorization. 
6746 PredicatedBBsAfterVectorization.insert(BB); 6747 } 6748 } 6749 } 6750 6751 int LoopVectorizationCostModel::computePredInstDiscount( 6752 Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) { 6753 assert(!isUniformAfterVectorization(PredInst, VF) && 6754 "Instruction marked uniform-after-vectorization will be predicated"); 6755 6756 // Initialize the discount to zero, meaning that the scalar version and the 6757 // vector version cost the same. 6758 InstructionCost Discount = 0; 6759 6760 // Holds instructions to analyze. The instructions we visit are mapped in 6761 // ScalarCosts. Those instructions are the ones that would be scalarized if 6762 // we find that the scalar version costs less. 6763 SmallVector<Instruction *, 8> Worklist; 6764 6765 // Returns true if the given instruction can be scalarized. 6766 auto canBeScalarized = [&](Instruction *I) -> bool { 6767 // We only attempt to scalarize instructions forming a single-use chain 6768 // from the original predicated block that would otherwise be vectorized. 6769 // Although not strictly necessary, we give up on instructions we know will 6770 // already be scalar to avoid traversing chains that are unlikely to be 6771 // beneficial. 6772 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() || 6773 isScalarAfterVectorization(I, VF)) 6774 return false; 6775 6776 // If the instruction is scalar with predication, it will be analyzed 6777 // separately. We ignore it within the context of PredInst. 6778 if (isScalarWithPredication(I)) 6779 return false; 6780 6781 // If any of the instruction's operands are uniform after vectorization, 6782 // the instruction cannot be scalarized. This prevents, for example, a 6783 // masked load from being scalarized. 6784 // 6785 // We assume we will only emit a value for lane zero of an instruction 6786 // marked uniform after vectorization, rather than VF identical values. 
// Thus, if we scalarize an instruction that uses a uniform, we would
    // create uses of values corresponding to the lanes we aren't emitting code
    // for. This behavior can be changed by allowing getScalarValue to clone
    // the lane zero values for uniforms rather than asserting.
    for (Use &U : I->operands())
      if (auto *J = dyn_cast<Instruction>(U.get()))
        if (isUniformAfterVectorization(J, VF))
          return false;

    // Otherwise, we can scalarize the instruction.
    return true;
  };

  // Compute the expected cost discount from scalarizing the entire expression
  // feeding the predicated instruction. We currently only consider expressions
  // that are single-use instruction chains.
  Worklist.push_back(PredInst);
  while (!Worklist.empty()) {
    Instruction *I = Worklist.pop_back_val();

    // If we've already analyzed the instruction, there's nothing to do.
    if (ScalarCosts.find(I) != ScalarCosts.end())
      continue;

    // Compute the cost of the vector instruction. Note that this cost already
    // includes the scalarization overhead of the predicated instruction.
    InstructionCost VectorCost = getInstructionCost(I, VF).first;

    // Compute the cost of the scalarized instruction. This cost is the cost of
    // the instruction as if it wasn't if-converted and instead remained in the
    // predicated block. We will scale this cost by block probability after
    // computing the scalarization overhead.
    // (VF.getFixedValue() copies of the VF=1 cost, one per lane.)
    InstructionCost ScalarCost =
        VF.getFixedValue() *
        getInstructionCost(I, ElementCount::getFixed(1)).first;

    // Compute the scalarization overhead of needed insertelement instructions
    // and phi nodes.
    // Only instructions that produce a value (non-void type) need them.
    if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
      ScalarCost += TTI.getScalarizationOverhead(
          cast<VectorType>(ToVectorTy(I->getType(), VF)),
          APInt::getAllOnes(VF.getFixedValue()), true, false);
      ScalarCost +=
          VF.getFixedValue() *
          TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput);
    }

    // Compute the scalarization overhead of needed extractelement
    // instructions. For each of the instruction's operands, if the operand can
    // be scalarized, add it to the worklist; otherwise, account for the
    // overhead.
    for (Use &U : I->operands())
      if (auto *J = dyn_cast<Instruction>(U.get())) {
        assert(VectorType::isValidElementType(J->getType()) &&
               "Instruction has non-scalar type");
        if (canBeScalarized(J))
          Worklist.push_back(J);
        else if (needsExtract(J, VF)) {
          ScalarCost += TTI.getScalarizationOverhead(
              cast<VectorType>(ToVectorTy(J->getType(), VF)),
              APInt::getAllOnes(VF.getFixedValue()), false, true);
        }
      }

    // Scale the total scalar cost by block probability.
    ScalarCost /= getReciprocalPredBlockProb();

    // Compute the discount. A non-negative discount means the vector version
    // of the instruction costs more, and scalarizing would be beneficial.
    Discount += VectorCost - ScalarCost;
    // Recording the cost also marks I as analyzed for the check at the top of
    // the loop.
    ScalarCosts[I] = ScalarCost;
  }

  // NOTE(review): this dereferences the cost's value without checking that
  // Discount is valid -- presumably callers only reach here with valid costs
  // (the visible caller skips scalable VFs); confirm.
  return *Discount.getValue();
}

// Sums the per-instruction costs of the loop for vectorization factor \p VF.
// If \p Invalid is non-null, every instruction whose cost is invalid at \p VF
// is appended to it together with \p VF.
LoopVectorizationCostModel::VectorizationCostTy
LoopVectorizationCostModel::expectedCost(
    ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) {
  VectorizationCostTy Cost;

  // For each block.
  for (BasicBlock *BB : TheLoop->blocks()) {
    VectorizationCostTy BlockCost;

    // For each instruction in the old loop.
    for (Instruction &I : BB->instructionsWithoutDebug()) {
      // Skip ignored values.
6875 if (ValuesToIgnore.count(&I) || 6876 (VF.isVector() && VecValuesToIgnore.count(&I))) 6877 continue; 6878 6879 VectorizationCostTy C = getInstructionCost(&I, VF); 6880 6881 // Check if we should override the cost. 6882 if (C.first.isValid() && 6883 ForceTargetInstructionCost.getNumOccurrences() > 0) 6884 C.first = InstructionCost(ForceTargetInstructionCost); 6885 6886 // Keep a list of instructions with invalid costs. 6887 if (Invalid && !C.first.isValid()) 6888 Invalid->emplace_back(&I, VF); 6889 6890 BlockCost.first += C.first; 6891 BlockCost.second |= C.second; 6892 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first 6893 << " for VF " << VF << " For instruction: " << I 6894 << '\n'); 6895 } 6896 6897 // If we are vectorizing a predicated block, it will have been 6898 // if-converted. This means that the block's instructions (aside from 6899 // stores and instructions that may divide by zero) will now be 6900 // unconditionally executed. For the scalar case, we may not always execute 6901 // the predicated block, if it is an if-else block. Thus, scale the block's 6902 // cost by the probability of executing it. blockNeedsPredication from 6903 // Legal is used so as to not include all blocks in tail folded loops. 6904 if (VF.isScalar() && Legal->blockNeedsPredication(BB)) 6905 BlockCost.first /= getReciprocalPredBlockProb(); 6906 6907 Cost.first += BlockCost.first; 6908 Cost.second |= BlockCost.second; 6909 } 6910 6911 return Cost; 6912 } 6913 6914 /// Gets Address Access SCEV after verifying that the access pattern 6915 /// is loop invariant except the induction variable dependence. 6916 /// 6917 /// This SCEV can be sent to the Target in order to estimate the address 6918 /// calculation cost. 
6919 static const SCEV *getAddressAccessSCEV( 6920 Value *Ptr, 6921 LoopVectorizationLegality *Legal, 6922 PredicatedScalarEvolution &PSE, 6923 const Loop *TheLoop) { 6924 6925 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); 6926 if (!Gep) 6927 return nullptr; 6928 6929 // We are looking for a gep with all loop invariant indices except for one 6930 // which should be an induction variable. 6931 auto SE = PSE.getSE(); 6932 unsigned NumOperands = Gep->getNumOperands(); 6933 for (unsigned i = 1; i < NumOperands; ++i) { 6934 Value *Opd = Gep->getOperand(i); 6935 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 6936 !Legal->isInductionVariable(Opd)) 6937 return nullptr; 6938 } 6939 6940 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. 6941 return PSE.getSCEV(Ptr); 6942 } 6943 6944 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { 6945 return Legal->hasStride(I->getOperand(0)) || 6946 Legal->hasStride(I->getOperand(1)); 6947 } 6948 6949 InstructionCost 6950 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, 6951 ElementCount VF) { 6952 assert(VF.isVector() && 6953 "Scalarization cost of instruction implies vectorization."); 6954 if (VF.isScalable()) 6955 return InstructionCost::getInvalid(); 6956 6957 Type *ValTy = getLoadStoreType(I); 6958 auto SE = PSE.getSE(); 6959 6960 unsigned AS = getLoadStoreAddressSpace(I); 6961 Value *Ptr = getLoadStorePointerOperand(I); 6962 Type *PtrTy = ToVectorTy(Ptr->getType(), VF); 6963 // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost` 6964 // that it is being called from this specific place. 6965 6966 // Figure out whether the access is strided and get the stride value 6967 // if it's known in compile time 6968 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); 6969 6970 // Get the cost of the scalar memory instruction and address computation. 
6971 InstructionCost Cost = 6972 VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); 6973 6974 // Don't pass *I here, since it is scalar but will actually be part of a 6975 // vectorized loop where the user of it is a vectorized instruction. 6976 const Align Alignment = getLoadStoreAlignment(I); 6977 Cost += VF.getKnownMinValue() * 6978 TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, 6979 AS, TTI::TCK_RecipThroughput); 6980 6981 // Get the overhead of the extractelement and insertelement instructions 6982 // we might create due to scalarization. 6983 Cost += getScalarizationOverhead(I, VF); 6984 6985 // If we have a predicated load/store, it will need extra i1 extracts and 6986 // conditional branches, but may not be executed for each vector lane. Scale 6987 // the cost by the probability of executing the predicated block. 6988 if (isPredicatedInst(I)) { 6989 Cost /= getReciprocalPredBlockProb(); 6990 6991 // Add the cost of an i1 extract and a branch 6992 auto *Vec_i1Ty = 6993 VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF); 6994 Cost += TTI.getScalarizationOverhead( 6995 Vec_i1Ty, APInt::getAllOnes(VF.getKnownMinValue()), 6996 /*Insert=*/false, /*Extract=*/true); 6997 Cost += TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput); 6998 6999 if (useEmulatedMaskMemRefHack(I)) 7000 // Artificially setting to a high enough value to practically disable 7001 // vectorization with such operations. 
7002 Cost = 3000000; 7003 } 7004 7005 return Cost; 7006 } 7007 7008 InstructionCost 7009 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, 7010 ElementCount VF) { 7011 Type *ValTy = getLoadStoreType(I); 7012 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 7013 Value *Ptr = getLoadStorePointerOperand(I); 7014 unsigned AS = getLoadStoreAddressSpace(I); 7015 int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr); 7016 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 7017 7018 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 7019 "Stride should be 1 or -1 for consecutive memory access"); 7020 const Align Alignment = getLoadStoreAlignment(I); 7021 InstructionCost Cost = 0; 7022 if (Legal->isMaskRequired(I)) 7023 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 7024 CostKind); 7025 else 7026 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 7027 CostKind, I); 7028 7029 bool Reverse = ConsecutiveStride < 0; 7030 if (Reverse) 7031 Cost += 7032 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0); 7033 return Cost; 7034 } 7035 7036 InstructionCost 7037 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, 7038 ElementCount VF) { 7039 assert(Legal->isUniformMemOp(*I)); 7040 7041 Type *ValTy = getLoadStoreType(I); 7042 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 7043 const Align Alignment = getLoadStoreAlignment(I); 7044 unsigned AS = getLoadStoreAddressSpace(I); 7045 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 7046 if (isa<LoadInst>(I)) { 7047 return TTI.getAddressComputationCost(ValTy) + 7048 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS, 7049 CostKind) + 7050 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); 7051 } 7052 StoreInst *SI = cast<StoreInst>(I); 7053 7054 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); 7055 return 
TTI.getAddressComputationCost(ValTy) + 7056 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, 7057 CostKind) + 7058 (isLoopInvariantStoreValue 7059 ? 0 7060 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, 7061 VF.getKnownMinValue() - 1)); 7062 } 7063 7064 InstructionCost 7065 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, 7066 ElementCount VF) { 7067 Type *ValTy = getLoadStoreType(I); 7068 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 7069 const Align Alignment = getLoadStoreAlignment(I); 7070 const Value *Ptr = getLoadStorePointerOperand(I); 7071 7072 return TTI.getAddressComputationCost(VectorTy) + 7073 TTI.getGatherScatterOpCost( 7074 I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment, 7075 TargetTransformInfo::TCK_RecipThroughput, I); 7076 } 7077 7078 InstructionCost 7079 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, 7080 ElementCount VF) { 7081 // TODO: Once we have support for interleaving with scalable vectors 7082 // we can calculate the cost properly here. 7083 if (VF.isScalable()) 7084 return InstructionCost::getInvalid(); 7085 7086 Type *ValTy = getLoadStoreType(I); 7087 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 7088 unsigned AS = getLoadStoreAddressSpace(I); 7089 7090 auto Group = getInterleavedAccessGroup(I); 7091 assert(Group && "Fail to get an interleaved access group."); 7092 7093 unsigned InterleaveFactor = Group->getFactor(); 7094 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); 7095 7096 // Holds the indices of existing members in the interleaved group. 7097 SmallVector<unsigned, 4> Indices; 7098 for (unsigned IF = 0; IF < InterleaveFactor; IF++) 7099 if (Group->getMember(IF)) 7100 Indices.push_back(IF); 7101 7102 // Calculate the cost of the whole interleaved group. 
7103 bool UseMaskForGaps = 7104 (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) || 7105 (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor())); 7106 InstructionCost Cost = TTI.getInterleavedMemoryOpCost( 7107 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(), 7108 AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps); 7109 7110 if (Group->isReverse()) { 7111 // TODO: Add support for reversed masked interleaved access. 7112 assert(!Legal->isMaskRequired(I) && 7113 "Reverse masked interleaved access not supported."); 7114 Cost += 7115 Group->getNumMembers() * 7116 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0); 7117 } 7118 return Cost; 7119 } 7120 7121 Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost( 7122 Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) { 7123 using namespace llvm::PatternMatch; 7124 // Early exit for no inloop reductions 7125 if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty)) 7126 return None; 7127 auto *VectorTy = cast<VectorType>(Ty); 7128 7129 // We are looking for a pattern of, and finding the minimal acceptable cost: 7130 // reduce(mul(ext(A), ext(B))) or 7131 // reduce(mul(A, B)) or 7132 // reduce(ext(A)) or 7133 // reduce(A). 7134 // The basic idea is that we walk down the tree to do that, finding the root 7135 // reduction instruction in InLoopReductionImmediateChains. From there we find 7136 // the pattern of mul/ext and test the cost of the entire pattern vs the cost 7137 // of the components. If the reduction cost is lower then we return it for the 7138 // reduction instruction and 0 for the other instructions in the pattern. If 7139 // it is not we return an invalid cost specifying the orignal cost method 7140 // should be used. 
7141 Instruction *RetI = I; 7142 if (match(RetI, m_ZExtOrSExt(m_Value()))) { 7143 if (!RetI->hasOneUser()) 7144 return None; 7145 RetI = RetI->user_back(); 7146 } 7147 if (match(RetI, m_Mul(m_Value(), m_Value())) && 7148 RetI->user_back()->getOpcode() == Instruction::Add) { 7149 if (!RetI->hasOneUser()) 7150 return None; 7151 RetI = RetI->user_back(); 7152 } 7153 7154 // Test if the found instruction is a reduction, and if not return an invalid 7155 // cost specifying the parent to use the original cost modelling. 7156 if (!InLoopReductionImmediateChains.count(RetI)) 7157 return None; 7158 7159 // Find the reduction this chain is a part of and calculate the basic cost of 7160 // the reduction on its own. 7161 Instruction *LastChain = InLoopReductionImmediateChains[RetI]; 7162 Instruction *ReductionPhi = LastChain; 7163 while (!isa<PHINode>(ReductionPhi)) 7164 ReductionPhi = InLoopReductionImmediateChains[ReductionPhi]; 7165 7166 const RecurrenceDescriptor &RdxDesc = 7167 Legal->getReductionVars()[cast<PHINode>(ReductionPhi)]; 7168 7169 InstructionCost BaseCost = TTI.getArithmeticReductionCost( 7170 RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind); 7171 7172 // For a call to the llvm.fmuladd intrinsic we need to add the cost of a 7173 // normal fmul instruction to the cost of the fadd reduction. 7174 if (RdxDesc.getRecurrenceKind() == RecurKind::FMulAdd) 7175 BaseCost += 7176 TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind); 7177 7178 // If we're using ordered reductions then we can just return the base cost 7179 // here, since getArithmeticReductionCost calculates the full ordered 7180 // reduction cost when FP reassociation is not allowed. 7181 if (useOrderedReductions(RdxDesc)) 7182 return BaseCost; 7183 7184 // Get the operand that was not the reduction chain and match it to one of the 7185 // patterns, returning the better cost if it is found. 7186 Instruction *RedOp = RetI->getOperand(1) == LastChain 7187 ? 
dyn_cast<Instruction>(RetI->getOperand(0)) 7188 : dyn_cast<Instruction>(RetI->getOperand(1)); 7189 7190 VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy); 7191 7192 Instruction *Op0, *Op1; 7193 if (RedOp && 7194 match(RedOp, 7195 m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) && 7196 match(Op0, m_ZExtOrSExt(m_Value())) && 7197 Op0->getOpcode() == Op1->getOpcode() && 7198 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() && 7199 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) && 7200 (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) { 7201 7202 // Matched reduce(ext(mul(ext(A), ext(B))) 7203 // Note that the extend opcodes need to all match, or if A==B they will have 7204 // been converted to zext(mul(sext(A), sext(A))) as it is known positive, 7205 // which is equally fine. 7206 bool IsUnsigned = isa<ZExtInst>(Op0); 7207 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy); 7208 auto *MulType = VectorType::get(Op0->getType(), VectorTy); 7209 7210 InstructionCost ExtCost = 7211 TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType, 7212 TTI::CastContextHint::None, CostKind, Op0); 7213 InstructionCost MulCost = 7214 TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind); 7215 InstructionCost Ext2Cost = 7216 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType, 7217 TTI::CastContextHint::None, CostKind, RedOp); 7218 7219 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 7220 /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 7221 CostKind); 7222 7223 if (RedCost.isValid() && 7224 RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost) 7225 return I == RetI ? 
RedCost : 0; 7226 } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) && 7227 !TheLoop->isLoopInvariant(RedOp)) { 7228 // Matched reduce(ext(A)) 7229 bool IsUnsigned = isa<ZExtInst>(RedOp); 7230 auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy); 7231 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 7232 /*IsMLA=*/false, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 7233 CostKind); 7234 7235 InstructionCost ExtCost = 7236 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType, 7237 TTI::CastContextHint::None, CostKind, RedOp); 7238 if (RedCost.isValid() && RedCost < BaseCost + ExtCost) 7239 return I == RetI ? RedCost : 0; 7240 } else if (RedOp && 7241 match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) { 7242 if (match(Op0, m_ZExtOrSExt(m_Value())) && 7243 Op0->getOpcode() == Op1->getOpcode() && 7244 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() && 7245 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) { 7246 bool IsUnsigned = isa<ZExtInst>(Op0); 7247 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy); 7248 // Matched reduce(mul(ext, ext)) 7249 InstructionCost ExtCost = 7250 TTI.getCastInstrCost(Op0->getOpcode(), VectorTy, ExtType, 7251 TTI::CastContextHint::None, CostKind, Op0); 7252 InstructionCost MulCost = 7253 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 7254 7255 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 7256 /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 7257 CostKind); 7258 7259 if (RedCost.isValid() && RedCost < ExtCost * 2 + MulCost + BaseCost) 7260 return I == RetI ? 
RedCost : 0; 7261 } else if (!match(I, m_ZExtOrSExt(m_Value()))) { 7262 // Matched reduce(mul()) 7263 InstructionCost MulCost = 7264 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 7265 7266 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 7267 /*IsMLA=*/true, true, RdxDesc.getRecurrenceType(), VectorTy, 7268 CostKind); 7269 7270 if (RedCost.isValid() && RedCost < MulCost + BaseCost) 7271 return I == RetI ? RedCost : 0; 7272 } 7273 } 7274 7275 return I == RetI ? Optional<InstructionCost>(BaseCost) : None; 7276 } 7277 7278 InstructionCost 7279 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, 7280 ElementCount VF) { 7281 // Calculate scalar cost only. Vectorization cost should be ready at this 7282 // moment. 7283 if (VF.isScalar()) { 7284 Type *ValTy = getLoadStoreType(I); 7285 const Align Alignment = getLoadStoreAlignment(I); 7286 unsigned AS = getLoadStoreAddressSpace(I); 7287 7288 return TTI.getAddressComputationCost(ValTy) + 7289 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, 7290 TTI::TCK_RecipThroughput, I); 7291 } 7292 return getWideningCost(I, VF); 7293 } 7294 7295 LoopVectorizationCostModel::VectorizationCostTy 7296 LoopVectorizationCostModel::getInstructionCost(Instruction *I, 7297 ElementCount VF) { 7298 // If we know that this instruction will remain uniform, check the cost of 7299 // the scalar version. 7300 if (isUniformAfterVectorization(I, VF)) 7301 VF = ElementCount::getFixed(1); 7302 7303 if (VF.isVector() && isProfitableToScalarize(I, VF)) 7304 return VectorizationCostTy(InstsToScalarize[VF][I], false); 7305 7306 // Forced scalars do not have any scalarization overhead. 
7307 auto ForcedScalar = ForcedScalars.find(VF); 7308 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) { 7309 auto InstSet = ForcedScalar->second; 7310 if (InstSet.count(I)) 7311 return VectorizationCostTy( 7312 (getInstructionCost(I, ElementCount::getFixed(1)).first * 7313 VF.getKnownMinValue()), 7314 false); 7315 } 7316 7317 Type *VectorTy; 7318 InstructionCost C = getInstructionCost(I, VF, VectorTy); 7319 7320 bool TypeNotScalarized = false; 7321 if (VF.isVector() && VectorTy->isVectorTy()) { 7322 unsigned NumParts = TTI.getNumberOfParts(VectorTy); 7323 if (NumParts) 7324 TypeNotScalarized = NumParts < VF.getKnownMinValue(); 7325 else 7326 C = InstructionCost::getInvalid(); 7327 } 7328 return VectorizationCostTy(C, TypeNotScalarized); 7329 } 7330 7331 InstructionCost 7332 LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, 7333 ElementCount VF) const { 7334 7335 // There is no mechanism yet to create a scalable scalarization loop, 7336 // so this is currently Invalid. 7337 if (VF.isScalable()) 7338 return InstructionCost::getInvalid(); 7339 7340 if (VF.isScalar()) 7341 return 0; 7342 7343 InstructionCost Cost = 0; 7344 Type *RetTy = ToVectorTy(I->getType(), VF); 7345 if (!RetTy->isVoidTy() && 7346 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) 7347 Cost += TTI.getScalarizationOverhead( 7348 cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()), true, 7349 false); 7350 7351 // Some targets keep addresses scalar. 7352 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) 7353 return Cost; 7354 7355 // Some targets support efficient element stores. 7356 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore()) 7357 return Cost; 7358 7359 // Collect operands to consider. 7360 CallInst *CI = dyn_cast<CallInst>(I); 7361 Instruction::op_range Ops = CI ? 
CI->args() : I->operands(); 7362 7363 // Skip operands that do not require extraction/scalarization and do not incur 7364 // any overhead. 7365 SmallVector<Type *> Tys; 7366 for (auto *V : filterExtractingOperands(Ops, VF)) 7367 Tys.push_back(MaybeVectorizeType(V->getType(), VF)); 7368 return Cost + TTI.getOperandsScalarizationOverhead( 7369 filterExtractingOperands(Ops, VF), Tys); 7370 } 7371 7372 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { 7373 if (VF.isScalar()) 7374 return; 7375 NumPredStores = 0; 7376 for (BasicBlock *BB : TheLoop->blocks()) { 7377 // For each instruction in the old loop. 7378 for (Instruction &I : *BB) { 7379 Value *Ptr = getLoadStorePointerOperand(&I); 7380 if (!Ptr) 7381 continue; 7382 7383 // TODO: We should generate better code and update the cost model for 7384 // predicated uniform stores. Today they are treated as any other 7385 // predicated store (see added test cases in 7386 // invariant-store-vectorization.ll). 7387 if (isa<StoreInst>(&I) && isScalarWithPredication(&I)) 7388 NumPredStores++; 7389 7390 if (Legal->isUniformMemOp(I)) { 7391 // TODO: Avoid replicating loads and stores instead of 7392 // relying on instcombine to remove them. 7393 // Load: Scalar load + broadcast 7394 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract 7395 InstructionCost Cost; 7396 if (isa<StoreInst>(&I) && VF.isScalable() && 7397 isLegalGatherOrScatter(&I)) { 7398 Cost = getGatherScatterCost(&I, VF); 7399 setWideningDecision(&I, VF, CM_GatherScatter, Cost); 7400 } else { 7401 assert((isa<LoadInst>(&I) || !VF.isScalable()) && 7402 "Cannot yet scalarize uniform stores"); 7403 Cost = getUniformMemOpCost(&I, VF); 7404 setWideningDecision(&I, VF, CM_Scalarize, Cost); 7405 } 7406 continue; 7407 } 7408 7409 // We assume that widening is the best solution when possible. 
7410 if (memoryInstructionCanBeWidened(&I, VF)) { 7411 InstructionCost Cost = getConsecutiveMemOpCost(&I, VF); 7412 int ConsecutiveStride = Legal->isConsecutivePtr( 7413 getLoadStoreType(&I), getLoadStorePointerOperand(&I)); 7414 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 7415 "Expected consecutive stride."); 7416 InstWidening Decision = 7417 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; 7418 setWideningDecision(&I, VF, Decision, Cost); 7419 continue; 7420 } 7421 7422 // Choose between Interleaving, Gather/Scatter or Scalarization. 7423 InstructionCost InterleaveCost = InstructionCost::getInvalid(); 7424 unsigned NumAccesses = 1; 7425 if (isAccessInterleaved(&I)) { 7426 auto Group = getInterleavedAccessGroup(&I); 7427 assert(Group && "Fail to get an interleaved access group."); 7428 7429 // Make one decision for the whole group. 7430 if (getWideningDecision(&I, VF) != CM_Unknown) 7431 continue; 7432 7433 NumAccesses = Group->getNumMembers(); 7434 if (interleavedAccessCanBeWidened(&I, VF)) 7435 InterleaveCost = getInterleaveGroupCost(&I, VF); 7436 } 7437 7438 InstructionCost GatherScatterCost = 7439 isLegalGatherOrScatter(&I) 7440 ? getGatherScatterCost(&I, VF) * NumAccesses 7441 : InstructionCost::getInvalid(); 7442 7443 InstructionCost ScalarizationCost = 7444 getMemInstScalarizationCost(&I, VF) * NumAccesses; 7445 7446 // Choose better solution for the current VF, 7447 // write down this decision and use it during vectorization. 
7448 InstructionCost Cost; 7449 InstWidening Decision; 7450 if (InterleaveCost <= GatherScatterCost && 7451 InterleaveCost < ScalarizationCost) { 7452 Decision = CM_Interleave; 7453 Cost = InterleaveCost; 7454 } else if (GatherScatterCost < ScalarizationCost) { 7455 Decision = CM_GatherScatter; 7456 Cost = GatherScatterCost; 7457 } else { 7458 Decision = CM_Scalarize; 7459 Cost = ScalarizationCost; 7460 } 7461 // If the instructions belongs to an interleave group, the whole group 7462 // receives the same decision. The whole group receives the cost, but 7463 // the cost will actually be assigned to one instruction. 7464 if (auto Group = getInterleavedAccessGroup(&I)) 7465 setWideningDecision(Group, VF, Decision, Cost); 7466 else 7467 setWideningDecision(&I, VF, Decision, Cost); 7468 } 7469 } 7470 7471 // Make sure that any load of address and any other address computation 7472 // remains scalar unless there is gather/scatter support. This avoids 7473 // inevitable extracts into address registers, and also has the benefit of 7474 // activating LSR more, since that pass can't optimize vectorized 7475 // addresses. 7476 if (TTI.prefersVectorizedAddressing()) 7477 return; 7478 7479 // Start with all scalar pointer uses. 7480 SmallPtrSet<Instruction *, 8> AddrDefs; 7481 for (BasicBlock *BB : TheLoop->blocks()) 7482 for (Instruction &I : *BB) { 7483 Instruction *PtrDef = 7484 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 7485 if (PtrDef && TheLoop->contains(PtrDef) && 7486 getWideningDecision(&I, VF) != CM_GatherScatter) 7487 AddrDefs.insert(PtrDef); 7488 } 7489 7490 // Add all instructions used to generate the addresses. 
7491 SmallVector<Instruction *, 4> Worklist; 7492 append_range(Worklist, AddrDefs); 7493 while (!Worklist.empty()) { 7494 Instruction *I = Worklist.pop_back_val(); 7495 for (auto &Op : I->operands()) 7496 if (auto *InstOp = dyn_cast<Instruction>(Op)) 7497 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) && 7498 AddrDefs.insert(InstOp).second) 7499 Worklist.push_back(InstOp); 7500 } 7501 7502 for (auto *I : AddrDefs) { 7503 if (isa<LoadInst>(I)) { 7504 // Setting the desired widening decision should ideally be handled in 7505 // by cost functions, but since this involves the task of finding out 7506 // if the loaded register is involved in an address computation, it is 7507 // instead changed here when we know this is the case. 7508 InstWidening Decision = getWideningDecision(I, VF); 7509 if (Decision == CM_Widen || Decision == CM_Widen_Reverse) 7510 // Scalarize a widened load of address. 7511 setWideningDecision( 7512 I, VF, CM_Scalarize, 7513 (VF.getKnownMinValue() * 7514 getMemoryInstructionCost(I, ElementCount::getFixed(1)))); 7515 else if (auto Group = getInterleavedAccessGroup(I)) { 7516 // Scalarize an interleave group of address loads. 7517 for (unsigned I = 0; I < Group->getFactor(); ++I) { 7518 if (Instruction *Member = Group->getMember(I)) 7519 setWideningDecision( 7520 Member, VF, CM_Scalarize, 7521 (VF.getKnownMinValue() * 7522 getMemoryInstructionCost(Member, ElementCount::getFixed(1)))); 7523 } 7524 } 7525 } else 7526 // Make sure I gets scalarized and a cost estimate without 7527 // scalarization overhead. 
7528 ForcedScalars[VF].insert(I); 7529 } 7530 } 7531 7532 InstructionCost 7533 LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, 7534 Type *&VectorTy) { 7535 Type *RetTy = I->getType(); 7536 if (canTruncateToMinimalBitwidth(I, VF)) 7537 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); 7538 auto SE = PSE.getSE(); 7539 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 7540 7541 auto hasSingleCopyAfterVectorization = [this](Instruction *I, 7542 ElementCount VF) -> bool { 7543 if (VF.isScalar()) 7544 return true; 7545 7546 auto Scalarized = InstsToScalarize.find(VF); 7547 assert(Scalarized != InstsToScalarize.end() && 7548 "VF not yet analyzed for scalarization profitability"); 7549 return !Scalarized->second.count(I) && 7550 llvm::all_of(I->users(), [&](User *U) { 7551 auto *UI = cast<Instruction>(U); 7552 return !Scalarized->second.count(UI); 7553 }); 7554 }; 7555 (void) hasSingleCopyAfterVectorization; 7556 7557 if (isScalarAfterVectorization(I, VF)) { 7558 // With the exception of GEPs and PHIs, after scalarization there should 7559 // only be one copy of the instruction generated in the loop. This is 7560 // because the VF is either 1, or any instructions that need scalarizing 7561 // have already been dealt with by the the time we get here. As a result, 7562 // it means we don't have to multiply the instruction cost by VF. 7563 assert(I->getOpcode() == Instruction::GetElementPtr || 7564 I->getOpcode() == Instruction::PHI || 7565 (I->getOpcode() == Instruction::BitCast && 7566 I->getType()->isPointerTy()) || 7567 hasSingleCopyAfterVectorization(I, VF)); 7568 VectorTy = RetTy; 7569 } else 7570 VectorTy = ToVectorTy(RetTy, VF); 7571 7572 // TODO: We need to estimate the cost of intrinsic calls. 
7573 switch (I->getOpcode()) { 7574 case Instruction::GetElementPtr: 7575 // We mark this instruction as zero-cost because the cost of GEPs in 7576 // vectorized code depends on whether the corresponding memory instruction 7577 // is scalarized or not. Therefore, we handle GEPs with the memory 7578 // instruction cost. 7579 return 0; 7580 case Instruction::Br: { 7581 // In cases of scalarized and predicated instructions, there will be VF 7582 // predicated blocks in the vectorized loop. Each branch around these 7583 // blocks requires also an extract of its vector compare i1 element. 7584 bool ScalarPredicatedBB = false; 7585 BranchInst *BI = cast<BranchInst>(I); 7586 if (VF.isVector() && BI->isConditional() && 7587 (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) || 7588 PredicatedBBsAfterVectorization.count(BI->getSuccessor(1)))) 7589 ScalarPredicatedBB = true; 7590 7591 if (ScalarPredicatedBB) { 7592 // Not possible to scalarize scalable vector with predicated instructions. 7593 if (VF.isScalable()) 7594 return InstructionCost::getInvalid(); 7595 // Return cost for branches around scalarized and predicated blocks. 7596 auto *Vec_i1Ty = 7597 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); 7598 return ( 7599 TTI.getScalarizationOverhead( 7600 Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()), false, true) + 7601 (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue())); 7602 } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar()) 7603 // The back-edge branch will remain, as will all scalar branches. 7604 return TTI.getCFInstrCost(Instruction::Br, CostKind); 7605 else 7606 // This branch will be eliminated by if-conversion. 7607 return 0; 7608 // Note: We currently assume zero cost for an unconditional branch inside 7609 // a predicated block since it will become a fall-through, although we 7610 // may decide in the future to call TTI for all branches. 
7611 } 7612 case Instruction::PHI: { 7613 auto *Phi = cast<PHINode>(I); 7614 7615 // First-order recurrences are replaced by vector shuffles inside the loop. 7616 // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. 7617 if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi)) 7618 return TTI.getShuffleCost( 7619 TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy), 7620 None, VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1)); 7621 7622 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 7623 // converted into select instructions. We require N - 1 selects per phi 7624 // node, where N is the number of incoming values. 7625 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) 7626 return (Phi->getNumIncomingValues() - 1) * 7627 TTI.getCmpSelInstrCost( 7628 Instruction::Select, ToVectorTy(Phi->getType(), VF), 7629 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF), 7630 CmpInst::BAD_ICMP_PREDICATE, CostKind); 7631 7632 return TTI.getCFInstrCost(Instruction::PHI, CostKind); 7633 } 7634 case Instruction::UDiv: 7635 case Instruction::SDiv: 7636 case Instruction::URem: 7637 case Instruction::SRem: 7638 // If we have a predicated instruction, it may not be executed for each 7639 // vector lane. Get the scalarization cost and scale this amount by the 7640 // probability of executing the predicated block. If the instruction is not 7641 // predicated, we fall through to the next case. 7642 if (VF.isVector() && isScalarWithPredication(I)) { 7643 InstructionCost Cost = 0; 7644 7645 // These instructions have a non-void type, so account for the phi nodes 7646 // that we will create. This cost is likely to be zero. The phi node 7647 // cost, if any, should be scaled by the block probability because it 7648 // models a copy at the end of each predicated block. 
7649 Cost += VF.getKnownMinValue() * 7650 TTI.getCFInstrCost(Instruction::PHI, CostKind); 7651 7652 // The cost of the non-predicated instruction. 7653 Cost += VF.getKnownMinValue() * 7654 TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind); 7655 7656 // The cost of insertelement and extractelement instructions needed for 7657 // scalarization. 7658 Cost += getScalarizationOverhead(I, VF); 7659 7660 // Scale the cost by the probability of executing the predicated blocks. 7661 // This assumes the predicated block for each vector lane is equally 7662 // likely. 7663 return Cost / getReciprocalPredBlockProb(); 7664 } 7665 LLVM_FALLTHROUGH; 7666 case Instruction::Add: 7667 case Instruction::FAdd: 7668 case Instruction::Sub: 7669 case Instruction::FSub: 7670 case Instruction::Mul: 7671 case Instruction::FMul: 7672 case Instruction::FDiv: 7673 case Instruction::FRem: 7674 case Instruction::Shl: 7675 case Instruction::LShr: 7676 case Instruction::AShr: 7677 case Instruction::And: 7678 case Instruction::Or: 7679 case Instruction::Xor: { 7680 // Since we will replace the stride by 1 the multiplication should go away. 7681 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) 7682 return 0; 7683 7684 // Detect reduction patterns 7685 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7686 return *RedCost; 7687 7688 // Certain instructions can be cheaper to vectorize if they have a constant 7689 // second vector operand. One example of this are shifts on x86. 
7690 Value *Op2 = I->getOperand(1); 7691 TargetTransformInfo::OperandValueProperties Op2VP; 7692 TargetTransformInfo::OperandValueKind Op2VK = 7693 TTI.getOperandInfo(Op2, Op2VP); 7694 if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) 7695 Op2VK = TargetTransformInfo::OK_UniformValue; 7696 7697 SmallVector<const Value *, 4> Operands(I->operand_values()); 7698 return TTI.getArithmeticInstrCost( 7699 I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, 7700 Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); 7701 } 7702 case Instruction::FNeg: { 7703 return TTI.getArithmeticInstrCost( 7704 I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, 7705 TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None, 7706 TargetTransformInfo::OP_None, I->getOperand(0), I); 7707 } 7708 case Instruction::Select: { 7709 SelectInst *SI = cast<SelectInst>(I); 7710 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 7711 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 7712 7713 const Value *Op0, *Op1; 7714 using namespace llvm::PatternMatch; 7715 if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) || 7716 match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) { 7717 // select x, y, false --> x & y 7718 // select x, true, y --> x | y 7719 TTI::OperandValueProperties Op1VP = TTI::OP_None; 7720 TTI::OperandValueProperties Op2VP = TTI::OP_None; 7721 TTI::OperandValueKind Op1VK = TTI::getOperandInfo(Op0, Op1VP); 7722 TTI::OperandValueKind Op2VK = TTI::getOperandInfo(Op1, Op2VP); 7723 assert(Op0->getType()->getScalarSizeInBits() == 1 && 7724 Op1->getType()->getScalarSizeInBits() == 1); 7725 7726 SmallVector<const Value *, 2> Operands{Op0, Op1}; 7727 return TTI.getArithmeticInstrCost( 7728 match(I, m_LogicalOr()) ? 
Instruction::Or : Instruction::And, VectorTy, 7729 CostKind, Op1VK, Op2VK, Op1VP, Op2VP, Operands, I); 7730 } 7731 7732 Type *CondTy = SI->getCondition()->getType(); 7733 if (!ScalarCond) 7734 CondTy = VectorType::get(CondTy, VF); 7735 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, 7736 CmpInst::BAD_ICMP_PREDICATE, CostKind, I); 7737 } 7738 case Instruction::ICmp: 7739 case Instruction::FCmp: { 7740 Type *ValTy = I->getOperand(0)->getType(); 7741 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 7742 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 7743 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); 7744 VectorTy = ToVectorTy(ValTy, VF); 7745 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, 7746 CmpInst::BAD_ICMP_PREDICATE, CostKind, I); 7747 } 7748 case Instruction::Store: 7749 case Instruction::Load: { 7750 ElementCount Width = VF; 7751 if (Width.isVector()) { 7752 InstWidening Decision = getWideningDecision(I, Width); 7753 assert(Decision != CM_Unknown && 7754 "CM decision should be taken at this point"); 7755 if (Decision == CM_Scalarize) 7756 Width = ElementCount::getFixed(1); 7757 } 7758 VectorTy = ToVectorTy(getLoadStoreType(I), Width); 7759 return getMemoryInstructionCost(I, VF); 7760 } 7761 case Instruction::BitCast: 7762 if (I->getType()->isPointerTy()) 7763 return 0; 7764 LLVM_FALLTHROUGH; 7765 case Instruction::ZExt: 7766 case Instruction::SExt: 7767 case Instruction::FPToUI: 7768 case Instruction::FPToSI: 7769 case Instruction::FPExt: 7770 case Instruction::PtrToInt: 7771 case Instruction::IntToPtr: 7772 case Instruction::SIToFP: 7773 case Instruction::UIToFP: 7774 case Instruction::Trunc: 7775 case Instruction::FPTrunc: { 7776 // Computes the CastContextHint from a Load/Store instruction. 
7777 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint { 7778 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 7779 "Expected a load or a store!"); 7780 7781 if (VF.isScalar() || !TheLoop->contains(I)) 7782 return TTI::CastContextHint::Normal; 7783 7784 switch (getWideningDecision(I, VF)) { 7785 case LoopVectorizationCostModel::CM_GatherScatter: 7786 return TTI::CastContextHint::GatherScatter; 7787 case LoopVectorizationCostModel::CM_Interleave: 7788 return TTI::CastContextHint::Interleave; 7789 case LoopVectorizationCostModel::CM_Scalarize: 7790 case LoopVectorizationCostModel::CM_Widen: 7791 return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked 7792 : TTI::CastContextHint::Normal; 7793 case LoopVectorizationCostModel::CM_Widen_Reverse: 7794 return TTI::CastContextHint::Reversed; 7795 case LoopVectorizationCostModel::CM_Unknown: 7796 llvm_unreachable("Instr did not go through cost modelling?"); 7797 } 7798 7799 llvm_unreachable("Unhandled case!"); 7800 }; 7801 7802 unsigned Opcode = I->getOpcode(); 7803 TTI::CastContextHint CCH = TTI::CastContextHint::None; 7804 // For Trunc, the context is the only user, which must be a StoreInst. 7805 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) { 7806 if (I->hasOneUse()) 7807 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin())) 7808 CCH = ComputeCCH(Store); 7809 } 7810 // For Z/Sext, the context is the operand, which must be a LoadInst. 7811 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || 7812 Opcode == Instruction::FPExt) { 7813 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0))) 7814 CCH = ComputeCCH(Load); 7815 } 7816 7817 // We optimize the truncation of induction variables having constant 7818 // integer steps. The cost of these truncations is the same as the scalar 7819 // operation. 
7820 if (isOptimizableIVTruncate(I, VF)) { 7821 auto *Trunc = cast<TruncInst>(I); 7822 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 7823 Trunc->getSrcTy(), CCH, CostKind, Trunc); 7824 } 7825 7826 // Detect reduction patterns 7827 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7828 return *RedCost; 7829 7830 Type *SrcScalarTy = I->getOperand(0)->getType(); 7831 Type *SrcVecTy = 7832 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; 7833 if (canTruncateToMinimalBitwidth(I, VF)) { 7834 // This cast is going to be shrunk. This may remove the cast or it might 7835 // turn it into slightly different cast. For example, if MinBW == 16, 7836 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". 7837 // 7838 // Calculate the modified src and dest types. 7839 Type *MinVecTy = VectorTy; 7840 if (Opcode == Instruction::Trunc) { 7841 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); 7842 VectorTy = 7843 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7844 } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) { 7845 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); 7846 VectorTy = 7847 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7848 } 7849 } 7850 7851 return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); 7852 } 7853 case Instruction::Call: { 7854 if (RecurrenceDescriptor::isFMulAddIntrinsic(I)) 7855 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7856 return *RedCost; 7857 bool NeedToScalarize; 7858 CallInst *CI = cast<CallInst>(I); 7859 InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize); 7860 if (getVectorIntrinsicIDForCall(CI, TLI)) { 7861 InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF); 7862 return std::min(CallCost, IntrinsicCost); 7863 } 7864 return CallCost; 7865 } 7866 case Instruction::ExtractValue: 7867 return TTI.getInstructionCost(I, 
TTI::TCK_RecipThroughput); 7868 case Instruction::Alloca: 7869 // We cannot easily widen alloca to a scalable alloca, as 7870 // the result would need to be a vector of pointers. 7871 if (VF.isScalable()) 7872 return InstructionCost::getInvalid(); 7873 LLVM_FALLTHROUGH; 7874 default: 7875 // This opcode is unknown. Assume that it is the same as 'mul'. 7876 return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 7877 } // end of switch. 7878 } 7879 7880 char LoopVectorize::ID = 0; 7881 7882 static const char lv_name[] = "Loop Vectorization"; 7883 7884 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 7885 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 7886 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) 7887 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 7888 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) 7889 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 7890 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 7891 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 7892 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 7893 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 7894 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) 7895 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) 7896 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 7897 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 7898 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy) 7899 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 7900 7901 namespace llvm { 7902 7903 Pass *createLoopVectorizePass() { return new LoopVectorize(); } 7904 7905 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced, 7906 bool VectorizeOnlyWhenForced) { 7907 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced); 7908 } 7909 7910 } // end namespace llvm 7911 7912 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { 7913 // Check if the pointer operand of a 
load or store instruction is 7914 // consecutive. 7915 if (auto *Ptr = getLoadStorePointerOperand(Inst)) 7916 return Legal->isConsecutivePtr(getLoadStoreType(Inst), Ptr); 7917 return false; 7918 } 7919 7920 void LoopVectorizationCostModel::collectValuesToIgnore() { 7921 // Ignore ephemeral values. 7922 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); 7923 7924 // Ignore type-promoting instructions we identified during reduction 7925 // detection. 7926 for (auto &Reduction : Legal->getReductionVars()) { 7927 RecurrenceDescriptor &RedDes = Reduction.second; 7928 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); 7929 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7930 } 7931 // Ignore type-casting instructions we identified during induction 7932 // detection. 7933 for (auto &Induction : Legal->getInductionVars()) { 7934 InductionDescriptor &IndDes = Induction.second; 7935 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 7936 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7937 } 7938 } 7939 7940 void LoopVectorizationCostModel::collectInLoopReductions() { 7941 for (auto &Reduction : Legal->getReductionVars()) { 7942 PHINode *Phi = Reduction.first; 7943 RecurrenceDescriptor &RdxDesc = Reduction.second; 7944 7945 // We don't collect reductions that are type promoted (yet). 7946 if (RdxDesc.getRecurrenceType() != Phi->getType()) 7947 continue; 7948 7949 // If the target would prefer this reduction to happen "in-loop", then we 7950 // want to record it as such. 7951 unsigned Opcode = RdxDesc.getOpcode(); 7952 if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) && 7953 !TTI.preferInLoopReduction(Opcode, Phi->getType(), 7954 TargetTransformInfo::ReductionFlags())) 7955 continue; 7956 7957 // Check that we can correctly put the reductions into the loop, by 7958 // finding the chain of operations that leads from the phi to the loop 7959 // exit value. 
7960 SmallVector<Instruction *, 4> ReductionOperations = 7961 RdxDesc.getReductionOpChain(Phi, TheLoop); 7962 bool InLoop = !ReductionOperations.empty(); 7963 if (InLoop) { 7964 InLoopReductionChains[Phi] = ReductionOperations; 7965 // Add the elements to InLoopReductionImmediateChains for cost modelling. 7966 Instruction *LastChain = Phi; 7967 for (auto *I : ReductionOperations) { 7968 InLoopReductionImmediateChains[I] = LastChain; 7969 LastChain = I; 7970 } 7971 } 7972 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop") 7973 << " reduction for phi: " << *Phi << "\n"); 7974 } 7975 } 7976 7977 // TODO: we could return a pair of values that specify the max VF and 7978 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of 7979 // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment 7980 // doesn't have a cost model that can choose which plan to execute if 7981 // more than one is generated. 7982 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits, 7983 LoopVectorizationCostModel &CM) { 7984 unsigned WidestType; 7985 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes(); 7986 return WidestVectorRegBits / WidestType; 7987 } 7988 7989 VectorizationFactor 7990 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { 7991 assert(!UserVF.isScalable() && "scalable vectors not yet supported"); 7992 ElementCount VF = UserVF; 7993 // Outer loop handling: They may require CFG and instruction level 7994 // transformations before even evaluating whether vectorization is profitable. 7995 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 7996 // the vectorization pipeline. 7997 if (!OrigLoop->isInnermost()) { 7998 // If the user doesn't provide a vectorization factor, determine a 7999 // reasonable one. 
8000 if (UserVF.isZero()) { 8001 VF = ElementCount::getFixed(determineVPlanVF( 8002 TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector) 8003 .getFixedSize(), 8004 CM)); 8005 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n"); 8006 8007 // Make sure we have a VF > 1 for stress testing. 8008 if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) { 8009 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: " 8010 << "overriding computed VF.\n"); 8011 VF = ElementCount::getFixed(4); 8012 } 8013 } 8014 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 8015 assert(isPowerOf2_32(VF.getKnownMinValue()) && 8016 "VF needs to be a power of two"); 8017 LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "") 8018 << "VF " << VF << " to build VPlans.\n"); 8019 buildVPlans(VF, VF); 8020 8021 // For VPlan build stress testing, we bail out after VPlan construction. 8022 if (VPlanBuildStressTest) 8023 return VectorizationFactor::Disabled(); 8024 8025 return {VF, 0 /*Cost*/}; 8026 } 8027 8028 LLVM_DEBUG( 8029 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the " 8030 "VPlan-native path.\n"); 8031 return VectorizationFactor::Disabled(); 8032 } 8033 8034 Optional<VectorizationFactor> 8035 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { 8036 assert(OrigLoop->isInnermost() && "Inner loop expected."); 8037 FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC); 8038 if (!MaxFactors) // Cases that should not to be vectorized nor interleaved. 8039 return None; 8040 8041 // Invalidate interleave groups if all blocks of loop will be predicated. 
8042 if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) && 8043 !useMaskedInterleavedAccesses(*TTI)) { 8044 LLVM_DEBUG( 8045 dbgs() 8046 << "LV: Invalidate all interleaved groups due to fold-tail by masking " 8047 "which requires masked-interleaved support.\n"); 8048 if (CM.InterleaveInfo.invalidateGroups()) 8049 // Invalidating interleave groups also requires invalidating all decisions 8050 // based on them, which includes widening decisions and uniform and scalar 8051 // values. 8052 CM.invalidateCostModelingDecisions(); 8053 } 8054 8055 ElementCount MaxUserVF = 8056 UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF; 8057 bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF); 8058 if (!UserVF.isZero() && UserVFIsLegal) { 8059 assert(isPowerOf2_32(UserVF.getKnownMinValue()) && 8060 "VF needs to be a power of two"); 8061 // Collect the instructions (and their associated costs) that will be more 8062 // profitable to scalarize. 8063 if (CM.selectUserVectorizationFactor(UserVF)) { 8064 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); 8065 CM.collectInLoopReductions(); 8066 buildVPlansWithVPRecipes(UserVF, UserVF); 8067 LLVM_DEBUG(printPlans(dbgs())); 8068 return {{UserVF, 0}}; 8069 } else 8070 reportVectorizationInfo("UserVF ignored because of invalid costs.", 8071 "InvalidCost", ORE, OrigLoop); 8072 } 8073 8074 // Populate the set of Vectorization Factor Candidates. 8075 ElementCountSet VFCandidates; 8076 for (auto VF = ElementCount::getFixed(1); 8077 ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2) 8078 VFCandidates.insert(VF); 8079 for (auto VF = ElementCount::getScalable(1); 8080 ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2) 8081 VFCandidates.insert(VF); 8082 8083 for (const auto &VF : VFCandidates) { 8084 // Collect Uniform and Scalar instructions after vectorization with VF. 
8085 CM.collectUniformsAndScalars(VF); 8086 8087 // Collect the instructions (and their associated costs) that will be more 8088 // profitable to scalarize. 8089 if (VF.isVector()) 8090 CM.collectInstsToScalarize(VF); 8091 } 8092 8093 CM.collectInLoopReductions(); 8094 buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF); 8095 buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF); 8096 8097 LLVM_DEBUG(printPlans(dbgs())); 8098 if (!MaxFactors.hasVector()) 8099 return VectorizationFactor::Disabled(); 8100 8101 // Select the optimal vectorization factor. 8102 auto SelectedVF = CM.selectVectorizationFactor(VFCandidates); 8103 8104 // Check if it is profitable to vectorize with runtime checks. 8105 unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks(); 8106 if (SelectedVF.Width.getKnownMinValue() > 1 && NumRuntimePointerChecks) { 8107 bool PragmaThresholdReached = 8108 NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold; 8109 bool ThresholdReached = 8110 NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold; 8111 if ((ThresholdReached && !Hints.allowReordering()) || 8112 PragmaThresholdReached) { 8113 ORE->emit([&]() { 8114 return OptimizationRemarkAnalysisAliasing( 8115 DEBUG_TYPE, "CantReorderMemOps", OrigLoop->getStartLoc(), 8116 OrigLoop->getHeader()) 8117 << "loop not vectorized: cannot prove it is safe to reorder " 8118 "memory operations"; 8119 }); 8120 LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n"); 8121 Hints.emitRemarkWithHints(); 8122 return VectorizationFactor::Disabled(); 8123 } 8124 } 8125 return SelectedVF; 8126 } 8127 8128 VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const { 8129 assert(count_if(VPlans, 8130 [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) == 8131 1 && 8132 "Best VF has not a single VPlan."); 8133 8134 for (const VPlanPtr &Plan : VPlans) { 8135 if (Plan->hasVF(VF)) 8136 return *Plan.get(); 8137 } 8138 
llvm_unreachable("No plan found!"); 8139 } 8140 8141 void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF, 8142 VPlan &BestVPlan, 8143 InnerLoopVectorizer &ILV, 8144 DominatorTree *DT) { 8145 LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF << ", UF=" << BestUF 8146 << '\n'); 8147 8148 // Perform the actual loop transformation. 8149 8150 // 1. Create a new empty loop. Unlink the old loop and connect the new one. 8151 VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan}; 8152 State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(); 8153 State.TripCount = ILV.getOrCreateTripCount(nullptr); 8154 State.CanonicalIV = ILV.Induction; 8155 ILV.collectPoisonGeneratingRecipes(State); 8156 8157 ILV.printDebugTracesAtStart(); 8158 8159 //===------------------------------------------------===// 8160 // 8161 // Notice: any optimization or new instruction that go 8162 // into the code below should also be implemented in 8163 // the cost-model. 8164 // 8165 //===------------------------------------------------===// 8166 8167 // 2. Copy and widen instructions from the old loop into the new loop. 8168 BestVPlan.execute(&State); 8169 8170 // 3. Fix the vectorized code: take care of header phi's, live-outs, 8171 // predication, updating analyses. 
8172 ILV.fixVectorizedLoop(State); 8173 8174 ILV.printDebugTracesAtEnd(); 8175 } 8176 8177 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 8178 void LoopVectorizationPlanner::printPlans(raw_ostream &O) { 8179 for (const auto &Plan : VPlans) 8180 if (PrintVPlansInDotFormat) 8181 Plan->printDOT(O); 8182 else 8183 Plan->print(O); 8184 } 8185 #endif 8186 8187 void LoopVectorizationPlanner::collectTriviallyDeadInstructions( 8188 SmallPtrSetImpl<Instruction *> &DeadInstructions) { 8189 8190 // We create new control-flow for the vectorized loop, so the original exit 8191 // conditions will be dead after vectorization if it's only used by the 8192 // terminator 8193 SmallVector<BasicBlock*> ExitingBlocks; 8194 OrigLoop->getExitingBlocks(ExitingBlocks); 8195 for (auto *BB : ExitingBlocks) { 8196 auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0)); 8197 if (!Cmp || !Cmp->hasOneUse()) 8198 continue; 8199 8200 // TODO: we should introduce a getUniqueExitingBlocks on Loop 8201 if (!DeadInstructions.insert(Cmp).second) 8202 continue; 8203 8204 // The operands of the icmp is often a dead trunc, used by IndUpdate. 8205 // TODO: can recurse through operands in general 8206 for (Value *Op : Cmp->operands()) { 8207 if (isa<TruncInst>(Op) && Op->hasOneUse()) 8208 DeadInstructions.insert(cast<Instruction>(Op)); 8209 } 8210 } 8211 8212 // We create new "steps" for induction variable updates to which the original 8213 // induction variables map. An original update instruction will be dead if 8214 // all its users except the induction variable are dead. 8215 auto *Latch = OrigLoop->getLoopLatch(); 8216 for (auto &Induction : Legal->getInductionVars()) { 8217 PHINode *Ind = Induction.first; 8218 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 8219 8220 // If the tail is to be folded by masking, the primary induction variable, 8221 // if exists, isn't dead: it will be used for masking. Don't kill it. 
8222 if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction()) 8223 continue; 8224 8225 if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 8226 return U == Ind || DeadInstructions.count(cast<Instruction>(U)); 8227 })) 8228 DeadInstructions.insert(IndUpdate); 8229 8230 // We record as "Dead" also the type-casting instructions we had identified 8231 // during induction analysis. We don't need any handling for them in the 8232 // vectorized loop because we have proven that, under a proper runtime 8233 // test guarding the vectorized loop, the value of the phi, and the casted 8234 // value of the phi, are the same. The last instruction in this casting chain 8235 // will get its scalar/vector/widened def from the scalar/vector/widened def 8236 // of the respective phi node. Any other casts in the induction def-use chain 8237 // have no other uses outside the phi update chain, and will be ignored. 8238 InductionDescriptor &IndDes = Induction.second; 8239 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 8240 DeadInstructions.insert(Casts.begin(), Casts.end()); 8241 } 8242 } 8243 8244 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; } 8245 8246 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } 8247 8248 Value *InnerLoopUnroller::getStepVector(Value *Val, Value *StartIdx, 8249 Value *Step, 8250 Instruction::BinaryOps BinOp) { 8251 // When unrolling and the VF is 1, we only need to add a simple scalar. 8252 Type *Ty = Val->getType(); 8253 assert(!Ty->isVectorTy() && "Val must be a scalar"); 8254 8255 if (Ty->isFloatingPointTy()) { 8256 // Floating-point operations inherit FMF via the builder's flags. 
8257 Value *MulOp = Builder.CreateFMul(StartIdx, Step); 8258 return Builder.CreateBinOp(BinOp, Val, MulOp); 8259 } 8260 return Builder.CreateAdd(Val, Builder.CreateMul(StartIdx, Step), "induction"); 8261 } 8262 8263 static void AddRuntimeUnrollDisableMetaData(Loop *L) { 8264 SmallVector<Metadata *, 4> MDs; 8265 // Reserve first location for self reference to the LoopID metadata node. 8266 MDs.push_back(nullptr); 8267 bool IsUnrollMetadata = false; 8268 MDNode *LoopID = L->getLoopID(); 8269 if (LoopID) { 8270 // First find existing loop unrolling disable metadata. 8271 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { 8272 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); 8273 if (MD) { 8274 const auto *S = dyn_cast<MDString>(MD->getOperand(0)); 8275 IsUnrollMetadata = 8276 S && S->getString().startswith("llvm.loop.unroll.disable"); 8277 } 8278 MDs.push_back(LoopID->getOperand(i)); 8279 } 8280 } 8281 8282 if (!IsUnrollMetadata) { 8283 // Add runtime unroll disable metadata. 8284 LLVMContext &Context = L->getHeader()->getContext(); 8285 SmallVector<Metadata *, 1> DisableOperands; 8286 DisableOperands.push_back( 8287 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 8288 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 8289 MDs.push_back(DisableNode); 8290 MDNode *NewLoopID = MDNode::get(Context, MDs); 8291 // Set operand 0 to refer to the loop id itself. 8292 NewLoopID->replaceOperandWith(0, NewLoopID); 8293 L->setLoopID(NewLoopID); 8294 } 8295 } 8296 8297 //===--------------------------------------------------------------------===// 8298 // EpilogueVectorizerMainLoop 8299 //===--------------------------------------------------------------------===// 8300 8301 /// This function is partially responsible for generating the control flow 8302 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 
8303 BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { 8304 MDNode *OrigLoopID = OrigLoop->getLoopID(); 8305 Loop *Lp = createVectorLoopSkeleton(""); 8306 8307 // Generate the code to check the minimum iteration count of the vector 8308 // epilogue (see below). 8309 EPI.EpilogueIterationCountCheck = 8310 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, true); 8311 EPI.EpilogueIterationCountCheck->setName("iter.check"); 8312 8313 // Generate the code to check any assumptions that we've made for SCEV 8314 // expressions. 8315 EPI.SCEVSafetyCheck = emitSCEVChecks(Lp, LoopScalarPreHeader); 8316 8317 // Generate the code that checks at runtime if arrays overlap. We put the 8318 // checks into a separate block to make the more common case of few elements 8319 // faster. 8320 EPI.MemSafetyCheck = emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 8321 8322 // Generate the iteration count check for the main loop, *after* the check 8323 // for the epilogue loop, so that the path-length is shorter for the case 8324 // that goes directly through the vector epilogue. The longer-path length for 8325 // the main loop is compensated for, by the gain from vectorizing the larger 8326 // trip count. Note: the branch will get updated later on when we vectorize 8327 // the epilogue. 8328 EPI.MainLoopIterationCountCheck = 8329 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false); 8330 8331 // Generate the induction variable. 
8332 OldInduction = Legal->getPrimaryInduction(); 8333 Type *IdxTy = Legal->getWidestInductionType(); 8334 Value *StartIdx = ConstantInt::get(IdxTy, 0); 8335 8336 IRBuilder<> B(&*Lp->getLoopPreheader()->getFirstInsertionPt()); 8337 Value *Step = getRuntimeVF(B, IdxTy, VF * UF); 8338 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 8339 EPI.VectorTripCount = CountRoundDown; 8340 Induction = 8341 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 8342 getDebugLocFromInstOrOperands(OldInduction)); 8343 8344 // Skip induction resume value creation here because they will be created in 8345 // the second pass. If we created them here, they wouldn't be used anyway, 8346 // because the vplan in the second pass still contains the inductions from the 8347 // original loop. 8348 8349 return completeLoopSkeleton(Lp, OrigLoopID); 8350 } 8351 8352 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { 8353 LLVM_DEBUG({ 8354 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n" 8355 << "Main Loop VF:" << EPI.MainLoopVF 8356 << ", Main Loop UF:" << EPI.MainLoopUF 8357 << ", Epilogue Loop VF:" << EPI.EpilogueVF 8358 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 8359 }); 8360 } 8361 8362 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() { 8363 DEBUG_WITH_TYPE(VerboseDebug, { 8364 dbgs() << "intermediate fn:\n" 8365 << *OrigLoop->getHeader()->getParent() << "\n"; 8366 }); 8367 } 8368 8369 BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck( 8370 Loop *L, BasicBlock *Bypass, bool ForEpilogue) { 8371 assert(L && "Expected valid Loop."); 8372 assert(Bypass && "Expected valid bypass basic block."); 8373 ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF; 8374 unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF; 8375 Value *Count = getOrCreateTripCount(L); 8376 // Reuse existing vector loop preheader for TC checks. 8377 // Note that new preheader block is generated for vector loop. 
8378 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 8379 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 8380 8381 // Generate code to check if the loop's trip count is less than VF * UF of the 8382 // main vector loop. 8383 auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF : VF) ? 8384 ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 8385 8386 Value *CheckMinIters = Builder.CreateICmp( 8387 P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor), 8388 "min.iters.check"); 8389 8390 if (!ForEpilogue) 8391 TCCheckBlock->setName("vector.main.loop.iter.check"); 8392 8393 // Create new preheader for vector loop. 8394 LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), 8395 DT, LI, nullptr, "vector.ph"); 8396 8397 if (ForEpilogue) { 8398 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 8399 DT->getNode(Bypass)->getIDom()) && 8400 "TC check is expected to dominate Bypass"); 8401 8402 // Update dominator for Bypass & LoopExit. 8403 DT->changeImmediateDominator(Bypass, TCCheckBlock); 8404 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF)) 8405 // For loops with multiple exits, there's no edge from the middle block 8406 // to exit blocks (as the epilogue must run) and thus no need to update 8407 // the immediate dominator of the exit blocks. 8408 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 8409 8410 LoopBypassBlocks.push_back(TCCheckBlock); 8411 8412 // Save the trip count so we don't have to regenerate it in the 8413 // vec.epilog.iter.check. This is safe to do because the trip count 8414 // generated here dominates the vector epilog iter check. 
8415 EPI.TripCount = Count; 8416 } 8417 8418 ReplaceInstWithInst( 8419 TCCheckBlock->getTerminator(), 8420 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 8421 8422 return TCCheckBlock; 8423 } 8424 8425 //===--------------------------------------------------------------------===// 8426 // EpilogueVectorizerEpilogueLoop 8427 //===--------------------------------------------------------------------===// 8428 8429 /// This function is partially responsible for generating the control flow 8430 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 8431 BasicBlock * 8432 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { 8433 MDNode *OrigLoopID = OrigLoop->getLoopID(); 8434 Loop *Lp = createVectorLoopSkeleton("vec.epilog."); 8435 8436 // Now, compare the remaining count and if there aren't enough iterations to 8437 // execute the vectorized epilogue skip to the scalar part. 8438 BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader; 8439 VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check"); 8440 LoopVectorPreHeader = 8441 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 8442 LI, nullptr, "vec.epilog.ph"); 8443 emitMinimumVectorEpilogueIterCountCheck(Lp, LoopScalarPreHeader, 8444 VecEpilogueIterationCountCheck); 8445 8446 // Adjust the control flow taking the state info from the main loop 8447 // vectorization into account. 
8448 assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck && 8449 "expected this to be saved from the previous pass."); 8450 EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith( 8451 VecEpilogueIterationCountCheck, LoopVectorPreHeader); 8452 8453 DT->changeImmediateDominator(LoopVectorPreHeader, 8454 EPI.MainLoopIterationCountCheck); 8455 8456 EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith( 8457 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8458 8459 if (EPI.SCEVSafetyCheck) 8460 EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith( 8461 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8462 if (EPI.MemSafetyCheck) 8463 EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith( 8464 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8465 8466 DT->changeImmediateDominator( 8467 VecEpilogueIterationCountCheck, 8468 VecEpilogueIterationCountCheck->getSinglePredecessor()); 8469 8470 DT->changeImmediateDominator(LoopScalarPreHeader, 8471 EPI.EpilogueIterationCountCheck); 8472 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF)) 8473 // If there is an epilogue which must run, there's no edge from the 8474 // middle block to exit blocks and thus no need to update the immediate 8475 // dominator of the exit blocks. 8476 DT->changeImmediateDominator(LoopExitBlock, 8477 EPI.EpilogueIterationCountCheck); 8478 8479 // Keep track of bypass blocks, as they feed start values to the induction 8480 // phis in the scalar loop preheader. 
8481 if (EPI.SCEVSafetyCheck) 8482 LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck); 8483 if (EPI.MemSafetyCheck) 8484 LoopBypassBlocks.push_back(EPI.MemSafetyCheck); 8485 LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck); 8486 8487 // Generate a resume induction for the vector epilogue and put it in the 8488 // vector epilogue preheader 8489 Type *IdxTy = Legal->getWidestInductionType(); 8490 PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val", 8491 LoopVectorPreHeader->getFirstNonPHI()); 8492 EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck); 8493 EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0), 8494 EPI.MainLoopIterationCountCheck); 8495 8496 // Generate the induction variable. 8497 OldInduction = Legal->getPrimaryInduction(); 8498 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 8499 Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF); 8500 Value *StartIdx = EPResumeVal; 8501 Induction = 8502 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 8503 getDebugLocFromInstOrOperands(OldInduction)); 8504 8505 // Generate induction resume values. These variables save the new starting 8506 // indexes for the scalar loop. They are used to test if there are any tail 8507 // iterations left once the vector loop has completed. 8508 // Note that when the vectorized epilogue is skipped due to iteration count 8509 // check, then the resume value for the induction variable comes from 8510 // the trip count of the main vector loop, hence passing the AdditionalBypass 8511 // argument. 
8512 createInductionResumeValues(Lp, CountRoundDown, 8513 {VecEpilogueIterationCountCheck, 8514 EPI.VectorTripCount} /* AdditionalBypass */); 8515 8516 AddRuntimeUnrollDisableMetaData(Lp); 8517 return completeLoopSkeleton(Lp, OrigLoopID); 8518 } 8519 8520 BasicBlock * 8521 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck( 8522 Loop *L, BasicBlock *Bypass, BasicBlock *Insert) { 8523 8524 assert(EPI.TripCount && 8525 "Expected trip count to have been safed in the first pass."); 8526 assert( 8527 (!isa<Instruction>(EPI.TripCount) || 8528 DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) && 8529 "saved trip count does not dominate insertion point."); 8530 Value *TC = EPI.TripCount; 8531 IRBuilder<> Builder(Insert->getTerminator()); 8532 Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining"); 8533 8534 // Generate code to check if the loop's trip count is less than VF * UF of the 8535 // vector epilogue loop. 8536 auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF) ? 
8537 ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 8538 8539 Value *CheckMinIters = 8540 Builder.CreateICmp(P, Count, 8541 createStepForVF(Builder, Count->getType(), 8542 EPI.EpilogueVF, EPI.EpilogueUF), 8543 "min.epilog.iters.check"); 8544 8545 ReplaceInstWithInst( 8546 Insert->getTerminator(), 8547 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 8548 8549 LoopBypassBlocks.push_back(Insert); 8550 return Insert; 8551 } 8552 8553 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() { 8554 LLVM_DEBUG({ 8555 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n" 8556 << "Epilogue Loop VF:" << EPI.EpilogueVF 8557 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 8558 }); 8559 } 8560 8561 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() { 8562 DEBUG_WITH_TYPE(VerboseDebug, { 8563 dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n"; 8564 }); 8565 } 8566 8567 bool LoopVectorizationPlanner::getDecisionAndClampRange( 8568 const std::function<bool(ElementCount)> &Predicate, VFRange &Range) { 8569 assert(!Range.isEmpty() && "Trying to test an empty VF range."); 8570 bool PredicateAtRangeStart = Predicate(Range.Start); 8571 8572 for (ElementCount TmpVF = Range.Start * 2; 8573 ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2) 8574 if (Predicate(TmpVF) != PredicateAtRangeStart) { 8575 Range.End = TmpVF; 8576 break; 8577 } 8578 8579 return PredicateAtRangeStart; 8580 } 8581 8582 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, 8583 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range 8584 /// of VF's starting at a given VF and extending it as much as possible. Each 8585 /// vectorization decision can potentially shorten this sub-range during 8586 /// buildVPlan(). 
8587 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF, 8588 ElementCount MaxVF) { 8589 auto MaxVFPlusOne = MaxVF.getWithIncrement(1); 8590 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { 8591 VFRange SubRange = {VF, MaxVFPlusOne}; 8592 VPlans.push_back(buildVPlan(SubRange)); 8593 VF = SubRange.End; 8594 } 8595 } 8596 8597 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, 8598 VPlanPtr &Plan) { 8599 assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); 8600 8601 // Look for cached value. 8602 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 8603 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); 8604 if (ECEntryIt != EdgeMaskCache.end()) 8605 return ECEntryIt->second; 8606 8607 VPValue *SrcMask = createBlockInMask(Src, Plan); 8608 8609 // The terminator has to be a branch inst! 8610 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 8611 assert(BI && "Unexpected terminator found"); 8612 8613 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) 8614 return EdgeMaskCache[Edge] = SrcMask; 8615 8616 // If source is an exiting block, we know the exit edge is dynamically dead 8617 // in the vector loop, and thus we don't need to restrict the mask. Avoid 8618 // adding uses of an otherwise potentially dead instruction. 8619 if (OrigLoop->isLoopExiting(Src)) 8620 return EdgeMaskCache[Edge] = SrcMask; 8621 8622 VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition()); 8623 assert(EdgeMask && "No Edge Mask found for condition"); 8624 8625 if (BI->getSuccessor(0) != Dst) 8626 EdgeMask = Builder.createNot(EdgeMask); 8627 8628 if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND. 8629 // The condition is 'SrcMask && EdgeMask', which is equivalent to 8630 // 'select i1 SrcMask, i1 EdgeMask, i1 false'. 8631 // The select version does not introduce new UB if SrcMask is false and 8632 // EdgeMask is poison. 
Using 'and' here introduces undefined behavior. 8633 VPValue *False = Plan->getOrAddVPValue( 8634 ConstantInt::getFalse(BI->getCondition()->getType())); 8635 EdgeMask = Builder.createSelect(SrcMask, EdgeMask, False); 8636 } 8637 8638 return EdgeMaskCache[Edge] = EdgeMask; 8639 } 8640 8641 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { 8642 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 8643 8644 // Look for cached value. 8645 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); 8646 if (BCEntryIt != BlockMaskCache.end()) 8647 return BCEntryIt->second; 8648 8649 // All-one mask is modelled as no-mask following the convention for masked 8650 // load/store/gather/scatter. Initialize BlockMask to no-mask. 8651 VPValue *BlockMask = nullptr; 8652 8653 if (OrigLoop->getHeader() == BB) { 8654 if (!CM.blockNeedsPredicationForAnyReason(BB)) 8655 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. 8656 8657 // Create the block in mask as the first non-phi instruction in the block. 8658 VPBuilder::InsertPointGuard Guard(Builder); 8659 auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi(); 8660 Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint); 8661 8662 // Introduce the early-exit compare IV <= BTC to form header block mask. 8663 // This is used instead of IV < TC because TC may wrap, unlike BTC. 8664 // Start by constructing the desired canonical IV. 
8665 VPValue *IV = nullptr; 8666 if (Legal->getPrimaryInduction()) 8667 IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction()); 8668 else { 8669 auto *IVRecipe = new VPWidenCanonicalIVRecipe(); 8670 Builder.getInsertBlock()->insert(IVRecipe, NewInsertionPoint); 8671 IV = IVRecipe; 8672 } 8673 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); 8674 bool TailFolded = !CM.isScalarEpilogueAllowed(); 8675 8676 if (TailFolded && CM.TTI.emitGetActiveLaneMask()) { 8677 // While ActiveLaneMask is a binary op that consumes the loop tripcount 8678 // as a second argument, we only pass the IV here and extract the 8679 // tripcount from the transform state where codegen of the VP instructions 8680 // happen. 8681 BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV}); 8682 } else { 8683 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); 8684 } 8685 return BlockMaskCache[BB] = BlockMask; 8686 } 8687 8688 // This is the block mask. We OR all incoming edges. 8689 for (auto *Predecessor : predecessors(BB)) { 8690 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); 8691 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too. 8692 return BlockMaskCache[BB] = EdgeMask; 8693 8694 if (!BlockMask) { // BlockMask has its initialized nullptr value. 
8695 BlockMask = EdgeMask; 8696 continue; 8697 } 8698 8699 BlockMask = Builder.createOr(BlockMask, EdgeMask); 8700 } 8701 8702 return BlockMaskCache[BB] = BlockMask; 8703 } 8704 8705 VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, 8706 ArrayRef<VPValue *> Operands, 8707 VFRange &Range, 8708 VPlanPtr &Plan) { 8709 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 8710 "Must be called with either a load or store"); 8711 8712 auto willWiden = [&](ElementCount VF) -> bool { 8713 if (VF.isScalar()) 8714 return false; 8715 LoopVectorizationCostModel::InstWidening Decision = 8716 CM.getWideningDecision(I, VF); 8717 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 8718 "CM decision should be taken at this point."); 8719 if (Decision == LoopVectorizationCostModel::CM_Interleave) 8720 return true; 8721 if (CM.isScalarAfterVectorization(I, VF) || 8722 CM.isProfitableToScalarize(I, VF)) 8723 return false; 8724 return Decision != LoopVectorizationCostModel::CM_Scalarize; 8725 }; 8726 8727 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8728 return nullptr; 8729 8730 VPValue *Mask = nullptr; 8731 if (Legal->isMaskRequired(I)) 8732 Mask = createBlockInMask(I->getParent(), Plan); 8733 8734 // Determine if the pointer operand of the access is either consecutive or 8735 // reverse consecutive. 
8736 LoopVectorizationCostModel::InstWidening Decision = 8737 CM.getWideningDecision(I, Range.Start); 8738 bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse; 8739 bool Consecutive = 8740 Reverse || Decision == LoopVectorizationCostModel::CM_Widen; 8741 8742 if (LoadInst *Load = dyn_cast<LoadInst>(I)) 8743 return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask, 8744 Consecutive, Reverse); 8745 8746 StoreInst *Store = cast<StoreInst>(I); 8747 return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0], 8748 Mask, Consecutive, Reverse); 8749 } 8750 8751 VPWidenIntOrFpInductionRecipe * 8752 VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi, 8753 ArrayRef<VPValue *> Operands) const { 8754 // Check if this is an integer or fp induction. If so, build the recipe that 8755 // produces its scalar and vector values. 8756 InductionDescriptor II = Legal->getInductionVars().lookup(Phi); 8757 if (II.getKind() == InductionDescriptor::IK_IntInduction || 8758 II.getKind() == InductionDescriptor::IK_FpInduction) { 8759 assert(II.getStartValue() == 8760 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); 8761 const SmallVectorImpl<Instruction *> &Casts = II.getCastInsts(); 8762 return new VPWidenIntOrFpInductionRecipe( 8763 Phi, Operands[0], Casts.empty() ? nullptr : Casts.front()); 8764 } 8765 8766 return nullptr; 8767 } 8768 8769 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate( 8770 TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range, 8771 VPlan &Plan) const { 8772 // Optimize the special case where the source is a constant integer 8773 // induction variable. Notice that we can only optimize the 'trunc' case 8774 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and 8775 // (c) other casts depend on pointer size. 8776 8777 // Determine whether \p K is a truncation based on an induction variable that 8778 // can be optimized. 
8779 auto isOptimizableIVTruncate = 8780 [&](Instruction *K) -> std::function<bool(ElementCount)> { 8781 return [=](ElementCount VF) -> bool { 8782 return CM.isOptimizableIVTruncate(K, VF); 8783 }; 8784 }; 8785 8786 if (LoopVectorizationPlanner::getDecisionAndClampRange( 8787 isOptimizableIVTruncate(I), Range)) { 8788 8789 InductionDescriptor II = 8790 Legal->getInductionVars().lookup(cast<PHINode>(I->getOperand(0))); 8791 VPValue *Start = Plan.getOrAddVPValue(II.getStartValue()); 8792 return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)), 8793 Start, nullptr, I); 8794 } 8795 return nullptr; 8796 } 8797 8798 VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi, 8799 ArrayRef<VPValue *> Operands, 8800 VPlanPtr &Plan) { 8801 // If all incoming values are equal, the incoming VPValue can be used directly 8802 // instead of creating a new VPBlendRecipe. 8803 VPValue *FirstIncoming = Operands[0]; 8804 if (all_of(Operands, [FirstIncoming](const VPValue *Inc) { 8805 return FirstIncoming == Inc; 8806 })) { 8807 return Operands[0]; 8808 } 8809 8810 // We know that all PHIs in non-header blocks are converted into selects, so 8811 // we don't have to worry about the insertion order and we can just use the 8812 // builder. At this point we generate the predication tree. There may be 8813 // duplications since this is a simple recursive scan, but future 8814 // optimizations will clean it up. 
8815 SmallVector<VPValue *, 2> OperandsWithMask; 8816 unsigned NumIncoming = Phi->getNumIncomingValues(); 8817 8818 for (unsigned In = 0; In < NumIncoming; In++) { 8819 VPValue *EdgeMask = 8820 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan); 8821 assert((EdgeMask || NumIncoming == 1) && 8822 "Multiple predecessors with one having a full mask"); 8823 OperandsWithMask.push_back(Operands[In]); 8824 if (EdgeMask) 8825 OperandsWithMask.push_back(EdgeMask); 8826 } 8827 return toVPRecipeResult(new VPBlendRecipe(Phi, OperandsWithMask)); 8828 } 8829 8830 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, 8831 ArrayRef<VPValue *> Operands, 8832 VFRange &Range) const { 8833 8834 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8835 [this, CI](ElementCount VF) { return CM.isScalarWithPredication(CI); }, 8836 Range); 8837 8838 if (IsPredicated) 8839 return nullptr; 8840 8841 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8842 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || 8843 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect || 8844 ID == Intrinsic::pseudoprobe || 8845 ID == Intrinsic::experimental_noalias_scope_decl)) 8846 return nullptr; 8847 8848 auto willWiden = [&](ElementCount VF) -> bool { 8849 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8850 // The following case may be scalarized depending on the VF. 8851 // The flag shows whether we use Intrinsic or a usual Call for vectorized 8852 // version of the instruction. 8853 // Is it beneficial to perform intrinsic call compared to lib call? 8854 bool NeedToScalarize = false; 8855 InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); 8856 InstructionCost IntrinsicCost = ID ? 
CM.getVectorIntrinsicCost(CI, VF) : 0; 8857 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 8858 return UseVectorIntrinsic || !NeedToScalarize; 8859 }; 8860 8861 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8862 return nullptr; 8863 8864 ArrayRef<VPValue *> Ops = Operands.take_front(CI->arg_size()); 8865 return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end())); 8866 } 8867 8868 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { 8869 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) && 8870 !isa<StoreInst>(I) && "Instruction should have been handled earlier"); 8871 // Instruction should be widened, unless it is scalar after vectorization, 8872 // scalarization is profitable or it is predicated. 8873 auto WillScalarize = [this, I](ElementCount VF) -> bool { 8874 return CM.isScalarAfterVectorization(I, VF) || 8875 CM.isProfitableToScalarize(I, VF) || CM.isScalarWithPredication(I); 8876 }; 8877 return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize, 8878 Range); 8879 } 8880 8881 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, 8882 ArrayRef<VPValue *> Operands) const { 8883 auto IsVectorizableOpcode = [](unsigned Opcode) { 8884 switch (Opcode) { 8885 case Instruction::Add: 8886 case Instruction::And: 8887 case Instruction::AShr: 8888 case Instruction::BitCast: 8889 case Instruction::FAdd: 8890 case Instruction::FCmp: 8891 case Instruction::FDiv: 8892 case Instruction::FMul: 8893 case Instruction::FNeg: 8894 case Instruction::FPExt: 8895 case Instruction::FPToSI: 8896 case Instruction::FPToUI: 8897 case Instruction::FPTrunc: 8898 case Instruction::FRem: 8899 case Instruction::FSub: 8900 case Instruction::ICmp: 8901 case Instruction::IntToPtr: 8902 case Instruction::LShr: 8903 case Instruction::Mul: 8904 case Instruction::Or: 8905 case Instruction::PtrToInt: 8906 case Instruction::SDiv: 8907 case Instruction::Select: 8908 case Instruction::SExt: 8909 
case Instruction::Shl: 8910 case Instruction::SIToFP: 8911 case Instruction::SRem: 8912 case Instruction::Sub: 8913 case Instruction::Trunc: 8914 case Instruction::UDiv: 8915 case Instruction::UIToFP: 8916 case Instruction::URem: 8917 case Instruction::Xor: 8918 case Instruction::ZExt: 8919 return true; 8920 } 8921 return false; 8922 }; 8923 8924 if (!IsVectorizableOpcode(I->getOpcode())) 8925 return nullptr; 8926 8927 // Success: widen this instruction. 8928 return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end())); 8929 } 8930 8931 void VPRecipeBuilder::fixHeaderPhis() { 8932 BasicBlock *OrigLatch = OrigLoop->getLoopLatch(); 8933 for (VPWidenPHIRecipe *R : PhisToFix) { 8934 auto *PN = cast<PHINode>(R->getUnderlyingValue()); 8935 VPRecipeBase *IncR = 8936 getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch))); 8937 R->addOperand(IncR->getVPSingleValue()); 8938 } 8939 } 8940 8941 VPBasicBlock *VPRecipeBuilder::handleReplication( 8942 Instruction *I, VFRange &Range, VPBasicBlock *VPBB, 8943 VPlanPtr &Plan) { 8944 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( 8945 [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); }, 8946 Range); 8947 8948 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8949 [&](ElementCount VF) { return CM.isPredicatedInst(I, IsUniform); }, 8950 Range); 8951 8952 // Even if the instruction is not marked as uniform, there are certain 8953 // intrinsic calls that can be effectively treated as such, so we check for 8954 // them here. Conservatively, we only do this for scalable vectors, since 8955 // for fixed-width VFs we can always fall back on full scalarization. 
8956 if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) { 8957 switch (cast<IntrinsicInst>(I)->getIntrinsicID()) { 8958 case Intrinsic::assume: 8959 case Intrinsic::lifetime_start: 8960 case Intrinsic::lifetime_end: 8961 // For scalable vectors if one of the operands is variant then we still 8962 // want to mark as uniform, which will generate one instruction for just 8963 // the first lane of the vector. We can't scalarize the call in the same 8964 // way as for fixed-width vectors because we don't know how many lanes 8965 // there are. 8966 // 8967 // The reasons for doing it this way for scalable vectors are: 8968 // 1. For the assume intrinsic generating the instruction for the first 8969 // lane is still be better than not generating any at all. For 8970 // example, the input may be a splat across all lanes. 8971 // 2. For the lifetime start/end intrinsics the pointer operand only 8972 // does anything useful when the input comes from a stack object, 8973 // which suggests it should always be uniform. For non-stack objects 8974 // the effect is to poison the object, which still allows us to 8975 // remove the call. 8976 IsUniform = true; 8977 break; 8978 default: 8979 break; 8980 } 8981 } 8982 8983 auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()), 8984 IsUniform, IsPredicated); 8985 setRecipe(I, Recipe); 8986 Plan->addVPValue(I, Recipe); 8987 8988 // Find if I uses a predicated instruction. If so, it will use its scalar 8989 // value. Avoid hoisting the insert-element which packs the scalar value into 8990 // a vector value, as that happens iff all users use the vector value. 
8991 for (VPValue *Op : Recipe->operands()) { 8992 auto *PredR = dyn_cast_or_null<VPPredInstPHIRecipe>(Op->getDef()); 8993 if (!PredR) 8994 continue; 8995 auto *RepR = 8996 cast_or_null<VPReplicateRecipe>(PredR->getOperand(0)->getDef()); 8997 assert(RepR->isPredicated() && 8998 "expected Replicate recipe to be predicated"); 8999 RepR->setAlsoPack(false); 9000 } 9001 9002 // Finalize the recipe for Instr, first if it is not predicated. 9003 if (!IsPredicated) { 9004 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n"); 9005 VPBB->appendRecipe(Recipe); 9006 return VPBB; 9007 } 9008 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); 9009 assert(VPBB->getSuccessors().empty() && 9010 "VPBB has successors when handling predicated replication."); 9011 // Record predicated instructions for above packing optimizations. 9012 VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan); 9013 VPBlockUtils::insertBlockAfter(Region, VPBB); 9014 auto *RegSucc = new VPBasicBlock(); 9015 VPBlockUtils::insertBlockAfter(RegSucc, Region); 9016 return RegSucc; 9017 } 9018 9019 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr, 9020 VPRecipeBase *PredRecipe, 9021 VPlanPtr &Plan) { 9022 // Instructions marked for predication are replicated and placed under an 9023 // if-then construct to prevent side-effects. 9024 9025 // Generate recipes to compute the block mask for this region. 9026 VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan); 9027 9028 // Build the triangular if-then region. 9029 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str(); 9030 assert(Instr->getParent() && "Predicated instruction not in any basic block"); 9031 auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask); 9032 auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); 9033 auto *PHIRecipe = Instr->getType()->isVoidTy() 9034 ? 
nullptr 9035 : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr)); 9036 if (PHIRecipe) { 9037 Plan->removeVPValueFor(Instr); 9038 Plan->addVPValue(Instr, PHIRecipe); 9039 } 9040 auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); 9041 auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); 9042 VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true); 9043 9044 // Note: first set Entry as region entry and then connect successors starting 9045 // from it in order, to propagate the "parent" of each VPBasicBlock. 9046 VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry); 9047 VPBlockUtils::connectBlocks(Pred, Exit); 9048 9049 return Region; 9050 } 9051 9052 VPRecipeOrVPValueTy 9053 VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, 9054 ArrayRef<VPValue *> Operands, 9055 VFRange &Range, VPlanPtr &Plan) { 9056 // First, check for specific widening recipes that deal with calls, memory 9057 // operations, inductions and Phi nodes. 
9058 if (auto *CI = dyn_cast<CallInst>(Instr)) 9059 return toVPRecipeResult(tryToWidenCall(CI, Operands, Range)); 9060 9061 if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr)) 9062 return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan)); 9063 9064 VPRecipeBase *Recipe; 9065 if (auto Phi = dyn_cast<PHINode>(Instr)) { 9066 if (Phi->getParent() != OrigLoop->getHeader()) 9067 return tryToBlend(Phi, Operands, Plan); 9068 if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands))) 9069 return toVPRecipeResult(Recipe); 9070 9071 VPWidenPHIRecipe *PhiRecipe = nullptr; 9072 if (Legal->isReductionVariable(Phi) || Legal->isFirstOrderRecurrence(Phi)) { 9073 VPValue *StartV = Operands[0]; 9074 if (Legal->isReductionVariable(Phi)) { 9075 RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi]; 9076 assert(RdxDesc.getRecurrenceStartValue() == 9077 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); 9078 PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV, 9079 CM.isInLoopReduction(Phi), 9080 CM.useOrderedReductions(RdxDesc)); 9081 } else { 9082 PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV); 9083 } 9084 9085 // Record the incoming value from the backedge, so we can add the incoming 9086 // value from the backedge after all recipes have been created. 9087 recordRecipeOf(cast<Instruction>( 9088 Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch()))); 9089 PhisToFix.push_back(PhiRecipe); 9090 } else { 9091 // TODO: record start and backedge value for remaining pointer induction 9092 // phis. 
9093 assert(Phi->getType()->isPointerTy() && 9094 "only pointer phis should be handled here"); 9095 PhiRecipe = new VPWidenPHIRecipe(Phi); 9096 } 9097 9098 return toVPRecipeResult(PhiRecipe); 9099 } 9100 9101 if (isa<TruncInst>(Instr) && 9102 (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Operands, 9103 Range, *Plan))) 9104 return toVPRecipeResult(Recipe); 9105 9106 if (!shouldWiden(Instr, Range)) 9107 return nullptr; 9108 9109 if (auto GEP = dyn_cast<GetElementPtrInst>(Instr)) 9110 return toVPRecipeResult(new VPWidenGEPRecipe( 9111 GEP, make_range(Operands.begin(), Operands.end()), OrigLoop)); 9112 9113 if (auto *SI = dyn_cast<SelectInst>(Instr)) { 9114 bool InvariantCond = 9115 PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop); 9116 return toVPRecipeResult(new VPWidenSelectRecipe( 9117 *SI, make_range(Operands.begin(), Operands.end()), InvariantCond)); 9118 } 9119 9120 return toVPRecipeResult(tryToWiden(Instr, Operands)); 9121 } 9122 9123 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, 9124 ElementCount MaxVF) { 9125 assert(OrigLoop->isInnermost() && "Inner loop expected."); 9126 9127 // Collect instructions from the original loop that will become trivially dead 9128 // in the vectorized loop. We don't need to vectorize these instructions. For 9129 // example, original induction update instructions can become dead because we 9130 // separately emit induction "steps" when generating code for the new loop. 9131 // Similarly, we create a new latch condition when setting up the structure 9132 // of the new loop, so the old one can become dead. 9133 SmallPtrSet<Instruction *, 4> DeadInstructions; 9134 collectTriviallyDeadInstructions(DeadInstructions); 9135 9136 // Add assume instructions we need to drop to DeadInstructions, to prevent 9137 // them from being added to the VPlan. 9138 // TODO: We only need to drop assumes in blocks that get flattend. 
If the 9139 // control flow is preserved, we should keep them. 9140 auto &ConditionalAssumes = Legal->getConditionalAssumes(); 9141 DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end()); 9142 9143 MapVector<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter(); 9144 // Dead instructions do not need sinking. Remove them from SinkAfter. 9145 for (Instruction *I : DeadInstructions) 9146 SinkAfter.erase(I); 9147 9148 // Cannot sink instructions after dead instructions (there won't be any 9149 // recipes for them). Instead, find the first non-dead previous instruction. 9150 for (auto &P : Legal->getSinkAfter()) { 9151 Instruction *SinkTarget = P.second; 9152 Instruction *FirstInst = &*SinkTarget->getParent()->begin(); 9153 (void)FirstInst; 9154 while (DeadInstructions.contains(SinkTarget)) { 9155 assert( 9156 SinkTarget != FirstInst && 9157 "Must find a live instruction (at least the one feeding the " 9158 "first-order recurrence PHI) before reaching beginning of the block"); 9159 SinkTarget = SinkTarget->getPrevNode(); 9160 assert(SinkTarget != P.first && 9161 "sink source equals target, no sinking required"); 9162 } 9163 P.second = SinkTarget; 9164 } 9165 9166 auto MaxVFPlusOne = MaxVF.getWithIncrement(1); 9167 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { 9168 VFRange SubRange = {VF, MaxVFPlusOne}; 9169 VPlans.push_back( 9170 buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter)); 9171 VF = SubRange.End; 9172 } 9173 } 9174 9175 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( 9176 VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions, 9177 const MapVector<Instruction *, Instruction *> &SinkAfter) { 9178 9179 SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups; 9180 9181 VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder); 9182 9183 // --------------------------------------------------------------------------- 9184 // Pre-construction: 
record ingredients whose recipes we'll need to further 9185 // process after constructing the initial VPlan. 9186 // --------------------------------------------------------------------------- 9187 9188 // Mark instructions we'll need to sink later and their targets as 9189 // ingredients whose recipe we'll need to record. 9190 for (auto &Entry : SinkAfter) { 9191 RecipeBuilder.recordRecipeOf(Entry.first); 9192 RecipeBuilder.recordRecipeOf(Entry.second); 9193 } 9194 for (auto &Reduction : CM.getInLoopReductionChains()) { 9195 PHINode *Phi = Reduction.first; 9196 RecurKind Kind = Legal->getReductionVars()[Phi].getRecurrenceKind(); 9197 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second; 9198 9199 RecipeBuilder.recordRecipeOf(Phi); 9200 for (auto &R : ReductionOperations) { 9201 RecipeBuilder.recordRecipeOf(R); 9202 // For min/max reducitons, where we have a pair of icmp/select, we also 9203 // need to record the ICmp recipe, so it can be removed later. 9204 assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) && 9205 "Only min/max recurrences allowed for inloop reductions"); 9206 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) 9207 RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0))); 9208 } 9209 } 9210 9211 // For each interleave group which is relevant for this (possibly trimmed) 9212 // Range, add it to the set of groups to be later applied to the VPlan and add 9213 // placeholders for its members' Recipes which we'll be replacing with a 9214 // single VPInterleaveRecipe. 
9215 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) { 9216 auto applyIG = [IG, this](ElementCount VF) -> bool { 9217 return (VF.isVector() && // Query is illegal for VF == 1 9218 CM.getWideningDecision(IG->getInsertPos(), VF) == 9219 LoopVectorizationCostModel::CM_Interleave); 9220 }; 9221 if (!getDecisionAndClampRange(applyIG, Range)) 9222 continue; 9223 InterleaveGroups.insert(IG); 9224 for (unsigned i = 0; i < IG->getFactor(); i++) 9225 if (Instruction *Member = IG->getMember(i)) 9226 RecipeBuilder.recordRecipeOf(Member); 9227 }; 9228 9229 // --------------------------------------------------------------------------- 9230 // Build initial VPlan: Scan the body of the loop in a topological order to 9231 // visit each basic block after having visited its predecessor basic blocks. 9232 // --------------------------------------------------------------------------- 9233 9234 auto Plan = std::make_unique<VPlan>(); 9235 9236 // Scan the body of the loop in a topological order to visit each basic block 9237 // after having visited its predecessor basic blocks. 9238 LoopBlocksDFS DFS(OrigLoop); 9239 DFS.perform(LI); 9240 9241 VPBasicBlock *VPBB = nullptr; 9242 VPBasicBlock *HeaderVPBB = nullptr; 9243 SmallVector<VPWidenIntOrFpInductionRecipe *> InductionsToMove; 9244 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 9245 // Relevant instructions from basic block BB will be grouped into VPRecipe 9246 // ingredients and fill a new VPBasicBlock. 9247 unsigned VPBBsForBB = 0; 9248 auto *FirstVPBBForBB = new VPBasicBlock(BB->getName()); 9249 if (VPBB) 9250 VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB); 9251 else { 9252 auto *TopRegion = new VPRegionBlock("vector loop"); 9253 TopRegion->setEntry(FirstVPBBForBB); 9254 Plan->setEntry(TopRegion); 9255 HeaderVPBB = FirstVPBBForBB; 9256 } 9257 VPBB = FirstVPBBForBB; 9258 Builder.setInsertPoint(VPBB); 9259 9260 // Introduce each ingredient into VPlan. 
9261 // TODO: Model and preserve debug instrinsics in VPlan. 9262 for (Instruction &I : BB->instructionsWithoutDebug()) { 9263 Instruction *Instr = &I; 9264 9265 // First filter out irrelevant instructions, to ensure no recipes are 9266 // built for them. 9267 if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr)) 9268 continue; 9269 9270 SmallVector<VPValue *, 4> Operands; 9271 auto *Phi = dyn_cast<PHINode>(Instr); 9272 if (Phi && Phi->getParent() == OrigLoop->getHeader()) { 9273 Operands.push_back(Plan->getOrAddVPValue( 9274 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()))); 9275 } else { 9276 auto OpRange = Plan->mapToVPValues(Instr->operands()); 9277 Operands = {OpRange.begin(), OpRange.end()}; 9278 } 9279 if (auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe( 9280 Instr, Operands, Range, Plan)) { 9281 // If Instr can be simplified to an existing VPValue, use it. 9282 if (RecipeOrValue.is<VPValue *>()) { 9283 auto *VPV = RecipeOrValue.get<VPValue *>(); 9284 Plan->addVPValue(Instr, VPV); 9285 // If the re-used value is a recipe, register the recipe for the 9286 // instruction, in case the recipe for Instr needs to be recorded. 9287 if (auto *R = dyn_cast_or_null<VPRecipeBase>(VPV->getDef())) 9288 RecipeBuilder.setRecipe(Instr, R); 9289 continue; 9290 } 9291 // Otherwise, add the new recipe. 9292 VPRecipeBase *Recipe = RecipeOrValue.get<VPRecipeBase *>(); 9293 for (auto *Def : Recipe->definedValues()) { 9294 auto *UV = Def->getUnderlyingValue(); 9295 Plan->addVPValue(UV, Def); 9296 } 9297 9298 if (isa<VPWidenIntOrFpInductionRecipe>(Recipe) && 9299 HeaderVPBB->getFirstNonPhi() != VPBB->end()) { 9300 // Keep track of VPWidenIntOrFpInductionRecipes not in the phi section 9301 // of the header block. That can happen for truncates of induction 9302 // variables. Those recipes are moved to the phi section of the header 9303 // block after applying SinkAfter, which relies on the original 9304 // position of the trunc. 
9305 assert(isa<TruncInst>(Instr)); 9306 InductionsToMove.push_back( 9307 cast<VPWidenIntOrFpInductionRecipe>(Recipe)); 9308 } 9309 RecipeBuilder.setRecipe(Instr, Recipe); 9310 VPBB->appendRecipe(Recipe); 9311 continue; 9312 } 9313 9314 // Otherwise, if all widening options failed, Instruction is to be 9315 // replicated. This may create a successor for VPBB. 9316 VPBasicBlock *NextVPBB = 9317 RecipeBuilder.handleReplication(Instr, Range, VPBB, Plan); 9318 if (NextVPBB != VPBB) { 9319 VPBB = NextVPBB; 9320 VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++) 9321 : ""); 9322 } 9323 } 9324 } 9325 9326 assert(isa<VPRegionBlock>(Plan->getEntry()) && 9327 !Plan->getEntry()->getEntryBasicBlock()->empty() && 9328 "entry block must be set to a VPRegionBlock having a non-empty entry " 9329 "VPBasicBlock"); 9330 cast<VPRegionBlock>(Plan->getEntry())->setExit(VPBB); 9331 RecipeBuilder.fixHeaderPhis(); 9332 9333 // --------------------------------------------------------------------------- 9334 // Transform initial VPlan: Apply previously taken decisions, in order, to 9335 // bring the VPlan to its final state. 9336 // --------------------------------------------------------------------------- 9337 9338 // Apply Sink-After legal constraints. 
9339 auto GetReplicateRegion = [](VPRecipeBase *R) -> VPRegionBlock * { 9340 auto *Region = dyn_cast_or_null<VPRegionBlock>(R->getParent()->getParent()); 9341 if (Region && Region->isReplicator()) { 9342 assert(Region->getNumSuccessors() == 1 && 9343 Region->getNumPredecessors() == 1 && "Expected SESE region!"); 9344 assert(R->getParent()->size() == 1 && 9345 "A recipe in an original replicator region must be the only " 9346 "recipe in its block"); 9347 return Region; 9348 } 9349 return nullptr; 9350 }; 9351 for (auto &Entry : SinkAfter) { 9352 VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first); 9353 VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second); 9354 9355 auto *TargetRegion = GetReplicateRegion(Target); 9356 auto *SinkRegion = GetReplicateRegion(Sink); 9357 if (!SinkRegion) { 9358 // If the sink source is not a replicate region, sink the recipe directly. 9359 if (TargetRegion) { 9360 // The target is in a replication region, make sure to move Sink to 9361 // the block after it, not into the replication region itself. 9362 VPBasicBlock *NextBlock = 9363 cast<VPBasicBlock>(TargetRegion->getSuccessors().front()); 9364 Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi()); 9365 } else 9366 Sink->moveAfter(Target); 9367 continue; 9368 } 9369 9370 // The sink source is in a replicate region. Unhook the region from the CFG. 9371 auto *SinkPred = SinkRegion->getSinglePredecessor(); 9372 auto *SinkSucc = SinkRegion->getSingleSuccessor(); 9373 VPBlockUtils::disconnectBlocks(SinkPred, SinkRegion); 9374 VPBlockUtils::disconnectBlocks(SinkRegion, SinkSucc); 9375 VPBlockUtils::connectBlocks(SinkPred, SinkSucc); 9376 9377 if (TargetRegion) { 9378 // The target recipe is also in a replicate region, move the sink region 9379 // after the target region. 
9380 auto *TargetSucc = TargetRegion->getSingleSuccessor(); 9381 VPBlockUtils::disconnectBlocks(TargetRegion, TargetSucc); 9382 VPBlockUtils::connectBlocks(TargetRegion, SinkRegion); 9383 VPBlockUtils::connectBlocks(SinkRegion, TargetSucc); 9384 } else { 9385 // The sink source is in a replicate region, we need to move the whole 9386 // replicate region, which should only contain a single recipe in the 9387 // main block. 9388 auto *SplitBlock = 9389 Target->getParent()->splitAt(std::next(Target->getIterator())); 9390 9391 auto *SplitPred = SplitBlock->getSinglePredecessor(); 9392 9393 VPBlockUtils::disconnectBlocks(SplitPred, SplitBlock); 9394 VPBlockUtils::connectBlocks(SplitPred, SinkRegion); 9395 VPBlockUtils::connectBlocks(SinkRegion, SplitBlock); 9396 if (VPBB == SplitPred) 9397 VPBB = SplitBlock; 9398 } 9399 } 9400 9401 // Now that sink-after is done, move induction recipes for optimized truncates 9402 // to the phi section of the header block. 9403 for (VPWidenIntOrFpInductionRecipe *Ind : InductionsToMove) 9404 Ind->moveBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi()); 9405 9406 // Adjust the recipes for any inloop reductions. 9407 adjustRecipesForReductions(VPBB, Plan, RecipeBuilder, Range.Start); 9408 9409 // Introduce a recipe to combine the incoming and previous values of a 9410 // first-order recurrence. 
9411 for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) { 9412 auto *RecurPhi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R); 9413 if (!RecurPhi) 9414 continue; 9415 9416 VPRecipeBase *PrevRecipe = RecurPhi->getBackedgeRecipe(); 9417 VPBasicBlock *InsertBlock = PrevRecipe->getParent(); 9418 auto *Region = GetReplicateRegion(PrevRecipe); 9419 if (Region) 9420 InsertBlock = cast<VPBasicBlock>(Region->getSingleSuccessor()); 9421 if (Region || PrevRecipe->isPhi()) 9422 Builder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi()); 9423 else 9424 Builder.setInsertPoint(InsertBlock, std::next(PrevRecipe->getIterator())); 9425 9426 auto *RecurSplice = cast<VPInstruction>( 9427 Builder.createNaryOp(VPInstruction::FirstOrderRecurrenceSplice, 9428 {RecurPhi, RecurPhi->getBackedgeValue()})); 9429 9430 RecurPhi->replaceAllUsesWith(RecurSplice); 9431 // Set the first operand of RecurSplice to RecurPhi again, after replacing 9432 // all users. 9433 RecurSplice->setOperand(0, RecurPhi); 9434 } 9435 9436 // Interleave memory: for each Interleave Group we marked earlier as relevant 9437 // for this VPlan, replace the Recipes widening its memory instructions with a 9438 // single VPInterleaveRecipe at its insertion point. 
9439 for (auto IG : InterleaveGroups) { 9440 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( 9441 RecipeBuilder.getRecipe(IG->getInsertPos())); 9442 SmallVector<VPValue *, 4> StoredValues; 9443 for (unsigned i = 0; i < IG->getFactor(); ++i) 9444 if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) { 9445 auto *StoreR = 9446 cast<VPWidenMemoryInstructionRecipe>(RecipeBuilder.getRecipe(SI)); 9447 StoredValues.push_back(StoreR->getStoredValue()); 9448 } 9449 9450 auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues, 9451 Recipe->getMask()); 9452 VPIG->insertBefore(Recipe); 9453 unsigned J = 0; 9454 for (unsigned i = 0; i < IG->getFactor(); ++i) 9455 if (Instruction *Member = IG->getMember(i)) { 9456 if (!Member->getType()->isVoidTy()) { 9457 VPValue *OriginalV = Plan->getVPValue(Member); 9458 Plan->removeVPValueFor(Member); 9459 Plan->addVPValue(Member, VPIG->getVPValue(J)); 9460 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J)); 9461 J++; 9462 } 9463 RecipeBuilder.getRecipe(Member)->eraseFromParent(); 9464 } 9465 } 9466 9467 // From this point onwards, VPlan-to-VPlan transformations may change the plan 9468 // in ways that accessing values using original IR values is incorrect. 
9469 Plan->disableValue2VPValue(); 9470 9471 VPlanTransforms::sinkScalarOperands(*Plan); 9472 VPlanTransforms::mergeReplicateRegions(*Plan); 9473 9474 std::string PlanName; 9475 raw_string_ostream RSO(PlanName); 9476 ElementCount VF = Range.Start; 9477 Plan->addVF(VF); 9478 RSO << "Initial VPlan for VF={" << VF; 9479 for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) { 9480 Plan->addVF(VF); 9481 RSO << "," << VF; 9482 } 9483 RSO << "},UF>=1"; 9484 RSO.flush(); 9485 Plan->setName(PlanName); 9486 9487 assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid"); 9488 return Plan; 9489 } 9490 9491 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { 9492 // Outer loop handling: They may require CFG and instruction level 9493 // transformations before even evaluating whether vectorization is profitable. 9494 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 9495 // the vectorization pipeline. 9496 assert(!OrigLoop->isInnermost()); 9497 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 9498 9499 // Create new empty VPlan 9500 auto Plan = std::make_unique<VPlan>(); 9501 9502 // Build hierarchical CFG 9503 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan); 9504 HCFGBuilder.buildHierarchicalCFG(); 9505 9506 for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End); 9507 VF *= 2) 9508 Plan->addVF(VF); 9509 9510 if (EnableVPlanPredication) { 9511 VPlanPredicator VPP(*Plan); 9512 VPP.predicate(); 9513 9514 // Avoid running transformation to recipes until masked code generation in 9515 // VPlan-native path is in place. 9516 return Plan; 9517 } 9518 9519 SmallPtrSet<Instruction *, 1> DeadInstructions; 9520 VPlanTransforms::VPInstructionsToVPRecipes(OrigLoop, Plan, 9521 Legal->getInductionVars(), 9522 DeadInstructions, *PSE.getSE()); 9523 return Plan; 9524 } 9525 9526 // Adjust the recipes for reductions. 
For in-loop reductions the chain of 9527 // instructions leading from the loop exit instr to the phi need to be converted 9528 // to reductions, with one operand being vector and the other being the scalar 9529 // reduction chain. For other reductions, a select is introduced between the phi 9530 // and live-out recipes when folding the tail. 9531 void LoopVectorizationPlanner::adjustRecipesForReductions( 9532 VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, 9533 ElementCount MinVF) { 9534 for (auto &Reduction : CM.getInLoopReductionChains()) { 9535 PHINode *Phi = Reduction.first; 9536 RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi]; 9537 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second; 9538 9539 if (MinVF.isScalar() && !CM.useOrderedReductions(RdxDesc)) 9540 continue; 9541 9542 // ReductionOperations are orders top-down from the phi's use to the 9543 // LoopExitValue. We keep a track of the previous item (the Chain) to tell 9544 // which of the two operands will remain scalar and which will be reduced. 9545 // For minmax the chain will be the select instructions. 9546 Instruction *Chain = Phi; 9547 for (Instruction *R : ReductionOperations) { 9548 VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R); 9549 RecurKind Kind = RdxDesc.getRecurrenceKind(); 9550 9551 VPValue *ChainOp = Plan->getVPValue(Chain); 9552 unsigned FirstOpId; 9553 assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) && 9554 "Only min/max recurrences allowed for inloop reductions"); 9555 // Recognize a call to the llvm.fmuladd intrinsic. 
9556 bool IsFMulAdd = (Kind == RecurKind::FMulAdd); 9557 assert((!IsFMulAdd || RecurrenceDescriptor::isFMulAddIntrinsic(R)) && 9558 "Expected instruction to be a call to the llvm.fmuladd intrinsic"); 9559 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9560 assert(isa<VPWidenSelectRecipe>(WidenRecipe) && 9561 "Expected to replace a VPWidenSelectSC"); 9562 FirstOpId = 1; 9563 } else { 9564 assert((MinVF.isScalar() || isa<VPWidenRecipe>(WidenRecipe) || 9565 (IsFMulAdd && isa<VPWidenCallRecipe>(WidenRecipe))) && 9566 "Expected to replace a VPWidenSC"); 9567 FirstOpId = 0; 9568 } 9569 unsigned VecOpId = 9570 R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId; 9571 VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId)); 9572 9573 auto *CondOp = CM.foldTailByMasking() 9574 ? RecipeBuilder.createBlockInMask(R->getParent(), Plan) 9575 : nullptr; 9576 9577 if (IsFMulAdd) { 9578 // If the instruction is a call to the llvm.fmuladd intrinsic then we 9579 // need to create an fmul recipe to use as the vector operand for the 9580 // fadd reduction. 
9581 VPInstruction *FMulRecipe = new VPInstruction( 9582 Instruction::FMul, {VecOp, Plan->getVPValue(R->getOperand(1))}); 9583 FMulRecipe->setFastMathFlags(R->getFastMathFlags()); 9584 WidenRecipe->getParent()->insert(FMulRecipe, 9585 WidenRecipe->getIterator()); 9586 VecOp = FMulRecipe; 9587 } 9588 VPReductionRecipe *RedRecipe = 9589 new VPReductionRecipe(&RdxDesc, R, ChainOp, VecOp, CondOp, TTI); 9590 WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe); 9591 Plan->removeVPValueFor(R); 9592 Plan->addVPValue(R, RedRecipe); 9593 WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator()); 9594 WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe); 9595 WidenRecipe->eraseFromParent(); 9596 9597 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9598 VPRecipeBase *CompareRecipe = 9599 RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0))); 9600 assert(isa<VPWidenRecipe>(CompareRecipe) && 9601 "Expected to replace a VPWidenSC"); 9602 assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 && 9603 "Expected no remaining users"); 9604 CompareRecipe->eraseFromParent(); 9605 } 9606 Chain = R; 9607 } 9608 } 9609 9610 // If tail is folded by masking, introduce selects between the phi 9611 // and the live-out instruction of each reduction, at the end of the latch. 
9612 if (CM.foldTailByMasking()) { 9613 for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) { 9614 VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R); 9615 if (!PhiR || PhiR->isInLoop()) 9616 continue; 9617 Builder.setInsertPoint(LatchVPBB); 9618 VPValue *Cond = 9619 RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan); 9620 VPValue *Red = PhiR->getBackedgeValue(); 9621 Builder.createNaryOp(Instruction::Select, {Cond, Red, PhiR}); 9622 } 9623 } 9624 } 9625 9626 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 9627 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, 9628 VPSlotTracker &SlotTracker) const { 9629 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; 9630 IG->getInsertPos()->printAsOperand(O, false); 9631 O << ", "; 9632 getAddr()->printAsOperand(O, SlotTracker); 9633 VPValue *Mask = getMask(); 9634 if (Mask) { 9635 O << ", "; 9636 Mask->printAsOperand(O, SlotTracker); 9637 } 9638 9639 unsigned OpIdx = 0; 9640 for (unsigned i = 0; i < IG->getFactor(); ++i) { 9641 if (!IG->getMember(i)) 9642 continue; 9643 if (getNumStoreOperands() > 0) { 9644 O << "\n" << Indent << " store "; 9645 getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker); 9646 O << " to index " << i; 9647 } else { 9648 O << "\n" << Indent << " "; 9649 getVPValue(OpIdx)->printAsOperand(O, SlotTracker); 9650 O << " = load from index " << i; 9651 } 9652 ++OpIdx; 9653 } 9654 } 9655 #endif 9656 9657 void VPWidenCallRecipe::execute(VPTransformState &State) { 9658 State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this, 9659 *this, State); 9660 } 9661 9662 void VPWidenSelectRecipe::execute(VPTransformState &State) { 9663 State.ILV->widenSelectInstruction(*cast<SelectInst>(getUnderlyingInstr()), 9664 this, *this, InvariantCond, State); 9665 } 9666 9667 void VPWidenRecipe::execute(VPTransformState &State) { 9668 auto &I = *cast<Instruction>(getUnderlyingValue()); 9669 auto &Builder = State.Builder; 
9670 switch (I.getOpcode()) { 9671 case Instruction::Call: 9672 case Instruction::Br: 9673 case Instruction::PHI: 9674 case Instruction::GetElementPtr: 9675 case Instruction::Select: 9676 llvm_unreachable("This instruction is handled by a different recipe."); 9677 case Instruction::UDiv: 9678 case Instruction::SDiv: 9679 case Instruction::SRem: 9680 case Instruction::URem: 9681 case Instruction::Add: 9682 case Instruction::FAdd: 9683 case Instruction::Sub: 9684 case Instruction::FSub: 9685 case Instruction::FNeg: 9686 case Instruction::Mul: 9687 case Instruction::FMul: 9688 case Instruction::FDiv: 9689 case Instruction::FRem: 9690 case Instruction::Shl: 9691 case Instruction::LShr: 9692 case Instruction::AShr: 9693 case Instruction::And: 9694 case Instruction::Or: 9695 case Instruction::Xor: { 9696 // Just widen unops and binops. 9697 State.ILV->setDebugLocFromInst(&I); 9698 9699 for (unsigned Part = 0; Part < State.UF; ++Part) { 9700 SmallVector<Value *, 2> Ops; 9701 for (VPValue *VPOp : operands()) 9702 Ops.push_back(State.get(VPOp, Part)); 9703 9704 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); 9705 9706 if (auto *VecOp = dyn_cast<Instruction>(V)) { 9707 VecOp->copyIRFlags(&I); 9708 9709 // If the instruction is vectorized and was in a basic block that needed 9710 // predication, we can't propagate poison-generating flags (nuw/nsw, 9711 // exact, etc.). The control flow has been linearized and the 9712 // instruction is no longer guarded by the predicate, which could make 9713 // the flag properties to no longer hold. 9714 if (State.MayGeneratePoisonRecipes.count(this) > 0) 9715 VecOp->dropPoisonGeneratingFlags(); 9716 } 9717 9718 // Use this vector value for all users of the original instruction. 9719 State.set(this, V, Part); 9720 State.ILV->addMetadata(V, &I); 9721 } 9722 9723 break; 9724 } 9725 case Instruction::ICmp: 9726 case Instruction::FCmp: { 9727 // Widen compares. Generate vector compares. 
9728 bool FCmp = (I.getOpcode() == Instruction::FCmp); 9729 auto *Cmp = cast<CmpInst>(&I); 9730 State.ILV->setDebugLocFromInst(Cmp); 9731 for (unsigned Part = 0; Part < State.UF; ++Part) { 9732 Value *A = State.get(getOperand(0), Part); 9733 Value *B = State.get(getOperand(1), Part); 9734 Value *C = nullptr; 9735 if (FCmp) { 9736 // Propagate fast math flags. 9737 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 9738 Builder.setFastMathFlags(Cmp->getFastMathFlags()); 9739 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); 9740 } else { 9741 C = Builder.CreateICmp(Cmp->getPredicate(), A, B); 9742 } 9743 State.set(this, C, Part); 9744 State.ILV->addMetadata(C, &I); 9745 } 9746 9747 break; 9748 } 9749 9750 case Instruction::ZExt: 9751 case Instruction::SExt: 9752 case Instruction::FPToUI: 9753 case Instruction::FPToSI: 9754 case Instruction::FPExt: 9755 case Instruction::PtrToInt: 9756 case Instruction::IntToPtr: 9757 case Instruction::SIToFP: 9758 case Instruction::UIToFP: 9759 case Instruction::Trunc: 9760 case Instruction::FPTrunc: 9761 case Instruction::BitCast: { 9762 auto *CI = cast<CastInst>(&I); 9763 State.ILV->setDebugLocFromInst(CI); 9764 9765 /// Vectorize casts. 9766 Type *DestTy = (State.VF.isScalar()) 9767 ? CI->getType() 9768 : VectorType::get(CI->getType(), State.VF); 9769 9770 for (unsigned Part = 0; Part < State.UF; ++Part) { 9771 Value *A = State.get(getOperand(0), Part); 9772 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); 9773 State.set(this, Cast, Part); 9774 State.ILV->addMetadata(Cast, &I); 9775 } 9776 break; 9777 } 9778 default: 9779 // This instruction is not vectorized by simple widening. 9780 LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); 9781 llvm_unreachable("Unhandled instruction!"); 9782 } // end of switch. 
9783 } 9784 9785 void VPWidenGEPRecipe::execute(VPTransformState &State) { 9786 auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr()); 9787 // Construct a vector GEP by widening the operands of the scalar GEP as 9788 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP 9789 // results in a vector of pointers when at least one operand of the GEP 9790 // is vector-typed. Thus, to keep the representation compact, we only use 9791 // vector-typed operands for loop-varying values. 9792 9793 if (State.VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { 9794 // If we are vectorizing, but the GEP has only loop-invariant operands, 9795 // the GEP we build (by only using vector-typed operands for 9796 // loop-varying values) would be a scalar pointer. Thus, to ensure we 9797 // produce a vector of pointers, we need to either arbitrarily pick an 9798 // operand to broadcast, or broadcast a clone of the original GEP. 9799 // Here, we broadcast a clone of the original. 9800 // 9801 // TODO: If at some point we decide to scalarize instructions having 9802 // loop-invariant operands, this special case will no longer be 9803 // required. We would add the scalarization decision to 9804 // collectLoopScalars() and teach getVectorValue() to broadcast 9805 // the lane-zero scalar value. 9806 auto *Clone = State.Builder.Insert(GEP->clone()); 9807 for (unsigned Part = 0; Part < State.UF; ++Part) { 9808 Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, Clone); 9809 State.set(this, EntryPart, Part); 9810 State.ILV->addMetadata(EntryPart, GEP); 9811 } 9812 } else { 9813 // If the GEP has at least one loop-varying operand, we are sure to 9814 // produce a vector of pointers. But if we are only unrolling, we want 9815 // to produce a scalar GEP for each unroll part. Thus, the GEP we 9816 // produce with the code below will be scalar (if VF == 1) or vector 9817 // (otherwise). 
Note that for the unroll-only case, we still maintain 9818 // values in the vector mapping with initVector, as we do for other 9819 // instructions. 9820 for (unsigned Part = 0; Part < State.UF; ++Part) { 9821 // The pointer operand of the new GEP. If it's loop-invariant, we 9822 // won't broadcast it. 9823 auto *Ptr = IsPtrLoopInvariant 9824 ? State.get(getOperand(0), VPIteration(0, 0)) 9825 : State.get(getOperand(0), Part); 9826 9827 // Collect all the indices for the new GEP. If any index is 9828 // loop-invariant, we won't broadcast it. 9829 SmallVector<Value *, 4> Indices; 9830 for (unsigned I = 1, E = getNumOperands(); I < E; I++) { 9831 VPValue *Operand = getOperand(I); 9832 if (IsIndexLoopInvariant[I - 1]) 9833 Indices.push_back(State.get(Operand, VPIteration(0, 0))); 9834 else 9835 Indices.push_back(State.get(Operand, Part)); 9836 } 9837 9838 // If the GEP instruction is vectorized and was in a basic block that 9839 // needed predication, we can't propagate the poison-generating 'inbounds' 9840 // flag. The control flow has been linearized and the GEP is no longer 9841 // guarded by the predicate, which could make the 'inbounds' properties to 9842 // no longer hold. 9843 bool IsInBounds = 9844 GEP->isInBounds() && State.MayGeneratePoisonRecipes.count(this) == 0; 9845 9846 // Create the new GEP. Note that this GEP may be a scalar if VF == 1, 9847 // but it should be a vector, otherwise. 9848 auto *NewGEP = IsInBounds 9849 ? 
State.Builder.CreateInBoundsGEP( 9850 GEP->getSourceElementType(), Ptr, Indices) 9851 : State.Builder.CreateGEP(GEP->getSourceElementType(), 9852 Ptr, Indices); 9853 assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) && 9854 "NewGEP is not a pointer vector"); 9855 State.set(this, NewGEP, Part); 9856 State.ILV->addMetadata(NewGEP, GEP); 9857 } 9858 } 9859 } 9860 9861 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { 9862 assert(!State.Instance && "Int or FP induction being replicated."); 9863 State.ILV->widenIntOrFpInduction(IV, getStartValue()->getLiveInIRValue(), 9864 getTruncInst(), getVPValue(0), 9865 getCastValue(), State); 9866 } 9867 9868 void VPWidenPHIRecipe::execute(VPTransformState &State) { 9869 State.ILV->widenPHIInstruction(cast<PHINode>(getUnderlyingValue()), this, 9870 State); 9871 } 9872 9873 void VPBlendRecipe::execute(VPTransformState &State) { 9874 State.ILV->setDebugLocFromInst(Phi, &State.Builder); 9875 // We know that all PHIs in non-header blocks are converted into 9876 // selects, so we don't have to worry about the insertion order and we 9877 // can just use the builder. 9878 // At this point we generate the predication tree. There may be 9879 // duplications since this is a simple recursive scan, but future 9880 // optimizations will clean it up. 9881 9882 unsigned NumIncoming = getNumIncomingValues(); 9883 9884 // Generate a sequence of selects of the form: 9885 // SELECT(Mask3, In3, 9886 // SELECT(Mask2, In2, 9887 // SELECT(Mask1, In1, 9888 // In0))) 9889 // Note that Mask0 is never used: lanes for which no path reaches this phi and 9890 // are essentially undef are taken from In0. 9891 InnerLoopVectorizer::VectorParts Entry(State.UF); 9892 for (unsigned In = 0; In < NumIncoming; ++In) { 9893 for (unsigned Part = 0; Part < State.UF; ++Part) { 9894 // We might have single edge PHIs (blocks) - use an identity 9895 // 'select' for the first PHI operand. 
9896 Value *In0 = State.get(getIncomingValue(In), Part); 9897 if (In == 0) 9898 Entry[Part] = In0; // Initialize with the first incoming value. 9899 else { 9900 // Select between the current value and the previous incoming edge 9901 // based on the incoming mask. 9902 Value *Cond = State.get(getMask(In), Part); 9903 Entry[Part] = 9904 State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi"); 9905 } 9906 } 9907 } 9908 for (unsigned Part = 0; Part < State.UF; ++Part) 9909 State.set(this, Entry[Part], Part); 9910 } 9911 9912 void VPInterleaveRecipe::execute(VPTransformState &State) { 9913 assert(!State.Instance && "Interleave group being replicated."); 9914 State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(), 9915 getStoredValues(), getMask()); 9916 } 9917 9918 void VPReductionRecipe::execute(VPTransformState &State) { 9919 assert(!State.Instance && "Reduction being replicated."); 9920 Value *PrevInChain = State.get(getChainOp(), 0); 9921 RecurKind Kind = RdxDesc->getRecurrenceKind(); 9922 bool IsOrdered = State.ILV->useOrderedReductions(*RdxDesc); 9923 // Propagate the fast-math flags carried by the underlying instruction. 
9924 IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder); 9925 State.Builder.setFastMathFlags(RdxDesc->getFastMathFlags()); 9926 for (unsigned Part = 0; Part < State.UF; ++Part) { 9927 Value *NewVecOp = State.get(getVecOp(), Part); 9928 if (VPValue *Cond = getCondOp()) { 9929 Value *NewCond = State.get(Cond, Part); 9930 VectorType *VecTy = cast<VectorType>(NewVecOp->getType()); 9931 Value *Iden = RdxDesc->getRecurrenceIdentity( 9932 Kind, VecTy->getElementType(), RdxDesc->getFastMathFlags()); 9933 Value *IdenVec = 9934 State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden); 9935 Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec); 9936 NewVecOp = Select; 9937 } 9938 Value *NewRed; 9939 Value *NextInChain; 9940 if (IsOrdered) { 9941 if (State.VF.isVector()) 9942 NewRed = createOrderedReduction(State.Builder, *RdxDesc, NewVecOp, 9943 PrevInChain); 9944 else 9945 NewRed = State.Builder.CreateBinOp( 9946 (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), PrevInChain, 9947 NewVecOp); 9948 PrevInChain = NewRed; 9949 } else { 9950 PrevInChain = State.get(getChainOp(), Part); 9951 NewRed = createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp); 9952 } 9953 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9954 NextInChain = 9955 createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(), 9956 NewRed, PrevInChain); 9957 } else if (IsOrdered) 9958 NextInChain = NewRed; 9959 else 9960 NextInChain = State.Builder.CreateBinOp( 9961 (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), NewRed, 9962 PrevInChain); 9963 State.set(this, NextInChain, Part); 9964 } 9965 } 9966 9967 void VPReplicateRecipe::execute(VPTransformState &State) { 9968 if (State.Instance) { // Generate a single instance. 9969 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector"); 9970 State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *State.Instance, 9971 IsPredicated, State); 9972 // Insert scalar instance packing it into a vector. 
9973 if (AlsoPack && State.VF.isVector()) { 9974 // If we're constructing lane 0, initialize to start from poison. 9975 if (State.Instance->Lane.isFirstLane()) { 9976 assert(!State.VF.isScalable() && "VF is assumed to be non scalable."); 9977 Value *Poison = PoisonValue::get( 9978 VectorType::get(getUnderlyingValue()->getType(), State.VF)); 9979 State.set(this, Poison, State.Instance->Part); 9980 } 9981 State.ILV->packScalarIntoVectorValue(this, *State.Instance, State); 9982 } 9983 return; 9984 } 9985 9986 // Generate scalar instances for all VF lanes of all UF parts, unless the 9987 // instruction is uniform inwhich case generate only the first lane for each 9988 // of the UF parts. 9989 unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue(); 9990 assert((!State.VF.isScalable() || IsUniform) && 9991 "Can't scalarize a scalable vector"); 9992 for (unsigned Part = 0; Part < State.UF; ++Part) 9993 for (unsigned Lane = 0; Lane < EndLane; ++Lane) 9994 State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, 9995 VPIteration(Part, Lane), IsPredicated, 9996 State); 9997 } 9998 9999 void VPBranchOnMaskRecipe::execute(VPTransformState &State) { 10000 assert(State.Instance && "Branch on Mask works only on single instance."); 10001 10002 unsigned Part = State.Instance->Part; 10003 unsigned Lane = State.Instance->Lane.getKnownLane(); 10004 10005 Value *ConditionBit = nullptr; 10006 VPValue *BlockInMask = getMask(); 10007 if (BlockInMask) { 10008 ConditionBit = State.get(BlockInMask, Part); 10009 if (ConditionBit->getType()->isVectorTy()) 10010 ConditionBit = State.Builder.CreateExtractElement( 10011 ConditionBit, State.Builder.getInt32(Lane)); 10012 } else // Block in mask is all-one. 10013 ConditionBit = State.Builder.getTrue(); 10014 10015 // Replace the temporary unreachable terminator with a new conditional branch, 10016 // whose two destinations will be set later when they are created. 
10017 auto *CurrentTerminator = State.CFG.PrevBB->getTerminator(); 10018 assert(isa<UnreachableInst>(CurrentTerminator) && 10019 "Expected to replace unreachable terminator with conditional branch."); 10020 auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit); 10021 CondBr->setSuccessor(0, nullptr); 10022 ReplaceInstWithInst(CurrentTerminator, CondBr); 10023 } 10024 10025 void VPPredInstPHIRecipe::execute(VPTransformState &State) { 10026 assert(State.Instance && "Predicated instruction PHI works per instance."); 10027 Instruction *ScalarPredInst = 10028 cast<Instruction>(State.get(getOperand(0), *State.Instance)); 10029 BasicBlock *PredicatedBB = ScalarPredInst->getParent(); 10030 BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor(); 10031 assert(PredicatingBB && "Predicated block has no single predecessor."); 10032 assert(isa<VPReplicateRecipe>(getOperand(0)) && 10033 "operand must be VPReplicateRecipe"); 10034 10035 // By current pack/unpack logic we need to generate only a single phi node: if 10036 // a vector value for the predicated instruction exists at this point it means 10037 // the instruction has vector users only, and a phi for the vector value is 10038 // needed. In this case the recipe of the predicated instruction is marked to 10039 // also do that packing, thereby "hoisting" the insert-element sequence. 10040 // Otherwise, a phi node for the scalar value is needed. 10041 unsigned Part = State.Instance->Part; 10042 if (State.hasVectorValue(getOperand(0), Part)) { 10043 Value *VectorValue = State.get(getOperand(0), Part); 10044 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); 10045 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); 10046 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. 10047 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. 
10048 if (State.hasVectorValue(this, Part)) 10049 State.reset(this, VPhi, Part); 10050 else 10051 State.set(this, VPhi, Part); 10052 // NOTE: Currently we need to update the value of the operand, so the next 10053 // predicated iteration inserts its generated value in the correct vector. 10054 State.reset(getOperand(0), VPhi, Part); 10055 } else { 10056 Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType(); 10057 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); 10058 Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()), 10059 PredicatingBB); 10060 Phi->addIncoming(ScalarPredInst, PredicatedBB); 10061 if (State.hasScalarValue(this, *State.Instance)) 10062 State.reset(this, Phi, *State.Instance); 10063 else 10064 State.set(this, Phi, *State.Instance); 10065 // NOTE: Currently we need to update the value of the operand, so the next 10066 // predicated iteration inserts its generated value in the correct vector. 10067 State.reset(getOperand(0), Phi, *State.Instance); 10068 } 10069 } 10070 10071 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { 10072 VPValue *StoredValue = isStore() ? getStoredValue() : nullptr; 10073 State.ILV->vectorizeMemoryInstruction( 10074 &Ingredient, State, StoredValue ? nullptr : getVPSingleValue(), getAddr(), 10075 StoredValue, getMask(), Consecutive, Reverse); 10076 } 10077 10078 // Determine how to lower the scalar epilogue, which depends on 1) optimising 10079 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing 10080 // predication, and 4) a TTI hook that analyses whether the loop is suitable 10081 // for predication. 
10082 static ScalarEpilogueLowering getScalarEpilogueLowering( 10083 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, 10084 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, 10085 AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, 10086 LoopVectorizationLegality &LVL) { 10087 // 1) OptSize takes precedence over all other options, i.e. if this is set, 10088 // don't look at hints or options, and don't request a scalar epilogue. 10089 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from 10090 // LoopAccessInfo (due to code dependency and not being able to reliably get 10091 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection 10092 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without 10093 // versioning when the vectorization is forced, unlike hasOptSize. So revert 10094 // back to the old way and vectorize with versioning when forced. See D81345.) 10095 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, 10096 PGSOQueryType::IRPass) && 10097 Hints.getForce() != LoopVectorizeHints::FK_Enabled)) 10098 return CM_ScalarEpilogueNotAllowedOptSize; 10099 10100 // 2) If set, obey the directives 10101 if (PreferPredicateOverEpilogue.getNumOccurrences()) { 10102 switch (PreferPredicateOverEpilogue) { 10103 case PreferPredicateTy::ScalarEpilogue: 10104 return CM_ScalarEpilogueAllowed; 10105 case PreferPredicateTy::PredicateElseScalarEpilogue: 10106 return CM_ScalarEpilogueNotNeededUsePredicate; 10107 case PreferPredicateTy::PredicateOrDontVectorize: 10108 return CM_ScalarEpilogueNotAllowedUsePredicate; 10109 }; 10110 } 10111 10112 // 3) If set, obey the hints 10113 switch (Hints.getPredicate()) { 10114 case LoopVectorizeHints::FK_Enabled: 10115 return CM_ScalarEpilogueNotNeededUsePredicate; 10116 case LoopVectorizeHints::FK_Disabled: 10117 return CM_ScalarEpilogueAllowed; 10118 }; 10119 10120 // 4) if the TTI hook 
indicates this is profitable, request predication. 10121 if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, 10122 LVL.getLAI())) 10123 return CM_ScalarEpilogueNotNeededUsePredicate; 10124 10125 return CM_ScalarEpilogueAllowed; 10126 } 10127 10128 Value *VPTransformState::get(VPValue *Def, unsigned Part) { 10129 // If Values have been set for this Def return the one relevant for \p Part. 10130 if (hasVectorValue(Def, Part)) 10131 return Data.PerPartOutput[Def][Part]; 10132 10133 if (!hasScalarValue(Def, {Part, 0})) { 10134 Value *IRV = Def->getLiveInIRValue(); 10135 Value *B = ILV->getBroadcastInstrs(IRV); 10136 set(Def, B, Part); 10137 return B; 10138 } 10139 10140 Value *ScalarValue = get(Def, {Part, 0}); 10141 // If we aren't vectorizing, we can just copy the scalar map values over 10142 // to the vector map. 10143 if (VF.isScalar()) { 10144 set(Def, ScalarValue, Part); 10145 return ScalarValue; 10146 } 10147 10148 auto *RepR = dyn_cast<VPReplicateRecipe>(Def); 10149 bool IsUniform = RepR && RepR->isUniform(); 10150 10151 unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1; 10152 // Check if there is a scalar value for the selected lane. 10153 if (!hasScalarValue(Def, {Part, LastLane})) { 10154 // At the moment, VPWidenIntOrFpInductionRecipes can also be uniform. 10155 assert(isa<VPWidenIntOrFpInductionRecipe>(Def->getDef()) && 10156 "unexpected recipe found to be invariant"); 10157 IsUniform = true; 10158 LastLane = 0; 10159 } 10160 10161 auto *LastInst = cast<Instruction>(get(Def, {Part, LastLane})); 10162 // Set the insert point after the last scalarized instruction or after the 10163 // last PHI, if LastInst is a PHI. This ensures the insertelement sequence 10164 // will directly follow the scalar definitions. 10165 auto OldIP = Builder.saveIP(); 10166 auto NewIP = 10167 isa<PHINode>(LastInst) 10168 ? 
BasicBlock::iterator(LastInst->getParent()->getFirstNonPHI()) 10169 : std::next(BasicBlock::iterator(LastInst)); 10170 Builder.SetInsertPoint(&*NewIP); 10171 10172 // However, if we are vectorizing, we need to construct the vector values. 10173 // If the value is known to be uniform after vectorization, we can just 10174 // broadcast the scalar value corresponding to lane zero for each unroll 10175 // iteration. Otherwise, we construct the vector values using 10176 // insertelement instructions. Since the resulting vectors are stored in 10177 // State, we will only generate the insertelements once. 10178 Value *VectorValue = nullptr; 10179 if (IsUniform) { 10180 VectorValue = ILV->getBroadcastInstrs(ScalarValue); 10181 set(Def, VectorValue, Part); 10182 } else { 10183 // Initialize packing with insertelements to start from undef. 10184 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 10185 Value *Undef = PoisonValue::get(VectorType::get(LastInst->getType(), VF)); 10186 set(Def, Undef, Part); 10187 for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane) 10188 ILV->packScalarIntoVectorValue(Def, {Part, Lane}, *this); 10189 VectorValue = get(Def, Part); 10190 } 10191 Builder.restoreIP(OldIP); 10192 return VectorValue; 10193 } 10194 10195 // Process the loop in the VPlan-native vectorization path. This path builds 10196 // VPlan upfront in the vectorization pipeline, which allows to apply 10197 // VPlan-to-VPlan transformations from the very beginning without modifying the 10198 // input LLVM IR. 
10199 static bool processLoopInVPlanNativePath( 10200 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, 10201 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, 10202 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, 10203 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, 10204 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints, 10205 LoopVectorizationRequirements &Requirements) { 10206 10207 if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) { 10208 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n"); 10209 return false; 10210 } 10211 assert(EnableVPlanNativePath && "VPlan-native path is disabled."); 10212 Function *F = L->getHeader()->getParent(); 10213 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); 10214 10215 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 10216 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL); 10217 10218 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, 10219 &Hints, IAI); 10220 // Use the planner for outer loop vectorization. 10221 // TODO: CM is not used at this point inside the planner. Turn CM into an 10222 // optional argument if we don't need it in the future. 10223 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints, 10224 Requirements, ORE); 10225 10226 // Get user vectorization factor. 10227 ElementCount UserVF = Hints.getWidth(); 10228 10229 CM.collectElementTypesForWidening(); 10230 10231 // Plan how to best vectorize, return the best VF and its cost. 10232 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF); 10233 10234 // If we are stress testing VPlan builds, do not attempt to generate vector 10235 // code. Masked vector code generation support will follow soon. 10236 // Also, do not attempt to vectorize if no vector code will be produced. 
10237 if (VPlanBuildStressTest || EnableVPlanPredication || 10238 VectorizationFactor::Disabled() == VF) 10239 return false; 10240 10241 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); 10242 10243 { 10244 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, 10245 F->getParent()->getDataLayout()); 10246 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL, 10247 &CM, BFI, PSI, Checks); 10248 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" 10249 << L->getHeader()->getParent()->getName() << "\"\n"); 10250 LVP.executePlan(VF.Width, 1, BestPlan, LB, DT); 10251 } 10252 10253 // Mark the loop as already vectorized to avoid vectorizing again. 10254 Hints.setAlreadyVectorized(); 10255 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 10256 return true; 10257 } 10258 10259 // Emit a remark if there are stores to floats that required a floating point 10260 // extension. If the vectorized loop was generated with floating point there 10261 // will be a performance penalty from the conversion overhead and the change in 10262 // the vector width. 10263 static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) { 10264 SmallVector<Instruction *, 4> Worklist; 10265 for (BasicBlock *BB : L->getBlocks()) { 10266 for (Instruction &Inst : *BB) { 10267 if (auto *S = dyn_cast<StoreInst>(&Inst)) { 10268 if (S->getValueOperand()->getType()->isFloatTy()) 10269 Worklist.push_back(S); 10270 } 10271 } 10272 } 10273 10274 // Traverse the floating point stores upwards searching, for floating point 10275 // conversions. 10276 SmallPtrSet<const Instruction *, 4> Visited; 10277 SmallPtrSet<const Instruction *, 4> EmittedRemark; 10278 while (!Worklist.empty()) { 10279 auto *I = Worklist.pop_back_val(); 10280 if (!L->contains(I)) 10281 continue; 10282 if (!Visited.insert(I).second) 10283 continue; 10284 10285 // Emit a remark if the floating point store required a floating 10286 // point conversion. 
10287 // TODO: More work could be done to identify the root cause such as a 10288 // constant or a function return type and point the user to it. 10289 if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second) 10290 ORE->emit([&]() { 10291 return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision", 10292 I->getDebugLoc(), L->getHeader()) 10293 << "floating point conversion changes vector width. " 10294 << "Mixed floating point precision requires an up/down " 10295 << "cast that will negatively impact performance."; 10296 }); 10297 10298 for (Use &Op : I->operands()) 10299 if (auto *OpI = dyn_cast<Instruction>(Op)) 10300 Worklist.push_back(OpI); 10301 } 10302 } 10303 10304 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts) 10305 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced || 10306 !EnableLoopInterleaving), 10307 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced || 10308 !EnableLoopVectorization) {} 10309 10310 bool LoopVectorizePass::processLoop(Loop *L) { 10311 assert((EnableVPlanNativePath || L->isInnermost()) && 10312 "VPlan-native path is not enabled. Only process inner loops."); 10313 10314 #ifndef NDEBUG 10315 const std::string DebugLocStr = getDebugLocString(L); 10316 #endif /* NDEBUG */ 10317 10318 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \"" 10319 << L->getHeader()->getParent()->getName() << "\" from " 10320 << DebugLocStr << "\n"); 10321 10322 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE); 10323 10324 LLVM_DEBUG( 10325 dbgs() << "LV: Loop hints:" 10326 << " force=" 10327 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled 10328 ? "disabled" 10329 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled 10330 ? 
"enabled" 10331 : "?")) 10332 << " width=" << Hints.getWidth() 10333 << " interleave=" << Hints.getInterleave() << "\n"); 10334 10335 // Function containing loop 10336 Function *F = L->getHeader()->getParent(); 10337 10338 // Looking at the diagnostic output is the only way to determine if a loop 10339 // was vectorized (other than looking at the IR or machine code), so it 10340 // is important to generate an optimization remark for each loop. Most of 10341 // these messages are generated as OptimizationRemarkAnalysis. Remarks 10342 // generated as OptimizationRemark and OptimizationRemarkMissed are 10343 // less verbose reporting vectorized loops and unvectorized loops that may 10344 // benefit from vectorization, respectively. 10345 10346 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) { 10347 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); 10348 return false; 10349 } 10350 10351 PredicatedScalarEvolution PSE(*SE, *L); 10352 10353 // Check if it is legal to vectorize the loop. 10354 LoopVectorizationRequirements Requirements; 10355 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE, 10356 &Requirements, &Hints, DB, AC, BFI, PSI); 10357 if (!LVL.canVectorize(EnableVPlanNativePath)) { 10358 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); 10359 Hints.emitRemarkWithHints(); 10360 return false; 10361 } 10362 10363 // Check the function attributes and profiles to find out if this function 10364 // should be optimized for size. 10365 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 10366 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL); 10367 10368 // Entrance to the VPlan-native vectorization path. Outer loops are processed 10369 // here. They may require CFG and instruction level transformations before 10370 // even evaluating whether vectorization is profitable. 
Since we cannot modify 10371 // the incoming IR, we need to build VPlan upfront in the vectorization 10372 // pipeline. 10373 if (!L->isInnermost()) 10374 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, 10375 ORE, BFI, PSI, Hints, Requirements); 10376 10377 assert(L->isInnermost() && "Inner loop expected."); 10378 10379 // Check the loop for a trip count threshold: vectorize loops with a tiny trip 10380 // count by optimizing for size, to minimize overheads. 10381 auto ExpectedTC = getSmallBestKnownTC(*SE, L); 10382 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) { 10383 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " 10384 << "This loop is worth vectorizing only if no scalar " 10385 << "iteration overheads are incurred."); 10386 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) 10387 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); 10388 else { 10389 LLVM_DEBUG(dbgs() << "\n"); 10390 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; 10391 } 10392 } 10393 10394 // Check the function attributes to see if implicit floats are allowed. 10395 // FIXME: This check doesn't seem possibly correct -- what if the loop is 10396 // an integer loop and the vector instructions selected are purely integer 10397 // vector instructions? 10398 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { 10399 reportVectorizationFailure( 10400 "Can't vectorize when the NoImplicitFloat attribute is used", 10401 "loop not vectorized due to NoImplicitFloat attribute", 10402 "NoImplicitFloat", ORE, L); 10403 Hints.emitRemarkWithHints(); 10404 return false; 10405 } 10406 10407 // Check if the target supports potentially unsafe FP vectorization. 10408 // FIXME: Add a check for the type of safety issue (denormal, signaling) 10409 // for the target we're vectorizing for, to make sure none of the 10410 // additional fp-math flags can help. 
10411 if (Hints.isPotentiallyUnsafe() && 10412 TTI->isFPVectorizationPotentiallyUnsafe()) { 10413 reportVectorizationFailure( 10414 "Potentially unsafe FP op prevents vectorization", 10415 "loop not vectorized due to unsafe FP support.", 10416 "UnsafeFP", ORE, L); 10417 Hints.emitRemarkWithHints(); 10418 return false; 10419 } 10420 10421 bool AllowOrderedReductions; 10422 // If the flag is set, use that instead and override the TTI behaviour. 10423 if (ForceOrderedReductions.getNumOccurrences() > 0) 10424 AllowOrderedReductions = ForceOrderedReductions; 10425 else 10426 AllowOrderedReductions = TTI->enableOrderedReductions(); 10427 if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) { 10428 ORE->emit([&]() { 10429 auto *ExactFPMathInst = Requirements.getExactFPInst(); 10430 return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps", 10431 ExactFPMathInst->getDebugLoc(), 10432 ExactFPMathInst->getParent()) 10433 << "loop not vectorized: cannot prove it is safe to reorder " 10434 "floating-point operations"; 10435 }); 10436 LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to " 10437 "reorder floating-point operations\n"); 10438 Hints.emitRemarkWithHints(); 10439 return false; 10440 } 10441 10442 bool UseInterleaved = TTI->enableInterleavedAccessVectorization(); 10443 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI()); 10444 10445 // If an override option has been passed in for interleaved accesses, use it. 10446 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0) 10447 UseInterleaved = EnableInterleavedMemAccesses; 10448 10449 // Analyze interleaved memory accesses. 10450 if (UseInterleaved) { 10451 IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI)); 10452 } 10453 10454 // Use the cost model. 
10455 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, 10456 F, &Hints, IAI); 10457 CM.collectValuesToIgnore(); 10458 CM.collectElementTypesForWidening(); 10459 10460 // Use the planner for vectorization. 10461 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints, 10462 Requirements, ORE); 10463 10464 // Get user vectorization factor and interleave count. 10465 ElementCount UserVF = Hints.getWidth(); 10466 unsigned UserIC = Hints.getInterleave(); 10467 10468 // Plan how to best vectorize, return the best VF and its cost. 10469 Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC); 10470 10471 VectorizationFactor VF = VectorizationFactor::Disabled(); 10472 unsigned IC = 1; 10473 10474 if (MaybeVF) { 10475 VF = *MaybeVF; 10476 // Select the interleave count. 10477 IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue()); 10478 } 10479 10480 // Identify the diagnostic messages that should be produced. 10481 std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg; 10482 bool VectorizeLoop = true, InterleaveLoop = true; 10483 if (VF.Width.isScalar()) { 10484 LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n"); 10485 VecDiagMsg = std::make_pair( 10486 "VectorizationNotBeneficial", 10487 "the cost-model indicates that vectorization is not beneficial"); 10488 VectorizeLoop = false; 10489 } 10490 10491 if (!MaybeVF && UserIC > 1) { 10492 // Tell the user interleaving was avoided up-front, despite being explicitly 10493 // requested. 10494 LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and " 10495 "interleaving should be avoided up front\n"); 10496 IntDiagMsg = std::make_pair( 10497 "InterleavingAvoided", 10498 "Ignoring UserIC, because interleaving was avoided up front"); 10499 InterleaveLoop = false; 10500 } else if (IC == 1 && UserIC <= 1) { 10501 // Tell the user interleaving is not beneficial. 
10502 LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n"); 10503 IntDiagMsg = std::make_pair( 10504 "InterleavingNotBeneficial", 10505 "the cost-model indicates that interleaving is not beneficial"); 10506 InterleaveLoop = false; 10507 if (UserIC == 1) { 10508 IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled"; 10509 IntDiagMsg.second += 10510 " and is explicitly disabled or interleave count is set to 1"; 10511 } 10512 } else if (IC > 1 && UserIC == 1) { 10513 // Tell the user interleaving is beneficial, but it explicitly disabled. 10514 LLVM_DEBUG( 10515 dbgs() << "LV: Interleaving is beneficial but is explicitly disabled."); 10516 IntDiagMsg = std::make_pair( 10517 "InterleavingBeneficialButDisabled", 10518 "the cost-model indicates that interleaving is beneficial " 10519 "but is explicitly disabled or interleave count is set to 1"); 10520 InterleaveLoop = false; 10521 } 10522 10523 // Override IC if user provided an interleave count. 10524 IC = UserIC > 0 ? UserIC : IC; 10525 10526 // Emit diagnostic messages, if any. 10527 const char *VAPassName = Hints.vectorizeAnalysisPassName(); 10528 if (!VectorizeLoop && !InterleaveLoop) { 10529 // Do not vectorize or interleaving the loop. 
10530 ORE->emit([&]() { 10531 return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first, 10532 L->getStartLoc(), L->getHeader()) 10533 << VecDiagMsg.second; 10534 }); 10535 ORE->emit([&]() { 10536 return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first, 10537 L->getStartLoc(), L->getHeader()) 10538 << IntDiagMsg.second; 10539 }); 10540 return false; 10541 } else if (!VectorizeLoop && InterleaveLoop) { 10542 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 10543 ORE->emit([&]() { 10544 return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first, 10545 L->getStartLoc(), L->getHeader()) 10546 << VecDiagMsg.second; 10547 }); 10548 } else if (VectorizeLoop && !InterleaveLoop) { 10549 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width 10550 << ") in " << DebugLocStr << '\n'); 10551 ORE->emit([&]() { 10552 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first, 10553 L->getStartLoc(), L->getHeader()) 10554 << IntDiagMsg.second; 10555 }); 10556 } else if (VectorizeLoop && InterleaveLoop) { 10557 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width 10558 << ") in " << DebugLocStr << '\n'); 10559 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 10560 } 10561 10562 bool DisableRuntimeUnroll = false; 10563 MDNode *OrigLoopID = L->getLoopID(); 10564 { 10565 // Optimistically generate runtime checks. Drop them if they turn out to not 10566 // be profitable. Limit the scope of Checks, so the cleanup happens 10567 // immediately after vector codegeneration is done. 10568 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, 10569 F->getParent()->getDataLayout()); 10570 if (!VF.Width.isScalar() || IC > 1) 10571 Checks.Create(L, *LVL.getLAI(), PSE.getUnionPredicate()); 10572 10573 using namespace ore; 10574 if (!VectorizeLoop) { 10575 assert(IC > 1 && "interleave count should not be 1 or 0"); 10576 // If we decided that it is not legal to vectorize the loop, then 10577 // interleave it. 
10578 InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, 10579 &CM, BFI, PSI, Checks); 10580 10581 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); 10582 LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT); 10583 10584 ORE->emit([&]() { 10585 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(), 10586 L->getHeader()) 10587 << "interleaved loop (interleaved count: " 10588 << NV("InterleaveCount", IC) << ")"; 10589 }); 10590 } else { 10591 // If we decided that it is *legal* to vectorize the loop, then do it. 10592 10593 // Consider vectorizing the epilogue too if it's profitable. 10594 VectorizationFactor EpilogueVF = 10595 CM.selectEpilogueVectorizationFactor(VF.Width, LVP); 10596 if (EpilogueVF.Width.isVector()) { 10597 10598 // The first pass vectorizes the main loop and creates a scalar epilogue 10599 // to be vectorized by executing the plan (potentially with a different 10600 // factor) again shortly afterwards. 10601 EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1); 10602 EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, 10603 EPI, &LVL, &CM, BFI, PSI, Checks); 10604 10605 VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF); 10606 LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV, 10607 DT); 10608 ++LoopsVectorized; 10609 10610 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */); 10611 formLCSSARecursively(*L, *DT, LI, SE); 10612 10613 // Second pass vectorizes the epilogue and adjusts the control flow 10614 // edges from the first pass. 
10615 EPI.MainLoopVF = EPI.EpilogueVF; 10616 EPI.MainLoopUF = EPI.EpilogueUF; 10617 EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC, 10618 ORE, EPI, &LVL, &CM, BFI, PSI, 10619 Checks); 10620 10621 VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF); 10622 LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV, 10623 DT); 10624 ++LoopsEpilogueVectorized; 10625 10626 if (!MainILV.areSafetyChecksAdded()) 10627 DisableRuntimeUnroll = true; 10628 } else { 10629 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC, 10630 &LVL, &CM, BFI, PSI, Checks); 10631 10632 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); 10633 LVP.executePlan(VF.Width, IC, BestPlan, LB, DT); 10634 ++LoopsVectorized; 10635 10636 // Add metadata to disable runtime unrolling a scalar loop when there 10637 // are no runtime checks about strides and memory. A scalar loop that is 10638 // rarely used is not worth unrolling. 10639 if (!LB.areSafetyChecksAdded()) 10640 DisableRuntimeUnroll = true; 10641 } 10642 // Report the vectorization decision. 10643 ORE->emit([&]() { 10644 return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(), 10645 L->getHeader()) 10646 << "vectorized loop (vectorization width: " 10647 << NV("VectorizationFactor", VF.Width) 10648 << ", interleaved count: " << NV("InterleaveCount", IC) << ")"; 10649 }); 10650 } 10651 10652 if (ORE->allowExtraAnalysis(LV_NAME)) 10653 checkMixedPrecision(L, ORE); 10654 } 10655 10656 Optional<MDNode *> RemainderLoopID = 10657 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 10658 LLVMLoopVectorizeFollowupEpilogue}); 10659 if (RemainderLoopID.hasValue()) { 10660 L->setLoopID(RemainderLoopID.getValue()); 10661 } else { 10662 if (DisableRuntimeUnroll) 10663 AddRuntimeUnrollDisableMetaData(L); 10664 10665 // Mark the loop as already vectorized to avoid vectorizing again. 
10666 Hints.setAlreadyVectorized(); 10667 } 10668 10669 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 10670 return true; 10671 } 10672 10673 LoopVectorizeResult LoopVectorizePass::runImpl( 10674 Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_, 10675 DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_, 10676 DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_, 10677 std::function<const LoopAccessInfo &(Loop &)> &GetLAA_, 10678 OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) { 10679 SE = &SE_; 10680 LI = &LI_; 10681 TTI = &TTI_; 10682 DT = &DT_; 10683 BFI = &BFI_; 10684 TLI = TLI_; 10685 AA = &AA_; 10686 AC = &AC_; 10687 GetLAA = &GetLAA_; 10688 DB = &DB_; 10689 ORE = &ORE_; 10690 PSI = PSI_; 10691 10692 // Don't attempt if 10693 // 1. the target claims to have no vector registers, and 10694 // 2. interleaving won't help ILP. 10695 // 10696 // The second condition is necessary because, even if the target has no 10697 // vector registers, loop vectorization may still enable scalar 10698 // interleaving. 10699 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) && 10700 TTI->getMaxInterleaveFactor(1) < 2) 10701 return LoopVectorizeResult(false, false); 10702 10703 bool Changed = false, CFGChanged = false; 10704 10705 // The vectorizer requires loops to be in simplified form. 10706 // Since simplification may add new inner loops, it has to run before the 10707 // legality and profitability checks. This means running the loop vectorizer 10708 // will simplify all loops, regardless of whether anything end up being 10709 // vectorized. 10710 for (auto &L : *LI) 10711 Changed |= CFGChanged |= 10712 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */); 10713 10714 // Build up a worklist of inner-loops to vectorize. 
This is necessary as 10715 // the act of vectorizing or partially unrolling a loop creates new loops 10716 // and can invalidate iterators across the loops. 10717 SmallVector<Loop *, 8> Worklist; 10718 10719 for (Loop *L : *LI) 10720 collectSupportedLoops(*L, LI, ORE, Worklist); 10721 10722 LoopsAnalyzed += Worklist.size(); 10723 10724 // Now walk the identified inner loops. 10725 while (!Worklist.empty()) { 10726 Loop *L = Worklist.pop_back_val(); 10727 10728 // For the inner loops we actually process, form LCSSA to simplify the 10729 // transform. 10730 Changed |= formLCSSARecursively(*L, *DT, LI, SE); 10731 10732 Changed |= CFGChanged |= processLoop(L); 10733 } 10734 10735 // Process each loop nest in the function. 10736 return LoopVectorizeResult(Changed, CFGChanged); 10737 } 10738 10739 PreservedAnalyses LoopVectorizePass::run(Function &F, 10740 FunctionAnalysisManager &AM) { 10741 auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F); 10742 auto &LI = AM.getResult<LoopAnalysis>(F); 10743 auto &TTI = AM.getResult<TargetIRAnalysis>(F); 10744 auto &DT = AM.getResult<DominatorTreeAnalysis>(F); 10745 auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F); 10746 auto &TLI = AM.getResult<TargetLibraryAnalysis>(F); 10747 auto &AA = AM.getResult<AAManager>(F); 10748 auto &AC = AM.getResult<AssumptionAnalysis>(F); 10749 auto &DB = AM.getResult<DemandedBitsAnalysis>(F); 10750 auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F); 10751 10752 auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager(); 10753 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 10754 [&](Loop &L) -> const LoopAccessInfo & { 10755 LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, 10756 TLI, TTI, nullptr, nullptr, nullptr}; 10757 return LAM.getResult<LoopAccessAnalysis>(L, AR); 10758 }; 10759 auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F); 10760 ProfileSummaryInfo *PSI = 10761 MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent()); 
10762 LoopVectorizeResult Result = 10763 runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI); 10764 if (!Result.MadeAnyChange) 10765 return PreservedAnalyses::all(); 10766 PreservedAnalyses PA; 10767 10768 // We currently do not preserve loopinfo/dominator analyses with outer loop 10769 // vectorization. Until this is addressed, mark these analyses as preserved 10770 // only for non-VPlan-native path. 10771 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 10772 if (!EnableVPlanNativePath) { 10773 PA.preserve<LoopAnalysis>(); 10774 PA.preserve<DominatorTreeAnalysis>(); 10775 } 10776 if (!Result.MadeCFGChange) 10777 PA.preserveSet<CFGAnalyses>(); 10778 return PA; 10779 } 10780 10781 void LoopVectorizePass::printPipeline( 10782 raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) { 10783 static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline( 10784 OS, MapClassName2PassName); 10785 10786 OS << "<"; 10787 OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;"; 10788 OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;"; 10789 OS << ">"; 10790 } 10791