//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
//  Data for SIMD
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "LoopVectorizationPlanner.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanHCFGBuilder.h"
#include "VPlanPredicator.h"
#include "VPlanTransforms.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/InstructionCost.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
"llvm/Transforms/Utils/InjectTLIMappings.h" 138 #include "llvm/Transforms/Utils/LoopSimplify.h" 139 #include "llvm/Transforms/Utils/LoopUtils.h" 140 #include "llvm/Transforms/Utils/LoopVersioning.h" 141 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" 142 #include "llvm/Transforms/Utils/SizeOpts.h" 143 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h" 144 #include <algorithm> 145 #include <cassert> 146 #include <cstdint> 147 #include <cstdlib> 148 #include <functional> 149 #include <iterator> 150 #include <limits> 151 #include <memory> 152 #include <string> 153 #include <tuple> 154 #include <utility> 155 156 using namespace llvm; 157 158 #define LV_NAME "loop-vectorize" 159 #define DEBUG_TYPE LV_NAME 160 161 #ifndef NDEBUG 162 const char VerboseDebug[] = DEBUG_TYPE "-verbose"; 163 #endif 164 165 /// @{ 166 /// Metadata attribute names 167 const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all"; 168 const char LLVMLoopVectorizeFollowupVectorized[] = 169 "llvm.loop.vectorize.followup_vectorized"; 170 const char LLVMLoopVectorizeFollowupEpilogue[] = 171 "llvm.loop.vectorize.followup_epilogue"; 172 /// @} 173 174 STATISTIC(LoopsVectorized, "Number of loops vectorized"); 175 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization"); 176 STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized"); 177 178 static cl::opt<bool> EnableEpilogueVectorization( 179 "enable-epilogue-vectorization", cl::init(true), cl::Hidden, 180 cl::desc("Enable vectorization of epilogue loops.")); 181 182 static cl::opt<unsigned> EpilogueVectorizationForceVF( 183 "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden, 184 cl::desc("When epilogue vectorization is enabled, and a value greater than " 185 "1 is specified, forces the given VF for all applicable epilogue " 186 "loops.")); 187 188 static cl::opt<unsigned> EpilogueVectorizationMinVF( 189 "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden, 190 
cl::desc("Only loops with vectorization factor equal to or larger than " 191 "the specified value are considered for epilogue vectorization.")); 192 193 /// Loops with a known constant trip count below this number are vectorized only 194 /// if no scalar iteration overheads are incurred. 195 static cl::opt<unsigned> TinyTripCountVectorThreshold( 196 "vectorizer-min-trip-count", cl::init(16), cl::Hidden, 197 cl::desc("Loops with a constant trip count that is smaller than this " 198 "value are vectorized only if no scalar iteration overheads " 199 "are incurred.")); 200 201 static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold( 202 "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden, 203 cl::desc("The maximum allowed number of runtime memory checks with a " 204 "vectorize(enable) pragma.")); 205 206 // Option prefer-predicate-over-epilogue indicates that an epilogue is undesired, 207 // that predication is preferred, and this lists all options. I.e., the 208 // vectorizer will try to fold the tail-loop (epilogue) into the vector body 209 // and predicate the instructions accordingly. 
If tail-folding fails, there are 210 // different fallback strategies depending on these values: 211 namespace PreferPredicateTy { 212 enum Option { 213 ScalarEpilogue = 0, 214 PredicateElseScalarEpilogue, 215 PredicateOrDontVectorize 216 }; 217 } // namespace PreferPredicateTy 218 219 static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue( 220 "prefer-predicate-over-epilogue", 221 cl::init(PreferPredicateTy::ScalarEpilogue), 222 cl::Hidden, 223 cl::desc("Tail-folding and predication preferences over creating a scalar " 224 "epilogue loop."), 225 cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue, 226 "scalar-epilogue", 227 "Don't tail-predicate loops, create scalar epilogue"), 228 clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue, 229 "predicate-else-scalar-epilogue", 230 "prefer tail-folding, create scalar epilogue if tail " 231 "folding fails."), 232 clEnumValN(PreferPredicateTy::PredicateOrDontVectorize, 233 "predicate-dont-vectorize", 234 "prefers tail-folding, don't attempt vectorization if " 235 "tail-folding fails."))); 236 237 static cl::opt<bool> MaximizeBandwidth( 238 "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden, 239 cl::desc("Maximize bandwidth when selecting vectorization factor which " 240 "will be determined by the smallest type in loop.")); 241 242 static cl::opt<bool> EnableInterleavedMemAccesses( 243 "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden, 244 cl::desc("Enable vectorization on interleaved memory accesses in a loop")); 245 246 /// An interleave-group may need masking if it resides in a block that needs 247 /// predication, or in order to mask away gaps. 
248 static cl::opt<bool> EnableMaskedInterleavedMemAccesses( 249 "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden, 250 cl::desc("Enable vectorization on masked interleaved memory accesses in a loop")); 251 252 static cl::opt<unsigned> TinyTripCountInterleaveThreshold( 253 "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden, 254 cl::desc("We don't interleave loops with a estimated constant trip count " 255 "below this number")); 256 257 static cl::opt<unsigned> ForceTargetNumScalarRegs( 258 "force-target-num-scalar-regs", cl::init(0), cl::Hidden, 259 cl::desc("A flag that overrides the target's number of scalar registers.")); 260 261 static cl::opt<unsigned> ForceTargetNumVectorRegs( 262 "force-target-num-vector-regs", cl::init(0), cl::Hidden, 263 cl::desc("A flag that overrides the target's number of vector registers.")); 264 265 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor( 266 "force-target-max-scalar-interleave", cl::init(0), cl::Hidden, 267 cl::desc("A flag that overrides the target's max interleave factor for " 268 "scalar loops.")); 269 270 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor( 271 "force-target-max-vector-interleave", cl::init(0), cl::Hidden, 272 cl::desc("A flag that overrides the target's max interleave factor for " 273 "vectorized loops.")); 274 275 static cl::opt<unsigned> ForceTargetInstructionCost( 276 "force-target-instruction-cost", cl::init(0), cl::Hidden, 277 cl::desc("A flag that overrides the target's expected cost for " 278 "an instruction to a single constant value. Mostly " 279 "useful for getting consistent testing.")); 280 281 static cl::opt<bool> ForceTargetSupportsScalableVectors( 282 "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden, 283 cl::desc( 284 "Pretend that scalable vectors are supported, even if the target does " 285 "not support them. 
This flag should only be used for testing.")); 286 287 static cl::opt<unsigned> SmallLoopCost( 288 "small-loop-cost", cl::init(20), cl::Hidden, 289 cl::desc( 290 "The cost of a loop that is considered 'small' by the interleaver.")); 291 292 static cl::opt<bool> LoopVectorizeWithBlockFrequency( 293 "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden, 294 cl::desc("Enable the use of the block frequency analysis to access PGO " 295 "heuristics minimizing code growth in cold regions and being more " 296 "aggressive in hot regions.")); 297 298 // Runtime interleave loops for load/store throughput. 299 static cl::opt<bool> EnableLoadStoreRuntimeInterleave( 300 "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden, 301 cl::desc( 302 "Enable runtime interleaving until load/store ports are saturated")); 303 304 /// Interleave small loops with scalar reductions. 305 static cl::opt<bool> InterleaveSmallLoopScalarReduction( 306 "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden, 307 cl::desc("Enable interleaving for loops with small iteration counts that " 308 "contain scalar reductions to expose ILP.")); 309 310 /// The number of stores in a loop that are allowed to need predication. 
311 static cl::opt<unsigned> NumberOfStoresToPredicate( 312 "vectorize-num-stores-pred", cl::init(1), cl::Hidden, 313 cl::desc("Max number of stores to be predicated behind an if.")); 314 315 static cl::opt<bool> EnableIndVarRegisterHeur( 316 "enable-ind-var-reg-heur", cl::init(true), cl::Hidden, 317 cl::desc("Count the induction variable only once when interleaving")); 318 319 static cl::opt<bool> EnableCondStoresVectorization( 320 "enable-cond-stores-vec", cl::init(true), cl::Hidden, 321 cl::desc("Enable if predication of stores during vectorization.")); 322 323 static cl::opt<unsigned> MaxNestedScalarReductionIC( 324 "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden, 325 cl::desc("The maximum interleave count to use when interleaving a scalar " 326 "reduction in a nested loop.")); 327 328 static cl::opt<bool> 329 PreferInLoopReductions("prefer-inloop-reductions", cl::init(false), 330 cl::Hidden, 331 cl::desc("Prefer in-loop vector reductions, " 332 "overriding the targets preference.")); 333 334 static cl::opt<bool> ForceOrderedReductions( 335 "force-ordered-reductions", cl::init(false), cl::Hidden, 336 cl::desc("Enable the vectorisation of loops with in-order (strict) " 337 "FP reductions")); 338 339 static cl::opt<bool> PreferPredicatedReductionSelect( 340 "prefer-predicated-reduction-select", cl::init(false), cl::Hidden, 341 cl::desc( 342 "Prefer predicating a reduction operation over an after loop select.")); 343 344 cl::opt<bool> EnableVPlanNativePath( 345 "enable-vplan-native-path", cl::init(false), cl::Hidden, 346 cl::desc("Enable VPlan-native vectorization path with " 347 "support for outer loop vectorization.")); 348 349 // FIXME: Remove this switch once we have divergence analysis. Currently we 350 // assume divergent non-backedge branches when this switch is true. 
351 cl::opt<bool> EnableVPlanPredication( 352 "enable-vplan-predication", cl::init(false), cl::Hidden, 353 cl::desc("Enable VPlan-native vectorization path predicator with " 354 "support for outer loop vectorization.")); 355 356 // This flag enables the stress testing of the VPlan H-CFG construction in the 357 // VPlan-native vectorization path. It must be used in conjuction with 358 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the 359 // verification of the H-CFGs built. 360 static cl::opt<bool> VPlanBuildStressTest( 361 "vplan-build-stress-test", cl::init(false), cl::Hidden, 362 cl::desc( 363 "Build VPlan for every supported loop nest in the function and bail " 364 "out right after the build (stress test the VPlan H-CFG construction " 365 "in the VPlan-native vectorization path).")); 366 367 cl::opt<bool> llvm::EnableLoopInterleaving( 368 "interleave-loops", cl::init(true), cl::Hidden, 369 cl::desc("Enable loop interleaving in Loop vectorization passes")); 370 cl::opt<bool> llvm::EnableLoopVectorization( 371 "vectorize-loops", cl::init(true), cl::Hidden, 372 cl::desc("Run the Loop vectorization passes")); 373 374 cl::opt<bool> PrintVPlansInDotFormat( 375 "vplan-print-in-dot-format", cl::init(false), cl::Hidden, 376 cl::desc("Use dot format instead of plain text when dumping VPlans")); 377 378 /// A helper function that returns true if the given type is irregular. The 379 /// type is irregular if its allocated size doesn't equal the store size of an 380 /// element of the corresponding vector type. 381 static bool hasIrregularType(Type *Ty, const DataLayout &DL) { 382 // Determine if an array of N elements of type Ty is "bitcast compatible" 383 // with a <N x Ty> vector. 384 // This is only true if there is no padding between the array elements. 385 return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty); 386 } 387 388 /// A helper function that returns the reciprocal of the block probability of 389 /// predicated blocks. 
If we return X, we are assuming the predicated block 390 /// will execute once for every X iterations of the loop header. 391 /// 392 /// TODO: We should use actual block probability here, if available. Currently, 393 /// we always assume predicated blocks have a 50% chance of executing. 394 static unsigned getReciprocalPredBlockProb() { return 2; } 395 396 /// A helper function that returns an integer or floating-point constant with 397 /// value C. 398 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) { 399 return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C) 400 : ConstantFP::get(Ty, C); 401 } 402 403 /// Returns "best known" trip count for the specified loop \p L as defined by 404 /// the following procedure: 405 /// 1) Returns exact trip count if it is known. 406 /// 2) Returns expected trip count according to profile data if any. 407 /// 3) Returns upper bound estimate if it is known. 408 /// 4) Returns None if all of the above failed. 409 static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) { 410 // Check if exact trip count is known. 411 if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L)) 412 return ExpectedTC; 413 414 // Check if there is an expected trip count available from profile data. 415 if (LoopVectorizeWithBlockFrequency) 416 if (auto EstimatedTC = getLoopEstimatedTripCount(L)) 417 return EstimatedTC; 418 419 // Check if upper bound estimate is known. 420 if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L)) 421 return ExpectedTC; 422 423 return None; 424 } 425 426 // Forward declare GeneratedRTChecks. 427 class GeneratedRTChecks; 428 429 namespace llvm { 430 431 AnalysisKey ShouldRunExtraVectorPasses::Key; 432 433 /// InnerLoopVectorizer vectorizes loops which contain only one basic 434 /// block to a specified vectorization factor (VF). 435 /// This class performs the widening of scalars into vectors, or multiple 436 /// scalars. 
This class also implements the following features: 437 /// * It inserts an epilogue loop for handling loops that don't have iteration 438 /// counts that are known to be a multiple of the vectorization factor. 439 /// * It handles the code generation for reduction variables. 440 /// * Scalarization (implementation using scalars) of un-vectorizable 441 /// instructions. 442 /// InnerLoopVectorizer does not perform any vectorization-legality 443 /// checks, and relies on the caller to check for the different legality 444 /// aspects. The InnerLoopVectorizer relies on the 445 /// LoopVectorizationLegality class to provide information about the induction 446 /// and reduction variables that were found to a given vectorization factor. 447 class InnerLoopVectorizer { 448 public: 449 InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE, 450 LoopInfo *LI, DominatorTree *DT, 451 const TargetLibraryInfo *TLI, 452 const TargetTransformInfo *TTI, AssumptionCache *AC, 453 OptimizationRemarkEmitter *ORE, ElementCount VecWidth, 454 unsigned UnrollFactor, LoopVectorizationLegality *LVL, 455 LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, 456 ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks) 457 : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI), 458 AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor), 459 Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI), 460 PSI(PSI), RTChecks(RTChecks) { 461 // Query this against the original loop and save it here because the profile 462 // of the original loop header may change as the transformation happens. 463 OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize( 464 OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass); 465 } 466 467 virtual ~InnerLoopVectorizer() = default; 468 469 /// Create a new empty loop that will contain vectorized instructions later 470 /// on, while the old loop will be used as the scalar remainder. 
Control flow 471 /// is generated around the vectorized (and scalar epilogue) loops consisting 472 /// of various checks and bypasses. Return the pre-header block of the new 473 /// loop. 474 /// In the case of epilogue vectorization, this function is overriden to 475 /// handle the more complex control flow around the loops. 476 virtual BasicBlock *createVectorizedLoopSkeleton(); 477 478 /// Widen a single call instruction within the innermost loop. 479 void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands, 480 VPTransformState &State); 481 482 /// Fix the vectorized code, taking care of header phi's, live-outs, and more. 483 void fixVectorizedLoop(VPTransformState &State); 484 485 // Return true if any runtime check is added. 486 bool areSafetyChecksAdded() { return AddedSafetyChecks; } 487 488 /// A type for vectorized values in the new loop. Each value from the 489 /// original loop, when vectorized, is represented by UF vector values in the 490 /// new unrolled loop, where UF is the unroll factor. 491 using VectorParts = SmallVector<Value *, 2>; 492 493 /// Vectorize a single first-order recurrence or pointer induction PHINode in 494 /// a block. This method handles the induction variable canonicalization. It 495 /// supports both VF = 1 for unrolled loops and arbitrary length vectors. 496 void widenPHIInstruction(Instruction *PN, VPWidenPHIRecipe *PhiR, 497 VPTransformState &State); 498 499 /// A helper function to scalarize a single Instruction in the innermost loop. 500 /// Generates a sequence of scalar instances for each lane between \p MinLane 501 /// and \p MaxLane, times each part between \p MinPart and \p MaxPart, 502 /// inclusive. Uses the VPValue operands from \p RepRecipe instead of \p 503 /// Instr's operands. 
504 void scalarizeInstruction(Instruction *Instr, VPReplicateRecipe *RepRecipe, 505 const VPIteration &Instance, bool IfPredicateInstr, 506 VPTransformState &State); 507 508 /// Widen an integer or floating-point induction variable \p IV. If \p Trunc 509 /// is provided, the integer induction variable will first be truncated to 510 /// the corresponding type. 511 void widenIntOrFpInduction(PHINode *IV, const InductionDescriptor &ID, 512 Value *Start, TruncInst *Trunc, VPValue *Def, 513 VPTransformState &State); 514 515 /// Construct the vector value of a scalarized value \p V one lane at a time. 516 void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance, 517 VPTransformState &State); 518 519 /// Try to vectorize interleaved access group \p Group with the base address 520 /// given in \p Addr, optionally masking the vector operations if \p 521 /// BlockInMask is non-null. Use \p State to translate given VPValues to IR 522 /// values in the vectorized loop. 523 void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group, 524 ArrayRef<VPValue *> VPDefs, 525 VPTransformState &State, VPValue *Addr, 526 ArrayRef<VPValue *> StoredValues, 527 VPValue *BlockInMask = nullptr); 528 529 /// Set the debug location in the builder \p Ptr using the debug location in 530 /// \p V. If \p Ptr is None then it uses the class member's Builder. 531 void setDebugLocFromInst(const Value *V, 532 Optional<IRBuilder<> *> CustomBuilder = None); 533 534 /// Fix the non-induction PHIs in the OrigPHIsToFix vector. 535 void fixNonInductionPHIs(VPTransformState &State); 536 537 /// Returns true if the reordering of FP operations is not allowed, but we are 538 /// able to vectorize with strict in-order reductions for the given RdxDesc. 539 bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc); 540 541 /// Create a broadcast instruction. This method generates a broadcast 542 /// instruction (shuffle) for loop invariant values and for the induction 543 /// value. 
If this is the induction variable then we extend it to N, N+1, ... 544 /// this is needed because each iteration in the loop corresponds to a SIMD 545 /// element. 546 virtual Value *getBroadcastInstrs(Value *V); 547 548 /// Add metadata from one instruction to another. 549 /// 550 /// This includes both the original MDs from \p From and additional ones (\see 551 /// addNewMetadata). Use this for *newly created* instructions in the vector 552 /// loop. 553 void addMetadata(Instruction *To, Instruction *From); 554 555 /// Similar to the previous function but it adds the metadata to a 556 /// vector of instructions. 557 void addMetadata(ArrayRef<Value *> To, Instruction *From); 558 559 protected: 560 friend class LoopVectorizationPlanner; 561 562 /// A small list of PHINodes. 563 using PhiVector = SmallVector<PHINode *, 4>; 564 565 /// A type for scalarized values in the new loop. Each value from the 566 /// original loop, when scalarized, is represented by UF x VF scalar values 567 /// in the new unrolled loop, where UF is the unroll factor and VF is the 568 /// vectorization factor. 569 using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>; 570 571 /// Set up the values of the IVs correctly when exiting the vector loop. 572 void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II, 573 Value *CountRoundDown, Value *EndValue, 574 BasicBlock *MiddleBlock); 575 576 /// Create a new induction variable inside L. 577 PHINode *createInductionVariable(Loop *L, Value *Start, Value *End, 578 Value *Step, Instruction *DL); 579 580 /// Handle all cross-iteration phis in the header. 581 void fixCrossIterationPHIs(VPTransformState &State); 582 583 /// Create the exit value of first order recurrences in the middle block and 584 /// update their users. 585 void fixFirstOrderRecurrence(VPFirstOrderRecurrencePHIRecipe *PhiR, 586 VPTransformState &State); 587 588 /// Create code for the loop exit value of the reduction. 
589 void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State); 590 591 /// Clear NSW/NUW flags from reduction instructions if necessary. 592 void clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc, 593 VPTransformState &State); 594 595 /// Fixup the LCSSA phi nodes in the unique exit block. This simply 596 /// means we need to add the appropriate incoming value from the middle 597 /// block as exiting edges from the scalar epilogue loop (if present) are 598 /// already in place, and we exit the vector loop exclusively to the middle 599 /// block. 600 void fixLCSSAPHIs(VPTransformState &State); 601 602 /// Iteratively sink the scalarized operands of a predicated instruction into 603 /// the block that was created for it. 604 void sinkScalarOperands(Instruction *PredInst); 605 606 /// Shrinks vector element sizes to the smallest bitwidth they can be legally 607 /// represented as. 608 void truncateToMinimalBitwidths(VPTransformState &State); 609 610 /// Compute scalar induction steps. \p ScalarIV is the scalar induction 611 /// variable on which to base the steps, \p Step is the size of the step, and 612 /// \p EntryVal is the value from the original loop that maps to the steps. 613 /// Note that \p EntryVal doesn't have to be an induction variable - it 614 /// can also be a truncate instruction. 615 void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal, 616 const InductionDescriptor &ID, VPValue *Def, 617 VPTransformState &State); 618 619 /// Create a vector induction phi node based on an existing scalar one. \p 620 /// EntryVal is the value from the original loop that maps to the vector phi 621 /// node, and \p Step is the loop-invariant step. If \p EntryVal is a 622 /// truncate instruction, instead of widening the original IV, we widen a 623 /// version of the IV truncated to \p EntryVal's type. 
624 void createVectorIntOrFpInductionPHI(const InductionDescriptor &II, 625 Value *Step, Value *Start, 626 Instruction *EntryVal, VPValue *Def, 627 VPTransformState &State); 628 629 /// Returns true if an instruction \p I should be scalarized instead of 630 /// vectorized for the chosen vectorization factor. 631 bool shouldScalarizeInstruction(Instruction *I) const; 632 633 /// Returns true if we should generate a scalar version of \p IV. 634 bool needsScalarInduction(Instruction *IV) const; 635 636 /// Returns (and creates if needed) the original loop trip count. 637 Value *getOrCreateTripCount(Loop *NewLoop); 638 639 /// Returns (and creates if needed) the trip count of the widened loop. 640 Value *getOrCreateVectorTripCount(Loop *NewLoop); 641 642 /// Returns a bitcasted value to the requested vector type. 643 /// Also handles bitcasts of vector<float> <-> vector<pointer> types. 644 Value *createBitOrPointerCast(Value *V, VectorType *DstVTy, 645 const DataLayout &DL); 646 647 /// Emit a bypass check to see if the vector trip count is zero, including if 648 /// it overflows. 649 void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass); 650 651 /// Emit a bypass check to see if all of the SCEV assumptions we've 652 /// had to make are correct. Returns the block containing the checks or 653 /// nullptr if no checks have been added. 654 BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass); 655 656 /// Emit bypass checks to check any memory assumptions we may have made. 657 /// Returns the block containing the checks or nullptr if no checks have been 658 /// added. 659 BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass); 660 661 /// Compute the transformed value of Index at offset StartValue using step 662 /// StepValue. 663 /// For integer induction, returns StartValue + Index * StepValue. 664 /// For pointer induction, returns StartValue[Index * StepValue]. 
665 /// FIXME: The newly created binary instructions should contain nsw/nuw 666 /// flags, which can be found from the original scalar operations. 667 Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE, 668 const DataLayout &DL, 669 const InductionDescriptor &ID, 670 BasicBlock *VectorHeader) const; 671 672 /// Emit basic blocks (prefixed with \p Prefix) for the iteration check, 673 /// vector loop preheader, middle block and scalar preheader. Also 674 /// allocate a loop object for the new vector loop and return it. 675 Loop *createVectorLoopSkeleton(StringRef Prefix); 676 677 /// Create new phi nodes for the induction variables to resume iteration count 678 /// in the scalar epilogue, from where the vectorized loop left off (given by 679 /// \p VectorTripCount). 680 /// In cases where the loop skeleton is more complicated (eg. epilogue 681 /// vectorization) and the resume values can come from an additional bypass 682 /// block, the \p AdditionalBypass pair provides information about the bypass 683 /// block and the end value on the edge from bypass to this loop. 684 void createInductionResumeValues( 685 Loop *L, Value *VectorTripCount, 686 std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr}); 687 688 /// Complete the loop skeleton by adding debug MDs, creating appropriate 689 /// conditional branches in the middle block, preparing the builder and 690 /// running the verifier. Take in the vector loop \p L as argument, and return 691 /// the preheader of the completed vector loop. 692 BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID); 693 694 /// Add additional metadata to \p To that was not present on \p Orig. 695 /// 696 /// Currently this is used to add the noalias annotations based on the 697 /// inserted memchecks. Use this for instructions that are *cloned* into the 698 /// vector loop. 
699 void addNewMetadata(Instruction *To, const Instruction *Orig); 700 701 /// Collect poison-generating recipes that may generate a poison value that is 702 /// used after vectorization, even when their operands are not poison. Those 703 /// recipes meet the following conditions: 704 /// * Contribute to the address computation of a recipe generating a widen 705 /// memory load/store (VPWidenMemoryInstructionRecipe or 706 /// VPInterleaveRecipe). 707 /// * Such a widen memory load/store has at least one underlying Instruction 708 /// that is in a basic block that needs predication and after vectorization 709 /// the generated instruction won't be predicated. 710 void collectPoisonGeneratingRecipes(VPTransformState &State); 711 712 /// Allow subclasses to override and print debug traces before/after vplan 713 /// execution, when trace information is requested. 714 virtual void printDebugTracesAtStart(){}; 715 virtual void printDebugTracesAtEnd(){}; 716 717 /// The original loop. 718 Loop *OrigLoop; 719 720 /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies 721 /// dynamic knowledge to simplify SCEV expressions and converts them to a 722 /// more usable form. 723 PredicatedScalarEvolution &PSE; 724 725 /// Loop Info. 726 LoopInfo *LI; 727 728 /// Dominator Tree. 729 DominatorTree *DT; 730 731 /// Alias Analysis. 732 AAResults *AA; 733 734 /// Target Library Info. 735 const TargetLibraryInfo *TLI; 736 737 /// Target Transform Info. 738 const TargetTransformInfo *TTI; 739 740 /// Assumption Cache. 741 AssumptionCache *AC; 742 743 /// Interface to emit optimization remarks. 744 OptimizationRemarkEmitter *ORE; 745 746 /// LoopVersioning. It's only set up (non-null) if memchecks were 747 /// used. 748 /// 749 /// This is currently only used to add no-alias metadata based on the 750 /// memchecks. The actually versioning is performed manually. 751 std::unique_ptr<LoopVersioning> LVer; 752 753 /// The vectorization SIMD factor to use. 
Each vector will have this many 754 /// vector elements. 755 ElementCount VF; 756 757 /// The vectorization unroll factor to use. Each scalar is vectorized to this 758 /// many different vector instructions. 759 unsigned UF; 760 761 /// The builder that we use 762 IRBuilder<> Builder; 763 764 // --- Vectorization state --- 765 766 /// The vector-loop preheader. 767 BasicBlock *LoopVectorPreHeader; 768 769 /// The scalar-loop preheader. 770 BasicBlock *LoopScalarPreHeader; 771 772 /// Middle Block between the vector and the scalar. 773 BasicBlock *LoopMiddleBlock; 774 775 /// The unique ExitBlock of the scalar loop if one exists. Note that 776 /// there can be multiple exiting edges reaching this block. 777 BasicBlock *LoopExitBlock; 778 779 /// The vector loop body. 780 BasicBlock *LoopVectorBody; 781 782 /// The scalar loop body. 783 BasicBlock *LoopScalarBody; 784 785 /// A list of all bypass blocks. The first block is the entry of the loop. 786 SmallVector<BasicBlock *, 4> LoopBypassBlocks; 787 788 /// The new Induction variable which was added to the new block. 789 PHINode *Induction = nullptr; 790 791 /// The induction variable of the old basic block. 792 PHINode *OldInduction = nullptr; 793 794 /// Store instructions that were predicated. 795 SmallVector<Instruction *, 4> PredicatedInstructions; 796 797 /// Trip count of the original loop. 798 Value *TripCount = nullptr; 799 800 /// Trip count of the widened loop (TripCount - TripCount % (VF*UF)) 801 Value *VectorTripCount = nullptr; 802 803 /// The legality analysis. 804 LoopVectorizationLegality *Legal; 805 806 /// The profitablity analysis. 807 LoopVectorizationCostModel *Cost; 808 809 // Record whether runtime checks are added. 810 bool AddedSafetyChecks = false; 811 812 // Holds the end values for each induction variable. We save the end values 813 // so we can later fix-up the external users of the induction variables. 
  DenseMap<PHINode *, Value *> IVEndValues;

  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
  // fixed up at the end of vector code generation.
  SmallVector<PHINode *, 8> OrigPHIsToFix;

  /// BFI and PSI are used to check for profile guided size optimizations.
  BlockFrequencyInfo *BFI;
  ProfileSummaryInfo *PSI;

  // Whether this loop should be optimized for size based on profile guided
  // size optimizations.
  bool OptForSizeBasedOnProfile;

  /// Structure to hold information about generated runtime checks, responsible
  /// for cleaning the checks, if vectorization turns out unprofitable.
  GeneratedRTChecks &RTChecks;
};

/// A specialization of the inner loop vectorizer that unrolls a loop without
/// vectorizing it: the constructor pins the vectorization factor to
/// ElementCount::getFixed(1) while forwarding the requested \p UnrollFactor to
/// the base InnerLoopVectorizer.
class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                    ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            ElementCount::getFixed(1), UnrollFactor, LVL, CM,
                            BFI, PSI, Check) {}

private:
  /// Overridden broadcast hook. NOTE(review): with VF == 1 a broadcast is
  /// presumably the scalar value itself — the definition is not visible in
  /// this chunk; confirm at the out-of-line definition.
  Value *getBroadcastInstrs(Value *V) override;
};

/// Encapsulate information regarding vectorization of a loop and its epilogue.
/// This information is meant to be updated and used across two stages of
/// epilogue vectorization.
struct EpilogueLoopVectorizationInfo {
  /// Vectorization and unroll factors chosen for the main vector loop.
  ElementCount MainLoopVF = ElementCount::getFixed(0);
  unsigned MainLoopUF = 0;
  /// Vectorization and unroll factors chosen for the epilogue vector loop.
  /// EpilogueUF must be 1 (asserted in the constructor).
  ElementCount EpilogueVF = ElementCount::getFixed(0);
  unsigned EpilogueUF = 0;
  /// Blocks holding the emitted iteration-count and safety checks. They start
  /// out null here and are populated by the skeleton-creation code (outside
  /// this struct).
  BasicBlock *MainLoopIterationCountCheck = nullptr;
  BasicBlock *EpilogueIterationCountCheck = nullptr;
  BasicBlock *SCEVSafetyCheck = nullptr;
  BasicBlock *MemSafetyCheck = nullptr;
  /// Trip counts shared between the two vectorization passes; also initially
  /// null and filled in externally.
  Value *TripCount = nullptr;
  Value *VectorTripCount = nullptr;

  /// Record the requested main-loop and epilogue-loop factors. An epilogue
  /// unroll factor other than 1 is rejected up front, since a high UF for the
  /// (short) epilogue loop is unlikely to pay off.
  EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF,
                                ElementCount EVF, unsigned EUF)
      : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) {
    assert(EUF == 1 &&
           "A high UF for the epilogue loop is likely not beneficial.");
  }
};

/// An extension of the inner loop vectorizer that creates a skeleton for a
/// vectorized loop that has its epilogue (residual) also vectorized.
/// The idea is to run the vplan on a given loop twice, firstly to setup the
/// skeleton and vectorize the main loop, and secondly to complete the skeleton
/// from the first step and vectorize the epilogue. This is achieved by
/// deriving two concrete strategy classes from this base class and invoking
/// them in succession from the loop vectorizer planner.
class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
public:
  InnerLoopAndEpilogueVectorizer(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      // The base vectorizer is set up with the *main* loop's VF and UF; the
      // epilogue factors carried in EPI are consumed by the derived
      // strategies.
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI,
                            Checks),
        EPI(EPI) {}

  // Override this function to handle the more complex control flow around the
  // three loops. Marked final: derived classes customize the skeleton through
  // createEpilogueVectorizedLoopSkeleton() instead.
  BasicBlock *createVectorizedLoopSkeleton() final override {
    return createEpilogueVectorizedLoopSkeleton();
  }

  /// The interface for creating a vectorized skeleton using one of two
  /// different strategies, each corresponding to one execution of the vplan
  /// as described above.
  virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0;

  /// Holds and updates state information required to vectorize the main loop
  /// and its epilogue in two separate passes. This setup helps us avoid
  /// regenerating and recomputing runtime safety checks. It also helps us to
  /// shorten the iteration-count-check path length for the cases where the
  /// iteration count of the loop is so small that the main vector loop is
  /// completely skipped.
  EpilogueLoopVectorizationInfo &EPI;
};

/// A specialized derived class of inner loop vectorizer that performs
/// vectorization of *main* loops in the process of vectorizing loops and their
/// epilogues.
class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerMainLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Check)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Check) {}

  /// Implements the interface for creating a vectorized skeleton using the
  /// *main loop* strategy (ie the first pass of vplan execution).
  BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check once for the main loop (when \p
  /// ForEpilogue is false) and once for the epilogue loop (when \p
  /// ForEpilogue is true).
  BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass,
                                             bool ForEpilogue);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};

/// A specialized derived class of inner loop vectorizer that performs
/// vectorization of *epilogue* loops in the process of vectorizing loops and
/// their epilogues.
class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerEpilogueLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Checks) {}

  /// Implements the interface for creating a vectorized skeleton using the
  /// *epilogue loop* strategy (ie the second pass of vplan execution).
  BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check after the main vector loop has
  /// finished to see if there are any iterations left to execute by either
  /// the vector epilogue or the scalar epilogue.
  BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L,
                                                      BasicBlock *Bypass,
                                                      BasicBlock *Insert);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};
} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its
/// operands. Returns \p I itself (possibly null, possibly without a location)
/// when neither \p I nor any of its instruction operands carry one.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return I;

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I;

  for (Use &Op : I->operands()) {
    if (Instruction *OpInst = dyn_cast<Instruction>(Op))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }

  return I;
}

/// Set the current debug location of the builder (the member Builder, or \p
/// CustomBuilder when one is supplied) from \p V. A null or non-instruction
/// \p V clears the location.
void InnerLoopVectorizer::setDebugLocFromInst(
    const Value *V, Optional<IRBuilder<> *> CustomBuilder) {
  IRBuilder<> *B = (CustomBuilder == None) ? &Builder : *CustomBuilder;
  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(V)) {
    const DILocation *DIL = Inst->getDebugLoc();

    // When a FSDiscriminator is enabled, we don't need to add the multiply
    // factors to the discriminators.
    if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
        !isa<DbgInfoIntrinsic>(Inst) && !EnableFSDiscriminator) {
      // FIXME: For scalable vectors, assume vscale=1.
      auto NewDIL =
          DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
      if (NewDIL)
        B->SetCurrentDebugLocation(NewDIL.getValue());
      else
        // Cloning can fail; fall through without updating the location.
        LLVM_DEBUG(dbgs()
                   << "Failed to create new discriminator: "
                   << DIL->getFilename() << " Line: " << DIL->getLine());
    } else
      B->SetCurrentDebugLocation(DIL);
  } else
    B->SetCurrentDebugLocation(DebugLoc());
}

/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
/// is passed, the message relates to that particular instruction.
#ifndef NDEBUG
static void debugVectorizationMessage(const StringRef Prefix,
                                      const StringRef DebugMsg,
                                      Instruction *I) {
  dbgs() << "LV: " << Prefix << DebugMsg;
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}
#endif

/// Create an analysis remark that explains why vectorization failed
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
/// RemarkName is the identifier for the remark. If \p I is passed it is an
/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
/// the location of the remark. \return the remark object that can be
/// streamed to.
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
                                                   StringRef RemarkName,
                                                   Loop *TheLoop,
                                                   Instruction *I) {
  Value *CodeRegion = TheLoop->getHeader();
  DebugLoc DL = TheLoop->getStartLoc();

  if (I) {
    CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, revert back to
    // using the loop's.
    if (I->getDebugLoc())
      DL = I->getDebugLoc();
  }

  return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
}

/// Return a value for Step multiplied by VF.
static Value *createStepForVF(IRBuilder<> &B, Type *Ty, ElementCount VF,
                              int64_t Step) {
  assert(Ty->isIntegerTy() && "Expected an integer step");
  Constant *StepVal = ConstantInt::get(Ty, Step * VF.getKnownMinValue());
  // For scalable VFs the constant holds the known minimum; scale by vscale.
  return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
}

namespace llvm {

/// Return the runtime value for VF.
Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF) {
  Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue());
  return VF.isScalable() ? B.CreateVScale(EC) : EC;
}

/// Return the runtime value for VF as a value of floating-point type \p FTy:
/// computed as an integer of matching bit width, then converted via uitofp.
static Value *getRuntimeVFAsFloat(IRBuilder<> &B, Type *FTy, ElementCount VF) {
  assert(FTy->isFloatingPointTy() && "Expected floating point type!");
  Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits());
  Value *RuntimeVF = getRuntimeVF(B, IntTy, VF);
  return B.CreateUIToFP(RuntimeVF, FTy);
}

/// Report a vectorization failure: print \p DebugMsg to the debug stream and
/// emit an optimization-remark analysis (prefixed "loop not vectorized: ")
/// carrying \p OREMsg under tag \p ORETag.
void reportVectorizationFailure(const StringRef DebugMsg,
                                const StringRef OREMsg, const StringRef ORETag,
                                OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                                Instruction *I) {
  LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(
      createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
      << "loop not vectorized: " << OREMsg);
}

/// Report an informational vectorization message \p Msg both to the debug
/// stream and as an optimization-remark analysis under tag \p ORETag.
void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
                             OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                             Instruction *I) {
  LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(
      createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
      << Msg);
}

} // end namespace llvm

#ifndef NDEBUG
/// \return string containing a file name and a line # for the given loop.
static std::string getDebugLocString(const Loop *L) {
  std::string Result;
  if (L) {
    raw_string_ostream OS(Result);
    if (const DebugLoc LoopDbgLoc = L->getStartLoc())
      LoopDbgLoc.print(OS);
    else
      // Just print the module name.
      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
    OS.flush();
  }
  return Result;
}
#endif

void InnerLoopVectorizer::addNewMetadata(Instruction *To,
                                         const Instruction *Orig) {
  // If the loop was versioned with memchecks, add the corresponding no-alias
  // metadata. Only loads and stores carry this annotation.
  if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
    LVer->annotateInstWithNoAlias(To, Orig);
}

void InnerLoopVectorizer::collectPoisonGeneratingRecipes(
    VPTransformState &State) {

  // Collect recipes in the backward slice of `Root` that may generate a poison
  // value that is used after vectorization.
  SmallPtrSet<VPRecipeBase *, 16> Visited;
  auto collectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
    SmallVector<VPRecipeBase *, 16> Worklist;
    Worklist.push_back(Root);

    // Traverse the backward slice of Root through its use-def chain.
    while (!Worklist.empty()) {
      VPRecipeBase *CurRec = Worklist.back();
      Worklist.pop_back();

      // Visited is shared across invocations of the lambda, so slices that
      // overlap are only walked once.
      if (!Visited.insert(CurRec).second)
        continue;

      // Prune search if we find another recipe generating a widen memory
      // instruction. Widen memory instructions involved in address computation
      // will lead to gather/scatter instructions, which don't need to be
      // handled.
      if (isa<VPWidenMemoryInstructionRecipe>(CurRec) ||
          isa<VPInterleaveRecipe>(CurRec))
        continue;

      // This recipe contributes to the address computation of a widen
      // load/store. Collect recipe if its underlying instruction has
      // poison-generating flags.
      Instruction *Instr = CurRec->getUnderlyingInstr();
      if (Instr && Instr->hasPoisonGeneratingFlags())
        State.MayGeneratePoisonRecipes.insert(CurRec);

      // Add new definitions to the worklist.
      for (VPValue *operand : CurRec->operands())
        if (VPDef *OpDef = operand->getDef())
          Worklist.push_back(cast<VPRecipeBase>(OpDef));
    }
  });

  // Traverse all the recipes in the VPlan and collect the poison-generating
  // recipes in the backward slice starting at the address of a
  // VPWidenMemoryInstructionRecipe or VPInterleaveRecipe.
  auto Iter = depth_first(
      VPBlockRecursiveTraversalWrapper<VPBlockBase *>(State.Plan->getEntry()));
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
    for (VPRecipeBase &Recipe : *VPBB) {
      if (auto *WidenRec = dyn_cast<VPWidenMemoryInstructionRecipe>(&Recipe)) {
        Instruction *UnderlyingInstr = WidenRec->getUnderlyingInstr();
        VPDef *AddrDef = WidenRec->getAddr()->getDef();
        // Only consecutive (non-gather/scatter) accesses in blocks that need
        // predication are of interest.
        if (AddrDef && WidenRec->isConsecutive() && UnderlyingInstr &&
            Legal->blockNeedsPredication(UnderlyingInstr->getParent()))
          collectPoisonGeneratingInstrsInBackwardSlice(
              cast<VPRecipeBase>(AddrDef));
      } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
        VPDef *AddrDef = InterleaveRec->getAddr()->getDef();
        if (AddrDef) {
          // Check if any member of the interleave group needs predication.
          const InterleaveGroup<Instruction> *InterGroup =
              InterleaveRec->getInterleaveGroup();
          bool NeedPredication = false;
          for (int I = 0, NumMembers = InterGroup->getNumMembers();
               I < NumMembers; ++I) {
            Instruction *Member = InterGroup->getMember(I);
            if (Member)
              NeedPredication |=
                  Legal->blockNeedsPredication(Member->getParent());
          }

          if (NeedPredication)
            collectPoisonGeneratingInstrsInBackwardSlice(
                cast<VPRecipeBase>(AddrDef));
        }
      }
    }
  }
}

/// Propagate \p From's metadata to \p To, including the no-alias annotations
/// added by addNewMetadata when the loop was versioned with memchecks.
void InnerLoopVectorizer::addMetadata(Instruction *To,
                                      Instruction *From) {
  propagateMetadata(To, From);
  addNewMetadata(To, From);
}

/// Apply \p From's metadata to every Instruction in \p To (non-instruction
/// values are skipped).
void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
                                      Instruction *From) {
  for (Value *V : To) {
    if (Instruction *I = dyn_cast<Instruction>(V))
      addMetadata(I, From);
  }
}

namespace llvm {

// Loop vectorization cost-model hints how the scalar epilogue loop should be
// lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorization with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate,

  // Directive indicating we must either tail fold or not vectorize.
  CM_ScalarEpilogueNotAllowedUsePredicate
};

/// ElementCountComparator creates a total ordering for ElementCount
/// for the purposes of using it in a set structure.
struct ElementCountComparator {
  /// Fixed VFs sort before scalable ones; ties are broken by the known
  /// minimum element count.
  bool operator()(const ElementCount &LHS, const ElementCount &RHS) const {
    return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) <
           std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue());
  }
};

/// A set of candidate vectorization factors, totally ordered by
/// ElementCountComparator.
using ElementCountSet = SmallSet<ElementCount, 16, ElementCountComparator>;

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because of
/// a number of reasons. In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
class LoopVectorizationCostModel {
public:
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {}

  /// \return An upper bound for the vectorization factors (both fixed and
  /// scalable). If the factors are 0, vectorization and interleaving should be
  /// avoided up front.
  FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
  bool runtimeChecksRequired();

  /// \return The most profitable vectorization factor and the cost of that VF.
1288 /// This method checks every VF in \p CandidateVFs. If UserVF is not ZERO 1289 /// then this vectorization factor will be selected if vectorization is 1290 /// possible. 1291 VectorizationFactor 1292 selectVectorizationFactor(const ElementCountSet &CandidateVFs); 1293 1294 VectorizationFactor 1295 selectEpilogueVectorizationFactor(const ElementCount MaxVF, 1296 const LoopVectorizationPlanner &LVP); 1297 1298 /// Setup cost-based decisions for user vectorization factor. 1299 /// \return true if the UserVF is a feasible VF to be chosen. 1300 bool selectUserVectorizationFactor(ElementCount UserVF) { 1301 collectUniformsAndScalars(UserVF); 1302 collectInstsToScalarize(UserVF); 1303 return expectedCost(UserVF).first.isValid(); 1304 } 1305 1306 /// \return The size (in bits) of the smallest and widest types in the code 1307 /// that needs to be vectorized. We ignore values that remain scalar such as 1308 /// 64 bit loop indices. 1309 std::pair<unsigned, unsigned> getSmallestAndWidestTypes(); 1310 1311 /// \return The desired interleave count. 1312 /// If interleave count has been specified by metadata it will be returned. 1313 /// Otherwise, the interleave count is computed and returned. VF and LoopCost 1314 /// are the selected vectorization factor and the cost of the selected VF. 1315 unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost); 1316 1317 /// Memory access instruction may be vectorized in more than one way. 1318 /// Form of instruction after vectorization depends on cost. 1319 /// This function takes cost-based decisions for Load/Store instructions 1320 /// and collects them in a map. This decisions map is used for building 1321 /// the lists of loop-uniform and loop-scalar instructions. 1322 /// The calculated cost is saved with widening decision in order to 1323 /// avoid redundant calculations. 
1324 void setCostBasedWideningDecision(ElementCount VF); 1325 1326 /// A struct that represents some properties of the register usage 1327 /// of a loop. 1328 struct RegisterUsage { 1329 /// Holds the number of loop invariant values that are used in the loop. 1330 /// The key is ClassID of target-provided register class. 1331 SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs; 1332 /// Holds the maximum number of concurrent live intervals in the loop. 1333 /// The key is ClassID of target-provided register class. 1334 SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers; 1335 }; 1336 1337 /// \return Returns information about the register usages of the loop for the 1338 /// given vectorization factors. 1339 SmallVector<RegisterUsage, 8> 1340 calculateRegisterUsage(ArrayRef<ElementCount> VFs); 1341 1342 /// Collect values we want to ignore in the cost model. 1343 void collectValuesToIgnore(); 1344 1345 /// Collect all element types in the loop for which widening is needed. 1346 void collectElementTypesForWidening(); 1347 1348 /// Split reductions into those that happen in the loop, and those that happen 1349 /// outside. In loop reductions are collected into InLoopReductionChains. 1350 void collectInLoopReductions(); 1351 1352 /// Returns true if we should use strict in-order reductions for the given 1353 /// RdxDesc. This is true if the -enable-strict-reductions flag is passed, 1354 /// the IsOrdered flag of RdxDesc is set and we do not allow reordering 1355 /// of FP operations. 1356 bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) { 1357 return !Hints->allowReordering() && RdxDesc.isOrdered(); 1358 } 1359 1360 /// \returns The smallest bitwidth each instruction can be represented with. 1361 /// The vector equivalents of these instructions should be truncated to this 1362 /// type. 
1363 const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const { 1364 return MinBWs; 1365 } 1366 1367 /// \returns True if it is more profitable to scalarize instruction \p I for 1368 /// vectorization factor \p VF. 1369 bool isProfitableToScalarize(Instruction *I, ElementCount VF) const { 1370 assert(VF.isVector() && 1371 "Profitable to scalarize relevant only for VF > 1."); 1372 1373 // Cost model is not run in the VPlan-native path - return conservative 1374 // result until this changes. 1375 if (EnableVPlanNativePath) 1376 return false; 1377 1378 auto Scalars = InstsToScalarize.find(VF); 1379 assert(Scalars != InstsToScalarize.end() && 1380 "VF not yet analyzed for scalarization profitability"); 1381 return Scalars->second.find(I) != Scalars->second.end(); 1382 } 1383 1384 /// Returns true if \p I is known to be uniform after vectorization. 1385 bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const { 1386 if (VF.isScalar()) 1387 return true; 1388 1389 // Cost model is not run in the VPlan-native path - return conservative 1390 // result until this changes. 1391 if (EnableVPlanNativePath) 1392 return false; 1393 1394 auto UniformsPerVF = Uniforms.find(VF); 1395 assert(UniformsPerVF != Uniforms.end() && 1396 "VF not yet analyzed for uniformity"); 1397 return UniformsPerVF->second.count(I); 1398 } 1399 1400 /// Returns true if \p I is known to be scalar after vectorization. 1401 bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const { 1402 if (VF.isScalar()) 1403 return true; 1404 1405 // Cost model is not run in the VPlan-native path - return conservative 1406 // result until this changes. 
    // Cost model is not run in the VPlan-native path - conservatively report
    // "not scalar". (NOTE(review): the enclosing function's signature is above
    // this chunk; it queries the per-VF Scalars set -- confirm against full
    // file.)
    if (EnableVPlanNativePath)
      return false;

    auto ScalarsPerVF = Scalars.find(VF);
    assert(ScalarsPerVF != Scalars.end() &&
           "Scalar values are not calculated for VF");
    return ScalarsPerVF->second.count(I);
  }

  /// \returns True if instruction \p I can be truncated to a smaller bitwidth
  /// for vectorization factor \p VF.
  bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
    // Truncation only applies to instructions with a computed minimal
    // bitwidth that will actually be widened (neither profitably scalarized
    // nor scalar after vectorization).
    return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
           !isProfitableToScalarize(I, VF) &&
           !isScalarAfterVectorization(I, VF);
  }

  /// Decision that was taken during cost calculation for memory instruction.
  enum InstWidening {
    CM_Unknown,
    CM_Widen,         // For consecutive accesses with stride +1.
    CM_Widen_Reverse, // For consecutive accesses with stride -1.
    CM_Interleave,
    CM_GatherScatter,
    CM_Scalarize
  };

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// instruction \p I and vector width \p VF.
  void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
                           InstructionCost Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
  }

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// interleaving group \p Grp and vector width \p VF.
  void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
                           ElementCount VF, InstWidening W,
                           InstructionCost Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    /// Broadcast this decision to all instructions inside the group.
    /// But the cost will be assigned to one instruction only.
    for (unsigned i = 0; i < Grp->getFactor(); ++i) {
      if (auto *I = Grp->getMember(i)) {
        // Only the insert-position member carries the group's cost; the other
        // members record the decision with a cost of zero so the group cost
        // is not double-counted.
        if (Grp->getInsertPos() == I)
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
        else
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
      }
    }
  }

  /// Return the cost model decision for the given instruction \p I and vector
  /// width \p VF. Return CM_Unknown if this instruction did not pass
  /// through the cost modeling.
  InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
    assert(VF.isVector() && "Expected VF to be a vector VF");
    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return CM_GatherScatter;

    std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
    auto Itr = WideningDecisions.find(InstOnVF);
    if (Itr == WideningDecisions.end())
      return CM_Unknown;
    return Itr->second.first;
  }

  /// Return the vectorization cost for the given instruction \p I and vector
  /// width \p VF. Asserts that a decision was previously recorded via
  /// setWideningDecision.
  InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
    assert(VF.isVector() && "Expected VF >=2");
    std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
    assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
           "The cost is not calculated");
    return WideningDecisions[InstOnVF].second;
  }

  /// Return True if instruction \p I is an optimizable truncate whose operand
  /// is an induction variable. Such a truncate will be removed by adding a new
  /// induction variable with the destination type.
  bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
    // If the instruction is not a truncate, return false.
    auto *Trunc = dyn_cast<TruncInst>(I);
    if (!Trunc)
      return false;

    // Get the source and destination types of the truncate.
    Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
    Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);

    // If the truncate is free for the given types, return false. Replacing a
    // free truncate with an induction variable would add an induction variable
    // update instruction to each iteration of the loop. We exclude from this
    // check the primary induction variable since it will need an update
    // instruction regardless.
    Value *Op = Trunc->getOperand(0);
    if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
      return false;

    // If the truncated value is not an induction variable, return false.
    return Legal->isInductionPhi(Op);
  }

  /// Collects the instructions to scalarize for each predicated instruction in
  /// the loop.
  void collectInstsToScalarize(ElementCount VF);

  /// Collect Uniform and Scalar values for the given \p VF.
  /// The sets depend on CM decision for Load/Store instructions
  /// that may be vectorized as interleave, gather-scatter or scalarized.
  void collectUniformsAndScalars(ElementCount VF) {
    // Do the analysis once.
    if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end())
      return;
    // Widening decisions must be taken first; uniform/scalar collection
    // depends on them.
    setCostBasedWideningDecision(VF);
    collectLoopUniforms(VF);
    collectLoopScalars(VF);
  }

  /// Returns true if the target machine supports masked store operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const {
    // Masked stores are only considered for consecutive accesses;
    // non-consecutive accesses are handled by isLegalGatherOrScatter.
    return Legal->isConsecutivePtr(DataType, Ptr) &&
           TTI.isLegalMaskedStore(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked load operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const {
    return Legal->isConsecutivePtr(DataType, Ptr) &&
           TTI.isLegalMaskedLoad(DataType, Alignment);
  }

  /// Returns true if the target machine can represent \p V as a masked gather
  /// or scatter operation. Returns false if \p V is not a load or store.
  bool isLegalGatherOrScatter(Value *V) {
    bool LI = isa<LoadInst>(V);
    bool SI = isa<StoreInst>(V);
    if (!LI && !SI)
      return false;
    auto *Ty = getLoadStoreType(V);
    Align Align = getLoadStoreAlignment(V);
    return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
           (SI && TTI.isLegalMaskedScatter(Ty, Align));
  }

  /// Returns true if the target machine supports all of the reduction
  /// variables found for the given VF.
  bool canVectorizeReductions(ElementCount VF) const {
    return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
      const RecurrenceDescriptor &RdxDesc = Reduction.second;
      return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
    }));
  }

  /// Returns true if \p I is an instruction that will be scalarized with
  /// predication. Such instructions include conditional stores and
  /// instructions that may divide by zero.
  /// If a non-zero VF has been calculated, we check if I will be scalarized
  /// with predication for that VF.
  bool isScalarWithPredication(Instruction *I) const;

  // Returns true if \p I is an instruction that will be predicated either
  // through scalar predication or masked load/store or masked gather/scatter.
  // Superset of instructions that return true for isScalarWithPredication.
  bool isPredicatedInst(Instruction *I, bool IsKnownUniform = false) {
    // When we know the load is uniform and the original scalar loop was not
    // predicated we don't need to mark it as a predicated instruction. Any
    // vectorised blocks created when tail-folding are something artificial we
    // have introduced and we know there is always at least one active lane.
    // That's why we call Legal->blockNeedsPredication here because it doesn't
    // query tail-folding.
    if (IsKnownUniform && isa<LoadInst>(I) &&
        !Legal->blockNeedsPredication(I->getParent()))
      return false;
    if (!blockNeedsPredicationForAnyReason(I->getParent()))
      return false;
    // Loads and stores that need some form of masked operation are predicated
    // instructions.
    if (isa<LoadInst>(I) || isa<StoreInst>(I))
      return Legal->isMaskRequired(I);
    return isScalarWithPredication(I);
  }

  /// Returns true if \p I is a memory instruction with consecutive memory
  /// access that can be widened.
  bool
  memoryInstructionCanBeWidened(Instruction *I,
                                ElementCount VF = ElementCount::getFixed(1));

  /// Returns true if \p I is a memory instruction in an interleaved-group
  /// of memory accesses that can be vectorized with wide vector loads/stores
  /// and shuffles.
  bool
  interleavedAccessCanBeWidened(Instruction *I,
                                ElementCount VF = ElementCount::getFixed(1));

  /// Check if \p Instr belongs to any interleaved access group.
  bool isAccessInterleaved(Instruction *Instr) {
    return InterleaveInfo.isInterleaved(Instr);
  }

  /// Get the interleaved access group that \p Instr belongs to.
  const InterleaveGroup<Instruction> *
  getInterleavedAccessGroup(Instruction *Instr) {
    return InterleaveInfo.getInterleaveGroup(Instr);
  }

  /// Returns true if we're required to use a scalar epilogue for at least
  /// the final iteration of the original loop.
  bool requiresScalarEpilogue(ElementCount VF) const {
    if (!isScalarEpilogueAllowed())
      return false;
    // If we might exit from anywhere but the latch, must run the exiting
    // iteration in scalar form.
    if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
      return true;
    // Interleave groups with gaps also require a scalar epilogue (but only
    // when actually vectorizing).
    return VF.isVector() && InterleaveInfo.requiresScalarEpilogue();
  }

  /// Returns true if a scalar epilogue is not allowed due to optsize or a
  /// loop hint annotation.
  bool isScalarEpilogueAllowed() const {
    return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
  }

  /// Returns true if all loop blocks should be masked to fold tail loop.
  bool foldTailByMasking() const { return FoldTailByMasking; }

  /// Returns true if the instructions in this block require predication
  /// for any reason, e.g. because tail folding now requires a predicate
  /// or because the block in the original loop was predicated.
  bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const {
    return foldTailByMasking() || Legal->blockNeedsPredication(BB);
  }

  /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
  /// nodes to the chain of instructions representing the reductions. Uses a
  /// MapVector to ensure deterministic iteration order.
  using ReductionChainMap =
      SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;

  /// Return the chain of instructions representing an inloop reduction.
  const ReductionChainMap &getInLoopReductionChains() const {
    return InLoopReductionChains;
  }

  /// Returns true if the Phi is part of an inloop reduction.
  bool isInLoopReduction(PHINode *Phi) const {
    return InLoopReductionChains.count(Phi);
  }

  /// Estimate cost of an intrinsic call instruction CI if it were vectorized
  /// with factor VF. Return the cost of the instruction, including
  /// scalarization overhead if it's needed.
  InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;

  /// Estimate cost of a call instruction CI if it were vectorized with factor
  /// VF. Return the cost of the instruction, including scalarization overhead
  /// if it's needed. The flag NeedToScalarize shows if the call needs to be
  /// scalarized -
  /// i.e. either vector version isn't available, or is too expensive.
  InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF,
                                    bool &NeedToScalarize) const;

  /// Returns true if the per-lane cost of VectorizationFactor A is lower than
  /// that of B.
  bool isMoreProfitable(const VectorizationFactor &A,
                        const VectorizationFactor &B) const;

  /// Invalidates decisions already taken by the cost model.
  void invalidateCostModelingDecisions() {
    WideningDecisions.clear();
    Uniforms.clear();
    Scalars.clear();
  }

private:
  unsigned NumPredStores = 0;

  /// \return An upper bound for the vectorization factors for both
  /// fixed and scalable vectorization, where the minimum-known number of
  /// elements is a power-of-2 larger than zero. If scalable vectorization is
  /// disabled or unsupported, then the scalable part will be equal to
  /// ElementCount::getScalable(0).
  FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount,
                                           ElementCount UserVF,
                                           bool FoldTailByMasking);

  /// \return the maximized element count based on the targets vector
  /// registers and the loop trip-count, but limited to a maximum safe VF.
  /// This is a helper function of computeFeasibleMaxVF.
  /// FIXME: MaxSafeVF is currently passed by reference to avoid some obscure
  /// issue that occurred on one of the buildbots which cannot be reproduced
  /// without having access to the proprietary compiler (see comments on
  /// D98509). The issue is currently under investigation and this workaround
  /// will be removed as soon as possible.
  ElementCount getMaximizedVFForTarget(unsigned ConstTripCount,
                                       unsigned SmallestType,
                                       unsigned WidestType,
                                       const ElementCount &MaxSafeVF,
                                       bool FoldTailByMasking);

  /// \return the maximum legal scalable VF, based on the safe max number
  /// of elements.
  ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);

  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
  using VectorizationCostTy = std::pair<InstructionCost, bool>;

  /// Returns the expected execution cost. The unit of the cost does
  /// not matter because we use the 'cost' units to compare different
  /// vector widths. The cost that is returned is *not* normalized by
  /// the factor width. If \p Invalid is not nullptr, this function
  /// will add a pair(Instruction*, ElementCount) to \p Invalid for
  /// each instruction that has an Invalid cost for the given VF.
  using InstructionVFPair = std::pair<Instruction *, ElementCount>;
  VectorizationCostTy
  expectedCost(ElementCount VF,
               SmallVectorImpl<InstructionVFPair> *Invalid = nullptr);

  /// Returns the execution time cost of an instruction for a given vector
  /// width. Vector width of one means scalar.
  VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);

  /// The cost-computation logic from getInstructionCost which provides
  /// the vector type as an output parameter.
  InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
                                     Type *&VectorTy);

  /// Return the cost of instructions in an inloop reduction pattern, if I is
  /// part of that pattern.
  Optional<InstructionCost>
  getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
                          TTI::TargetCostKind CostKind);

  /// Calculate vectorization cost of memory instruction \p I.
  InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);

  /// The cost computation for scalarized memory instruction.
  InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);

  /// The cost computation for interleaving group of memory instructions.
  InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);

  /// The cost computation for Gather/Scatter instruction.
  InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);

  /// The cost computation for widening instruction \p I with consecutive
  /// memory access.
  InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);

  /// The cost calculation for Load/Store instruction \p I with uniform pointer -
  /// Load: scalar load + broadcast.
  /// Store: scalar store + (loop invariant value stored? 0 : extract of last
  /// element)
  InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);

  /// Estimate the overhead of scalarizing an instruction. This is a
  /// convenience wrapper for the type-based getScalarizationOverhead API.
  InstructionCost getScalarizationOverhead(Instruction *I,
                                           ElementCount VF) const;

  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
  bool isConsecutiveLoadOrStore(Instruction *I);

  /// Returns true if an artificially high cost for emulated masked memrefs
  /// should be used.
  bool useEmulatedMaskMemRefHack(Instruction *I);

  /// Map of scalar integer values to the smallest bitwidth they can be legally
  /// represented as. The vector equivalents of these values should be truncated
  /// to this type.
  MapVector<Instruction *, uint64_t> MinBWs;

  /// A type representing the costs for instructions if they were to be
  /// scalarized rather than vectorized. The entries are Instruction-Cost
  /// pairs.
  using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;

  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as a predicated block.
  SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;

  /// Records whether it is allowed to have the original scalar loop execute at
  /// least once. This may be needed as a fallback loop in case runtime
  /// aliasing/dependence checks fail, or to handle the tail/remainder
  /// iterations when the trip count is unknown or doesn't divide by the VF,
  /// or as a peel-loop to handle gaps in interleave-groups.
  /// Under optsize and when the trip count is very small we don't allow any
  /// iterations to execute in the scalar loop.
  ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;

  /// All blocks of loop are to be masked to fold tail of scalar iterations.
  bool FoldTailByMasking = false;

  /// A map holding scalar costs for different vectorization factors. The
  /// presence of a cost for an instruction in the mapping indicates that the
  /// instruction will be scalarized when vectorizing with the associated
  /// vectorization factor. The entries are VF-ScalarCostTy pairs.
  DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;

  /// Holds the instructions known to be uniform after vectorization.
  /// The data is collected per VF.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;

  /// Holds the instructions known to be scalar after vectorization.
  /// The data is collected per VF.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;

  /// Holds the instructions (address computations) that are forced to be
  /// scalarized.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;

  /// PHINodes of the reductions that should be expanded in-loop along with
  /// their associated chains of reduction operations, in program order from top
  /// (PHI) to bottom.
  ReductionChainMap InLoopReductionChains;

  /// A Map of inloop reduction operations and their immediate chain operand.
  /// FIXME: This can be removed once reductions can be costed correctly in
  /// vplan. This was added to allow quick lookup to the inloop operations,
  /// without having to loop through InLoopReductionChains.
  DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;

  /// Returns the expected difference in cost from scalarizing the expression
  /// feeding a predicated instruction \p PredInst. The instructions to
  /// scalarize and their scalar costs are collected in \p ScalarCosts. A
  /// non-negative return value implies the expression will be scalarized.
  /// Currently, only single-use chains are considered for scalarization.
  int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
                              ElementCount VF);

  /// Collect the instructions that are uniform after vectorization. An
  /// instruction is uniform if we represent it with a single scalar value in
  /// the vectorized loop corresponding to each vector iteration. Examples of
  /// uniform instructions include pointer operands of consecutive or
  /// interleaved memory accesses. Note that although uniformity implies an
  /// instruction will be scalar, the reverse is not true. In general, a
  /// scalarized instruction will be represented by VF scalar values in the
  /// vectorized loop, each corresponding to an iteration of the original
  /// scalar loop.
  void collectLoopUniforms(ElementCount VF);

  /// Collect the instructions that are scalar after vectorization. An
  /// instruction is scalar if it is known to be uniform or will be scalarized
  /// during vectorization. collectLoopScalars should only add non-uniform nodes
  /// to the list if they are used by a load/store instruction that is marked as
  /// CM_Scalarize. Non-uniform scalarized instructions will be represented by
  /// VF values in the vectorized loop, each corresponding to an iteration of
  /// the original scalar loop.
  void collectLoopScalars(ElementCount VF);

  /// Keeps cost model vectorization decision and cost for instructions.
  /// Right now it is used for memory instructions only.
  using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
                                std::pair<InstWidening, InstructionCost>>;

  DecisionList WideningDecisions;

  /// Returns true if \p V is expected to be vectorized and it needs to be
  /// extracted.
  bool needsExtract(Value *V, ElementCount VF) const {
    Instruction *I = dyn_cast<Instruction>(V);
    if (VF.isScalar() || !I || !TheLoop->contains(I) ||
        TheLoop->isLoopInvariant(I))
      return false;

    // Assume we can vectorize V (and hence we need extraction) if the
    // scalars are not computed yet. This can happen, because it is called
    // via getScalarizationOverhead from setCostBasedWideningDecision, before
    // the scalars are collected. That should be a safe assumption in most
    // cases, because we check if the operands have vectorizable types
    // beforehand in LoopVectorizationLegality.
    return Scalars.find(VF) == Scalars.end() ||
           !isScalarAfterVectorization(I, VF);
  };

  /// Returns a range containing only operands needing to be extracted.
  SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
                                                   ElementCount VF) const {
    return SmallVector<Value *, 4>(make_filter_range(
        Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
  }

  /// Determines if we have the infrastructure to vectorize loop \p L and its
  /// epilogue, assuming the main loop is vectorized by \p VF.
  bool isCandidateForEpilogueVectorization(const Loop &L,
                                           const ElementCount VF) const;

  /// Returns true if epilogue vectorization is considered profitable, and
  /// false otherwise.
  /// \p VF is the vectorization factor chosen for the original loop.
  bool isEpilogueVectorizationProfitable(const ElementCount VF) const;

public:
  /// The loop that we evaluate.
  Loop *TheLoop;

  /// Predicated scalar evolution analysis.
  PredicatedScalarEvolution &PSE;

  /// Loop Info analysis.
  LoopInfo *LI;

  /// Vectorization legality.
  LoopVectorizationLegality *Legal;

  /// Vector target information.
  const TargetTransformInfo &TTI;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Demanded bits analysis.
  DemandedBits *DB;

  /// Assumption cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// The function containing the loop under analysis.
  const Function *TheFunction;

  /// Loop Vectorize Hint.
  const LoopVectorizeHints *Hints;

  /// The interleave access information contains groups of interleaved accesses
  /// with the same stride and close to each other.
  InterleavedAccessInfo &InterleaveInfo;

  /// Values to ignore in the cost model.
  SmallPtrSet<const Value *, 16> ValuesToIgnore;

  /// Values to ignore in the cost model when VF > 1.
  SmallPtrSet<const Value *, 16> VecValuesToIgnore;

  /// All element types found in the loop.
  SmallPtrSet<Type *, 16> ElementTypesInLoop;

  /// Profitable vector factors.
  SmallVector<VectorizationFactor, 8> ProfitableVFs;
};
} // end namespace llvm

/// Helper struct to manage generating runtime checks for vectorization.
///
/// The runtime checks are created up-front in temporary blocks to allow better
/// estimating the cost and un-linked from the existing IR. After deciding to
/// vectorize, the checks are moved back. If deciding not to vectorize, the
/// temporary blocks are completely removed.
class GeneratedRTChecks {
  /// Basic block which contains the generated SCEV checks, if any.
  BasicBlock *SCEVCheckBlock = nullptr;

  /// The value representing the result of the generated SCEV checks. If it is
  /// nullptr, either no SCEV checks have been generated or they have been used.
  Value *SCEVCheckCond = nullptr;

  /// Basic block which contains the generated memory runtime checks, if any.
  BasicBlock *MemCheckBlock = nullptr;

  /// The value representing the result of the generated memory runtime checks.
  /// If it is nullptr, either no memory runtime checks have been generated or
  /// they have been used.
  Value *MemRuntimeCheckCond = nullptr;

  DominatorTree *DT;
  LoopInfo *LI;

  /// Expanders used to materialize the SCEV predicate checks and the memory
  /// runtime checks, respectively.
  SCEVExpander SCEVExp;
  SCEVExpander MemCheckExp;

public:
  // NOTE(review): MemCheckExp reuses the "scev.check" name prefix; presumably
  // intentional, but a distinct prefix may aid debugging -- confirm.
  GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
                    const DataLayout &DL)
      : DT(DT), LI(LI), SCEVExp(SE, DL, "scev.check"),
        MemCheckExp(SE, DL, "scev.check") {}

  /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
  /// accurately estimate the cost of the runtime checks. The blocks are
  /// un-linked from the IR and are added back during vector code generation.
  /// If there is no vector code generation, the check blocks are removed
  /// completely.
  void Create(Loop *L, const LoopAccessInfo &LAI,
              const SCEVUnionPredicate &UnionPred) {

    BasicBlock *LoopHeader = L->getHeader();
    BasicBlock *Preheader = L->getLoopPreheader();

    // Use SplitBlock to create blocks for SCEV & memory runtime checks to
    // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
    // may be used by SCEVExpander. The blocks will be un-linked from their
    // predecessors and removed from LI & DT at the end of the function.
    if (!UnionPred.isAlwaysTrue()) {
      SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
                                  nullptr, "vector.scevcheck");

      SCEVCheckCond = SCEVExp.expandCodeForPredicate(
          &UnionPred, SCEVCheckBlock->getTerminator());
    }

    const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
    if (RtPtrChecking.Need) {
      // Chain the memcheck block after the SCEV check block, if one exists.
      auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
      MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
                                 "vector.memcheck");

      MemRuntimeCheckCond =
          addRuntimeChecks(MemCheckBlock->getTerminator(), L,
                           RtPtrChecking.getChecks(), MemCheckExp);
      assert(MemRuntimeCheckCond &&
             "no RT checks generated although RtPtrChecking "
             "claimed checks are required");
    }

    if (!MemCheckBlock && !SCEVCheckBlock)
      return;

    // Unhook the temporary block with the checks, update various places
    // accordingly.
    if (SCEVCheckBlock)
      SCEVCheckBlock->replaceAllUsesWith(Preheader);
    if (MemCheckBlock)
      MemCheckBlock->replaceAllUsesWith(Preheader);

    // Give each check block a placeholder unreachable terminator and hand the
    // preheader's original terminator back to the preheader.
    if (SCEVCheckBlock) {
      SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
      new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
      Preheader->getTerminator()->eraseFromParent();
    }
    if (MemCheckBlock) {
      MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
      new UnreachableInst(Preheader->getContext(), MemCheckBlock);
      Preheader->getTerminator()->eraseFromParent();
    }

    // Remove the now-detached check blocks from DT & LI.
    DT->changeImmediateDominator(LoopHeader, Preheader);
    if (MemCheckBlock) {
      DT->eraseNode(MemCheckBlock);
      LI->removeBlock(MemCheckBlock);
    }
    if (SCEVCheckBlock) {
      DT->eraseNode(SCEVCheckBlock);
      LI->removeBlock(SCEVCheckBlock);
    }
  }

  /// Remove the created SCEV & memory runtime check blocks & instructions, if
  /// unused.
  ~GeneratedRTChecks() {
    SCEVExpanderCleaner SCEVCleaner(SCEVExp, *DT);
    SCEVExpanderCleaner MemCheckCleaner(MemCheckExp, *DT);
    // A null condition means the corresponding checks were either never
    // generated or were already wired into the vector loop by emitSCEVChecks/
    // emitMemRuntimeChecks; mark them used so the cleaner keeps them.
    if (!SCEVCheckCond)
      SCEVCleaner.markResultUsed();

    if (!MemRuntimeCheckCond)
      MemCheckCleaner.markResultUsed();

    if (MemRuntimeCheckCond) {
      auto &SE = *MemCheckExp.getSE();
      // Memory runtime check generation creates compares that use expanded
      // values. Remove them before running the SCEVExpanderCleaners.
      for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) {
        if (MemCheckExp.isInsertedInstruction(&I))
          continue;
        SE.forgetValue(&I);
        I.eraseFromParent();
      }
    }
    MemCheckCleaner.cleanup();
    SCEVCleaner.cleanup();

    // Non-null conditions mean the checks went unused; drop their blocks.
    if (SCEVCheckCond)
      SCEVCheckBlock->eraseFromParent();
    if (MemRuntimeCheckCond)
      MemCheckBlock->eraseFromParent();
  }

  /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and
  /// adjusts the branches to branch to the vector preheader or \p Bypass,
  /// depending on the generated condition.
  BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass,
                             BasicBlock *LoopVectorPreHeader,
                             BasicBlock *LoopExitBlock) {
    if (!SCEVCheckCond)
      return nullptr;
    // A constant-false condition can never trigger the bypass; no block is
    // needed.
    if (auto *C = dyn_cast<ConstantInt>(SCEVCheckCond))
      if (C->isZero())
        return nullptr;

    auto *Pred = LoopVectorPreHeader->getSinglePredecessor();

    BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock);
    // Create new preheader for vector loop.
    if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
      PL->addBasicBlockToLoop(SCEVCheckBlock, *LI);

    // Drop the placeholder terminator installed by Create() and splice the
    // check block in front of the vector preheader.
    SCEVCheckBlock->getTerminator()->eraseFromParent();
    SCEVCheckBlock->moveBefore(LoopVectorPreHeader);
    Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
                                                SCEVCheckBlock);

    DT->addNewBlock(SCEVCheckBlock, Pred);
    DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock);

    ReplaceInstWithInst(
        SCEVCheckBlock->getTerminator(),
        BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheckCond));
    // Mark the check as used, to prevent it from being removed during cleanup.
    SCEVCheckCond = nullptr;
    return SCEVCheckBlock;
  }

  /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts
  /// the branches to branch to the vector preheader or \p Bypass, depending on
  /// the generated condition.
  BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass,
                                   BasicBlock *LoopVectorPreHeader) {
    // Check if we generated code that checks in runtime if arrays overlap.
    if (!MemRuntimeCheckCond)
      return nullptr;

    auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
    Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
                                                MemCheckBlock);

    DT->addNewBlock(MemCheckBlock, Pred);
    DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock);
    MemCheckBlock->moveBefore(LoopVectorPreHeader);

    if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
      PL->addBasicBlockToLoop(MemCheckBlock, *LI);

    ReplaceInstWithInst(
        MemCheckBlock->getTerminator(),
        BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond));
    MemCheckBlock->getTerminator()->setDebugLoc(
        Pred->getTerminator()->getDebugLoc());

    // Mark the check as used, to prevent it from being removed during cleanup.
    MemRuntimeCheckCond = nullptr;
    return MemCheckBlock;
  }
};

// Return true if \p OuterLp is an outer loop annotated with hints for explicit
// vectorization. The loop needs to be annotated with #pragma omp simd
// simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
// vector length information is not provided, vectorization is not considered
// explicit. Interleave hints are not allowed either. These limitations will be
// relaxed in the future.
// Please, note that we are currently forced to abuse the pragma 'clang
// vectorize' semantics. This pragma provides *auto-vectorization hints*
// (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
// provides *explicit vectorization hints* (LV can bypass legal checks and
// assume that vectorization is legal). However, both hints are implemented
// using the same metadata (llvm.loop.vectorize, processed by
// LoopVectorizeHints). This will be fixed in the future when the native IR
// representation for pragma 'omp simd' is introduced.
static bool isExplicitVecOuterLoop(Loop *OuterLp,
                                   OptimizationRemarkEmitter *ORE) {
  assert(!OuterLp->isInnermost() && "This is not an outer loop");
  LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);

  // Only outer loops with an explicit vectorization hint are supported.
  // Unannotated outer loops are ignored.
  if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
    return false;

  Function *Fn = OuterLp->getHeader()->getParent();
  if (!Hints.allowVectorization(Fn, OuterLp,
                                true /*VectorizeOnlyWhenForced*/)) {
    LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
    return false;
  }

  if (Hints.getInterleave() > 1) {
    // TODO: Interleave support is future work.
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
                         "outer loops.\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  return true;
}

// Recursively collect, into \p V, the loops of the nest rooted at \p L that
// are candidates for vectorization.
static void collectSupportedLoops(Loop &L, LoopInfo *LI,
                                  OptimizationRemarkEmitter *ORE,
                                  SmallVectorImpl<Loop *> &V) {
  // Collect inner loops and outer loops without irreducible control flow. For
  // now, only collect outer loops that have explicit vectorization hints. If we
  // are stress testing the VPlan H-CFG construction, we collect the outermost
  // loop of every loop nest.
  if (L.isInnermost() || VPlanBuildStressTest ||
      (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
    LoopBlocksRPO RPOT(&L);
    RPOT.perform(LI);
    if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
      V.push_back(&L);
      // TODO: Collect inner loops inside marked outer loops in case
      // vectorization fails for the outer loop. Do not invoke
      // 'containsIrreducibleCFG' again for inner loops when the outer loop is
      // already known to be reducible. We can use an inherited attribute for
      // that.
      return;
    }
  }
  // Otherwise, recurse into the immediate subloops.
  for (Loop *InnerL : L)
    collectSupportedLoops(*InnerL, LI, ORE, V);
}

namespace {

/// The LoopVectorize Pass.
struct LoopVectorize : public FunctionPass {
  /// Pass identification, replacement for typeid
  static char ID;

  /// The concrete (new pass manager) implementation; this legacy-PM pass is a
  /// thin adapter that collects the required analyses and forwards to it.
  LoopVectorizePass Impl;

  explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
                         bool VectorizeOnlyWhenForced = false)
      : FunctionPass(ID),
        Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) {
    initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
  }

  bool runOnFunction(Function &F) override {
    if (skipFunction(F))
      return false;

    // Gather all analyses Impl.runImpl needs. TLI is optional: it is fetched
    // with getAnalysisIfAvailable and may be null.
    auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
    auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
    auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
    auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
    auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
    auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
    auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
    auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
    auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
    auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
    auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
    auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
    auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();

    // Lazily query LoopAccessInfo per loop via the legacy analysis.
    std::function<const LoopAccessInfo &(Loop &)> GetLAA =
        [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };

    return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
                        GetLAA, *ORE, PSI).MadeAnyChange;
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<BlockFrequencyInfoWrapperPass>();
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.addRequired<LoopInfoWrapperPass>();
    AU.addRequired<ScalarEvolutionWrapperPass>();
    AU.addRequired<TargetTransformInfoWrapperPass>();
    AU.addRequired<AAResultsWrapperPass>();
    AU.addRequired<LoopAccessLegacyAnalysis>();
    AU.addRequired<DemandedBitsWrapperPass>();
    AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
    AU.addRequired<InjectTLIMappingsLegacy>();

    // We currently do not preserve loopinfo/dominator analyses with outer loop
    // vectorization. Until this is addressed, mark these analyses as preserved
    // only for non-VPlan-native path.
    // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
    if (!EnableVPlanNativePath) {
      AU.addPreserved<LoopInfoWrapperPass>();
      AU.addPreserved<DominatorTreeWrapperPass>();
    }

    AU.addPreserved<BasicAAWrapperPass>();
    AU.addPreserved<GlobalsAAWrapperPass>();
    AU.addRequired<ProfileSummaryInfoWrapperPass>();
  }
};

} // end anonymous namespace

//===----------------------------------------------------------------------===//
// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
// LoopVectorizationCostModel and LoopVectorizationPlanner.
//===----------------------------------------------------------------------===//

// Broadcast scalar \p V into a vector splat of width VF. The splat is emitted
// in the vector-loop preheader when \p V is loop-invariant and its defining
// block dominates the preheader; otherwise it is emitted at the current
// insert point (inside the vector loop body).
Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
  // We need to place the broadcast of invariant variables outside the loop,
  // but only if it's proven safe to do so. Else, broadcast will be inside
  // vector loop body.
  Instruction *Instr = dyn_cast<Instruction>(V);
  bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
                     (!Instr ||
                      DT->dominates(Instr->getParent(), LoopVectorPreHeader));
  // Place the code for broadcasting invariant variables in the new preheader.
  // InsertPointGuard restores the builder's insert point on scope exit.
  IRBuilder<>::InsertPointGuard Guard(Builder);
  if (SafeToHoist)
    Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());

  // Broadcast the scalar into all locations in the vector.
  Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");

  return Shuf;
}

/// This function adds
/// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
/// to each vector element of Val. The sequence starts at StartIdx.
/// \p BinOp is relevant for FP induction variable.
static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step,
                            Instruction::BinaryOps BinOp, ElementCount VF,
                            IRBuilder<> &Builder) {
  if (VF.isScalar()) {
    // When unrolling and the VF is 1, we only need to add a simple scalar.
    Type *Ty = Val->getType();
    assert(!Ty->isVectorTy() && "Val must be a scalar");

    if (Ty->isFloatingPointTy()) {
      // Floating-point operations inherit FMF via the builder's flags.
      Value *MulOp = Builder.CreateFMul(StartIdx, Step);
      return Builder.CreateBinOp(BinOp, Val, MulOp);
    }
    return Builder.CreateAdd(Val, Builder.CreateMul(StartIdx, Step),
                             "induction");
  }

  // Create and check the types.
  auto *ValVTy = cast<VectorType>(Val->getType());
  ElementCount VLen = ValVTy->getElementCount();

  Type *STy = Val->getType()->getScalarType();
  assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
         "Induction Step must be an integer or FP");
  assert(Step->getType() == STy && "Step has wrong type");

  SmallVector<Constant *, 8> Indices;

  // Create a vector of consecutive numbers from zero to VF.
  // For an FP induction, the step vector is first built over an integer type
  // of the same bit width and converted to FP below.
  VectorType *InitVecValVTy = ValVTy;
  Type *InitVecValSTy = STy;
  if (STy->isFloatingPointTy()) {
    InitVecValSTy =
        IntegerType::get(STy->getContext(), STy->getScalarSizeInBits());
    InitVecValVTy = VectorType::get(InitVecValSTy, VLen);
  }
  Value *InitVec = Builder.CreateStepVector(InitVecValVTy);

  // Splat the StartIdx
  Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx);

  if (STy->isIntegerTy()) {
    InitVec = Builder.CreateAdd(InitVec, StartIdxSplat);
    Step = Builder.CreateVectorSplat(VLen, Step);
    assert(Step->getType() == Val->getType() && "Invalid step vec");
    // FIXME: The newly created binary instructions should contain nsw/nuw
    // flags, which can be found from the original scalar operations.
    Step = Builder.CreateMul(InitVec, Step);
    return Builder.CreateAdd(Val, Step, "induction");
  }

  // Floating point induction.
  assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
         "Binary Opcode should be specified for FP induction");
  InitVec = Builder.CreateUIToFP(InitVec, ValVTy);
  InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat);

  Step = Builder.CreateVectorSplat(VLen, Step);
  Value *MulOp = Builder.CreateFMul(InitVec, Step);
  return Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
}

// Create a vector PHI for an integer or FP induction described by \p II,
// producing one value per unroll part in \p State for \p Def. The initial
// (stepped) value is built in the vector preheader; per-part updates are
// chained with "step.add" and the final update is moved to the latch.
void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
    const InductionDescriptor &II, Value *Step, Value *Start,
    Instruction *EntryVal, VPValue *Def, VPTransformState &State) {
  IRBuilder<> &Builder = State.Builder;
  assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
         "Expected either an induction phi-node or a truncate of it!");

  // Construct the initial value of the vector IV in the vector loop preheader
  auto CurrIP = Builder.saveIP();
  Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
  if (isa<TruncInst>(EntryVal)) {
    assert(Start->getType()->isIntegerTy() &&
           "Truncation requires an integer type");
    auto *TruncType = cast<IntegerType>(EntryVal->getType());
    Step = Builder.CreateTrunc(Step, TruncType);
    Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
  }

  Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0);
  Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start);
  Value *SteppedStart = getStepVector(
      SplatStart, Zero, Step, II.getInductionOpcode(), State.VF, State.Builder);

  // We create vector phi nodes for both integer and floating-point induction
  // variables. Here, we determine the kind of arithmetic we will perform.
  Instruction::BinaryOps AddOp;
  Instruction::BinaryOps MulOp;
  if (Step->getType()->isIntegerTy()) {
    AddOp = Instruction::Add;
    MulOp = Instruction::Mul;
  } else {
    AddOp = II.getInductionOpcode();
    MulOp = Instruction::FMul;
  }

  // Multiply the vectorization factor by the step using integer or
  // floating-point arithmetic as appropriate.
  Type *StepType = Step->getType();
  Value *RuntimeVF;
  if (Step->getType()->isFloatingPointTy())
    RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF);
  else
    RuntimeVF = getRuntimeVF(Builder, StepType, State.VF);
  Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF);

  // Create a vector splat to use in the induction update.
  //
  // FIXME: If the step is non-constant, we create the vector splat with
  //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
  //        handle a constant vector splat.
  Value *SplatVF = isa<Constant>(Mul)
                       ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul))
                       : Builder.CreateVectorSplat(State.VF, Mul);
  Builder.restoreIP(CurrIP);

  // We may need to add the step a number of times, depending on the unroll
  // factor. The last of those goes into the PHI.
  PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
                                    &*LoopVectorBody->getFirstInsertionPt());
  VecInd->setDebugLoc(EntryVal->getDebugLoc());
  Instruction *LastInduction = VecInd;
  for (unsigned Part = 0; Part < UF; ++Part) {
    State.set(Def, LastInduction, Part);

    if (isa<TruncInst>(EntryVal))
      addMetadata(LastInduction, EntryVal);

    LastInduction = cast<Instruction>(
        Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"));
    LastInduction->setDebugLoc(EntryVal->getDebugLoc());
  }

  // Move the last step to the end of the latch block. This ensures consistent
  // placement of all induction updates.
  auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
  auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
  auto *ICmp = cast<Instruction>(Br->getCondition());
  LastInduction->moveBefore(ICmp);
  LastInduction->setName("vec.ind.next");

  VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
  VecInd->addIncoming(LastInduction, LoopVectorLatch);
}

// Return true if the cost model decided \p I stays scalar (or is cheaper
// scalarized) for the chosen VF.
bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
  return Cost->isScalarAfterVectorization(I, VF) ||
         Cost->isProfitableToScalarize(I, VF);
}

// Return true if induction \p IV needs a scalar version: either the IV itself
// will be scalarized, or at least one of its in-loop users will be.
bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
  if (shouldScalarizeInstruction(IV))
    return true;
  auto isScalarInst = [&](User *U) -> bool {
    auto *I = cast<Instruction>(U);
    return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
  };
  return llvm::any_of(IV->users(), isScalarInst);
}

// Widen the integer/FP induction \p IV, emitting (depending on the cost-model
// decisions) a vector IV, per-lane scalar steps, a splat of the scalar IV, or
// a combination of these, recording results for \p Def in \p State.
void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV,
                                                const InductionDescriptor &ID,
                                                Value *Start, TruncInst *Trunc,
                                                VPValue *Def,
                                                VPTransformState &State) {
  IRBuilder<> &Builder = State.Builder;
  assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
         "Primary induction variable must have an integer type");
  assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
  assert(!State.VF.isZero() && "VF must be non-zero");

  // The value from the original loop to which we are mapping the new induction
  // variable.
  Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;

  auto &DL = EntryVal->getModule()->getDataLayout();

  // Generate code for the induction step. Note that induction steps are
  // required to be loop-invariant
  auto CreateStepValue = [&](const SCEV *Step) -> Value * {
    assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) &&
           "Induction step should be loop invariant");
    if (PSE.getSE()->isSCEVable(IV->getType())) {
      // Expand the step expression into IR in the vector preheader.
      SCEVExpander Exp(*PSE.getSE(), DL, "induction");
      return Exp.expandCodeFor(Step, Step->getType(),
                               State.CFG.VectorPreHeader->getTerminator());
    }
    return cast<SCEVUnknown>(Step)->getValue();
  };

  // The scalar value to broadcast. This is derived from the canonical
  // induction variable. If a truncation type is given, truncate the canonical
  // induction variable and step. Otherwise, derive these values from the
  // induction descriptor.
  auto CreateScalarIV = [&](Value *&Step) -> Value * {
    Value *ScalarIV = Induction;
    if (IV != OldInduction) {
      ScalarIV = IV->getType()->isIntegerTy()
                     ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
                     : Builder.CreateCast(Instruction::SIToFP, Induction,
                                          IV->getType());
      ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID,
                                      State.CFG.PrevBB);
      ScalarIV->setName("offset.idx");
    }
    if (Trunc) {
      auto *TruncType = cast<IntegerType>(Trunc->getType());
      assert(Step->getType()->isIntegerTy() &&
             "Truncation requires an integer step");
      ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
      Step = Builder.CreateTrunc(Step, TruncType);
    }
    return ScalarIV;
  };

  // Create the vector values from the scalar IV, in the absence of creating a
  // vector IV.
  auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) {
    Value *Broadcasted = getBroadcastInstrs(ScalarIV);
    for (unsigned Part = 0; Part < UF; ++Part) {
      assert(!State.VF.isScalable() && "scalable vectors not yet supported.");
      Value *StartIdx;
      if (Step->getType()->isFloatingPointTy())
        StartIdx =
            getRuntimeVFAsFloat(Builder, Step->getType(), State.VF * Part);
      else
        StartIdx = getRuntimeVF(Builder, Step->getType(), State.VF * Part);

      Value *EntryPart =
          getStepVector(Broadcasted, StartIdx, Step, ID.getInductionOpcode(),
                        State.VF, State.Builder);
      State.set(Def, EntryPart, Part);
      if (Trunc)
        addMetadata(EntryPart, Trunc);
    }
  };

  // Fast-math-flags propagate from the original induction instruction.
  IRBuilder<>::FastMathFlagGuard FMFG(Builder);
  if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp()))
    Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags());

  // Now do the actual transformations, and start with creating the step value.
  Value *Step = CreateStepValue(ID.getStep());
  if (State.VF.isScalar()) {
    Value *ScalarIV = CreateScalarIV(Step);
    CreateSplatIV(ScalarIV, Step);
    return;
  }

  // Determine if we want a scalar version of the induction variable. This is
  // true if the induction variable itself is not widened, or if it has at
  // least one user in the loop that is not widened.
  auto NeedsScalarIV = needsScalarInduction(EntryVal);
  if (!NeedsScalarIV) {
    createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, State);
    return;
  }

  // Try to create a new independent vector induction variable. If we can't
  // create the phi node, we will splat the scalar induction variable in each
  // loop iteration.
  if (!shouldScalarizeInstruction(EntryVal)) {
    createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, State);
    Value *ScalarIV = CreateScalarIV(Step);
    // Create scalar steps that can be used by instructions we will later
    // scalarize. Note that the addition of the scalar steps will not increase
    // the number of instructions in the loop in the common case prior to
    // InstCombine. We will be trading one vector extract for each scalar step.
    buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, State);
    return;
  }

  // All IV users are scalar instructions, so only emit a scalar IV, not a
  // vectorised IV. Except when we tail-fold, then the splat IV feeds the
  // predicate used by the masked loads/stores.
  Value *ScalarIV = CreateScalarIV(Step);
  if (!Cost->isScalarEpilogueAllowed())
    CreateSplatIV(ScalarIV, Step);
  buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, State);
}

// Compute per-lane scalar values ScalarIV + (Part * VF + Lane) * Step for the
// induction described by \p ID and record them for \p Def in \p State.
void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
                                           Instruction *EntryVal,
                                           const InductionDescriptor &ID,
                                           VPValue *Def,
                                           VPTransformState &State) {
  IRBuilder<> &Builder = State.Builder;
  // We shouldn't have to build scalar steps if we aren't vectorizing.
  assert(State.VF.isVector() && "VF should be greater than one");
  // Get the value type and ensure it and the step have the same integer type.
  Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
  assert(ScalarIVTy == Step->getType() &&
         "Val and Step should have the same type");

  // We build scalar steps for both integer and floating-point induction
  // variables. Here, we determine the kind of arithmetic we will perform.
  Instruction::BinaryOps AddOp;
  Instruction::BinaryOps MulOp;
  if (ScalarIVTy->isIntegerTy()) {
    AddOp = Instruction::Add;
    MulOp = Instruction::Mul;
  } else {
    AddOp = ID.getInductionOpcode();
    MulOp = Instruction::FMul;
  }

  // Determine the number of scalars we need to generate for each unroll
  // iteration. If EntryVal is uniform, we only need to generate the first
  // lane. Otherwise, we generate all VF values.
  bool IsUniform =
      Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), State.VF);
  unsigned Lanes = IsUniform ? 1 : State.VF.getKnownMinValue();
  // Compute the scalar steps and save the results in State.
  // Index arithmetic is done in an integer type of the IV's bit width; FP IVs
  // convert the integer indices with SIToFP below.
  Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(),
                                     ScalarIVTy->getScalarSizeInBits());
  Type *VecIVTy = nullptr;
  Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr;
  if (!IsUniform && State.VF.isScalable()) {
    // For scalable VFs a full vector of steps is materialized once and reused
    // for every unroll part.
    VecIVTy = VectorType::get(ScalarIVTy, State.VF);
    UnitStepVec =
        Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF));
    SplatStep = Builder.CreateVectorSplat(State.VF, Step);
    SplatIV = Builder.CreateVectorSplat(State.VF, ScalarIV);
  }

  for (unsigned Part = 0; Part < State.UF; ++Part) {
    Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part);

    if (!IsUniform && State.VF.isScalable()) {
      auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0);
      auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec);
      if (ScalarIVTy->isFloatingPointTy())
        InitVec = Builder.CreateSIToFP(InitVec, VecIVTy);
      auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep);
      auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul);
      State.set(Def, Add, Part);
      // It's useful to record the lane values too for the known minimum number
      // of elements so we do those below. This improves the code quality when
      // trying to extract the first element, for example.
    }

    if (ScalarIVTy->isFloatingPointTy())
      StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy);

    for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
      Value *StartIdx = Builder.CreateBinOp(
          AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane));
      // The step returned by `createStepForVF` is a runtime-evaluated value
      // when VF is scalable. Otherwise, it should be folded into a Constant.
      assert((State.VF.isScalable() || isa<Constant>(StartIdx)) &&
             "Expected StartIdx to be folded to a constant when VF is not "
             "scalable");
      auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
      auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul);
      State.set(Def, Add, VPIteration(Part, Lane));
    }
  }
}

// Insert the scalar value generated for \p Instance into the corresponding
// lane of the per-part vector value for \p Def, and store it back in State.
void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def,
                                                    const VPIteration &Instance,
                                                    VPTransformState &State) {
  Value *ScalarInst = State.get(Def, Instance);
  Value *VectorValue = State.get(Def, Instance.Part);
  VectorValue = Builder.CreateInsertElement(
      VectorValue, ScalarInst,
      Instance.Lane.getAsRuntimeExpr(State.Builder, VF));
  State.set(Def, VectorValue, Instance.Part);
}

// Return whether we allow using masked interleave-groups (for dealing with
// strided loads/stores that reside in predicated blocks, or for dealing
// with gaps).
static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
  // If an override option has been passed in for interleaved accesses, use it.
  if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
    return EnableMaskedInterleavedMemAccesses;

  return TTI.enableMaskedInterleavedAccessVectorization();
}

// Try to vectorize the interleave group that \p Instr belongs to.
//
// E.g. Translate following interleaved load group (factor = 3):
//   for (i = 0; i < N; i+=3) {
//     R = Pic[i];             // Member of index 0
//     G = Pic[i+1];           // Member of index 1
//     B = Pic[i+2];           // Member of index 2
//     ...
//     ... do something to R, G, B
//   }
// To:
//   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
//   %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9>  ; R elements
//   %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements
//   %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements
//
// Or translate following interleaved store group (factor = 3):
//   for (i = 0; i < N; i+=3) {
//     ... do something to R, G, B
//     Pic[i]   = R;           // Member of index 0
//     Pic[i+1] = G;           // Member of index 1
//     Pic[i+2] = B;           // Member of index 2
//   }
// To:
//   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
//   %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
//   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
//        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
//   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
void InnerLoopVectorizer::vectorizeInterleaveGroup(
    const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs,
    VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues,
    VPValue *BlockInMask) {
  Instruction *Instr = Group->getInsertPos();
  const DataLayout &DL = Instr->getModule()->getDataLayout();

  // Prepare for the vector type of the interleaved load/store.
  Type *ScalarTy = getLoadStoreType(Instr);
  unsigned InterleaveFactor = Group->getFactor();
  assert(!VF.isScalable() && "scalable vectors not yet supported.");
  auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);

  // Prepare for the new pointers.
  SmallVector<Value *, 2> AddrParts;
  unsigned Index = Group->getIndex(Instr);

  // TODO: extend the masked interleaved-group support to reversed access.
  assert((!BlockInMask || !Group->isReverse()) &&
         "Reversed masked interleave-group not supported.");

  // If the group is reverse, adjust the index to refer to the last vector lane
  // instead of the first. We adjust the index from the first vector lane,
  // rather than directly getting the pointer for lane VF - 1, because the
  // pointer operand of the interleaved access is supposed to be uniform. For
  // uniform instructions, we're only required to generate a value for the
  // first vector lane in each unroll iteration.
  if (Group->isReverse())
    Index += (VF.getKnownMinValue() - 1) * Group->getFactor();

  for (unsigned Part = 0; Part < UF; Part++) {
    Value *AddrPart = State.get(Addr, VPIteration(Part, 0));
    setDebugLocFromInst(AddrPart);

    // Notice current instruction could be any index. Need to adjust the address
    // to the member of index 0.
    //
    // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
    //       b = A[i];       // Member of index 0
    // Current pointer is pointed to A[i+1], adjust it to A[i].
    //
    // E.g.  A[i+1] = a;     // Member of index 1
    //       A[i]   = b;     // Member of index 0
    //       A[i+2] = c;     // Member of index 2 (Current instruction)
    // Current pointer is pointed to A[i+2], adjust it to A[i].

    bool InBounds = false;
    if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
      InBounds = gep->isInBounds();
    AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
    cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);

    // Cast to the vector pointer type.
    unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
    Type *PtrTy = VecTy->getPointerTo(AddressSpace);
    AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
  }

  setDebugLocFromInst(Instr);
  Value *PoisonVec = PoisonValue::get(VecTy);

  Value *MaskForGaps = nullptr;
  if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
    MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
    assert(MaskForGaps && "Mask for Gaps is required but it is null");
  }

  // Vectorize the interleaved load group.
  if (isa<LoadInst>(Instr)) {
    // For each unroll part, create a wide load for the group.
    SmallVector<Value *, 2> NewLoads;
    for (unsigned Part = 0; Part < UF; Part++) {
      Instruction *NewLoad;
      if (BlockInMask || MaskForGaps) {
        assert(useMaskedInterleavedAccesses(*TTI) &&
               "masked interleaved groups are not allowed.");
        Value *GroupMask = MaskForGaps;
        if (BlockInMask) {
          // Replicate the block mask across the interleave factor, then
          // combine with the gap mask (if any).
          Value *BlockInMaskPart = State.get(BlockInMask, Part);
          Value *ShuffledMask = Builder.CreateShuffleVector(
              BlockInMaskPart,
              createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
              "interleaved.mask");
          GroupMask = MaskForGaps
                          ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
                                                MaskForGaps)
                          : ShuffledMask;
        }
        NewLoad =
            Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(),
                                     GroupMask, PoisonVec, "wide.masked.vec");
      } else
        NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
                                            Group->getAlign(), "wide.vec");
      Group->addMetadata(NewLoad);
      NewLoads.push_back(NewLoad);
    }

    // For each member in the group, shuffle out the appropriate data from the
    // wide loads.
    unsigned J = 0;
    for (unsigned I = 0; I < InterleaveFactor; ++I) {
      Instruction *Member = Group->getMember(I);

      // Skip the gaps in the group.
      if (!Member)
        continue;

      auto StrideMask =
          createStrideMask(I, InterleaveFactor, VF.getKnownMinValue());
      for (unsigned Part = 0; Part < UF; Part++) {
        Value *StridedVec = Builder.CreateShuffleVector(
            NewLoads[Part], StrideMask, "strided.vec");

        // If this member has different type, cast the result type.
        if (Member->getType() != ScalarTy) {
          assert(!VF.isScalable() && "VF is assumed to be non scalable.");
          VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
          StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
        }

        if (Group->isReverse())
          StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse");

        State.set(VPDefs[J], StridedVec, Part);
      }
      ++J;
    }
    return;
  }

  // The sub vector type for current instruction.
  auto *SubVT = VectorType::get(ScalarTy, VF);

  // Vectorize the interleaved store group.
  MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
  assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) &&
         "masked interleaved groups are not allowed.");
  assert((!MaskForGaps || !VF.isScalable()) &&
         "masking gaps for scalable vectors is not yet supported.");
  for (unsigned Part = 0; Part < UF; Part++) {
    // Collect the stored vector from each member.
    SmallVector<Value *, 4> StoredVecs;
    for (unsigned i = 0; i < InterleaveFactor; i++) {
      assert((Group->getMember(i) || MaskForGaps) &&
             "Fail to get a member from an interleaved store group");
      Instruction *Member = Group->getMember(i);

      // Skip the gaps in the group.
      if (!Member) {
        Value *Undef = PoisonValue::get(SubVT);
        StoredVecs.push_back(Undef);
        continue;
      }

      Value *StoredVec = State.get(StoredValues[i], Part);

      if (Group->isReverse())
        StoredVec = Builder.CreateVectorReverse(StoredVec, "reverse");

      // If this member has different type, cast it to a unified type.

      if (StoredVec->getType() != SubVT)
        StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);

      StoredVecs.push_back(StoredVec);
    }

    // Concatenate all vectors into a wide vector.
    Value *WideVec = concatenateVectors(Builder, StoredVecs);

    // Interleave the elements in the wide vector.
    Value *IVec = Builder.CreateShuffleVector(
        WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor),
        "interleaved.vec");

    Instruction *NewStoreInstr;
    if (BlockInMask || MaskForGaps) {
      Value *GroupMask = MaskForGaps;
      if (BlockInMask) {
        Value *BlockInMaskPart = State.get(BlockInMask, Part);
        Value *ShuffledMask = Builder.CreateShuffleVector(
            BlockInMaskPart,
            createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
            "interleaved.mask");
        GroupMask = MaskForGaps ? Builder.CreateBinOp(Instruction::And,
                                                      ShuffledMask, MaskForGaps)
                                : ShuffledMask;
      }
      NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part],
                                                Group->getAlign(), GroupMask);
    } else
      NewStoreInstr =
          Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());

    Group->addMetadata(NewStoreInstr);
  }
}

// Emit a scalar clone of \p Instr for the given (Part, Lane) \p Instance,
// wiring its operands to the corresponding scalar values from \p State.
void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
                                               VPReplicateRecipe *RepRecipe,
                                               const VPIteration &Instance,
                                               bool IfPredicateInstr,
                                               VPTransformState &State) {
  assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");

  // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for
  // the first lane and part.
  if (isa<NoAliasScopeDeclInst>(Instr))
    if (!Instance.isFirstIteration())
      return;

  setDebugLocFromInst(Instr);

  // Does this instruction return a value ?
  bool IsVoidRetTy = Instr->getType()->isVoidTy();

  Instruction *Cloned = Instr->clone();
  if (!IsVoidRetTy)
    Cloned->setName(Instr->getName() + ".cloned");

  // If the scalarized instruction contributes to the address computation of a
  // widen masked load/store which was in a basic block that needed predication
  // and is not predicated after vectorization, we can't propagate
  // poison-generating flags (nuw/nsw, exact, inbounds, etc.). The scalarized
  // instruction could feed a poison value to the base address of the widen
  // load/store.
  if (State.MayGeneratePoisonRecipes.contains(RepRecipe))
    Cloned->dropPoisonGeneratingFlags();

  State.Builder.SetInsertPoint(Builder.GetInsertBlock(),
                               Builder.GetInsertPoint());
  // Replace the operands of the cloned instructions with their scalar
  // equivalents in the new loop.
2974 for (auto &I : enumerate(RepRecipe->operands())) { 2975 auto InputInstance = Instance; 2976 VPValue *Operand = I.value(); 2977 if (State.Plan->isUniformAfterVectorization(Operand)) 2978 InputInstance.Lane = VPLane::getFirstLane(); 2979 Cloned->setOperand(I.index(), State.get(Operand, InputInstance)); 2980 } 2981 addNewMetadata(Cloned, Instr); 2982 2983 // Place the cloned scalar in the new loop. 2984 Builder.Insert(Cloned); 2985 2986 State.set(RepRecipe, Cloned, Instance); 2987 2988 // If we just cloned a new assumption, add it the assumption cache. 2989 if (auto *II = dyn_cast<AssumeInst>(Cloned)) 2990 AC->registerAssumption(II); 2991 2992 // End if-block. 2993 if (IfPredicateInstr) 2994 PredicatedInstructions.push_back(Cloned); 2995 } 2996 2997 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start, 2998 Value *End, Value *Step, 2999 Instruction *DL) { 3000 BasicBlock *Header = L->getHeader(); 3001 BasicBlock *Latch = L->getLoopLatch(); 3002 // As we're just creating this loop, it's possible no latch exists 3003 // yet. If so, use the header as this will be a single block loop. 3004 if (!Latch) 3005 Latch = Header; 3006 3007 IRBuilder<> B(&*Header->getFirstInsertionPt()); 3008 Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction); 3009 setDebugLocFromInst(OldInst, &B); 3010 auto *Induction = B.CreatePHI(Start->getType(), 2, "index"); 3011 3012 B.SetInsertPoint(Latch->getTerminator()); 3013 setDebugLocFromInst(OldInst, &B); 3014 3015 // Create i+1 and fill the PHINode. 3016 // 3017 // If the tail is not folded, we know that End - Start >= Step (either 3018 // statically or through the minimum iteration checks). We also know that both 3019 // Start % Step == 0 and End % Step == 0. We exit the vector loop if %IV + 3020 // %Step == %End. Hence we must exit the loop before %IV + %Step unsigned 3021 // overflows and we can mark the induction increment as NUW. 
  Value *Next = B.CreateAdd(Induction, Step, "index.next",
                            /*NUW=*/!Cost->foldTailByMasking(), /*NSW=*/false);
  Induction->addIncoming(Start, L->getLoopPreheader());
  Induction->addIncoming(Next, Latch);
  // Create the compare.
  Value *ICmp = B.CreateICmpEQ(Next, End);
  B.CreateCondBr(ICmp, L->getUniqueExitBlock(), Header);

  // Now we have two terminators. Remove the old one from the block.
  Latch->getTerminator()->eraseFromParent();

  return Induction;
}

// Return the scalar trip count of \p L, computing and caching it on first use
// by expanding (backedge-taken count + 1) into the loop preheader.
Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
  if (TripCount)
    return TripCount;

  assert(L && "Create Trip Count for null loop.");
  IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
  // Find the loop boundaries.
  ScalarEvolution *SE = PSE.getSE();
  const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
  assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && "Invalid loop count");

  Type *IdxTy = Legal->getWidestInductionType();
  assert(IdxTy && "No type for induction");

  // The exit count might have the type of i64 while the phi is i32. This can
  // happen if we have an induction variable that is sign extended before the
  // compare. The only way that we get a backedge taken count is that the
  // induction variable was signed and as such will not overflow. In such a case
  // truncation is legal.
  if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) >
      IdxTy->getPrimitiveSizeInBits())
    BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
  BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);

  // Get the total trip count from the count by adding 1.
  const SCEV *ExitCount = SE->getAddExpr(
      BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));

  const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();

  // Expand the trip count and place the new instructions in the preheader.
  // Notice that the pre-header does not change, only the loop body.
  SCEVExpander Exp(*SE, DL, "induction");

  // Count holds the overall loop count (N).
  TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
                                L->getLoopPreheader()->getTerminator());

  // The expander may produce a pointer-typed count (e.g. for pointer IVs);
  // normalize to the widest induction integer type.
  if (TripCount->getType()->isPointerTy())
    TripCount =
        CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
                                    L->getLoopPreheader()->getTerminator());

  return TripCount;
}

// Return (caching on first use) the trip count of the vector loop: the scalar
// trip count rounded down to a multiple of VF * UF — or rounded up when the
// tail is folded by masking.
Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
  if (VectorTripCount)
    return VectorTripCount;

  Value *TC = getOrCreateTripCount(L);
  IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());

  Type *Ty = TC->getType();
  // This is where we can make the step a runtime constant.
  Value *Step = createStepForVF(Builder, Ty, VF, UF);

  // If the tail is to be folded by masking, round the number of iterations N
  // up to a multiple of Step instead of rounding down. This is done by first
  // adding Step-1 and then rounding down. Note that it's ok if this addition
  // overflows: the vector induction variable will eventually wrap to zero given
  // that it starts at zero and its Step is a power of two; the loop will then
  // exit, with the last early-exit vector comparison also producing all-true.
  if (Cost->foldTailByMasking()) {
    assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
           "VF*UF must be a power of 2 when folding tail by masking");
    assert(!VF.isScalable() &&
           "Tail folding not yet supported for scalable vectors");
    TC = Builder.CreateAdd(
        TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up");
  }

  // Now we need to generate the expression for the part of the loop that the
  // vectorized body will execute. This is equal to N - (N % Step) if scalar
  // iterations are not required for correctness, or N - Step, otherwise. Step
  // is equal to the vectorization factor (number of SIMD elements) times the
  // unroll factor (number of SIMD instructions).
  Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");

  // There are cases where we *must* run at least one iteration in the remainder
  // loop. See the cost model for when this can happen. If the step evenly
  // divides the trip count, we set the remainder to be equal to the step. If
  // the step does not evenly divide the trip count, no adjustment is necessary
  // since there will already be scalar iterations. Note that the minimum
  // iterations check ensures that N >= Step.
  if (Cost->requiresScalarEpilogue(VF)) {
    auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
    R = Builder.CreateSelect(IsZero, Step, R);
  }

  VectorTripCount = Builder.CreateSub(TC, R, "n.vec");

  return VectorTripCount;
}

// Cast fixed-width vector \p V to \p DstVTy. When the element types cannot be
// bitcast directly (pointer <-> floating point), go through an intermediate
// integer vector of matching element size.
Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                                   const DataLayout &DL) {
  // Verify that V is a vector type with same number of elements as DstVTy.
  auto *DstFVTy = cast<FixedVectorType>(DstVTy);
  unsigned VF = DstFVTy->getNumElements();
  auto *SrcVecTy = cast<FixedVectorType>(V->getType());
  assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match");
  Type *SrcElemTy = SrcVecTy->getElementType();
  Type *DstElemTy = DstFVTy->getElementType();
  assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
         "Vector elements must have same size");

  // Do a direct cast if element types are castable.
  if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
    return Builder.CreateBitOrPointerCast(V, DstFVTy);
  }
  // V cannot be directly casted to desired vector type.
  // May happen when V is a floating point vector but DstVTy is a vector of
  // pointers or vice-versa. Handle this using a two-step bitcast using an
  // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
  assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
         "Only one type should be a pointer type");
  assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
         "Only one type should be a floating point type");
  Type *IntTy =
      IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
  auto *VecIntTy = FixedVectorType::get(IntTy, VF);
  Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
  return Builder.CreateBitOrPointerCast(CastVal, DstFVTy);
}

// Emit the minimum-trip-count guard: branch to \p Bypass (the scalar loop)
// when the scalar trip count is too small for even one vector iteration.
void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
                                                         BasicBlock *Bypass) {
  Value *Count = getOrCreateTripCount(L);
  // Reuse existing vector loop preheader for TC checks.
  // Note that new preheader block is generated for vector loop.
  BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
  IRBuilder<> Builder(TCCheckBlock->getTerminator());

  // Generate code to check if the loop's trip count is less than VF * UF, or
  // equal to it in case a scalar epilogue is required; this implies that the
  // vector trip count is zero. This check also covers the case where adding one
  // to the backedge-taken count overflowed leading to an incorrect trip count
  // of zero. In this case we will also jump to the scalar loop.
  auto P = Cost->requiresScalarEpilogue(VF) ? ICmpInst::ICMP_ULE
                                            : ICmpInst::ICMP_ULT;

  // If tail is to be folded, vector loop takes care of all iterations.
  Value *CheckMinIters = Builder.getFalse();
  if (!Cost->foldTailByMasking()) {
    Value *Step = createStepForVF(Builder, Count->getType(), VF, UF);
    CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check");
  }
  // Create new preheader for vector loop.
  LoopVectorPreHeader =
      SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
                 "vector.ph");

  assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
                               DT->getNode(Bypass)->getIDom()) &&
         "TC check is expected to dominate Bypass");

  // Update dominator for Bypass & LoopExit (if needed).
  DT->changeImmediateDominator(Bypass, TCCheckBlock);
  if (!Cost->requiresScalarEpilogue(VF))
    // If there is an epilogue which must run, there's no edge from the
    // middle block to exit blocks and thus no need to update the immediate
    // dominator of the exit blocks.
    DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);

  ReplaceInstWithInst(
      TCCheckBlock->getTerminator(),
      BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
  LoopBypassBlocks.push_back(TCCheckBlock);
}

// Emit (via RTChecks) the block that tests the SCEV predicates assumed during
// analysis; returns the new check block, or nullptr if no checks were needed.
BasicBlock *InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {

  BasicBlock *const SCEVCheckBlock =
      RTChecks.emitSCEVChecks(L, Bypass, LoopVectorPreHeader, LoopExitBlock);
  if (!SCEVCheckBlock)
    return nullptr;

  assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
           (OptForSizeBasedOnProfile &&
            Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
         "Cannot SCEV check stride or overflow when optimizing for size");


  // Update dominator only if this is first RT check.
  if (LoopBypassBlocks.empty()) {
    DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
    if (!Cost->requiresScalarEpilogue(VF))
      // If there is an epilogue which must run, there's no edge from the
      // middle block to exit blocks and thus no need to update the immediate
      // dominator of the exit blocks.
      DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
  }

  LoopBypassBlocks.push_back(SCEVCheckBlock);
  AddedSafetyChecks = true;
  return SCEVCheckBlock;
}

// Emit the runtime memory-overlap (aliasing) checks; returns the check block,
// or nullptr when no checks are required (or on the VPlan-native path).
BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L,
                                                      BasicBlock *Bypass) {
  // VPlan-native path does not do any analysis for runtime checks currently.
  if (EnableVPlanNativePath)
    return nullptr;

  BasicBlock *const MemCheckBlock =
      RTChecks.emitMemRuntimeChecks(L, Bypass, LoopVectorPreHeader);

  // Check if we generated code that checks in runtime if arrays overlap. We put
  // the checks into a separate block to make the more common case of few
  // elements faster.
  if (!MemCheckBlock)
    return nullptr;

  if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
    assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
           "Cannot emit memory checks when optimizing for size, unless forced "
           "to vectorize.");
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
                                        L->getStartLoc(), L->getHeader())
             << "Code-size may be reduced by not forcing "
                "vectorization, or by source-code modifications "
                "eliminating the need for runtime checks "
                "(e.g., adding 'restrict').";
    });
  }

  LoopBypassBlocks.push_back(MemCheckBlock);

  AddedSafetyChecks = true;

  // We currently don't use LoopVersioning for the actual loop cloning but we
  // still use it to add the noalias metadata.
  LVer = std::make_unique<LoopVersioning>(
      *Legal->getLAI(),
      Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI,
      DT, PSE.getSE());
  LVer->prepareNoAliasMetadata();
  return MemCheckBlock;
}

// Compute StartValue + Index * Step for induction descriptor \p ID, emitting
// the arithmetic with \p B. Handles integer, pointer and FP inductions;
// returns nullptr for IK_NoInduction.
Value *InnerLoopVectorizer::emitTransformedIndex(
    IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
    const InductionDescriptor &ID, BasicBlock *VectorHeader) const {

  SCEVExpander Exp(*SE, DL, "induction");
  auto Step = ID.getStep();
  auto StartValue = ID.getStartValue();
  assert(Index->getType()->getScalarType() == Step->getType() &&
         "Index scalar type does not match StepValue type");

  // Note: the IR at this point is broken. We cannot use SE to create any new
  // SCEV and then expand it, hoping that SCEV's simplification will give us
  // a more optimal code. Unfortunately, attempt of doing so on invalid IR may
  // lead to various SCEV crashes. So all we can do is to use builder and rely
  // on InstCombine for future simplifications. Here we handle some trivial
  // cases only.
  // Folding helper: elide the add entirely when either operand is a constant
  // zero, so we don't emit dead arithmetic into the (unsimplified) IR.
  auto CreateAdd = [&B](Value *X, Value *Y) {
    assert(X->getType() == Y->getType() && "Types don't match!");
    if (auto *CX = dyn_cast<ConstantInt>(X))
      if (CX->isZero())
        return Y;
    if (auto *CY = dyn_cast<ConstantInt>(Y))
      if (CY->isZero())
        return X;
    return B.CreateAdd(X, Y);
  };

  // We allow X to be a vector type, in which case Y will potentially be
  // splatted into a vector with the same element count.
  auto CreateMul = [&B](Value *X, Value *Y) {
    assert(X->getType()->getScalarType() == Y->getType() &&
           "Types don't match!");
    if (auto *CX = dyn_cast<ConstantInt>(X))
      if (CX->isOne())
        return Y;
    if (auto *CY = dyn_cast<ConstantInt>(Y))
      if (CY->isOne())
        return X;
    VectorType *XVTy = dyn_cast<VectorType>(X->getType());
    if (XVTy && !isa<VectorType>(Y->getType()))
      Y = B.CreateVectorSplat(XVTy->getElementCount(), Y);
    return B.CreateMul(X, Y);
  };

  // Get a suitable insert point for SCEV expansion. For blocks in the vector
  // loop, choose the end of the vector loop header (=VectorHeader), because
  // the DomTree is not kept up-to-date for additional blocks generated in the
  // vector loop. By using the header as insertion point, we guarantee that the
  // expanded instructions dominate all their uses.
  auto GetInsertPoint = [this, &B, VectorHeader]() {
    BasicBlock *InsertBB = B.GetInsertPoint()->getParent();
    if (InsertBB != LoopVectorBody &&
        LI->getLoopFor(VectorHeader) == LI->getLoopFor(InsertBB))
      return VectorHeader->getTerminator();
    return &*B.GetInsertPoint();
  };

  switch (ID.getKind()) {
  case InductionDescriptor::IK_IntInduction: {
    assert(!isa<VectorType>(Index->getType()) &&
           "Vector indices not supported for integer inductions yet");
    assert(Index->getType() == StartValue->getType() &&
           "Index type does not match StartValue type");
    // For a step of -1, Start - Index is cheaper than Start + Index * (-1).
    if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
      return B.CreateSub(StartValue, Index);
    auto *Offset = CreateMul(
        Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()));
    return CreateAdd(StartValue, Offset);
  }
  case InductionDescriptor::IK_PtrInduction: {
    assert(isa<SCEVConstant>(Step) &&
           "Expected constant step for pointer induction");
    return B.CreateGEP(
        ID.getElementType(), StartValue,
        CreateMul(Index,
                  Exp.expandCodeFor(Step, Index->getType()->getScalarType(),
                                    GetInsertPoint())));
  }
  case InductionDescriptor::IK_FpInduction: {
    assert(!isa<VectorType>(Index->getType()) &&
           "Vector indices not supported for FP inductions yet");
    assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
    auto InductionBinOp = ID.getInductionBinOp();
    assert(InductionBinOp &&
           (InductionBinOp->getOpcode() == Instruction::FAdd ||
            InductionBinOp->getOpcode() == Instruction::FSub) &&
           "Original bin op should be defined for FP induction");

    Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
    Value *MulExp = B.CreateFMul(StepValue, Index);
    return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
                         "induction");
  }
  case InductionDescriptor::IK_NoInduction:
    return nullptr;
  }
  llvm_unreachable("invalid enum");
}

// Split the original preheader to build the skeleton CFG for the vectorized
// loop: middle block, scalar preheader and (empty) vector body. Registers and
// returns the new vector Loop; the body is populated later.
Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
  LoopScalarBody = OrigLoop->getHeader();
  LoopVectorPreHeader = OrigLoop->getLoopPreheader();
  assert(LoopVectorPreHeader && "Invalid loop structure");
  LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
  assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF)) &&
         "multiple exit loop without required epilogue?");

  LoopMiddleBlock =
      SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
                 LI, nullptr, Twine(Prefix) + "middle.block");
  LoopScalarPreHeader =
      SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
                 nullptr, Twine(Prefix) + "scalar.ph");

  auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();

  // Set up the middle block terminator. Two cases:
  // 1) If we know that we must execute the scalar epilogue, emit an
  //    unconditional branch.
  // 2) Otherwise, we must have a single unique exit block (due to how we
  //    implement the multiple exit case). In this case, set up a conditonal
  //    branch from the middle block to the loop scalar preheader, and the
  //    exit block. completeLoopSkeleton will update the condition to use an
  //    iteration check, if required to decide whether to execute the remainder.
  BranchInst *BrInst = Cost->requiresScalarEpilogue(VF) ?
    BranchInst::Create(LoopScalarPreHeader) :
    BranchInst::Create(LoopExitBlock, LoopScalarPreHeader,
                       Builder.getTrue());
  BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
  ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);

  // We intentionally don't let SplitBlock to update LoopInfo since
  // LoopVectorBody should belong to another loop than LoopVectorPreHeader.
  // LoopVectorBody is explicitly added to the correct place few lines later.
  LoopVectorBody =
      SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
                 nullptr, nullptr, Twine(Prefix) + "vector.body");

  // Update dominator for loop exit.
  if (!Cost->requiresScalarEpilogue(VF))
    // If there is an epilogue which must run, there's no edge from the
    // middle block to exit blocks and thus no need to update the immediate
    // dominator of the exit blocks.
    DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);

  // Create and register the new vector loop.
  Loop *Lp = LI->AllocateLoop();
  Loop *ParentLoop = OrigLoop->getParentLoop();

  // Insert the new loop into the loop nest and register the new basic blocks
  // before calling any utilities such as SCEV that require valid LoopInfo.
  if (ParentLoop) {
    ParentLoop->addChildLoop(Lp);
  } else {
    LI->addTopLevelLoop(Lp);
  }
  Lp->addBasicBlockToLoop(LoopVectorBody, *LI);
  return Lp;
}

// Create the "bc.resume.val" phis in the scalar preheader that give every
// induction its correct start value on entry to the scalar (remainder) loop:
// the transformed end value when arriving from the middle block, the original
// start value when arriving from a bypass block, and — if provided — a
// separately computed value for the AdditionalBypass edge.
void InnerLoopVectorizer::createInductionResumeValues(
    Loop *L, Value *VectorTripCount,
    std::pair<BasicBlock *, Value *> AdditionalBypass) {
  assert(VectorTripCount && L && "Expected valid arguments");
  assert(((AdditionalBypass.first && AdditionalBypass.second) ||
          (!AdditionalBypass.first && !AdditionalBypass.second)) &&
         "Inconsistent information about additional bypass.");
  // We are going to resume the execution of the scalar loop.
  // Go over all of the induction variables that we found and fix the
  // PHIs that are left in the scalar version of the loop.
  // The starting values of PHI nodes depend on the counter of the last
  // iteration in the vectorized loop.
  // If we come from a bypass edge then we need to start from the original
  // start value.
  for (auto &InductionEntry : Legal->getInductionVars()) {
    PHINode *OrigPhi = InductionEntry.first;
    InductionDescriptor II = InductionEntry.second;

    // Create phi nodes to merge from the backedge-taken check block.
    PHINode *BCResumeVal =
        PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
                        LoopScalarPreHeader->getTerminator());
    // Copy original phi DL over to the new one.
    BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
    Value *&EndValue = IVEndValues[OrigPhi];
    Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
    if (OrigPhi == OldInduction) {
      // We know what the end value is.
      EndValue = VectorTripCount;
    } else {
      IRBuilder<> B(L->getLoopPreheader()->getTerminator());

      // Fast-math-flags propagate from the original induction instruction.
      if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
        B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());

      Type *StepType = II.getStep()->getType();
      Instruction::CastOps CastOp =
          CastInst::getCastOpcode(VectorTripCount, true, StepType, true);
      Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd");
      const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout();
      EndValue =
          emitTransformedIndex(B, CRD, PSE.getSE(), DL, II, LoopVectorBody);
      EndValue->setName("ind.end");

      // Compute the end value for the additional bypass (if applicable).
      if (AdditionalBypass.first) {
        B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt()));
        CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true,
                                         StepType, true);
        CRD =
            B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd");
        EndValueFromAdditionalBypass =
            emitTransformedIndex(B, CRD, PSE.getSE(), DL, II, LoopVectorBody);
        EndValueFromAdditionalBypass->setName("ind.end");
      }
    }
    // The new PHI merges the original incoming value, in case of a bypass,
    // or the value at the end of the vectorized loop.
    BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);

    // Fix the scalar body counter (PHI node).
    // The old induction's phi node in the scalar body needs the truncated
    // value.
    for (BasicBlock *BB : LoopBypassBlocks)
      BCResumeVal->addIncoming(II.getStartValue(), BB);

    if (AdditionalBypass.first)
      BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first,
                                            EndValueFromAdditionalBypass);

    OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
  }
}

// Finish the skeleton: install the middle-block iteration check (when one is
// needed), position the builder at the top of the vector body, and carry
// over/update the loop metadata. Returns the vector loop preheader.
BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L,
                                                      MDNode *OrigLoopID) {
  assert(L && "Expected valid loop.");

  // The trip counts should be cached by now.
  Value *Count = getOrCreateTripCount(L);
  Value *VectorTripCount = getOrCreateVectorTripCount(L);

  auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();

  // Add a check in the middle block to see if we have completed
  // all of the iterations in the first vector loop. Three cases:
  // 1) If we require a scalar epilogue, there is no conditional branch as
  //    we unconditionally branch to the scalar preheader. Do nothing.
  // 2) If (N - N%VF) == N, then we *don't* need to run the remainder.
  //    Thus if tail is to be folded, we know we don't need to run the
  //    remainder and we can use the previous value for the condition (true).
  // 3) Otherwise, construct a runtime check.
  if (!Cost->requiresScalarEpilogue(VF) && !Cost->foldTailByMasking()) {
    Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
                                        Count, VectorTripCount, "cmp.n",
                                        LoopMiddleBlock->getTerminator());

    // Here we use the same DebugLoc as the scalar loop latch terminator instead
    // of the corresponding compare because they may have ended up with
    // different line numbers and we want to avoid awkward line stepping while
    // debugging. Eg. if the compare has got a line number inside the loop.
    CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc());
    cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN);
  }

  // Get ready to start creating new instructions into the vectorized body.
  assert(LoopVectorPreHeader == L->getLoopPreheader() &&
         "Inconsistent vector loop preheader");
  Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());

  Optional<MDNode *> VectorizedLoopID =
      makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
                                      LLVMLoopVectorizeFollowupVectorized});
  if (VectorizedLoopID.hasValue()) {
    L->setLoopID(VectorizedLoopID.getValue());

    // Do not setAlreadyVectorized if loop attributes have been defined
    // explicitly.
    return LoopVectorPreHeader;
  }

  // Keep all loop hints from the original loop on the vector loop (we'll
  // replace the vectorizer-specific hints below).
  if (MDNode *LID = OrigLoop->getLoopID())
    L->setLoopID(LID);

  // Mark the vector loop as already vectorized so it isn't revisited.
  LoopVectorizeHints Hints(L, true, *ORE, TTI);
  Hints.setAlreadyVectorized();

#ifdef EXPENSIVE_CHECKS
  assert(DT->verify(DominatorTree::VerificationLevel::Fast));
  LI->verify(*DT);
#endif

  return LoopVectorPreHeader;
}

BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
  /*
   In this function we generate a new loop. The new loop will contain
   the vectorized instructions while the old loop will continue to run the
   scalar remainder.

       [ ] <-- loop iteration number check.
    /   |
   /    v
  |    [ ] <-- vector loop bypass (may consist of multiple blocks).
  |  /  |
  | /   v
  ||   [ ]     <-- vector pre header.
  |/    |
  |     v
  |    [  ] \
  |    [  ]_|   <-- vector loop.
  |     |
  |     v
  \   -[ ]   <--- middle-block.
   \/   |
   /\   v
  | ->[ ]     <--- new preheader.
  |    |
 (opt)  v      <-- edge from middle to exit iff epilogue is not required.
  |   [ ] \
  |   [ ]_|   <-- old scalar loop to handle remainder (scalar epilogue).
   \   |
    \  v
     >[ ]     <-- exit block(s).
   ...
   */

  // Get the metadata of the original loop before it gets modified.
  MDNode *OrigLoopID = OrigLoop->getLoopID();

  // Workaround! Compute the trip count of the original loop and cache it
  // before we start modifying the CFG. This code has a systemic problem
  // wherein it tries to run analysis over partially constructed IR; this is
  // wrong, and not simply for SCEV. The trip count of the original loop
  // simply happens to be prone to hitting this in practice. In theory, we
  // can hit the same issue for any SCEV, or ValueTracking query done during
  // mutation. See PR49900.
  getOrCreateTripCount(OrigLoop);

  // Create an empty vector loop, and prepare basic blocks for the runtime
  // checks.
  Loop *Lp = createVectorLoopSkeleton("");

  // Now, compare the new count to zero. If it is zero skip the vector loop and
  // jump to the scalar loop. This check also covers the case where the
  // backedge-taken count is uint##_max: adding one to it will overflow leading
  // to an incorrect trip count of zero. In this (rare) case we will also jump
  // to the scalar loop.
  emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader);

  // Generate the code to check any assumptions that we've made for SCEV
  // expressions.
  emitSCEVChecks(Lp, LoopScalarPreHeader);

  // Generate the code that checks in runtime if arrays overlap. We put the
  // checks into a separate block to make the more common case of few elements
  // faster.
  emitMemRuntimeChecks(Lp, LoopScalarPreHeader);

  // Some loops have a single integer induction variable, while other loops
  // don't. One example is c++ iterators that often have multiple pointer
  // induction variables. In the code below we also support a case where we
  // don't have a single induction variable.
  //
  // We try to obtain an induction variable from the original loop as hard
  // as possible. However if we don't find one that:
  //   - is an integer
  //   - counts from zero, stepping by one
  //   - is the size of the widest induction variable type
  // then we create a new one.
  OldInduction = Legal->getPrimaryInduction();
  Type *IdxTy = Legal->getWidestInductionType();
  Value *StartIdx = ConstantInt::get(IdxTy, 0);
  // The loop step is equal to the vectorization factor (num of SIMD elements)
  // times the unroll factor (num of SIMD instructions).
  Builder.SetInsertPoint(&*Lp->getHeader()->getFirstInsertionPt());
  Value *Step = createStepForVF(Builder, IdxTy, VF, UF);
  Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
  Induction =
      createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
                              getDebugLocFromInstOrOperands(OldInduction));

  // Emit phis for the new starting index of the scalar loop.
  createInductionResumeValues(Lp, CountRoundDown);

  return completeLoopSkeleton(Lp, OrigLoopID);
}

// Fix up external users of the induction variable. At this point, we are
// in LCSSA form, with all external PHIs that use the IV having one input value,
// coming from the remainder loop. We need those PHIs to also have a correct
// value for the IV when arriving directly from the middle block.
void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
                                       const InductionDescriptor &II,
                                       Value *CountRoundDown, Value *EndValue,
                                       BasicBlock *MiddleBlock) {
  // There are two kinds of external IV usages - those that use the value
  // computed in the last iteration (the PHI) and those that use the penultimate
  // value (the value that feeds into the phi from the loop latch).
  // We allow both, but they, obviously, have different values.

  assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");

  DenseMap<Value *, Value *> MissingVals;

  // An external user of the last iteration's value should see the value that
  // the remainder loop uses to initialize its own IV.
  Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
  for (User *U : PostInc->users()) {
    Instruction *UI = cast<Instruction>(U);
    if (!OrigLoop->contains(UI)) {
      assert(isa<PHINode>(UI) && "Expected LCSSA form");
      MissingVals[UI] = EndValue;
    }
  }

  // An external user of the penultimate value need to see EndValue - Step.
  // The simplest way to get this is to recompute it from the constituent SCEVs,
  // that is Start + (Step * (CRD - 1)).
  for (User *U : OrigPhi->users()) {
    auto *UI = cast<Instruction>(U);
    if (!OrigLoop->contains(UI)) {
      const DataLayout &DL =
          OrigLoop->getHeader()->getModule()->getDataLayout();
      assert(isa<PHINode>(UI) && "Expected LCSSA form");

      IRBuilder<> B(MiddleBlock->getTerminator());

      // Fast-math-flags propagate from the original induction instruction.
      if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
        B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());

      Value *CountMinusOne = B.CreateSub(
          CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
      // FP inductions need an int->FP conversion of the counter; integer
      // inductions only need a width adjustment to the step's type.
      Value *CMO =
          !II.getStep()->getType()->isIntegerTy()
              ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
                             II.getStep()->getType())
              : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
      CMO->setName("cast.cmo");
      Value *Escape =
          emitTransformedIndex(B, CMO, PSE.getSE(), DL, II, LoopVectorBody);
      Escape->setName("ind.escape");
      MissingVals[UI] = Escape;
    }
  }

  for (auto &I : MissingVals) {
    PHINode *PHI = cast<PHINode>(I.first);
    // One corner case we have to handle is two IVs "chasing" each-other,
    // that is %IV2 = phi [...], [ %IV1, %latch ]
    // In this case, if IV1 has an external use, we need to avoid adding both
    // "last value of IV1" and "penultimate value of IV2". So, verify that we
    // don't already have an incoming value for the middle block.
    if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
      PHI->addIncoming(I.second, MiddleBlock);
  }
}

namespace {

// DenseMap traits that key on the *contents* of an instruction (opcode plus
// operands) so structurally identical instructions hash/compare equal; used
// by cse() below to deduplicate vectorizer-emitted address/shuffle code.
struct CSEDenseMapInfo {
  static bool canHandle(const Instruction *I) {
    return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
           isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
  }

  static inline Instruction *getEmptyKey() {
    return DenseMapInfo<Instruction *>::getEmptyKey();
  }

  static inline Instruction *getTombstoneKey() {
    return DenseMapInfo<Instruction *>::getTombstoneKey();
  }

  static unsigned getHashValue(const Instruction *I) {
    assert(canHandle(I) && "Unknown instruction!");
    return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
                                                           I->value_op_end()));
  }

  static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
    // Sentinel keys must only compare equal to themselves; isIdenticalTo
    // would dereference them.
    if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
        LHS == getTombstoneKey() || RHS == getTombstoneKey())
      return LHS == RHS;
    return LHS->isIdenticalTo(RHS);
  }
};

} // end anonymous namespace

///Perform cse of induction variable instructions.
static void cse(BasicBlock *BB) {
  // Perform simple cse.
  SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
  for (Instruction &In : llvm::make_early_inc_range(*BB)) {
    if (!CSEDenseMapInfo::canHandle(&In))
      continue;

    // Check if we can replace this instruction with any of the
    // visited instructions.
    if (Instruction *V = CSEMap.lookup(&In)) {
      In.replaceAllUsesWith(V);
      In.eraseFromParent();
      continue;
    }

    // First occurrence: record this instruction as the canonical copy.
    CSEMap[&In] = &In;
  }
}

// Returns the cost of vectorizing call \p CI with factor \p VF. Sets
// \p NeedToScalarize to true unless a vector variant of the callee exists
// (via VFDatabase) and its cost beats replicating the scalar call VF times.
InstructionCost
LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF,
                                              bool &NeedToScalarize) const {
  Function *F = CI->getCalledFunction();
  Type *ScalarRetTy = CI->getType();
  SmallVector<Type *, 4> Tys, ScalarTys;
  for (auto &ArgOp : CI->args())
    ScalarTys.push_back(ArgOp->getType());

  // Estimate cost of scalarized vector call. The source operands are assumed
  // to be vectors, so we need to extract individual elements from there,
  // execute VF scalar calls, and then gather the result into the vector return
  // value.
  InstructionCost ScalarCallCost =
      TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput);
  if (VF.isScalar())
    return ScalarCallCost;

  // Compute corresponding vector type for return value and arguments.
  Type *RetTy = ToVectorTy(ScalarRetTy, VF);
  for (Type *ScalarTy : ScalarTys)
    Tys.push_back(ToVectorTy(ScalarTy, VF));

  // Compute costs of unpacking argument values for the scalar calls and
  // packing the return values to a vector.
  InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF);

  // NOTE(review): for scalable VF this multiplies by the known-minimum lane
  // count; confirm callers reject scalarization of scalable VFs upstream.
  InstructionCost Cost =
      ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;

  // If we can't emit a vector call for this function, then the currently found
  // cost is the cost we need to return.
  NeedToScalarize = true;
  VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
  Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);

  if (!TLI || CI->isNoBuiltin() || !VecFunc)
    return Cost;

  // If the corresponding vector cost is cheaper, return its cost.
  InstructionCost VectorCallCost =
      TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput);
  if (VectorCallCost < Cost) {
    NeedToScalarize = false;
    Cost = VectorCallCost;
  }
  return Cost;
}

// Widen \p Elt to a vector of VF lanes, but only for int/ptr/FP element types
// and non-scalar VF; anything else is returned unchanged.
static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) {
  if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
    return Elt;
  return VectorType::get(Elt, VF);
}

// Returns the cost of the intrinsic call \p CI when vectorized with factor
// \p VF, with return and parameter types widened via MaybeVectorizeType.
InstructionCost
LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
                                                   ElementCount VF) const {
  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
  assert(ID && "Expected intrinsic call!");
  Type *RetTy = MaybeVectorizeType(CI->getType(), VF);
  FastMathFlags FMF;
  if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
    FMF = FPMO->getFastMathFlags();

  SmallVector<const Value *> Arguments(CI->args());
  FunctionType *FTy = CI->getCalledFunction()->getFunctionType();
  SmallVector<Type *> ParamTys;
  std::transform(FTy->param_begin(), FTy->param_end(),
                 std::back_inserter(ParamTys),
                 [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); });

  IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
                                    dyn_cast<IntrinsicInst>(CI));
  return TTI.getIntrinsicInstrCost(CostAttrs,
                                   TargetTransformInfo::TCK_RecipThroughput);
}

// Of two integer vector types, return the one with the narrower elements.
static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
  auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
  auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
  return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
}

// Of two integer vector types, return the one with the wider elements.
static Type *largestIntegerVectorType(Type *T1, Type *T2) {
  auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
  auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
  return I1->getBitWidth() > I2->getBitWidth() ?
                                                 T1 : T2;
}

void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) {
  // For every instruction `I` in MinBWs, truncate the operands, create a
  // truncated version of `I` and reextend its result. InstCombine runs
  // later and will remove any ext/trunc pairs.
  SmallPtrSet<Value *, 4> Erased;
  for (const auto &KV : Cost->getMinimalBitwidths()) {
    // If the value wasn't vectorized, we must maintain the original scalar
    // type. The absence of the value from State indicates that it
    // wasn't vectorized.
    // FIXME: Should not rely on getVPValue at this point.
    VPValue *Def = State.Plan->getVPValue(KV.first, true);
    if (!State.hasAnyVectorValue(Def))
      continue;
    for (unsigned Part = 0; Part < UF; ++Part) {
      Value *I = State.get(Def, Part);
      // Skip values already rewritten, dead values, and non-instructions
      // (e.g. constants), which have nothing to shrink.
      if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
        continue;
      Type *OriginalTy = I->getType();
      Type *ScalarTruncatedTy =
          IntegerType::get(OriginalTy->getContext(), KV.second);
      auto *TruncatedTy = VectorType::get(
          ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getElementCount());
      if (TruncatedTy == OriginalTy)
        continue;

      IRBuilder<> B(cast<Instruction>(I));
      // Narrow V to TruncatedTy, peeling off an existing zext whose source
      // already has exactly the target type.
      auto ShrinkOperand = [&](Value *V) -> Value * {
        if (auto *ZI = dyn_cast<ZExtInst>(V))
          if (ZI->getSrcTy() == TruncatedTy)
            return ZI->getOperand(0);
        return B.CreateZExtOrTrunc(V, TruncatedTy);
      };

      // The actual instruction modification depends on the instruction type,
      // unfortunately.
      Value *NewI = nullptr;
      if (auto *BO = dyn_cast<BinaryOperator>(I)) {
        NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
                             ShrinkOperand(BO->getOperand(1)));

        // Any wrapping introduced by shrinking this operation shouldn't be
        // considered undefined behavior. So, we can't unconditionally copy
        // arithmetic wrapping flags to NewI.
        cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
      } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
        NewI =
            B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
                         ShrinkOperand(CI->getOperand(1)));
      } else if (auto *SI = dyn_cast<SelectInst>(I)) {
        // Only the selected values shrink; the condition keeps its type.
        NewI = B.CreateSelect(SI->getCondition(),
                              ShrinkOperand(SI->getTrueValue()),
                              ShrinkOperand(SI->getFalseValue()));
      } else if (auto *CI = dyn_cast<CastInst>(I)) {
        switch (CI->getOpcode()) {
        default:
          llvm_unreachable("Unhandled cast!");
        case Instruction::Trunc:
          NewI = ShrinkOperand(CI->getOperand(0));
          break;
        case Instruction::SExt:
          NewI = B.CreateSExtOrTrunc(
              CI->getOperand(0),
              smallestIntegerVectorType(OriginalTy, TruncatedTy));
          break;
        case Instruction::ZExt:
          NewI = B.CreateZExtOrTrunc(
              CI->getOperand(0),
              smallestIntegerVectorType(OriginalTy, TruncatedTy));
          break;
        }
      } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
        // The two shuffle inputs may have different element counts; narrow
        // each with its own count, keeping the original mask.
        auto Elements0 =
            cast<VectorType>(SI->getOperand(0)->getType())->getElementCount();
        auto *O0 = B.CreateZExtOrTrunc(
            SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
        auto Elements1 =
            cast<VectorType>(SI->getOperand(1)->getType())->getElementCount();
        auto *O1 = B.CreateZExtOrTrunc(
            SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));

        NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask());
      } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
        // Don't do anything with the operands, just extend the result.
        continue;
      } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
        auto Elements =
            cast<VectorType>(IE->getOperand(0)->getType())->getElementCount();
        auto *O0 = B.CreateZExtOrTrunc(
            IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
        auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
        NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
      } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
        auto Elements =
            cast<VectorType>(EE->getOperand(0)->getType())->getElementCount();
        auto *O0 = B.CreateZExtOrTrunc(
            EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
        NewI = B.CreateExtractElement(O0, EE->getOperand(2));
      } else {
        // If we don't know what to do, be conservative and don't do anything.
        continue;
      }

      // Lastly, extend the result.
      NewI->takeName(cast<Instruction>(I));
      Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
      I->replaceAllUsesWith(Res);
      cast<Instruction>(I)->eraseFromParent();
      Erased.insert(I);
      State.reset(Def, Res, Part);
    }
  }

  // We'll have created a bunch of ZExts that are now parentless. Clean up.
  for (const auto &KV : Cost->getMinimalBitwidths()) {
    // If the value wasn't vectorized, we must maintain the original scalar
    // type. The absence of the value from State indicates that it
    // wasn't vectorized.
    // FIXME: Should not rely on getVPValue at this point.
    VPValue *Def = State.Plan->getVPValue(KV.first, true);
    if (!State.hasAnyVectorValue(Def))
      continue;
    for (unsigned Part = 0; Part < UF; ++Part) {
      Value *I = State.get(Def, Part);
      ZExtInst *Inst = dyn_cast<ZExtInst>(I);
      // A zext whose uses were all rewritten above is dead: drop it and
      // record its (narrow) source as the vector value for this part.
      if (Inst && Inst->use_empty()) {
        Value *NewI = Inst->getOperand(0);
        Inst->eraseFromParent();
        State.reset(Def, NewI, Part);
      }
    }
  }
}

void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
  // Insert truncates and extends for any truncated instructions as hints to
  // InstCombine.
  if (VF.isVector())
    truncateToMinimalBitwidths(State);

  // Fix widened non-induction PHIs by setting up the PHI operands.
  if (OrigPHIsToFix.size()) {
    assert(EnableVPlanNativePath &&
           "Unexpected non-induction PHIs for fixup in non VPlan-native path");
    fixNonInductionPHIs(State);
  }

  // At this point every instruction in the original loop is widened to a
  // vector form. Now we need to fix the recurrences in the loop. These PHI
  // nodes are currently empty because we did not want to introduce cycles.
  // This is the second stage of vectorizing recurrences.
  fixCrossIterationPHIs(State);

  // Forget the original basic block.
  PSE.getSE()->forgetLoop(OrigLoop);

  // If we inserted an edge from the middle block to the unique exit block,
  // update uses outside the loop (phis) to account for the newly inserted
  // edge.
  if (!Cost->requiresScalarEpilogue(VF)) {
    // Fix-up external users of the induction variables.
    for (auto &Entry : Legal->getInductionVars())
      fixupIVUsers(Entry.first, Entry.second,
                   getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
                   IVEndValues[Entry.first], LoopMiddleBlock);

    fixLCSSAPHIs(State);
  }

  for (Instruction *PI : PredicatedInstructions)
    sinkScalarOperands(&*PI);

  // Remove redundant induction instructions.
  cse(LoopVectorBody);

  // Set/update profile weights for the vector and remainder loops as original
  // loop iterations are now distributed among them. Note that original loop
  // represented by LoopScalarBody becomes remainder loop after vectorization.
  //
  // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
  // end up getting slightly roughened result but that should be OK since
  // profile is not inherently precise anyway. Note also possible bypass of
  // vector code caused by legality checks is ignored, assigning all the
  // weight to the vector loop, optimistically.
  //
  // For scalable vectorization we can't know at compile time how many
  // iterations of the loop are handled in one vector iteration, so instead
  // assume a pessimistic vscale of '1'.
  setProfileInfoAfterUnrolling(
      LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody),
      LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF);
}

void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) {
  // In order to support recurrences we need to be able to vectorize Phi nodes.
  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
  // stage #2: We now need to fix the recurrences by adding incoming edges to
  // the currently empty PHI nodes. At this point every instruction in the
  // original loop is widened to a vector form so we can use them to construct
  // the incoming edges.
  VPBasicBlock *Header = State.Plan->getEntry()->getEntryBasicBlock();
  // Only header phis modelled as reduction or first-order-recurrence recipes
  // need fixing; induction phis are handled elsewhere.
  for (VPRecipeBase &R : Header->phis()) {
    if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R))
      fixReduction(ReductionPhi, State);
    else if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
      fixFirstOrderRecurrence(FOR, State);
  }
}

void InnerLoopVectorizer::fixFirstOrderRecurrence(
    VPFirstOrderRecurrencePHIRecipe *PhiR, VPTransformState &State) {
  // This is the second phase of vectorizing first-order recurrences. An
  // overview of the transformation is described below. Suppose we have the
  // following loop.
  //
  //   for (int i = 0; i < n; ++i)
  //     b[i] = a[i] - a[i - 1];
  //
  // There is a first-order recurrence on "a". For this loop, the shorthand
  // scalar IR looks like:
  //
  //   scalar.ph:
  //     s_init = a[-1]
  //     br scalar.body
  //
  //   scalar.body:
  //     i = phi [0, scalar.ph], [i+1, scalar.body]
  //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
  //     s2 = a[i]
  //     b[i] = s2 - s1
  //     br cond, scalar.body, ...
  //
  // In this example, s1 is a recurrence because its value depends on the
  // previous iteration. In the first phase of vectorization, we created a
  // vector phi v1 for s1. We now complete the vectorization and produce the
  // shorthand vector IR shown below (for VF = 4, UF = 1).
  //
  //   vector.ph:
  //     v_init = vector(..., ..., ..., a[-1])
  //     br vector.body
  //
  //   vector.body
  //     i = phi [0, vector.ph], [i+4, vector.body]
  //     v1 = phi [v_init, vector.ph], [v2, vector.body]
  //     v2 = a[i, i+1, i+2, i+3];
  //     v3 = vector(v1(3), v2(0, 1, 2))
  //     b[i, i+1, i+2, i+3] = v2 - v3
  //     br cond, vector.body, middle.block
  //
  //   middle.block:
  //     x = v2(3)
  //     br scalar.ph
  //
  //   scalar.ph:
  //     s_init = phi [x, middle.block], [a[-1], otherwise]
  //     br scalar.body
  //
  // After execution completes the vector loop, we extract the next value of
  // the recurrence (x) to use as the initial value in the scalar loop.

  // Extract the last vector element in the middle block. This will be the
  // initial value for the recurrence when jumping to the scalar loop.
  VPValue *PreviousDef = PhiR->getBackedgeValue();
  Value *Incoming = State.get(PreviousDef, UF - 1);
  auto *ExtractForScalar = Incoming;
  auto *IdxTy = Builder.getInt32Ty();
  if (VF.isVector()) {
    // Index VF-1 is computed at runtime so this also works for scalable VFs.
    auto *One = ConstantInt::get(IdxTy, 1);
    Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
    auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
    auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
    ExtractForScalar = Builder.CreateExtractElement(ExtractForScalar, LastIdx,
                                                    "vector.recur.extract");
  }
  // Extract the second last element in the middle block if the
  // Phi is used outside the loop. We need to extract the phi itself
  // and not the last element (the phi update in the current iteration). This
  // will be the value when jumping to the exit block from the LoopMiddleBlock,
  // when the scalar loop is not run at all.
  Value *ExtractForPhiUsedOutsideLoop = nullptr;
  if (VF.isVector()) {
    auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
    auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2));
    ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
        Incoming, Idx, "vector.recur.extract.for.phi");
  } else if (UF > 1)
    // When loop is unrolled without vectorizing, initialize
    // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value
    // of `Incoming`. This is analogous to the vectorized case above:
    // extracting the second last element when VF > 1.
    ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2);

  // Fix the initial value of the original recurrence in the scalar loop.
  Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
  PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
  auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
  auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue();
  // The scalar preheader may have several predecessors (bypass blocks); only
  // the edge from the middle block carries the extracted vector value.
  for (auto *BB : predecessors(LoopScalarPreHeader)) {
    auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
    Start->addIncoming(Incoming, BB);
  }

  Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
  Phi->setName("scalar.recur");

  // Finally, fix users of the recurrence outside the loop. The users will need
  // either the last value of the scalar recurrence or the last value of the
  // vector recurrence we extracted in the middle block. Since the loop is in
  // LCSSA form, we just need to find all the phi nodes for the original scalar
  // recurrence in the exit block, and then add an edge for the middle block.
  // Note that LCSSA does not imply single entry when the original scalar loop
  // had multiple exiting edges (as we always run the last iteration in the
  // scalar epilogue); in that case, there is no edge from middle to exit and
  // thus no phis which need updating.
  if (!Cost->requiresScalarEpilogue(VF))
    for (PHINode &LCSSAPhi : LoopExitBlock->phis())
      if (llvm::is_contained(LCSSAPhi.incoming_values(), Phi))
        LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
}

void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
                                       VPTransformState &State) {
  PHINode *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
  // Get its reduction variable descriptor.
  assert(Legal->isReductionVariable(OrigPhi) &&
         "Unable to find the reduction variable");
  const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();

  RecurKind RK = RdxDesc.getRecurrenceKind();
  TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
  Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
  setDebugLocFromInst(ReductionStartValue);

  VPValue *LoopExitInstDef = PhiR->getBackedgeValue();
  // This is the vector-clone of the value that leaves the loop.
  Type *VecTy = State.get(LoopExitInstDef, 0)->getType();

  // Wrap flags are in general invalid after vectorization, clear them.
  clearReductionWrapFlags(RdxDesc, State);

  // Before each round, move the insertion point right between
  // the PHIs and the values we are going to write.
  // This allows us to write both PHINodes and the extractelement
  // instructions.
  Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());

  setDebugLocFromInst(LoopExitInst);

  Type *PhiTy = OrigPhi->getType();
  // If tail is folded by masking, the vector value to leave the loop should be
  // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
  // instead of the former. For an inloop reduction the reduction will already
  // be predicated, and does not need to be handled here.
  if (Cost->foldTailByMasking() && !PhiR->isInLoop()) {
    for (unsigned Part = 0; Part < UF; ++Part) {
      // Find the single select fed by the exit value; all other users must be
      // phis (the reduction phi and/or LCSSA phis).
      Value *VecLoopExitInst = State.get(LoopExitInstDef, Part);
      Value *Sel = nullptr;
      for (User *U : VecLoopExitInst->users()) {
        if (isa<SelectInst>(U)) {
          assert(!Sel && "Reduction exit feeding two selects");
          Sel = U;
        } else
          assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
      }
      assert(Sel && "Reduction exit feeds no select");
      State.reset(LoopExitInstDef, Sel, Part);

      // If the target can create a predicated operator for the reduction at no
      // extra cost in the loop (for example a predicated vadd), it can be
      // cheaper for the select to remain in the loop than be sunk out of it,
      // and so use the select value for the phi instead of the old
      // LoopExitValue.
      if (PreferPredicatedReductionSelect ||
          TTI->preferPredicatedReductionSelect(
              RdxDesc.getOpcode(), PhiTy,
              TargetTransformInfo::ReductionFlags())) {
        auto *VecRdxPhi =
            cast<PHINode>(State.get(PhiR, Part));
        VecRdxPhi->setIncomingValueForBlock(
            LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel);
      }
    }
  }

  // If the vector reduction can be performed in a smaller type, we truncate
  // then extend the loop exit value to enable InstCombine to evaluate the
  // entire expression in the smaller type.
  if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
    assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
    Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
    Builder.SetInsertPoint(
        LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
    VectorParts RdxParts(UF);
    for (unsigned Part = 0; Part < UF; ++Part) {
      RdxParts[Part] = State.get(LoopExitInstDef, Part);
      Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
      Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
                                        : Builder.CreateZExt(Trunc, VecTy);
      // Redirect all users except the trunc itself to the re-extended value.
      for (User *U : llvm::make_early_inc_range(RdxParts[Part]->users()))
        if (U != Trunc) {
          U->replaceUsesOfWith(RdxParts[Part], Extnd);
          RdxParts[Part] = Extnd;
        }
    }
    Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
    for (unsigned Part = 0; Part < UF; ++Part) {
      RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
      State.reset(LoopExitInstDef, RdxParts[Part], Part);
    }
  }

  // Reduce all of the unrolled parts into a single vector.
  Value *ReducedPartRdx = State.get(LoopExitInstDef, 0);
  unsigned Op = RecurrenceDescriptor::getOpcode(RK);

  // The middle block terminator has already been assigned a DebugLoc here (the
  // OrigLoop's single latch terminator). We want the whole middle block to
  // appear to execute on this line because: (a) it is all compiler generated,
  // (b) these instructions are always executed after evaluating the latch
  // conditional branch, and (c) other passes may add new predecessors which
  // terminate on this line. This is the easiest way to ensure we don't
  // accidentally cause an extra step back into the loop while debugging.
  setDebugLocFromInst(LoopMiddleBlock->getTerminator());
  if (PhiR->isOrdered())
    // Ordered (strict FP) reductions already chain the parts in order; the
    // last part is the final value.
    ReducedPartRdx = State.get(LoopExitInstDef, UF - 1);
  else {
    // Floating-point operations should have some FMF to enable the reduction.
    IRBuilderBase::FastMathFlagGuard FMFG(Builder);
    Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
    for (unsigned Part = 1; Part < UF; ++Part) {
      Value *RdxPart = State.get(LoopExitInstDef, Part);
      if (Op != Instruction::ICmp && Op != Instruction::FCmp) {
        ReducedPartRdx = Builder.CreateBinOp(
            (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
      } else if (RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK))
        ReducedPartRdx = createSelectCmpOp(Builder, ReductionStartValue, RK,
                                           ReducedPartRdx, RdxPart);
      else
        ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
    }
  }

  // Create the reduction after the loop. Note that inloop reductions create
  // the target reduction in the loop using a Reduction recipe.
  if (VF.isVector() && !PhiR->isInLoop()) {
    ReducedPartRdx =
        createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, OrigPhi);
    // If the reduction can be performed in a smaller type, we need to extend
    // the reduction to the wider type before we branch to the original loop.
    if (PhiTy != RdxDesc.getRecurrenceType())
      ReducedPartRdx = RdxDesc.isSigned()
                           ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
                           : Builder.CreateZExt(ReducedPartRdx, PhiTy);
  }

  // Create a phi node that merges control-flow from the backedge-taken check
  // block and the middle block.
  PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx",
                                        LoopScalarPreHeader->getTerminator());
  for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
    BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
  BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);

  // Now, we need to fix the users of the reduction variable
  // inside and outside of the scalar remainder loop.

  // We know that the loop is in LCSSA form. We need to update the PHI nodes
  // in the exit blocks. See comment on analogous loop in
  // fixFirstOrderRecurrence for a more complete explanation of the logic.
  if (!Cost->requiresScalarEpilogue(VF))
    for (PHINode &LCSSAPhi : LoopExitBlock->phis())
      if (llvm::is_contained(LCSSAPhi.incoming_values(), LoopExitInst))
        LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);

  // Fix the scalar loop reduction variable with the incoming reduction sum
  // from the vector body and from the backedge value.
  int IncomingEdgeBlockIdx =
      OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch());
  assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
  // Pick the other block (the header phi has exactly two incoming edges).
  int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
  OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
  OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
}

void InnerLoopVectorizer::clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc,
                                                  VPTransformState &State) {
  RecurKind RK = RdxDesc.getRecurrenceKind();
  // Only integer add/mul recurrences can carry nsw/nuw wrap flags.
  if (RK != RecurKind::Add && RK != RecurKind::Mul)
    return;

  Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr();
  assert(LoopExitInstr && "null loop exit instruction");
  // Walk the use-chain of the reduction exit instruction (staying inside the
  // original loop) and strip poison-generating flags from the widened clones.
  SmallVector<Instruction *, 8> Worklist;
  SmallPtrSet<Instruction *, 8> Visited;
  Worklist.push_back(LoopExitInstr);
  Visited.insert(LoopExitInstr);

  while (!Worklist.empty()) {
    Instruction *Cur = Worklist.pop_back_val();
    if (isa<OverflowingBinaryOperator>(Cur))
      for (unsigned Part = 0; Part < UF; ++Part) {
        // FIXME: Should not rely on getVPValue at this point.
        Value *V = State.get(State.Plan->getVPValue(Cur, true), Part);
        cast<Instruction>(V)->dropPoisonGeneratingFlags();
      }

    for (User *U : Cur->users()) {
      Instruction *UI = cast<Instruction>(U);
      if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) &&
          Visited.insert(UI).second)
        Worklist.push_back(UI);
    }
  }
}

void InnerLoopVectorizer::fixLCSSAPHIs(VPTransformState &State) {
  for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
    if (LCSSAPhi.getBasicBlockIndex(LoopMiddleBlock) != -1)
      // Some phis were already hand updated by the reduction and recurrence
      // code above, leave them alone.
      continue;

    auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
    // Non-instruction incoming values will have only one value.

    // Uniform values are the same in every lane; otherwise the scalar that
    // exits the loop lives in the last lane of the last unrolled part.
    VPLane Lane = VPLane::getFirstLane();
    if (isa<Instruction>(IncomingValue) &&
        !Cost->isUniformAfterVectorization(cast<Instruction>(IncomingValue),
                                           VF))
      Lane = VPLane::getLastLaneForVF(VF);

    // Can be a loop invariant incoming value or the last scalar value to be
    // extracted from the vectorized loop.
    // FIXME: Should not rely on getVPValue at this point.
    Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
    Value *lastIncomingValue =
        OrigLoop->isLoopInvariant(IncomingValue)
            ? IncomingValue
            : State.get(State.Plan->getVPValue(IncomingValue, true),
                        VPIteration(UF - 1, Lane));
    LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
  }
}

void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
  // The basic block and loop containing the predicated instruction.
  auto *PredBB = PredInst->getParent();
  auto *VectorLoop = LI->getLoopFor(PredBB);

  // Initialize a worklist with the operands of the predicated instruction.
  SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());

  // Holds instructions that we need to analyze again. An instruction may be
  // reanalyzed if we don't yet know if we can sink it or not.
  SmallVector<Instruction *, 8> InstsToReanalyze;

  // Returns true if a given use occurs in the predicated block. Phi nodes use
  // their operands in their corresponding predecessor blocks.
  auto isBlockOfUsePredicated = [&](Use &U) -> bool {
    auto *I = cast<Instruction>(U.getUser());
    BasicBlock *BB = I->getParent();
    if (auto *Phi = dyn_cast<PHINode>(I))
      BB = Phi->getIncomingBlock(
          PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
    return BB == PredBB;
  };

  // Iteratively sink the scalarized operands of the predicated instruction
  // into the block we created for it. When an instruction is sunk, its
  // operands are then added to the worklist. The algorithm ends after one pass
  // through the worklist doesn't sink a single instruction.
  bool Changed;
  do {
    // Add the instructions that need to be reanalyzed to the worklist, and
    // reset the changed indicator.
    Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
    InstsToReanalyze.clear();
    Changed = false;

    while (!Worklist.empty()) {
      auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());

      // We can't sink an instruction if it is a phi node, is not in the loop,
      // or may have side effects.
      if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) ||
          I->mayHaveSideEffects())
        continue;

      // If the instruction is already in PredBB, check if we can sink its
      // operands. In that case, VPlan's sinkScalarOperands() succeeded in
      // sinking the scalar instruction I, hence it appears in PredBB; but it
      // may have failed to sink I's operands (recursively), which we try
      // (again) here.
      if (I->getParent() == PredBB) {
        Worklist.insert(I->op_begin(), I->op_end());
        continue;
      }

      // It's legal to sink the instruction if all its uses occur in the
      // predicated block. Otherwise, there's nothing to do yet, and we may
      // need to reanalyze the instruction.
      if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
        InstsToReanalyze.push_back(I);
        continue;
      }

      // Move the instruction to the beginning of the predicated block, and add
      // its operands to the worklist.
      I->moveBefore(&*PredBB->getFirstInsertionPt());
      Worklist.insert(I->op_begin(), I->op_end());

      // The sinking may have enabled other instructions to be sunk, so we will
      // need to iterate.
      Changed = true;
    }
  } while (Changed);
}

void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) {
  // VPlan-native path only: fill in the operands of the empty widened phis
  // created by widenPHIInstruction.
  for (PHINode *OrigPhi : OrigPHIsToFix) {
    VPWidenPHIRecipe *VPPhi =
        cast<VPWidenPHIRecipe>(State.Plan->getVPValue(OrigPhi));
    PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0));
    // Make sure the builder has a valid insert point.
    Builder.SetInsertPoint(NewPhi);
    for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) {
      VPValue *Inc = VPPhi->getIncomingValue(i);
      VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i);
      NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]);
    }
  }
}

// Thin forwarder to the cost model's ordered-reduction decision.
bool InnerLoopVectorizer::useOrderedReductions(
    const RecurrenceDescriptor &RdxDesc) {
  return Cost->useOrderedReductions(RdxDesc);
}

void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
                                              VPWidenPHIRecipe *PhiR,
                                              VPTransformState &State) {
  PHINode *P = cast<PHINode>(PN);
  if (EnableVPlanNativePath) {
    // Currently we enter here in the VPlan-native path for non-induction
    // PHIs where all control flow is uniform. We simply widen these PHIs.
    // Create a vector phi with no operands - the vector phi operands will be
    // set at the end of vector code generation.
    Type *VecTy = (State.VF.isScalar())
                      ? PN->getType()
                      : VectorType::get(PN->getType(), State.VF);
    Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
    State.set(PhiR, VecPhi, 0);
    OrigPHIsToFix.push_back(P);

    return;
  }

  assert(PN->getParent() == OrigLoop->getHeader() &&
         "Non-header phis should have been handled elsewhere");

  // In order to support recurrences we need to be able to vectorize Phi nodes.
  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
  // stage #1: We create a new vector PHI node with no incoming edges.
  // We'll use this value when we vectorize all of the instructions that use
  // the PHI.

  assert(!Legal->isReductionVariable(P) &&
         "reductions should be handled elsewhere");

  setDebugLocFromInst(P);

  // This PHINode must be an induction variable.
  // Make sure that we know about it.
  assert(Legal->getInductionVars().count(P) && "Not an induction variable");

  InductionDescriptor II = Legal->getInductionVars().lookup(P);
  const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();

  // FIXME: The newly created binary instructions should contain nsw/nuw flags,
  // which can be found from the original scalar operations.
  switch (II.getKind()) {
  case InductionDescriptor::IK_NoInduction:
    llvm_unreachable("Unknown induction");
  case InductionDescriptor::IK_IntInduction:
  case InductionDescriptor::IK_FpInduction:
    llvm_unreachable("Integer/fp induction is handled elsewhere.");
  case InductionDescriptor::IK_PtrInduction: {
    // Handle the pointer induction variable case.
    assert(P->getType()->isPointerTy() && "Unexpected type.");

    if (Cost->isScalarAfterVectorization(P, State.VF)) {
      // Scalarized pointer induction: emit one scalar GEP per (part, lane).
      // This is the normalized GEP that starts counting at zero.
      Value *PtrInd =
          Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType());
      // Determine the number of scalars we need to generate for each unroll
      // iteration. If the instruction is uniform, we only need to generate the
      // first lane. Otherwise, we generate all VF values.
      bool IsUniform = Cost->isUniformAfterVectorization(P, State.VF);
      assert((IsUniform || !State.VF.isScalable()) &&
             "Cannot scalarize a scalable VF");
      unsigned Lanes = IsUniform ? 1 : State.VF.getFixedValue();

      for (unsigned Part = 0; Part < UF; ++Part) {
        Value *PartStart =
            createStepForVF(Builder, PtrInd->getType(), VF, Part);

        for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
          // Global index for this lane: PtrInd + (Part * VF + Lane).
          Value *Idx = Builder.CreateAdd(
              PartStart, ConstantInt::get(PtrInd->getType(), Lane));
          Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
          Value *SclrGep = emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(),
                                                DL, II, State.CFG.PrevBB);
          SclrGep->setName("next.gep");
          State.set(PhiR, SclrGep, VPIteration(Part, Lane));
        }
      }
      return;
    }
    assert(isa<SCEVConstant>(II.getStep()) &&
           "Induction step not a SCEV constant!");
    Type *PhiType = II.getStep()->getType();

    // Build a pointer phi
    Value *ScalarStartValue = PhiR->getStartValue()->getLiveInIRValue();
    Type *ScStValueType = ScalarStartValue->getType();
    PHINode *NewPointerPhi =
        PHINode::Create(ScStValueType, 2, "pointer.phi", Induction);
    NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader);

    // A pointer induction, performed by using a gep
    BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
    Instruction *InductionLoc = LoopLatch->getTerminator();
    const SCEV *ScalarStep = II.getStep();
    SCEVExpander Exp(*PSE.getSE(), DL, "induction");
    Value *ScalarStepValue =
        Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc);
    // Advance the phi by step * VF * UF each vector-loop iteration.
    Value *RuntimeVF = getRuntimeVF(Builder, PhiType, VF);
    Value *NumUnrolledElems =
        Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF));
    Value *InductionGEP = GetElementPtrInst::Create(
        II.getElementType(), NewPointerPhi,
        Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind",
        InductionLoc);
    NewPointerPhi->addIncoming(InductionGEP, LoopLatch);

    // Create UF many actual address geps that use the pointer
    // phi as base and a vectorized version of the step value
    // (<step*0, ..., step*N>) as offset.
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      Type *VecPhiType = VectorType::get(PhiType, State.VF);
      Value *StartOffsetScalar =
          Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part));
      Value *StartOffset =
          Builder.CreateVectorSplat(State.VF, StartOffsetScalar);
      // Create a vector of consecutive numbers from zero to VF.
      StartOffset =
          Builder.CreateAdd(StartOffset, Builder.CreateStepVector(VecPhiType));

      Value *GEP = Builder.CreateGEP(
          II.getElementType(), NewPointerPhi,
          Builder.CreateMul(
              StartOffset, Builder.CreateVectorSplat(State.VF, ScalarStepValue),
              "vector.gep"));
      State.set(PhiR, GEP, Part);
    }
  }
  }
}

/// A helper function for checking whether an integer division-related
/// instruction may divide by zero (in which case it must be predicated if
/// executed conditionally in the scalar code).
/// TODO: It may be worthwhile to generalize and check isKnownNonZero().
/// Non-zero divisors that are non compile-time constants will not be
/// converted into multiplication, so we will still end up scalarizing
/// the division, but can do so w/o predication.
static bool mayDivideByZero(Instruction &I) {
  assert((I.getOpcode() == Instruction::UDiv ||
          I.getOpcode() == Instruction::SDiv ||
          I.getOpcode() == Instruction::URem ||
          I.getOpcode() == Instruction::SRem) &&
         "Unexpected instruction");
  Value *Divisor = I.getOperand(1);
  auto *CInt = dyn_cast<ConstantInt>(Divisor);
  // Safe only when the divisor is a compile-time non-zero constant.
  return !CInt || CInt->isZero();
}

/// Widen a call into either a call to a vector intrinsic or a call to a
/// vectorized library function, whichever the cost model found cheaper.
void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def,
                                               VPUser &ArgOperands,
                                               VPTransformState &State) {
  assert(!isa<DbgInfoIntrinsic>(I) &&
         "DbgInfoIntrinsic should have been dropped during VPlan construction");
  setDebugLocFromInst(&I);

  Module *M = I.getParent()->getParent()->getParent();
  auto *CI = cast<CallInst>(&I);

  SmallVector<Type *, 4> Tys;
  for (Value *ArgOperand : CI->args())
    Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue()));

  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);

  // The flag shows whether we use Intrinsic or a usual Call for vectorized
  // version of the instruction.
  // Is it beneficial to perform intrinsic call compared to lib call?
  bool NeedToScalarize = false;
  InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
  InstructionCost IntrinsicCost = ID ? Cost->getVectorIntrinsicCost(CI, VF) : 0;
  bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
  assert((UseVectorIntrinsic || !NeedToScalarize) &&
         "Instruction should be scalarized elsewhere.");
  assert((IntrinsicCost.isValid() || CallCost.isValid()) &&
         "Either the intrinsic cost or vector call cost must be valid");

  for (unsigned Part = 0; Part < UF; ++Part) {
    SmallVector<Type *, 2> TysForDecl = {CI->getType()};
    SmallVector<Value *, 4> Args;
    for (auto &I : enumerate(ArgOperands.operands())) {
      // Some intrinsics have a scalar argument - don't replace it with a
      // vector.
      Value *Arg;
      if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index()))
        Arg = State.get(I.value(), Part);
      else {
        // Scalar operand: take lane 0 of part 0.
        Arg = State.get(I.value(), VPIteration(0, 0));
        if (hasVectorInstrinsicOverloadedScalarOpd(ID, I.index()))
          TysForDecl.push_back(Arg->getType());
      }
      Args.push_back(Arg);
    }

    Function *VectorF;
    if (UseVectorIntrinsic) {
      // Use vector version of the intrinsic.
      if (VF.isVector())
        TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
      VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
      assert(VectorF && "Can't retrieve vector intrinsic.");
    } else {
      // Use vector version of the function call.
      const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
#ifndef NDEBUG
      assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr &&
             "Can't create vector function.");
#endif
      VectorF = VFDatabase(*CI).getVectorizedFunction(Shape);
    }
    SmallVector<OperandBundleDef, 1> OpBundles;
    CI->getOperandBundlesAsDefs(OpBundles);
    CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);

    if (isa<FPMathOperator>(V))
      V->copyFastMathFlags(CI);

    State.set(Def, V, Part);
    addMetadata(V, &I);
  }
}

/// Collect the instructions that will remain scalar (one copy per lane) for
/// vectorization factor \p VF, seeding from uniform instructions and pointer
/// operands of scalarized memory accesses, then growing the set to a fixpoint.
void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
  // We should not collect Scalars more than once per VF. Right now, this
  // function is called from collectUniformsAndScalars(), which already does
  // this check. Collecting Scalars for VF=1 does not make any sense.
  assert(VF.isVector() && Scalars.find(VF) == Scalars.end() &&
         "This function should not be visited twice for the same VF");

  SmallSetVector<Instruction *, 8> Worklist;

  // These sets are used to seed the analysis with pointers used by memory
  // accesses that will remain scalar.
  SmallSetVector<Instruction *, 8> ScalarPtrs;
  SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
  auto *Latch = TheLoop->getLoopLatch();

  // A helper that returns true if the use of Ptr by MemAccess will be scalar.
  // The pointer operands of loads and stores will be scalar as long as the
  // memory access is not a gather or scatter operation. The value operand of a
  // store will remain scalar if the store is scalarized.
  auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
    InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
    assert(WideningDecision != CM_Unknown &&
           "Widening decision should be ready at this moment");
    if (auto *Store = dyn_cast<StoreInst>(MemAccess))
      if (Ptr == Store->getValueOperand())
        return WideningDecision == CM_Scalarize;
    assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
           "Ptr is neither a value or pointer operand");
    return WideningDecision != CM_GatherScatter;
  };

  // A helper that returns true if the given value is a bitcast or
  // getelementptr instruction contained in the loop.
  auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
    return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
            isa<GetElementPtrInst>(V)) &&
           !TheLoop->isLoopInvariant(V);
  };

  // A helper that evaluates a memory access's use of a pointer. If the use will
  // be a scalar use and the pointer is only used by memory accesses, we place
  // the pointer in ScalarPtrs. Otherwise, the pointer is placed in
  // PossibleNonScalarPtrs.
  auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
    // We only care about bitcast and getelementptr instructions contained in
    // the loop.
    if (!isLoopVaryingBitCastOrGEP(Ptr))
      return;

    // If the pointer has already been identified as scalar (e.g., if it was
    // also identified as uniform), there's nothing to do.
    auto *I = cast<Instruction>(Ptr);
    if (Worklist.count(I))
      return;

    // If the use of the pointer will be a scalar use, and all users of the
    // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
    // place the pointer in PossibleNonScalarPtrs.
    if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
          return isa<LoadInst>(U) || isa<StoreInst>(U);
        }))
      ScalarPtrs.insert(I);
    else
      PossibleNonScalarPtrs.insert(I);
  };

  // We seed the scalars analysis with three classes of instructions: (1)
  // instructions marked uniform-after-vectorization and (2) bitcast,
  // getelementptr and (pointer) phi instructions used by memory accesses
  // requiring a scalar use.
  //
  // (1) Add to the worklist all instructions that have been identified as
  // uniform-after-vectorization.
  Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());

  // (2) Add to the worklist all bitcast and getelementptr instructions used by
  // memory accesses requiring a scalar use. The pointer operands of loads and
  // stores will be scalar as long as the memory access is not a gather or
  // scatter operation. The value operand of a store will remain scalar if the
  // store is scalarized.
  for (auto *BB : TheLoop->blocks())
    for (auto &I : *BB) {
      if (auto *Load = dyn_cast<LoadInst>(&I)) {
        evaluatePtrUse(Load, Load->getPointerOperand());
      } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
        evaluatePtrUse(Store, Store->getPointerOperand());
        evaluatePtrUse(Store, Store->getValueOperand());
      }
    }
  for (auto *I : ScalarPtrs)
    if (!PossibleNonScalarPtrs.count(I)) {
      LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
      Worklist.insert(I);
    }

  // Insert the forced scalars.
  // FIXME: Currently widenPHIInstruction() often creates a dead vector
  // induction variable when the PHI user is scalarized.
  auto ForcedScalar = ForcedScalars.find(VF);
  if (ForcedScalar != ForcedScalars.end())
    for (auto *I : ForcedScalar->second)
      Worklist.insert(I);

  // Expand the worklist by looking through any bitcasts and getelementptr
  // instructions we've already identified as scalar. This is similar to the
  // expansion step in collectLoopUniforms(); however, here we're only
  // expanding to include additional bitcasts and getelementptr instructions.
  unsigned Idx = 0;
  while (Idx != Worklist.size()) {
    Instruction *Dst = Worklist[Idx++];
    if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
      continue;
    auto *Src = cast<Instruction>(Dst->getOperand(0));
    if (llvm::all_of(Src->users(), [&](User *U) -> bool {
          auto *J = cast<Instruction>(U);
          return !TheLoop->contains(J) || Worklist.count(J) ||
                 ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
                  isScalarUse(J, Src));
        })) {
      Worklist.insert(Src);
      LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
    }
  }

  // An induction variable will remain scalar if all users of the induction
  // variable and induction variable update remain scalar.
  for (auto &Induction : Legal->getInductionVars()) {
    auto *Ind = Induction.first;
    auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));

    // If tail-folding is applied, the primary induction variable will be used
    // to feed a vector compare.
    if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
      continue;

    // Returns true if \p Indvar is a pointer induction that is used directly by
    // load/store instruction \p I.
    auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar,
                                              Instruction *I) {
      return Induction.second.getKind() ==
                 InductionDescriptor::IK_PtrInduction &&
             (isa<LoadInst>(I) || isa<StoreInst>(I)) &&
             Indvar == getLoadStorePointerOperand(I) && isScalarUse(I, Indvar);
    };

    // Determine if all users of the induction variable are scalar after
    // vectorization.
    auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
      auto *I = cast<Instruction>(U);
      return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
             IsDirectLoadStoreFromPtrIndvar(Ind, I);
    });
    if (!ScalarInd)
      continue;

    // Determine if all users of the induction variable update instruction are
    // scalar after vectorization.
    auto ScalarIndUpdate =
        llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
          auto *I = cast<Instruction>(U);
          return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
                 IsDirectLoadStoreFromPtrIndvar(IndUpdate, I);
        });
    if (!ScalarIndUpdate)
      continue;

    // The induction variable and its update instruction will remain scalar.
    Worklist.insert(Ind);
    Worklist.insert(IndUpdate);
    LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
    LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
                      << "\n");
  }

  Scalars[VF].insert(Worklist.begin(), Worklist.end());
}

/// Returns true if \p I must be scalarized and predicated when executed
/// conditionally: an unsupported masked load/store, or an integer
/// division/remainder whose divisor may be zero.
bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I) const {
  if (!blockNeedsPredicationForAnyReason(I->getParent()))
    return false;
  switch(I->getOpcode()) {
  default:
    break;
  case Instruction::Load:
  case Instruction::Store: {
    if (!Legal->isMaskRequired(I))
      return false;
    // Predication is needed only when the target supports neither a masked
    // contiguous access nor a masked gather/scatter for this type.
    auto *Ptr = getLoadStorePointerOperand(I);
    auto *Ty = getLoadStoreType(I);
    const Align Alignment = getLoadStoreAlignment(I);
    return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
                                TTI.isLegalMaskedGather(Ty, Alignment))
                            : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
                                TTI.isLegalMaskedScatter(Ty, Alignment));
  }
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::URem:
    return mayDivideByZero(*I);
  }
  return false;
}

/// Returns true if the interleave group containing \p I can be vectorized as
/// a wide (possibly masked) access for factor \p VF.
bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
    Instruction *I, ElementCount VF) {
  assert(isAccessInterleaved(I) && "Expecting interleaved access.");
  assert(getWideningDecision(I, VF) == CM_Unknown &&
         "Decision should not be set yet.");
  auto *Group = getInterleavedAccessGroup(I);
  assert(Group && "Must have a group.");

  // If the instruction's allocated size doesn't equal its type size, it
  // requires padding and will be scalarized.
  auto &DL = I->getModule()->getDataLayout();
  auto *ScalarTy = getLoadStoreType(I);
  if (hasIrregularType(ScalarTy, DL))
    return false;

  // Check if masking is required.
  // A Group may need masking for one of two reasons: it resides in a block that
  // needs predication, or it was decided to use masking to deal with gaps
  // (either a gap at the end of a load-access that may result in a speculative
  // load, or any gaps in a store-access).
  bool PredicatedAccessRequiresMasking =
      blockNeedsPredicationForAnyReason(I->getParent()) &&
      Legal->isMaskRequired(I);
  bool LoadAccessWithGapsRequiresEpilogMasking =
      isa<LoadInst>(I) && Group->requiresScalarEpilogue() &&
      !isScalarEpilogueAllowed();
  bool StoreAccessWithGapsRequiresMasking =
      isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor());
  if (!PredicatedAccessRequiresMasking &&
      !LoadAccessWithGapsRequiresEpilogMasking &&
      !StoreAccessWithGapsRequiresMasking)
    return true;

  // If masked interleaving is required, we expect that the user/target had
  // enabled it, because otherwise it either wouldn't have been created or
  // it should have been invalidated by the CostModel.
  assert(useMaskedInterleavedAccesses(TTI) &&
         "Masked interleave-groups for predicated accesses are not enabled.");

  if (Group->isReverse())
    return false;

  auto *Ty = getLoadStoreType(I);
  const Align Alignment = getLoadStoreAlignment(I);
  return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
                          : TTI.isLegalMaskedStore(Ty, Alignment);
}

/// Returns true if memory instruction \p I can be widened into a single
/// consecutive vector access for factor \p VF.
bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
    Instruction *I, ElementCount VF) {
  // Get and ensure we have a valid memory instruction.
  assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");

  auto *Ptr = getLoadStorePointerOperand(I);
  auto *ScalarTy = getLoadStoreType(I);

  // In order to be widened, the pointer should be consecutive, first of all.
  if (!Legal->isConsecutivePtr(ScalarTy, Ptr))
    return false;

  // If the instruction is a store located in a predicated block, it will be
  // scalarized.
  if (isScalarWithPredication(I))
    return false;

  // If the instruction's allocated size doesn't equal its type size, it
  // requires padding and will be scalarized.
  auto &DL = I->getModule()->getDataLayout();
  if (hasIrregularType(ScalarTy, DL))
    return false;

  return true;
}

/// Collect the instructions that only demand lane 0 (uniform-after-
/// vectorization) for factor \p VF, growing a worklist to a fixpoint.
void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
  // We should not collect Uniforms more than once per VF. Right now,
  // this function is called from collectUniformsAndScalars(), which
  // already does this check. Collecting Uniforms for VF=1 does not make any
  // sense.

  assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() &&
         "This function should not be visited twice for the same VF");

  // Visit the list of Uniforms. If we'll not find any uniform value, we'll
  // not analyze again. Uniforms.count(VF) will return 1.
  Uniforms[VF].clear();

  // We now know that the loop is vectorizable!
  // Collect instructions inside the loop that will remain uniform after
  // vectorization.

  // Global values, params and instructions outside of current loop are out of
  // scope.
  auto isOutOfScope = [&](Value *V) -> bool {
    Instruction *I = dyn_cast<Instruction>(V);
    return (!I || !TheLoop->contains(I));
  };

  // Worklist containing uniform instructions demanding lane 0.
  SetVector<Instruction *> Worklist;
  BasicBlock *Latch = TheLoop->getLoopLatch();

  // Add uniform instructions demanding lane 0 to the worklist.
  // Instructions that are scalar with predication must not be considered
  // uniform after vectorization, because that would create an erroneous
  // replicating region where only a single instance out of VF should be
  // formed.
  // TODO: optimize such seldom cases if found important, see PR40816.
  auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
    if (isOutOfScope(I)) {
      LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
                        << *I << "\n");
      return;
    }
    if (isScalarWithPredication(I)) {
      LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
                        << *I << "\n");
      return;
    }
    LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
    Worklist.insert(I);
  };

  // Start with the conditional branch. If the branch condition is an
  // instruction contained in the loop that is only used by the branch, it is
  // uniform.
  auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
  if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
    addToWorklistIfAllowed(Cmp);

  auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
    InstWidening WideningDecision = getWideningDecision(I, VF);
    assert(WideningDecision != CM_Unknown &&
           "Widening decision should be ready at this moment");

    // A uniform memory op is itself uniform. We exclude uniform stores
    // here as they demand the last lane, not the first one.
    if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) {
      assert(WideningDecision == CM_Scalarize);
      return true;
    }

    return (WideningDecision == CM_Widen ||
            WideningDecision == CM_Widen_Reverse ||
            WideningDecision == CM_Interleave);
  };


  // Returns true if Ptr is the pointer operand of a memory access instruction
  // I, and I is known to not require scalarization.
  auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
    return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
  };

  // Holds a list of values which are known to have at least one uniform use.
  // Note that there may be other uses which aren't uniform. A "uniform use"
  // here is something which only demands lane 0 of the unrolled iterations;
  // it does not imply that all lanes produce the same value (e.g. this is not
  // the usual meaning of uniform)
  SetVector<Value *> HasUniformUse;

  // Scan the loop for instructions which are either a) known to have only
  // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
  for (auto *BB : TheLoop->blocks())
    for (auto &I : *BB) {
      if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) {
        switch (II->getIntrinsicID()) {
        case Intrinsic::sideeffect:
        case Intrinsic::experimental_noalias_scope_decl:
        case Intrinsic::assume:
        case Intrinsic::lifetime_start:
        case Intrinsic::lifetime_end:
          if (TheLoop->hasLoopInvariantOperands(&I))
            addToWorklistIfAllowed(&I);
          break;
        default:
          break;
        }
      }

      // ExtractValue instructions must be uniform, because the operands are
      // known to be loop-invariant.
      if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) {
        assert(isOutOfScope(EVI->getAggregateOperand()) &&
               "Expected aggregate value to be loop invariant");
        addToWorklistIfAllowed(EVI);
        continue;
      }

      // If there's no pointer operand, there's nothing to do.
      auto *Ptr = getLoadStorePointerOperand(&I);
      if (!Ptr)
        continue;

      // A uniform memory op is itself uniform. We exclude uniform stores
      // here as they demand the last lane, not the first one.
      if (isa<LoadInst>(I) && Legal->isUniformMemOp(I))
        addToWorklistIfAllowed(&I);

      if (isUniformDecision(&I, VF)) {
        assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check");
        HasUniformUse.insert(Ptr);
      }
    }

  // Add to the worklist any operands which have *only* uniform (e.g. lane 0
  // demanding) users. Since loops are assumed to be in LCSSA form, this
  // disallows uses outside the loop as well.
  for (auto *V : HasUniformUse) {
    if (isOutOfScope(V))
      continue;
    auto *I = cast<Instruction>(V);
    auto UsersAreMemAccesses =
        llvm::all_of(I->users(), [&](User *U) -> bool {
          return isVectorizedMemAccessUse(cast<Instruction>(U), V);
        });
    if (UsersAreMemAccesses)
      addToWorklistIfAllowed(I);
  }

  // Expand Worklist in topological order: whenever a new instruction
  // is added, its users should be already inside Worklist. It ensures
  // a uniform instruction will only be used by uniform instructions.
  unsigned idx = 0;
  while (idx != Worklist.size()) {
    Instruction *I = Worklist[idx++];

    for (auto OV : I->operand_values()) {
      // isOutOfScope operands cannot be uniform instructions.
      if (isOutOfScope(OV))
        continue;
      // First order recurrence Phi's should typically be considered
      // non-uniform.
      auto *OP = dyn_cast<PHINode>(OV);
      if (OP && Legal->isFirstOrderRecurrence(OP))
        continue;
      // If all the users of the operand are uniform, then add the
      // operand into the uniform worklist.
      auto *OI = cast<Instruction>(OV);
      if (llvm::all_of(OI->users(), [&](User *U) -> bool {
            auto *J = cast<Instruction>(U);
            return Worklist.count(J) || isVectorizedMemAccessUse(J, OI);
          }))
        addToWorklistIfAllowed(OI);
    }
  }

  // For an instruction to be added into Worklist above, all its users inside
  // the loop should also be in Worklist. However, this condition cannot be
  // true for phi nodes that form a cyclic dependence. We must process phi
  // nodes separately. An induction variable will remain uniform if all users
  // of the induction variable and induction variable update remain uniform.
  // The code below handles both pointer and non-pointer induction variables.
  for (auto &Induction : Legal->getInductionVars()) {
    auto *Ind = Induction.first;
    auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));

    // Determine if all users of the induction variable are uniform after
    // vectorization.
    auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
      auto *I = cast<Instruction>(U);
      return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
             isVectorizedMemAccessUse(I, Ind);
    });
    if (!UniformInd)
      continue;

    // Determine if all users of the induction variable update instruction are
    // uniform after vectorization.
    auto UniformIndUpdate =
        llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
          auto *I = cast<Instruction>(U);
          return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
                 isVectorizedMemAccessUse(I, IndUpdate);
        });
    if (!UniformIndUpdate)
      continue;

    // The induction variable and its update instruction will remain uniform.
    addToWorklistIfAllowed(Ind);
    addToWorklistIfAllowed(IndUpdate);
  }

  Uniforms[VF].insert(Worklist.begin(), Worklist.end());
}

/// Returns true if vectorizing this loop requires runtime checks (pointer
/// aliasing, SCEV predicates, or stride==1 versioning) that are disallowed
/// when optimizing for code size.
bool LoopVectorizationCostModel::runtimeChecksRequired() {
  LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");

  if (Legal->getRuntimePointerChecking()->Need) {
    reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
        "runtime pointer checks needed. Enable vectorization of this "
        "loop with '#pragma clang loop vectorize(enable)' when "
        "compiling with -Os/-Oz",
        "CantVersionLoopWithOptForSize", ORE, TheLoop);
    return true;
  }

  if (!PSE.getUnionPredicate().getPredicates().empty()) {
    reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
        "runtime SCEV checks needed. Enable vectorization of this "
        "loop with '#pragma clang loop vectorize(enable)' when "
        "compiling with -Os/-Oz",
        "CantVersionLoopWithOptForSize", ORE, TheLoop);
    return true;
  }

  // FIXME: Avoid specializing for stride==1 instead of bailing out.
  if (!Legal->getLAI()->getSymbolicStrides().empty()) {
    // NOTE(review): this remark's wording ("...without such check by compiling
    // with -Os/-Oz") reads inconsistently with the two remarks above — confirm
    // the intended phrasing upstream before changing the user-facing string.
    reportVectorizationFailure("Runtime stride check for small trip count",
        "runtime stride == 1 checks needed. Enable vectorization of "
        "this loop without such check by compiling with -Os/-Oz",
        "CantVersionLoopWithOptForSize", ORE, TheLoop);
    return true;
  }

  return false;
}

/// Returns the maximum legal scalable VF (at most \p MaxSafeElements safe
/// elements), or scalable-0 when scalable vectorization is unavailable,
/// disabled, or unfeasible for this loop.
ElementCount
LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
  if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors)
    return ElementCount::getScalable(0);

  if (Hints->isScalableVectorizationDisabled()) {
    reportVectorizationInfo("Scalable vectorization is explicitly disabled",
                            "ScalableVectorizationDisabled", ORE, TheLoop);
    return ElementCount::getScalable(0);
  }

  LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n");

  auto MaxScalableVF = ElementCount::getScalable(
      std::numeric_limits<ElementCount::ScalarTy>::max());

  // Test that the loop-vectorizer can legalize all operations for this MaxVF.
  // FIXME: While for scalable vectors this is currently sufficient, this should
  // be replaced by a more detailed mechanism that filters out specific VFs,
  // instead of invalidating vectorization for a whole set of VFs based on the
  // MaxVF.

  // Disable scalable vectorization if the loop contains unsupported reductions.
  if (!canVectorizeReductions(MaxScalableVF)) {
    reportVectorizationInfo(
        "Scalable vectorization not supported for the reduction "
        "operations found in this loop.",
        "ScalableVFUnfeasible", ORE, TheLoop);
    return ElementCount::getScalable(0);
  }

  // Disable scalable vectorization if the loop contains any instructions
  // with element types not supported for scalable vectors.
  if (any_of(ElementTypesInLoop, [&](Type *Ty) {
        return !Ty->isVoidTy() &&
               !this->TTI.isElementTypeLegalForScalableVector(Ty);
      })) {
    reportVectorizationInfo("Scalable vectorization is not supported "
                            "for all element types found in this loop.",
                            "ScalableVFUnfeasible", ORE, TheLoop);
    return ElementCount::getScalable(0);
  }

  if (Legal->isSafeForAnyVectorWidth())
    return MaxScalableVF;

  // Limit MaxScalableVF by the maximum safe dependence distance.
  Optional<unsigned> MaxVScale = TTI.getMaxVScale();
  if (!MaxVScale && TheFunction->hasFnAttribute(Attribute::VScaleRange))
    MaxVScale =
        TheFunction->getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();
  // Without a known vscale bound, no scalable VF can be proven safe here.
  MaxScalableVF = ElementCount::getScalable(
      MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0);
  if (!MaxScalableVF)
    reportVectorizationInfo(
        "Max legal vector width too small, scalable vectorization "
        "unfeasible.",
        "ScalableVFUnfeasible", ORE, TheLoop);

  return MaxScalableVF;
}

/// Compute the maximum feasible fixed and scalable VFs, honoring a
/// user-specified VF when it is safe.
FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
    unsigned ConstTripCount, ElementCount UserVF, bool FoldTailByMasking) {
  MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
  unsigned SmallestType, WidestType;
  std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();

  // Get the maximum safe dependence distance in bits computed by LAA.
  // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
  // the memory accesses that is most restrictive (involved in the smallest
  // dependence distance).
  unsigned MaxSafeElements =
      PowerOf2Floor(Legal->getMaxSafeVectorWidthInBits() / WidestType);

  auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements);
  auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements);

  LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
                    << ".\n");
  LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
                    << ".\n");

  // First analyze the UserVF, fall back if the UserVF should be ignored.
  if (UserVF) {
    auto MaxSafeUserVF =
        UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;

    if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) {
      // If `VF=vscale x N` is safe, then so is `VF=N`
      if (UserVF.isScalable())
        return FixedScalableVFPair(
            ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF);
      else
        return UserVF;
    }

    assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));

    // Only clamp if the UserVF is not scalable.
    // If the UserVF is scalable, it
    // is better to ignore the hint and let the compiler choose a suitable VF.
    if (!UserVF.isScalable()) {
      LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
                        << " is unsafe, clamping to max safe VF="
                        << MaxSafeFixedVF << ".\n");
      ORE->emit([&]() {
        return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
                                          TheLoop->getStartLoc(),
                                          TheLoop->getHeader())
               << "User-specified vectorization factor "
               << ore::NV("UserVectorizationFactor", UserVF)
               << " is unsafe, clamping to maximum safe vectorization factor "
               << ore::NV("VectorizationFactor", MaxSafeFixedVF);
      });
      return MaxSafeFixedVF;
    }

    // Unsafe *scalable* UserVF: emit a remark explaining why the hint is
    // dropped (either scalable vectors are unavailable, or the VF is unsafe),
    // then fall through to the normal VF computation below.
    if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) {
      LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
                        << " is ignored because scalable vectors are not "
                           "available.\n");
      ORE->emit([&]() {
        return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
                                          TheLoop->getStartLoc(),
                                          TheLoop->getHeader())
               << "User-specified vectorization factor "
               << ore::NV("UserVectorizationFactor", UserVF)
               << " is ignored because the target does not support scalable "
                  "vectors. The compiler will pick a more suitable value.";
      });
    } else {
      LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
                        << " is unsafe. Ignoring scalable UserVF.\n");
      ORE->emit([&]() {
        return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
                                          TheLoop->getStartLoc(),
                                          TheLoop->getHeader())
               << "User-specified vectorization factor "
               << ore::NV("UserVectorizationFactor", UserVF)
               << " is unsafe. Ignoring the hint to let the compiler pick a "
                  "more suitable value.";
      });
    }
  }

  LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
                    << " / " << WidestType << " bits.\n");

  // Maximize the fixed and the scalable candidates independently; either may
  // remain at its default (fixed 1 / scalable 0) if the target has no such
  // registers.
  FixedScalableVFPair Result(ElementCount::getFixed(1),
                             ElementCount::getScalable(0));
  if (auto MaxVF =
          getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType,
                                  MaxSafeFixedVF, FoldTailByMasking))
    Result.FixedVF = MaxVF;

  if (auto MaxVF =
          getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType,
                                  MaxSafeScalableVF, FoldTailByMasking))
    if (MaxVF.isScalable()) {
      Result.ScalableVF = MaxVF;
      LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
                        << "\n");
    }

  return Result;
}

// Top-level computation of the maximum VFs to consider, honouring the scalar
// epilogue policy; returns FixedScalableVFPair::getNone() when the loop must
// not be vectorized at all.
FixedScalableVFPair
LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
  if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
    // TODO: It may be useful to do since it's still likely to be dynamically
    // uniform if the target can skip.
    reportVectorizationFailure(
        "Not inserting runtime ptr check for divergent target",
        "runtime pointer checks needed. Not enabled for divergent target",
        "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
    return FixedScalableVFPair::getNone();
  }

  unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
  LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
  if (TC == 1) {
    reportVectorizationFailure("Single iteration (non) loop",
        "loop trip count is one, irrelevant for vectorization",
        "SingleIterationLoop", ORE, TheLoop);
    return FixedScalableVFPair::getNone();
  }

  switch (ScalarEpilogueStatus) {
  case CM_ScalarEpilogueAllowed:
    return computeFeasibleMaxVF(TC, UserVF, false);
  case CM_ScalarEpilogueNotAllowedUsePredicate:
    LLVM_FALLTHROUGH;
  case CM_ScalarEpilogueNotNeededUsePredicate:
    LLVM_DEBUG(
        dbgs() << "LV: vector predicate hint/switch found.\n"
               << "LV: Not allowing scalar epilogue, creating predicated "
               << "vector loop.\n");
    break;
  case CM_ScalarEpilogueNotAllowedLowTripLoop:
    // fallthrough as a special case of OptForSize
  case CM_ScalarEpilogueNotAllowedOptSize:
    if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
      LLVM_DEBUG(
          dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
    else
      LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
                        << "count.\n");

    // Bail if runtime checks are required, which are not good when optimising
    // for size.
    if (runtimeChecksRequired())
      return FixedScalableVFPair::getNone();

    break;
  }

  // The only loops we can vectorize without a scalar epilogue, are loops with
  // a bottom-test and a single exiting block. We'd have to handle the fact
  // that not every instruction executes on the last iteration. This will
  // require a lane mask which varies through the vector loop body.
  // (TODO)
  if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
    // If there was a tail-folding hint/switch, but we can't fold the tail by
    // masking, fallback to a vectorization with a scalar epilogue.
    if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
      LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
                           "scalar epilogue instead.\n");
      ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
      return computeFeasibleMaxVF(TC, UserVF, false);
    }
    return FixedScalableVFPair::getNone();
  }

  // Now try the tail folding

  // Invalidate interleave groups that require an epilogue if we can't mask
  // the interleave-group.
  if (!useMaskedInterleavedAccesses(TTI)) {
    assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
           "No decisions should have been taken at this point");
    // Note: There is no need to invalidate any cost modeling decisions here, as
    // none were taken so far.
    InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
  }

  FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF, true);
  // Avoid tail folding if the trip count is known to be a multiple of any VF
  // we chose.
  // FIXME: The condition below pessimises the case for fixed-width vectors,
  // when scalable VFs are also candidates for vectorization.
  if (MaxFactors.FixedVF.isVector() && !MaxFactors.ScalableVF) {
    ElementCount MaxFixedVF = MaxFactors.FixedVF;
    assert((UserVF.isNonZero() || isPowerOf2_32(MaxFixedVF.getFixedValue())) &&
           "MaxFixedVF must be a power of 2");
    // If the user also forced an interleave count, the remainder must be
    // checked against VF * IC, not just VF.
    unsigned MaxVFtimesIC = UserIC ? MaxFixedVF.getFixedValue() * UserIC
                                   : MaxFixedVF.getFixedValue();
    // ExitCount = BackedgeTakenCount + 1 (the trip count as a SCEV). Use
    // loop guards to refine it before taking the remainder modulo VF*IC.
    ScalarEvolution *SE = PSE.getSE();
    const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
    const SCEV *ExitCount = SE->getAddExpr(
        BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
    const SCEV *Rem = SE->getURemExpr(
        SE->applyLoopGuards(ExitCount, TheLoop),
        SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
    if (Rem->isZero()) {
      // Accept MaxFixedVF if we do not have a tail.
      LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
      return MaxFactors;
    }
  }

  // For scalable vectors, don't use tail folding as this is currently not yet
  // supported. The code is likely to have ended up here if the tripcount is
  // low, in which case it makes sense not to use scalable vectors.
  if (MaxFactors.ScalableVF.isVector())
    MaxFactors.ScalableVF = ElementCount::getScalable(0);

  // If we don't know the precise trip count, or if the trip count that we
  // found modulo the vectorization factor is not zero, try to fold the tail
  // by masking.
  // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
  if (Legal->prepareToFoldTailByMasking()) {
    FoldTailByMasking = true;
    return MaxFactors;
  }

  // If there was a tail-folding hint/switch, but we can't fold the tail by
  // masking, fallback to a vectorization with a scalar epilogue.
  if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
    LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
                         "scalar epilogue instead.\n");
    ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
    return MaxFactors;
  }

  if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
    LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
    return FixedScalableVFPair::getNone();
  }

  if (TC == 0) {
    reportVectorizationFailure(
        "Unable to calculate the loop count due to complex control flow",
        "unable to calculate the loop count due to complex control flow",
        "UnknownLoopCountComplexCFG", ORE, TheLoop);
    return FixedScalableVFPair::getNone();
  }

  // A tail exists, tail folding failed, and no scalar epilogue is allowed:
  // vectorization is impossible under the current size constraints.
  reportVectorizationFailure(
      "Cannot optimize for size and vectorize at the same time.",
      "cannot optimize for size and vectorize at the same time. "
      "Enable vectorization of this loop with '#pragma clang loop "
      "vectorize(enable)' when compiling with -Os/-Oz",
      "NoTailLoopWithOptForSize", ORE, TheLoop);
  return FixedScalableVFPair::getNone();
}

// Maximize the VF for the target's register width (fixed or scalable,
// depending on MaxSafeVF), clamped by MaxSafeVF and by a known constant trip
// count; may widen further when maximizing vector bandwidth is requested.
ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
    unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType,
    const ElementCount &MaxSafeVF, bool FoldTailByMasking) {
  bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
  TypeSize WidestRegister = TTI.getRegisterBitWidth(
      ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
                           : TargetTransformInfo::RGK_FixedWidthVector);

  // Convenience function to return the minimum of two ElementCounts.
  auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
    assert((LHS.isScalable() == RHS.isScalable()) &&
           "Scalable flags must match");
    return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS;
  };

  // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
  // Note that both WidestRegister and WidestType may not be a powers of 2.
  auto MaxVectorElementCount = ElementCount::get(
      PowerOf2Floor(WidestRegister.getKnownMinSize() / WidestType),
      ComputeScalableMaxVF);
  MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
  LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
                    << (MaxVectorElementCount * WidestType) << " bits.\n");

  if (!MaxVectorElementCount) {
    LLVM_DEBUG(dbgs() << "LV: The target has no "
                      << (ComputeScalableMaxVF ? "scalable" : "fixed")
                      << " vector registers.\n");
    return ElementCount::getFixed(1);
  }

  const auto TripCountEC = ElementCount::getFixed(ConstTripCount);
  if (ConstTripCount &&
      ElementCount::isKnownLE(TripCountEC, MaxVectorElementCount) &&
      (!FoldTailByMasking || isPowerOf2_32(ConstTripCount))) {
    // If loop trip count (TC) is known at compile time there is no point in
    // choosing VF greater than TC (as done in the loop below). Select maximum
    // power of two which doesn't exceed TC.
    // If MaxVectorElementCount is scalable, we only fall back on a fixed VF
    // when the TC is less than or equal to the known number of lanes.
    auto ClampedConstTripCount = PowerOf2Floor(ConstTripCount);
    LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
                         "exceeding the constant trip count: "
                      << ClampedConstTripCount << "\n");
    return ElementCount::getFixed(ClampedConstTripCount);
  }

  ElementCount MaxVF = MaxVectorElementCount;
  if (TTI.shouldMaximizeVectorBandwidth() ||
      (MaximizeBandwidth && isScalarEpilogueAllowed())) {
    // Re-derive the element count from the *smallest* type so narrow lanes
    // can fill the register, then probe progressively larger VFs.
    auto MaxVectorElementCountMaxBW = ElementCount::get(
        PowerOf2Floor(WidestRegister.getKnownMinSize() / SmallestType),
        ComputeScalableMaxVF);
    MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);

    // Collect all viable vectorization factors larger than the default MaxVF
    // (i.e. MaxVectorElementCount).
    SmallVector<ElementCount, 8> VFs;
    for (ElementCount VS = MaxVectorElementCount * 2;
         ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2)
      VFs.push_back(VS);

    // For each VF calculate its register usage.
    auto RUs = calculateRegisterUsage(VFs);

    // Select the largest VF which doesn't require more registers than existing
    // ones.
    // Walk candidates from largest to smallest; take the first whose
    // per-class register pressure fits the target's register file.
    for (int i = RUs.size() - 1; i >= 0; --i) {
      bool Selected = true;
      for (auto &pair : RUs[i].MaxLocalUsers) {
        unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
        if (pair.second > TargetNumRegisters)
          Selected = false;
      }
      if (Selected) {
        MaxVF = VFs[i];
        break;
      }
    }
    // Respect a target-imposed minimum VF, if any.
    if (ElementCount MinVF =
            TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) {
      if (ElementCount::isKnownLT(MaxVF, MinVF)) {
        LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
                          << ") with target's minimum: " << MinVF << '\n');
        MaxVF = MinVF;
      }
    }
  }
  return MaxVF;
}

// Return true if vectorization factor A is more profitable than B, comparing
// estimated per-lane (or, when tail-folding with a known trip count,
// whole-loop) costs.
bool LoopVectorizationCostModel::isMoreProfitable(
    const VectorizationFactor &A, const VectorizationFactor &B) const {
  InstructionCost CostA = A.Cost;
  InstructionCost CostB = B.Cost;

  unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop);

  if (!A.Width.isScalable() && !B.Width.isScalable() && FoldTailByMasking &&
      MaxTripCount) {
    // If we are folding the tail and the trip count is a known (possibly small)
    // constant, the trip count will be rounded up to an integer number of
    // iterations. The total cost will be PerIterationCost*ceil(TripCount/VF),
    // which we compare directly. When not folding the tail, the total cost will
    // be PerIterationCost*floor(TC/VF) + Scalar remainder cost, and so is
    // approximated with the per-lane cost below instead of using the tripcount
    // as here.
    auto RTCostA = CostA * divideCeil(MaxTripCount, A.Width.getFixedValue());
    auto RTCostB = CostB * divideCeil(MaxTripCount, B.Width.getFixedValue());
    return RTCostA < RTCostB;
  }

  // Improve estimate for the vector width if it is scalable.
  unsigned EstimatedWidthA = A.Width.getKnownMinValue();
  unsigned EstimatedWidthB = B.Width.getKnownMinValue();
  if (Optional<unsigned> VScale = TTI.getVScaleForTuning()) {
    if (A.Width.isScalable())
      EstimatedWidthA *= VScale.getValue();
    if (B.Width.isScalable())
      EstimatedWidthB *= VScale.getValue();
  }

  // Assume vscale may be larger than 1 (or the value being tuned for),
  // so that scalable vectorization is slightly favorable over fixed-width
  // vectorization.
  // Note the <= here (vs < below): ties go to the scalable candidate.
  if (A.Width.isScalable() && !B.Width.isScalable())
    return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA);

  // To avoid the need for FP division:
  //     (CostA / A.Width) < (CostB / B.Width)
  // <=> (CostA * B.Width) < (CostB * A.Width)
  return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA);
}

// Pick the most profitable VF among VFCandidates, comparing each candidate's
// expected cost against the scalar loop cost; also records profitable VFs for
// later epilogue-vectorization selection and emits remarks for candidates
// with invalid costs.
VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(
    const ElementCountSet &VFCandidates) {
  InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first;
  LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
  assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
  assert(VFCandidates.count(ElementCount::getFixed(1)) &&
         "Expected Scalar VF to be a candidate");

  const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost);
  VectorizationFactor ChosenFactor = ScalarCost;

  bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
  if (ForceVectorization && VFCandidates.size() > 1) {
    // Ignore scalar width, because the user explicitly wants vectorization.
    // Initialize cost to max so that VF = 2 is, at least, chosen during cost
    // evaluation.
    ChosenFactor.Cost = InstructionCost::getMax();
  }

  SmallVector<InstructionVFPair> InvalidCosts;
  for (const auto &i : VFCandidates) {
    // The cost for scalar VF=1 is already calculated, so ignore it.
    if (i.isScalar())
      continue;

    VectorizationCostTy C = expectedCost(i, &InvalidCosts);
    VectorizationFactor Candidate(i, C.first);

#ifndef NDEBUG
    // Debug-only: report the per-lane cost, scaling scalable widths by the
    // tuning vscale when the target provides one.
    unsigned AssumedMinimumVscale = 1;
    if (Optional<unsigned> VScale = TTI.getVScaleForTuning())
      AssumedMinimumVscale = VScale.getValue();
    unsigned Width =
        Candidate.Width.isScalable()
            ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale
            : Candidate.Width.getFixedValue();
    LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
                      << " costs: " << (Candidate.Cost / Width));
    if (i.isScalable())
      LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
                        << AssumedMinimumVscale << ")");
    LLVM_DEBUG(dbgs() << ".\n");
#endif

    // C.second tracks whether any vector instruction would actually be
    // emitted for this VF; skip pointless candidates unless forced.
    if (!C.second && !ForceVectorization) {
      LLVM_DEBUG(
          dbgs() << "LV: Not considering vector loop of width " << i
                 << " because it will not generate any vector instructions.\n");
      continue;
    }

    // If profitable add it to ProfitableVF list.
    if (isMoreProfitable(Candidate, ScalarCost))
      ProfitableVFs.push_back(Candidate);

    if (isMoreProfitable(Candidate, ChosenFactor))
      ChosenFactor = Candidate;
  }

  // Emit a report of VFs with invalid costs in the loop.
  if (!InvalidCosts.empty()) {
    // Group the remarks per instruction, keeping the instruction order from
    // InvalidCosts.
    std::map<Instruction *, unsigned> Numbering;
    unsigned I = 0;
    for (auto &Pair : InvalidCosts)
      if (!Numbering.count(Pair.first))
        Numbering[Pair.first] = I++;

    // Sort the list, first on instruction(number) then on VF.
    llvm::sort(InvalidCosts,
               [&Numbering](InstructionVFPair &A, InstructionVFPair &B) {
                 if (Numbering[A.first] != Numbering[B.first])
                   return Numbering[A.first] < Numbering[B.first];
                 ElementCountComparator ECC;
                 return ECC(A.second, B.second);
               });

    // For a list of ordered instruction-vf pairs:
    //   [(load, vf1), (load, vf2), (store, vf1)]
    // Group the instructions together to emit separate remarks for:
    //   load  (vf1, vf2)
    //   store (vf1)
    auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts);
    auto Subset = ArrayRef<InstructionVFPair>();
    do {
      if (Subset.empty())
        Subset = Tail.take_front(1);

      Instruction *I = Subset.front().first;

      // If the next instruction is different, or if there are no other pairs,
      // emit a remark for the collated subset. e.g.
      //   [(load, vf1), (load, vf2))]
      // to emit:
      //  remark: invalid costs for 'load' at VF=(vf, vf2)
      if (Subset == Tail || Tail[Subset.size()].first != I) {
        std::string OutString;
        raw_string_ostream OS(OutString);
        assert(!Subset.empty() && "Unexpected empty range");
        OS << "Instruction with invalid costs prevented vectorization at VF=(";
        for (auto &Pair : Subset)
          OS << (Pair.second == Subset.front().second ? "" : ", ")
             << Pair.second;
        OS << "):";
        if (auto *CI = dyn_cast<CallInst>(I))
          OS << " call to " << CI->getCalledFunction()->getName();
        else
          OS << " " << I->getOpcodeName();
        OS.flush();
        reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I);
        Tail = Tail.drop_front(Subset.size());
        Subset = {};
      } else
        // Grow the subset by one element
        Subset = Tail.take_front(Subset.size() + 1);
    } while (!Tail.empty());
  }

  if (!EnableCondStoresVectorization && NumPredStores) {
    reportVectorizationFailure("There are conditional stores.",
        "store that is conditionally executed prevents vectorization",
        "ConditionalStore", ORE, TheLoop);
    ChosenFactor = ScalarCost;
  }

  LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
                 ChosenFactor.Cost >= ScalarCost.Cost) dbgs()
             << "LV: Vectorization seems to be not beneficial, "
             << "but was forced by a user.\n");
  LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n");
  return ChosenFactor;
}

// Return true if this loop's shape is supported by the epilogue-vectorization
// transform (no cross-iteration phis, no inductions used outside the loop or
// widened, and a latch-exiting loop).
bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization(
    const Loop &L, ElementCount VF) const {
  // Cross iteration phis such as reductions need special handling and are
  // currently unsupported.
  if (any_of(L.getHeader()->phis(), [&](PHINode &Phi) {
        return Legal->isFirstOrderRecurrence(&Phi) ||
               Legal->isReductionVariable(&Phi);
      }))
    return false;

  // Phis with uses outside of the loop require special handling and are
  // currently unsupported.
  for (auto &Entry : Legal->getInductionVars()) {
    // Look for uses of the value of the induction at the last iteration.
    Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch());
    for (User *U : PostInc->users())
      if (!L.contains(cast<Instruction>(U)))
        return false;
    // Look for uses of penultimate value of the induction.
    for (User *U : Entry.first->users())
      if (!L.contains(cast<Instruction>(U)))
        return false;
  }

  // Induction variables that are widened require special handling that is
  // currently not supported.
  if (any_of(Legal->getInductionVars(), [&](auto &Entry) {
        return !(this->isScalarAfterVectorization(Entry.first, VF) ||
                 this->isProfitableToScalarize(Entry.first, VF));
      }))
    return false;

  // Epilogue vectorization code has not been audited to ensure it handles
  // non-latch exits properly. It may be fine, but it needs audited and
  // tested.
  if (L.getExitingBlock() != L.getLoopLatch())
    return false;

  return true;
}

// Crude profitability heuristic for epilogue vectorization: require a target
// that considers interleaving beneficial and a main-loop VF of at least
// EpilogueVectorizationMinVF.
bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
    const ElementCount VF) const {
  // FIXME: We need a much better cost-model to take different parameters such
  // as register pressure, code size increase and cost of extra branches into
  // account. For now we apply a very crude heuristic and only consider loops
  // with vectorization factors larger than a certain value.
  // We also consider epilogue vectorization unprofitable for targets that don't
  // consider interleaving beneficial (eg. MVE).
  if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1)
    return false;
  if (VF.getFixedValue() >= EpilogueVectorizationMinVF)
    return true;
  return false;
}

// Choose a VF for vectorizing the epilogue loop, or
// VectorizationFactor::Disabled() when epilogue vectorization should not (or
// cannot) be performed for this loop.
VectorizationFactor
LoopVectorizationCostModel::selectEpilogueVectorizationFactor(
    const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) {
  VectorizationFactor Result = VectorizationFactor::Disabled();
  if (!EnableEpilogueVectorization) {
    LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";);
    return Result;
  }

  if (!isScalarEpilogueAllowed()) {
    LLVM_DEBUG(
        dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is "
                  "allowed.\n";);
    return Result;
  }

  // Not really a cost consideration, but check for unsupported cases here to
  // simplify the logic.
  if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) {
    LLVM_DEBUG(
        dbgs() << "LEV: Unable to vectorize epilogue because the loop is "
                  "not a supported candidate.\n";);
    return Result;
  }

  if (EpilogueVectorizationForceVF > 1) {
    LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";);
    ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF);
    // Only honour the forced factor if the planner actually built a plan
    // for it.
    if (LVP.hasPlanWithVF(ForcedEC))
      return {ForcedEC, 0};
    else {
      LLVM_DEBUG(
          dbgs()
          << "LEV: Epilogue vectorization forced factor is not viable.\n";);
      return Result;
    }
  }

  if (TheLoop->getHeader()->getParent()->hasOptSize() ||
      TheLoop->getHeader()->getParent()->hasMinSize()) {
    LLVM_DEBUG(
        dbgs()
        << "LEV: Epilogue vectorization skipped due to opt for size.\n";);
    return Result;
  }

  auto FixedMainLoopVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue());
  if (MainLoopVF.isScalable())
    LLVM_DEBUG(
        dbgs() << "LEV: Epilogue vectorization using scalable vectors not "
                  "yet supported. Converting to fixed-width (VF="
               << FixedMainLoopVF << ") instead\n");

  if (!isEpilogueVectorizationProfitable(FixedMainLoopVF)) {
    LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
                         "this loop\n");
    return Result;
  }

  // Among the already-collected profitable VFs, pick the best one that is
  // strictly smaller than the main loop's VF and has a VPlan available.
  for (auto &NextVF : ProfitableVFs)
    if (ElementCount::isKnownLT(NextVF.Width, FixedMainLoopVF) &&
        (Result.Width.getFixedValue() == 1 ||
         isMoreProfitable(NextVF, Result)) &&
        LVP.hasPlanWithVF(NextVF.Width))
      Result = NextVF;

  if (Result != VectorizationFactor::Disabled())
    LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
                      << Result.Width.getFixedValue() << "\n";);
  return Result;
}

// Return the smallest and widest scalar-type bit-widths found among the
// element types collected for widening (MaxWidth is floored at 8 bits).
std::pair<unsigned, unsigned>
LoopVectorizationCostModel::getSmallestAndWidestTypes() {
  unsigned MinWidth = -1U;
  unsigned MaxWidth = 8;
  const DataLayout &DL = TheFunction->getParent()->getDataLayout();
  for (Type *T : ElementTypesInLoop) {
    MinWidth = std::min<unsigned>(
        MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize());
    MaxWidth = std::max<unsigned>(
        MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize());
  }
  return {MinWidth, MaxWidth};
}

// Populate ElementTypesInLoop with the types relevant for choosing a VF:
// loaded/stored values and reduction recurrence types.
void LoopVectorizationCostModel::collectElementTypesForWidening() {
  ElementTypesInLoop.clear();
  // For each block.
  for (BasicBlock *BB : TheLoop->blocks()) {
    // For each instruction in the loop.
    for (Instruction &I : BB->instructionsWithoutDebug()) {
      Type *T = I.getType();

      // Skip ignored values.
      if (ValuesToIgnore.count(&I))
        continue;

      // Only examine Loads, Stores and PHINodes.
      if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
        continue;

      // Examine PHI nodes that are reduction variables. Update the type to
      // account for the recurrence type.
      if (auto *PN = dyn_cast<PHINode>(&I)) {
        if (!Legal->isReductionVariable(PN))
          continue;
        const RecurrenceDescriptor &RdxDesc =
            Legal->getReductionVars().find(PN)->second;
        // In-loop reductions don't widen the phi, so their recurrence type
        // is irrelevant for VF selection.
        if (PreferInLoopReductions || useOrderedReductions(RdxDesc) ||
            TTI.preferInLoopReduction(RdxDesc.getOpcode(),
                                      RdxDesc.getRecurrenceType(),
                                      TargetTransformInfo::ReductionFlags()))
          continue;
        T = RdxDesc.getRecurrenceType();
      }

      // Examine the stored values.
      if (auto *ST = dyn_cast<StoreInst>(&I))
        T = ST->getValueOperand()->getType();

      // Ignore loaded pointer types and stored pointer types that are not
      // vectorizable.
      //
      // FIXME: The check here attempts to predict whether a load or store will
      //        be vectorized. We only know this for certain after a VF has
      //        been selected. Here, we assume that if an access can be
      //        vectorized, it will be. We should also look at extending this
      //        optimization to non-pointer types.
      //
      if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
          !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
        continue;

      ElementTypesInLoop.insert(T);
    }
  }
}

// Select the interleave (unroll) count for the chosen VF, balancing ILP
// exposure against register pressure and the loop's (known or estimated)
// trip count.
unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
                                                           unsigned LoopCost) {
  // -- The interleave heuristics --
  // We interleave the loop in order to expose ILP and reduce the loop overhead.
  // There are many micro-architectural considerations that we can't predict
  // at this level. For example, frontend pressure (on decode or fetch) due to
  // code size, or the number and capabilities of the execution ports.
  //
  // We use the following heuristics to select the interleave count:
  // 1. If the code has reductions, then we interleave to break the cross
  // iteration dependency.
  // 2. If the loop is really small, then we interleave to reduce the loop
  // overhead.
  // 3.
  // We don't interleave if we think that we will spill registers to memory
  // due to the increased register pressure.

  if (!isScalarEpilogueAllowed())
    return 1;

  // We used the distance for the interleave count.
  if (Legal->getMaxSafeDepDistBytes() != -1U)
    return 1;

  auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
  const bool HasReductions = !Legal->getReductionVars().empty();
  // Do not interleave loops with a relatively small known or estimated trip
  // count. But we will interleave when InterleaveSmallLoopScalarReduction is
  // enabled, and the code has scalar reductions(HasReductions && VF = 1),
  // because with the above conditions interleaving can expose ILP and break
  // cross iteration dependences for reductions.
  if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) &&
      !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar()))
    return 1;

  RegisterUsage R = calculateRegisterUsage({VF})[0];
  // We divide by these constants so assume that we have at least one
  // instruction that uses at least one register.
  for (auto& pair : R.MaxLocalUsers) {
    pair.second = std::max(pair.second, 1U);
  }

  // We calculate the interleave count using the following formula.
  // Subtract the number of loop invariants from the number of available
  // registers. These registers are used by all of the interleaved instances.
  // Next, divide the remaining registers by the number of registers that is
  // required by the loop, in order to estimate how many parallel instances
  // fit without causing spills. All of this is rounded down if necessary to be
  // a power of two. We want power of two interleave count to simplify any
  // addressing operations or alignment considerations.
  // We also want power of two interleave counts to ensure that the induction
  // variable of the vector loop wraps to zero, when tail is folded by masking;
  // this currently happens when OptForSize, in which case IC is set to 1 above.
  unsigned IC = UINT_MAX;

  // Take the minimum fitting interleave count over all register classes.
  for (auto& pair : R.MaxLocalUsers) {
    unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
    LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
                      << " registers of "
                      << TTI.getRegisterClassName(pair.first)
                      << " register class\n");
    if (VF.isScalar()) {
      if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
        TargetNumRegisters = ForceTargetNumScalarRegs;
    } else {
      if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
        TargetNumRegisters = ForceTargetNumVectorRegs;
    }
    unsigned MaxLocalUsers = pair.second;
    unsigned LoopInvariantRegs = 0;
    if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
      LoopInvariantRegs = R.LoopInvariantRegs[pair.first];

    unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) /
                                   MaxLocalUsers);
    // Don't count the induction variable as interleaved.
    if (EnableIndVarRegisterHeur) {
      TmpIC =
          PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
                        std::max(1U, (MaxLocalUsers - 1)));
    }

    IC = std::min(IC, TmpIC);
  }

  // Clamp the interleave ranges to reasonable counts.
  unsigned MaxInterleaveCount =
      TTI.getMaxInterleaveFactor(VF.getKnownMinValue());

  // Check if the user has overridden the max.
  if (VF.isScalar()) {
    if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
      MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
  } else {
    if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
      MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
  }

  // If trip count is known or estimated compile time constant, limit the
  // interleave count to be less than the trip count divided by VF, provided it
  // is at least 1.
  //
  // For scalable vectors we can't know if interleaving is beneficial. It may
  // not be beneficial for small loops if none of the lanes in the second vector
  // iterations is enabled. However, for larger loops, there is likely to be a
  // similar benefit as for fixed-width vectors. For now, we choose to leave
  // the InterleaveCount as if vscale is '1', although if some information about
  // the vector is known (e.g. min vector size), we can make a better decision.
  if (BestKnownTC) {
    MaxInterleaveCount =
        std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount);
    // Make sure MaxInterleaveCount is greater than 0.
    MaxInterleaveCount = std::max(1u, MaxInterleaveCount);
  }

  assert(MaxInterleaveCount > 0 &&
         "Maximum interleave count must be greater than 0");

  // Clamp the calculated IC to be between the 1 and the max interleave count
  // that the target and trip count allows.
  if (IC > MaxInterleaveCount)
    IC = MaxInterleaveCount;
  else
    // Make sure IC is greater than 0.
    IC = std::max(1u, IC);

  assert(IC > 0 && "Interleave count must be greater than 0.");

  // If we did not calculate the cost for VF (because the user selected the VF)
  // then we calculate the cost of VF here.
  if (LoopCost == 0) {
    InstructionCost C = expectedCost(VF).first;
    assert(C.isValid() && "Expected to have chosen a VF with valid cost");
    LoopCost = *C.getValue();
  }

  assert(LoopCost && "Non-zero loop cost expected");

  // Interleave if we vectorized this loop and there is a reduction that could
  // benefit from interleaving.
  if (VF.isVector() && HasReductions) {
    LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
    return IC;
  }

  // Note that if we've already vectorized the loop we will have done the
  // runtime check and so interleaving won't require further checks.
  bool InterleavingRequiresRuntimePointerCheck =
      (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);

  // We want to interleave small loops in order to reduce the loop overhead and
  // potentially expose ILP opportunities.
  LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
                    << "LV: IC is " << IC << '\n'
                    << "LV: VF is " << VF << '\n');
  const bool AggressivelyInterleaveReductions =
      TTI.enableAggressiveInterleaving(HasReductions);
  if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
    // We assume that the cost overhead is 1 and we use the cost model
    // to estimate the cost of the loop and interleave until the cost of the
    // loop overhead is about 5% of the cost of the loop.
    unsigned SmallIC =
        std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));

    // Interleave until store/load ports (estimated by max interleave count) are
    // saturated.
    unsigned NumStores = Legal->getNumStores();
    unsigned NumLoads = Legal->getNumLoads();
    unsigned StoresIC = IC / (NumStores ? NumStores : 1);
    unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);

    // There is little point in interleaving for reductions containing selects
    // and compares when VF=1 since it may just create more overhead than it's
    // worth for loops with small trip counts. This is because we still have to
    // do the final reduction after the loop.
    bool HasSelectCmpReductions =
        HasReductions &&
        any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
          const RecurrenceDescriptor &RdxDesc = Reduction.second;
          return RecurrenceDescriptor::isSelectCmpRecurrenceKind(
              RdxDesc.getRecurrenceKind());
        });
    if (HasSelectCmpReductions) {
      LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n");
      return 1;
    }

    // If we have a scalar reduction (vector reductions are already dealt with
    // by this point), we can increase the critical path length if the loop
    // we're interleaving is inside another loop. For tree-wise reductions
    // set the limit to 2, and for ordered reductions it's best to disable
    // interleaving entirely.
    if (HasReductions && TheLoop->getLoopDepth() > 1) {
      bool HasOrderedReductions =
          any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
            const RecurrenceDescriptor &RdxDesc = Reduction.second;
            return RdxDesc.isOrdered();
          });
      if (HasOrderedReductions) {
        LLVM_DEBUG(
            dbgs() << "LV: Not interleaving scalar ordered reductions.\n");
        return 1;
      }

      // Cap all candidate counts by the nested-scalar-reduction limit.
      unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
      SmallIC = std::min(SmallIC, F);
      StoresIC = std::min(StoresIC, F);
      LoadsIC = std::min(LoadsIC, F);
    }

    if (EnableLoadStoreRuntimeInterleave &&
        std::max(StoresIC, LoadsIC) > SmallIC) {
      LLVM_DEBUG(
          dbgs() << "LV: Interleaving to saturate store or load ports.\n");
      return std::max(StoresIC, LoadsIC);
    }

    // If there are scalar reductions and TTI has enabled aggressive
    // interleaving for reductions, we will interleave to expose ILP.
    if (InterleaveSmallLoopScalarReduction && VF.isScalar() &&
        AggressivelyInterleaveReductions) {
      LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
      // Interleave no less than SmallIC but not as aggressive as the normal IC
      // to satisfy the rare situation when resources are too limited.
      return std::max(IC / 2, SmallIC);
    } else {
      LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
      return SmallIC;
    }
  }

  // Interleave if this is a large loop (small loops are already dealt with by
  // this point) that could benefit from interleaving.
  if (AggressivelyInterleaveReductions) {
    LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
    return IC;
  }

  LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
  return 1;
}

SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
  // This function calculates the register usage by measuring the highest number
  // of values that are alive at a single location. Obviously, this is a very
  // rough estimation. We scan the loop in topological order and
  // assign a number to each instruction. We use RPO to ensure that defs are
  // met before their users. We assume that each instruction that has in-loop
  // users starts an interval. We record every time that an in-loop value is
  // used, so we have a list of the first and last occurrences of each
  // instruction. Next, we transpose this data structure into a multi map that
  // holds the list of intervals that *end* at a specific location. This multi
  // map allows us to perform a linear search. We scan the instructions linearly
  // and record each time that a new interval starts, by placing it in a set.
  // If we find this value in the multi-map then we remove it from the set.
  // The max register usage is the maximum size of the set.
  // We also search for instructions that are defined outside the loop, but are
  // used inside the loop. We need this number separately from the max-interval
  // usage number because when we unroll, loop-invariant values do not take
  // more registers.
  LoopBlocksDFS DFS(TheLoop);
  DFS.perform(LI);

  RegisterUsage RU;

  // Each 'key' in the map opens a new interval. The values
  // of the map are the index of the 'last seen' usage of the
  // instruction that is the key.
  using IntervalMap = DenseMap<Instruction *, unsigned>;

  // Maps instruction to its index.
  SmallVector<Instruction *, 64> IdxToInstr;
  // Marks the end of each interval.
  IntervalMap EndPoint;
  // Saves the list of instruction indices that are used in the loop.
  SmallPtrSet<Instruction *, 8> Ends;
  // Saves the list of values that are used in the loop but are
  // defined outside the loop, such as arguments and constants.
  SmallPtrSet<Value *, 8> LoopInvariants;

  for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
    for (Instruction &I : BB->instructionsWithoutDebug()) {
      IdxToInstr.push_back(&I);

      // Save the end location of each USE.
      for (Value *U : I.operands()) {
        auto *Instr = dyn_cast<Instruction>(U);

        // Ignore non-instruction values such as arguments, constants, etc.
        if (!Instr)
          continue;

        // If this instruction is outside the loop then record it and continue.
        if (!TheLoop->contains(Instr)) {
          LoopInvariants.insert(Instr);
          continue;
        }

        // Overwrite previous end points. Since &I was just pushed,
        // IdxToInstr.size() is the 1-based index of the current user.
        EndPoint[Instr] = IdxToInstr.size();
        Ends.insert(Instr);
      }
    }
  }

  // Saves the list of intervals that end with the index in 'key'.
  using InstrList = SmallVector<Instruction *, 2>;
  DenseMap<unsigned, InstrList> TransposeEnds;

  // Transpose the EndPoints to a list of values that end at each index.
  for (auto &Interval : EndPoint)
    TransposeEnds[Interval.second].push_back(Interval.first);

  SmallPtrSet<Instruction *, 8> OpenIntervals;
  SmallVector<RegisterUsage, 8> RUs(VFs.size());
  SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());

  LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");

  // A lambda that gets the register usage for the given type and VF.
  // (TTI is captured through a local reference alias.)
  const auto &TTICapture = TTI;
  auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned {
    if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty))
      return 0;
    InstructionCost::CostType RegUsage =
        *TTICapture.getRegUsageForType(VectorType::get(Ty, VF)).getValue();
    assert(RegUsage >= 0 && RegUsage <= std::numeric_limits<unsigned>::max() &&
           "Nonsensical values for register usage.");
    return RegUsage;
  };

  for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
    Instruction *I = IdxToInstr[i];

    // Remove all of the instructions that end at this location.
    InstrList &List = TransposeEnds[i];
    for (Instruction *ToRemove : List)
      OpenIntervals.erase(ToRemove);

    // Ignore instructions that are never used within the loop.
    if (!Ends.count(I))
      continue;

    // Skip ignored values.
    if (ValuesToIgnore.count(I))
      continue;

    // For each VF find the maximum usage of registers.
    for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
      // Count the number of live intervals, bucketed by register class.
      SmallMapVector<unsigned, unsigned, 4> RegUsage;

      if (VFs[j].isScalar()) {
        // Scalar VF: each open value occupies one scalar register.
        for (auto Inst : OpenIntervals) {
          unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
          if (RegUsage.find(ClassID) == RegUsage.end())
            RegUsage[ClassID] = 1;
          else
            RegUsage[ClassID] += 1;
        }
      } else {
        collectUniformsAndScalars(VFs[j]);
        for (auto Inst : OpenIntervals) {
          // Skip ignored values for VF > 1.
          if (VecValuesToIgnore.count(Inst))
            continue;
          if (isScalarAfterVectorization(Inst, VFs[j])) {
            // Values that stay scalar occupy one scalar register each.
            unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
            if (RegUsage.find(ClassID) == RegUsage.end())
              RegUsage[ClassID] = 1;
            else
              RegUsage[ClassID] += 1;
          } else {
            // Widened values may need several vector registers (per TTI).
            unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType());
            if (RegUsage.find(ClassID) == RegUsage.end())
              RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]);
            else
              RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
          }
        }
      }

      // Fold this point's per-class usage into the running maxima for VF j.
      for (auto& pair : RegUsage) {
        if (MaxUsages[j].find(pair.first) != MaxUsages[j].end())
          MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second);
        else
          MaxUsages[j][pair.first] = pair.second;
      }
    }

    LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
                      << OpenIntervals.size() << '\n');

    // Add the current instruction to the list of open intervals.
    OpenIntervals.insert(I);
  }

  for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
    SmallMapVector<unsigned, unsigned, 4> Invariant;

    // Loop-invariant values are counted once per register class; they are
    // shared by all interleaved copies of the loop body.
    for (auto Inst : LoopInvariants) {
      unsigned Usage =
          VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
      unsigned ClassID =
          TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType());
      if (Invariant.find(ClassID) == Invariant.end())
        Invariant[ClassID] = Usage;
      else
        Invariant[ClassID] += Usage;
    }

    LLVM_DEBUG({
      dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
      dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
             << " item\n";
      for (const auto &pair : MaxUsages[i]) {
        dbgs() << "LV(REG): RegisterClass: "
               << TTI.getRegisterClassName(pair.first) << ", " << pair.second
               << " registers\n";
      }
      dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
             << " item\n";
      for (const auto &pair : Invariant) {
        dbgs() << "LV(REG): RegisterClass: "
               << TTI.getRegisterClassName(pair.first) << ", " << pair.second
               << " registers\n";
      }
    });

    RU.LoopInvariantRegs = Invariant;
    RU.MaxLocalUsers = MaxUsages[i];
    RUs[i] = RU;
  }

  return RUs;
}

bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){
  // TODO: Cost model for emulated masked load/store is completely
  // broken. This hack guides the cost model to use an artificially
  // high enough value to practically disable vectorization with such
  // operations, except where previously deployed legality hack allowed
  // using very low cost values. This is to avoid regressions coming simply
  // from moving "masked load/store" check from legality to cost model.
  // Masked Load/Gather emulation was previously never allowed.
  // Limited number of Masked Store/Scatter emulation was allowed.
  assert(isPredicatedInst(I) &&
         "Expecting a scalar emulated instruction");
  return isa<LoadInst>(I) ||
         (isa<StoreInst>(I) &&
          NumPredStores > NumberOfStoresToPredicate);
}

void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
  // If we aren't vectorizing the loop, or if we've already collected the
  // instructions to scalarize, there's nothing to do. Collection may already
  // have occurred if we have a user-selected VF and are now computing the
  // expected cost for interleaving.
  if (VF.isScalar() || VF.isZero() ||
      InstsToScalarize.find(VF) != InstsToScalarize.end())
    return;

  // Initialize a mapping for VF in InstsToScalarize. If we find that it's
  // not profitable to scalarize any instructions, the presence of VF in the
  // map will indicate that we've analyzed it already.
  ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];

  // Find all the instructions that are scalar with predication in the loop and
  // determine if it would be better to not if-convert the blocks they are in.
  // If so, we also record the instructions to scalarize.
  for (BasicBlock *BB : TheLoop->blocks()) {
    if (!blockNeedsPredicationForAnyReason(BB))
      continue;
    for (Instruction &I : *BB)
      if (isScalarWithPredication(&I)) {
        ScalarCostsTy ScalarCosts;
        // Do not apply discount if scalable, because that would lead to
        // invalid scalarization costs.
        // Do not apply discount logic if hacked cost is needed
        // for emulated masked memrefs.
        if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I) &&
            computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
          ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
        // Remember that BB will remain after vectorization.
        PredicatedBBsAfterVectorization.insert(BB);
      }
  }
}

int LoopVectorizationCostModel::computePredInstDiscount(
    Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
  assert(!isUniformAfterVectorization(PredInst, VF) &&
         "Instruction marked uniform-after-vectorization will be predicated");

  // Initialize the discount to zero, meaning that the scalar version and the
  // vector version cost the same.
  InstructionCost Discount = 0;

  // Holds instructions to analyze. The instructions we visit are mapped in
  // ScalarCosts. Those instructions are the ones that would be scalarized if
  // we find that the scalar version costs less.
  SmallVector<Instruction *, 8> Worklist;

  // Returns true if the given instruction can be scalarized.
  auto canBeScalarized = [&](Instruction *I) -> bool {
    // We only attempt to scalarize instructions forming a single-use chain
    // from the original predicated block that would otherwise be vectorized.
    // Although not strictly necessary, we give up on instructions we know will
    // already be scalar to avoid traversing chains that are unlikely to be
    // beneficial.
    if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
        isScalarAfterVectorization(I, VF))
      return false;

    // If the instruction is scalar with predication, it will be analyzed
    // separately. We ignore it within the context of PredInst.
    if (isScalarWithPredication(I))
      return false;

    // If any of the instruction's operands are uniform after vectorization,
    // the instruction cannot be scalarized. This prevents, for example, a
    // masked load from being scalarized.
    //
    // We assume we will only emit a value for lane zero of an instruction
    // marked uniform after vectorization, rather than VF identical values.
    // Thus, if we scalarize an instruction that uses a uniform, we would
    // create uses of values corresponding to the lanes we aren't emitting code
    // for. This behavior can be changed by allowing getScalarValue to clone
    // the lane zero values for uniforms rather than asserting.
    for (Use &U : I->operands())
      if (auto *J = dyn_cast<Instruction>(U.get()))
        if (isUniformAfterVectorization(J, VF))
          return false;

    // Otherwise, we can scalarize the instruction.
    return true;
  };

  // Compute the expected cost discount from scalarizing the entire expression
  // feeding the predicated instruction. We currently only consider expressions
  // that are single-use instruction chains.
  Worklist.push_back(PredInst);
  while (!Worklist.empty()) {
    Instruction *I = Worklist.pop_back_val();

    // If we've already analyzed the instruction, there's nothing to do.
    if (ScalarCosts.find(I) != ScalarCosts.end())
      continue;

    // Compute the cost of the vector instruction. Note that this cost already
    // includes the scalarization overhead of the predicated instruction.
    InstructionCost VectorCost = getInstructionCost(I, VF).first;

    // Compute the cost of the scalarized instruction. This cost is the cost of
    // the instruction as if it wasn't if-converted and instead remained in the
    // predicated block. We will scale this cost by block probability after
    // computing the scalarization overhead.
    InstructionCost ScalarCost =
        VF.getFixedValue() *
        getInstructionCost(I, ElementCount::getFixed(1)).first;

    // Compute the scalarization overhead of needed insertelement instructions
    // and phi nodes.
    if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
      ScalarCost += TTI.getScalarizationOverhead(
          cast<VectorType>(ToVectorTy(I->getType(), VF)),
          APInt::getAllOnes(VF.getFixedValue()), true, false);
      ScalarCost +=
          VF.getFixedValue() *
          TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput);
    }

    // Compute the scalarization overhead of needed extractelement
    // instructions. For each of the instruction's operands, if the operand can
    // be scalarized, add it to the worklist; otherwise, account for the
    // overhead.
    for (Use &U : I->operands())
      if (auto *J = dyn_cast<Instruction>(U.get())) {
        assert(VectorType::isValidElementType(J->getType()) &&
               "Instruction has non-scalar type");
        if (canBeScalarized(J))
          Worklist.push_back(J);
        else if (needsExtract(J, VF)) {
          ScalarCost += TTI.getScalarizationOverhead(
              cast<VectorType>(ToVectorTy(J->getType(), VF)),
              APInt::getAllOnes(VF.getFixedValue()), false, true);
        }
      }

    // Scale the total scalar cost by block probability.
    ScalarCost /= getReciprocalPredBlockProb();

    // Compute the discount. A non-negative discount means the vector version
    // of the instruction costs more, and scalarizing would be beneficial.
    Discount += VectorCost - ScalarCost;
    ScalarCosts[I] = ScalarCost;
  }

  return *Discount.getValue();
}

LoopVectorizationCostModel::VectorizationCostTy
LoopVectorizationCostModel::expectedCost(
    ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) {
  VectorizationCostTy Cost;

  // For each block.
  for (BasicBlock *BB : TheLoop->blocks()) {
    VectorizationCostTy BlockCost;

    // For each instruction in the old loop.
    for (Instruction &I : BB->instructionsWithoutDebug()) {
      // Skip ignored values.
      if (ValuesToIgnore.count(&I) ||
          (VF.isVector() && VecValuesToIgnore.count(&I)))
        continue;

      VectorizationCostTy C = getInstructionCost(&I, VF);

      // Check if we should override the cost.
      if (C.first.isValid() &&
          ForceTargetInstructionCost.getNumOccurrences() > 0)
        C.first = InstructionCost(ForceTargetInstructionCost);

      // Keep a list of instructions with invalid costs.
      if (Invalid && !C.first.isValid())
        Invalid->emplace_back(&I, VF);

      BlockCost.first += C.first;
      BlockCost.second |= C.second;
      LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
                        << " for VF " << VF << " For instruction: " << I
                        << '\n');
    }

    // If we are vectorizing a predicated block, it will have been
    // if-converted. This means that the block's instructions (aside from
    // stores and instructions that may divide by zero) will now be
    // unconditionally executed. For the scalar case, we may not always execute
    // the predicated block, if it is an if-else block. Thus, scale the block's
    // cost by the probability of executing it. blockNeedsPredication from
    // Legal is used so as to not include all blocks in tail folded loops.
    if (VF.isScalar() && Legal->blockNeedsPredication(BB))
      BlockCost.first /= getReciprocalPredBlockProb();

    Cost.first += BlockCost.first;
    Cost.second |= BlockCost.second;
  }

  return Cost;
}

/// Gets Address Access SCEV after verifying that the access pattern
/// is loop invariant except the induction variable dependence.
///
/// This SCEV can be sent to the Target in order to estimate the address
/// calculation cost. Returns nullptr when \p Ptr is not a GEP or when any
/// GEP index is neither loop invariant nor an induction variable.
static const SCEV *getAddressAccessSCEV(
      Value *Ptr,
      LoopVectorizationLegality *Legal,
      PredicatedScalarEvolution &PSE,
      const Loop *TheLoop) {

  auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
  if (!Gep)
    return nullptr;

  // We are looking for a gep with all loop invariant indices except for one
  // which should be an induction variable.
  auto SE = PSE.getSE();
  unsigned NumOperands = Gep->getNumOperands();
  // Operand 0 is the base pointer; indices start at operand 1.
  for (unsigned i = 1; i < NumOperands; ++i) {
    Value *Opd = Gep->getOperand(i);
    if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
        !Legal->isInductionVariable(Opd))
      return nullptr;
  }

  // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV.
  return PSE.getSCEV(Ptr);
}

/// Returns true if either operand of \p I is a value with a known symbolic
/// stride recorded by legality analysis.
static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
  return Legal->hasStride(I->getOperand(0)) ||
         Legal->hasStride(I->getOperand(1));
}

InstructionCost
LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
                                                        ElementCount VF) {
  assert(VF.isVector() &&
         "Scalarization cost of instruction implies vectorization.");
  // Scalarization cost cannot be expressed for scalable VFs (the number of
  // lanes is unknown at compile time).
  if (VF.isScalable())
    return InstructionCost::getInvalid();

  Type *ValTy = getLoadStoreType(I);
  auto SE = PSE.getSE();

  unsigned AS = getLoadStoreAddressSpace(I);
  Value *Ptr = getLoadStorePointerOperand(I);
  Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
  // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost`
  //       that it is being called from this specific place.

  // Figure out whether the access is strided and get the stride value
  // if it's known in compile time
  const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);

  // Get the cost of the scalar memory instruction and address computation.
  InstructionCost Cost =
      VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);

  // Don't pass *I here, since it is scalar but will actually be part of a
  // vectorized loop where the user of it is a vectorized instruction.
  const Align Alignment = getLoadStoreAlignment(I);
  Cost += VF.getKnownMinValue() *
          TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment,
                              AS, TTI::TCK_RecipThroughput);

  // Get the overhead of the extractelement and insertelement instructions
  // we might create due to scalarization.
  Cost += getScalarizationOverhead(I, VF);

  // If we have a predicated load/store, it will need extra i1 extracts and
  // conditional branches, but may not be executed for each vector lane. Scale
  // the cost by the probability of executing the predicated block.
  if (isPredicatedInst(I)) {
    Cost /= getReciprocalPredBlockProb();

    // Add the cost of an i1 extract and a branch
    auto *Vec_i1Ty =
        VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF);
    Cost += TTI.getScalarizationOverhead(
        Vec_i1Ty, APInt::getAllOnes(VF.getKnownMinValue()),
        /*Insert=*/false, /*Extract=*/true);
    Cost += TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput);

    if (useEmulatedMaskMemRefHack(I))
      // Artificially setting to a high enough value to practically disable
      // vectorization with such operations.
      Cost = 3000000;
  }

  return Cost;
}

InstructionCost
LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
                                                    ElementCount VF) {
  Type *ValTy = getLoadStoreType(I);
  auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
  Value *Ptr = getLoadStorePointerOperand(I);
  unsigned AS = getLoadStoreAddressSpace(I);
  int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr);
  enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;

  assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
         "Stride should be 1 or -1 for consecutive memory access");
  const Align Alignment = getLoadStoreAlignment(I);
  InstructionCost Cost = 0;
  if (Legal->isMaskRequired(I))
    Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
                                      CostKind);
  else
    Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
                                CostKind, I);

  // A negative stride means the access walks backwards; add the cost of the
  // reverse shuffle needed to restore element order.
  bool Reverse = ConsecutiveStride < 0;
  if (Reverse)
    Cost +=
        TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0);
  return Cost;
}

InstructionCost
LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
                                                ElementCount VF) {
  assert(Legal->isUniformMemOp(*I));

  Type *ValTy = getLoadStoreType(I);
  auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
  const Align Alignment = getLoadStoreAlignment(I);
  unsigned AS = getLoadStoreAddressSpace(I);
  enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  if (isa<LoadInst>(I)) {
    // Uniform load: one scalar load plus a broadcast to all vector lanes.
    return TTI.getAddressComputationCost(ValTy) +
           TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
                               CostKind) +
           TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
  }
  StoreInst *SI = cast<StoreInst>(I);

  // Uniform store: one scalar store; if the stored value is not loop
  // invariant, the last lane must first be extracted from the vector.
  bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
  return TTI.getAddressComputationCost(ValTy) +
         TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
                             CostKind) +
         (isLoopInvariantStoreValue
              ? 0
              : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
                                       VF.getKnownMinValue() - 1));
}

InstructionCost
LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
                                                 ElementCount VF) {
  Type *ValTy = getLoadStoreType(I);
  auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
  const Align Alignment = getLoadStoreAlignment(I);
  const Value *Ptr = getLoadStorePointerOperand(I);

  return TTI.getAddressComputationCost(VectorTy) +
         TTI.getGatherScatterOpCost(
             I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
             TargetTransformInfo::TCK_RecipThroughput, I);
}

InstructionCost
LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
                                                   ElementCount VF) {
  // TODO: Once we have support for interleaving with scalable vectors
  // we can calculate the cost properly here.
  if (VF.isScalable())
    return InstructionCost::getInvalid();

  Type *ValTy = getLoadStoreType(I);
  auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
  unsigned AS = getLoadStoreAddressSpace(I);

  auto Group = getInterleavedAccessGroup(I);
  assert(Group && "Fail to get an interleaved access group.");

  // The wide vector covers all members of the group: VF * factor elements.
  unsigned InterleaveFactor = Group->getFactor();
  auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);

  // Holds the indices of existing members in the interleaved group.
  SmallVector<unsigned, 4> Indices;
  for (unsigned IF = 0; IF < InterleaveFactor; IF++)
    if (Group->getMember(IF))
      Indices.push_back(IF);

  // Calculate the cost of the whole interleaved group.
6883 bool UseMaskForGaps = 6884 (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) || 6885 (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor())); 6886 InstructionCost Cost = TTI.getInterleavedMemoryOpCost( 6887 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(), 6888 AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps); 6889 6890 if (Group->isReverse()) { 6891 // TODO: Add support for reversed masked interleaved access. 6892 assert(!Legal->isMaskRequired(I) && 6893 "Reverse masked interleaved access not supported."); 6894 Cost += 6895 Group->getNumMembers() * 6896 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0); 6897 } 6898 return Cost; 6899 } 6900 6901 Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost( 6902 Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) { 6903 using namespace llvm::PatternMatch; 6904 // Early exit for no inloop reductions 6905 if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty)) 6906 return None; 6907 auto *VectorTy = cast<VectorType>(Ty); 6908 6909 // We are looking for a pattern of, and finding the minimal acceptable cost: 6910 // reduce(mul(ext(A), ext(B))) or 6911 // reduce(mul(A, B)) or 6912 // reduce(ext(A)) or 6913 // reduce(A). 6914 // The basic idea is that we walk down the tree to do that, finding the root 6915 // reduction instruction in InLoopReductionImmediateChains. From there we find 6916 // the pattern of mul/ext and test the cost of the entire pattern vs the cost 6917 // of the components. If the reduction cost is lower then we return it for the 6918 // reduction instruction and 0 for the other instructions in the pattern. If 6919 // it is not we return an invalid cost specifying the orignal cost method 6920 // should be used. 
  Instruction *RetI = I;
  // Step over a single-use extend feeding the reduction.
  if (match(RetI, m_ZExtOrSExt(m_Value()))) {
    if (!RetI->hasOneUser())
      return None;
    RetI = RetI->user_back();
  }
  // Step over a single-use mul whose user is the reducing add.
  if (match(RetI, m_Mul(m_Value(), m_Value())) &&
      RetI->user_back()->getOpcode() == Instruction::Add) {
    if (!RetI->hasOneUser())
      return None;
    RetI = RetI->user_back();
  }

  // Test if the found instruction is a reduction, and if not return an invalid
  // cost specifying the parent to use the original cost modelling.
  if (!InLoopReductionImmediateChains.count(RetI))
    return None;

  // Find the reduction this chain is a part of and calculate the basic cost of
  // the reduction on its own.
  Instruction *LastChain = InLoopReductionImmediateChains[RetI];
  Instruction *ReductionPhi = LastChain;
  while (!isa<PHINode>(ReductionPhi))
    ReductionPhi = InLoopReductionImmediateChains[ReductionPhi];

  const RecurrenceDescriptor &RdxDesc =
      Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second;

  InstructionCost BaseCost = TTI.getArithmeticReductionCost(
      RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind);

  // For a call to the llvm.fmuladd intrinsic we need to add the cost of a
  // normal fmul instruction to the cost of the fadd reduction.
  if (RdxDesc.getRecurrenceKind() == RecurKind::FMulAdd)
    BaseCost +=
        TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind);

  // If we're using ordered reductions then we can just return the base cost
  // here, since getArithmeticReductionCost calculates the full ordered
  // reduction cost when FP reassociation is not allowed.
  if (useOrderedReductions(RdxDesc))
    return BaseCost;

  // Get the operand that was not the reduction chain and match it to one of the
  // patterns, returning the better cost if it is found.
  Instruction *RedOp = RetI->getOperand(1) == LastChain
                           ? dyn_cast<Instruction>(RetI->getOperand(0))
                           : dyn_cast<Instruction>(RetI->getOperand(1));

  VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy);

  Instruction *Op0, *Op1;
  if (RedOp &&
      match(RedOp,
            m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) &&
      match(Op0, m_ZExtOrSExt(m_Value())) &&
      Op0->getOpcode() == Op1->getOpcode() &&
      Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
      !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) &&
      (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {

    // Matched reduce(ext(mul(ext(A), ext(B)))
    // Note that the extend opcodes need to all match, or if A==B they will have
    // been converted to zext(mul(sext(A), sext(A))) as it is known positive,
    // which is equally fine.
    bool IsUnsigned = isa<ZExtInst>(Op0);
    auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy);
    auto *MulType = VectorType::get(Op0->getType(), VectorTy);

    InstructionCost ExtCost =
        TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType,
                             TTI::CastContextHint::None, CostKind, Op0);
    InstructionCost MulCost =
        TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind);
    InstructionCost Ext2Cost =
        TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType,
                             TTI::CastContextHint::None, CostKind, RedOp);

    InstructionCost RedCost = TTI.getExtendedAddReductionCost(
        /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
        CostKind);

    // Compare the fused MLA-reduction cost against the sum of the parts
    // (two input extends, the mul, the outer extend, and the plain reduce).
    if (RedCost.isValid() &&
        RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost)
      return I == RetI ? RedCost : 0;
  } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) &&
             !TheLoop->isLoopInvariant(RedOp)) {
    // Matched reduce(ext(A))
    bool IsUnsigned = isa<ZExtInst>(RedOp);
    auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy);
    InstructionCost RedCost = TTI.getExtendedAddReductionCost(
        /*IsMLA=*/false, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
        CostKind);

    InstructionCost ExtCost =
        TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType,
                             TTI::CastContextHint::None, CostKind, RedOp);
    if (RedCost.isValid() && RedCost < BaseCost + ExtCost)
      return I == RetI ? RedCost : 0;
  } else if (RedOp &&
             match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) {
    if (match(Op0, m_ZExtOrSExt(m_Value())) &&
        Op0->getOpcode() == Op1->getOpcode() &&
        !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) {
      bool IsUnsigned = isa<ZExtInst>(Op0);
      Type *Op0Ty = Op0->getOperand(0)->getType();
      Type *Op1Ty = Op1->getOperand(0)->getType();
      Type *LargestOpTy =
          Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty
                                                                    : Op0Ty;
      auto *ExtType = VectorType::get(LargestOpTy, VectorTy);

      // Matched reduce(mul(ext(A), ext(B))), where the two ext may be of
      // different sizes. We take the largest type as the ext to reduce, and add
      // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))).
      InstructionCost ExtCost0 = TTI.getCastInstrCost(
          Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy),
          TTI::CastContextHint::None, CostKind, Op0);
      InstructionCost ExtCost1 = TTI.getCastInstrCost(
          Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy),
          TTI::CastContextHint::None, CostKind, Op1);
      InstructionCost MulCost =
          TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);

      InstructionCost RedCost = TTI.getExtendedAddReductionCost(
          /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
          CostKind);
      // If the operand types differ, the narrower one needs an extra extend
      // up to the common (largest) type.
      InstructionCost ExtraExtCost = 0;
      if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) {
        Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1;
        ExtraExtCost = TTI.getCastInstrCost(
            ExtraExtOp->getOpcode(), ExtType,
            VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy),
            TTI::CastContextHint::None, CostKind, ExtraExtOp);
      }

      if (RedCost.isValid() &&
          (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost))
        return I == RetI ? RedCost : 0;
    } else if (!match(I, m_ZExtOrSExt(m_Value()))) {
      // Matched reduce(mul())
      InstructionCost MulCost =
          TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);

      InstructionCost RedCost = TTI.getExtendedAddReductionCost(
          /*IsMLA=*/true, true, RdxDesc.getRecurrenceType(), VectorTy,
          CostKind);

      if (RedCost.isValid() && RedCost < MulCost + BaseCost)
        return I == RetI ? RedCost : 0;
    }
  }

  // No cheaper fused pattern: the reduction root reports the base reduction
  // cost; other instructions in the chain fall back to normal costing.
  return I == RetI ? Optional<InstructionCost>(BaseCost) : None;
}

InstructionCost
LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
                                                     ElementCount VF) {
  // Calculate scalar cost only. Vectorization cost should be ready at this
  // moment.
7082 if (VF.isScalar()) { 7083 Type *ValTy = getLoadStoreType(I); 7084 const Align Alignment = getLoadStoreAlignment(I); 7085 unsigned AS = getLoadStoreAddressSpace(I); 7086 7087 return TTI.getAddressComputationCost(ValTy) + 7088 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, 7089 TTI::TCK_RecipThroughput, I); 7090 } 7091 return getWideningCost(I, VF); 7092 } 7093 7094 LoopVectorizationCostModel::VectorizationCostTy 7095 LoopVectorizationCostModel::getInstructionCost(Instruction *I, 7096 ElementCount VF) { 7097 // If we know that this instruction will remain uniform, check the cost of 7098 // the scalar version. 7099 if (isUniformAfterVectorization(I, VF)) 7100 VF = ElementCount::getFixed(1); 7101 7102 if (VF.isVector() && isProfitableToScalarize(I, VF)) 7103 return VectorizationCostTy(InstsToScalarize[VF][I], false); 7104 7105 // Forced scalars do not have any scalarization overhead. 7106 auto ForcedScalar = ForcedScalars.find(VF); 7107 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) { 7108 auto InstSet = ForcedScalar->second; 7109 if (InstSet.count(I)) 7110 return VectorizationCostTy( 7111 (getInstructionCost(I, ElementCount::getFixed(1)).first * 7112 VF.getKnownMinValue()), 7113 false); 7114 } 7115 7116 Type *VectorTy; 7117 InstructionCost C = getInstructionCost(I, VF, VectorTy); 7118 7119 bool TypeNotScalarized = false; 7120 if (VF.isVector() && VectorTy->isVectorTy()) { 7121 unsigned NumParts = TTI.getNumberOfParts(VectorTy); 7122 if (NumParts) 7123 TypeNotScalarized = NumParts < VF.getKnownMinValue(); 7124 else 7125 C = InstructionCost::getInvalid(); 7126 } 7127 return VectorizationCostTy(C, TypeNotScalarized); 7128 } 7129 7130 InstructionCost 7131 LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, 7132 ElementCount VF) const { 7133 7134 // There is no mechanism yet to create a scalable scalarization loop, 7135 // so this is currently Invalid. 
  if (VF.isScalable())
    return InstructionCost::getInvalid();

  if (VF.isScalar())
    return 0;

  InstructionCost Cost = 0;
  Type *RetTy = ToVectorTy(I->getType(), VF);
  // Cost of inserting the scalar results back into a vector (all lanes),
  // unless the target can do vector-element loads efficiently.
  if (!RetTy->isVoidTy() &&
      (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
    Cost += TTI.getScalarizationOverhead(
        cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()), true,
        false);

  // Some targets keep addresses scalar.
  if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
    return Cost;

  // Some targets support efficient element stores.
  if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
    return Cost;

  // Collect operands to consider.
  CallInst *CI = dyn_cast<CallInst>(I);
  Instruction::op_range Ops = CI ? CI->args() : I->operands();

  // Skip operands that do not require extraction/scalarization and do not incur
  // any overhead.
  SmallVector<Type *> Tys;
  for (auto *V : filterExtractingOperands(Ops, VF))
    Tys.push_back(MaybeVectorizeType(V->getType(), VF));
  return Cost + TTI.getOperandsScalarizationOverhead(
                    filterExtractingOperands(Ops, VF), Tys);
}

// Decides, for every memory instruction in the loop at the given VF, how it
// will be vectorized (widen, widen-reverse, interleave, gather/scatter, or
// scalarize) and caches the decision together with its cost.
void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
  if (VF.isScalar())
    return;
  NumPredStores = 0;
  for (BasicBlock *BB : TheLoop->blocks()) {
    // For each instruction in the old loop.
    for (Instruction &I : *BB) {
      Value *Ptr =  getLoadStorePointerOperand(&I);
      if (!Ptr)
        continue;

      // TODO: We should generate better code and update the cost model for
      // predicated uniform stores. Today they are treated as any other
      // predicated store (see added test cases in
      // invariant-store-vectorization.ll).
      if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
        NumPredStores++;

      if (Legal->isUniformMemOp(I)) {
        // TODO: Avoid replicating loads and stores instead of
        // relying on instcombine to remove them.
        // Load: Scalar load + broadcast
        // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
        InstructionCost Cost;
        if (isa<StoreInst>(&I) && VF.isScalable() &&
            isLegalGatherOrScatter(&I)) {
          Cost = getGatherScatterCost(&I, VF);
          setWideningDecision(&I, VF, CM_GatherScatter, Cost);
        } else {
          assert((isa<LoadInst>(&I) || !VF.isScalable()) &&
                 "Cannot yet scalarize uniform stores");
          Cost = getUniformMemOpCost(&I, VF);
          setWideningDecision(&I, VF, CM_Scalarize, Cost);
        }
        continue;
      }

      // We assume that widening is the best solution when possible.
      if (memoryInstructionCanBeWidened(&I, VF)) {
        InstructionCost Cost = getConsecutiveMemOpCost(&I, VF);
        int ConsecutiveStride = Legal->isConsecutivePtr(
            getLoadStoreType(&I), getLoadStorePointerOperand(&I));
        assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
               "Expected consecutive stride.");
        InstWidening Decision =
            ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
        setWideningDecision(&I, VF, Decision, Cost);
        continue;
      }

      // Choose between Interleaving, Gather/Scatter or Scalarization.
      InstructionCost InterleaveCost = InstructionCost::getInvalid();
      unsigned NumAccesses = 1;
      if (isAccessInterleaved(&I)) {
        auto Group = getInterleavedAccessGroup(&I);
        assert(Group && "Fail to get an interleaved access group.");

        // Make one decision for the whole group.
        if (getWideningDecision(&I, VF) != CM_Unknown)
          continue;

        NumAccesses = Group->getNumMembers();
        if (interleavedAccessCanBeWidened(&I, VF))
          InterleaveCost = getInterleaveGroupCost(&I, VF);
      }

      InstructionCost GatherScatterCost =
          isLegalGatherOrScatter(&I)
              ? getGatherScatterCost(&I, VF) * NumAccesses
              : InstructionCost::getInvalid();

      InstructionCost ScalarizationCost =
          getMemInstScalarizationCost(&I, VF) * NumAccesses;

      // Choose better solution for the current VF,
      // write down this decision and use it during vectorization.
      InstructionCost Cost;
      InstWidening Decision;
      if (InterleaveCost <= GatherScatterCost &&
          InterleaveCost < ScalarizationCost) {
        Decision = CM_Interleave;
        Cost = InterleaveCost;
      } else if (GatherScatterCost < ScalarizationCost) {
        Decision = CM_GatherScatter;
        Cost = GatherScatterCost;
      } else {
        Decision = CM_Scalarize;
        Cost = ScalarizationCost;
      }
      // If the instructions belongs to an interleave group, the whole group
      // receives the same decision. The whole group receives the cost, but
      // the cost will actually be assigned to one instruction.
      if (auto Group = getInterleavedAccessGroup(&I))
        setWideningDecision(Group, VF, Decision, Cost);
      else
        setWideningDecision(&I, VF, Decision, Cost);
    }
  }

  // Make sure that any load of address and any other address computation
  // remains scalar unless there is gather/scatter support. This avoids
  // inevitable extracts into address registers, and also has the benefit of
  // activating LSR more, since that pass can't optimize vectorized
  // addresses.
  if (TTI.prefersVectorizedAddressing())
    return;

  // Start with all scalar pointer uses.
  SmallPtrSet<Instruction *, 8> AddrDefs;
  for (BasicBlock *BB : TheLoop->blocks())
    for (Instruction &I : *BB) {
      Instruction *PtrDef =
          dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
      if (PtrDef && TheLoop->contains(PtrDef) &&
          getWideningDecision(&I, VF) != CM_GatherScatter)
        AddrDefs.insert(PtrDef);
    }

  // Add all instructions used to generate the addresses.
  SmallVector<Instruction *, 4> Worklist;
  append_range(Worklist, AddrDefs);
  while (!Worklist.empty()) {
    Instruction *I = Worklist.pop_back_val();
    for (auto &Op : I->operands())
      if (auto *InstOp = dyn_cast<Instruction>(Op))
        if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
            AddrDefs.insert(InstOp).second)
          Worklist.push_back(InstOp);
  }

  for (auto *I : AddrDefs) {
    if (isa<LoadInst>(I)) {
      // Setting the desired widening decision should ideally be handled in
      // by cost functions, but since this involves the task of finding out
      // if the loaded register is involved in an address computation, it is
      // instead changed here when we know this is the case.
      InstWidening Decision = getWideningDecision(I, VF);
      if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
        // Scalarize a widened load of address.
        setWideningDecision(
            I, VF, CM_Scalarize,
            (VF.getKnownMinValue() *
             getMemoryInstructionCost(I, ElementCount::getFixed(1))));
      else if (auto Group = getInterleavedAccessGroup(I)) {
        // Scalarize an interleave group of address loads.
        for (unsigned I = 0; I < Group->getFactor(); ++I) {
          if (Instruction *Member = Group->getMember(I))
            setWideningDecision(
                Member, VF, CM_Scalarize,
                (VF.getKnownMinValue() *
                 getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
        }
      }
    } else
      // Make sure I gets scalarized and a cost estimate without
      // scalarization overhead.
7327 ForcedScalars[VF].insert(I); 7328 } 7329 } 7330 7331 InstructionCost 7332 LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, 7333 Type *&VectorTy) { 7334 Type *RetTy = I->getType(); 7335 if (canTruncateToMinimalBitwidth(I, VF)) 7336 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); 7337 auto SE = PSE.getSE(); 7338 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 7339 7340 auto hasSingleCopyAfterVectorization = [this](Instruction *I, 7341 ElementCount VF) -> bool { 7342 if (VF.isScalar()) 7343 return true; 7344 7345 auto Scalarized = InstsToScalarize.find(VF); 7346 assert(Scalarized != InstsToScalarize.end() && 7347 "VF not yet analyzed for scalarization profitability"); 7348 return !Scalarized->second.count(I) && 7349 llvm::all_of(I->users(), [&](User *U) { 7350 auto *UI = cast<Instruction>(U); 7351 return !Scalarized->second.count(UI); 7352 }); 7353 }; 7354 (void) hasSingleCopyAfterVectorization; 7355 7356 if (isScalarAfterVectorization(I, VF)) { 7357 // With the exception of GEPs and PHIs, after scalarization there should 7358 // only be one copy of the instruction generated in the loop. This is 7359 // because the VF is either 1, or any instructions that need scalarizing 7360 // have already been dealt with by the the time we get here. As a result, 7361 // it means we don't have to multiply the instruction cost by VF. 7362 assert(I->getOpcode() == Instruction::GetElementPtr || 7363 I->getOpcode() == Instruction::PHI || 7364 (I->getOpcode() == Instruction::BitCast && 7365 I->getType()->isPointerTy()) || 7366 hasSingleCopyAfterVectorization(I, VF)); 7367 VectorTy = RetTy; 7368 } else 7369 VectorTy = ToVectorTy(RetTy, VF); 7370 7371 // TODO: We need to estimate the cost of intrinsic calls. 
7372 switch (I->getOpcode()) { 7373 case Instruction::GetElementPtr: 7374 // We mark this instruction as zero-cost because the cost of GEPs in 7375 // vectorized code depends on whether the corresponding memory instruction 7376 // is scalarized or not. Therefore, we handle GEPs with the memory 7377 // instruction cost. 7378 return 0; 7379 case Instruction::Br: { 7380 // In cases of scalarized and predicated instructions, there will be VF 7381 // predicated blocks in the vectorized loop. Each branch around these 7382 // blocks requires also an extract of its vector compare i1 element. 7383 bool ScalarPredicatedBB = false; 7384 BranchInst *BI = cast<BranchInst>(I); 7385 if (VF.isVector() && BI->isConditional() && 7386 (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) || 7387 PredicatedBBsAfterVectorization.count(BI->getSuccessor(1)))) 7388 ScalarPredicatedBB = true; 7389 7390 if (ScalarPredicatedBB) { 7391 // Not possible to scalarize scalable vector with predicated instructions. 7392 if (VF.isScalable()) 7393 return InstructionCost::getInvalid(); 7394 // Return cost for branches around scalarized and predicated blocks. 7395 auto *Vec_i1Ty = 7396 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); 7397 return ( 7398 TTI.getScalarizationOverhead( 7399 Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()), false, true) + 7400 (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue())); 7401 } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar()) 7402 // The back-edge branch will remain, as will all scalar branches. 7403 return TTI.getCFInstrCost(Instruction::Br, CostKind); 7404 else 7405 // This branch will be eliminated by if-conversion. 7406 return 0; 7407 // Note: We currently assume zero cost for an unconditional branch inside 7408 // a predicated block since it will become a fall-through, although we 7409 // may decide in the future to call TTI for all branches. 
7410 } 7411 case Instruction::PHI: { 7412 auto *Phi = cast<PHINode>(I); 7413 7414 // First-order recurrences are replaced by vector shuffles inside the loop. 7415 // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. 7416 if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi)) 7417 return TTI.getShuffleCost( 7418 TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy), 7419 None, VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1)); 7420 7421 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 7422 // converted into select instructions. We require N - 1 selects per phi 7423 // node, where N is the number of incoming values. 7424 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) 7425 return (Phi->getNumIncomingValues() - 1) * 7426 TTI.getCmpSelInstrCost( 7427 Instruction::Select, ToVectorTy(Phi->getType(), VF), 7428 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF), 7429 CmpInst::BAD_ICMP_PREDICATE, CostKind); 7430 7431 return TTI.getCFInstrCost(Instruction::PHI, CostKind); 7432 } 7433 case Instruction::UDiv: 7434 case Instruction::SDiv: 7435 case Instruction::URem: 7436 case Instruction::SRem: 7437 // If we have a predicated instruction, it may not be executed for each 7438 // vector lane. Get the scalarization cost and scale this amount by the 7439 // probability of executing the predicated block. If the instruction is not 7440 // predicated, we fall through to the next case. 7441 if (VF.isVector() && isScalarWithPredication(I)) { 7442 InstructionCost Cost = 0; 7443 7444 // These instructions have a non-void type, so account for the phi nodes 7445 // that we will create. This cost is likely to be zero. The phi node 7446 // cost, if any, should be scaled by the block probability because it 7447 // models a copy at the end of each predicated block. 
7448 Cost += VF.getKnownMinValue() * 7449 TTI.getCFInstrCost(Instruction::PHI, CostKind); 7450 7451 // The cost of the non-predicated instruction. 7452 Cost += VF.getKnownMinValue() * 7453 TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind); 7454 7455 // The cost of insertelement and extractelement instructions needed for 7456 // scalarization. 7457 Cost += getScalarizationOverhead(I, VF); 7458 7459 // Scale the cost by the probability of executing the predicated blocks. 7460 // This assumes the predicated block for each vector lane is equally 7461 // likely. 7462 return Cost / getReciprocalPredBlockProb(); 7463 } 7464 LLVM_FALLTHROUGH; 7465 case Instruction::Add: 7466 case Instruction::FAdd: 7467 case Instruction::Sub: 7468 case Instruction::FSub: 7469 case Instruction::Mul: 7470 case Instruction::FMul: 7471 case Instruction::FDiv: 7472 case Instruction::FRem: 7473 case Instruction::Shl: 7474 case Instruction::LShr: 7475 case Instruction::AShr: 7476 case Instruction::And: 7477 case Instruction::Or: 7478 case Instruction::Xor: { 7479 // Since we will replace the stride by 1 the multiplication should go away. 7480 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) 7481 return 0; 7482 7483 // Detect reduction patterns 7484 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7485 return *RedCost; 7486 7487 // Certain instructions can be cheaper to vectorize if they have a constant 7488 // second vector operand. One example of this are shifts on x86. 
7489 Value *Op2 = I->getOperand(1); 7490 TargetTransformInfo::OperandValueProperties Op2VP; 7491 TargetTransformInfo::OperandValueKind Op2VK = 7492 TTI.getOperandInfo(Op2, Op2VP); 7493 if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) 7494 Op2VK = TargetTransformInfo::OK_UniformValue; 7495 7496 SmallVector<const Value *, 4> Operands(I->operand_values()); 7497 return TTI.getArithmeticInstrCost( 7498 I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, 7499 Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); 7500 } 7501 case Instruction::FNeg: { 7502 return TTI.getArithmeticInstrCost( 7503 I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, 7504 TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None, 7505 TargetTransformInfo::OP_None, I->getOperand(0), I); 7506 } 7507 case Instruction::Select: { 7508 SelectInst *SI = cast<SelectInst>(I); 7509 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 7510 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 7511 7512 const Value *Op0, *Op1; 7513 using namespace llvm::PatternMatch; 7514 if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) || 7515 match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) { 7516 // select x, y, false --> x & y 7517 // select x, true, y --> x | y 7518 TTI::OperandValueProperties Op1VP = TTI::OP_None; 7519 TTI::OperandValueProperties Op2VP = TTI::OP_None; 7520 TTI::OperandValueKind Op1VK = TTI::getOperandInfo(Op0, Op1VP); 7521 TTI::OperandValueKind Op2VK = TTI::getOperandInfo(Op1, Op2VP); 7522 assert(Op0->getType()->getScalarSizeInBits() == 1 && 7523 Op1->getType()->getScalarSizeInBits() == 1); 7524 7525 SmallVector<const Value *, 2> Operands{Op0, Op1}; 7526 return TTI.getArithmeticInstrCost( 7527 match(I, m_LogicalOr()) ? 
Instruction::Or : Instruction::And, VectorTy, 7528 CostKind, Op1VK, Op2VK, Op1VP, Op2VP, Operands, I); 7529 } 7530 7531 Type *CondTy = SI->getCondition()->getType(); 7532 if (!ScalarCond) 7533 CondTy = VectorType::get(CondTy, VF); 7534 7535 CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE; 7536 if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition())) 7537 Pred = Cmp->getPredicate(); 7538 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred, 7539 CostKind, I); 7540 } 7541 case Instruction::ICmp: 7542 case Instruction::FCmp: { 7543 Type *ValTy = I->getOperand(0)->getType(); 7544 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 7545 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 7546 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); 7547 VectorTy = ToVectorTy(ValTy, VF); 7548 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, 7549 cast<CmpInst>(I)->getPredicate(), CostKind, 7550 I); 7551 } 7552 case Instruction::Store: 7553 case Instruction::Load: { 7554 ElementCount Width = VF; 7555 if (Width.isVector()) { 7556 InstWidening Decision = getWideningDecision(I, Width); 7557 assert(Decision != CM_Unknown && 7558 "CM decision should be taken at this point"); 7559 if (Decision == CM_Scalarize) 7560 Width = ElementCount::getFixed(1); 7561 } 7562 VectorTy = ToVectorTy(getLoadStoreType(I), Width); 7563 return getMemoryInstructionCost(I, VF); 7564 } 7565 case Instruction::BitCast: 7566 if (I->getType()->isPointerTy()) 7567 return 0; 7568 LLVM_FALLTHROUGH; 7569 case Instruction::ZExt: 7570 case Instruction::SExt: 7571 case Instruction::FPToUI: 7572 case Instruction::FPToSI: 7573 case Instruction::FPExt: 7574 case Instruction::PtrToInt: 7575 case Instruction::IntToPtr: 7576 case Instruction::SIToFP: 7577 case Instruction::UIToFP: 7578 case Instruction::Trunc: 7579 case Instruction::FPTrunc: { 7580 // Computes the CastContextHint from a Load/Store instruction. 
7581 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint { 7582 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 7583 "Expected a load or a store!"); 7584 7585 if (VF.isScalar() || !TheLoop->contains(I)) 7586 return TTI::CastContextHint::Normal; 7587 7588 switch (getWideningDecision(I, VF)) { 7589 case LoopVectorizationCostModel::CM_GatherScatter: 7590 return TTI::CastContextHint::GatherScatter; 7591 case LoopVectorizationCostModel::CM_Interleave: 7592 return TTI::CastContextHint::Interleave; 7593 case LoopVectorizationCostModel::CM_Scalarize: 7594 case LoopVectorizationCostModel::CM_Widen: 7595 return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked 7596 : TTI::CastContextHint::Normal; 7597 case LoopVectorizationCostModel::CM_Widen_Reverse: 7598 return TTI::CastContextHint::Reversed; 7599 case LoopVectorizationCostModel::CM_Unknown: 7600 llvm_unreachable("Instr did not go through cost modelling?"); 7601 } 7602 7603 llvm_unreachable("Unhandled case!"); 7604 }; 7605 7606 unsigned Opcode = I->getOpcode(); 7607 TTI::CastContextHint CCH = TTI::CastContextHint::None; 7608 // For Trunc, the context is the only user, which must be a StoreInst. 7609 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) { 7610 if (I->hasOneUse()) 7611 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin())) 7612 CCH = ComputeCCH(Store); 7613 } 7614 // For Z/Sext, the context is the operand, which must be a LoadInst. 7615 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || 7616 Opcode == Instruction::FPExt) { 7617 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0))) 7618 CCH = ComputeCCH(Load); 7619 } 7620 7621 // We optimize the truncation of induction variables having constant 7622 // integer steps. The cost of these truncations is the same as the scalar 7623 // operation. 
7624 if (isOptimizableIVTruncate(I, VF)) { 7625 auto *Trunc = cast<TruncInst>(I); 7626 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 7627 Trunc->getSrcTy(), CCH, CostKind, Trunc); 7628 } 7629 7630 // Detect reduction patterns 7631 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7632 return *RedCost; 7633 7634 Type *SrcScalarTy = I->getOperand(0)->getType(); 7635 Type *SrcVecTy = 7636 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; 7637 if (canTruncateToMinimalBitwidth(I, VF)) { 7638 // This cast is going to be shrunk. This may remove the cast or it might 7639 // turn it into slightly different cast. For example, if MinBW == 16, 7640 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". 7641 // 7642 // Calculate the modified src and dest types. 7643 Type *MinVecTy = VectorTy; 7644 if (Opcode == Instruction::Trunc) { 7645 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); 7646 VectorTy = 7647 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7648 } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) { 7649 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); 7650 VectorTy = 7651 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7652 } 7653 } 7654 7655 return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); 7656 } 7657 case Instruction::Call: { 7658 if (RecurrenceDescriptor::isFMulAddIntrinsic(I)) 7659 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7660 return *RedCost; 7661 bool NeedToScalarize; 7662 CallInst *CI = cast<CallInst>(I); 7663 InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize); 7664 if (getVectorIntrinsicIDForCall(CI, TLI)) { 7665 InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF); 7666 return std::min(CallCost, IntrinsicCost); 7667 } 7668 return CallCost; 7669 } 7670 case Instruction::ExtractValue: 7671 return TTI.getInstructionCost(I, 
                                  TTI::TCK_RecipThroughput);
  case Instruction::Alloca:
    // We cannot easily widen alloca to a scalable alloca, as
    // the result would need to be a vector of pointers.
    if (VF.isScalable())
      return InstructionCost::getInvalid();
    LLVM_FALLTHROUGH;
  default:
    // This opcode is unknown. Assume that it is the same as 'mul'.
    return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
  } // end of switch.
}

// Pass identification, replacement for typeid.
char LoopVectorize::ID = 0;

// Display name used when registering the legacy pass.
static const char lv_name[] = "Loop Vectorization";

// Register the legacy pass and declare the analyses it depends on.
INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)
INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)

namespace llvm {

/// Create the legacy loop-vectorize pass with default settings.
Pass *createLoopVectorizePass() { return new LoopVectorize(); }

/// Create the legacy loop-vectorize pass, optionally restricting
/// interleaving and/or vectorization to loops where they were explicitly
/// requested (e.g. via pragmas).
Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
                              bool VectorizeOnlyWhenForced) {
  return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
}

} // end namespace llvm

/// Returns true if \p Inst is a load or store whose pointer operand is
/// consecutive according to legality analysis.
bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
  // Check if the pointer operand of a load or store instruction is
  // consecutive. Non-memory instructions have no pointer operand and
  // trivially return false.
  if (auto *Ptr = getLoadStorePointerOperand(Inst))
    return Legal->isConsecutivePtr(getLoadStoreType(Inst), Ptr);
  return false;
}

/// Populate ValuesToIgnore/VecValuesToIgnore with instructions whose cost
/// should not be counted, because they disappear after vectorization.
void LoopVectorizationCostModel::collectValuesToIgnore() {
  // Ignore ephemeral values (values only used by assumes and the like).
  CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);

  // Ignore type-promoting instructions we identified during reduction
  // detection.
  for (auto &Reduction : Legal->getReductionVars()) {
    const RecurrenceDescriptor &RedDes = Reduction.second;
    const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
    VecValuesToIgnore.insert(Casts.begin(), Casts.end());
  }
  // Ignore type-casting instructions we identified during induction
  // detection.
  for (auto &Induction : Legal->getInductionVars()) {
    const InductionDescriptor &IndDes = Induction.second;
    const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
    VecValuesToIgnore.insert(Casts.begin(), Casts.end());
  }
}

/// Record which reduction phis should be computed "in-loop" (i.e. as a chain
/// of scalar/ordered reduction operations inside the vector loop) rather than
/// as a single out-of-loop reduction after it.
void LoopVectorizationCostModel::collectInLoopReductions() {
  for (auto &Reduction : Legal->getReductionVars()) {
    PHINode *Phi = Reduction.first;
    const RecurrenceDescriptor &RdxDesc = Reduction.second;

    // We don't collect reductions that are type promoted (yet).
    if (RdxDesc.getRecurrenceType() != Phi->getType())
      continue;

    // If the target would prefer this reduction to happen "in-loop", then we
    // want to record it as such. Ordered reductions are always in-loop.
    unsigned Opcode = RdxDesc.getOpcode();
    if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) &&
        !TTI.preferInLoopReduction(Opcode, Phi->getType(),
                                   TargetTransformInfo::ReductionFlags()))
      continue;

    // Check that we can correctly put the reductions into the loop, by
    // finding the chain of operations that leads from the phi to the loop
    // exit value.
    SmallVector<Instruction *, 4> ReductionOperations =
        RdxDesc.getReductionOpChain(Phi, TheLoop);
    // An empty chain means the reduction cannot be done in-loop.
    bool InLoop = !ReductionOperations.empty();
    if (InLoop) {
      InLoopReductionChains[Phi] = ReductionOperations;
      // Add the elements to InLoopReductionImmediateChains for cost modelling.
      // Each op maps to its immediate predecessor in the chain (the phi for
      // the first op).
      Instruction *LastChain = Phi;
      for (auto *I : ReductionOperations) {
        InLoopReductionImmediateChains[I] = LastChain;
        LastChain = I;
      }
    }
    LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
                      << " reduction for phi: " << *Phi << "\n");
  }
}

// TODO: we could return a pair of values that specify the max VF and
// min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
// `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment
// doesn't have a cost model that can choose which plan to execute if
// more than one is generated.
/// Pick a VF for the VPlan-native path: the number of widest-type elements
/// that fit into the widest vector register.
static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
                                 LoopVectorizationCostModel &CM) {
  unsigned WidestType;
  // Only the widest scalar type in the loop matters here.
  std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
  return WidestVectorRegBits / WidestType;
}

VectorizationFactor
LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
  assert(!UserVF.isScalable() && "scalable vectors not yet supported");
  ElementCount VF = UserVF;
  // Outer loop handling: They may require CFG and instruction level
  // transformations before even evaluating whether vectorization is profitable.
  // Since we cannot modify the incoming IR, we need to build VPlan upfront in
  // the vectorization pipeline.
  if (!OrigLoop->isInnermost()) {
    // If the user doesn't provide a vectorization factor, determine a
    // reasonable one.
    if (UserVF.isZero()) {
      VF = ElementCount::getFixed(determineVPlanVF(
          TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
              .getFixedSize(),
          CM));
      LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");

      // Make sure we have a VF > 1 for stress testing.
      if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
        LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
                          << "overriding computed VF.\n");
        VF = ElementCount::getFixed(4);
      }
    }
    assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
    assert(isPowerOf2_32(VF.getKnownMinValue()) &&
           "VF needs to be a power of two");
    LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
                      << "VF " << VF << " to build VPlans.\n");
    buildVPlans(VF, VF);

    // For VPlan build stress testing, we bail out after VPlan construction.
    if (VPlanBuildStressTest)
      return VectorizationFactor::Disabled();

    return {VF, 0 /*Cost*/};
  }

  LLVM_DEBUG(
      dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
                "VPlan-native path.\n");
  return VectorizationFactor::Disabled();
}

Optional<VectorizationFactor>
LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
  assert(OrigLoop->isInnermost() && "Inner loop expected.");
  FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
  if (!MaxFactors) // Cases that should not to be vectorized nor interleaved.
    return None;

  // Invalidate interleave groups if all blocks of loop will be predicated.
  if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) &&
      !useMaskedInterleavedAccesses(*TTI)) {
    LLVM_DEBUG(
        dbgs()
        << "LV: Invalidate all interleaved groups due to fold-tail by masking "
           "which requires masked-interleaved support.\n");
    if (CM.InterleaveInfo.invalidateGroups())
      // Invalidating interleave groups also requires invalidating all decisions
      // based on them, which includes widening decisions and uniform and scalar
      // values.
      CM.invalidateCostModelingDecisions();
  }

  // A user-specified VF is only honored if it does not exceed the maximum
  // feasible VF of the matching kind (fixed vs. scalable).
  ElementCount MaxUserVF =
      UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
  bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF);
  if (!UserVF.isZero() && UserVFIsLegal) {
    assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
           "VF needs to be a power of two");
    // Collect the instructions (and their associated costs) that will be more
    // profitable to scalarize.
    if (CM.selectUserVectorizationFactor(UserVF)) {
      LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
      CM.collectInLoopReductions();
      buildVPlansWithVPRecipes(UserVF, UserVF);
      LLVM_DEBUG(printPlans(dbgs()));
      return {{UserVF, 0}};
    } else
      reportVectorizationInfo("UserVF ignored because of invalid costs.",
                              "InvalidCost", ORE, OrigLoop);
  }

  // Populate the set of Vectorization Factor Candidates: all powers of two
  // up to the fixed and scalable maxima.
  ElementCountSet VFCandidates;
  for (auto VF = ElementCount::getFixed(1);
       ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2)
    VFCandidates.insert(VF);
  for (auto VF = ElementCount::getScalable(1);
       ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2)
    VFCandidates.insert(VF);

  for (const auto &VF : VFCandidates) {
    // Collect Uniform and Scalar instructions after vectorization with VF.
    CM.collectUniformsAndScalars(VF);

    // Collect the instructions (and their associated costs) that will be more
    // profitable to scalarize.
    if (VF.isVector())
      CM.collectInstsToScalarize(VF);
  }

  CM.collectInLoopReductions();
  // Build one set of plans for the fixed-width candidates and one for the
  // scalable candidates.
  buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF);
  buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF);

  LLVM_DEBUG(printPlans(dbgs()));
  if (!MaxFactors.hasVector())
    return VectorizationFactor::Disabled();

  // Select the optimal vectorization factor.
  auto SelectedVF = CM.selectVectorizationFactor(VFCandidates);

  // Check if it is profitable to vectorize with runtime checks.
  unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks();
  if (SelectedVF.Width.getKnownMinValue() > 1 && NumRuntimePointerChecks) {
    bool PragmaThresholdReached =
        NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold;
    bool ThresholdReached =
        NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold;
    // The pragma threshold applies unconditionally; the default threshold can
    // be overridden by a hint that allows reordering.
    if ((ThresholdReached && !Hints.allowReordering()) ||
        PragmaThresholdReached) {
      ORE->emit([&]() {
        return OptimizationRemarkAnalysisAliasing(
                   DEBUG_TYPE, "CantReorderMemOps", OrigLoop->getStartLoc(),
                   OrigLoop->getHeader())
               << "loop not vectorized: cannot prove it is safe to reorder "
                  "memory operations";
      });
      LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
      Hints.emitRemarkWithHints();
      return VectorizationFactor::Disabled();
    }
  }
  return SelectedVF;
}

/// Return the single VPlan that covers \p VF. The assert documents the
/// invariant that exactly one of the built plans contains the selected VF.
VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const {
  assert(count_if(VPlans,
                  [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) ==
             1 &&
         "Best VF has not a single VPlan.");

  for (const VPlanPtr &Plan : VPlans) {
    if (Plan->hasVF(VF))
      return *Plan.get();
  }

  llvm_unreachable("No plan found!");
}

void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF,
                                           VPlan &BestVPlan,
                                           InnerLoopVectorizer &ILV,
                                           DominatorTree *DT) {
  LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF << ", UF=" << BestUF
                    << '\n');

  // Perform the actual loop transformation.

  // 1. Create a new empty loop. Unlink the old loop and connect the new one.
  VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan};
  State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
  State.CanonicalIV = ILV.Induction;
  ILV.collectPoisonGeneratingRecipes(State);

  ILV.printDebugTracesAtStart();

  //===------------------------------------------------===//
  //
  // Notice: any optimization or new instruction that go
  // into the code below should also be implemented in
  // the cost-model.
  //
  //===------------------------------------------------===//

  // 2. Copy and widen instructions from the old loop into the new loop.
  BestVPlan.prepareToExecute(ILV.getOrCreateTripCount(nullptr), State);
  BestVPlan.execute(&State);

  // 3. Fix the vectorized code: take care of header phi's, live-outs,
  // predication, updating analyses.
  ILV.fixVectorizedLoop(State);

  ILV.printDebugTracesAtEnd();
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print all candidate VPlans, either as DOT graphs or in plain text.
void LoopVectorizationPlanner::printPlans(raw_ostream &O) {
  for (const auto &Plan : VPlans)
    if (PrintVPlansInDotFormat)
      Plan->printDOT(O);
    else
      Plan->print(O);
}
#endif

void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
    SmallPtrSetImpl<Instruction *> &DeadInstructions) {

  // We create new control-flow for the vectorized loop, so the original exit
  // conditions will be dead after vectorization if it's only used by the
  // terminator
  SmallVector<BasicBlock*> ExitingBlocks;
  OrigLoop->getExitingBlocks(ExitingBlocks);
  for (auto *BB : ExitingBlocks) {
    auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0));
    if (!Cmp || !Cmp->hasOneUse())
      continue;

    // TODO: we should introduce a getUniqueExitingBlocks on Loop
    if (!DeadInstructions.insert(Cmp).second)
      continue;

    // The operands of the icmp is often a dead trunc, used by IndUpdate.
    // TODO: can recurse through operands in general
    for (Value *Op : Cmp->operands()) {
      if (isa<TruncInst>(Op) && Op->hasOneUse())
        DeadInstructions.insert(cast<Instruction>(Op));
    }
  }

  // We create new "steps" for induction variable updates to which the original
  // induction variables map. An original update instruction will be dead if
  // all its users except the induction variable are dead.
  auto *Latch = OrigLoop->getLoopLatch();
  for (auto &Induction : Legal->getInductionVars()) {
    PHINode *Ind = Induction.first;
    auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));

    // If the tail is to be folded by masking, the primary induction variable,
    // if exists, isn't dead: it will be used for masking. Don't kill it.
8026 if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction()) 8027 continue; 8028 8029 if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 8030 return U == Ind || DeadInstructions.count(cast<Instruction>(U)); 8031 })) 8032 DeadInstructions.insert(IndUpdate); 8033 } 8034 } 8035 8036 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } 8037 8038 static void AddRuntimeUnrollDisableMetaData(Loop *L) { 8039 SmallVector<Metadata *, 4> MDs; 8040 // Reserve first location for self reference to the LoopID metadata node. 8041 MDs.push_back(nullptr); 8042 bool IsUnrollMetadata = false; 8043 MDNode *LoopID = L->getLoopID(); 8044 if (LoopID) { 8045 // First find existing loop unrolling disable metadata. 8046 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { 8047 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); 8048 if (MD) { 8049 const auto *S = dyn_cast<MDString>(MD->getOperand(0)); 8050 IsUnrollMetadata = 8051 S && S->getString().startswith("llvm.loop.unroll.disable"); 8052 } 8053 MDs.push_back(LoopID->getOperand(i)); 8054 } 8055 } 8056 8057 if (!IsUnrollMetadata) { 8058 // Add runtime unroll disable metadata. 8059 LLVMContext &Context = L->getHeader()->getContext(); 8060 SmallVector<Metadata *, 1> DisableOperands; 8061 DisableOperands.push_back( 8062 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 8063 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 8064 MDs.push_back(DisableNode); 8065 MDNode *NewLoopID = MDNode::get(Context, MDs); 8066 // Set operand 0 to refer to the loop id itself. 
8067 NewLoopID->replaceOperandWith(0, NewLoopID); 8068 L->setLoopID(NewLoopID); 8069 } 8070 } 8071 8072 //===--------------------------------------------------------------------===// 8073 // EpilogueVectorizerMainLoop 8074 //===--------------------------------------------------------------------===// 8075 8076 /// This function is partially responsible for generating the control flow 8077 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 8078 BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { 8079 MDNode *OrigLoopID = OrigLoop->getLoopID(); 8080 Loop *Lp = createVectorLoopSkeleton(""); 8081 8082 // Generate the code to check the minimum iteration count of the vector 8083 // epilogue (see below). 8084 EPI.EpilogueIterationCountCheck = 8085 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, true); 8086 EPI.EpilogueIterationCountCheck->setName("iter.check"); 8087 8088 // Generate the code to check any assumptions that we've made for SCEV 8089 // expressions. 8090 EPI.SCEVSafetyCheck = emitSCEVChecks(Lp, LoopScalarPreHeader); 8091 8092 // Generate the code that checks at runtime if arrays overlap. We put the 8093 // checks into a separate block to make the more common case of few elements 8094 // faster. 8095 EPI.MemSafetyCheck = emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 8096 8097 // Generate the iteration count check for the main loop, *after* the check 8098 // for the epilogue loop, so that the path-length is shorter for the case 8099 // that goes directly through the vector epilogue. The longer-path length for 8100 // the main loop is compensated for, by the gain from vectorizing the larger 8101 // trip count. Note: the branch will get updated later on when we vectorize 8102 // the epilogue. 8103 EPI.MainLoopIterationCountCheck = 8104 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false); 8105 8106 // Generate the induction variable. 
8107 OldInduction = Legal->getPrimaryInduction(); 8108 Type *IdxTy = Legal->getWidestInductionType(); 8109 Value *StartIdx = ConstantInt::get(IdxTy, 0); 8110 8111 IRBuilder<> B(&*Lp->getLoopPreheader()->getFirstInsertionPt()); 8112 Value *Step = getRuntimeVF(B, IdxTy, VF * UF); 8113 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 8114 EPI.VectorTripCount = CountRoundDown; 8115 Induction = 8116 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 8117 getDebugLocFromInstOrOperands(OldInduction)); 8118 8119 // Skip induction resume value creation here because they will be created in 8120 // the second pass. If we created them here, they wouldn't be used anyway, 8121 // because the vplan in the second pass still contains the inductions from the 8122 // original loop. 8123 8124 return completeLoopSkeleton(Lp, OrigLoopID); 8125 } 8126 8127 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { 8128 LLVM_DEBUG({ 8129 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n" 8130 << "Main Loop VF:" << EPI.MainLoopVF 8131 << ", Main Loop UF:" << EPI.MainLoopUF 8132 << ", Epilogue Loop VF:" << EPI.EpilogueVF 8133 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 8134 }); 8135 } 8136 8137 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() { 8138 DEBUG_WITH_TYPE(VerboseDebug, { 8139 dbgs() << "intermediate fn:\n" 8140 << *OrigLoop->getHeader()->getParent() << "\n"; 8141 }); 8142 } 8143 8144 BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck( 8145 Loop *L, BasicBlock *Bypass, bool ForEpilogue) { 8146 assert(L && "Expected valid Loop."); 8147 assert(Bypass && "Expected valid bypass basic block."); 8148 ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF; 8149 unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF; 8150 Value *Count = getOrCreateTripCount(L); 8151 // Reuse existing vector loop preheader for TC checks. 8152 // Note that new preheader block is generated for vector loop. 
  BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
  IRBuilder<> Builder(TCCheckBlock->getTerminator());

  // Generate code to check if the loop's trip count is less than VF * UF of the
  // main vector loop. When a scalar epilogue is required, at least one
  // iteration must be left for it, so compare with ULE (bail out even when
  // the count exactly equals the step) instead of ULT.
  auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF : VF) ?
      ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;

  Value *CheckMinIters = Builder.CreateICmp(
      P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor),
      "min.iters.check");

  if (!ForEpilogue)
    TCCheckBlock->setName("vector.main.loop.iter.check");

  // Create new preheader for vector loop.
  LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
                                   DT, LI, nullptr, "vector.ph");

  if (ForEpilogue) {
    assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
                                 DT->getNode(Bypass)->getIDom()) &&
           "TC check is expected to dominate Bypass");

    // Update dominator for Bypass & LoopExit.
    DT->changeImmediateDominator(Bypass, TCCheckBlock);
    if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF))
      // For loops with multiple exits, there's no edge from the middle block
      // to exit blocks (as the epilogue must run) and thus no need to update
      // the immediate dominator of the exit blocks.
      DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);

    LoopBypassBlocks.push_back(TCCheckBlock);

    // Save the trip count so we don't have to regenerate it in the
    // vec.epilog.iter.check. This is safe to do because the trip count
    // generated here dominates the vector epilog iter check.
    EPI.TripCount = Count;
  }

  // Replace the unconditional terminator with the guard branch:
  // true -> Bypass (not enough iterations), false -> vector preheader.
  ReplaceInstWithInst(
      TCCheckBlock->getTerminator(),
      BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));

  return TCCheckBlock;
}

//===--------------------------------------------------------------------===//
// EpilogueVectorizerEpilogueLoop
//===--------------------------------------------------------------------===//

/// This function is partially responsible for generating the control flow
/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
BasicBlock *
EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
  MDNode *OrigLoopID = OrigLoop->getLoopID();
  Loop *Lp = createVectorLoopSkeleton("vec.epilog.");

  // Now, compare the remaining count and if there aren't enough iterations to
  // execute the vectorized epilogue skip to the scalar part.
  BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader;
  VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check");
  LoopVectorPreHeader =
      SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
                 LI, nullptr, "vec.epilog.ph");
  emitMinimumVectorEpilogueIterCountCheck(Lp, LoopScalarPreHeader,
                                          VecEpilogueIterationCountCheck);

  // Adjust the control flow taking the state info from the main loop
  // vectorization into account.
  assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck &&
         "expected this to be saved from the previous pass.");
  // Re-route the main-loop guard so it falls through to the new epilogue
  // vector preheader instead of the (now repurposed) iter-check block.
  EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith(
      VecEpilogueIterationCountCheck, LoopVectorPreHeader);

  DT->changeImmediateDominator(LoopVectorPreHeader,
                               EPI.MainLoopIterationCountCheck);

  // The first-pass epilogue guard now bypasses straight to the scalar loop.
  EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith(
      VecEpilogueIterationCountCheck, LoopScalarPreHeader);

  if (EPI.SCEVSafetyCheck)
    EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith(
        VecEpilogueIterationCountCheck, LoopScalarPreHeader);
  if (EPI.MemSafetyCheck)
    EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith(
        VecEpilogueIterationCountCheck, LoopScalarPreHeader);

  DT->changeImmediateDominator(
      VecEpilogueIterationCountCheck,
      VecEpilogueIterationCountCheck->getSinglePredecessor());

  DT->changeImmediateDominator(LoopScalarPreHeader,
                               EPI.EpilogueIterationCountCheck);
  if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF))
    // If there is an epilogue which must run, there's no edge from the
    // middle block to exit blocks and thus no need to update the immediate
    // dominator of the exit blocks.
    DT->changeImmediateDominator(LoopExitBlock,
                                 EPI.EpilogueIterationCountCheck);

  // Keep track of bypass blocks, as they feed start values to the induction
  // phis in the scalar loop preheader.
8256 if (EPI.SCEVSafetyCheck) 8257 LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck); 8258 if (EPI.MemSafetyCheck) 8259 LoopBypassBlocks.push_back(EPI.MemSafetyCheck); 8260 LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck); 8261 8262 // Generate a resume induction for the vector epilogue and put it in the 8263 // vector epilogue preheader 8264 Type *IdxTy = Legal->getWidestInductionType(); 8265 PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val", 8266 LoopVectorPreHeader->getFirstNonPHI()); 8267 EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck); 8268 EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0), 8269 EPI.MainLoopIterationCountCheck); 8270 8271 // Generate the induction variable. 8272 OldInduction = Legal->getPrimaryInduction(); 8273 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 8274 Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF); 8275 Value *StartIdx = EPResumeVal; 8276 Induction = 8277 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 8278 getDebugLocFromInstOrOperands(OldInduction)); 8279 8280 // Generate induction resume values. These variables save the new starting 8281 // indexes for the scalar loop. They are used to test if there are any tail 8282 // iterations left once the vector loop has completed. 8283 // Note that when the vectorized epilogue is skipped due to iteration count 8284 // check, then the resume value for the induction variable comes from 8285 // the trip count of the main vector loop, hence passing the AdditionalBypass 8286 // argument. 
8287 createInductionResumeValues(Lp, CountRoundDown, 8288 {VecEpilogueIterationCountCheck, 8289 EPI.VectorTripCount} /* AdditionalBypass */); 8290 8291 AddRuntimeUnrollDisableMetaData(Lp); 8292 return completeLoopSkeleton(Lp, OrigLoopID); 8293 } 8294 8295 BasicBlock * 8296 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck( 8297 Loop *L, BasicBlock *Bypass, BasicBlock *Insert) { 8298 8299 assert(EPI.TripCount && 8300 "Expected trip count to have been safed in the first pass."); 8301 assert( 8302 (!isa<Instruction>(EPI.TripCount) || 8303 DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) && 8304 "saved trip count does not dominate insertion point."); 8305 Value *TC = EPI.TripCount; 8306 IRBuilder<> Builder(Insert->getTerminator()); 8307 Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining"); 8308 8309 // Generate code to check if the loop's trip count is less than VF * UF of the 8310 // vector epilogue loop. 8311 auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF) ? 
8312 ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 8313 8314 Value *CheckMinIters = 8315 Builder.CreateICmp(P, Count, 8316 createStepForVF(Builder, Count->getType(), 8317 EPI.EpilogueVF, EPI.EpilogueUF), 8318 "min.epilog.iters.check"); 8319 8320 ReplaceInstWithInst( 8321 Insert->getTerminator(), 8322 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 8323 8324 LoopBypassBlocks.push_back(Insert); 8325 return Insert; 8326 } 8327 8328 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() { 8329 LLVM_DEBUG({ 8330 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n" 8331 << "Epilogue Loop VF:" << EPI.EpilogueVF 8332 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 8333 }); 8334 } 8335 8336 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() { 8337 DEBUG_WITH_TYPE(VerboseDebug, { 8338 dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n"; 8339 }); 8340 } 8341 8342 bool LoopVectorizationPlanner::getDecisionAndClampRange( 8343 const std::function<bool(ElementCount)> &Predicate, VFRange &Range) { 8344 assert(!Range.isEmpty() && "Trying to test an empty VF range."); 8345 bool PredicateAtRangeStart = Predicate(Range.Start); 8346 8347 for (ElementCount TmpVF = Range.Start * 2; 8348 ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2) 8349 if (Predicate(TmpVF) != PredicateAtRangeStart) { 8350 Range.End = TmpVF; 8351 break; 8352 } 8353 8354 return PredicateAtRangeStart; 8355 } 8356 8357 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, 8358 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range 8359 /// of VF's starting at a given VF and extending it as much as possible. Each 8360 /// vectorization decision can potentially shorten this sub-range during 8361 /// buildVPlan(). 
void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF,
                                           ElementCount MaxVF) {
  auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
  for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
    VFRange SubRange = {VF, MaxVFPlusOne};
    VPlans.push_back(buildVPlan(SubRange));
    // buildVPlan may have clamped SubRange.End; resume from there.
    VF = SubRange.End;
  }
}

/// Compute (and cache) the mask for the edge Src->Dst, or return the cached
/// value. A null mask means "all-one" (no masking needed).
VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
                                         VPlanPtr &Plan) {
  assert(is_contained(predecessors(Dst), Src) && "Invalid edge");

  // Look for cached value.
  std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
  EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
  if (ECEntryIt != EdgeMaskCache.end())
    return ECEntryIt->second;

  VPValue *SrcMask = createBlockInMask(Src, Plan);

  // The terminator has to be a branch inst!
  BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
  assert(BI && "Unexpected terminator found");

  // Unconditional branch (or both successors identical): the edge mask is
  // just the source block's mask.
  if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
    return EdgeMaskCache[Edge] = SrcMask;

  // If source is an exiting block, we know the exit edge is dynamically dead
  // in the vector loop, and thus we don't need to restrict the mask. Avoid
  // adding uses of an otherwise potentially dead instruction.
  if (OrigLoop->isLoopExiting(Src))
    return EdgeMaskCache[Edge] = SrcMask;

  VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition());
  assert(EdgeMask && "No Edge Mask found for condition");

  // Taking the false successor: negate the branch condition.
  if (BI->getSuccessor(0) != Dst)
    EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc());

  if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND.
    // The condition is 'SrcMask && EdgeMask', which is equivalent to
    // 'select i1 SrcMask, i1 EdgeMask, i1 false'.
    // The select version does not introduce new UB if SrcMask is false and
    // EdgeMask is poison. Using 'and' here introduces undefined behavior.
    VPValue *False = Plan->getOrAddVPValue(
        ConstantInt::getFalse(BI->getCondition()->getType()));
    EdgeMask =
        Builder.createSelect(SrcMask, EdgeMask, False, BI->getDebugLoc());
  }

  return EdgeMaskCache[Edge] = EdgeMask;
}

/// Compute (and cache) the mask under which block \p BB executes in the
/// vector loop. A null result means the block is unconditionally executed
/// (all-one mask).
VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
  assert(OrigLoop->contains(BB) && "Block is not a part of a loop");

  // Look for cached value.
  BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
  if (BCEntryIt != BlockMaskCache.end())
    return BCEntryIt->second;

  // All-one mask is modelled as no-mask following the convention for masked
  // load/store/gather/scatter. Initialize BlockMask to no-mask.
  VPValue *BlockMask = nullptr;

  if (OrigLoop->getHeader() == BB) {
    if (!CM.blockNeedsPredicationForAnyReason(BB))
      return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.

    // Introduce the early-exit compare IV <= BTC to form header block mask.
    // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by
    // constructing the desired canonical IV in the header block as its first
    // non-phi instructions.
    assert(CM.foldTailByMasking() && "must fold the tail");
    VPBasicBlock *HeaderVPBB = Plan->getEntry()->getEntryBasicBlock();
    auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi();

    // Reuse the primary induction as the IV when one exists; otherwise
    // materialize a widened canonical IV recipe.
    VPValue *IV = nullptr;
    if (Legal->getPrimaryInduction())
      IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction());
    else {
      auto *IVRecipe = new VPWidenCanonicalIVRecipe();
      HeaderVPBB->insert(IVRecipe, NewInsertionPoint);
      IV = IVRecipe;
    }

    VPBuilder::InsertPointGuard Guard(Builder);
    Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint);
    if (CM.TTI.emitGetActiveLaneMask()) {
      // Target prefers an active-lane-mask intrinsic over an explicit compare.
      VPValue *TC = Plan->getOrCreateTripCount();
      BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC});
    } else {
      VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
      BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
    }
    return BlockMaskCache[BB] = BlockMask;
  }

  // This is the block mask. We OR all incoming edges.
  for (auto *Predecessor : predecessors(BB)) {
    VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
    if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
      return BlockMaskCache[BB] = EdgeMask;

    if (!BlockMask) { // BlockMask has its initialized nullptr value.
      BlockMask = EdgeMask;
      continue;
    }

    BlockMask = Builder.createOr(BlockMask, EdgeMask, {});
  }

  return BlockMaskCache[BB] = BlockMask;
}

VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I,
                                                ArrayRef<VPValue *> Operands,
                                                VFRange &Range,
                                                VPlanPtr &Plan) {
  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
         "Must be called with either a load or store");

  auto willWiden = [&](ElementCount VF) -> bool {
    if (VF.isScalar())
      return false;
    LoopVectorizationCostModel::InstWidening Decision =
        CM.getWideningDecision(I, VF);
    assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
           "CM decision should be taken at this point.");
    if (Decision == LoopVectorizationCostModel::CM_Interleave)
      return true;
    if (CM.isScalarAfterVectorization(I, VF) ||
        CM.isProfitableToScalarize(I, VF))
      return false;
    return Decision != LoopVectorizationCostModel::CM_Scalarize;
  };

  if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
    return nullptr;

  VPValue *Mask = nullptr;
  if (Legal->isMaskRequired(I))
    Mask = createBlockInMask(I->getParent(), Plan);

  // Determine if the pointer operand of the access is either consecutive or
  // reverse consecutive.
8510 LoopVectorizationCostModel::InstWidening Decision = 8511 CM.getWideningDecision(I, Range.Start); 8512 bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse; 8513 bool Consecutive = 8514 Reverse || Decision == LoopVectorizationCostModel::CM_Widen; 8515 8516 if (LoadInst *Load = dyn_cast<LoadInst>(I)) 8517 return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask, 8518 Consecutive, Reverse); 8519 8520 StoreInst *Store = cast<StoreInst>(I); 8521 return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0], 8522 Mask, Consecutive, Reverse); 8523 } 8524 8525 VPWidenIntOrFpInductionRecipe * 8526 VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi, 8527 ArrayRef<VPValue *> Operands) const { 8528 // Check if this is an integer or fp induction. If so, build the recipe that 8529 // produces its scalar and vector values. 8530 if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi)) { 8531 assert(II->getStartValue() == 8532 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); 8533 return new VPWidenIntOrFpInductionRecipe(Phi, Operands[0], *II); 8534 } 8535 8536 return nullptr; 8537 } 8538 8539 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate( 8540 TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range, 8541 VPlan &Plan) const { 8542 // Optimize the special case where the source is a constant integer 8543 // induction variable. Notice that we can only optimize the 'trunc' case 8544 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and 8545 // (c) other casts depend on pointer size. 8546 8547 // Determine whether \p K is a truncation based on an induction variable that 8548 // can be optimized. 
8549 auto isOptimizableIVTruncate = 8550 [&](Instruction *K) -> std::function<bool(ElementCount)> { 8551 return [=](ElementCount VF) -> bool { 8552 return CM.isOptimizableIVTruncate(K, VF); 8553 }; 8554 }; 8555 8556 if (LoopVectorizationPlanner::getDecisionAndClampRange( 8557 isOptimizableIVTruncate(I), Range)) { 8558 8559 auto *Phi = cast<PHINode>(I->getOperand(0)); 8560 const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi); 8561 VPValue *Start = Plan.getOrAddVPValue(II.getStartValue()); 8562 return new VPWidenIntOrFpInductionRecipe(Phi, Start, II, I); 8563 } 8564 return nullptr; 8565 } 8566 8567 VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi, 8568 ArrayRef<VPValue *> Operands, 8569 VPlanPtr &Plan) { 8570 // If all incoming values are equal, the incoming VPValue can be used directly 8571 // instead of creating a new VPBlendRecipe. 8572 VPValue *FirstIncoming = Operands[0]; 8573 if (all_of(Operands, [FirstIncoming](const VPValue *Inc) { 8574 return FirstIncoming == Inc; 8575 })) { 8576 return Operands[0]; 8577 } 8578 8579 // We know that all PHIs in non-header blocks are converted into selects, so 8580 // we don't have to worry about the insertion order and we can just use the 8581 // builder. At this point we generate the predication tree. There may be 8582 // duplications since this is a simple recursive scan, but future 8583 // optimizations will clean it up. 
8584 SmallVector<VPValue *, 2> OperandsWithMask; 8585 unsigned NumIncoming = Phi->getNumIncomingValues(); 8586 8587 for (unsigned In = 0; In < NumIncoming; In++) { 8588 VPValue *EdgeMask = 8589 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan); 8590 assert((EdgeMask || NumIncoming == 1) && 8591 "Multiple predecessors with one having a full mask"); 8592 OperandsWithMask.push_back(Operands[In]); 8593 if (EdgeMask) 8594 OperandsWithMask.push_back(EdgeMask); 8595 } 8596 return toVPRecipeResult(new VPBlendRecipe(Phi, OperandsWithMask)); 8597 } 8598 8599 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, 8600 ArrayRef<VPValue *> Operands, 8601 VFRange &Range) const { 8602 8603 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8604 [this, CI](ElementCount VF) { return CM.isScalarWithPredication(CI); }, 8605 Range); 8606 8607 if (IsPredicated) 8608 return nullptr; 8609 8610 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8611 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || 8612 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect || 8613 ID == Intrinsic::pseudoprobe || 8614 ID == Intrinsic::experimental_noalias_scope_decl)) 8615 return nullptr; 8616 8617 auto willWiden = [&](ElementCount VF) -> bool { 8618 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8619 // The following case may be scalarized depending on the VF. 8620 // The flag shows whether we use Intrinsic or a usual Call for vectorized 8621 // version of the instruction. 8622 // Is it beneficial to perform intrinsic call compared to lib call? 8623 bool NeedToScalarize = false; 8624 InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); 8625 InstructionCost IntrinsicCost = ID ? 
CM.getVectorIntrinsicCost(CI, VF) : 0; 8626 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 8627 return UseVectorIntrinsic || !NeedToScalarize; 8628 }; 8629 8630 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8631 return nullptr; 8632 8633 ArrayRef<VPValue *> Ops = Operands.take_front(CI->arg_size()); 8634 return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end())); 8635 } 8636 8637 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { 8638 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) && 8639 !isa<StoreInst>(I) && "Instruction should have been handled earlier"); 8640 // Instruction should be widened, unless it is scalar after vectorization, 8641 // scalarization is profitable or it is predicated. 8642 auto WillScalarize = [this, I](ElementCount VF) -> bool { 8643 return CM.isScalarAfterVectorization(I, VF) || 8644 CM.isProfitableToScalarize(I, VF) || CM.isScalarWithPredication(I); 8645 }; 8646 return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize, 8647 Range); 8648 } 8649 8650 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, 8651 ArrayRef<VPValue *> Operands) const { 8652 auto IsVectorizableOpcode = [](unsigned Opcode) { 8653 switch (Opcode) { 8654 case Instruction::Add: 8655 case Instruction::And: 8656 case Instruction::AShr: 8657 case Instruction::BitCast: 8658 case Instruction::FAdd: 8659 case Instruction::FCmp: 8660 case Instruction::FDiv: 8661 case Instruction::FMul: 8662 case Instruction::FNeg: 8663 case Instruction::FPExt: 8664 case Instruction::FPToSI: 8665 case Instruction::FPToUI: 8666 case Instruction::FPTrunc: 8667 case Instruction::FRem: 8668 case Instruction::FSub: 8669 case Instruction::ICmp: 8670 case Instruction::IntToPtr: 8671 case Instruction::LShr: 8672 case Instruction::Mul: 8673 case Instruction::Or: 8674 case Instruction::PtrToInt: 8675 case Instruction::SDiv: 8676 case Instruction::Select: 8677 case Instruction::SExt: 8678 
case Instruction::Shl: 8679 case Instruction::SIToFP: 8680 case Instruction::SRem: 8681 case Instruction::Sub: 8682 case Instruction::Trunc: 8683 case Instruction::UDiv: 8684 case Instruction::UIToFP: 8685 case Instruction::URem: 8686 case Instruction::Xor: 8687 case Instruction::ZExt: 8688 return true; 8689 } 8690 return false; 8691 }; 8692 8693 if (!IsVectorizableOpcode(I->getOpcode())) 8694 return nullptr; 8695 8696 // Success: widen this instruction. 8697 return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end())); 8698 } 8699 8700 void VPRecipeBuilder::fixHeaderPhis() { 8701 BasicBlock *OrigLatch = OrigLoop->getLoopLatch(); 8702 for (VPHeaderPHIRecipe *R : PhisToFix) { 8703 auto *PN = cast<PHINode>(R->getUnderlyingValue()); 8704 VPRecipeBase *IncR = 8705 getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch))); 8706 R->addOperand(IncR->getVPSingleValue()); 8707 } 8708 } 8709 8710 VPBasicBlock *VPRecipeBuilder::handleReplication( 8711 Instruction *I, VFRange &Range, VPBasicBlock *VPBB, 8712 VPlanPtr &Plan) { 8713 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( 8714 [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); }, 8715 Range); 8716 8717 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8718 [&](ElementCount VF) { return CM.isPredicatedInst(I, IsUniform); }, 8719 Range); 8720 8721 // Even if the instruction is not marked as uniform, there are certain 8722 // intrinsic calls that can be effectively treated as such, so we check for 8723 // them here. Conservatively, we only do this for scalable vectors, since 8724 // for fixed-width VFs we can always fall back on full scalarization. 
  if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) {
    switch (cast<IntrinsicInst>(I)->getIntrinsicID()) {
    case Intrinsic::assume:
    case Intrinsic::lifetime_start:
    case Intrinsic::lifetime_end:
      // For scalable vectors if one of the operands is variant then we still
      // want to mark as uniform, which will generate one instruction for just
      // the first lane of the vector. We can't scalarize the call in the same
      // way as for fixed-width vectors because we don't know how many lanes
      // there are.
      //
      // The reasons for doing it this way for scalable vectors are:
      //  1. For the assume intrinsic generating the instruction for the first
      //     lane is still better than not generating any at all. For
      //     example, the input may be a splat across all lanes.
      //  2. For the lifetime start/end intrinsics the pointer operand only
      //     does anything useful when the input comes from a stack object,
      //     which suggests it should always be uniform. For non-stack objects
      //     the effect is to poison the object, which still allows us to
      //     remove the call.
      IsUniform = true;
      break;
    default:
      break;
    }
  }

  auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
                                       IsUniform, IsPredicated);
  setRecipe(I, Recipe);
  Plan->addVPValue(I, Recipe);

  // Find if I uses a predicated instruction. If so, it will use its scalar
  // value. Avoid hoisting the insert-element which packs the scalar value into
  // a vector value, as that happens iff all users use the vector value.
  for (VPValue *Op : Recipe->operands()) {
    auto *PredR = dyn_cast_or_null<VPPredInstPHIRecipe>(Op->getDef());
    if (!PredR)
      continue;
    auto *RepR =
        cast_or_null<VPReplicateRecipe>(PredR->getOperand(0)->getDef());
    assert(RepR->isPredicated() &&
           "expected Replicate recipe to be predicated");
    RepR->setAlsoPack(false);
  }

  // Finalize the recipe for Instr, first if it is not predicated.
  if (!IsPredicated) {
    LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
    VPBB->appendRecipe(Recipe);
    return VPBB;
  }
  LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");

  // Splice a new replicate region between VPBB and its single successor:
  // VPBB -> Region(if-then) -> RegSucc -> old successor.
  VPBlockBase *SingleSucc = VPBB->getSingleSuccessor();
  assert(SingleSucc && "VPBB must have a single successor when handling "
                       "predicated replication.");
  VPBlockUtils::disconnectBlocks(VPBB, SingleSucc);
  // Record predicated instructions for above packing optimizations.
  VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
  VPBlockUtils::insertBlockAfter(Region, VPBB);
  auto *RegSucc = new VPBasicBlock();
  VPBlockUtils::insertBlockAfter(RegSucc, Region);
  VPBlockUtils::connectBlocks(RegSucc, SingleSucc);
  return RegSucc;
}

// Build the triangular if-then replicate region hosting PredRecipe: an entry
// block branching on the block-in mask, a predicated "if" block, and a
// "continue" block with a phi merging the predicated value (if any).
VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
                                                      VPRecipeBase *PredRecipe,
                                                      VPlanPtr &Plan) {
  // Instructions marked for predication are replicated and placed under an
  // if-then construct to prevent side-effects.

  // Generate recipes to compute the block mask for this region.
  VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);

  // Build the triangular if-then region.
  std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
  assert(Instr->getParent() && "Predicated instruction not in any basic block");
  auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
  auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
  // Void instructions produce no value, so no merge phi is needed.
  auto *PHIRecipe = Instr->getType()->isVoidTy()
                        ? nullptr
                        : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr));
  if (PHIRecipe) {
    // The phi, not the replicated recipe, now represents Instr's value.
    Plan->removeVPValueFor(Instr);
    Plan->addVPValue(Instr, PHIRecipe);
  }
  auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
  auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
  VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);

  // Note: first set Entry as region entry and then connect successors starting
  // from it in order, to propagate the "parent" of each VPBasicBlock.
  VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
  VPBlockUtils::connectBlocks(Pred, Exit);

  return Region;
}

// Dispatch Instr to the matching widening strategy (call, memory, phi/blend,
// induction truncate, GEP, select, or generic widen). Returns a recipe, an
// existing VPValue the instruction simplifies to, or null to fall back on
// replication.
VPRecipeOrVPValueTy
VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
                                        ArrayRef<VPValue *> Operands,
                                        VFRange &Range, VPlanPtr &Plan) {
  // First, check for specific widening recipes that deal with calls, memory
  // operations, inductions and Phi nodes.
  if (auto *CI = dyn_cast<CallInst>(Instr))
    return toVPRecipeResult(tryToWidenCall(CI, Operands, Range));

  if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
    return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan));

  VPRecipeBase *Recipe;
  if (auto Phi = dyn_cast<PHINode>(Instr)) {
    if (Phi->getParent() != OrigLoop->getHeader())
      return tryToBlend(Phi, Operands, Plan);
    if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands)))
      return toVPRecipeResult(Recipe);

    VPHeaderPHIRecipe *PhiRecipe = nullptr;
    if (Legal->isReductionVariable(Phi) || Legal->isFirstOrderRecurrence(Phi)) {
      VPValue *StartV = Operands[0];
      if (Legal->isReductionVariable(Phi)) {
        const RecurrenceDescriptor &RdxDesc =
            Legal->getReductionVars().find(Phi)->second;
        assert(RdxDesc.getRecurrenceStartValue() ==
               Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
        PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV,
                                             CM.isInLoopReduction(Phi),
                                             CM.useOrderedReductions(RdxDesc));
      } else {
        PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV);
      }

      // Record the incoming value from the backedge, so we can add the incoming
      // value from the backedge after all recipes have been created.
      recordRecipeOf(cast<Instruction>(
          Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch())));
      PhisToFix.push_back(PhiRecipe);
    } else {
      // TODO: record backedge value for remaining pointer induction phis.
      assert(Phi->getType()->isPointerTy() &&
             "only pointer phis should be handled here");
      assert(Legal->getInductionVars().count(Phi) &&
             "Not an induction variable");
      InductionDescriptor II = Legal->getInductionVars().lookup(Phi);
      VPValue *Start = Plan->getOrAddVPValue(II.getStartValue());
      PhiRecipe = new VPWidenPHIRecipe(Phi, Start);
    }

    return toVPRecipeResult(PhiRecipe);
  }

  if (isa<TruncInst>(Instr) &&
      (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Operands,
                                               Range, *Plan)))
    return toVPRecipeResult(Recipe);

  if (!shouldWiden(Instr, Range))
    return nullptr;

  if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
    return toVPRecipeResult(new VPWidenGEPRecipe(
        GEP, make_range(Operands.begin(), Operands.end()), OrigLoop));

  if (auto *SI = dyn_cast<SelectInst>(Instr)) {
    bool InvariantCond =
        PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
    return toVPRecipeResult(new VPWidenSelectRecipe(
        *SI, make_range(Operands.begin(), Operands.end()), InvariantCond));
  }

  return toVPRecipeResult(tryToWiden(Instr, Operands));
}

void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
                                                        ElementCount MaxVF) {
  assert(OrigLoop->isInnermost() && "Inner loop expected.");

  // Collect instructions from the original loop that will become trivially dead
  // in the vectorized loop. We don't need to vectorize these instructions. For
  // example, original induction update instructions can become dead because we
  // separately emit induction "steps" when generating code for the new loop.
  // Similarly, we create a new latch condition when setting up the structure
  // of the new loop, so the old one can become dead.
  SmallPtrSet<Instruction *, 4> DeadInstructions;
  collectTriviallyDeadInstructions(DeadInstructions);

  // Add assume instructions we need to drop to DeadInstructions, to prevent
  // them from being added to the VPlan.
  // TODO: We only need to drop assumes in blocks that get flattened. If the
  // control flow is preserved, we should keep them.
  auto &ConditionalAssumes = Legal->getConditionalAssumes();
  DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());

  MapVector<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
  // Dead instructions do not need sinking. Remove them from SinkAfter.
  for (Instruction *I : DeadInstructions)
    SinkAfter.erase(I);

  // Cannot sink instructions after dead instructions (there won't be any
  // recipes for them). Instead, find the first non-dead previous instruction.
  for (auto &P : Legal->getSinkAfter()) {
    Instruction *SinkTarget = P.second;
    Instruction *FirstInst = &*SinkTarget->getParent()->begin();
    (void)FirstInst; // Only used inside the assert below.
    while (DeadInstructions.contains(SinkTarget)) {
      assert(
          SinkTarget != FirstInst &&
          "Must find a live instruction (at least the one feeding the "
          "first-order recurrence PHI) before reaching beginning of the block");
      SinkTarget = SinkTarget->getPrevNode();
      assert(SinkTarget != P.first &&
             "sink source equals target, no sinking required");
    }
    P.second = SinkTarget;
  }

  // Build one VPlan per serviced sub-range of [MinVF, MaxVF].
  auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
  for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
    VFRange SubRange = {VF, MaxVFPlusOne};
    VPlans.push_back(
        buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter));
    VF = SubRange.End;
  }
}

VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
    VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions,
    const MapVector<Instruction
*, Instruction *> &SinkAfter) { 8955 8956 SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups; 8957 8958 VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder); 8959 8960 // --------------------------------------------------------------------------- 8961 // Pre-construction: record ingredients whose recipes we'll need to further 8962 // process after constructing the initial VPlan. 8963 // --------------------------------------------------------------------------- 8964 8965 // Mark instructions we'll need to sink later and their targets as 8966 // ingredients whose recipe we'll need to record. 8967 for (auto &Entry : SinkAfter) { 8968 RecipeBuilder.recordRecipeOf(Entry.first); 8969 RecipeBuilder.recordRecipeOf(Entry.second); 8970 } 8971 for (auto &Reduction : CM.getInLoopReductionChains()) { 8972 PHINode *Phi = Reduction.first; 8973 RecurKind Kind = 8974 Legal->getReductionVars().find(Phi)->second.getRecurrenceKind(); 8975 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second; 8976 8977 RecipeBuilder.recordRecipeOf(Phi); 8978 for (auto &R : ReductionOperations) { 8979 RecipeBuilder.recordRecipeOf(R); 8980 // For min/max reducitons, where we have a pair of icmp/select, we also 8981 // need to record the ICmp recipe, so it can be removed later. 8982 assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) && 8983 "Only min/max recurrences allowed for inloop reductions"); 8984 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) 8985 RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0))); 8986 } 8987 } 8988 8989 // For each interleave group which is relevant for this (possibly trimmed) 8990 // Range, add it to the set of groups to be later applied to the VPlan and add 8991 // placeholders for its members' Recipes which we'll be replacing with a 8992 // single VPInterleaveRecipe. 
8993 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) { 8994 auto applyIG = [IG, this](ElementCount VF) -> bool { 8995 return (VF.isVector() && // Query is illegal for VF == 1 8996 CM.getWideningDecision(IG->getInsertPos(), VF) == 8997 LoopVectorizationCostModel::CM_Interleave); 8998 }; 8999 if (!getDecisionAndClampRange(applyIG, Range)) 9000 continue; 9001 InterleaveGroups.insert(IG); 9002 for (unsigned i = 0; i < IG->getFactor(); i++) 9003 if (Instruction *Member = IG->getMember(i)) 9004 RecipeBuilder.recordRecipeOf(Member); 9005 }; 9006 9007 // --------------------------------------------------------------------------- 9008 // Build initial VPlan: Scan the body of the loop in a topological order to 9009 // visit each basic block after having visited its predecessor basic blocks. 9010 // --------------------------------------------------------------------------- 9011 9012 // Create initial VPlan skeleton, with separate header and latch blocks. 9013 VPBasicBlock *HeaderVPBB = new VPBasicBlock(); 9014 VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch"); 9015 VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB); 9016 auto *TopRegion = new VPRegionBlock(HeaderVPBB, LatchVPBB, "vector loop"); 9017 auto Plan = std::make_unique<VPlan>(TopRegion); 9018 9019 // Scan the body of the loop in a topological order to visit each basic block 9020 // after having visited its predecessor basic blocks. 9021 LoopBlocksDFS DFS(OrigLoop); 9022 DFS.perform(LI); 9023 9024 VPBasicBlock *VPBB = HeaderVPBB; 9025 SmallVector<VPWidenIntOrFpInductionRecipe *> InductionsToMove; 9026 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 9027 // Relevant instructions from basic block BB will be grouped into VPRecipe 9028 // ingredients and fill a new VPBasicBlock. 9029 unsigned VPBBsForBB = 0; 9030 VPBB->setName(BB->getName()); 9031 Builder.setInsertPoint(VPBB); 9032 9033 // Introduce each ingredient into VPlan. 
9034 // TODO: Model and preserve debug instrinsics in VPlan. 9035 for (Instruction &I : BB->instructionsWithoutDebug()) { 9036 Instruction *Instr = &I; 9037 9038 // First filter out irrelevant instructions, to ensure no recipes are 9039 // built for them. 9040 if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr)) 9041 continue; 9042 9043 SmallVector<VPValue *, 4> Operands; 9044 auto *Phi = dyn_cast<PHINode>(Instr); 9045 if (Phi && Phi->getParent() == OrigLoop->getHeader()) { 9046 Operands.push_back(Plan->getOrAddVPValue( 9047 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()))); 9048 } else { 9049 auto OpRange = Plan->mapToVPValues(Instr->operands()); 9050 Operands = {OpRange.begin(), OpRange.end()}; 9051 } 9052 if (auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe( 9053 Instr, Operands, Range, Plan)) { 9054 // If Instr can be simplified to an existing VPValue, use it. 9055 if (RecipeOrValue.is<VPValue *>()) { 9056 auto *VPV = RecipeOrValue.get<VPValue *>(); 9057 Plan->addVPValue(Instr, VPV); 9058 // If the re-used value is a recipe, register the recipe for the 9059 // instruction, in case the recipe for Instr needs to be recorded. 9060 if (auto *R = dyn_cast_or_null<VPRecipeBase>(VPV->getDef())) 9061 RecipeBuilder.setRecipe(Instr, R); 9062 continue; 9063 } 9064 // Otherwise, add the new recipe. 9065 VPRecipeBase *Recipe = RecipeOrValue.get<VPRecipeBase *>(); 9066 for (auto *Def : Recipe->definedValues()) { 9067 auto *UV = Def->getUnderlyingValue(); 9068 Plan->addVPValue(UV, Def); 9069 } 9070 9071 if (isa<VPWidenIntOrFpInductionRecipe>(Recipe) && 9072 HeaderVPBB->getFirstNonPhi() != VPBB->end()) { 9073 // Keep track of VPWidenIntOrFpInductionRecipes not in the phi section 9074 // of the header block. That can happen for truncates of induction 9075 // variables. Those recipes are moved to the phi section of the header 9076 // block after applying SinkAfter, which relies on the original 9077 // position of the trunc. 
9078 assert(isa<TruncInst>(Instr)); 9079 InductionsToMove.push_back( 9080 cast<VPWidenIntOrFpInductionRecipe>(Recipe)); 9081 } 9082 RecipeBuilder.setRecipe(Instr, Recipe); 9083 VPBB->appendRecipe(Recipe); 9084 continue; 9085 } 9086 9087 // Otherwise, if all widening options failed, Instruction is to be 9088 // replicated. This may create a successor for VPBB. 9089 VPBasicBlock *NextVPBB = 9090 RecipeBuilder.handleReplication(Instr, Range, VPBB, Plan); 9091 if (NextVPBB != VPBB) { 9092 VPBB = NextVPBB; 9093 VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++) 9094 : ""); 9095 } 9096 } 9097 9098 VPBlockUtils::insertBlockAfter(new VPBasicBlock(), VPBB); 9099 VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor()); 9100 } 9101 9102 // Fold the last, empty block into its predecessor. 9103 VPBB = VPBlockUtils::tryToMergeBlockIntoPredecessor(VPBB); 9104 assert(VPBB && "expected to fold last (empty) block"); 9105 // After here, VPBB should not be used. 9106 VPBB = nullptr; 9107 9108 assert(isa<VPRegionBlock>(Plan->getEntry()) && 9109 !Plan->getEntry()->getEntryBasicBlock()->empty() && 9110 "entry block must be set to a VPRegionBlock having a non-empty entry " 9111 "VPBasicBlock"); 9112 RecipeBuilder.fixHeaderPhis(); 9113 9114 // --------------------------------------------------------------------------- 9115 // Transform initial VPlan: Apply previously taken decisions, in order, to 9116 // bring the VPlan to its final state. 9117 // --------------------------------------------------------------------------- 9118 9119 // Apply Sink-After legal constraints. 
9120 auto GetReplicateRegion = [](VPRecipeBase *R) -> VPRegionBlock * { 9121 auto *Region = dyn_cast_or_null<VPRegionBlock>(R->getParent()->getParent()); 9122 if (Region && Region->isReplicator()) { 9123 assert(Region->getNumSuccessors() == 1 && 9124 Region->getNumPredecessors() == 1 && "Expected SESE region!"); 9125 assert(R->getParent()->size() == 1 && 9126 "A recipe in an original replicator region must be the only " 9127 "recipe in its block"); 9128 return Region; 9129 } 9130 return nullptr; 9131 }; 9132 for (auto &Entry : SinkAfter) { 9133 VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first); 9134 VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second); 9135 9136 auto *TargetRegion = GetReplicateRegion(Target); 9137 auto *SinkRegion = GetReplicateRegion(Sink); 9138 if (!SinkRegion) { 9139 // If the sink source is not a replicate region, sink the recipe directly. 9140 if (TargetRegion) { 9141 // The target is in a replication region, make sure to move Sink to 9142 // the block after it, not into the replication region itself. 9143 VPBasicBlock *NextBlock = 9144 cast<VPBasicBlock>(TargetRegion->getSuccessors().front()); 9145 Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi()); 9146 } else 9147 Sink->moveAfter(Target); 9148 continue; 9149 } 9150 9151 // The sink source is in a replicate region. Unhook the region from the CFG. 9152 auto *SinkPred = SinkRegion->getSinglePredecessor(); 9153 auto *SinkSucc = SinkRegion->getSingleSuccessor(); 9154 VPBlockUtils::disconnectBlocks(SinkPred, SinkRegion); 9155 VPBlockUtils::disconnectBlocks(SinkRegion, SinkSucc); 9156 VPBlockUtils::connectBlocks(SinkPred, SinkSucc); 9157 9158 if (TargetRegion) { 9159 // The target recipe is also in a replicate region, move the sink region 9160 // after the target region. 
9161 auto *TargetSucc = TargetRegion->getSingleSuccessor(); 9162 VPBlockUtils::disconnectBlocks(TargetRegion, TargetSucc); 9163 VPBlockUtils::connectBlocks(TargetRegion, SinkRegion); 9164 VPBlockUtils::connectBlocks(SinkRegion, TargetSucc); 9165 } else { 9166 // The sink source is in a replicate region, we need to move the whole 9167 // replicate region, which should only contain a single recipe in the 9168 // main block. 9169 auto *SplitBlock = 9170 Target->getParent()->splitAt(std::next(Target->getIterator())); 9171 9172 auto *SplitPred = SplitBlock->getSinglePredecessor(); 9173 9174 VPBlockUtils::disconnectBlocks(SplitPred, SplitBlock); 9175 VPBlockUtils::connectBlocks(SplitPred, SinkRegion); 9176 VPBlockUtils::connectBlocks(SinkRegion, SplitBlock); 9177 } 9178 } 9179 9180 VPlanTransforms::removeRedundantInductionCasts(*Plan); 9181 9182 // Now that sink-after is done, move induction recipes for optimized truncates 9183 // to the phi section of the header block. 9184 for (VPWidenIntOrFpInductionRecipe *Ind : InductionsToMove) 9185 Ind->moveBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi()); 9186 9187 // Adjust the recipes for any inloop reductions. 9188 adjustRecipesForReductions(cast<VPBasicBlock>(TopRegion->getExit()), Plan, 9189 RecipeBuilder, Range.Start); 9190 9191 // Introduce a recipe to combine the incoming and previous values of a 9192 // first-order recurrence. 
9193 for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) { 9194 auto *RecurPhi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R); 9195 if (!RecurPhi) 9196 continue; 9197 9198 VPRecipeBase *PrevRecipe = RecurPhi->getBackedgeRecipe(); 9199 VPBasicBlock *InsertBlock = PrevRecipe->getParent(); 9200 auto *Region = GetReplicateRegion(PrevRecipe); 9201 if (Region) 9202 InsertBlock = cast<VPBasicBlock>(Region->getSingleSuccessor()); 9203 if (Region || PrevRecipe->isPhi()) 9204 Builder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi()); 9205 else 9206 Builder.setInsertPoint(InsertBlock, std::next(PrevRecipe->getIterator())); 9207 9208 auto *RecurSplice = cast<VPInstruction>( 9209 Builder.createNaryOp(VPInstruction::FirstOrderRecurrenceSplice, 9210 {RecurPhi, RecurPhi->getBackedgeValue()})); 9211 9212 RecurPhi->replaceAllUsesWith(RecurSplice); 9213 // Set the first operand of RecurSplice to RecurPhi again, after replacing 9214 // all users. 9215 RecurSplice->setOperand(0, RecurPhi); 9216 } 9217 9218 // Interleave memory: for each Interleave Group we marked earlier as relevant 9219 // for this VPlan, replace the Recipes widening its memory instructions with a 9220 // single VPInterleaveRecipe at its insertion point. 
9221 for (auto IG : InterleaveGroups) { 9222 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( 9223 RecipeBuilder.getRecipe(IG->getInsertPos())); 9224 SmallVector<VPValue *, 4> StoredValues; 9225 for (unsigned i = 0; i < IG->getFactor(); ++i) 9226 if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) { 9227 auto *StoreR = 9228 cast<VPWidenMemoryInstructionRecipe>(RecipeBuilder.getRecipe(SI)); 9229 StoredValues.push_back(StoreR->getStoredValue()); 9230 } 9231 9232 auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues, 9233 Recipe->getMask()); 9234 VPIG->insertBefore(Recipe); 9235 unsigned J = 0; 9236 for (unsigned i = 0; i < IG->getFactor(); ++i) 9237 if (Instruction *Member = IG->getMember(i)) { 9238 if (!Member->getType()->isVoidTy()) { 9239 VPValue *OriginalV = Plan->getVPValue(Member); 9240 Plan->removeVPValueFor(Member); 9241 Plan->addVPValue(Member, VPIG->getVPValue(J)); 9242 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J)); 9243 J++; 9244 } 9245 RecipeBuilder.getRecipe(Member)->eraseFromParent(); 9246 } 9247 } 9248 9249 // From this point onwards, VPlan-to-VPlan transformations may change the plan 9250 // in ways that accessing values using original IR values is incorrect. 9251 Plan->disableValue2VPValue(); 9252 9253 VPlanTransforms::sinkScalarOperands(*Plan); 9254 VPlanTransforms::mergeReplicateRegions(*Plan); 9255 9256 std::string PlanName; 9257 raw_string_ostream RSO(PlanName); 9258 ElementCount VF = Range.Start; 9259 Plan->addVF(VF); 9260 RSO << "Initial VPlan for VF={" << VF; 9261 for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) { 9262 Plan->addVF(VF); 9263 RSO << "," << VF; 9264 } 9265 RSO << "},UF>=1"; 9266 RSO.flush(); 9267 Plan->setName(PlanName); 9268 9269 // Fold Exit block into its predecessor if possible. 9270 // TODO: Fold block earlier once all VPlan transforms properly maintain a 9271 // VPBasicBlock as exit. 
9272 VPBlockUtils::tryToMergeBlockIntoPredecessor(TopRegion->getExit()); 9273 9274 assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid"); 9275 return Plan; 9276 } 9277 9278 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { 9279 // Outer loop handling: They may require CFG and instruction level 9280 // transformations before even evaluating whether vectorization is profitable. 9281 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 9282 // the vectorization pipeline. 9283 assert(!OrigLoop->isInnermost()); 9284 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 9285 9286 // Create new empty VPlan 9287 auto Plan = std::make_unique<VPlan>(); 9288 9289 // Build hierarchical CFG 9290 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan); 9291 HCFGBuilder.buildHierarchicalCFG(); 9292 9293 for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End); 9294 VF *= 2) 9295 Plan->addVF(VF); 9296 9297 if (EnableVPlanPredication) { 9298 VPlanPredicator VPP(*Plan); 9299 VPP.predicate(); 9300 9301 // Avoid running transformation to recipes until masked code generation in 9302 // VPlan-native path is in place. 9303 return Plan; 9304 } 9305 9306 SmallPtrSet<Instruction *, 1> DeadInstructions; 9307 VPlanTransforms::VPInstructionsToVPRecipes( 9308 OrigLoop, Plan, 9309 [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); }, 9310 DeadInstructions, *PSE.getSE()); 9311 return Plan; 9312 } 9313 9314 // Adjust the recipes for reductions. For in-loop reductions the chain of 9315 // instructions leading from the loop exit instr to the phi need to be converted 9316 // to reductions, with one operand being vector and the other being the scalar 9317 // reduction chain. For other reductions, a select is introduced between the phi 9318 // and live-out recipes when folding the tail. 
9319 void LoopVectorizationPlanner::adjustRecipesForReductions( 9320 VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, 9321 ElementCount MinVF) { 9322 for (auto &Reduction : CM.getInLoopReductionChains()) { 9323 PHINode *Phi = Reduction.first; 9324 const RecurrenceDescriptor &RdxDesc = 9325 Legal->getReductionVars().find(Phi)->second; 9326 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second; 9327 9328 if (MinVF.isScalar() && !CM.useOrderedReductions(RdxDesc)) 9329 continue; 9330 9331 // ReductionOperations are orders top-down from the phi's use to the 9332 // LoopExitValue. We keep a track of the previous item (the Chain) to tell 9333 // which of the two operands will remain scalar and which will be reduced. 9334 // For minmax the chain will be the select instructions. 9335 Instruction *Chain = Phi; 9336 for (Instruction *R : ReductionOperations) { 9337 VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R); 9338 RecurKind Kind = RdxDesc.getRecurrenceKind(); 9339 9340 VPValue *ChainOp = Plan->getVPValue(Chain); 9341 unsigned FirstOpId; 9342 assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) && 9343 "Only min/max recurrences allowed for inloop reductions"); 9344 // Recognize a call to the llvm.fmuladd intrinsic. 9345 bool IsFMulAdd = (Kind == RecurKind::FMulAdd); 9346 assert((!IsFMulAdd || RecurrenceDescriptor::isFMulAddIntrinsic(R)) && 9347 "Expected instruction to be a call to the llvm.fmuladd intrinsic"); 9348 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9349 assert(isa<VPWidenSelectRecipe>(WidenRecipe) && 9350 "Expected to replace a VPWidenSelectSC"); 9351 FirstOpId = 1; 9352 } else { 9353 assert((MinVF.isScalar() || isa<VPWidenRecipe>(WidenRecipe) || 9354 (IsFMulAdd && isa<VPWidenCallRecipe>(WidenRecipe))) && 9355 "Expected to replace a VPWidenSC"); 9356 FirstOpId = 0; 9357 } 9358 unsigned VecOpId = 9359 R->getOperand(FirstOpId) == Chain ? 
FirstOpId + 1 : FirstOpId; 9360 VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId)); 9361 9362 auto *CondOp = CM.foldTailByMasking() 9363 ? RecipeBuilder.createBlockInMask(R->getParent(), Plan) 9364 : nullptr; 9365 9366 if (IsFMulAdd) { 9367 // If the instruction is a call to the llvm.fmuladd intrinsic then we 9368 // need to create an fmul recipe to use as the vector operand for the 9369 // fadd reduction. 9370 VPInstruction *FMulRecipe = new VPInstruction( 9371 Instruction::FMul, {VecOp, Plan->getVPValue(R->getOperand(1))}); 9372 FMulRecipe->setFastMathFlags(R->getFastMathFlags()); 9373 WidenRecipe->getParent()->insert(FMulRecipe, 9374 WidenRecipe->getIterator()); 9375 VecOp = FMulRecipe; 9376 } 9377 VPReductionRecipe *RedRecipe = 9378 new VPReductionRecipe(&RdxDesc, R, ChainOp, VecOp, CondOp, TTI); 9379 WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe); 9380 Plan->removeVPValueFor(R); 9381 Plan->addVPValue(R, RedRecipe); 9382 WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator()); 9383 WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe); 9384 WidenRecipe->eraseFromParent(); 9385 9386 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9387 VPRecipeBase *CompareRecipe = 9388 RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0))); 9389 assert(isa<VPWidenRecipe>(CompareRecipe) && 9390 "Expected to replace a VPWidenSC"); 9391 assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 && 9392 "Expected no remaining users"); 9393 CompareRecipe->eraseFromParent(); 9394 } 9395 Chain = R; 9396 } 9397 } 9398 9399 // If tail is folded by masking, introduce selects between the phi 9400 // and the live-out instruction of each reduction, at the end of the latch. 
9401 if (CM.foldTailByMasking()) { 9402 for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) { 9403 VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R); 9404 if (!PhiR || PhiR->isInLoop()) 9405 continue; 9406 Builder.setInsertPoint(LatchVPBB); 9407 VPValue *Cond = 9408 RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan); 9409 VPValue *Red = PhiR->getBackedgeValue(); 9410 Builder.createNaryOp(Instruction::Select, {Cond, Red, PhiR}); 9411 } 9412 } 9413 } 9414 9415 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 9416 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, 9417 VPSlotTracker &SlotTracker) const { 9418 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; 9419 IG->getInsertPos()->printAsOperand(O, false); 9420 O << ", "; 9421 getAddr()->printAsOperand(O, SlotTracker); 9422 VPValue *Mask = getMask(); 9423 if (Mask) { 9424 O << ", "; 9425 Mask->printAsOperand(O, SlotTracker); 9426 } 9427 9428 unsigned OpIdx = 0; 9429 for (unsigned i = 0; i < IG->getFactor(); ++i) { 9430 if (!IG->getMember(i)) 9431 continue; 9432 if (getNumStoreOperands() > 0) { 9433 O << "\n" << Indent << " store "; 9434 getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker); 9435 O << " to index " << i; 9436 } else { 9437 O << "\n" << Indent << " "; 9438 getVPValue(OpIdx)->printAsOperand(O, SlotTracker); 9439 O << " = load from index " << i; 9440 } 9441 ++OpIdx; 9442 } 9443 } 9444 #endif 9445 9446 void VPWidenCallRecipe::execute(VPTransformState &State) { 9447 State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this, 9448 *this, State); 9449 } 9450 9451 void VPWidenSelectRecipe::execute(VPTransformState &State) { 9452 auto &I = *cast<SelectInst>(getUnderlyingInstr()); 9453 State.ILV->setDebugLocFromInst(&I); 9454 9455 // The condition can be loop invariant but still defined inside the 9456 // loop. This means that we can't just use the original 'cond' value. 
  // We have to take the 'vectorized' value and pick the first lane.
  // Instcombine will make this a no-op.
  auto *InvarCond =
      InvariantCond ? State.get(getOperand(0), VPIteration(0, 0)) : nullptr;

  // Emit one select per unroll part, reusing the broadcast lane-0 condition
  // when the condition is invariant.
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    Value *Cond = InvarCond ? InvarCond : State.get(getOperand(0), Part);
    Value *Op0 = State.get(getOperand(1), Part);
    Value *Op1 = State.get(getOperand(2), Part);
    Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1);
    State.set(this, Sel, Part);
    State.ILV->addMetadata(Sel, &I);
  }
}

// Widen a single instruction: emit one vector (or scalar, if VF == 1)
// instruction per unroll part. Opcodes with dedicated recipes (calls,
// branches, phis, GEPs, selects) must never reach this switch.
void VPWidenRecipe::execute(VPTransformState &State) {
  auto &I = *cast<Instruction>(getUnderlyingValue());
  auto &Builder = State.Builder;
  switch (I.getOpcode()) {
  case Instruction::Call:
  case Instruction::Br:
  case Instruction::PHI:
  case Instruction::GetElementPtr:
  case Instruction::Select:
    llvm_unreachable("This instruction is handled by a different recipe.");
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::URem:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::FNeg:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    // Just widen unops and binops.
    State.ILV->setDebugLocFromInst(&I);

    for (unsigned Part = 0; Part < State.UF; ++Part) {
      SmallVector<Value *, 2> Ops;
      for (VPValue *VPOp : operands())
        Ops.push_back(State.get(VPOp, Part));

      Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);

      // CreateNAryOp may constant-fold, so only instructions carry IR flags.
      if (auto *VecOp = dyn_cast<Instruction>(V)) {
        VecOp->copyIRFlags(&I);

        // If the instruction is vectorized and was in a basic block that needed
        // predication, we can't propagate poison-generating flags (nuw/nsw,
        // exact, etc.). The control flow has been linearized and the
        // instruction is no longer guarded by the predicate, which could make
        // the flag properties to no longer hold.
        if (State.MayGeneratePoisonRecipes.contains(this))
          VecOp->dropPoisonGeneratingFlags();
      }

      // Use this vector value for all users of the original instruction.
      State.set(this, V, Part);
      State.ILV->addMetadata(V, &I);
    }

    break;
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    // Widen compares. Generate vector compares.
    bool FCmp = (I.getOpcode() == Instruction::FCmp);
    auto *Cmp = cast<CmpInst>(&I);
    State.ILV->setDebugLocFromInst(Cmp);
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      Value *A = State.get(getOperand(0), Part);
      Value *B = State.get(getOperand(1), Part);
      Value *C = nullptr;
      if (FCmp) {
        // Propagate fast math flags.
        IRBuilder<>::FastMathFlagGuard FMFG(Builder);
        Builder.setFastMathFlags(Cmp->getFastMathFlags());
        C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
      } else {
        C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
      }
      State.set(this, C, Part);
      State.ILV->addMetadata(C, &I);
    }

    break;
  }

  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    auto *CI = cast<CastInst>(&I);
    State.ILV->setDebugLocFromInst(CI);

    /// Vectorize casts. For VF == 1 keep the scalar destination type.
    Type *DestTy = (State.VF.isScalar())
                       ? CI->getType()
                       : VectorType::get(CI->getType(), State.VF);

    for (unsigned Part = 0; Part < State.UF; ++Part) {
      Value *A = State.get(getOperand(0), Part);
      Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
      State.set(this, Cast, Part);
      State.ILV->addMetadata(Cast, &I);
    }
    break;
  }
  default:
    // This instruction is not vectorized by simple widening.
    LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
    llvm_unreachable("Unhandled instruction!");
  } // end of switch.
}

void VPWidenGEPRecipe::execute(VPTransformState &State) {
  auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr());
  // Construct a vector GEP by widening the operands of the scalar GEP as
  // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
  // results in a vector of pointers when at least one operand of the GEP
  // is vector-typed. Thus, to keep the representation compact, we only use
  // vector-typed operands for loop-varying values.

  if (State.VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
    // If we are vectorizing, but the GEP has only loop-invariant operands,
    // the GEP we build (by only using vector-typed operands for
    // loop-varying values) would be a scalar pointer. Thus, to ensure we
    // produce a vector of pointers, we need to either arbitrarily pick an
    // operand to broadcast, or broadcast a clone of the original GEP.
    // Here, we broadcast a clone of the original.
    //
    // TODO: If at some point we decide to scalarize instructions having
    //       loop-invariant operands, this special case will no longer be
    //       required. We would add the scalarization decision to
    //       collectLoopScalars() and teach getVectorValue() to broadcast
    //       the lane-zero scalar value.
    auto *Clone = State.Builder.Insert(GEP->clone());
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, Clone);
      State.set(this, EntryPart, Part);
      State.ILV->addMetadata(EntryPart, GEP);
    }
  } else {
    // If the GEP has at least one loop-varying operand, we are sure to
    // produce a vector of pointers. But if we are only unrolling, we want
    // to produce a scalar GEP for each unroll part. Thus, the GEP we
    // produce with the code below will be scalar (if VF == 1) or vector
    // (otherwise). Note that for the unroll-only case, we still maintain
    // values in the vector mapping with initVector, as we do for other
    // instructions.
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      // The pointer operand of the new GEP. If it's loop-invariant, we
      // won't broadcast it.
      auto *Ptr = IsPtrLoopInvariant
                      ? State.get(getOperand(0), VPIteration(0, 0))
                      : State.get(getOperand(0), Part);

      // Collect all the indices for the new GEP. If any index is
      // loop-invariant, we won't broadcast it.
      SmallVector<Value *, 4> Indices;
      for (unsigned I = 1, E = getNumOperands(); I < E; I++) {
        VPValue *Operand = getOperand(I);
        if (IsIndexLoopInvariant[I - 1])
          Indices.push_back(State.get(Operand, VPIteration(0, 0)));
        else
          Indices.push_back(State.get(Operand, Part));
      }

      // If the GEP instruction is vectorized and was in a basic block that
      // needed predication, we can't propagate the poison-generating 'inbounds'
      // flag. The control flow has been linearized and the GEP is no longer
      // guarded by the predicate, which could make the 'inbounds' properties to
      // no longer hold.
      bool IsInBounds =
          GEP->isInBounds() && State.MayGeneratePoisonRecipes.count(this) == 0;

      // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
      // but it should be a vector, otherwise.
      auto *NewGEP = IsInBounds
                         ? State.Builder.CreateInBoundsGEP(
                               GEP->getSourceElementType(), Ptr, Indices)
                         : State.Builder.CreateGEP(GEP->getSourceElementType(),
                                                   Ptr, Indices);
      assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
             "NewGEP is not a pointer vector");
      State.set(this, NewGEP, Part);
      State.ILV->addMetadata(NewGEP, GEP);
    }
  }
}

// Delegate widening of an integer/FP induction to the InnerLoopVectorizer.
void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "Int or FP induction being replicated.");
  State.ILV->widenIntOrFpInduction(IV, getInductionDescriptor(),
                                   getStartValue()->getLiveInIRValue(),
                                   getTruncInst(), getVPValue(0), State);
}

// Delegate widening of a header phi to the InnerLoopVectorizer.
void VPWidenPHIRecipe::execute(VPTransformState &State) {
  State.ILV->widenPHIInstruction(cast<PHINode>(getUnderlyingValue()), this,
                                 State);
}

void VPBlendRecipe::execute(VPTransformState &State) {
  State.ILV->setDebugLocFromInst(Phi, &State.Builder);
  // We know that all PHIs in non-header blocks are converted into
  // selects, so we don't have to worry about the insertion order and we
  // can just use the builder.
  // At this point we generate the predication tree. There may be
  // duplications since this is a simple recursive scan, but future
  // optimizations will clean it up.

  unsigned NumIncoming = getNumIncomingValues();

  // Generate a sequence of selects of the form:
  // SELECT(Mask3, In3,
  //        SELECT(Mask2, In2,
  //               SELECT(Mask1, In1,
  //                      In0)))
  // Note that Mask0 is never used: lanes for which no path reaches this phi and
  // are essentially undef are taken from In0.
  InnerLoopVectorizer::VectorParts Entry(State.UF);
  for (unsigned In = 0; In < NumIncoming; ++In) {
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      // We might have single edge PHIs (blocks) - use an identity
      // 'select' for the first PHI operand.
      Value *In0 = State.get(getIncomingValue(In), Part);
      if (In == 0)
        Entry[Part] = In0; // Initialize with the first incoming value.
      else {
        // Select between the current value and the previous incoming edge
        // based on the incoming mask.
        Value *Cond = State.get(getMask(In), Part);
        Entry[Part] =
            State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
      }
    }
  }
  // Publish the fully-blended value for every unroll part.
  for (unsigned Part = 0; Part < State.UF; ++Part)
    State.set(this, Entry[Part], Part);
}

// Delegate interleave-group code generation to the InnerLoopVectorizer.
void VPInterleaveRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "Interleave group being replicated.");
  State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(),
                                      getStoredValues(), getMask());
}

// Generate the in-loop reduction for each unroll part: optionally mask
// inactive lanes with the recurrence identity, reduce the vector operand, and
// combine with the incoming scalar chain.
void VPReductionRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "Reduction being replicated.");
  Value *PrevInChain = State.get(getChainOp(), 0);
  RecurKind Kind = RdxDesc->getRecurrenceKind();
  bool IsOrdered = State.ILV->useOrderedReductions(*RdxDesc);
  // Propagate the fast-math flags carried by the underlying instruction.
  IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
  State.Builder.setFastMathFlags(RdxDesc->getFastMathFlags());
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    Value *NewVecOp = State.get(getVecOp(), Part);
    if (VPValue *Cond = getCondOp()) {
      // Replace masked-off lanes with the recurrence identity so they do not
      // affect the reduction result.
      Value *NewCond = State.get(Cond, Part);
      VectorType *VecTy = cast<VectorType>(NewVecOp->getType());
      Value *Iden = RdxDesc->getRecurrenceIdentity(
          Kind, VecTy->getElementType(), RdxDesc->getFastMathFlags());
      Value *IdenVec =
          State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden);
      Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec);
      NewVecOp = Select;
    }
    Value *NewRed;
    Value *NextInChain;
    if (IsOrdered) {
      // Ordered (strict FP) reductions must chain sequentially across parts.
      if (State.VF.isVector())
        NewRed = createOrderedReduction(State.Builder, *RdxDesc, NewVecOp,
                                        PrevInChain);
      else
        NewRed = State.Builder.CreateBinOp(
            (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), PrevInChain,
            NewVecOp);
      PrevInChain = NewRed;
    } else {
      PrevInChain = State.get(getChainOp(), Part);
      NewRed = createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp);
    }
    if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
      NextInChain =
          createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(),
                         NewRed, PrevInChain);
    } else if (IsOrdered)
      NextInChain = NewRed;
    else
      NextInChain = State.Builder.CreateBinOp(
          (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), NewRed,
          PrevInChain);
    State.set(this, NextInChain, Part);
  }
}

void VPReplicateRecipe::execute(VPTransformState &State) {
  if (State.Instance) { // Generate a single instance.
    assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
    State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *State.Instance,
                                    IsPredicated, State);
    // Insert scalar instance packing it into a vector.
    if (AlsoPack && State.VF.isVector()) {
      // If we're constructing lane 0, initialize to start from poison.
      if (State.Instance->Lane.isFirstLane()) {
        assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
        Value *Poison = PoisonValue::get(
            VectorType::get(getUnderlyingValue()->getType(), State.VF));
        State.set(this, Poison, State.Instance->Part);
      }
      State.ILV->packScalarIntoVectorValue(this, *State.Instance, State);
    }
    return;
  }

  // Generate scalar instances for all VF lanes of all UF parts, unless the
  // instruction is uniform inwhich case generate only the first lane for each
  // of the UF parts.
  unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();
  assert((!State.VF.isScalable() || IsUniform) &&
         "Can't scalarize a scalable vector");
  for (unsigned Part = 0; Part < State.UF; ++Part)
    for (unsigned Lane = 0; Lane < EndLane; ++Lane)
      State.ILV->scalarizeInstruction(getUnderlyingInstr(), this,
                                      VPIteration(Part, Lane), IsPredicated,
                                      State);
}

// Materialize the conditional branch that guards a predicated (replicated)
// block, for the single instance currently being generated.
void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Branch on Mask works only on single instance.");

  unsigned Part = State.Instance->Part;
  unsigned Lane = State.Instance->Lane.getKnownLane();

  Value *ConditionBit = nullptr;
  VPValue *BlockInMask = getMask();
  if (BlockInMask) {
    ConditionBit = State.get(BlockInMask, Part);
    // Extract this lane's bit when the mask is a vector.
    if (ConditionBit->getType()->isVectorTy())
      ConditionBit = State.Builder.CreateExtractElement(
          ConditionBit, State.Builder.getInt32(Lane));
  } else // Block in mask is all-one.
    ConditionBit = State.Builder.getTrue();

  // Replace the temporary unreachable terminator with a new conditional branch,
  // whose two destinations will be set later when they are created.
  auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
  assert(isa<UnreachableInst>(CurrentTerminator) &&
         "Expected to replace unreachable terminator with conditional branch.");
  auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
  CondBr->setSuccessor(0, nullptr);
  ReplaceInstWithInst(CurrentTerminator, CondBr);
}

void VPPredInstPHIRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Predicated instruction PHI works per instance.");
  Instruction *ScalarPredInst =
      cast<Instruction>(State.get(getOperand(0), *State.Instance));
  BasicBlock *PredicatedBB = ScalarPredInst->getParent();
  BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
  assert(PredicatingBB && "Predicated block has no single predecessor.");
  assert(isa<VPReplicateRecipe>(getOperand(0)) &&
         "operand must be VPReplicateRecipe");

  // By current pack/unpack logic we need to generate only a single phi node: if
  // a vector value for the predicated instruction exists at this point it means
  // the instruction has vector users only, and a phi for the vector value is
  // needed. In this case the recipe of the predicated instruction is marked to
  // also do that packing, thereby "hoisting" the insert-element sequence.
  // Otherwise, a phi node for the scalar value is needed.
  unsigned Part = State.Instance->Part;
  if (State.hasVectorValue(getOperand(0), Part)) {
    Value *VectorValue = State.get(getOperand(0), Part);
    InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
    PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
    VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
    VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
    if (State.hasVectorValue(this, Part))
      State.reset(this, VPhi, Part);
    else
      State.set(this, VPhi, Part);
    // NOTE: Currently we need to update the value of the operand, so the next
    // predicated iteration inserts its generated value in the correct vector.
    State.reset(getOperand(0), VPhi, Part);
  } else {
    Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType();
    PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
    Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()),
                     PredicatingBB);
    Phi->addIncoming(ScalarPredInst, PredicatedBB);
    if (State.hasScalarValue(this, *State.Instance))
      State.reset(this, Phi, *State.Instance);
    else
      State.set(this, Phi, *State.Instance);
    // NOTE: Currently we need to update the value of the operand, so the next
    // predicated iteration inserts its generated value in the correct vector.
    State.reset(getOperand(0), Phi, *State.Instance);
  }
}

// Widen a consecutive or gather/scatter memory access.
void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
  VPValue *StoredValue = isStore() ? getStoredValue() : nullptr;

  // Attempt to issue a wide load.
9880 LoadInst *LI = dyn_cast<LoadInst>(&Ingredient); 9881 StoreInst *SI = dyn_cast<StoreInst>(&Ingredient); 9882 9883 assert((LI || SI) && "Invalid Load/Store instruction"); 9884 assert((!SI || StoredValue) && "No stored value provided for widened store"); 9885 assert((!LI || !StoredValue) && "Stored value provided for widened load"); 9886 9887 Type *ScalarDataTy = getLoadStoreType(&Ingredient); 9888 9889 auto *DataTy = VectorType::get(ScalarDataTy, State.VF); 9890 const Align Alignment = getLoadStoreAlignment(&Ingredient); 9891 bool CreateGatherScatter = !Consecutive; 9892 9893 auto &Builder = State.Builder; 9894 InnerLoopVectorizer::VectorParts BlockInMaskParts(State.UF); 9895 bool isMaskRequired = getMask(); 9896 if (isMaskRequired) 9897 for (unsigned Part = 0; Part < State.UF; ++Part) 9898 BlockInMaskParts[Part] = State.get(getMask(), Part); 9899 9900 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { 9901 // Calculate the pointer for the specific unroll-part. 9902 GetElementPtrInst *PartPtr = nullptr; 9903 9904 bool InBounds = false; 9905 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) 9906 InBounds = gep->isInBounds(); 9907 if (Reverse) { 9908 // If the address is consecutive but reversed, then the 9909 // wide store needs to start at the last vector element. 
9910 // RunTimeVF = VScale * VF.getKnownMinValue() 9911 // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue() 9912 Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), State.VF); 9913 // NumElt = -Part * RunTimeVF 9914 Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF); 9915 // LastLane = 1 - RunTimeVF 9916 Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF); 9917 PartPtr = 9918 cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt)); 9919 PartPtr->setIsInBounds(InBounds); 9920 PartPtr = cast<GetElementPtrInst>( 9921 Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane)); 9922 PartPtr->setIsInBounds(InBounds); 9923 if (isMaskRequired) // Reverse of a null all-one mask is a null mask. 9924 BlockInMaskParts[Part] = 9925 Builder.CreateVectorReverse(BlockInMaskParts[Part], "reverse"); 9926 } else { 9927 Value *Increment = 9928 createStepForVF(Builder, Builder.getInt32Ty(), State.VF, Part); 9929 PartPtr = cast<GetElementPtrInst>( 9930 Builder.CreateGEP(ScalarDataTy, Ptr, Increment)); 9931 PartPtr->setIsInBounds(InBounds); 9932 } 9933 9934 unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); 9935 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); 9936 }; 9937 9938 // Handle Stores: 9939 if (SI) { 9940 State.ILV->setDebugLocFromInst(SI); 9941 9942 for (unsigned Part = 0; Part < State.UF; ++Part) { 9943 Instruction *NewSI = nullptr; 9944 Value *StoredVal = State.get(StoredValue, Part); 9945 if (CreateGatherScatter) { 9946 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 9947 Value *VectorGep = State.get(getAddr(), Part); 9948 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, 9949 MaskPart); 9950 } else { 9951 if (Reverse) { 9952 // If we store to reverse consecutive memory locations, then we need 9953 // to reverse the order of elements in the stored value. 
9954 StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse"); 9955 // We don't want to update the value in the map as it might be used in 9956 // another expression. So don't call resetVectorValue(StoredVal). 9957 } 9958 auto *VecPtr = 9959 CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0))); 9960 if (isMaskRequired) 9961 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, 9962 BlockInMaskParts[Part]); 9963 else 9964 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); 9965 } 9966 State.ILV->addMetadata(NewSI, SI); 9967 } 9968 return; 9969 } 9970 9971 // Handle loads. 9972 assert(LI && "Must have a load instruction"); 9973 State.ILV->setDebugLocFromInst(LI); 9974 for (unsigned Part = 0; Part < State.UF; ++Part) { 9975 Value *NewLI; 9976 if (CreateGatherScatter) { 9977 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 9978 Value *VectorGep = State.get(getAddr(), Part); 9979 NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart, 9980 nullptr, "wide.masked.gather"); 9981 State.ILV->addMetadata(NewLI, LI); 9982 } else { 9983 auto *VecPtr = 9984 CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0))); 9985 if (isMaskRequired) 9986 NewLI = Builder.CreateMaskedLoad( 9987 DataTy, VecPtr, Alignment, BlockInMaskParts[Part], 9988 PoisonValue::get(DataTy), "wide.masked.load"); 9989 else 9990 NewLI = 9991 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); 9992 9993 // Add metadata to the load, but setVectorValue to the reverse shuffle. 
      State.ILV->addMetadata(NewLI, LI);
      // A reverse access reads the lanes back-to-front; flip the loaded
      // vector so users see the lanes in logical order. The metadata above
      // intentionally goes on the load, not on this shuffle.
      if (Reverse)
        NewLI = Builder.CreateVectorReverse(NewLI, "reverse");
    }

    State.set(getVPSingleValue(), NewLI, Part);
  }
}

// Determine how to lower the scalar epilogue, which depends on 1) optimising
// for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
// predication, and 4) a TTI hook that analyses whether the loop is suitable
// for predication.
static ScalarEpilogueLowering getScalarEpilogueLowering(
    Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
    BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
    AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
    LoopVectorizationLegality &LVL) {
  // 1) OptSize takes precedence over all other options, i.e. if this is set,
  // don't look at hints or options, and don't request a scalar epilogue.
  // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
  // LoopAccessInfo (due to code dependency and not being able to reliably get
  // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
  // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
  // versioning when the vectorization is forced, unlike hasOptSize. So revert
  // back to the old way and vectorize with versioning when forced. See D81345.)
  if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
                                                      PGSOQueryType::IRPass) &&
                          Hints.getForce() != LoopVectorizeHints::FK_Enabled))
    return CM_ScalarEpilogueNotAllowedOptSize;

  // 2) If set, obey the directives (the command-line option, when present,
  // overrides both the hints and the TTI hook below).
  if (PreferPredicateOverEpilogue.getNumOccurrences()) {
    switch (PreferPredicateOverEpilogue) {
    case PreferPredicateTy::ScalarEpilogue:
      return CM_ScalarEpilogueAllowed;
    case PreferPredicateTy::PredicateElseScalarEpilogue:
      return CM_ScalarEpilogueNotNeededUsePredicate;
    case PreferPredicateTy::PredicateOrDontVectorize:
      return CM_ScalarEpilogueNotAllowedUsePredicate;
    };
  }

  // 3) If set, obey the hints. Any other hint value (e.g. an undefined
  // predicate hint) falls through to the TTI hook below.
  switch (Hints.getPredicate()) {
  case LoopVectorizeHints::FK_Enabled:
    return CM_ScalarEpilogueNotNeededUsePredicate;
  case LoopVectorizeHints::FK_Disabled:
    return CM_ScalarEpilogueAllowed;
  };

  // 4) if the TTI hook indicates this is profitable, request predication.
  if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
                                       LVL.getLAI()))
    return CM_ScalarEpilogueNotNeededUsePredicate;

  return CM_ScalarEpilogueAllowed;
}

/// Return the vector value corresponding to \p Def for unroll part \p Part,
/// materializing it on demand: live-ins and values known to be uniform are
/// broadcast, otherwise the per-lane scalar values are packed into a vector
/// with a sequence of insertelements. The result is cached in State so the
/// packing is generated only once per (Def, Part).
Value *VPTransformState::get(VPValue *Def, unsigned Part) {
  // If Values have been set for this Def return the one relevant for \p Part.
  if (hasVectorValue(Def, Part))
    return Data.PerPartOutput[Def][Part];

  // No scalar value recorded either: this must be a live-in IR value, which
  // is simply broadcast.
  if (!hasScalarValue(Def, {Part, 0})) {
    Value *IRV = Def->getLiveInIRValue();
    Value *B = ILV->getBroadcastInstrs(IRV);
    set(Def, B, Part);
    return B;
  }

  Value *ScalarValue = get(Def, {Part, 0});
  // If we aren't vectorizing, we can just copy the scalar map values over
  // to the vector map.
  if (VF.isScalar()) {
    set(Def, ScalarValue, Part);
    return ScalarValue;
  }

  auto *RepR = dyn_cast<VPReplicateRecipe>(Def);
  bool IsUniform = RepR && RepR->isUniform();

  unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1;
  // Check if there is a scalar value for the selected lane.
  if (!hasScalarValue(Def, {Part, LastLane})) {
    // At the moment, VPWidenIntOrFpInductionRecipes can also be uniform.
    assert(isa<VPWidenIntOrFpInductionRecipe>(Def->getDef()) &&
           "unexpected recipe found to be invariant");
    IsUniform = true;
    LastLane = 0;
  }

  auto *LastInst = cast<Instruction>(get(Def, {Part, LastLane}));
  // Set the insert point after the last scalarized instruction or after the
  // last PHI, if LastInst is a PHI. This ensures the insertelement sequence
  // will directly follow the scalar definitions.
  auto OldIP = Builder.saveIP();
  auto NewIP =
      isa<PHINode>(LastInst)
          ? BasicBlock::iterator(LastInst->getParent()->getFirstNonPHI())
          : std::next(BasicBlock::iterator(LastInst));
  Builder.SetInsertPoint(&*NewIP);

  // However, if we are vectorizing, we need to construct the vector values.
  // If the value is known to be uniform after vectorization, we can just
  // broadcast the scalar value corresponding to lane zero for each unroll
  // iteration. Otherwise, we construct the vector values using
  // insertelement instructions. Since the resulting vectors are stored in
  // State, we will only generate the insertelements once.
  Value *VectorValue = nullptr;
  if (IsUniform) {
    VectorValue = ILV->getBroadcastInstrs(ScalarValue);
    set(Def, VectorValue, Part);
  } else {
    // Initialize packing with insertelements to start from undef.
    // Lane-by-lane packing only works for a fixed (compile-time known)
    // number of lanes.
    assert(!VF.isScalable() && "VF is assumed to be non scalable.");
    Value *Undef = PoisonValue::get(VectorType::get(LastInst->getType(), VF));
    set(Def, Undef, Part);
    for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
      ILV->packScalarIntoVectorValue(Def, {Part, Lane}, *this);
    VectorValue = get(Def, Part);
  }
  // Restore the caller's insert point; the packing sequence above was
  // emitted next to the scalar definitions, not at the current point.
  Builder.restoreIP(OldIP);
  return VectorValue;
}

// Process the loop in the VPlan-native vectorization path. This path builds
// VPlan upfront in the vectorization pipeline, which allows to apply
// VPlan-to-VPlan transformations from the very beginning without modifying the
// input LLVM IR.
static bool processLoopInVPlanNativePath(
    Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
    LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
    TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
    OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
    ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints,
    LoopVectorizationRequirements &Requirements) {

  // Outer-loop vectorization requires a computable trip count.
  if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
    LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
    return false;
  }
  assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
  Function *F = L->getHeader()->getParent();
  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());

  ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
      F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);

  LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
                                &Hints, IAI);
  // Use the planner for outer loop vectorization.
  // TODO: CM is not used at this point inside the planner. Turn CM into an
  // optional argument if we don't need it in the future.
  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints,
                               Requirements, ORE);

  // Get user vectorization factor.
  ElementCount UserVF = Hints.getWidth();

  CM.collectElementTypesForWidening();

  // Plan how to best vectorize, return the best VF and its cost.
  const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);

  // If we are stress testing VPlan builds, do not attempt to generate vector
  // code. Masked vector code generation support will follow soon.
  // Also, do not attempt to vectorize if no vector code will be produced.
  if (VPlanBuildStressTest || EnableVPlanPredication ||
      VectorizationFactor::Disabled() == VF)
    return false;

  VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);

  // Scope block: Checks is cleaned up as soon as code generation is done.
  {
    GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
                             F->getParent()->getDataLayout());
    // Note: the outer-loop path always uses interleave count 1.
    InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
                           &CM, BFI, PSI, Checks);
    LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
                      << L->getHeader()->getParent()->getName() << "\"\n");
    LVP.executePlan(VF.Width, 1, BestPlan, LB, DT);
  }

  // Mark the loop as already vectorized to avoid vectorizing again.
  Hints.setAlreadyVectorized();
  assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
  return true;
}

// Emit a remark if there are stores to floats that required a floating point
// extension. If the vectorized loop was generated with floating point there
// will be a performance penalty from the conversion overhead and the change in
// the vector width.
static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
  // Seed the worklist with every float-typed store in the loop.
  SmallVector<Instruction *, 4> Worklist;
  for (BasicBlock *BB : L->getBlocks()) {
    for (Instruction &Inst : *BB) {
      if (auto *S = dyn_cast<StoreInst>(&Inst)) {
        if (S->getValueOperand()->getType()->isFloatTy())
          Worklist.push_back(S);
      }
    }
  }

  // Traverse the floating point stores upwards, searching for floating point
  // conversions.
  SmallPtrSet<const Instruction *, 4> Visited;
  SmallPtrSet<const Instruction *, 4> EmittedRemark;
  while (!Worklist.empty()) {
    auto *I = Worklist.pop_back_val();
    // Only walk instructions inside the loop, and visit each one once.
    if (!L->contains(I))
      continue;
    if (!Visited.insert(I).second)
      continue;

    // Emit a remark if the floating point store required a floating
    // point conversion.
    // TODO: More work could be done to identify the root cause such as a
    // constant or a function return type and point the user to it.
    if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second)
      ORE->emit([&]() {
        return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
                                          I->getDebugLoc(), L->getHeader())
               << "floating point conversion changes vector width. "
               << "Mixed floating point precision requires an up/down "
               << "cast that will negatively impact performance.";
      });

    // Continue the upward walk through the operands.
    for (Use &Op : I->operands())
      if (auto *OpI = dyn_cast<Instruction>(Op))
        Worklist.push_back(OpI);
  }
}

// A globally disabled interleaving/vectorization flag implies the
// corresponding "only when explicitly forced" mode.
LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
    : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
                               !EnableLoopInterleaving),
      VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
                              !EnableLoopVectorization) {}

/// Try to vectorize (or, failing that, interleave) loop \p L: run the
/// legality checks, pick a vectorization factor and interleave count via the
/// cost model/planner, emit the appropriate optimization remarks, and execute
/// the chosen VPlan. Returns true iff the IR was changed.
bool LoopVectorizePass::processLoop(Loop *L) {
  assert((EnableVPlanNativePath || L->isInnermost()) &&
         "VPlan-native path is not enabled. Only process inner loops.");

#ifndef NDEBUG
  const std::string DebugLocStr = getDebugLocString(L);
#endif /* NDEBUG */

  LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
                    << L->getHeader()->getParent()->getName() << "\" from "
                    << DebugLocStr << "\n");

  LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI);

  LLVM_DEBUG(
      dbgs() << "LV: Loop hints:"
             << " force="
             << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
                     ? "disabled"
                     : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
                            ? "enabled"
                            : "?"))
             << " width=" << Hints.getWidth()
             << " interleave=" << Hints.getInterleave() << "\n");

  // Function containing loop
  Function *F = L->getHeader()->getParent();

  // Looking at the diagnostic output is the only way to determine if a loop
  // was vectorized (other than looking at the IR or machine code), so it
  // is important to generate an optimization remark for each loop. Most of
  // these messages are generated as OptimizationRemarkAnalysis. Remarks
  // generated as OptimizationRemark and OptimizationRemarkMissed are
  // less verbose reporting vectorized loops and unvectorized loops that may
  // benefit from vectorization, respectively.

  if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
    LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
    return false;
  }

  PredicatedScalarEvolution PSE(*SE, *L);

  // Check if it is legal to vectorize the loop.
  LoopVectorizationRequirements Requirements;
  LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
                                &Requirements, &Hints, DB, AC, BFI, PSI);
  if (!LVL.canVectorize(EnableVPlanNativePath)) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  // Check the function attributes and profiles to find out if this function
  // should be optimized for size.
  ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
      F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL);

  // Entrance to the VPlan-native vectorization path. Outer loops are processed
  // here. They may require CFG and instruction level transformations before
  // even evaluating whether vectorization is profitable. Since we cannot modify
  // the incoming IR, we need to build VPlan upfront in the vectorization
  // pipeline.
  if (!L->isInnermost())
    return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
                                        ORE, BFI, PSI, Hints, Requirements);

  assert(L->isInnermost() && "Inner loop expected.");

  // Check the loop for a trip count threshold: vectorize loops with a tiny trip
  // count by optimizing for size, to minimize overheads.
  auto ExpectedTC = getSmallBestKnownTC(*SE, L);
  if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
    LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
                      << "This loop is worth vectorizing only if no scalar "
                      << "iteration overheads are incurred.");
    if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
      LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
    else {
      LLVM_DEBUG(dbgs() << "\n");
      SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
    }
  }

  // Check the function attributes to see if implicit floats are allowed.
  // FIXME: This check doesn't seem possibly correct -- what if the loop is
  // an integer loop and the vector instructions selected are purely integer
  // vector instructions?
  if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
    reportVectorizationFailure(
        "Can't vectorize when the NoImplicitFloat attribute is used",
        "loop not vectorized due to NoImplicitFloat attribute",
        "NoImplicitFloat", ORE, L);
    Hints.emitRemarkWithHints();
    return false;
  }

  // Check if the target supports potentially unsafe FP vectorization.
  // FIXME: Add a check for the type of safety issue (denormal, signaling)
  // for the target we're vectorizing for, to make sure none of the
  // additional fp-math flags can help.
  if (Hints.isPotentiallyUnsafe() &&
      TTI->isFPVectorizationPotentiallyUnsafe()) {
    reportVectorizationFailure(
        "Potentially unsafe FP op prevents vectorization",
        "loop not vectorized due to unsafe FP support.",
        "UnsafeFP", ORE, L);
    Hints.emitRemarkWithHints();
    return false;
  }

  bool AllowOrderedReductions;
  // If the flag is set, use that instead and override the TTI behaviour.
  if (ForceOrderedReductions.getNumOccurrences() > 0)
    AllowOrderedReductions = ForceOrderedReductions;
  else
    AllowOrderedReductions = TTI->enableOrderedReductions();
  if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) {
    ORE->emit([&]() {
      // NOTE(review): assumes getExactFPInst() is non-null whenever
      // canVectorizeFPMath() fails -- confirm against the legality analysis.
      auto *ExactFPMathInst = Requirements.getExactFPInst();
      return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
                                                 ExactFPMathInst->getDebugLoc(),
                                                 ExactFPMathInst->getParent())
             << "loop not vectorized: cannot prove it is safe to reorder "
                "floating-point operations";
    });
    LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
                         "reorder floating-point operations\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());

  // If an override option has been passed in for interleaved accesses, use it.
  if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
    UseInterleaved = EnableInterleavedMemAccesses;

  // Analyze interleaved memory accesses.
  if (UseInterleaved) {
    IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
  }

  // Use the cost model.
  LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
                                F, &Hints, IAI);
  CM.collectValuesToIgnore();
  CM.collectElementTypesForWidening();

  // Use the planner for vectorization.
  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints,
                               Requirements, ORE);

  // Get user vectorization factor and interleave count.
  ElementCount UserVF = Hints.getWidth();
  unsigned UserIC = Hints.getInterleave();

  // Plan how to best vectorize, return the best VF and its cost.
  Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);

  VectorizationFactor VF = VectorizationFactor::Disabled();
  unsigned IC = 1;

  if (MaybeVF) {
    VF = *MaybeVF;
    // Select the interleave count.
    IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue());
  }

  // Identify the diagnostic messages that should be produced.
  std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
  bool VectorizeLoop = true, InterleaveLoop = true;
  if (VF.Width.isScalar()) {
    LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
    VecDiagMsg = std::make_pair(
        "VectorizationNotBeneficial",
        "the cost-model indicates that vectorization is not beneficial");
    VectorizeLoop = false;
  }

  if (!MaybeVF && UserIC > 1) {
    // Tell the user interleaving was avoided up-front, despite being explicitly
    // requested.
    LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
                         "interleaving should be avoided up front\n");
    IntDiagMsg = std::make_pair(
        "InterleavingAvoided",
        "Ignoring UserIC, because interleaving was avoided up front");
    InterleaveLoop = false;
  } else if (IC == 1 && UserIC <= 1) {
    // Tell the user interleaving is not beneficial.
    LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
    IntDiagMsg = std::make_pair(
        "InterleavingNotBeneficial",
        "the cost-model indicates that interleaving is not beneficial");
    InterleaveLoop = false;
    if (UserIC == 1) {
      IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
      IntDiagMsg.second +=
          " and is explicitly disabled or interleave count is set to 1";
    }
  } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it is explicitly disabled.
    LLVM_DEBUG(
        dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
    IntDiagMsg = std::make_pair(
        "InterleavingBeneficialButDisabled",
        "the cost-model indicates that interleaving is beneficial "
        "but is explicitly disabled or interleave count is set to 1");
    InterleaveLoop = false;
  }

  // Override IC if user provided an interleave count.
  IC = UserIC > 0 ? UserIC : IC;

  // Emit diagnostic messages, if any.
  const char *VAPassName = Hints.vectorizeAnalysisPassName();
  if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
    ORE->emit([&]() {
      return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
    ORE->emit([&]() {
      return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
    return false;
  } else if (!VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
  } else if (VectorizeLoop && !InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
  } else if (VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
  }

  bool DisableRuntimeUnroll = false;
  MDNode *OrigLoopID = L->getLoopID();
  {
    // Optimistically generate runtime checks. Drop them if they turn out to not
    // be profitable. Limit the scope of Checks, so the cleanup happens
    // immediately after vector code generation is done.
    GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
                             F->getParent()->getDataLayout());
    if (!VF.Width.isScalar() || IC > 1)
      Checks.Create(L, *LVL.getLAI(), PSE.getUnionPredicate());

    using namespace ore;
    if (!VectorizeLoop) {
      assert(IC > 1 && "interleave count should not be 1 or 0");
      // If we decided that it is not legal to vectorize the loop, then
      // interleave it.
      InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
                                 &CM, BFI, PSI, Checks);

      VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
      LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT);

      ORE->emit([&]() {
        return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
                                  L->getHeader())
               << "interleaved loop (interleaved count: "
               << NV("InterleaveCount", IC) << ")";
      });
    } else {
      // If we decided that it is *legal* to vectorize the loop, then do it.

      // Consider vectorizing the epilogue too if it's profitable.
      VectorizationFactor EpilogueVF =
          CM.selectEpilogueVectorizationFactor(VF.Width, LVP);
      if (EpilogueVF.Width.isVector()) {

        // The first pass vectorizes the main loop and creates a scalar epilogue
        // to be vectorized by executing the plan (potentially with a different
        // factor) again shortly afterwards.
        EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1);
        EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
                                           EPI, &LVL, &CM, BFI, PSI, Checks);

        VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF);
        LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV,
                        DT);
        ++LoopsVectorized;

        // Re-normalize the loop shape between the two passes.
        simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
        formLCSSARecursively(*L, *DT, LI, SE);

        // Second pass vectorizes the epilogue and adjusts the control flow
        // edges from the first pass.
        EPI.MainLoopVF = EPI.EpilogueVF;
        EPI.MainLoopUF = EPI.EpilogueUF;
        EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
                                                 ORE, EPI, &LVL, &CM, BFI, PSI,
                                                 Checks);

        VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF);
        LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
                        DT);
        ++LoopsEpilogueVectorized;

        if (!MainILV.areSafetyChecksAdded())
          DisableRuntimeUnroll = true;
      } else {
        InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
                               &LVL, &CM, BFI, PSI, Checks);

        VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
        LVP.executePlan(VF.Width, IC, BestPlan, LB, DT);
        ++LoopsVectorized;

        // Add metadata to disable runtime unrolling a scalar loop when there
        // are no runtime checks about strides and memory. A scalar loop that is
        // rarely used is not worth unrolling.
        if (!LB.areSafetyChecksAdded())
          DisableRuntimeUnroll = true;
      }
      // Report the vectorization decision.
      ORE->emit([&]() {
        return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
                                  L->getHeader())
               << "vectorized loop (vectorization width: "
               << NV("VectorizationFactor", VF.Width)
               << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
      });
    }

    if (ORE->allowExtraAnalysis(LV_NAME))
      checkMixedPrecision(L, ORE);
  }

  // Propagate followup metadata to the remainder loop, or otherwise mark the
  // loop as done (and possibly unroll-disabled).
  Optional<MDNode *> RemainderLoopID =
      makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
                                      LLVMLoopVectorizeFollowupEpilogue});
  if (RemainderLoopID.hasValue()) {
    L->setLoopID(RemainderLoopID.getValue());
  } else {
    if (DisableRuntimeUnroll)
      AddRuntimeUnrollDisableMetaData(L);

    // Mark the loop as already vectorized to avoid vectorizing again.
    Hints.setAlreadyVectorized();
  }

  assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
  return true;
}

/// Legacy-style entry point: cache the analysis pointers on the pass object,
/// simplify all loops, collect the supported inner loops, and run processLoop
/// on each. Returns whether anything changed and whether the CFG changed.
LoopVectorizeResult LoopVectorizePass::runImpl(
    Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
    DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
    DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_,
    std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
    OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
  SE = &SE_;
  LI = &LI_;
  TTI = &TTI_;
  DT = &DT_;
  BFI = &BFI_;
  TLI = TLI_;
  AA = &AA_;
  AC = &AC_;
  GetLAA = &GetLAA_;
  DB = &DB_;
  ORE = &ORE_;
  PSI = PSI_;

  // Don't attempt if
  // 1. the target claims to have no vector registers, and
  // 2. interleaving won't help ILP.
  //
  // The second condition is necessary because, even if the target has no
  // vector registers, loop vectorization may still enable scalar
  // interleaving.
  if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
      TTI->getMaxInterleaveFactor(1) < 2)
    return LoopVectorizeResult(false, false);

  bool Changed = false, CFGChanged = false;

  // The vectorizer requires loops to be in simplified form.
  // Since simplification may add new inner loops, it has to run before the
  // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
  // vectorized.
  for (auto &L : *LI)
    Changed |= CFGChanged |=
        simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);

  // Build up a worklist of inner-loops to vectorize. This is necessary as
  // the act of vectorizing or partially unrolling a loop creates new loops
  // and can invalidate iterators across the loops.
  SmallVector<Loop *, 8> Worklist;

  for (Loop *L : *LI)
    collectSupportedLoops(*L, LI, ORE, Worklist);

  LoopsAnalyzed += Worklist.size();

  // Now walk the identified inner loops.
  while (!Worklist.empty()) {
    Loop *L = Worklist.pop_back_val();

    // For the inner loops we actually process, form LCSSA to simplify the
    // transform.
    Changed |= formLCSSARecursively(*L, *DT, LI, SE);

    Changed |= CFGChanged |= processLoop(L);
  }

  // Process each loop nest in the function.
  return LoopVectorizeResult(Changed, CFGChanged);
}

/// New pass manager entry point: gather the required analyses, run the
/// vectorizer via runImpl, and report which analyses are preserved.
PreservedAnalyses LoopVectorizePass::run(Function &F,
                                         FunctionAnalysisManager &AM) {
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &LI = AM.getResult<LoopAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &AA = AM.getResult<AAManager>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);

  // Lazily compute LoopAccessInfo per loop through the inner (loop) analysis
  // manager.
  auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
      [&](Loop &L) -> const LoopAccessInfo & {
    LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE,
                                      TLI, TTI, nullptr, nullptr, nullptr};
    return LAM.getResult<LoopAccessAnalysis>(L, AR);
  };
  auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
  // PSI may be null if the module-level analysis has not been cached.
  ProfileSummaryInfo *PSI =
      MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  LoopVectorizeResult Result =
      runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
  if (!Result.MadeAnyChange)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;

  // We currently do not preserve loopinfo/dominator analyses with outer loop
  // vectorization. Until this is addressed, mark these analyses as preserved
  // only for non-VPlan-native path.
  // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
  if (!EnableVPlanNativePath) {
    PA.preserve<LoopAnalysis>();
    PA.preserve<DominatorTreeAnalysis>();
  }

  if (Result.MadeCFGChange) {
    // Making CFG changes likely means a loop got vectorized. Indicate that
    // extra simplification passes should be run.
    // TODO: MadeCFGChanges is not a perfect proxy. Extra passes should only
    // be run if runtime checks have been added.
    AM.getResult<ShouldRunExtraVectorPasses>(F);
    PA.preserve<ShouldRunExtraVectorPasses>();
  } else {
    PA.preserveSet<CFGAnalyses>();
  }
  return PA;
}

/// Print this pass's textual pipeline representation, including the two
/// only-when-forced options.
void LoopVectorizePass::printPipeline(
    raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
  static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
      OS, MapClassName2PassName);

  OS << "<";
  OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
  OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
  OS << ">";
}