//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
//  Data for SIMD
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "LoopVectorizationPlanner.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanHCFGBuilder.h"
#include "VPlanPredicator.h"
#include "VPlanTransforms.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/InstructionCost.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
"llvm/Transforms/Vectorize/LoopVectorizationLegality.h" 144 #include <algorithm> 145 #include <cassert> 146 #include <cstdint> 147 #include <cstdlib> 148 #include <functional> 149 #include <iterator> 150 #include <limits> 151 #include <memory> 152 #include <string> 153 #include <tuple> 154 #include <utility> 155 156 using namespace llvm; 157 158 #define LV_NAME "loop-vectorize" 159 #define DEBUG_TYPE LV_NAME 160 161 #ifndef NDEBUG 162 const char VerboseDebug[] = DEBUG_TYPE "-verbose"; 163 #endif 164 165 /// @{ 166 /// Metadata attribute names 167 const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all"; 168 const char LLVMLoopVectorizeFollowupVectorized[] = 169 "llvm.loop.vectorize.followup_vectorized"; 170 const char LLVMLoopVectorizeFollowupEpilogue[] = 171 "llvm.loop.vectorize.followup_epilogue"; 172 /// @} 173 174 STATISTIC(LoopsVectorized, "Number of loops vectorized"); 175 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization"); 176 STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized"); 177 178 static cl::opt<bool> EnableEpilogueVectorization( 179 "enable-epilogue-vectorization", cl::init(true), cl::Hidden, 180 cl::desc("Enable vectorization of epilogue loops.")); 181 182 static cl::opt<unsigned> EpilogueVectorizationForceVF( 183 "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden, 184 cl::desc("When epilogue vectorization is enabled, and a value greater than " 185 "1 is specified, forces the given VF for all applicable epilogue " 186 "loops.")); 187 188 static cl::opt<unsigned> EpilogueVectorizationMinVF( 189 "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden, 190 cl::desc("Only loops with vectorization factor equal to or larger than " 191 "the specified value are considered for epilogue vectorization.")); 192 193 /// Loops with a known constant trip count below this number are vectorized only 194 /// if no scalar iteration overheads are incurred. 195 static cl::opt<unsigned> TinyTripCountVectorThreshold( 196 "vectorizer-min-trip-count", cl::init(16), cl::Hidden, 197 cl::desc("Loops with a constant trip count that is smaller than this " 198 "value are vectorized only if no scalar iteration overheads " 199 "are incurred.")); 200 201 static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold( 202 "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden, 203 cl::desc("The maximum allowed number of runtime memory checks with a " 204 "vectorize(enable) pragma.")); 205 206 // Option prefer-predicate-over-epilogue indicates that an epilogue is undesired, 207 // that predication is preferred, and this lists all options. I.e., the 208 // vectorizer will try to fold the tail-loop (epilogue) into the vector body 209 // and predicate the instructions accordingly. 
namespace PreferPredicateTy {
enum Option {
  ScalarEpilogue = 0,
  PredicateElseScalarEpilogue,
  PredicateOrDontVectorize
};
} // namespace PreferPredicateTy

static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
    "prefer-predicate-over-epilogue",
    cl::init(PreferPredicateTy::ScalarEpilogue),
    cl::Hidden,
    cl::desc("Tail-folding and predication preferences over creating a scalar "
             "epilogue loop."),
    cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
                          "scalar-epilogue",
                          "Don't tail-predicate loops, create scalar epilogue"),
               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
                          "predicate-else-scalar-epilogue",
                          "prefer tail-folding, create scalar epilogue if tail "
                          "folding fails."),
               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
                          "predicate-dont-vectorize",
                          "prefer tail-folding, don't attempt vectorization if "
                          "tail-folding fails.")));
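
// Illustrative usage (editorial addition, not from the original source): the
// fallback behaviour can be selected on the command line, e.g.
//   opt -passes=loop-vectorize \
//       -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue ...
// which asks the vectorizer to try tail-folding first and to create a scalar
// epilogue only if tail-folding fails.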
static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in the loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));

static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
             "below this number"));

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

static cl::opt<bool> ForceTargetSupportsScalableVectors(
    "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
    cl::desc(
        "Pretend that scalable vectors are supported, even if the target does "
        "not support them. This flag should only be used for testing."));

static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// Interleave small loops with scalar reductions.
static cl::opt<bool> InterleaveSmallLoopScalarReduction(
    "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
    cl::desc("Enable interleaving for loops with small iteration counts that "
             "contain scalar reductions to expose ILP."));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if-predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

static cl::opt<bool>
    PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
                           cl::Hidden,
                           cl::desc("Prefer in-loop vector reductions, "
                                    "overriding the target's preference."));

static cl::opt<bool> ForceOrderedReductions(
    "force-ordered-reductions", cl::init(false), cl::Hidden,
    cl::desc("Enable the vectorisation of loops with in-order (strict) "
             "FP reductions"));

static cl::opt<bool> PreferPredicatedReductionSelect(
    "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
    cl::desc(
        "Prefer predicating a reduction operation over an after loop select."));

cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));
// FIXME: Remove this switch once we have divergence analysis. Currently we
// assume divergent non-backedge branches when this switch is true.
cl::opt<bool> EnableVPlanPredication(
    "enable-vplan-predication", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path predicator with "
             "support for outer loop vectorization."));

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

cl::opt<bool> PrintVPlansInDotFormat(
    "vplan-print-in-dot-format", cl::init(false), cl::Hidden,
    cl::desc("Use dot format instead of plain text when dumping VPlans"));

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type.
static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
  // Determine if an array of N elements of type Ty is "bitcast compatible"
  // with a <N x Ty> vector.
  // This is only true if there is no padding between the array elements.
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}
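
// Illustrative example (editorial addition): on x86, x86_fp80 has a store size
// of 80 bits but is typically allocated with padding (96 or 128 bits,
// depending on the data layout), so hasIrregularType() returns true for it and
// an array of x86_fp80 is not bitcast-compatible with a vector of x86_fp80.
// For i32 the allocation size and the type size are both 32 bits, so the type
// is regular.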
/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

/// Returns "best known" trip count for the specified loop \p L as defined by
/// the following procedure:
///   1) Returns exact trip count if it is known.
///   2) Returns expected trip count according to profile data if any.
///   3) Returns upper bound estimate if it is known.
///   4) Returns None if all of the above failed.
static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
  // Check if exact trip count is known.
  if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
    return ExpectedTC;

  // Check if there is an expected trip count available from profile data.
  if (LoopVectorizeWithBlockFrequency)
    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
      return EstimatedTC;

  // Check if upper bound estimate is known.
  if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
    return ExpectedTC;

  return None;
}
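
// Illustrative example (editorial addition): for a loop whose latch branch
// carries profile metadata with backedge/exit weights of roughly 99:1,
// getLoopEstimatedTripCount() reports a trip count of about 100, which is what
// step (2) above returns when SCEV cannot compute an exact count.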
// Forward declare GeneratedRTChecks.
class GeneratedRTChecks;

namespace llvm {

AnalysisKey ShouldRunExtraVectorPasses::Key;

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found for a given vectorization factor.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                      ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
        PSI(PSI), RTChecks(RTChecks) {
    // Query this against the original loop and save it here because the
    // profile of the original loop header may change as the transformation
    // happens.
    OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
        OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
  }

  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop that will contain vectorized instructions later
  /// on, while the old loop will be used as the scalar remainder. Control flow
  /// is generated around the vectorized (and scalar epilogue) loops consisting
  /// of various checks and bypasses. Return the pre-header block of the new
  /// loop.
  /// In the case of epilogue vectorization, this function is overridden to
  /// handle the more complex control flow around the loops.
  virtual BasicBlock *createVectorizedLoopSkeleton();

  /// Widen a single call instruction within the innermost loop.
  void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
                            VPTransformState &State);

  /// Fix the vectorized code, taking care of header PHIs, live-outs, and more.
  void fixVectorizedLoop(VPTransformState &State);

  // Return true if any runtime check is added.
  bool areSafetyChecksAdded() { return AddedSafetyChecks; }

  /// A type for vectorized values in the new loop. Each value from the
  /// original loop, when vectorized, is represented by UF vector values in the
  /// new unrolled loop, where UF is the unroll factor.
  using VectorParts = SmallVector<Value *, 2>;

  /// Vectorize a single first-order recurrence or pointer induction PHINode in
  /// a block. This method handles the induction variable canonicalization. It
  /// supports both VF = 1 for unrolled loops and arbitrary length vectors.
  void widenPHIInstruction(Instruction *PN, VPWidenPHIRecipe *PhiR,
                           VPTransformState &State);

  /// A helper function to scalarize a single Instruction in the innermost
  /// loop. Generates a scalar instance for the lane and part given by \p
  /// Instance. Uses the VPValue operands from \p RepRecipe instead of \p
  /// Instr's operands.
  void scalarizeInstruction(Instruction *Instr, VPReplicateRecipe *RepRecipe,
                            const VPIteration &Instance, bool IfPredicateInstr,
                            VPTransformState &State);

  /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
  /// is provided, the integer induction variable will first be truncated to
  /// the corresponding type.
  void widenIntOrFpInduction(PHINode *IV, const InductionDescriptor &ID,
                             Value *Start, TruncInst *Trunc, VPValue *Def,
                             VPTransformState &State);

  /// Construct the vector value of a scalarized value \p V one lane at a time.
  void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance,
                                 VPTransformState &State);

  /// Try to vectorize interleaved access group \p Group with the base address
  /// given in \p Addr, optionally masking the vector operations if \p
  /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
  /// values in the vectorized loop.
  void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
                                ArrayRef<VPValue *> VPDefs,
                                VPTransformState &State, VPValue *Addr,
                                ArrayRef<VPValue *> StoredValues,
                                VPValue *BlockInMask = nullptr);

  /// Set the debug location in the builder using the debug location in \p V.
  /// If \p CustomBuilder is None, the class member's Builder is used.
  void setDebugLocFromInst(const Value *V,
                           Optional<IRBuilder<> *> CustomBuilder = None);

  /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
  void fixNonInductionPHIs(VPTransformState &State);

  /// Returns true if the reordering of FP operations is not allowed, but we
  /// are able to vectorize with strict in-order reductions for the given
  /// RdxDesc.
  bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc);

  /// Create a broadcast instruction. This method generates a broadcast
  /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, ...
  /// this is needed because each iteration in the loop corresponds to a SIMD
  /// element.
  virtual Value *getBroadcastInstrs(Value *V);

  /// Add metadata from one instruction to another.
  ///
  /// This includes both the original MDs from \p From and additional ones
  /// (\see addNewMetadata). Use this for *newly created* instructions in the
  /// vector loop.
  void addMetadata(Instruction *To, Instruction *From);

  /// Similar to the previous function but it adds the metadata to a
  /// vector of instructions.
  void addMetadata(ArrayRef<Value *> To, Instruction *From);

protected:
  friend class LoopVectorizationPlanner;
  /// A small list of PHINodes.
  using PhiVector = SmallVector<PHINode *, 4>;

  /// A type for scalarized values in the new loop. Each value from the
  /// original loop, when scalarized, is represented by UF x VF scalar values
  /// in the new unrolled loop, where UF is the unroll factor and VF is the
  /// vectorization factor.
  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;

  /// Set up the values of the IVs correctly when exiting the vector loop.
  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                    Value *CountRoundDown, Value *EndValue,
                    BasicBlock *MiddleBlock);

  /// Create a new induction variable inside L.
  PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
                                   Value *Step, Instruction *DL);

  /// Handle all cross-iteration phis in the header.
  void fixCrossIterationPHIs(VPTransformState &State);

  /// Create the exit value of first order recurrences in the middle block and
  /// update their users.
  void fixFirstOrderRecurrence(VPWidenPHIRecipe *PhiR, VPTransformState &State);

  /// Create code for the loop exit value of the reduction.
  void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State);

  /// Clear NSW/NUW flags from reduction instructions if necessary.
  void clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc,
                               VPTransformState &State);

  /// Fixup the LCSSA phi nodes in the unique exit block. This simply
  /// means we need to add the appropriate incoming value from the middle
  /// block as exiting edges from the scalar epilogue loop (if present) are
  /// already in place, and we exit the vector loop exclusively to the middle
  /// block.
  void fixLCSSAPHIs(VPTransformState &State);

  /// Iteratively sink the scalarized operands of a predicated instruction into
  /// the block that was created for it.
  void sinkScalarOperands(Instruction *PredInst);

  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
  /// represented as.
  void truncateToMinimalBitwidths(VPTransformState &State);

  /// This function adds
  /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
  /// to each vector element of Val. The sequence starts at \p StartIdx.
  /// \p Opcode is relevant for FP induction variables.
  virtual Value *
  getStepVector(Value *Val, Value *StartIdx, Value *Step,
                Instruction::BinaryOps Opcode = Instruction::BinaryOpsEnd);

  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
  /// variable on which to base the steps, \p Step is the size of the step, and
  /// \p EntryVal is the value from the original loop that maps to the steps.
  /// Note that \p EntryVal doesn't have to be an induction variable - it
  /// can also be a truncate instruction.
  void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
                        const InductionDescriptor &ID, VPValue *Def,
                        VPTransformState &State);
  /// Create a vector induction phi node based on an existing scalar one. \p
  /// EntryVal is the value from the original loop that maps to the vector phi
  /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
  /// truncate instruction, instead of widening the original IV, we widen a
  /// version of the IV truncated to \p EntryVal's type.
  void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
                                       Value *Step, Value *Start,
                                       Instruction *EntryVal, VPValue *Def,
                                       VPTransformState &State);

  /// Returns true if an instruction \p I should be scalarized instead of
  /// vectorized for the chosen vectorization factor.
  bool shouldScalarizeInstruction(Instruction *I) const;

  /// Returns true if we should generate a scalar version of \p IV.
  bool needsScalarInduction(Instruction *IV) const;

  /// Generate a shuffle sequence that will reverse the vector Vec.
  virtual Value *reverseVector(Value *Vec);

  /// Returns (and creates if needed) the original loop trip count.
  Value *getOrCreateTripCount(Loop *NewLoop);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(Loop *NewLoop);

  /// Returns a bitcasted value to the requested vector type.
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                const DataLayout &DL);

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct. Returns the block containing the checks or
  /// nullptr if no checks have been added.
  BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  /// Returns the block containing the checks or nullptr if no checks have been
  /// added.
  BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);

  /// Compute the transformed value of Index at offset StartValue using step
  /// StepValue.
  /// For integer induction, returns StartValue + Index * StepValue.
  /// For pointer induction, returns StartValue[Index * StepValue].
  /// FIXME: The newly created binary instructions should contain nsw/nuw
  /// flags, which can be found from the original scalar operations.
  Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
                              const DataLayout &DL,
                              const InductionDescriptor &ID,
                              BasicBlock *VectorHeader) const;

  /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
  /// vector loop preheader, middle block and scalar preheader. Also
  /// allocate a loop object for the new vector loop and return it.
  Loop *createVectorLoopSkeleton(StringRef Prefix);

  /// Create new phi nodes for the induction variables to resume iteration
  /// count in the scalar epilogue, from where the vectorized loop left off
  /// (given by \p VectorTripCount).
  /// In cases where the loop skeleton is more complicated (eg. epilogue
  /// vectorization) and the resume values can come from an additional bypass
  /// block, the \p AdditionalBypass pair provides information about the bypass
  /// block and the end value on the edge from bypass to this loop.
  void createInductionResumeValues(
      Loop *L, Value *VectorTripCount,
      std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
  /// Complete the loop skeleton by adding debug MDs, creating appropriate
  /// conditional branches in the middle block, preparing the builder and
  /// running the verifier. Take in the vector loop \p L as argument, and
  /// return the preheader of the completed vector loop.
  BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);

  /// Add additional metadata to \p To that was not present on \p Orig.
  ///
  /// Currently this is used to add the noalias annotations based on the
  /// inserted memchecks. Use this for instructions that are *cloned* into the
  /// vector loop.
  void addNewMetadata(Instruction *To, const Instruction *Orig);

  /// Collect poison-generating recipes that may generate a poison value that
  /// is used after vectorization, even when their operands are not poison.
  /// Those recipes meet the following conditions:
  /// * Contribute to the address computation of a recipe generating a widen
  ///   memory load/store (VPWidenMemoryInstructionRecipe or
  ///   VPInterleaveRecipe).
  /// * Such a widen memory load/store has at least one underlying Instruction
  ///   that is in a basic block that needs predication and after vectorization
  ///   the generated instruction won't be predicated.
  void collectPoisonGeneratingRecipes(VPTransformState &State);

  /// Allow subclasses to override and print debug traces before/after vplan
  /// execution, when trace information is requested.
  virtual void printDebugTracesAtStart(){};
  virtual void printDebugTracesAtEnd(){};

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Alias Analysis.
  AAResults *AA;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// LoopVersioning. It's only set up (non-null) if memchecks were
  /// used.
  ///
  /// This is currently only used to add no-alias metadata based on the
  /// memchecks. The actual versioning is performed manually.
  std::unique_ptr<LoopVersioning> LVer;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  ElementCount VF;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use.
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar loops.
  BasicBlock *LoopMiddleBlock;

  /// The unique ExitBlock of the scalar loop if one exists. Note that
  /// there can be multiple exiting edges reaching this block.
  BasicBlock *LoopExitBlock;

  /// The vector loop body.
  BasicBlock *LoopVectorBody;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;
  /// The new Induction variable which was added to the new block.
  PHINode *Induction = nullptr;

  /// The induction variable of the old basic block.
  PHINode *OldInduction = nullptr;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
  // fixed up at the end of vector code generation.
  SmallVector<PHINode *, 8> OrigPHIsToFix;

  /// BFI and PSI are used to check for profile guided size optimizations.
  BlockFrequencyInfo *BFI;
  ProfileSummaryInfo *PSI;

  // Whether this loop should be optimized for size based on profile guided
  // size optimizations.
  bool OptForSizeBasedOnProfile;

  /// Structure to hold information about generated runtime checks, responsible
  /// for cleaning the checks, if vectorization turns out unprofitable.
  GeneratedRTChecks &RTChecks;
};

class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                    ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            ElementCount::getFixed(1), UnrollFactor, LVL, CM,
                            BFI, PSI, Check) {}

private:
  Value *getBroadcastInstrs(Value *V) override;
  Value *getStepVector(
      Value *Val, Value *StartIdx, Value *Step,
      Instruction::BinaryOps Opcode = Instruction::BinaryOpsEnd) override;
  Value *reverseVector(Value *Vec) override;
};
/// Encapsulate information regarding vectorization of a loop and its epilogue.
/// This information is meant to be updated and used across two stages of
/// epilogue vectorization.
struct EpilogueLoopVectorizationInfo {
  ElementCount MainLoopVF = ElementCount::getFixed(0);
  unsigned MainLoopUF = 0;
  ElementCount EpilogueVF = ElementCount::getFixed(0);
  unsigned EpilogueUF = 0;
  BasicBlock *MainLoopIterationCountCheck = nullptr;
  BasicBlock *EpilogueIterationCountCheck = nullptr;
  BasicBlock *SCEVSafetyCheck = nullptr;
  BasicBlock *MemSafetyCheck = nullptr;
  Value *TripCount = nullptr;
  Value *VectorTripCount = nullptr;

  EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF,
                                ElementCount EVF, unsigned EUF)
      : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) {
    assert(EUF == 1 &&
           "A high UF for the epilogue loop is likely not beneficial.");
  }
};
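
// Hypothetical usage sketch (editorial addition): a caller planning to
// vectorize the main loop with VF=16/UF=2 and the epilogue with VF=8 would
// construct
//   EpilogueLoopVectorizationInfo EPI(ElementCount::getFixed(16), 2,
//                                     ElementCount::getFixed(8), 1);
// the remaining members are filled in while the skeleton is being created.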
/// An extension of the inner loop vectorizer that creates a skeleton for a
/// vectorized loop that has its epilogue (residual) also vectorized.
/// The idea is to run the vplan on a given loop twice, first to setup the
/// skeleton and vectorize the main loop, and second to complete the skeleton
/// from the first step and vectorize the epilogue. This is achieved by
/// deriving two concrete strategy classes from this base class and invoking
/// them in succession from the loop vectorizer planner.
class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
public:
  InnerLoopAndEpilogueVectorizer(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI,
                            Checks),
        EPI(EPI) {}

  // Override this function to handle the more complex control flow around the
  // three loops.
  BasicBlock *createVectorizedLoopSkeleton() final override {
    return createEpilogueVectorizedLoopSkeleton();
  }

  /// The interface for creating a vectorized skeleton using one of two
  /// different strategies, each corresponding to one execution of the vplan
  /// as described above.
  virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0;

  /// Holds and updates state information required to vectorize the main loop
  /// and its epilogue in two separate passes. This setup helps us avoid
  /// regenerating and recomputing runtime safety checks. It also helps us to
  /// shorten the iteration-count-check path length for the cases where the
  /// iteration count of the loop is so small that the main vector loop is
  /// completely skipped.
  EpilogueLoopVectorizationInfo &EPI;
};

/// A specialized derived class of inner loop vectorizer that performs
/// vectorization of *main* loops in the process of vectorizing loops and their
/// epilogues.
class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerMainLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Check)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Check) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *main loop* strategy (ie the first pass of vplan execution).
  BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check once for the main loop (when \p
  /// ForEpilogue is false) and once for the epilogue loop (when \p
  /// ForEpilogue is true).
  BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass,
                                             bool ForEpilogue);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};

// A specialized derived class of inner loop vectorizer that performs
// vectorization of *epilogue* loops in the process of vectorizing loops and
// their epilogues.
class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerEpilogueLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Checks) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *epilogue loop* strategy (ie the second pass of vplan execution).
  BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check after the main vector loop has
  /// finished to see if there are any iterations left to execute by either
  /// the vector epilogue or the scalar epilogue.
  BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L,
                                                      BasicBlock *Bypass,
                                                      BasicBlock *Insert);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};
} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its
/// operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return I;

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I;

  for (Use &Op : I->operands()) {
    if (Instruction *OpInst = dyn_cast<Instruction>(Op))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }

  return I;
}
void InnerLoopVectorizer::setDebugLocFromInst(
    const Value *V, Optional<IRBuilder<> *> CustomBuilder) {
  IRBuilder<> *B = (CustomBuilder == None) ? &Builder : *CustomBuilder;
  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(V)) {
    const DILocation *DIL = Inst->getDebugLoc();

    // When an FSDiscriminator is enabled, we don't need to add the multiply
    // factors to the discriminators.
    if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
        !isa<DbgInfoIntrinsic>(Inst) && !EnableFSDiscriminator) {
      // FIXME: For scalable vectors, assume vscale=1.
      auto NewDIL =
          DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
      if (NewDIL)
        B->SetCurrentDebugLocation(NewDIL.getValue());
      else
        LLVM_DEBUG(dbgs() << "Failed to create new discriminator: "
                          << DIL->getFilename() << " Line: " << DIL->getLine());
    } else
      B->SetCurrentDebugLocation(DIL);
  } else
    B->SetCurrentDebugLocation(DebugLoc());
}

/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
/// is passed, the message relates to that particular instruction.
#ifndef NDEBUG
static void debugVectorizationMessage(const StringRef Prefix,
                                      const StringRef DebugMsg,
                                      Instruction *I) {
  dbgs() << "LV: " << Prefix << DebugMsg;
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}
#endif

/// Create an analysis remark that explains why vectorization failed
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
/// RemarkName is the identifier for the remark. If \p I is passed it is an
/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
/// the location of the remark. \return the remark object that can be
/// streamed to.
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
                                                   StringRef RemarkName,
                                                   Loop *TheLoop,
                                                   Instruction *I) {
  Value *CodeRegion = TheLoop->getHeader();
  DebugLoc DL = TheLoop->getStartLoc();

  if (I) {
    CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, fall back to
    // using the loop's.
    if (I->getDebugLoc())
      DL = I->getDebugLoc();
  }

  return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
}

/// Return a value for Step multiplied by VF.
static Value *createStepForVF(IRBuilder<> &B, Type *Ty, ElementCount VF,
                              int64_t Step) {
  assert(Ty->isIntegerTy() && "Expected an integer step");
  Constant *StepVal = ConstantInt::get(Ty, Step * VF.getKnownMinValue());
  return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
}
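
// Illustrative example (editorial addition): with Ty == i64, Step == 2 and a
// fixed VF of 4, createStepForVF() returns the constant i64 8; with a scalable
// VF whose minimum element count is 4, it instead returns 8 * vscale, emitted
// via IRBuilder::CreateVScale.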
namespace llvm {

/// Return the runtime value for VF.
Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF) {
  Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue());
  return VF.isScalable() ? B.CreateVScale(EC) : EC;
}

static Value *getRuntimeVFAsFloat(IRBuilder<> &B, Type *FTy, ElementCount VF) {
  assert(FTy->isFloatingPointTy() && "Expected floating point type!");
  Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits());
  Value *RuntimeVF = getRuntimeVF(B, IntTy, VF);
  return B.CreateUIToFP(RuntimeVF, FTy);
}

void reportVectorizationFailure(const StringRef DebugMsg,
                                const StringRef OREMsg, const StringRef ORETag,
                                OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                                Instruction *I) {
  LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(
      createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
      << "loop not vectorized: " << OREMsg);
}

void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
                             OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                             Instruction *I) {
  LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(
      createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
      << Msg);
}

} // end namespace llvm
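
// Editorial note: both reporting helpers above print a debug trace (visible
// under -debug-only=loop-vectorize) and emit an analysis remark through the
// OptimizationRemarkEmitter; reportVectorizationFailure() additionally
// prefixes the remark with "loop not vectorized: ", so a typical missed
// remark reads "loop not vectorized: <OREMsg>" at the loop's (or the
// offending instruction's) location.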
#ifndef NDEBUG
/// \return string containing a file name and a line # for the given loop.
static std::string getDebugLocString(const Loop *L) {
  std::string Result;
  if (L) {
    raw_string_ostream OS(Result);
    if (const DebugLoc LoopDbgLoc = L->getStartLoc())
      LoopDbgLoc.print(OS);
    else
      // Just print the module name.
      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
    OS.flush();
  }
  return Result;
}
#endif

void InnerLoopVectorizer::addNewMetadata(Instruction *To,
                                         const Instruction *Orig) {
  // If the loop was versioned with memchecks, add the corresponding no-alias
  // metadata.
  if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
    LVer->annotateInstWithNoAlias(To, Orig);
}

void InnerLoopVectorizer::collectPoisonGeneratingRecipes(
    VPTransformState &State) {

  // Collect recipes in the backward slice of `Root` that may generate a poison
  // value that is used after vectorization.
  SmallPtrSet<VPRecipeBase *, 16> Visited;
  auto collectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
    SmallVector<VPRecipeBase *, 16> Worklist;
    Worklist.push_back(Root);

    // Traverse the backward slice of Root through its use-def chain.
    while (!Worklist.empty()) {
      VPRecipeBase *CurRec = Worklist.back();
      Worklist.pop_back();

      if (!Visited.insert(CurRec).second)
        continue;

      // Prune search if we find another recipe generating a widen memory
      // instruction. Widen memory instructions involved in address computation
      // will lead to gather/scatter instructions, which don't need to be
      // handled.
      if (isa<VPWidenMemoryInstructionRecipe>(CurRec) ||
          isa<VPInterleaveRecipe>(CurRec))
        continue;

      // This recipe contributes to the address computation of a widen
      // load/store. Collect recipe if its underlying instruction has
      // poison-generating flags.
      Instruction *Instr = CurRec->getUnderlyingInstr();
      if (Instr && Instr->hasPoisonGeneratingFlags())
        State.MayGeneratePoisonRecipes.insert(CurRec);

      // Add new definitions to the worklist.
      for (VPValue *operand : CurRec->operands())
        if (VPDef *OpDef = operand->getDef())
          Worklist.push_back(cast<VPRecipeBase>(OpDef));
    }
  });

  // Traverse all the recipes in the VPlan and collect the poison-generating
  // recipes in the backward slice starting at the address of a
  // VPWidenMemoryInstructionRecipe or VPInterleaveRecipe.
  auto Iter = depth_first(
      VPBlockRecursiveTraversalWrapper<VPBlockBase *>(State.Plan->getEntry()));
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
    for (VPRecipeBase &Recipe : *VPBB) {
      if (auto *WidenRec = dyn_cast<VPWidenMemoryInstructionRecipe>(&Recipe)) {
        Instruction *UnderlyingInstr = WidenRec->getUnderlyingInstr();
        VPDef *AddrDef = WidenRec->getAddr()->getDef();
        if (AddrDef && WidenRec->isConsecutive() && UnderlyingInstr &&
            Legal->blockNeedsPredication(UnderlyingInstr->getParent()))
          collectPoisonGeneratingInstrsInBackwardSlice(
              cast<VPRecipeBase>(AddrDef));
      } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
        VPDef *AddrDef = InterleaveRec->getAddr()->getDef();
        if (AddrDef) {
          // Check if any member of the interleave group needs predication.
          const InterleaveGroup<Instruction> *InterGroup =
              InterleaveRec->getInterleaveGroup();
          bool NeedPredication = false;
          for (int I = 0, NumMembers = InterGroup->getNumMembers();
               I < NumMembers; ++I) {
            Instruction *Member = InterGroup->getMember(I);
            if (Member)
              NeedPredication |=
                  Legal->blockNeedsPredication(Member->getParent());
          }

          if (NeedPredication)
            collectPoisonGeneratingInstrsInBackwardSlice(
                cast<VPRecipeBase>(AddrDef));
        }
      }
    }
  }
}

void InnerLoopVectorizer::addMetadata(Instruction *To,
                                      Instruction *From) {
  propagateMetadata(To, From);
  addNewMetadata(To, From);
}

void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
                                      Instruction *From) {
  for (Value *V : To) {
    if (Instruction *I = dyn_cast<Instruction>(V))
      addMetadata(I, From);
  }
}

namespace llvm {

// Loop vectorization cost-model hints how the scalar epilogue loop should be
// lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorisation with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate,

  // Directive indicating we must either tail fold or not vectorize
  CM_ScalarEpilogueNotAllowedUsePredicate
};

/// ElementCountComparator creates a total ordering for ElementCount
/// for the purposes of using it in a set structure.
struct ElementCountComparator {
  bool operator()(const ElementCount &LHS, const ElementCount &RHS) const {
    return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) <
           std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue());
  }
};
using ElementCountSet = SmallSet<ElementCount, 16, ElementCountComparator>;
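
// Illustrative example (editorial addition): because the comparator orders by
// (isScalable(), knownMinValue), all fixed VFs sort before all scalable ones
// in an ElementCountSet, e.g. 2 < 4 < 8 < vscale x 2 < vscale x 4.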
1333 /// This function takes cost-based decisions for Load/Store instructions 1334 /// and collects them in a map. This decisions map is used for building 1335 /// the lists of loop-uniform and loop-scalar instructions. 1336 /// The calculated cost is saved with widening decision in order to 1337 /// avoid redundant calculations. 1338 void setCostBasedWideningDecision(ElementCount VF); 1339 1340 /// A struct that represents some properties of the register usage 1341 /// of a loop. 1342 struct RegisterUsage { 1343 /// Holds the number of loop invariant values that are used in the loop. 1344 /// The key is ClassID of target-provided register class. 1345 SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs; 1346 /// Holds the maximum number of concurrent live intervals in the loop. 1347 /// The key is ClassID of target-provided register class. 1348 SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers; 1349 }; 1350 1351 /// \return Returns information about the register usages of the loop for the 1352 /// given vectorization factors. 1353 SmallVector<RegisterUsage, 8> 1354 calculateRegisterUsage(ArrayRef<ElementCount> VFs); 1355 1356 /// Collect values we want to ignore in the cost model. 1357 void collectValuesToIgnore(); 1358 1359 /// Collect all element types in the loop for which widening is needed. 1360 void collectElementTypesForWidening(); 1361 1362 /// Split reductions into those that happen in the loop, and those that happen 1363 /// outside. In loop reductions are collected into InLoopReductionChains. 1364 void collectInLoopReductions(); 1365 1366 /// Returns true if we should use strict in-order reductions for the given 1367 /// RdxDesc. This is true if the -enable-strict-reductions flag is passed, 1368 /// the IsOrdered flag of RdxDesc is set and we do not allow reordering 1369 /// of FP operations. 1370 bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) { 1371 return !Hints->allowReordering() && RdxDesc.isOrdered(); 1372 } 1373 1374 /// \returns The smallest bitwidth each instruction can be represented with. 1375 /// The vector equivalents of these instructions should be truncated to this 1376 /// type. 1377 const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const { 1378 return MinBWs; 1379 } 1380 1381 /// \returns True if it is more profitable to scalarize instruction \p I for 1382 /// vectorization factor \p VF. 1383 bool isProfitableToScalarize(Instruction *I, ElementCount VF) const { 1384 assert(VF.isVector() && 1385 "Profitable to scalarize relevant only for VF > 1."); 1386 1387 // Cost model is not run in the VPlan-native path - return conservative 1388 // result until this changes. 1389 if (EnableVPlanNativePath) 1390 return false; 1391 1392 auto Scalars = InstsToScalarize.find(VF); 1393 assert(Scalars != InstsToScalarize.end() && 1394 "VF not yet analyzed for scalarization profitability"); 1395 return Scalars->second.find(I) != Scalars->second.end(); 1396 } 1397 1398 /// Returns true if \p I is known to be uniform after vectorization. 1399 bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const { 1400 if (VF.isScalar()) 1401 return true; 1402 1403 // Cost model is not run in the VPlan-native path - return conservative 1404 // result until this changes. 
1405 if (EnableVPlanNativePath)
1406 return false;
1407
1408 auto UniformsPerVF = Uniforms.find(VF);
1409 assert(UniformsPerVF != Uniforms.end() &&
1410 "VF not yet analyzed for uniformity");
1411 return UniformsPerVF->second.count(I);
1412 }
1413
1414 /// Returns true if \p I is known to be scalar after vectorization.
1415 bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
1416 if (VF.isScalar())
1417 return true;
1418
1419 // Cost model is not run in the VPlan-native path - return conservative
1420 // result until this changes.
1421 if (EnableVPlanNativePath)
1422 return false;
1423
1424 auto ScalarsPerVF = Scalars.find(VF);
1425 assert(ScalarsPerVF != Scalars.end() &&
1426 "Scalar values are not calculated for VF");
1427 return ScalarsPerVF->second.count(I);
1428 }
1429
1430 /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1431 /// for vectorization factor \p VF.
1432 bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
1433 return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
1434 !isProfitableToScalarize(I, VF) &&
1435 !isScalarAfterVectorization(I, VF);
1436 }
1437
1438 /// Decision that was taken during cost calculation for a memory instruction.
1439 enum InstWidening {
1440 CM_Unknown,
1441 CM_Widen, // For consecutive accesses with stride +1.
1442 CM_Widen_Reverse, // For consecutive accesses with stride -1.
1443 CM_Interleave,
1444 CM_GatherScatter,
1445 CM_Scalarize
1446 };
1447
1448 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1449 /// instruction \p I and vector width \p VF.
1450 void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
1451 InstructionCost Cost) {
1452 assert(VF.isVector() && "Expected VF >=2");
1453 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1454 }
1455
1456 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1457 /// interleaving group \p Grp and vector width \p VF.
1458 void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1459 ElementCount VF, InstWidening W,
1460 InstructionCost Cost) {
1461 assert(VF.isVector() && "Expected VF >=2");
1462 // Broadcast this decision to all instructions inside the group.
1463 // But the cost will be assigned to one instruction only.
1464 for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1465 if (auto *I = Grp->getMember(i)) {
1466 if (Grp->getInsertPos() == I)
1467 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1468 else
1469 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1470 }
1471 }
1472 }
1473
1474 /// Return the cost model decision for the given instruction \p I and vector
1475 /// width \p VF. Return CM_Unknown if this instruction did not pass
1476 /// through the cost modeling.
1477 InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
1478 assert(VF.isVector() && "Expected VF to be a vector VF");
1479 // Cost model is not run in the VPlan-native path - return conservative
1480 // result until this changes.
1481 if (EnableVPlanNativePath)
1482 return CM_GatherScatter;
1483
1484 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1485 auto Itr = WideningDecisions.find(InstOnVF);
1486 if (Itr == WideningDecisions.end())
1487 return CM_Unknown;
1488 return Itr->second.first;
1489 }
1490
1491 /// Return the vectorization cost for the given instruction \p I and vector
1492 /// width \p VF.
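/// Note that for an interleave group the cost is recorded only on the group's
/// insert position (see setWideningDecision above); e.g. for a factor-3 group
/// the two remaining members carry a recorded cost of 0.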
1493 InstructionCost getWideningCost(Instruction *I, ElementCount VF) { 1494 assert(VF.isVector() && "Expected VF >=2"); 1495 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF); 1496 assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() && 1497 "The cost is not calculated"); 1498 return WideningDecisions[InstOnVF].second; 1499 } 1500 1501 /// Return True if instruction \p I is an optimizable truncate whose operand 1502 /// is an induction variable. Such a truncate will be removed by adding a new 1503 /// induction variable with the destination type. 1504 bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) { 1505 // If the instruction is not a truncate, return false. 1506 auto *Trunc = dyn_cast<TruncInst>(I); 1507 if (!Trunc) 1508 return false; 1509 1510 // Get the source and destination types of the truncate. 1511 Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF); 1512 Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF); 1513 1514 // If the truncate is free for the given types, return false. Replacing a 1515 // free truncate with an induction variable would add an induction variable 1516 // update instruction to each iteration of the loop. We exclude from this 1517 // check the primary induction variable since it will need an update 1518 // instruction regardless. 1519 Value *Op = Trunc->getOperand(0); 1520 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy)) 1521 return false; 1522 1523 // If the truncated value is not an induction variable, return false. 1524 return Legal->isInductionPhi(Op); 1525 } 1526 1527 /// Collects the instructions to scalarize for each predicated instruction in 1528 /// the loop. 1529 void collectInstsToScalarize(ElementCount VF); 1530 1531 /// Collect Uniform and Scalar values for the given \p VF. 1532 /// The sets depend on CM decision for Load/Store instructions 1533 /// that may be vectorized as interleave, gather-scatter or scalarized. 1534 void collectUniformsAndScalars(ElementCount VF) { 1535 // Do the analysis once. 1536 if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end()) 1537 return; 1538 setCostBasedWideningDecision(VF); 1539 collectLoopUniforms(VF); 1540 collectLoopScalars(VF); 1541 } 1542 1543 /// Returns true if the target machine supports masked store operation 1544 /// for the given \p DataType and kind of access to \p Ptr. 1545 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const { 1546 return Legal->isConsecutivePtr(DataType, Ptr) && 1547 TTI.isLegalMaskedStore(DataType, Alignment); 1548 } 1549 1550 /// Returns true if the target machine supports masked load operation 1551 /// for the given \p DataType and kind of access to \p Ptr. 1552 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const { 1553 return Legal->isConsecutivePtr(DataType, Ptr) && 1554 TTI.isLegalMaskedLoad(DataType, Alignment); 1555 } 1556 1557 /// Returns true if the target machine can represent \p V as a masked gather 1558 /// or scatter operation. 1559 bool isLegalGatherOrScatter(Value *V) { 1560 bool LI = isa<LoadInst>(V); 1561 bool SI = isa<StoreInst>(V); 1562 if (!LI && !SI) 1563 return false; 1564 auto *Ty = getLoadStoreType(V); 1565 Align Align = getLoadStoreAlignment(V); 1566 return (LI && TTI.isLegalMaskedGather(Ty, Align)) || 1567 (SI && TTI.isLegalMaskedScatter(Ty, Align)); 1568 } 1569 1570 /// Returns true if the target machine supports all of the reduction 1571 /// variables found for the given VF. 
1572 bool canVectorizeReductions(ElementCount VF) const {
1573 return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
1574 const RecurrenceDescriptor &RdxDesc = Reduction.second;
1575 return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1576 }));
1577 }
1578
1579 /// Returns true if \p I is an instruction that will be scalarized with
1580 /// predication. Such instructions include conditional stores and
1581 /// instructions that may divide by zero.
1582 /// If a non-zero VF has been calculated, we check if I will be scalarized
1583 /// with predication for that VF.
1584 bool isScalarWithPredication(Instruction *I) const;
1585
1586 // Returns true if \p I is an instruction that will be predicated either
1587 // through scalar predication or masked load/store or masked gather/scatter.
1588 // Superset of instructions that return true for isScalarWithPredication.
1589 bool isPredicatedInst(Instruction *I, bool IsKnownUniform = false) {
1590 // When we know the load is uniform and the original scalar loop was not
1591 // predicated, we don't need to mark it as a predicated instruction. Any
1592 // vectorized blocks created when tail-folding are artificial blocks we have
1593 // introduced ourselves, and we know there is always at least one active lane.
1594 // That's why we call Legal->blockNeedsPredication here, because it doesn't
1595 // query tail-folding.
1596 if (IsKnownUniform && isa<LoadInst>(I) &&
1597 !Legal->blockNeedsPredication(I->getParent()))
1598 return false;
1599 if (!blockNeedsPredicationForAnyReason(I->getParent()))
1600 return false;
1601 // Loads and stores that need some form of masked operation are predicated
1602 // instructions.
1603 if (isa<LoadInst>(I) || isa<StoreInst>(I))
1604 return Legal->isMaskRequired(I);
1605 return isScalarWithPredication(I);
1606 }
1607
1608 /// Returns true if \p I is a memory instruction with consecutive memory
1609 /// access that can be widened.
1610 bool
1611 memoryInstructionCanBeWidened(Instruction *I,
1612 ElementCount VF = ElementCount::getFixed(1));
1613
1614 /// Returns true if \p I is a memory instruction in an interleaved-group
1615 /// of memory accesses that can be vectorized with wide vector loads/stores
1616 /// and shuffles.
1617 bool
1618 interleavedAccessCanBeWidened(Instruction *I,
1619 ElementCount VF = ElementCount::getFixed(1));
1620
1621 /// Check if \p Instr belongs to any interleaved access group.
1622 bool isAccessInterleaved(Instruction *Instr) {
1623 return InterleaveInfo.isInterleaved(Instr);
1624 }
1625
1626 /// Get the interleaved access group that \p Instr belongs to.
1627 const InterleaveGroup<Instruction> *
1628 getInterleavedAccessGroup(Instruction *Instr) {
1629 return InterleaveInfo.getInterleaveGroup(Instr);
1630 }
1631
1632 /// Returns true if we're required to use a scalar epilogue for at least
1633 /// the final iteration of the original loop.
1634 bool requiresScalarEpilogue(ElementCount VF) const {
1635 if (!isScalarEpilogueAllowed())
1636 return false;
1637 // If we might exit from anywhere but the latch, we must run the exiting
1638 // iteration in scalar form.
1639 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
1640 return true;
1641 return VF.isVector() && InterleaveInfo.requiresScalarEpilogue();
1642 }
1643
1644 /// Returns true if a scalar epilogue is allowed, i.e. it has not been
1645 /// disallowed due to optsize or a loop hint annotation.
1646 bool isScalarEpilogueAllowed() const {
1647 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1648 }
1649
1650 /// Returns true if all loop blocks should be masked to fold the loop tail.
1651 bool foldTailByMasking() const { return FoldTailByMasking; }
1652
1653 /// Returns true if the instructions in this block require predication
1654 /// for any reason, e.g. because tail folding now requires a predicate
1655 /// or because the block in the original loop was predicated.
1656 bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const {
1657 return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1658 }
1659
1660 /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
1661 /// nodes to the chain of instructions representing the reductions. Uses a
1662 /// MapVector to ensure deterministic iteration order.
1663 using ReductionChainMap =
1664 SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;
1665
1666 /// Return the chain of instructions representing an inloop reduction.
1667 const ReductionChainMap &getInLoopReductionChains() const {
1668 return InLoopReductionChains;
1669 }
1670
1671 /// Returns true if the Phi is part of an inloop reduction.
1672 bool isInLoopReduction(PHINode *Phi) const {
1673 return InLoopReductionChains.count(Phi);
1674 }
1675
1676 /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1677 /// with factor VF. Return the cost of the instruction, including
1678 /// scalarization overhead if it's needed.
1679 InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;
1680
1681 /// Estimate cost of a call instruction CI if it were vectorized with factor
1682 /// VF. Return the cost of the instruction, including scalarization overhead
1683 /// if it's needed. The flag NeedToScalarize shows if the call needs to be
1684 /// scalarized,
1685 /// i.e. either a vector version isn't available or it is too expensive.
1686 InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF,
1687 bool &NeedToScalarize) const;
1688
1689 /// Returns true if the per-lane cost of VectorizationFactor A is lower than
1690 /// that of B.
1691 bool isMoreProfitable(const VectorizationFactor &A,
1692 const VectorizationFactor &B) const;
1693
1694 /// Invalidates decisions already taken by the cost model.
1695 void invalidateCostModelingDecisions() {
1696 WideningDecisions.clear();
1697 Uniforms.clear();
1698 Scalars.clear();
1699 }
1700
1701 private:
1702 unsigned NumPredStores = 0;
1703
1704 /// \return An upper bound for the vectorization factors for both
1705 /// fixed and scalable vectorization, where the minimum-known number of
1706 /// elements is a power-of-2 larger than zero. If scalable vectorization is
1707 /// disabled or unsupported, then the scalable part will be equal to
1708 /// ElementCount::getScalable(0).
1709 FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount,
1710 ElementCount UserVF,
1711 bool FoldTailByMasking);
1712
1713 /// \return the maximized element count based on the target's vector
1714 /// registers and the loop trip-count, but limited to a maximum safe VF.
1715 /// This is a helper function of computeFeasibleMaxVF.
1716 /// FIXME: MaxSafeVF is currently passed by reference to avoid some obscure
1717 /// issue that occurred on one of the buildbots which cannot be reproduced
1718 /// without having access to the proprietary compiler (see comments on
1719 /// D98509).
The issue is currently under investigation and this workaround 1720 /// will be removed as soon as possible. 1721 ElementCount getMaximizedVFForTarget(unsigned ConstTripCount, 1722 unsigned SmallestType, 1723 unsigned WidestType, 1724 const ElementCount &MaxSafeVF, 1725 bool FoldTailByMasking); 1726 1727 /// \return the maximum legal scalable VF, based on the safe max number 1728 /// of elements. 1729 ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements); 1730 1731 /// The vectorization cost is a combination of the cost itself and a boolean 1732 /// indicating whether any of the contributing operations will actually 1733 /// operate on vector values after type legalization in the backend. If this 1734 /// latter value is false, then all operations will be scalarized (i.e. no 1735 /// vectorization has actually taken place). 1736 using VectorizationCostTy = std::pair<InstructionCost, bool>; 1737 1738 /// Returns the expected execution cost. The unit of the cost does 1739 /// not matter because we use the 'cost' units to compare different 1740 /// vector widths. The cost that is returned is *not* normalized by 1741 /// the factor width. If \p Invalid is not nullptr, this function 1742 /// will add a pair(Instruction*, ElementCount) to \p Invalid for 1743 /// each instruction that has an Invalid cost for the given VF. 1744 using InstructionVFPair = std::pair<Instruction *, ElementCount>; 1745 VectorizationCostTy 1746 expectedCost(ElementCount VF, 1747 SmallVectorImpl<InstructionVFPair> *Invalid = nullptr); 1748 1749 /// Returns the execution time cost of an instruction for a given vector 1750 /// width. Vector width of one means scalar. 1751 VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF); 1752 1753 /// The cost-computation logic from getInstructionCost which provides 1754 /// the vector type as an output parameter. 1755 InstructionCost getInstructionCost(Instruction *I, ElementCount VF, 1756 Type *&VectorTy); 1757 1758 /// Return the cost of instructions in an inloop reduction pattern, if I is 1759 /// part of that pattern. 1760 Optional<InstructionCost> 1761 getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy, 1762 TTI::TargetCostKind CostKind); 1763 1764 /// Calculate vectorization cost of memory instruction \p I. 1765 InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF); 1766 1767 /// The cost computation for scalarized memory instruction. 1768 InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF); 1769 1770 /// The cost computation for interleaving group of memory instructions. 1771 InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF); 1772 1773 /// The cost computation for Gather/Scatter instruction. 1774 InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF); 1775 1776 /// The cost computation for widening instruction \p I with consecutive 1777 /// memory access. 1778 InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF); 1779 1780 /// The cost calculation for Load/Store instruction \p I with uniform pointer - 1781 /// Load: scalar load + broadcast. 1782 /// Store: scalar store + (loop invariant value stored? 0 : extract of last 1783 /// element) 1784 InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF); 1785 1786 /// Estimate the overhead of scalarizing an instruction. This is a 1787 /// convenience wrapper for the type-based getScalarizationOverhead API. 
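/// As a rough sketch (the exact terms are target dependent), this covers the
/// insert/extract traffic implied by scalarization; e.g. at VF 4 a scalarized
/// instruction may pay for extracting four operand lanes and for inserting its
/// four scalar results back into a vector, on top of the scalar operations
/// themselves.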
1788 InstructionCost getScalarizationOverhead(Instruction *I,
1789 ElementCount VF) const;
1790
1791 /// Returns whether the instruction is a load or store and will be emitted
1792 /// as a vector operation.
1793 bool isConsecutiveLoadOrStore(Instruction *I);
1794
1795 /// Returns true if an artificially high cost for emulated masked memrefs
1796 /// should be used.
1797 bool useEmulatedMaskMemRefHack(Instruction *I);
1798
1799 /// Map of scalar integer values to the smallest bitwidth they can be legally
1800 /// represented as. The vector equivalents of these values should be truncated
1801 /// to this type.
1802 MapVector<Instruction *, uint64_t> MinBWs;
1803
1804 /// A type representing the costs for instructions if they were to be
1805 /// scalarized rather than vectorized. The entries are Instruction-Cost
1806 /// pairs.
1807 using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
1808
1809 /// A set containing all BasicBlocks that are known to be present after
1810 /// vectorization as predicated blocks.
1811 SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1812
1813 /// Records whether it is allowed to have the original scalar loop execute at
1814 /// least once. This may be needed as a fallback loop in case runtime
1815 /// aliasing/dependence checks fail, or to handle the tail/remainder
1816 /// iterations when the trip count is unknown or doesn't divide by the VF,
1817 /// or as a peel-loop to handle gaps in interleave-groups.
1818 /// Under optsize and when the trip count is very small we don't allow any
1819 /// iterations to execute in the scalar loop.
1820 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1821
1822 /// All blocks of the loop are to be masked to fold the tail of the scalar iterations.
1823 bool FoldTailByMasking = false;
1824
1825 /// A map holding scalar costs for different vectorization factors. The
1826 /// presence of a cost for an instruction in the mapping indicates that the
1827 /// instruction will be scalarized when vectorizing with the associated
1828 /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1829 DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
1830
1831 /// Holds the instructions known to be uniform after vectorization.
1832 /// The data is collected per VF.
1833 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1834
1835 /// Holds the instructions known to be scalar after vectorization.
1836 /// The data is collected per VF.
1837 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
1838
1839 /// Holds the instructions (address computations) that are forced to be
1840 /// scalarized.
1841 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1842
1843 /// PHINodes of the reductions that should be expanded in-loop along with
1844 /// their associated chains of reduction operations, in program order from top
1845 /// (PHI) to bottom.
1846 ReductionChainMap InLoopReductionChains;
1847
1848 /// A Map of inloop reduction operations and their immediate chain operand.
1849 /// FIXME: This can be removed once reductions can be costed correctly in
1850 /// vplan. This was added to allow quick lookup of the inloop operations,
1851 /// without having to loop through InLoopReductionChains.
1852 DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
1853
1854 /// Returns the expected difference in cost from scalarizing the expression
1855 /// feeding a predicated instruction \p PredInst.
The instructions to 1856 /// scalarize and their scalar costs are collected in \p ScalarCosts. A 1857 /// non-negative return value implies the expression will be scalarized. 1858 /// Currently, only single-use chains are considered for scalarization. 1859 int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts, 1860 ElementCount VF); 1861 1862 /// Collect the instructions that are uniform after vectorization. An 1863 /// instruction is uniform if we represent it with a single scalar value in 1864 /// the vectorized loop corresponding to each vector iteration. Examples of 1865 /// uniform instructions include pointer operands of consecutive or 1866 /// interleaved memory accesses. Note that although uniformity implies an 1867 /// instruction will be scalar, the reverse is not true. In general, a 1868 /// scalarized instruction will be represented by VF scalar values in the 1869 /// vectorized loop, each corresponding to an iteration of the original 1870 /// scalar loop. 1871 void collectLoopUniforms(ElementCount VF); 1872 1873 /// Collect the instructions that are scalar after vectorization. An 1874 /// instruction is scalar if it is known to be uniform or will be scalarized 1875 /// during vectorization. collectLoopScalars should only add non-uniform nodes 1876 /// to the list if they are used by a load/store instruction that is marked as 1877 /// CM_Scalarize. Non-uniform scalarized instructions will be represented by 1878 /// VF values in the vectorized loop, each corresponding to an iteration of 1879 /// the original scalar loop. 1880 void collectLoopScalars(ElementCount VF); 1881 1882 /// Keeps cost model vectorization decision and cost for instructions. 1883 /// Right now it is used for memory instructions only. 1884 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>, 1885 std::pair<InstWidening, InstructionCost>>; 1886 1887 DecisionList WideningDecisions; 1888 1889 /// Returns true if \p V is expected to be vectorized and it needs to be 1890 /// extracted. 1891 bool needsExtract(Value *V, ElementCount VF) const { 1892 Instruction *I = dyn_cast<Instruction>(V); 1893 if (VF.isScalar() || !I || !TheLoop->contains(I) || 1894 TheLoop->isLoopInvariant(I)) 1895 return false; 1896 1897 // Assume we can vectorize V (and hence we need extraction) if the 1898 // scalars are not computed yet. This can happen, because it is called 1899 // via getScalarizationOverhead from setCostBasedWideningDecision, before 1900 // the scalars are collected. That should be a safe assumption in most 1901 // cases, because we check if the operands have vectorizable types 1902 // beforehand in LoopVectorizationLegality. 1903 return Scalars.find(VF) == Scalars.end() || 1904 !isScalarAfterVectorization(I, VF); 1905 }; 1906 1907 /// Returns a range containing only operands needing to be extracted. 1908 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, 1909 ElementCount VF) const { 1910 return SmallVector<Value *, 4>(make_filter_range( 1911 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); 1912 } 1913 1914 /// Determines if we have the infrastructure to vectorize loop \p L and its 1915 /// epilogue, assuming the main loop is vectorized by \p VF. 1916 bool isCandidateForEpilogueVectorization(const Loop &L, 1917 const ElementCount VF) const; 1918 1919 /// Returns true if epilogue vectorization is considered profitable, and 1920 /// false otherwise. 1921 /// \p VF is the vectorization factor chosen for the original loop. 
1922 bool isEpilogueVectorizationProfitable(const ElementCount VF) const;
1923
1924 public:
1925 /// The loop that we evaluate.
1926 Loop *TheLoop;
1927
1928 /// Predicated scalar evolution analysis.
1929 PredicatedScalarEvolution &PSE;
1930
1931 /// Loop Info analysis.
1932 LoopInfo *LI;
1933
1934 /// Vectorization legality.
1935 LoopVectorizationLegality *Legal;
1936
1937 /// Vector target information.
1938 const TargetTransformInfo &TTI;
1939
1940 /// Target Library Info.
1941 const TargetLibraryInfo *TLI;
1942
1943 /// Demanded bits analysis.
1944 DemandedBits *DB;
1945
1946 /// Assumption cache.
1947 AssumptionCache *AC;
1948
1949 /// Interface to emit optimization remarks.
1950 OptimizationRemarkEmitter *ORE;
1951
1952 const Function *TheFunction;
1953
1954 /// Loop Vectorize Hints.
1955 const LoopVectorizeHints *Hints;
1956
1957 /// The interleaved access information contains groups of interleaved accesses
1958 /// with the same stride that are close to each other.
1959 InterleavedAccessInfo &InterleaveInfo;
1960
1961 /// Values to ignore in the cost model.
1962 SmallPtrSet<const Value *, 16> ValuesToIgnore;
1963
1964 /// Values to ignore in the cost model when VF > 1.
1965 SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1966
1967 /// All element types found in the loop.
1968 SmallPtrSet<Type *, 16> ElementTypesInLoop;
1969
1970 /// Profitable vector factors.
1971 SmallVector<VectorizationFactor, 8> ProfitableVFs;
1972 };
1973 } // end namespace llvm
1974
1975 /// Helper struct to manage generating runtime checks for vectorization.
1976 ///
1977 /// The runtime checks are created up-front in temporary blocks to allow better
1978 /// estimating of their cost, and are un-linked from the existing IR. After deciding to
1979 /// vectorize, the checks are moved back. If deciding not to vectorize, the
1980 /// temporary blocks are completely removed.
1981 class GeneratedRTChecks {
1982 /// Basic block which contains the generated SCEV checks, if any.
1983 BasicBlock *SCEVCheckBlock = nullptr;
1984
1985 /// The value representing the result of the generated SCEV checks. If it is
1986 /// nullptr, either no SCEV checks have been generated or they have been used.
1987 Value *SCEVCheckCond = nullptr;
1988
1989 /// Basic block which contains the generated memory runtime checks, if any.
1990 BasicBlock *MemCheckBlock = nullptr;
1991
1992 /// The value representing the result of the generated memory runtime checks.
1993 /// If it is nullptr, either no memory runtime checks have been generated or
1994 /// they have been used.
1995 Value *MemRuntimeCheckCond = nullptr;
1996
1997 DominatorTree *DT;
1998 LoopInfo *LI;
1999
2000 SCEVExpander SCEVExp;
2001 SCEVExpander MemCheckExp;
2002
2003 public:
2004 GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
2005 const DataLayout &DL)
2006 : DT(DT), LI(LI), SCEVExp(SE, DL, "scev.check"),
2007 MemCheckExp(SE, DL, "scev.check") {}
2008
2009 /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
2010 /// accurately estimate the cost of the runtime checks. The blocks are
2011 /// un-linked from the IR and are added back during vector code generation. If
2012 /// there is no vector code generation, the check blocks are removed
2013 /// completely.
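/// As an illustrative sketch of the intermediate state: after Create(), the
/// 'vector.scevcheck' and/or 'vector.memcheck' blocks are terminated by an
/// unreachable and detached from the preheader; emitSCEVChecks() and
/// emitMemRuntimeChecks() later splice them back in front of the vector
/// preheader, branching to the bypass block when a check fails.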
2014 void Create(Loop *L, const LoopAccessInfo &LAI, 2015 const SCEVUnionPredicate &UnionPred) { 2016 2017 BasicBlock *LoopHeader = L->getHeader(); 2018 BasicBlock *Preheader = L->getLoopPreheader(); 2019 2020 // Use SplitBlock to create blocks for SCEV & memory runtime checks to 2021 // ensure the blocks are properly added to LoopInfo & DominatorTree. Those 2022 // may be used by SCEVExpander. The blocks will be un-linked from their 2023 // predecessors and removed from LI & DT at the end of the function. 2024 if (!UnionPred.isAlwaysTrue()) { 2025 SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI, 2026 nullptr, "vector.scevcheck"); 2027 2028 SCEVCheckCond = SCEVExp.expandCodeForPredicate( 2029 &UnionPred, SCEVCheckBlock->getTerminator()); 2030 } 2031 2032 const auto &RtPtrChecking = *LAI.getRuntimePointerChecking(); 2033 if (RtPtrChecking.Need) { 2034 auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader; 2035 MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr, 2036 "vector.memcheck"); 2037 2038 MemRuntimeCheckCond = 2039 addRuntimeChecks(MemCheckBlock->getTerminator(), L, 2040 RtPtrChecking.getChecks(), MemCheckExp); 2041 assert(MemRuntimeCheckCond && 2042 "no RT checks generated although RtPtrChecking " 2043 "claimed checks are required"); 2044 } 2045 2046 if (!MemCheckBlock && !SCEVCheckBlock) 2047 return; 2048 2049 // Unhook the temporary block with the checks, update various places 2050 // accordingly. 2051 if (SCEVCheckBlock) 2052 SCEVCheckBlock->replaceAllUsesWith(Preheader); 2053 if (MemCheckBlock) 2054 MemCheckBlock->replaceAllUsesWith(Preheader); 2055 2056 if (SCEVCheckBlock) { 2057 SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 2058 new UnreachableInst(Preheader->getContext(), SCEVCheckBlock); 2059 Preheader->getTerminator()->eraseFromParent(); 2060 } 2061 if (MemCheckBlock) { 2062 MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 2063 new UnreachableInst(Preheader->getContext(), MemCheckBlock); 2064 Preheader->getTerminator()->eraseFromParent(); 2065 } 2066 2067 DT->changeImmediateDominator(LoopHeader, Preheader); 2068 if (MemCheckBlock) { 2069 DT->eraseNode(MemCheckBlock); 2070 LI->removeBlock(MemCheckBlock); 2071 } 2072 if (SCEVCheckBlock) { 2073 DT->eraseNode(SCEVCheckBlock); 2074 LI->removeBlock(SCEVCheckBlock); 2075 } 2076 } 2077 2078 /// Remove the created SCEV & memory runtime check blocks & instructions, if 2079 /// unused. 2080 ~GeneratedRTChecks() { 2081 SCEVExpanderCleaner SCEVCleaner(SCEVExp, *DT); 2082 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp, *DT); 2083 if (!SCEVCheckCond) 2084 SCEVCleaner.markResultUsed(); 2085 2086 if (!MemRuntimeCheckCond) 2087 MemCheckCleaner.markResultUsed(); 2088 2089 if (MemRuntimeCheckCond) { 2090 auto &SE = *MemCheckExp.getSE(); 2091 // Memory runtime check generation creates compares that use expanded 2092 // values. Remove them before running the SCEVExpanderCleaners. 
2093 for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) { 2094 if (MemCheckExp.isInsertedInstruction(&I)) 2095 continue; 2096 SE.forgetValue(&I); 2097 I.eraseFromParent(); 2098 } 2099 } 2100 MemCheckCleaner.cleanup(); 2101 SCEVCleaner.cleanup(); 2102 2103 if (SCEVCheckCond) 2104 SCEVCheckBlock->eraseFromParent(); 2105 if (MemRuntimeCheckCond) 2106 MemCheckBlock->eraseFromParent(); 2107 } 2108 2109 /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and 2110 /// adjusts the branches to branch to the vector preheader or \p Bypass, 2111 /// depending on the generated condition. 2112 BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass, 2113 BasicBlock *LoopVectorPreHeader, 2114 BasicBlock *LoopExitBlock) { 2115 if (!SCEVCheckCond) 2116 return nullptr; 2117 if (auto *C = dyn_cast<ConstantInt>(SCEVCheckCond)) 2118 if (C->isZero()) 2119 return nullptr; 2120 2121 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2122 2123 BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock); 2124 // Create new preheader for vector loop. 2125 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 2126 PL->addBasicBlockToLoop(SCEVCheckBlock, *LI); 2127 2128 SCEVCheckBlock->getTerminator()->eraseFromParent(); 2129 SCEVCheckBlock->moveBefore(LoopVectorPreHeader); 2130 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2131 SCEVCheckBlock); 2132 2133 DT->addNewBlock(SCEVCheckBlock, Pred); 2134 DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock); 2135 2136 ReplaceInstWithInst( 2137 SCEVCheckBlock->getTerminator(), 2138 BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheckCond)); 2139 // Mark the check as used, to prevent it from being removed during cleanup. 2140 SCEVCheckCond = nullptr; 2141 return SCEVCheckBlock; 2142 } 2143 2144 /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts 2145 /// the branches to branch to the vector preheader or \p Bypass, depending on 2146 /// the generated condition. 2147 BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass, 2148 BasicBlock *LoopVectorPreHeader) { 2149 // Check if we generated code that checks in runtime if arrays overlap. 2150 if (!MemRuntimeCheckCond) 2151 return nullptr; 2152 2153 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2154 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2155 MemCheckBlock); 2156 2157 DT->addNewBlock(MemCheckBlock, Pred); 2158 DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock); 2159 MemCheckBlock->moveBefore(LoopVectorPreHeader); 2160 2161 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 2162 PL->addBasicBlockToLoop(MemCheckBlock, *LI); 2163 2164 ReplaceInstWithInst( 2165 MemCheckBlock->getTerminator(), 2166 BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond)); 2167 MemCheckBlock->getTerminator()->setDebugLoc( 2168 Pred->getTerminator()->getDebugLoc()); 2169 2170 // Mark the check as used, to prevent it from being removed during cleanup. 2171 MemRuntimeCheckCond = nullptr; 2172 return MemCheckBlock; 2173 } 2174 }; 2175 2176 // Return true if \p OuterLp is an outer loop annotated with hints for explicit 2177 // vectorization. The loop needs to be annotated with #pragma omp simd 2178 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the 2179 // vector length information is not provided, vectorization is not considered 2180 // explicit. Interleave hints are not allowed either. These limitations will be 2181 // relaxed in the future. 
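// For example (illustrative only), an outer loop preceded by
//   #pragma clang loop vectorize(enable) vectorize_width(4)
// or by '#pragma omp simd simdlen(4)' carries the explicit vector length
// required here, whereas 'vectorize(enable)' on its own does not.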
2182 // Please, note that we are currently forced to abuse the pragma 'clang 2183 // vectorize' semantics. This pragma provides *auto-vectorization hints* 2184 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' 2185 // provides *explicit vectorization hints* (LV can bypass legal checks and 2186 // assume that vectorization is legal). However, both hints are implemented 2187 // using the same metadata (llvm.loop.vectorize, processed by 2188 // LoopVectorizeHints). This will be fixed in the future when the native IR 2189 // representation for pragma 'omp simd' is introduced. 2190 static bool isExplicitVecOuterLoop(Loop *OuterLp, 2191 OptimizationRemarkEmitter *ORE) { 2192 assert(!OuterLp->isInnermost() && "This is not an outer loop"); 2193 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); 2194 2195 // Only outer loops with an explicit vectorization hint are supported. 2196 // Unannotated outer loops are ignored. 2197 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) 2198 return false; 2199 2200 Function *Fn = OuterLp->getHeader()->getParent(); 2201 if (!Hints.allowVectorization(Fn, OuterLp, 2202 true /*VectorizeOnlyWhenForced*/)) { 2203 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); 2204 return false; 2205 } 2206 2207 if (Hints.getInterleave() > 1) { 2208 // TODO: Interleave support is future work. 2209 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " 2210 "outer loops.\n"); 2211 Hints.emitRemarkWithHints(); 2212 return false; 2213 } 2214 2215 return true; 2216 } 2217 2218 static void collectSupportedLoops(Loop &L, LoopInfo *LI, 2219 OptimizationRemarkEmitter *ORE, 2220 SmallVectorImpl<Loop *> &V) { 2221 // Collect inner loops and outer loops without irreducible control flow. For 2222 // now, only collect outer loops that have explicit vectorization hints. If we 2223 // are stress testing the VPlan H-CFG construction, we collect the outermost 2224 // loop of every loop nest. 2225 if (L.isInnermost() || VPlanBuildStressTest || 2226 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { 2227 LoopBlocksRPO RPOT(&L); 2228 RPOT.perform(LI); 2229 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { 2230 V.push_back(&L); 2231 // TODO: Collect inner loops inside marked outer loops in case 2232 // vectorization fails for the outer loop. Do not invoke 2233 // 'containsIrreducibleCFG' again for inner loops when the outer loop is 2234 // already known to be reducible. We can use an inherited attribute for 2235 // that. 2236 return; 2237 } 2238 } 2239 for (Loop *InnerL : L) 2240 collectSupportedLoops(*InnerL, LI, ORE, V); 2241 } 2242 2243 namespace { 2244 2245 /// The LoopVectorize Pass. 
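/// This is the legacy pass-manager wrapper: it collects the analyses declared
/// in getAnalysisUsage() and forwards the real work to the LoopVectorizePass
/// instance held in Impl.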
2246 struct LoopVectorize : public FunctionPass { 2247 /// Pass identification, replacement for typeid 2248 static char ID; 2249 2250 LoopVectorizePass Impl; 2251 2252 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false, 2253 bool VectorizeOnlyWhenForced = false) 2254 : FunctionPass(ID), 2255 Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) { 2256 initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); 2257 } 2258 2259 bool runOnFunction(Function &F) override { 2260 if (skipFunction(F)) 2261 return false; 2262 2263 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 2264 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 2265 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 2266 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 2267 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); 2268 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); 2269 auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr; 2270 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); 2271 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 2272 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); 2273 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); 2274 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 2275 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 2276 2277 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 2278 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; 2279 2280 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC, 2281 GetLAA, *ORE, PSI).MadeAnyChange; 2282 } 2283 2284 void getAnalysisUsage(AnalysisUsage &AU) const override { 2285 AU.addRequired<AssumptionCacheTracker>(); 2286 AU.addRequired<BlockFrequencyInfoWrapperPass>(); 2287 AU.addRequired<DominatorTreeWrapperPass>(); 2288 AU.addRequired<LoopInfoWrapperPass>(); 2289 AU.addRequired<ScalarEvolutionWrapperPass>(); 2290 AU.addRequired<TargetTransformInfoWrapperPass>(); 2291 AU.addRequired<AAResultsWrapperPass>(); 2292 AU.addRequired<LoopAccessLegacyAnalysis>(); 2293 AU.addRequired<DemandedBitsWrapperPass>(); 2294 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 2295 AU.addRequired<InjectTLIMappingsLegacy>(); 2296 2297 // We currently do not preserve loopinfo/dominator analyses with outer loop 2298 // vectorization. Until this is addressed, mark these analyses as preserved 2299 // only for non-VPlan-native path. 2300 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 2301 if (!EnableVPlanNativePath) { 2302 AU.addPreserved<LoopInfoWrapperPass>(); 2303 AU.addPreserved<DominatorTreeWrapperPass>(); 2304 } 2305 2306 AU.addPreserved<BasicAAWrapperPass>(); 2307 AU.addPreserved<GlobalsAAWrapperPass>(); 2308 AU.addRequired<ProfileSummaryInfoWrapperPass>(); 2309 } 2310 }; 2311 2312 } // end anonymous namespace 2313 2314 //===----------------------------------------------------------------------===// 2315 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 2316 // LoopVectorizationCostModel and LoopVectorizationPlanner. 2317 //===----------------------------------------------------------------------===// 2318 2319 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { 2320 // We need to place the broadcast of invariant variables outside the loop, 2321 // but only if it's proven safe to do so. Else, broadcast will be inside 2322 // vector loop body. 
2323 Instruction *Instr = dyn_cast<Instruction>(V); 2324 bool SafeToHoist = OrigLoop->isLoopInvariant(V) && 2325 (!Instr || 2326 DT->dominates(Instr->getParent(), LoopVectorPreHeader)); 2327 // Place the code for broadcasting invariant variables in the new preheader. 2328 IRBuilder<>::InsertPointGuard Guard(Builder); 2329 if (SafeToHoist) 2330 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2331 2332 // Broadcast the scalar into all locations in the vector. 2333 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); 2334 2335 return Shuf; 2336 } 2337 2338 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( 2339 const InductionDescriptor &II, Value *Step, Value *Start, 2340 Instruction *EntryVal, VPValue *Def, VPTransformState &State) { 2341 IRBuilder<> &Builder = State.Builder; 2342 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 2343 "Expected either an induction phi-node or a truncate of it!"); 2344 2345 // Construct the initial value of the vector IV in the vector loop preheader 2346 auto CurrIP = Builder.saveIP(); 2347 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2348 if (isa<TruncInst>(EntryVal)) { 2349 assert(Start->getType()->isIntegerTy() && 2350 "Truncation requires an integer type"); 2351 auto *TruncType = cast<IntegerType>(EntryVal->getType()); 2352 Step = Builder.CreateTrunc(Step, TruncType); 2353 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); 2354 } 2355 2356 Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0); 2357 Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start); 2358 Value *SteppedStart = 2359 getStepVector(SplatStart, Zero, Step, II.getInductionOpcode()); 2360 2361 // We create vector phi nodes for both integer and floating-point induction 2362 // variables. Here, we determine the kind of arithmetic we will perform. 2363 Instruction::BinaryOps AddOp; 2364 Instruction::BinaryOps MulOp; 2365 if (Step->getType()->isIntegerTy()) { 2366 AddOp = Instruction::Add; 2367 MulOp = Instruction::Mul; 2368 } else { 2369 AddOp = II.getInductionOpcode(); 2370 MulOp = Instruction::FMul; 2371 } 2372 2373 // Multiply the vectorization factor by the step using integer or 2374 // floating-point arithmetic as appropriate. 2375 Type *StepType = Step->getType(); 2376 Value *RuntimeVF; 2377 if (Step->getType()->isFloatingPointTy()) 2378 RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF); 2379 else 2380 RuntimeVF = getRuntimeVF(Builder, StepType, State.VF); 2381 Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF); 2382 2383 // Create a vector splat to use in the induction update. 2384 // 2385 // FIXME: If the step is non-constant, we create the vector splat with 2386 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't 2387 // handle a constant vector splat. 2388 Value *SplatVF = isa<Constant>(Mul) 2389 ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul)) 2390 : Builder.CreateVectorSplat(State.VF, Mul); 2391 Builder.restoreIP(CurrIP); 2392 2393 // We may need to add the step a number of times, depending on the unroll 2394 // factor. The last of those goes into the PHI. 
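// E.g., an illustrative sketch for a fixed VF of 4, UF of 2 and a unit step:
//   vec.ind      = phi [ <0,1,2,3>, vector.ph ], [ vec.ind.next, latch ]
//   step.add     = add vec.ind, <4,4,4,4>   ; the value used for part 1
//   vec.ind.next = add step.add, <4,4,4,4>  ; becomes the phi's back-edge value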
2395 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", 2396 &*LoopVectorBody->getFirstInsertionPt()); 2397 VecInd->setDebugLoc(EntryVal->getDebugLoc()); 2398 Instruction *LastInduction = VecInd; 2399 for (unsigned Part = 0; Part < UF; ++Part) { 2400 State.set(Def, LastInduction, Part); 2401 2402 if (isa<TruncInst>(EntryVal)) 2403 addMetadata(LastInduction, EntryVal); 2404 2405 LastInduction = cast<Instruction>( 2406 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")); 2407 LastInduction->setDebugLoc(EntryVal->getDebugLoc()); 2408 } 2409 2410 // Move the last step to the end of the latch block. This ensures consistent 2411 // placement of all induction updates. 2412 auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 2413 auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator()); 2414 auto *ICmp = cast<Instruction>(Br->getCondition()); 2415 LastInduction->moveBefore(ICmp); 2416 LastInduction->setName("vec.ind.next"); 2417 2418 VecInd->addIncoming(SteppedStart, LoopVectorPreHeader); 2419 VecInd->addIncoming(LastInduction, LoopVectorLatch); 2420 } 2421 2422 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const { 2423 return Cost->isScalarAfterVectorization(I, VF) || 2424 Cost->isProfitableToScalarize(I, VF); 2425 } 2426 2427 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const { 2428 if (shouldScalarizeInstruction(IV)) 2429 return true; 2430 auto isScalarInst = [&](User *U) -> bool { 2431 auto *I = cast<Instruction>(U); 2432 return (OrigLoop->contains(I) && shouldScalarizeInstruction(I)); 2433 }; 2434 return llvm::any_of(IV->users(), isScalarInst); 2435 } 2436 2437 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, 2438 const InductionDescriptor &ID, 2439 Value *Start, TruncInst *Trunc, 2440 VPValue *Def, 2441 VPTransformState &State) { 2442 IRBuilder<> &Builder = State.Builder; 2443 assert((IV->getType()->isIntegerTy() || IV != OldInduction) && 2444 "Primary induction variable must have an integer type"); 2445 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); 2446 2447 // The value from the original loop to which we are mapping the new induction 2448 // variable. 2449 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; 2450 2451 auto &DL = EntryVal->getModule()->getDataLayout(); 2452 2453 // Generate code for the induction step. Note that induction steps are 2454 // required to be loop-invariant 2455 auto CreateStepValue = [&](const SCEV *Step) -> Value * { 2456 assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) && 2457 "Induction step should be loop invariant"); 2458 if (PSE.getSE()->isSCEVable(IV->getType())) { 2459 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 2460 return Exp.expandCodeFor(Step, Step->getType(), 2461 State.CFG.VectorPreHeader->getTerminator()); 2462 } 2463 return cast<SCEVUnknown>(Step)->getValue(); 2464 }; 2465 2466 // The scalar value to broadcast. This is derived from the canonical 2467 // induction variable. If a truncation type is given, truncate the canonical 2468 // induction variable and step. Otherwise, derive these values from the 2469 // induction descriptor. 2470 auto CreateScalarIV = [&](Value *&Step) -> Value * { 2471 Value *ScalarIV = Induction; 2472 if (IV != OldInduction) { 2473 ScalarIV = IV->getType()->isIntegerTy() 2474 ? 
Builder.CreateSExtOrTrunc(Induction, IV->getType()) 2475 : Builder.CreateCast(Instruction::SIToFP, Induction, 2476 IV->getType()); 2477 ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID, 2478 State.CFG.PrevBB); 2479 ScalarIV->setName("offset.idx"); 2480 } 2481 if (Trunc) { 2482 auto *TruncType = cast<IntegerType>(Trunc->getType()); 2483 assert(Step->getType()->isIntegerTy() && 2484 "Truncation requires an integer step"); 2485 ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType); 2486 Step = Builder.CreateTrunc(Step, TruncType); 2487 } 2488 return ScalarIV; 2489 }; 2490 2491 // Create the vector values from the scalar IV, in the absence of creating a 2492 // vector IV. 2493 auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) { 2494 Value *Broadcasted = getBroadcastInstrs(ScalarIV); 2495 for (unsigned Part = 0; Part < UF; ++Part) { 2496 assert(!State.VF.isScalable() && "scalable vectors not yet supported."); 2497 Value *StartIdx; 2498 if (Step->getType()->isFloatingPointTy()) 2499 StartIdx = 2500 getRuntimeVFAsFloat(Builder, Step->getType(), State.VF * Part); 2501 else 2502 StartIdx = getRuntimeVF(Builder, Step->getType(), State.VF * Part); 2503 2504 Value *EntryPart = 2505 getStepVector(Broadcasted, StartIdx, Step, ID.getInductionOpcode()); 2506 State.set(Def, EntryPart, Part); 2507 if (Trunc) 2508 addMetadata(EntryPart, Trunc); 2509 } 2510 }; 2511 2512 // Fast-math-flags propagate from the original induction instruction. 2513 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 2514 if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp())) 2515 Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags()); 2516 2517 // Now do the actual transformations, and start with creating the step value. 2518 Value *Step = CreateStepValue(ID.getStep()); 2519 if (State.VF.isZero() || State.VF.isScalar()) { 2520 Value *ScalarIV = CreateScalarIV(Step); 2521 CreateSplatIV(ScalarIV, Step); 2522 return; 2523 } 2524 2525 // Determine if we want a scalar version of the induction variable. This is 2526 // true if the induction variable itself is not widened, or if it has at 2527 // least one user in the loop that is not widened. 2528 auto NeedsScalarIV = needsScalarInduction(EntryVal); 2529 if (!NeedsScalarIV) { 2530 createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, State); 2531 return; 2532 } 2533 2534 // Try to create a new independent vector induction variable. If we can't 2535 // create the phi node, we will splat the scalar induction variable in each 2536 // loop iteration. 2537 if (!shouldScalarizeInstruction(EntryVal)) { 2538 createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, State); 2539 Value *ScalarIV = CreateScalarIV(Step); 2540 // Create scalar steps that can be used by instructions we will later 2541 // scalarize. Note that the addition of the scalar steps will not increase 2542 // the number of instructions in the loop in the common case prior to 2543 // InstCombine. We will be trading one vector extract for each scalar step. 2544 buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, State); 2545 return; 2546 } 2547 2548 // All IV users are scalar instructions, so only emit a scalar IV, not a 2549 // vectorised IV. Except when we tail-fold, then the splat IV feeds the 2550 // predicate used by the masked loads/stores. 
2551 Value *ScalarIV = CreateScalarIV(Step); 2552 if (!Cost->isScalarEpilogueAllowed()) 2553 CreateSplatIV(ScalarIV, Step); 2554 buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, State); 2555 } 2556 2557 Value *InnerLoopVectorizer::getStepVector(Value *Val, Value *StartIdx, 2558 Value *Step, 2559 Instruction::BinaryOps BinOp) { 2560 // Create and check the types. 2561 auto *ValVTy = cast<VectorType>(Val->getType()); 2562 ElementCount VLen = ValVTy->getElementCount(); 2563 2564 Type *STy = Val->getType()->getScalarType(); 2565 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && 2566 "Induction Step must be an integer or FP"); 2567 assert(Step->getType() == STy && "Step has wrong type"); 2568 2569 SmallVector<Constant *, 8> Indices; 2570 2571 // Create a vector of consecutive numbers from zero to VF. 2572 VectorType *InitVecValVTy = ValVTy; 2573 Type *InitVecValSTy = STy; 2574 if (STy->isFloatingPointTy()) { 2575 InitVecValSTy = 2576 IntegerType::get(STy->getContext(), STy->getScalarSizeInBits()); 2577 InitVecValVTy = VectorType::get(InitVecValSTy, VLen); 2578 } 2579 Value *InitVec = Builder.CreateStepVector(InitVecValVTy); 2580 2581 // Splat the StartIdx 2582 Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx); 2583 2584 if (STy->isIntegerTy()) { 2585 InitVec = Builder.CreateAdd(InitVec, StartIdxSplat); 2586 Step = Builder.CreateVectorSplat(VLen, Step); 2587 assert(Step->getType() == Val->getType() && "Invalid step vec"); 2588 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 2589 // which can be found from the original scalar operations. 2590 Step = Builder.CreateMul(InitVec, Step); 2591 return Builder.CreateAdd(Val, Step, "induction"); 2592 } 2593 2594 // Floating point induction. 2595 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && 2596 "Binary Opcode should be specified for FP induction"); 2597 InitVec = Builder.CreateUIToFP(InitVec, ValVTy); 2598 InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat); 2599 2600 Step = Builder.CreateVectorSplat(VLen, Step); 2601 Value *MulOp = Builder.CreateFMul(InitVec, Step); 2602 return Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); 2603 } 2604 2605 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, 2606 Instruction *EntryVal, 2607 const InductionDescriptor &ID, 2608 VPValue *Def, 2609 VPTransformState &State) { 2610 IRBuilder<> &Builder = State.Builder; 2611 // We shouldn't have to build scalar steps if we aren't vectorizing. 2612 assert(State.VF.isVector() && "VF should be greater than one"); 2613 // Get the value type and ensure it and the step have the same integer type. 2614 Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); 2615 assert(ScalarIVTy == Step->getType() && 2616 "Val and Step should have the same type"); 2617 2618 // We build scalar steps for both integer and floating-point induction 2619 // variables. Here, we determine the kind of arithmetic we will perform. 2620 Instruction::BinaryOps AddOp; 2621 Instruction::BinaryOps MulOp; 2622 if (ScalarIVTy->isIntegerTy()) { 2623 AddOp = Instruction::Add; 2624 MulOp = Instruction::Mul; 2625 } else { 2626 AddOp = ID.getInductionOpcode(); 2627 MulOp = Instruction::FMul; 2628 } 2629 2630 // Determine the number of scalars we need to generate for each unroll 2631 // iteration. If EntryVal is uniform, we only need to generate the first 2632 // lane. Otherwise, we generate all VF values. 
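// E.g. (illustrative) the scalar address of a consecutive access is uniform,
// so lane 0 of each part suffices, whereas a scalarized predicated division
// needs all VF lanes of every part.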
2633 bool IsUniform = 2634 Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), State.VF); 2635 unsigned Lanes = IsUniform ? 1 : State.VF.getKnownMinValue(); 2636 // Compute the scalar steps and save the results in State. 2637 Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(), 2638 ScalarIVTy->getScalarSizeInBits()); 2639 Type *VecIVTy = nullptr; 2640 Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr; 2641 if (!IsUniform && State.VF.isScalable()) { 2642 VecIVTy = VectorType::get(ScalarIVTy, State.VF); 2643 UnitStepVec = 2644 Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF)); 2645 SplatStep = Builder.CreateVectorSplat(State.VF, Step); 2646 SplatIV = Builder.CreateVectorSplat(State.VF, ScalarIV); 2647 } 2648 2649 for (unsigned Part = 0; Part < State.UF; ++Part) { 2650 Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part); 2651 2652 if (!IsUniform && State.VF.isScalable()) { 2653 auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0); 2654 auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec); 2655 if (ScalarIVTy->isFloatingPointTy()) 2656 InitVec = Builder.CreateSIToFP(InitVec, VecIVTy); 2657 auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep); 2658 auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul); 2659 State.set(Def, Add, Part); 2660 // It's useful to record the lane values too for the known minimum number 2661 // of elements so we do those below. This improves the code quality when 2662 // trying to extract the first element, for example. 2663 } 2664 2665 if (ScalarIVTy->isFloatingPointTy()) 2666 StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy); 2667 2668 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 2669 Value *StartIdx = Builder.CreateBinOp( 2670 AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane)); 2671 // The step returned by `createStepForVF` is a runtime-evaluated value 2672 // when VF is scalable. Otherwise, it should be folded into a Constant. 2673 assert((State.VF.isScalable() || isa<Constant>(StartIdx)) && 2674 "Expected StartIdx to be folded to a constant when VF is not " 2675 "scalable"); 2676 auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step); 2677 auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul); 2678 State.set(Def, Add, VPIteration(Part, Lane)); 2679 } 2680 } 2681 } 2682 2683 void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def, 2684 const VPIteration &Instance, 2685 VPTransformState &State) { 2686 Value *ScalarInst = State.get(Def, Instance); 2687 Value *VectorValue = State.get(Def, Instance.Part); 2688 VectorValue = Builder.CreateInsertElement( 2689 VectorValue, ScalarInst, 2690 Instance.Lane.getAsRuntimeExpr(State.Builder, VF)); 2691 State.set(Def, VectorValue, Instance.Part); 2692 } 2693 2694 Value *InnerLoopVectorizer::reverseVector(Value *Vec) { 2695 assert(Vec->getType()->isVectorTy() && "Invalid type"); 2696 return Builder.CreateVectorReverse(Vec, "reverse"); 2697 } 2698 2699 // Return whether we allow using masked interleave-groups (for dealing with 2700 // strided loads/stores that reside in predicated blocks, or for dealing 2701 // with gaps). 2702 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { 2703 // If an override option has been passed in for interleaved accesses, use it. 
2704 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) 2705 return EnableMaskedInterleavedMemAccesses; 2706 2707 return TTI.enableMaskedInterleavedAccessVectorization(); 2708 } 2709 2710 // Try to vectorize the interleave group that \p Instr belongs to. 2711 // 2712 // E.g. Translate following interleaved load group (factor = 3): 2713 // for (i = 0; i < N; i+=3) { 2714 // R = Pic[i]; // Member of index 0 2715 // G = Pic[i+1]; // Member of index 1 2716 // B = Pic[i+2]; // Member of index 2 2717 // ... // do something to R, G, B 2718 // } 2719 // To: 2720 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2721 // %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements 2722 // %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements 2723 // %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements 2724 // 2725 // Or translate following interleaved store group (factor = 3): 2726 // for (i = 0; i < N; i+=3) { 2727 // ... do something to R, G, B 2728 // Pic[i] = R; // Member of index 0 2729 // Pic[i+1] = G; // Member of index 1 2730 // Pic[i+2] = B; // Member of index 2 2731 // } 2732 // To: 2733 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> 2734 // %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u> 2735 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, 2736 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements 2737 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B 2738 void InnerLoopVectorizer::vectorizeInterleaveGroup( 2739 const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs, 2740 VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues, 2741 VPValue *BlockInMask) { 2742 Instruction *Instr = Group->getInsertPos(); 2743 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2744 2745 // Prepare for the vector type of the interleaved load/store. 2746 Type *ScalarTy = getLoadStoreType(Instr); 2747 unsigned InterleaveFactor = Group->getFactor(); 2748 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2749 auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor); 2750 2751 // Prepare for the new pointers. 2752 SmallVector<Value *, 2> AddrParts; 2753 unsigned Index = Group->getIndex(Instr); 2754 2755 // TODO: extend the masked interleaved-group support to reversed access. 2756 assert((!BlockInMask || !Group->isReverse()) && 2757 "Reversed masked interleave-group not supported."); 2758 2759 // If the group is reverse, adjust the index to refer to the last vector lane 2760 // instead of the first. We adjust the index from the first vector lane, 2761 // rather than directly getting the pointer for lane VF - 1, because the 2762 // pointer operand of the interleaved access is supposed to be uniform. For 2763 // uniform instructions, we're only required to generate a value for the 2764 // first vector lane in each unroll iteration. 2765 if (Group->isReverse()) 2766 Index += (VF.getKnownMinValue() - 1) * Group->getFactor(); 2767 2768 for (unsigned Part = 0; Part < UF; Part++) { 2769 Value *AddrPart = State.get(Addr, VPIteration(Part, 0)); 2770 setDebugLocFromInst(AddrPart); 2771 2772 // Notice current instruction could be any index. Need to adjust the address 2773 // to the member of index 0. 2774 // 2775 // E.g. a = A[i+1]; // Member of index 1 (Current instruction) 2776 // b = A[i]; // Member of index 0 2777 // Current pointer is pointed to A[i+1], adjust it to A[i]. 2778 // 2779 // E.g. 
A[i+1] = a; // Member of index 1 2780 // A[i] = b; // Member of index 0 2781 // A[i+2] = c; // Member of index 2 (Current instruction) 2782 // Current pointer is pointed to A[i+2], adjust it to A[i]. 2783 2784 bool InBounds = false; 2785 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts())) 2786 InBounds = gep->isInBounds(); 2787 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index)); 2788 cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds); 2789 2790 // Cast to the vector pointer type. 2791 unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace(); 2792 Type *PtrTy = VecTy->getPointerTo(AddressSpace); 2793 AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy)); 2794 } 2795 2796 setDebugLocFromInst(Instr); 2797 Value *PoisonVec = PoisonValue::get(VecTy); 2798 2799 Value *MaskForGaps = nullptr; 2800 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) { 2801 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); 2802 assert(MaskForGaps && "Mask for Gaps is required but it is null"); 2803 } 2804 2805 // Vectorize the interleaved load group. 2806 if (isa<LoadInst>(Instr)) { 2807 // For each unroll part, create a wide load for the group. 2808 SmallVector<Value *, 2> NewLoads; 2809 for (unsigned Part = 0; Part < UF; Part++) { 2810 Instruction *NewLoad; 2811 if (BlockInMask || MaskForGaps) { 2812 assert(useMaskedInterleavedAccesses(*TTI) && 2813 "masked interleaved groups are not allowed."); 2814 Value *GroupMask = MaskForGaps; 2815 if (BlockInMask) { 2816 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2817 Value *ShuffledMask = Builder.CreateShuffleVector( 2818 BlockInMaskPart, 2819 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2820 "interleaved.mask"); 2821 GroupMask = MaskForGaps 2822 ? Builder.CreateBinOp(Instruction::And, ShuffledMask, 2823 MaskForGaps) 2824 : ShuffledMask; 2825 } 2826 NewLoad = 2827 Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(), 2828 GroupMask, PoisonVec, "wide.masked.vec"); 2829 } 2830 else 2831 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], 2832 Group->getAlign(), "wide.vec"); 2833 Group->addMetadata(NewLoad); 2834 NewLoads.push_back(NewLoad); 2835 } 2836 2837 // For each member in the group, shuffle out the appropriate data from the 2838 // wide loads. 2839 unsigned J = 0; 2840 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2841 Instruction *Member = Group->getMember(I); 2842 2843 // Skip the gaps in the group. 2844 if (!Member) 2845 continue; 2846 2847 auto StrideMask = 2848 createStrideMask(I, InterleaveFactor, VF.getKnownMinValue()); 2849 for (unsigned Part = 0; Part < UF; Part++) { 2850 Value *StridedVec = Builder.CreateShuffleVector( 2851 NewLoads[Part], StrideMask, "strided.vec"); 2852 2853 // If this member has different type, cast the result type. 2854 if (Member->getType() != ScalarTy) { 2855 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2856 VectorType *OtherVTy = VectorType::get(Member->getType(), VF); 2857 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); 2858 } 2859 2860 if (Group->isReverse()) 2861 StridedVec = reverseVector(StridedVec); 2862 2863 State.set(VPDefs[J], StridedVec, Part); 2864 } 2865 ++J; 2866 } 2867 return; 2868 } 2869 2870 // The sub vector type for current instruction. 2871 auto *SubVT = VectorType::get(ScalarTy, VF); 2872 2873 // Vectorize the interleaved store group. 
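  // A gap in the store group is handled by masking out the missing member's
  // elements. E.g. for factor = 3 with member 1 absent and VF = 4, the gaps
  // mask is <1,0,1, 1,0,1, 1,0,1, 1,0,1>, so those positions are never stored.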
2874 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); 2875 assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) && 2876 "masked interleaved groups are not allowed."); 2877 assert((!MaskForGaps || !VF.isScalable()) && 2878 "masking gaps for scalable vectors is not yet supported."); 2879 for (unsigned Part = 0; Part < UF; Part++) { 2880 // Collect the stored vector from each member. 2881 SmallVector<Value *, 4> StoredVecs; 2882 for (unsigned i = 0; i < InterleaveFactor; i++) { 2883 assert((Group->getMember(i) || MaskForGaps) && 2884 "Fail to get a member from an interleaved store group"); 2885 Instruction *Member = Group->getMember(i); 2886 2887 // Skip the gaps in the group. 2888 if (!Member) { 2889 Value *Undef = PoisonValue::get(SubVT); 2890 StoredVecs.push_back(Undef); 2891 continue; 2892 } 2893 2894 Value *StoredVec = State.get(StoredValues[i], Part); 2895 2896 if (Group->isReverse()) 2897 StoredVec = reverseVector(StoredVec); 2898 2899 // If this member has different type, cast it to a unified type. 2900 2901 if (StoredVec->getType() != SubVT) 2902 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL); 2903 2904 StoredVecs.push_back(StoredVec); 2905 } 2906 2907 // Concatenate all vectors into a wide vector. 2908 Value *WideVec = concatenateVectors(Builder, StoredVecs); 2909 2910 // Interleave the elements in the wide vector. 2911 Value *IVec = Builder.CreateShuffleVector( 2912 WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor), 2913 "interleaved.vec"); 2914 2915 Instruction *NewStoreInstr; 2916 if (BlockInMask || MaskForGaps) { 2917 Value *GroupMask = MaskForGaps; 2918 if (BlockInMask) { 2919 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2920 Value *ShuffledMask = Builder.CreateShuffleVector( 2921 BlockInMaskPart, 2922 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2923 "interleaved.mask"); 2924 GroupMask = MaskForGaps ? Builder.CreateBinOp(Instruction::And, 2925 ShuffledMask, MaskForGaps) 2926 : ShuffledMask; 2927 } 2928 NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part], 2929 Group->getAlign(), GroupMask); 2930 } else 2931 NewStoreInstr = 2932 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign()); 2933 2934 Group->addMetadata(NewStoreInstr); 2935 } 2936 } 2937 2938 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, 2939 VPReplicateRecipe *RepRecipe, 2940 const VPIteration &Instance, 2941 bool IfPredicateInstr, 2942 VPTransformState &State) { 2943 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 2944 2945 // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for 2946 // the first lane and part. 2947 if (isa<NoAliasScopeDeclInst>(Instr)) 2948 if (!Instance.isFirstIteration()) 2949 return; 2950 2951 setDebugLocFromInst(Instr); 2952 2953 // Does this instruction return a value ? 2954 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 2955 2956 Instruction *Cloned = Instr->clone(); 2957 if (!IsVoidRetTy) 2958 Cloned->setName(Instr->getName() + ".cloned"); 2959 2960 // If the scalarized instruction contributes to the address computation of a 2961 // widen masked load/store which was in a basic block that needed predication 2962 // and is not predicated after vectorization, we can't propagate 2963 // poison-generating flags (nuw/nsw, exact, inbounds, etc.). The scalarized 2964 // instruction could feed a poison value to the base address of the widen 2965 // load/store. 
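  // E.g. an "inbounds" GEP that was only executed under a condition in the
  // original loop may yield poison for lanes where that condition is false;
  // dropping inbounds/nsw/nuw below keeps the now-unconditional address
  // computation well defined for every lane.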
2966 if (State.MayGeneratePoisonRecipes.count(RepRecipe) > 0) 2967 Cloned->dropPoisonGeneratingFlags(); 2968 2969 State.Builder.SetInsertPoint(Builder.GetInsertBlock(), 2970 Builder.GetInsertPoint()); 2971 // Replace the operands of the cloned instructions with their scalar 2972 // equivalents in the new loop. 2973 for (auto &I : enumerate(RepRecipe->operands())) { 2974 auto InputInstance = Instance; 2975 VPValue *Operand = I.value(); 2976 if (State.Plan->isUniformAfterVectorization(Operand)) 2977 InputInstance.Lane = VPLane::getFirstLane(); 2978 Cloned->setOperand(I.index(), State.get(Operand, InputInstance)); 2979 } 2980 addNewMetadata(Cloned, Instr); 2981 2982 // Place the cloned scalar in the new loop. 2983 Builder.Insert(Cloned); 2984 2985 State.set(RepRecipe, Cloned, Instance); 2986 2987 // If we just cloned a new assumption, add it the assumption cache. 2988 if (auto *II = dyn_cast<AssumeInst>(Cloned)) 2989 AC->registerAssumption(II); 2990 2991 // End if-block. 2992 if (IfPredicateInstr) 2993 PredicatedInstructions.push_back(Cloned); 2994 } 2995 2996 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start, 2997 Value *End, Value *Step, 2998 Instruction *DL) { 2999 BasicBlock *Header = L->getHeader(); 3000 BasicBlock *Latch = L->getLoopLatch(); 3001 // As we're just creating this loop, it's possible no latch exists 3002 // yet. If so, use the header as this will be a single block loop. 3003 if (!Latch) 3004 Latch = Header; 3005 3006 IRBuilder<> B(&*Header->getFirstInsertionPt()); 3007 Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction); 3008 setDebugLocFromInst(OldInst, &B); 3009 auto *Induction = B.CreatePHI(Start->getType(), 2, "index"); 3010 3011 B.SetInsertPoint(Latch->getTerminator()); 3012 setDebugLocFromInst(OldInst, &B); 3013 3014 // Create i+1 and fill the PHINode. 3015 // 3016 // If the tail is not folded, we know that End - Start >= Step (either 3017 // statically or through the minimum iteration checks). We also know that both 3018 // Start % Step == 0 and End % Step == 0. We exit the vector loop if %IV + 3019 // %Step == %End. Hence we must exit the loop before %IV + %Step unsigned 3020 // overflows and we can mark the induction increment as NUW. 3021 Value *Next = B.CreateAdd(Induction, Step, "index.next", 3022 /*NUW=*/!Cost->foldTailByMasking(), /*NSW=*/false); 3023 Induction->addIncoming(Start, L->getLoopPreheader()); 3024 Induction->addIncoming(Next, Latch); 3025 // Create the compare. 3026 Value *ICmp = B.CreateICmpEQ(Next, End); 3027 B.CreateCondBr(ICmp, L->getUniqueExitBlock(), Header); 3028 3029 // Now we have two terminators. Remove the old one from the block. 3030 Latch->getTerminator()->eraseFromParent(); 3031 3032 return Induction; 3033 } 3034 3035 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { 3036 if (TripCount) 3037 return TripCount; 3038 3039 assert(L && "Create Trip Count for null loop."); 3040 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 3041 // Find the loop boundaries. 3042 ScalarEvolution *SE = PSE.getSE(); 3043 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 3044 assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && 3045 "Invalid loop count"); 3046 3047 Type *IdxTy = Legal->getWidestInductionType(); 3048 assert(IdxTy && "No type for induction"); 3049 3050 // The exit count might have the type of i64 while the phi is i32. This can 3051 // happen if we have an induction variable that is sign extended before the 3052 // compare. 
The only way that we get a backedge taken count is that the 3053 // induction variable was signed and as such will not overflow. In such a case 3054 // truncation is legal. 3055 if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) > 3056 IdxTy->getPrimitiveSizeInBits()) 3057 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy); 3058 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); 3059 3060 // Get the total trip count from the count by adding 1. 3061 const SCEV *ExitCount = SE->getAddExpr( 3062 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 3063 3064 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); 3065 3066 // Expand the trip count and place the new instructions in the preheader. 3067 // Notice that the pre-header does not change, only the loop body. 3068 SCEVExpander Exp(*SE, DL, "induction"); 3069 3070 // Count holds the overall loop count (N). 3071 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), 3072 L->getLoopPreheader()->getTerminator()); 3073 3074 if (TripCount->getType()->isPointerTy()) 3075 TripCount = 3076 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", 3077 L->getLoopPreheader()->getTerminator()); 3078 3079 return TripCount; 3080 } 3081 3082 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { 3083 if (VectorTripCount) 3084 return VectorTripCount; 3085 3086 Value *TC = getOrCreateTripCount(L); 3087 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 3088 3089 Type *Ty = TC->getType(); 3090 // This is where we can make the step a runtime constant. 3091 Value *Step = createStepForVF(Builder, Ty, VF, UF); 3092 3093 // If the tail is to be folded by masking, round the number of iterations N 3094 // up to a multiple of Step instead of rounding down. This is done by first 3095 // adding Step-1 and then rounding down. Note that it's ok if this addition 3096 // overflows: the vector induction variable will eventually wrap to zero given 3097 // that it starts at zero and its Step is a power of two; the loop will then 3098 // exit, with the last early-exit vector comparison also producing all-true. 3099 if (Cost->foldTailByMasking()) { 3100 assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && 3101 "VF*UF must be a power of 2 when folding tail by masking"); 3102 assert(!VF.isScalable() && 3103 "Tail folding not yet supported for scalable vectors"); 3104 TC = Builder.CreateAdd( 3105 TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up"); 3106 } 3107 3108 // Now we need to generate the expression for the part of the loop that the 3109 // vectorized body will execute. This is equal to N - (N % Step) if scalar 3110 // iterations are not required for correctness, or N - Step, otherwise. Step 3111 // is equal to the vectorization factor (number of SIMD elements) times the 3112 // unroll factor (number of SIMD instructions). 3113 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); 3114 3115 // There are cases where we *must* run at least one iteration in the remainder 3116 // loop. See the cost model for when this can happen. If the step evenly 3117 // divides the trip count, we set the remainder to be equal to the step. If 3118 // the step does not evenly divide the trip count, no adjustment is necessary 3119 // since there will already be scalar iterations. Note that the minimum 3120 // iterations check ensures that N >= Step. 
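  // E.g. with N = 8, VF = 4 and UF = 1, a required scalar epilogue bumps the
  // remainder from 0 to 4: the vector loop runs a single iteration and the
  // scalar loop executes the remaining 4 iterations.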
3121 if (Cost->requiresScalarEpilogue(VF)) { 3122 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); 3123 R = Builder.CreateSelect(IsZero, Step, R); 3124 } 3125 3126 VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); 3127 3128 return VectorTripCount; 3129 } 3130 3131 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, 3132 const DataLayout &DL) { 3133 // Verify that V is a vector type with same number of elements as DstVTy. 3134 auto *DstFVTy = cast<FixedVectorType>(DstVTy); 3135 unsigned VF = DstFVTy->getNumElements(); 3136 auto *SrcVecTy = cast<FixedVectorType>(V->getType()); 3137 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); 3138 Type *SrcElemTy = SrcVecTy->getElementType(); 3139 Type *DstElemTy = DstFVTy->getElementType(); 3140 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 3141 "Vector elements must have same size"); 3142 3143 // Do a direct cast if element types are castable. 3144 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 3145 return Builder.CreateBitOrPointerCast(V, DstFVTy); 3146 } 3147 // V cannot be directly casted to desired vector type. 3148 // May happen when V is a floating point vector but DstVTy is a vector of 3149 // pointers or vice-versa. Handle this using a two-step bitcast using an 3150 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 3151 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 3152 "Only one type should be a pointer type"); 3153 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 3154 "Only one type should be a floating point type"); 3155 Type *IntTy = 3156 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 3157 auto *VecIntTy = FixedVectorType::get(IntTy, VF); 3158 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 3159 return Builder.CreateBitOrPointerCast(CastVal, DstFVTy); 3160 } 3161 3162 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, 3163 BasicBlock *Bypass) { 3164 Value *Count = getOrCreateTripCount(L); 3165 // Reuse existing vector loop preheader for TC checks. 3166 // Note that new preheader block is generated for vector loop. 3167 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 3168 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 3169 3170 // Generate code to check if the loop's trip count is less than VF * UF, or 3171 // equal to it in case a scalar epilogue is required; this implies that the 3172 // vector trip count is zero. This check also covers the case where adding one 3173 // to the backedge-taken count overflowed leading to an incorrect trip count 3174 // of zero. In this case we will also jump to the scalar loop. 3175 auto P = Cost->requiresScalarEpilogue(VF) ? ICmpInst::ICMP_ULE 3176 : ICmpInst::ICMP_ULT; 3177 3178 // If tail is to be folded, vector loop takes care of all iterations. 3179 Value *CheckMinIters = Builder.getFalse(); 3180 if (!Cost->foldTailByMasking()) { 3181 Value *Step = createStepForVF(Builder, Count->getType(), VF, UF); 3182 CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check"); 3183 } 3184 // Create new preheader for vector loop. 
3185 LoopVectorPreHeader = 3186 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, 3187 "vector.ph"); 3188 3189 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 3190 DT->getNode(Bypass)->getIDom()) && 3191 "TC check is expected to dominate Bypass"); 3192 3193 // Update dominator for Bypass & LoopExit (if needed). 3194 DT->changeImmediateDominator(Bypass, TCCheckBlock); 3195 if (!Cost->requiresScalarEpilogue(VF)) 3196 // If there is an epilogue which must run, there's no edge from the 3197 // middle block to exit blocks and thus no need to update the immediate 3198 // dominator of the exit blocks. 3199 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 3200 3201 ReplaceInstWithInst( 3202 TCCheckBlock->getTerminator(), 3203 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 3204 LoopBypassBlocks.push_back(TCCheckBlock); 3205 } 3206 3207 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { 3208 3209 BasicBlock *const SCEVCheckBlock = 3210 RTChecks.emitSCEVChecks(L, Bypass, LoopVectorPreHeader, LoopExitBlock); 3211 if (!SCEVCheckBlock) 3212 return nullptr; 3213 3214 assert(!(SCEVCheckBlock->getParent()->hasOptSize() || 3215 (OptForSizeBasedOnProfile && 3216 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) && 3217 "Cannot SCEV check stride or overflow when optimizing for size"); 3218 3219 3220 // Update dominator only if this is first RT check. 3221 if (LoopBypassBlocks.empty()) { 3222 DT->changeImmediateDominator(Bypass, SCEVCheckBlock); 3223 if (!Cost->requiresScalarEpilogue(VF)) 3224 // If there is an epilogue which must run, there's no edge from the 3225 // middle block to exit blocks and thus no need to update the immediate 3226 // dominator of the exit blocks. 3227 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock); 3228 } 3229 3230 LoopBypassBlocks.push_back(SCEVCheckBlock); 3231 AddedSafetyChecks = true; 3232 return SCEVCheckBlock; 3233 } 3234 3235 BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, 3236 BasicBlock *Bypass) { 3237 // VPlan-native path does not do any analysis for runtime checks currently. 3238 if (EnableVPlanNativePath) 3239 return nullptr; 3240 3241 BasicBlock *const MemCheckBlock = 3242 RTChecks.emitMemRuntimeChecks(L, Bypass, LoopVectorPreHeader); 3243 3244 // Check if we generated code that checks in runtime if arrays overlap. We put 3245 // the checks into a separate block to make the more common case of few 3246 // elements faster. 3247 if (!MemCheckBlock) 3248 return nullptr; 3249 3250 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) { 3251 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && 3252 "Cannot emit memory checks when optimizing for size, unless forced " 3253 "to vectorize."); 3254 ORE->emit([&]() { 3255 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize", 3256 L->getStartLoc(), L->getHeader()) 3257 << "Code-size may be reduced by not forcing " 3258 "vectorization, or by source-code modifications " 3259 "eliminating the need for runtime checks " 3260 "(e.g., adding 'restrict')."; 3261 }); 3262 } 3263 3264 LoopBypassBlocks.push_back(MemCheckBlock); 3265 3266 AddedSafetyChecks = true; 3267 3268 // We currently don't use LoopVersioning for the actual loop cloning but we 3269 // still use it to add the noalias metadata. 
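  // That is, memory accesses in the vector loop get alias.scope/noalias
  // metadata derived from the runtime pointer-check groups, so later passes
  // may assume the checked groups do not alias.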
3270 LVer = std::make_unique<LoopVersioning>( 3271 *Legal->getLAI(), 3272 Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, 3273 DT, PSE.getSE()); 3274 LVer->prepareNoAliasMetadata(); 3275 return MemCheckBlock; 3276 } 3277 3278 Value *InnerLoopVectorizer::emitTransformedIndex( 3279 IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL, 3280 const InductionDescriptor &ID, BasicBlock *VectorHeader) const { 3281 3282 SCEVExpander Exp(*SE, DL, "induction"); 3283 auto Step = ID.getStep(); 3284 auto StartValue = ID.getStartValue(); 3285 assert(Index->getType()->getScalarType() == Step->getType() && 3286 "Index scalar type does not match StepValue type"); 3287 3288 // Note: the IR at this point is broken. We cannot use SE to create any new 3289 // SCEV and then expand it, hoping that SCEV's simplification will give us 3290 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may 3291 // lead to various SCEV crashes. So all we can do is to use builder and rely 3292 // on InstCombine for future simplifications. Here we handle some trivial 3293 // cases only. 3294 auto CreateAdd = [&B](Value *X, Value *Y) { 3295 assert(X->getType() == Y->getType() && "Types don't match!"); 3296 if (auto *CX = dyn_cast<ConstantInt>(X)) 3297 if (CX->isZero()) 3298 return Y; 3299 if (auto *CY = dyn_cast<ConstantInt>(Y)) 3300 if (CY->isZero()) 3301 return X; 3302 return B.CreateAdd(X, Y); 3303 }; 3304 3305 // We allow X to be a vector type, in which case Y will potentially be 3306 // splatted into a vector with the same element count. 3307 auto CreateMul = [&B](Value *X, Value *Y) { 3308 assert(X->getType()->getScalarType() == Y->getType() && 3309 "Types don't match!"); 3310 if (auto *CX = dyn_cast<ConstantInt>(X)) 3311 if (CX->isOne()) 3312 return Y; 3313 if (auto *CY = dyn_cast<ConstantInt>(Y)) 3314 if (CY->isOne()) 3315 return X; 3316 VectorType *XVTy = dyn_cast<VectorType>(X->getType()); 3317 if (XVTy && !isa<VectorType>(Y->getType())) 3318 Y = B.CreateVectorSplat(XVTy->getElementCount(), Y); 3319 return B.CreateMul(X, Y); 3320 }; 3321 3322 // Get a suitable insert point for SCEV expansion. For blocks in the vector 3323 // loop, choose the end of the vector loop header (=VectorHeader), because 3324 // the DomTree is not kept up-to-date for additional blocks generated in the 3325 // vector loop. By using the header as insertion point, we guarantee that the 3326 // expanded instructions dominate all their uses. 
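  // For reference, the switch further below computes, per induction kind:
  //   IK_IntInduction: StartValue + Index * Step
  //   IK_PtrInduction: getelementptr StartValue, Index * Step
  //   IK_FpInduction:  StartValue fadd/fsub Index * Step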
3327 auto GetInsertPoint = [this, &B, VectorHeader]() { 3328 BasicBlock *InsertBB = B.GetInsertPoint()->getParent(); 3329 if (InsertBB != LoopVectorBody && 3330 LI->getLoopFor(VectorHeader) == LI->getLoopFor(InsertBB)) 3331 return VectorHeader->getTerminator(); 3332 return &*B.GetInsertPoint(); 3333 }; 3334 3335 switch (ID.getKind()) { 3336 case InductionDescriptor::IK_IntInduction: { 3337 assert(!isa<VectorType>(Index->getType()) && 3338 "Vector indices not supported for integer inductions yet"); 3339 assert(Index->getType() == StartValue->getType() && 3340 "Index type does not match StartValue type"); 3341 if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne()) 3342 return B.CreateSub(StartValue, Index); 3343 auto *Offset = CreateMul( 3344 Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint())); 3345 return CreateAdd(StartValue, Offset); 3346 } 3347 case InductionDescriptor::IK_PtrInduction: { 3348 assert(isa<SCEVConstant>(Step) && 3349 "Expected constant step for pointer induction"); 3350 return B.CreateGEP( 3351 ID.getElementType(), StartValue, 3352 CreateMul(Index, 3353 Exp.expandCodeFor(Step, Index->getType()->getScalarType(), 3354 GetInsertPoint()))); 3355 } 3356 case InductionDescriptor::IK_FpInduction: { 3357 assert(!isa<VectorType>(Index->getType()) && 3358 "Vector indices not supported for FP inductions yet"); 3359 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); 3360 auto InductionBinOp = ID.getInductionBinOp(); 3361 assert(InductionBinOp && 3362 (InductionBinOp->getOpcode() == Instruction::FAdd || 3363 InductionBinOp->getOpcode() == Instruction::FSub) && 3364 "Original bin op should be defined for FP induction"); 3365 3366 Value *StepValue = cast<SCEVUnknown>(Step)->getValue(); 3367 Value *MulExp = B.CreateFMul(StepValue, Index); 3368 return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, 3369 "induction"); 3370 } 3371 case InductionDescriptor::IK_NoInduction: 3372 return nullptr; 3373 } 3374 llvm_unreachable("invalid enum"); 3375 } 3376 3377 Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { 3378 LoopScalarBody = OrigLoop->getHeader(); 3379 LoopVectorPreHeader = OrigLoop->getLoopPreheader(); 3380 assert(LoopVectorPreHeader && "Invalid loop structure"); 3381 LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr 3382 assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF)) && 3383 "multiple exit loop without required epilogue?"); 3384 3385 LoopMiddleBlock = 3386 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 3387 LI, nullptr, Twine(Prefix) + "middle.block"); 3388 LoopScalarPreHeader = 3389 SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI, 3390 nullptr, Twine(Prefix) + "scalar.ph"); 3391 3392 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3393 3394 // Set up the middle block terminator. Two cases: 3395 // 1) If we know that we must execute the scalar epilogue, emit an 3396 // unconditional branch. 3397 // 2) Otherwise, we must have a single unique exit block (due to how we 3398 // implement the multiple exit case). In this case, set up a conditonal 3399 // branch from the middle block to the loop scalar preheader, and the 3400 // exit block. completeLoopSkeleton will update the condition to use an 3401 // iteration check, if required to decide whether to execute the remainder. 3402 BranchInst *BrInst = Cost->requiresScalarEpilogue(VF) ? 
3403 BranchInst::Create(LoopScalarPreHeader) : 3404 BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, 3405 Builder.getTrue()); 3406 BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3407 ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst); 3408 3409 // We intentionally don't let SplitBlock to update LoopInfo since 3410 // LoopVectorBody should belong to another loop than LoopVectorPreHeader. 3411 // LoopVectorBody is explicitly added to the correct place few lines later. 3412 LoopVectorBody = 3413 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 3414 nullptr, nullptr, Twine(Prefix) + "vector.body"); 3415 3416 // Update dominator for loop exit. 3417 if (!Cost->requiresScalarEpilogue(VF)) 3418 // If there is an epilogue which must run, there's no edge from the 3419 // middle block to exit blocks and thus no need to update the immediate 3420 // dominator of the exit blocks. 3421 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); 3422 3423 // Create and register the new vector loop. 3424 Loop *Lp = LI->AllocateLoop(); 3425 Loop *ParentLoop = OrigLoop->getParentLoop(); 3426 3427 // Insert the new loop into the loop nest and register the new basic blocks 3428 // before calling any utilities such as SCEV that require valid LoopInfo. 3429 if (ParentLoop) { 3430 ParentLoop->addChildLoop(Lp); 3431 } else { 3432 LI->addTopLevelLoop(Lp); 3433 } 3434 Lp->addBasicBlockToLoop(LoopVectorBody, *LI); 3435 return Lp; 3436 } 3437 3438 void InnerLoopVectorizer::createInductionResumeValues( 3439 Loop *L, Value *VectorTripCount, 3440 std::pair<BasicBlock *, Value *> AdditionalBypass) { 3441 assert(VectorTripCount && L && "Expected valid arguments"); 3442 assert(((AdditionalBypass.first && AdditionalBypass.second) || 3443 (!AdditionalBypass.first && !AdditionalBypass.second)) && 3444 "Inconsistent information about additional bypass."); 3445 // We are going to resume the execution of the scalar loop. 3446 // Go over all of the induction variables that we found and fix the 3447 // PHIs that are left in the scalar version of the loop. 3448 // The starting values of PHI nodes depend on the counter of the last 3449 // iteration in the vectorized loop. 3450 // If we come from a bypass edge then we need to start from the original 3451 // start value. 3452 for (auto &InductionEntry : Legal->getInductionVars()) { 3453 PHINode *OrigPhi = InductionEntry.first; 3454 InductionDescriptor II = InductionEntry.second; 3455 3456 // Create phi nodes to merge from the backedge-taken check block. 3457 PHINode *BCResumeVal = 3458 PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", 3459 LoopScalarPreHeader->getTerminator()); 3460 // Copy original phi DL over to the new one. 3461 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); 3462 Value *&EndValue = IVEndValues[OrigPhi]; 3463 Value *EndValueFromAdditionalBypass = AdditionalBypass.second; 3464 if (OrigPhi == OldInduction) { 3465 // We know what the end value is. 3466 EndValue = VectorTripCount; 3467 } else { 3468 IRBuilder<> B(L->getLoopPreheader()->getTerminator()); 3469 3470 // Fast-math-flags propagate from the original induction instruction. 
3471 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) 3472 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); 3473 3474 Type *StepType = II.getStep()->getType(); 3475 Instruction::CastOps CastOp = 3476 CastInst::getCastOpcode(VectorTripCount, true, StepType, true); 3477 Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd"); 3478 const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout(); 3479 EndValue = 3480 emitTransformedIndex(B, CRD, PSE.getSE(), DL, II, LoopVectorBody); 3481 EndValue->setName("ind.end"); 3482 3483 // Compute the end value for the additional bypass (if applicable). 3484 if (AdditionalBypass.first) { 3485 B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt())); 3486 CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true, 3487 StepType, true); 3488 CRD = 3489 B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd"); 3490 EndValueFromAdditionalBypass = 3491 emitTransformedIndex(B, CRD, PSE.getSE(), DL, II, LoopVectorBody); 3492 EndValueFromAdditionalBypass->setName("ind.end"); 3493 } 3494 } 3495 // The new PHI merges the original incoming value, in case of a bypass, 3496 // or the value at the end of the vectorized loop. 3497 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); 3498 3499 // Fix the scalar body counter (PHI node). 3500 // The old induction's phi node in the scalar body needs the truncated 3501 // value. 3502 for (BasicBlock *BB : LoopBypassBlocks) 3503 BCResumeVal->addIncoming(II.getStartValue(), BB); 3504 3505 if (AdditionalBypass.first) 3506 BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first, 3507 EndValueFromAdditionalBypass); 3508 3509 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); 3510 } 3511 } 3512 3513 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L, 3514 MDNode *OrigLoopID) { 3515 assert(L && "Expected valid loop."); 3516 3517 // The trip counts should be cached by now. 3518 Value *Count = getOrCreateTripCount(L); 3519 Value *VectorTripCount = getOrCreateVectorTripCount(L); 3520 3521 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3522 3523 // Add a check in the middle block to see if we have completed 3524 // all of the iterations in the first vector loop. Three cases: 3525 // 1) If we require a scalar epilogue, there is no conditional branch as 3526 // we unconditionally branch to the scalar preheader. Do nothing. 3527 // 2) If (N - N%VF) == N, then we *don't* need to run the remainder. 3528 // Thus if tail is to be folded, we know we don't need to run the 3529 // remainder and we can use the previous value for the condition (true). 3530 // 3) Otherwise, construct a runtime check. 3531 if (!Cost->requiresScalarEpilogue(VF) && !Cost->foldTailByMasking()) { 3532 Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, 3533 Count, VectorTripCount, "cmp.n", 3534 LoopMiddleBlock->getTerminator()); 3535 3536 // Here we use the same DebugLoc as the scalar loop latch terminator instead 3537 // of the corresponding compare because they may have ended up with 3538 // different line numbers and we want to avoid awkward line stepping while 3539 // debugging. Eg. if the compare has got a line number inside the loop. 3540 CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3541 cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN); 3542 } 3543 3544 // Get ready to start creating new instructions into the vectorized body. 
3545 assert(LoopVectorPreHeader == L->getLoopPreheader() && 3546 "Inconsistent vector loop preheader"); 3547 Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); 3548 3549 Optional<MDNode *> VectorizedLoopID = 3550 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 3551 LLVMLoopVectorizeFollowupVectorized}); 3552 if (VectorizedLoopID.hasValue()) { 3553 L->setLoopID(VectorizedLoopID.getValue()); 3554 3555 // Do not setAlreadyVectorized if loop attributes have been defined 3556 // explicitly. 3557 return LoopVectorPreHeader; 3558 } 3559 3560 // Keep all loop hints from the original loop on the vector loop (we'll 3561 // replace the vectorizer-specific hints below). 3562 if (MDNode *LID = OrigLoop->getLoopID()) 3563 L->setLoopID(LID); 3564 3565 LoopVectorizeHints Hints(L, true, *ORE); 3566 Hints.setAlreadyVectorized(); 3567 3568 #ifdef EXPENSIVE_CHECKS 3569 assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 3570 LI->verify(*DT); 3571 #endif 3572 3573 return LoopVectorPreHeader; 3574 } 3575 3576 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { 3577 /* 3578 In this function we generate a new loop. The new loop will contain 3579 the vectorized instructions while the old loop will continue to run the 3580 scalar remainder. 3581 3582 [ ] <-- loop iteration number check. 3583 / | 3584 / v 3585 | [ ] <-- vector loop bypass (may consist of multiple blocks). 3586 | / | 3587 | / v 3588 || [ ] <-- vector pre header. 3589 |/ | 3590 | v 3591 | [ ] \ 3592 | [ ]_| <-- vector loop. 3593 | | 3594 | v 3595 \ -[ ] <--- middle-block. 3596 \/ | 3597 /\ v 3598 | ->[ ] <--- new preheader. 3599 | | 3600 (opt) v <-- edge from middle to exit iff epilogue is not required. 3601 | [ ] \ 3602 | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue). 3603 \ | 3604 \ v 3605 >[ ] <-- exit block(s). 3606 ... 3607 */ 3608 3609 // Get the metadata of the original loop before it gets modified. 3610 MDNode *OrigLoopID = OrigLoop->getLoopID(); 3611 3612 // Workaround! Compute the trip count of the original loop and cache it 3613 // before we start modifying the CFG. This code has a systemic problem 3614 // wherein it tries to run analysis over partially constructed IR; this is 3615 // wrong, and not simply for SCEV. The trip count of the original loop 3616 // simply happens to be prone to hitting this in practice. In theory, we 3617 // can hit the same issue for any SCEV, or ValueTracking query done during 3618 // mutation. See PR49900. 3619 getOrCreateTripCount(OrigLoop); 3620 3621 // Create an empty vector loop, and prepare basic blocks for the runtime 3622 // checks. 3623 Loop *Lp = createVectorLoopSkeleton(""); 3624 3625 // Now, compare the new count to zero. If it is zero skip the vector loop and 3626 // jump to the scalar loop. This check also covers the case where the 3627 // backedge-taken count is uint##_max: adding one to it will overflow leading 3628 // to an incorrect trip count of zero. In this (rare) case we will also jump 3629 // to the scalar loop. 3630 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader); 3631 3632 // Generate the code to check any assumptions that we've made for SCEV 3633 // expressions. 3634 emitSCEVChecks(Lp, LoopScalarPreHeader); 3635 3636 // Generate the code that checks in runtime if arrays overlap. We put the 3637 // checks into a separate block to make the more common case of few elements 3638 // faster. 
3639 emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 3640 3641 // Some loops have a single integer induction variable, while other loops 3642 // don't. One example is c++ iterators that often have multiple pointer 3643 // induction variables. In the code below we also support a case where we 3644 // don't have a single induction variable. 3645 // 3646 // We try to obtain an induction variable from the original loop as hard 3647 // as possible. However if we don't find one that: 3648 // - is an integer 3649 // - counts from zero, stepping by one 3650 // - is the size of the widest induction variable type 3651 // then we create a new one. 3652 OldInduction = Legal->getPrimaryInduction(); 3653 Type *IdxTy = Legal->getWidestInductionType(); 3654 Value *StartIdx = ConstantInt::get(IdxTy, 0); 3655 // The loop step is equal to the vectorization factor (num of SIMD elements) 3656 // times the unroll factor (num of SIMD instructions). 3657 Builder.SetInsertPoint(&*Lp->getHeader()->getFirstInsertionPt()); 3658 Value *Step = createStepForVF(Builder, IdxTy, VF, UF); 3659 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 3660 Induction = 3661 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 3662 getDebugLocFromInstOrOperands(OldInduction)); 3663 3664 // Emit phis for the new starting index of the scalar loop. 3665 createInductionResumeValues(Lp, CountRoundDown); 3666 3667 return completeLoopSkeleton(Lp, OrigLoopID); 3668 } 3669 3670 // Fix up external users of the induction variable. At this point, we are 3671 // in LCSSA form, with all external PHIs that use the IV having one input value, 3672 // coming from the remainder loop. We need those PHIs to also have a correct 3673 // value for the IV when arriving directly from the middle block. 3674 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, 3675 const InductionDescriptor &II, 3676 Value *CountRoundDown, Value *EndValue, 3677 BasicBlock *MiddleBlock) { 3678 // There are two kinds of external IV usages - those that use the value 3679 // computed in the last iteration (the PHI) and those that use the penultimate 3680 // value (the value that feeds into the phi from the loop latch). 3681 // We allow both, but they, obviously, have different values. 3682 3683 assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block"); 3684 3685 DenseMap<Value *, Value *> MissingVals; 3686 3687 // An external user of the last iteration's value should see the value that 3688 // the remainder loop uses to initialize its own IV. 3689 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch()); 3690 for (User *U : PostInc->users()) { 3691 Instruction *UI = cast<Instruction>(U); 3692 if (!OrigLoop->contains(UI)) { 3693 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3694 MissingVals[UI] = EndValue; 3695 } 3696 } 3697 3698 // An external user of the penultimate value need to see EndValue - Step. 3699 // The simplest way to get this is to recompute it from the constituent SCEVs, 3700 // that is Start + (Step * (CRD - 1)). 3701 for (User *U : OrigPhi->users()) { 3702 auto *UI = cast<Instruction>(U); 3703 if (!OrigLoop->contains(UI)) { 3704 const DataLayout &DL = 3705 OrigLoop->getHeader()->getModule()->getDataLayout(); 3706 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3707 3708 IRBuilder<> B(MiddleBlock->getTerminator()); 3709 3710 // Fast-math-flags propagate from the original induction instruction. 
3711 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) 3712 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); 3713 3714 Value *CountMinusOne = B.CreateSub( 3715 CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1)); 3716 Value *CMO = 3717 !II.getStep()->getType()->isIntegerTy() 3718 ? B.CreateCast(Instruction::SIToFP, CountMinusOne, 3719 II.getStep()->getType()) 3720 : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType()); 3721 CMO->setName("cast.cmo"); 3722 Value *Escape = 3723 emitTransformedIndex(B, CMO, PSE.getSE(), DL, II, LoopVectorBody); 3724 Escape->setName("ind.escape"); 3725 MissingVals[UI] = Escape; 3726 } 3727 } 3728 3729 for (auto &I : MissingVals) { 3730 PHINode *PHI = cast<PHINode>(I.first); 3731 // One corner case we have to handle is two IVs "chasing" each-other, 3732 // that is %IV2 = phi [...], [ %IV1, %latch ] 3733 // In this case, if IV1 has an external use, we need to avoid adding both 3734 // "last value of IV1" and "penultimate value of IV2". So, verify that we 3735 // don't already have an incoming value for the middle block. 3736 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) 3737 PHI->addIncoming(I.second, MiddleBlock); 3738 } 3739 } 3740 3741 namespace { 3742 3743 struct CSEDenseMapInfo { 3744 static bool canHandle(const Instruction *I) { 3745 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || 3746 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); 3747 } 3748 3749 static inline Instruction *getEmptyKey() { 3750 return DenseMapInfo<Instruction *>::getEmptyKey(); 3751 } 3752 3753 static inline Instruction *getTombstoneKey() { 3754 return DenseMapInfo<Instruction *>::getTombstoneKey(); 3755 } 3756 3757 static unsigned getHashValue(const Instruction *I) { 3758 assert(canHandle(I) && "Unknown instruction!"); 3759 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), 3760 I->value_op_end())); 3761 } 3762 3763 static bool isEqual(const Instruction *LHS, const Instruction *RHS) { 3764 if (LHS == getEmptyKey() || RHS == getEmptyKey() || 3765 LHS == getTombstoneKey() || RHS == getTombstoneKey()) 3766 return LHS == RHS; 3767 return LHS->isIdenticalTo(RHS); 3768 } 3769 }; 3770 3771 } // end anonymous namespace 3772 3773 ///Perform cse of induction variable instructions. 3774 static void cse(BasicBlock *BB) { 3775 // Perform simple cse. 3776 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 3777 for (Instruction &In : llvm::make_early_inc_range(*BB)) { 3778 if (!CSEDenseMapInfo::canHandle(&In)) 3779 continue; 3780 3781 // Check if we can replace this instruction with any of the 3782 // visited instructions. 3783 if (Instruction *V = CSEMap.lookup(&In)) { 3784 In.replaceAllUsesWith(V); 3785 In.eraseFromParent(); 3786 continue; 3787 } 3788 3789 CSEMap[&In] = &In; 3790 } 3791 } 3792 3793 InstructionCost 3794 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF, 3795 bool &NeedToScalarize) const { 3796 Function *F = CI->getCalledFunction(); 3797 Type *ScalarRetTy = CI->getType(); 3798 SmallVector<Type *, 4> Tys, ScalarTys; 3799 for (auto &ArgOp : CI->args()) 3800 ScalarTys.push_back(ArgOp->getType()); 3801 3802 // Estimate cost of scalarized vector call. The source operands are assumed 3803 // to be vectors, so we need to extract individual elements from there, 3804 // execute VF scalar calls, and then gather the result into the vector return 3805 // value. 
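  // Roughly: VF * cost(scalar call) + scalarization overhead for extracting
  // the arguments and inserting the results; this is later compared against
  // the cost of a real vector library call, if one exists.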
3806 InstructionCost ScalarCallCost = 3807 TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput); 3808 if (VF.isScalar()) 3809 return ScalarCallCost; 3810 3811 // Compute corresponding vector type for return value and arguments. 3812 Type *RetTy = ToVectorTy(ScalarRetTy, VF); 3813 for (Type *ScalarTy : ScalarTys) 3814 Tys.push_back(ToVectorTy(ScalarTy, VF)); 3815 3816 // Compute costs of unpacking argument values for the scalar calls and 3817 // packing the return values to a vector. 3818 InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF); 3819 3820 InstructionCost Cost = 3821 ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost; 3822 3823 // If we can't emit a vector call for this function, then the currently found 3824 // cost is the cost we need to return. 3825 NeedToScalarize = true; 3826 VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 3827 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); 3828 3829 if (!TLI || CI->isNoBuiltin() || !VecFunc) 3830 return Cost; 3831 3832 // If the corresponding vector cost is cheaper, return its cost. 3833 InstructionCost VectorCallCost = 3834 TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput); 3835 if (VectorCallCost < Cost) { 3836 NeedToScalarize = false; 3837 Cost = VectorCallCost; 3838 } 3839 return Cost; 3840 } 3841 3842 static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) { 3843 if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy())) 3844 return Elt; 3845 return VectorType::get(Elt, VF); 3846 } 3847 3848 InstructionCost 3849 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, 3850 ElementCount VF) const { 3851 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 3852 assert(ID && "Expected intrinsic call!"); 3853 Type *RetTy = MaybeVectorizeType(CI->getType(), VF); 3854 FastMathFlags FMF; 3855 if (auto *FPMO = dyn_cast<FPMathOperator>(CI)) 3856 FMF = FPMO->getFastMathFlags(); 3857 3858 SmallVector<const Value *> Arguments(CI->args()); 3859 FunctionType *FTy = CI->getCalledFunction()->getFunctionType(); 3860 SmallVector<Type *> ParamTys; 3861 std::transform(FTy->param_begin(), FTy->param_end(), 3862 std::back_inserter(ParamTys), 3863 [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); }); 3864 3865 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF, 3866 dyn_cast<IntrinsicInst>(CI)); 3867 return TTI.getIntrinsicInstrCost(CostAttrs, 3868 TargetTransformInfo::TCK_RecipThroughput); 3869 } 3870 3871 static Type *smallestIntegerVectorType(Type *T1, Type *T2) { 3872 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3873 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3874 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; 3875 } 3876 3877 static Type *largestIntegerVectorType(Type *T1, Type *T2) { 3878 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3879 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3880 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; 3881 } 3882 3883 void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) { 3884 // For every instruction `I` in MinBWs, truncate the operands, create a 3885 // truncated version of `I` and reextend its result. InstCombine runs 3886 // later and will remove any ext/trunc pairs. 
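  // E.g. a <4 x i32> add whose result is known to need only 8 bits becomes:
  //   %a8 = trunc <4 x i32> %a to <4 x i8>
  //   %b8 = trunc <4 x i32> %b to <4 x i8>
  //   %r8 = add <4 x i8> %a8, %b8
  //   %r  = zext <4 x i8> %r8 to <4 x i32>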
3887 SmallPtrSet<Value *, 4> Erased; 3888 for (const auto &KV : Cost->getMinimalBitwidths()) { 3889 // If the value wasn't vectorized, we must maintain the original scalar 3890 // type. The absence of the value from State indicates that it 3891 // wasn't vectorized. 3892 // FIXME: Should not rely on getVPValue at this point. 3893 VPValue *Def = State.Plan->getVPValue(KV.first, true); 3894 if (!State.hasAnyVectorValue(Def)) 3895 continue; 3896 for (unsigned Part = 0; Part < UF; ++Part) { 3897 Value *I = State.get(Def, Part); 3898 if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I)) 3899 continue; 3900 Type *OriginalTy = I->getType(); 3901 Type *ScalarTruncatedTy = 3902 IntegerType::get(OriginalTy->getContext(), KV.second); 3903 auto *TruncatedTy = VectorType::get( 3904 ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getElementCount()); 3905 if (TruncatedTy == OriginalTy) 3906 continue; 3907 3908 IRBuilder<> B(cast<Instruction>(I)); 3909 auto ShrinkOperand = [&](Value *V) -> Value * { 3910 if (auto *ZI = dyn_cast<ZExtInst>(V)) 3911 if (ZI->getSrcTy() == TruncatedTy) 3912 return ZI->getOperand(0); 3913 return B.CreateZExtOrTrunc(V, TruncatedTy); 3914 }; 3915 3916 // The actual instruction modification depends on the instruction type, 3917 // unfortunately. 3918 Value *NewI = nullptr; 3919 if (auto *BO = dyn_cast<BinaryOperator>(I)) { 3920 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)), 3921 ShrinkOperand(BO->getOperand(1))); 3922 3923 // Any wrapping introduced by shrinking this operation shouldn't be 3924 // considered undefined behavior. So, we can't unconditionally copy 3925 // arithmetic wrapping flags to NewI. 3926 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false); 3927 } else if (auto *CI = dyn_cast<ICmpInst>(I)) { 3928 NewI = 3929 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)), 3930 ShrinkOperand(CI->getOperand(1))); 3931 } else if (auto *SI = dyn_cast<SelectInst>(I)) { 3932 NewI = B.CreateSelect(SI->getCondition(), 3933 ShrinkOperand(SI->getTrueValue()), 3934 ShrinkOperand(SI->getFalseValue())); 3935 } else if (auto *CI = dyn_cast<CastInst>(I)) { 3936 switch (CI->getOpcode()) { 3937 default: 3938 llvm_unreachable("Unhandled cast!"); 3939 case Instruction::Trunc: 3940 NewI = ShrinkOperand(CI->getOperand(0)); 3941 break; 3942 case Instruction::SExt: 3943 NewI = B.CreateSExtOrTrunc( 3944 CI->getOperand(0), 3945 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3946 break; 3947 case Instruction::ZExt: 3948 NewI = B.CreateZExtOrTrunc( 3949 CI->getOperand(0), 3950 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3951 break; 3952 } 3953 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { 3954 auto Elements0 = 3955 cast<VectorType>(SI->getOperand(0)->getType())->getElementCount(); 3956 auto *O0 = B.CreateZExtOrTrunc( 3957 SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0)); 3958 auto Elements1 = 3959 cast<VectorType>(SI->getOperand(1)->getType())->getElementCount(); 3960 auto *O1 = B.CreateZExtOrTrunc( 3961 SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1)); 3962 3963 NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask()); 3964 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) { 3965 // Don't do anything with the operands, just extend the result. 
3966 continue; 3967 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { 3968 auto Elements = 3969 cast<VectorType>(IE->getOperand(0)->getType())->getElementCount(); 3970 auto *O0 = B.CreateZExtOrTrunc( 3971 IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 3972 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); 3973 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); 3974 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { 3975 auto Elements = 3976 cast<VectorType>(EE->getOperand(0)->getType())->getElementCount(); 3977 auto *O0 = B.CreateZExtOrTrunc( 3978 EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 3979 NewI = B.CreateExtractElement(O0, EE->getOperand(2)); 3980 } else { 3981 // If we don't know what to do, be conservative and don't do anything. 3982 continue; 3983 } 3984 3985 // Lastly, extend the result. 3986 NewI->takeName(cast<Instruction>(I)); 3987 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); 3988 I->replaceAllUsesWith(Res); 3989 cast<Instruction>(I)->eraseFromParent(); 3990 Erased.insert(I); 3991 State.reset(Def, Res, Part); 3992 } 3993 } 3994 3995 // We'll have created a bunch of ZExts that are now parentless. Clean up. 3996 for (const auto &KV : Cost->getMinimalBitwidths()) { 3997 // If the value wasn't vectorized, we must maintain the original scalar 3998 // type. The absence of the value from State indicates that it 3999 // wasn't vectorized. 4000 // FIXME: Should not rely on getVPValue at this point. 4001 VPValue *Def = State.Plan->getVPValue(KV.first, true); 4002 if (!State.hasAnyVectorValue(Def)) 4003 continue; 4004 for (unsigned Part = 0; Part < UF; ++Part) { 4005 Value *I = State.get(Def, Part); 4006 ZExtInst *Inst = dyn_cast<ZExtInst>(I); 4007 if (Inst && Inst->use_empty()) { 4008 Value *NewI = Inst->getOperand(0); 4009 Inst->eraseFromParent(); 4010 State.reset(Def, NewI, Part); 4011 } 4012 } 4013 } 4014 } 4015 4016 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) { 4017 // Insert truncates and extends for any truncated instructions as hints to 4018 // InstCombine. 4019 if (VF.isVector()) 4020 truncateToMinimalBitwidths(State); 4021 4022 // Fix widened non-induction PHIs by setting up the PHI operands. 4023 if (OrigPHIsToFix.size()) { 4024 assert(EnableVPlanNativePath && 4025 "Unexpected non-induction PHIs for fixup in non VPlan-native path"); 4026 fixNonInductionPHIs(State); 4027 } 4028 4029 // At this point every instruction in the original loop is widened to a 4030 // vector form. Now we need to fix the recurrences in the loop. These PHI 4031 // nodes are currently empty because we did not want to introduce cycles. 4032 // This is the second stage of vectorizing recurrences. 4033 fixCrossIterationPHIs(State); 4034 4035 // Forget the original basic block. 4036 PSE.getSE()->forgetLoop(OrigLoop); 4037 4038 // If we inserted an edge from the middle block to the unique exit block, 4039 // update uses outside the loop (phis) to account for the newly inserted 4040 // edge. 4041 if (!Cost->requiresScalarEpilogue(VF)) { 4042 // Fix-up external users of the induction variables. 4043 for (auto &Entry : Legal->getInductionVars()) 4044 fixupIVUsers(Entry.first, Entry.second, 4045 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)), 4046 IVEndValues[Entry.first], LoopMiddleBlock); 4047 4048 fixLCSSAPHIs(State); 4049 } 4050 4051 for (Instruction *PI : PredicatedInstructions) 4052 sinkScalarOperands(&*PI); 4053 4054 // Remove redundant induction instructions. 
  cse(LoopVectorBody);

  // Set/update profile weights for the vector and remainder loops as original
  // loop iterations are now distributed among them. Note that the original
  // loop, represented by LoopScalarBody, becomes the remainder loop after
  // vectorization.
  //
  // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
  // end up with a slightly less accurate result, but that should be OK since
  // the profile is not inherently precise anyway. Note also that a possible
  // bypass of the vector code caused by legality checks is ignored,
  // optimistically assigning all the weight to the vector loop.
  //
  // For scalable vectorization we can't know at compile time how many
  // iterations of the loop are handled in one vector iteration, so instead
  // assume a pessimistic vscale of '1'.
  setProfileInfoAfterUnrolling(
      LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody),
      LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF);
}

void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) {
  // In order to support recurrences we need to be able to vectorize Phi nodes.
  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
  // stage #2: We now need to fix the recurrences by adding incoming edges to
  // the currently empty PHI nodes. At this point every instruction in the
  // original loop is widened to a vector form so we can use them to construct
  // the incoming edges.
  VPBasicBlock *Header = State.Plan->getEntry()->getEntryBasicBlock();
  for (VPRecipeBase &R : Header->phis()) {
    if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R))
      fixReduction(ReductionPhi, State);
    else if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
      fixFirstOrderRecurrence(FOR, State);
  }
}

void InnerLoopVectorizer::fixFirstOrderRecurrence(VPWidenPHIRecipe *PhiR,
                                                  VPTransformState &State) {
  // This is the second phase of vectorizing first-order recurrences. An
  // overview of the transformation is described below. Suppose we have the
  // following loop.
  //
  //   for (int i = 0; i < n; ++i)
  //     b[i] = a[i] - a[i - 1];
  //
  // There is a first-order recurrence on "a". For this loop, the shorthand
  // scalar IR looks like:
  //
  //   scalar.ph:
  //     s_init = a[-1]
  //     br scalar.body
  //
  //   scalar.body:
  //     i = phi [0, scalar.ph], [i+1, scalar.body]
  //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
  //     s2 = a[i]
  //     b[i] = s2 - s1
  //     br cond, scalar.body, ...
  //
  // In this example, s1 is a recurrence because its value depends on the
  // previous iteration. In the first phase of vectorization, we created a
  // vector phi v1 for s1. We now complete the vectorization and produce the
  // shorthand vector IR shown below (for VF = 4, UF = 1).
  //
  //   vector.ph:
  //     v_init = vector(..., ..., ..., a[-1])
  //     br vector.body
  //
  //   vector.body:
  //     i = phi [0, vector.ph], [i+4, vector.body]
  //     v1 = phi [v_init, vector.ph], [v2, vector.body]
  //     v2 = a[i, i+1, i+2, i+3];
  //     v3 = vector(v1(3), v2(0, 1, 2))
  //     b[i, i+1, i+2, i+3] = v2 - v3
  //     br cond, vector.body, middle.block
  //
  //   middle.block:
  //     x = v2(3)
  //     br scalar.ph
  //
  //   scalar.ph:
  //     s_init = phi [x, middle.block], [a[-1], otherwise]
  //     br scalar.body
  //
  // After the vector loop completes execution, we extract the next value of
  // the recurrence (x) to use as the initial value in the scalar loop.

  // Extract the last vector element in the middle block. This will be the
  // initial value for the recurrence when jumping to the scalar loop.
  VPValue *PreviousDef = PhiR->getBackedgeValue();
  Value *Incoming = State.get(PreviousDef, UF - 1);
  auto *ExtractForScalar = Incoming;
  auto *IdxTy = Builder.getInt32Ty();
  if (VF.isVector()) {
    auto *One = ConstantInt::get(IdxTy, 1);
    Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
    auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
    auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
    ExtractForScalar = Builder.CreateExtractElement(ExtractForScalar, LastIdx,
                                                    "vector.recur.extract");
  }
  // Extract the second-to-last element in the middle block if the
  // Phi is used outside the loop. We need to extract the phi itself
  // and not the last element (the phi update in the current iteration). This
  // will be the value when jumping to the exit block from LoopMiddleBlock
  // when the scalar loop is not run at all.
  Value *ExtractForPhiUsedOutsideLoop = nullptr;
  if (VF.isVector()) {
    auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
    auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2));
    ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
        Incoming, Idx, "vector.recur.extract.for.phi");
  } else if (UF > 1)
    // When the loop is unrolled without vectorizing, initialize
    // ExtractForPhiUsedOutsideLoop with the value just prior to the unrolled
    // value of `Incoming`. This is analogous to the vectorized case above:
    // extracting the second-to-last element when VF > 1.
    ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2);

  // Fix the initial value of the original recurrence in the scalar loop.
  Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
  PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
  auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
  auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue();
  for (auto *BB : predecessors(LoopScalarPreHeader)) {
    auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
    Start->addIncoming(Incoming, BB);
  }

  Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
  Phi->setName("scalar.recur");

  // Finally, fix users of the recurrence outside the loop. The users will need
  // either the last value of the scalar recurrence or the last value of the
  // vector recurrence we extracted in the middle block. Since the loop is in
  // LCSSA form, we just need to find all the phi nodes for the original scalar
  // recurrence in the exit block, and then add an edge for the middle block.
  // Note that LCSSA does not imply single entry when the original scalar loop
  // had multiple exiting edges (as we always run the last iteration in the
  // scalar epilogue); in that case, there is no edge from middle to exit and
  // thus no phis which need to be updated.
  if (!Cost->requiresScalarEpilogue(VF))
    for (PHINode &LCSSAPhi : LoopExitBlock->phis())
      if (llvm::is_contained(LCSSAPhi.incoming_values(), Phi))
        LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
}

void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
                                       VPTransformState &State) {
  PHINode *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
  // Get its reduction variable descriptor.
  assert(Legal->isReductionVariable(OrigPhi) &&
         "Unable to find the reduction variable");
  const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();

  RecurKind RK = RdxDesc.getRecurrenceKind();
  TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
  Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
  setDebugLocFromInst(ReductionStartValue);

  VPValue *LoopExitInstDef = PhiR->getBackedgeValue();
  // This is the vector-clone of the value that leaves the loop.
  Type *VecTy = State.get(LoopExitInstDef, 0)->getType();

  // Wrap flags are in general invalid after vectorization, clear them.
  clearReductionWrapFlags(RdxDesc, State);

  // Before each round, move the insertion point right between
  // the PHIs and the values we are going to write.
  // This allows us to write both PHINodes and the extractelement
  // instructions.
  Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());

  setDebugLocFromInst(LoopExitInst);

  Type *PhiTy = OrigPhi->getType();
  // If tail is folded by masking, the vector value to leave the loop should be
  // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
  // instead of the former. For an inloop reduction the reduction will already
  // be predicated, and does not need to be handled here.
  if (Cost->foldTailByMasking() && !PhiR->isInLoop()) {
    for (unsigned Part = 0; Part < UF; ++Part) {
      Value *VecLoopExitInst = State.get(LoopExitInstDef, Part);
      Value *Sel = nullptr;
      for (User *U : VecLoopExitInst->users()) {
        if (isa<SelectInst>(U)) {
          assert(!Sel && "Reduction exit feeding two selects");
          Sel = U;
        } else
          assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
      }
      assert(Sel && "Reduction exit feeds no select");
      State.reset(LoopExitInstDef, Sel, Part);

      // If the target can create a predicated operator for the reduction at no
      // extra cost in the loop (for example a predicated vadd), it can be
      // cheaper for the select to remain in the loop than be sunk out of it,
      // and so use the select value for the phi instead of the old
      // LoopExitValue.
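      //
      // For example (illustrative shorthand IR): with a masked vector add
      // reduction,
      //   %add = add <4 x i32> %vec.phi, %x
      //   %sel = select <4 x i1> %mask, <4 x i32> %add, <4 x i32> %vec.phi
      // a target with a predicated vector add can fold %sel into the add
      // itself, so the reduction phi is rewired to take %sel from the latch
      // instead of %add.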
4254 if (PreferPredicatedReductionSelect || 4255 TTI->preferPredicatedReductionSelect( 4256 RdxDesc.getOpcode(), PhiTy, 4257 TargetTransformInfo::ReductionFlags())) { 4258 auto *VecRdxPhi = 4259 cast<PHINode>(State.get(PhiR, Part)); 4260 VecRdxPhi->setIncomingValueForBlock( 4261 LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel); 4262 } 4263 } 4264 } 4265 4266 // If the vector reduction can be performed in a smaller type, we truncate 4267 // then extend the loop exit value to enable InstCombine to evaluate the 4268 // entire expression in the smaller type. 4269 if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) { 4270 assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!"); 4271 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); 4272 Builder.SetInsertPoint( 4273 LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator()); 4274 VectorParts RdxParts(UF); 4275 for (unsigned Part = 0; Part < UF; ++Part) { 4276 RdxParts[Part] = State.get(LoopExitInstDef, Part); 4277 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4278 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy) 4279 : Builder.CreateZExt(Trunc, VecTy); 4280 for (User *U : llvm::make_early_inc_range(RdxParts[Part]->users())) 4281 if (U != Trunc) { 4282 U->replaceUsesOfWith(RdxParts[Part], Extnd); 4283 RdxParts[Part] = Extnd; 4284 } 4285 } 4286 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 4287 for (unsigned Part = 0; Part < UF; ++Part) { 4288 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4289 State.reset(LoopExitInstDef, RdxParts[Part], Part); 4290 } 4291 } 4292 4293 // Reduce all of the unrolled parts into a single vector. 4294 Value *ReducedPartRdx = State.get(LoopExitInstDef, 0); 4295 unsigned Op = RecurrenceDescriptor::getOpcode(RK); 4296 4297 // The middle block terminator has already been assigned a DebugLoc here (the 4298 // OrigLoop's single latch terminator). We want the whole middle block to 4299 // appear to execute on this line because: (a) it is all compiler generated, 4300 // (b) these instructions are always executed after evaluating the latch 4301 // conditional branch, and (c) other passes may add new predecessors which 4302 // terminate on this line. This is the easiest way to ensure we don't 4303 // accidentally cause an extra step back into the loop while debugging. 4304 setDebugLocFromInst(LoopMiddleBlock->getTerminator()); 4305 if (PhiR->isOrdered()) 4306 ReducedPartRdx = State.get(LoopExitInstDef, UF - 1); 4307 else { 4308 // Floating-point operations should have some FMF to enable the reduction. 4309 IRBuilderBase::FastMathFlagGuard FMFG(Builder); 4310 Builder.setFastMathFlags(RdxDesc.getFastMathFlags()); 4311 for (unsigned Part = 1; Part < UF; ++Part) { 4312 Value *RdxPart = State.get(LoopExitInstDef, Part); 4313 if (Op != Instruction::ICmp && Op != Instruction::FCmp) { 4314 ReducedPartRdx = Builder.CreateBinOp( 4315 (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx"); 4316 } else if (RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK)) 4317 ReducedPartRdx = createSelectCmpOp(Builder, ReductionStartValue, RK, 4318 ReducedPartRdx, RdxPart); 4319 else 4320 ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart); 4321 } 4322 } 4323 4324 // Create the reduction after the loop. Note that inloop reductions create the 4325 // target reduction in the loop using a Reduction recipe. 
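  //
  // Purely as an illustration (hypothetical integer add reduction, VF = 4,
  // UF = 2): the loop above combines the unrolled parts, e.g.
  //   %bin.rdx = add <4 x i32> %rdx.part1, %rdx.part0
  // and the code below emits the horizontal reduction in the middle block,
  //   %rdx = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %bin.rdx)
  // extending the scalar result afterwards if the reduction was evaluated in
  // a narrower type.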
4326 if (VF.isVector() && !PhiR->isInLoop()) { 4327 ReducedPartRdx = 4328 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, OrigPhi); 4329 // If the reduction can be performed in a smaller type, we need to extend 4330 // the reduction to the wider type before we branch to the original loop. 4331 if (PhiTy != RdxDesc.getRecurrenceType()) 4332 ReducedPartRdx = RdxDesc.isSigned() 4333 ? Builder.CreateSExt(ReducedPartRdx, PhiTy) 4334 : Builder.CreateZExt(ReducedPartRdx, PhiTy); 4335 } 4336 4337 // Create a phi node that merges control-flow from the backedge-taken check 4338 // block and the middle block. 4339 PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx", 4340 LoopScalarPreHeader->getTerminator()); 4341 for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) 4342 BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]); 4343 BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); 4344 4345 // Now, we need to fix the users of the reduction variable 4346 // inside and outside of the scalar remainder loop. 4347 4348 // We know that the loop is in LCSSA form. We need to update the PHI nodes 4349 // in the exit blocks. See comment on analogous loop in 4350 // fixFirstOrderRecurrence for a more complete explaination of the logic. 4351 if (!Cost->requiresScalarEpilogue(VF)) 4352 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) 4353 if (llvm::is_contained(LCSSAPhi.incoming_values(), LoopExitInst)) 4354 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock); 4355 4356 // Fix the scalar loop reduction variable with the incoming reduction sum 4357 // from the vector body and from the backedge value. 4358 int IncomingEdgeBlockIdx = 4359 OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch()); 4360 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); 4361 // Pick the other block. 4362 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); 4363 OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); 4364 OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); 4365 } 4366 4367 void InnerLoopVectorizer::clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc, 4368 VPTransformState &State) { 4369 RecurKind RK = RdxDesc.getRecurrenceKind(); 4370 if (RK != RecurKind::Add && RK != RecurKind::Mul) 4371 return; 4372 4373 Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr(); 4374 assert(LoopExitInstr && "null loop exit instruction"); 4375 SmallVector<Instruction *, 8> Worklist; 4376 SmallPtrSet<Instruction *, 8> Visited; 4377 Worklist.push_back(LoopExitInstr); 4378 Visited.insert(LoopExitInstr); 4379 4380 while (!Worklist.empty()) { 4381 Instruction *Cur = Worklist.pop_back_val(); 4382 if (isa<OverflowingBinaryOperator>(Cur)) 4383 for (unsigned Part = 0; Part < UF; ++Part) { 4384 // FIXME: Should not rely on getVPValue at this point. 4385 Value *V = State.get(State.Plan->getVPValue(Cur, true), Part); 4386 cast<Instruction>(V)->dropPoisonGeneratingFlags(); 4387 } 4388 4389 for (User *U : Cur->users()) { 4390 Instruction *UI = cast<Instruction>(U); 4391 if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) && 4392 Visited.insert(UI).second) 4393 Worklist.push_back(UI); 4394 } 4395 } 4396 } 4397 4398 void InnerLoopVectorizer::fixLCSSAPHIs(VPTransformState &State) { 4399 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 4400 if (LCSSAPhi.getBasicBlockIndex(LoopMiddleBlock) != -1) 4401 // Some phis were already hand updated by the reduction and recurrence 4402 // code above, leave them alone. 
4403 continue; 4404 4405 auto *IncomingValue = LCSSAPhi.getIncomingValue(0); 4406 // Non-instruction incoming values will have only one value. 4407 4408 VPLane Lane = VPLane::getFirstLane(); 4409 if (isa<Instruction>(IncomingValue) && 4410 !Cost->isUniformAfterVectorization(cast<Instruction>(IncomingValue), 4411 VF)) 4412 Lane = VPLane::getLastLaneForVF(VF); 4413 4414 // Can be a loop invariant incoming value or the last scalar value to be 4415 // extracted from the vectorized loop. 4416 // FIXME: Should not rely on getVPValue at this point. 4417 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4418 Value *lastIncomingValue = 4419 OrigLoop->isLoopInvariant(IncomingValue) 4420 ? IncomingValue 4421 : State.get(State.Plan->getVPValue(IncomingValue, true), 4422 VPIteration(UF - 1, Lane)); 4423 LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock); 4424 } 4425 } 4426 4427 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { 4428 // The basic block and loop containing the predicated instruction. 4429 auto *PredBB = PredInst->getParent(); 4430 auto *VectorLoop = LI->getLoopFor(PredBB); 4431 4432 // Initialize a worklist with the operands of the predicated instruction. 4433 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end()); 4434 4435 // Holds instructions that we need to analyze again. An instruction may be 4436 // reanalyzed if we don't yet know if we can sink it or not. 4437 SmallVector<Instruction *, 8> InstsToReanalyze; 4438 4439 // Returns true if a given use occurs in the predicated block. Phi nodes use 4440 // their operands in their corresponding predecessor blocks. 4441 auto isBlockOfUsePredicated = [&](Use &U) -> bool { 4442 auto *I = cast<Instruction>(U.getUser()); 4443 BasicBlock *BB = I->getParent(); 4444 if (auto *Phi = dyn_cast<PHINode>(I)) 4445 BB = Phi->getIncomingBlock( 4446 PHINode::getIncomingValueNumForOperand(U.getOperandNo())); 4447 return BB == PredBB; 4448 }; 4449 4450 // Iteratively sink the scalarized operands of the predicated instruction 4451 // into the block we created for it. When an instruction is sunk, it's 4452 // operands are then added to the worklist. The algorithm ends after one pass 4453 // through the worklist doesn't sink a single instruction. 4454 bool Changed; 4455 do { 4456 // Add the instructions that need to be reanalyzed to the worklist, and 4457 // reset the changed indicator. 4458 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end()); 4459 InstsToReanalyze.clear(); 4460 Changed = false; 4461 4462 while (!Worklist.empty()) { 4463 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val()); 4464 4465 // We can't sink an instruction if it is a phi node, is not in the loop, 4466 // or may have side effects. 4467 if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) || 4468 I->mayHaveSideEffects()) 4469 continue; 4470 4471 // If the instruction is already in PredBB, check if we can sink its 4472 // operands. In that case, VPlan's sinkScalarOperands() succeeded in 4473 // sinking the scalar instruction I, hence it appears in PredBB; but it 4474 // may have failed to sink I's operands (recursively), which we try 4475 // (again) here. 4476 if (I->getParent() == PredBB) { 4477 Worklist.insert(I->op_begin(), I->op_end()); 4478 continue; 4479 } 4480 4481 // It's legal to sink the instruction if all its uses occur in the 4482 // predicated block. Otherwise, there's nothing to do yet, and we may 4483 // need to reanalyze the instruction. 
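      //
      // A hypothetical sketch of the effect (block and value names are
      // illustrative): a scalar address computation such as
      //   %gep = getelementptr inbounds i32, i32* %base, i64 %idx
      // whose only user is a scalarized, predicated store in pred.store.if
      // gets moved into pred.store.if; %base and %idx then re-enter the
      // worklist and may be sunk on a later pass.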
4484 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) { 4485 InstsToReanalyze.push_back(I); 4486 continue; 4487 } 4488 4489 // Move the instruction to the beginning of the predicated block, and add 4490 // it's operands to the worklist. 4491 I->moveBefore(&*PredBB->getFirstInsertionPt()); 4492 Worklist.insert(I->op_begin(), I->op_end()); 4493 4494 // The sinking may have enabled other instructions to be sunk, so we will 4495 // need to iterate. 4496 Changed = true; 4497 } 4498 } while (Changed); 4499 } 4500 4501 void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) { 4502 for (PHINode *OrigPhi : OrigPHIsToFix) { 4503 VPWidenPHIRecipe *VPPhi = 4504 cast<VPWidenPHIRecipe>(State.Plan->getVPValue(OrigPhi)); 4505 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0)); 4506 // Make sure the builder has a valid insert point. 4507 Builder.SetInsertPoint(NewPhi); 4508 for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) { 4509 VPValue *Inc = VPPhi->getIncomingValue(i); 4510 VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i); 4511 NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]); 4512 } 4513 } 4514 } 4515 4516 bool InnerLoopVectorizer::useOrderedReductions( 4517 const RecurrenceDescriptor &RdxDesc) { 4518 return Cost->useOrderedReductions(RdxDesc); 4519 } 4520 4521 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, 4522 VPWidenPHIRecipe *PhiR, 4523 VPTransformState &State) { 4524 PHINode *P = cast<PHINode>(PN); 4525 if (EnableVPlanNativePath) { 4526 // Currently we enter here in the VPlan-native path for non-induction 4527 // PHIs where all control flow is uniform. We simply widen these PHIs. 4528 // Create a vector phi with no operands - the vector phi operands will be 4529 // set at the end of vector code generation. 4530 Type *VecTy = (State.VF.isScalar()) 4531 ? PN->getType() 4532 : VectorType::get(PN->getType(), State.VF); 4533 Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi"); 4534 State.set(PhiR, VecPhi, 0); 4535 OrigPHIsToFix.push_back(P); 4536 4537 return; 4538 } 4539 4540 assert(PN->getParent() == OrigLoop->getHeader() && 4541 "Non-header phis should have been handled elsewhere"); 4542 4543 // In order to support recurrences we need to be able to vectorize Phi nodes. 4544 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 4545 // stage #1: We create a new vector PHI node with no incoming edges. We'll use 4546 // this value when we vectorize all of the instructions that use the PHI. 4547 4548 assert(!Legal->isReductionVariable(P) && 4549 "reductions should be handled elsewhere"); 4550 4551 setDebugLocFromInst(P); 4552 4553 // This PHINode must be an induction variable. 4554 // Make sure that we know about it. 4555 assert(Legal->getInductionVars().count(P) && "Not an induction variable"); 4556 4557 InductionDescriptor II = Legal->getInductionVars().lookup(P); 4558 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 4559 4560 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 4561 // which can be found from the original scalar operations. 4562 switch (II.getKind()) { 4563 case InductionDescriptor::IK_NoInduction: 4564 llvm_unreachable("Unknown induction"); 4565 case InductionDescriptor::IK_IntInduction: 4566 case InductionDescriptor::IK_FpInduction: 4567 llvm_unreachable("Integer/fp induction is handled elsewhere."); 4568 case InductionDescriptor::IK_PtrInduction: { 4569 // Handle the pointer induction variable case. 
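    //
    // Illustrative sketch of the scalarized path below (hypothetical values,
    // VF = 4, UF = 2, non-uniform): lane L of part P receives the scalar
    // pointer for logical iteration  induction + P * 4 + L, roughly
    //   next.gep = start + (induction + P * 4 + L) * step
    // materialized via emitTransformedIndex; no vector GEP is created.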
4570 assert(P->getType()->isPointerTy() && "Unexpected type."); 4571 4572 if (Cost->isScalarAfterVectorization(P, State.VF)) { 4573 // This is the normalized GEP that starts counting at zero. 4574 Value *PtrInd = 4575 Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType()); 4576 // Determine the number of scalars we need to generate for each unroll 4577 // iteration. If the instruction is uniform, we only need to generate the 4578 // first lane. Otherwise, we generate all VF values. 4579 bool IsUniform = Cost->isUniformAfterVectorization(P, State.VF); 4580 assert((IsUniform || !State.VF.isScalable()) && 4581 "Cannot scalarize a scalable VF"); 4582 unsigned Lanes = IsUniform ? 1 : State.VF.getFixedValue(); 4583 4584 for (unsigned Part = 0; Part < UF; ++Part) { 4585 Value *PartStart = 4586 createStepForVF(Builder, PtrInd->getType(), VF, Part); 4587 4588 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 4589 Value *Idx = Builder.CreateAdd( 4590 PartStart, ConstantInt::get(PtrInd->getType(), Lane)); 4591 Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); 4592 Value *SclrGep = emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), 4593 DL, II, State.CFG.PrevBB); 4594 SclrGep->setName("next.gep"); 4595 State.set(PhiR, SclrGep, VPIteration(Part, Lane)); 4596 } 4597 } 4598 return; 4599 } 4600 assert(isa<SCEVConstant>(II.getStep()) && 4601 "Induction step not a SCEV constant!"); 4602 Type *PhiType = II.getStep()->getType(); 4603 4604 // Build a pointer phi 4605 Value *ScalarStartValue = II.getStartValue(); 4606 Type *ScStValueType = ScalarStartValue->getType(); 4607 PHINode *NewPointerPhi = 4608 PHINode::Create(ScStValueType, 2, "pointer.phi", Induction); 4609 NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader); 4610 4611 // A pointer induction, performed by using a gep 4612 BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 4613 Instruction *InductionLoc = LoopLatch->getTerminator(); 4614 const SCEV *ScalarStep = II.getStep(); 4615 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 4616 Value *ScalarStepValue = 4617 Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc); 4618 Value *RuntimeVF = getRuntimeVF(Builder, PhiType, VF); 4619 Value *NumUnrolledElems = 4620 Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF)); 4621 Value *InductionGEP = GetElementPtrInst::Create( 4622 II.getElementType(), NewPointerPhi, 4623 Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind", 4624 InductionLoc); 4625 NewPointerPhi->addIncoming(InductionGEP, LoopLatch); 4626 4627 // Create UF many actual address geps that use the pointer 4628 // phi as base and a vectorized version of the step value 4629 // (<step*0, ..., step*N>) as offset. 4630 for (unsigned Part = 0; Part < State.UF; ++Part) { 4631 Type *VecPhiType = VectorType::get(PhiType, State.VF); 4632 Value *StartOffsetScalar = 4633 Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part)); 4634 Value *StartOffset = 4635 Builder.CreateVectorSplat(State.VF, StartOffsetScalar); 4636 // Create a vector of consecutive numbers from zero to VF. 
4637 StartOffset = 4638 Builder.CreateAdd(StartOffset, Builder.CreateStepVector(VecPhiType)); 4639 4640 Value *GEP = Builder.CreateGEP( 4641 II.getElementType(), NewPointerPhi, 4642 Builder.CreateMul( 4643 StartOffset, Builder.CreateVectorSplat(State.VF, ScalarStepValue), 4644 "vector.gep")); 4645 State.set(PhiR, GEP, Part); 4646 } 4647 } 4648 } 4649 } 4650 4651 /// A helper function for checking whether an integer division-related 4652 /// instruction may divide by zero (in which case it must be predicated if 4653 /// executed conditionally in the scalar code). 4654 /// TODO: It may be worthwhile to generalize and check isKnownNonZero(). 4655 /// Non-zero divisors that are non compile-time constants will not be 4656 /// converted into multiplication, so we will still end up scalarizing 4657 /// the division, but can do so w/o predication. 4658 static bool mayDivideByZero(Instruction &I) { 4659 assert((I.getOpcode() == Instruction::UDiv || 4660 I.getOpcode() == Instruction::SDiv || 4661 I.getOpcode() == Instruction::URem || 4662 I.getOpcode() == Instruction::SRem) && 4663 "Unexpected instruction"); 4664 Value *Divisor = I.getOperand(1); 4665 auto *CInt = dyn_cast<ConstantInt>(Divisor); 4666 return !CInt || CInt->isZero(); 4667 } 4668 4669 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, 4670 VPUser &ArgOperands, 4671 VPTransformState &State) { 4672 assert(!isa<DbgInfoIntrinsic>(I) && 4673 "DbgInfoIntrinsic should have been dropped during VPlan construction"); 4674 setDebugLocFromInst(&I); 4675 4676 Module *M = I.getParent()->getParent()->getParent(); 4677 auto *CI = cast<CallInst>(&I); 4678 4679 SmallVector<Type *, 4> Tys; 4680 for (Value *ArgOperand : CI->args()) 4681 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue())); 4682 4683 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 4684 4685 // The flag shows whether we use Intrinsic or a usual Call for vectorized 4686 // version of the instruction. 4687 // Is it beneficial to perform intrinsic call compared to lib call? 4688 bool NeedToScalarize = false; 4689 InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); 4690 InstructionCost IntrinsicCost = ID ? Cost->getVectorIntrinsicCost(CI, VF) : 0; 4691 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 4692 assert((UseVectorIntrinsic || !NeedToScalarize) && 4693 "Instruction should be scalarized elsewhere."); 4694 assert((IntrinsicCost.isValid() || CallCost.isValid()) && 4695 "Either the intrinsic cost or vector call cost must be valid"); 4696 4697 for (unsigned Part = 0; Part < UF; ++Part) { 4698 SmallVector<Type *, 2> TysForDecl = {CI->getType()}; 4699 SmallVector<Value *, 4> Args; 4700 for (auto &I : enumerate(ArgOperands.operands())) { 4701 // Some intrinsics have a scalar argument - don't replace it with a 4702 // vector. 4703 Value *Arg; 4704 if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index())) 4705 Arg = State.get(I.value(), Part); 4706 else { 4707 Arg = State.get(I.value(), VPIteration(0, 0)); 4708 if (hasVectorInstrinsicOverloadedScalarOpd(ID, I.index())) 4709 TysForDecl.push_back(Arg->getType()); 4710 } 4711 Args.push_back(Arg); 4712 } 4713 4714 Function *VectorF; 4715 if (UseVectorIntrinsic) { 4716 // Use vector version of the intrinsic. 
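      //
      // For example (illustrative, VF = 4): a scalar call to
      //   @llvm.fmuladd.f32(float, float, float)
      // becomes a call to
      //   @llvm.fmuladd.v4f32(<4 x float>, <4 x float>, <4 x float>),
      // whereas operands that must stay scalar (e.g. the i32 exponent of
      // @llvm.powi) were kept scalar when Args was populated above.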
4717 if (VF.isVector()) 4718 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); 4719 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); 4720 assert(VectorF && "Can't retrieve vector intrinsic."); 4721 } else { 4722 // Use vector version of the function call. 4723 const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 4724 #ifndef NDEBUG 4725 assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr && 4726 "Can't create vector function."); 4727 #endif 4728 VectorF = VFDatabase(*CI).getVectorizedFunction(Shape); 4729 } 4730 SmallVector<OperandBundleDef, 1> OpBundles; 4731 CI->getOperandBundlesAsDefs(OpBundles); 4732 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles); 4733 4734 if (isa<FPMathOperator>(V)) 4735 V->copyFastMathFlags(CI); 4736 4737 State.set(Def, V, Part); 4738 addMetadata(V, &I); 4739 } 4740 } 4741 4742 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { 4743 // We should not collect Scalars more than once per VF. Right now, this 4744 // function is called from collectUniformsAndScalars(), which already does 4745 // this check. Collecting Scalars for VF=1 does not make any sense. 4746 assert(VF.isVector() && Scalars.find(VF) == Scalars.end() && 4747 "This function should not be visited twice for the same VF"); 4748 4749 SmallSetVector<Instruction *, 8> Worklist; 4750 4751 // These sets are used to seed the analysis with pointers used by memory 4752 // accesses that will remain scalar. 4753 SmallSetVector<Instruction *, 8> ScalarPtrs; 4754 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; 4755 auto *Latch = TheLoop->getLoopLatch(); 4756 4757 // A helper that returns true if the use of Ptr by MemAccess will be scalar. 4758 // The pointer operands of loads and stores will be scalar as long as the 4759 // memory access is not a gather or scatter operation. The value operand of a 4760 // store will remain scalar if the store is scalarized. 4761 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) { 4762 InstWidening WideningDecision = getWideningDecision(MemAccess, VF); 4763 assert(WideningDecision != CM_Unknown && 4764 "Widening decision should be ready at this moment"); 4765 if (auto *Store = dyn_cast<StoreInst>(MemAccess)) 4766 if (Ptr == Store->getValueOperand()) 4767 return WideningDecision == CM_Scalarize; 4768 assert(Ptr == getLoadStorePointerOperand(MemAccess) && 4769 "Ptr is neither a value or pointer operand"); 4770 return WideningDecision != CM_GatherScatter; 4771 }; 4772 4773 // A helper that returns true if the given value is a bitcast or 4774 // getelementptr instruction contained in the loop. 4775 auto isLoopVaryingBitCastOrGEP = [&](Value *V) { 4776 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) || 4777 isa<GetElementPtrInst>(V)) && 4778 !TheLoop->isLoopInvariant(V); 4779 }; 4780 4781 // A helper that evaluates a memory access's use of a pointer. If the use will 4782 // be a scalar use and the pointer is only used by memory accesses, we place 4783 // the pointer in ScalarPtrs. Otherwise, the pointer is placed in 4784 // PossibleNonScalarPtrs. 4785 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) { 4786 // We only care about bitcast and getelementptr instructions contained in 4787 // the loop. 4788 if (!isLoopVaryingBitCastOrGEP(Ptr)) 4789 return; 4790 4791 // If the pointer has already been identified as scalar (e.g., if it was 4792 // also identified as uniform), there's nothing to do. 
4793 auto *I = cast<Instruction>(Ptr); 4794 if (Worklist.count(I)) 4795 return; 4796 4797 // If the use of the pointer will be a scalar use, and all users of the 4798 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise, 4799 // place the pointer in PossibleNonScalarPtrs. 4800 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) { 4801 return isa<LoadInst>(U) || isa<StoreInst>(U); 4802 })) 4803 ScalarPtrs.insert(I); 4804 else 4805 PossibleNonScalarPtrs.insert(I); 4806 }; 4807 4808 // We seed the scalars analysis with three classes of instructions: (1) 4809 // instructions marked uniform-after-vectorization and (2) bitcast, 4810 // getelementptr and (pointer) phi instructions used by memory accesses 4811 // requiring a scalar use. 4812 // 4813 // (1) Add to the worklist all instructions that have been identified as 4814 // uniform-after-vectorization. 4815 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end()); 4816 4817 // (2) Add to the worklist all bitcast and getelementptr instructions used by 4818 // memory accesses requiring a scalar use. The pointer operands of loads and 4819 // stores will be scalar as long as the memory accesses is not a gather or 4820 // scatter operation. The value operand of a store will remain scalar if the 4821 // store is scalarized. 4822 for (auto *BB : TheLoop->blocks()) 4823 for (auto &I : *BB) { 4824 if (auto *Load = dyn_cast<LoadInst>(&I)) { 4825 evaluatePtrUse(Load, Load->getPointerOperand()); 4826 } else if (auto *Store = dyn_cast<StoreInst>(&I)) { 4827 evaluatePtrUse(Store, Store->getPointerOperand()); 4828 evaluatePtrUse(Store, Store->getValueOperand()); 4829 } 4830 } 4831 for (auto *I : ScalarPtrs) 4832 if (!PossibleNonScalarPtrs.count(I)) { 4833 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n"); 4834 Worklist.insert(I); 4835 } 4836 4837 // Insert the forced scalars. 4838 // FIXME: Currently widenPHIInstruction() often creates a dead vector 4839 // induction variable when the PHI user is scalarized. 4840 auto ForcedScalar = ForcedScalars.find(VF); 4841 if (ForcedScalar != ForcedScalars.end()) 4842 for (auto *I : ForcedScalar->second) 4843 Worklist.insert(I); 4844 4845 // Expand the worklist by looking through any bitcasts and getelementptr 4846 // instructions we've already identified as scalar. This is similar to the 4847 // expansion step in collectLoopUniforms(); however, here we're only 4848 // expanding to include additional bitcasts and getelementptr instructions. 4849 unsigned Idx = 0; 4850 while (Idx != Worklist.size()) { 4851 Instruction *Dst = Worklist[Idx++]; 4852 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0))) 4853 continue; 4854 auto *Src = cast<Instruction>(Dst->getOperand(0)); 4855 if (llvm::all_of(Src->users(), [&](User *U) -> bool { 4856 auto *J = cast<Instruction>(U); 4857 return !TheLoop->contains(J) || Worklist.count(J) || 4858 ((isa<LoadInst>(J) || isa<StoreInst>(J)) && 4859 isScalarUse(J, Src)); 4860 })) { 4861 Worklist.insert(Src); 4862 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); 4863 } 4864 } 4865 4866 // An induction variable will remain scalar if all users of the induction 4867 // variable and induction variable update remain scalar. 4868 for (auto &Induction : Legal->getInductionVars()) { 4869 auto *Ind = Induction.first; 4870 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4871 4872 // If tail-folding is applied, the primary induction variable will be used 4873 // to feed a vector compare. 
4874 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking()) 4875 continue; 4876 4877 // Returns true if \p Indvar is a pointer induction that is used directly by 4878 // load/store instruction \p I. 4879 auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar, 4880 Instruction *I) { 4881 return Induction.second.getKind() == 4882 InductionDescriptor::IK_PtrInduction && 4883 (isa<LoadInst>(I) || isa<StoreInst>(I)) && 4884 Indvar == getLoadStorePointerOperand(I) && isScalarUse(I, Indvar); 4885 }; 4886 4887 // Determine if all users of the induction variable are scalar after 4888 // vectorization. 4889 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 4890 auto *I = cast<Instruction>(U); 4891 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 4892 IsDirectLoadStoreFromPtrIndvar(Ind, I); 4893 }); 4894 if (!ScalarInd) 4895 continue; 4896 4897 // Determine if all users of the induction variable update instruction are 4898 // scalar after vectorization. 4899 auto ScalarIndUpdate = 4900 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 4901 auto *I = cast<Instruction>(U); 4902 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 4903 IsDirectLoadStoreFromPtrIndvar(IndUpdate, I); 4904 }); 4905 if (!ScalarIndUpdate) 4906 continue; 4907 4908 // The induction variable and its update instruction will remain scalar. 4909 Worklist.insert(Ind); 4910 Worklist.insert(IndUpdate); 4911 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 4912 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 4913 << "\n"); 4914 } 4915 4916 Scalars[VF].insert(Worklist.begin(), Worklist.end()); 4917 } 4918 4919 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I) const { 4920 if (!blockNeedsPredicationForAnyReason(I->getParent())) 4921 return false; 4922 switch(I->getOpcode()) { 4923 default: 4924 break; 4925 case Instruction::Load: 4926 case Instruction::Store: { 4927 if (!Legal->isMaskRequired(I)) 4928 return false; 4929 auto *Ptr = getLoadStorePointerOperand(I); 4930 auto *Ty = getLoadStoreType(I); 4931 const Align Alignment = getLoadStoreAlignment(I); 4932 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) || 4933 TTI.isLegalMaskedGather(Ty, Alignment)) 4934 : !(isLegalMaskedStore(Ty, Ptr, Alignment) || 4935 TTI.isLegalMaskedScatter(Ty, Alignment)); 4936 } 4937 case Instruction::UDiv: 4938 case Instruction::SDiv: 4939 case Instruction::SRem: 4940 case Instruction::URem: 4941 return mayDivideByZero(*I); 4942 } 4943 return false; 4944 } 4945 4946 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( 4947 Instruction *I, ElementCount VF) { 4948 assert(isAccessInterleaved(I) && "Expecting interleaved access."); 4949 assert(getWideningDecision(I, VF) == CM_Unknown && 4950 "Decision should not be set yet."); 4951 auto *Group = getInterleavedAccessGroup(I); 4952 assert(Group && "Must have a group."); 4953 4954 // If the instruction's allocated size doesn't equal it's type size, it 4955 // requires padding and will be scalarized. 4956 auto &DL = I->getModule()->getDataLayout(); 4957 auto *ScalarTy = getLoadStoreType(I); 4958 if (hasIrregularType(ScalarTy, DL)) 4959 return false; 4960 4961 // Check if masking is required. 
4962 // A Group may need masking for one of two reasons: it resides in a block that 4963 // needs predication, or it was decided to use masking to deal with gaps 4964 // (either a gap at the end of a load-access that may result in a speculative 4965 // load, or any gaps in a store-access). 4966 bool PredicatedAccessRequiresMasking = 4967 blockNeedsPredicationForAnyReason(I->getParent()) && 4968 Legal->isMaskRequired(I); 4969 bool LoadAccessWithGapsRequiresEpilogMasking = 4970 isa<LoadInst>(I) && Group->requiresScalarEpilogue() && 4971 !isScalarEpilogueAllowed(); 4972 bool StoreAccessWithGapsRequiresMasking = 4973 isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()); 4974 if (!PredicatedAccessRequiresMasking && 4975 !LoadAccessWithGapsRequiresEpilogMasking && 4976 !StoreAccessWithGapsRequiresMasking) 4977 return true; 4978 4979 // If masked interleaving is required, we expect that the user/target had 4980 // enabled it, because otherwise it either wouldn't have been created or 4981 // it should have been invalidated by the CostModel. 4982 assert(useMaskedInterleavedAccesses(TTI) && 4983 "Masked interleave-groups for predicated accesses are not enabled."); 4984 4985 if (Group->isReverse()) 4986 return false; 4987 4988 auto *Ty = getLoadStoreType(I); 4989 const Align Alignment = getLoadStoreAlignment(I); 4990 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment) 4991 : TTI.isLegalMaskedStore(Ty, Alignment); 4992 } 4993 4994 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( 4995 Instruction *I, ElementCount VF) { 4996 // Get and ensure we have a valid memory instruction. 4997 assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction"); 4998 4999 auto *Ptr = getLoadStorePointerOperand(I); 5000 auto *ScalarTy = getLoadStoreType(I); 5001 5002 // In order to be widened, the pointer should be consecutive, first of all. 5003 if (!Legal->isConsecutivePtr(ScalarTy, Ptr)) 5004 return false; 5005 5006 // If the instruction is a store located in a predicated block, it will be 5007 // scalarized. 5008 if (isScalarWithPredication(I)) 5009 return false; 5010 5011 // If the instruction's allocated size doesn't equal it's type size, it 5012 // requires padding and will be scalarized. 5013 auto &DL = I->getModule()->getDataLayout(); 5014 if (hasIrregularType(ScalarTy, DL)) 5015 return false; 5016 5017 return true; 5018 } 5019 5020 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { 5021 // We should not collect Uniforms more than once per VF. Right now, 5022 // this function is called from collectUniformsAndScalars(), which 5023 // already does this check. Collecting Uniforms for VF=1 does not make any 5024 // sense. 5025 5026 assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() && 5027 "This function should not be visited twice for the same VF"); 5028 5029 // Visit the list of Uniforms. If we'll not find any uniform value, we'll 5030 // not analyze again. Uniforms.count(VF) will return 1. 5031 Uniforms[VF].clear(); 5032 5033 // We now know that the loop is vectorizable! 5034 // Collect instructions inside the loop that will remain uniform after 5035 // vectorization. 5036 5037 // Global values, params and instructions outside of current loop are out of 5038 // scope. 5039 auto isOutOfScope = [&](Value *V) -> bool { 5040 Instruction *I = dyn_cast<Instruction>(V); 5041 return (!I || !TheLoop->contains(I)); 5042 }; 5043 5044 // Worklist containing uniform instructions demanding lane 0. 
5045 SetVector<Instruction *> Worklist; 5046 BasicBlock *Latch = TheLoop->getLoopLatch(); 5047 5048 // Add uniform instructions demanding lane 0 to the worklist. Instructions 5049 // that are scalar with predication must not be considered uniform after 5050 // vectorization, because that would create an erroneous replicating region 5051 // where only a single instance out of VF should be formed. 5052 // TODO: optimize such seldom cases if found important, see PR40816. 5053 auto addToWorklistIfAllowed = [&](Instruction *I) -> void { 5054 if (isOutOfScope(I)) { 5055 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: " 5056 << *I << "\n"); 5057 return; 5058 } 5059 if (isScalarWithPredication(I)) { 5060 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " 5061 << *I << "\n"); 5062 return; 5063 } 5064 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n"); 5065 Worklist.insert(I); 5066 }; 5067 5068 // Start with the conditional branch. If the branch condition is an 5069 // instruction contained in the loop that is only used by the branch, it is 5070 // uniform. 5071 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 5072 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) 5073 addToWorklistIfAllowed(Cmp); 5074 5075 auto isUniformDecision = [&](Instruction *I, ElementCount VF) { 5076 InstWidening WideningDecision = getWideningDecision(I, VF); 5077 assert(WideningDecision != CM_Unknown && 5078 "Widening decision should be ready at this moment"); 5079 5080 // A uniform memory op is itself uniform. We exclude uniform stores 5081 // here as they demand the last lane, not the first one. 5082 if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) { 5083 assert(WideningDecision == CM_Scalarize); 5084 return true; 5085 } 5086 5087 return (WideningDecision == CM_Widen || 5088 WideningDecision == CM_Widen_Reverse || 5089 WideningDecision == CM_Interleave); 5090 }; 5091 5092 5093 // Returns true if Ptr is the pointer operand of a memory access instruction 5094 // I, and I is known to not require scalarization. 5095 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { 5096 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); 5097 }; 5098 5099 // Holds a list of values which are known to have at least one uniform use. 5100 // Note that there may be other uses which aren't uniform. A "uniform use" 5101 // here is something which only demands lane 0 of the unrolled iterations; 5102 // it does not imply that all lanes produce the same value (e.g. this is not 5103 // the usual meaning of uniform) 5104 SetVector<Value *> HasUniformUse; 5105 5106 // Scan the loop for instructions which are either a) known to have only 5107 // lane 0 demanded or b) are uses which demand only lane 0 of their operand. 5108 for (auto *BB : TheLoop->blocks()) 5109 for (auto &I : *BB) { 5110 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) { 5111 switch (II->getIntrinsicID()) { 5112 case Intrinsic::sideeffect: 5113 case Intrinsic::experimental_noalias_scope_decl: 5114 case Intrinsic::assume: 5115 case Intrinsic::lifetime_start: 5116 case Intrinsic::lifetime_end: 5117 if (TheLoop->hasLoopInvariantOperands(&I)) 5118 addToWorklistIfAllowed(&I); 5119 break; 5120 default: 5121 break; 5122 } 5123 } 5124 5125 // ExtractValue instructions must be uniform, because the operands are 5126 // known to be loop-invariant. 
5127 if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) { 5128 assert(isOutOfScope(EVI->getAggregateOperand()) && 5129 "Expected aggregate value to be loop invariant"); 5130 addToWorklistIfAllowed(EVI); 5131 continue; 5132 } 5133 5134 // If there's no pointer operand, there's nothing to do. 5135 auto *Ptr = getLoadStorePointerOperand(&I); 5136 if (!Ptr) 5137 continue; 5138 5139 // A uniform memory op is itself uniform. We exclude uniform stores 5140 // here as they demand the last lane, not the first one. 5141 if (isa<LoadInst>(I) && Legal->isUniformMemOp(I)) 5142 addToWorklistIfAllowed(&I); 5143 5144 if (isUniformDecision(&I, VF)) { 5145 assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check"); 5146 HasUniformUse.insert(Ptr); 5147 } 5148 } 5149 5150 // Add to the worklist any operands which have *only* uniform (e.g. lane 0 5151 // demanding) users. Since loops are assumed to be in LCSSA form, this 5152 // disallows uses outside the loop as well. 5153 for (auto *V : HasUniformUse) { 5154 if (isOutOfScope(V)) 5155 continue; 5156 auto *I = cast<Instruction>(V); 5157 auto UsersAreMemAccesses = 5158 llvm::all_of(I->users(), [&](User *U) -> bool { 5159 return isVectorizedMemAccessUse(cast<Instruction>(U), V); 5160 }); 5161 if (UsersAreMemAccesses) 5162 addToWorklistIfAllowed(I); 5163 } 5164 5165 // Expand Worklist in topological order: whenever a new instruction 5166 // is added , its users should be already inside Worklist. It ensures 5167 // a uniform instruction will only be used by uniform instructions. 5168 unsigned idx = 0; 5169 while (idx != Worklist.size()) { 5170 Instruction *I = Worklist[idx++]; 5171 5172 for (auto OV : I->operand_values()) { 5173 // isOutOfScope operands cannot be uniform instructions. 5174 if (isOutOfScope(OV)) 5175 continue; 5176 // First order recurrence Phi's should typically be considered 5177 // non-uniform. 5178 auto *OP = dyn_cast<PHINode>(OV); 5179 if (OP && Legal->isFirstOrderRecurrence(OP)) 5180 continue; 5181 // If all the users of the operand are uniform, then add the 5182 // operand into the uniform worklist. 5183 auto *OI = cast<Instruction>(OV); 5184 if (llvm::all_of(OI->users(), [&](User *U) -> bool { 5185 auto *J = cast<Instruction>(U); 5186 return Worklist.count(J) || isVectorizedMemAccessUse(J, OI); 5187 })) 5188 addToWorklistIfAllowed(OI); 5189 } 5190 } 5191 5192 // For an instruction to be added into Worklist above, all its users inside 5193 // the loop should also be in Worklist. However, this condition cannot be 5194 // true for phi nodes that form a cyclic dependence. We must process phi 5195 // nodes separately. An induction variable will remain uniform if all users 5196 // of the induction variable and induction variable update remain uniform. 5197 // The code below handles both pointer and non-pointer induction variables. 5198 for (auto &Induction : Legal->getInductionVars()) { 5199 auto *Ind = Induction.first; 5200 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 5201 5202 // Determine if all users of the induction variable are uniform after 5203 // vectorization. 5204 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 5205 auto *I = cast<Instruction>(U); 5206 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 5207 isVectorizedMemAccessUse(I, Ind); 5208 }); 5209 if (!UniformInd) 5210 continue; 5211 5212 // Determine if all users of the induction variable update instruction are 5213 // uniform after vectorization. 
5214 auto UniformIndUpdate = 5215 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 5216 auto *I = cast<Instruction>(U); 5217 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 5218 isVectorizedMemAccessUse(I, IndUpdate); 5219 }); 5220 if (!UniformIndUpdate) 5221 continue; 5222 5223 // The induction variable and its update instruction will remain uniform. 5224 addToWorklistIfAllowed(Ind); 5225 addToWorklistIfAllowed(IndUpdate); 5226 } 5227 5228 Uniforms[VF].insert(Worklist.begin(), Worklist.end()); 5229 } 5230 5231 bool LoopVectorizationCostModel::runtimeChecksRequired() { 5232 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); 5233 5234 if (Legal->getRuntimePointerChecking()->Need) { 5235 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz", 5236 "runtime pointer checks needed. Enable vectorization of this " 5237 "loop with '#pragma clang loop vectorize(enable)' when " 5238 "compiling with -Os/-Oz", 5239 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5240 return true; 5241 } 5242 5243 if (!PSE.getUnionPredicate().getPredicates().empty()) { 5244 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz", 5245 "runtime SCEV checks needed. Enable vectorization of this " 5246 "loop with '#pragma clang loop vectorize(enable)' when " 5247 "compiling with -Os/-Oz", 5248 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5249 return true; 5250 } 5251 5252 // FIXME: Avoid specializing for stride==1 instead of bailing out. 5253 if (!Legal->getLAI()->getSymbolicStrides().empty()) { 5254 reportVectorizationFailure("Runtime stride check for small trip count", 5255 "runtime stride == 1 checks needed. Enable vectorization of " 5256 "this loop without such check by compiling with -Os/-Oz", 5257 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5258 return true; 5259 } 5260 5261 return false; 5262 } 5263 5264 ElementCount 5265 LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) { 5266 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) 5267 return ElementCount::getScalable(0); 5268 5269 if (Hints->isScalableVectorizationDisabled()) { 5270 reportVectorizationInfo("Scalable vectorization is explicitly disabled", 5271 "ScalableVectorizationDisabled", ORE, TheLoop); 5272 return ElementCount::getScalable(0); 5273 } 5274 5275 LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n"); 5276 5277 auto MaxScalableVF = ElementCount::getScalable( 5278 std::numeric_limits<ElementCount::ScalarTy>::max()); 5279 5280 // Test that the loop-vectorizer can legalize all operations for this MaxVF. 5281 // FIXME: While for scalable vectors this is currently sufficient, this should 5282 // be replaced by a more detailed mechanism that filters out specific VFs, 5283 // instead of invalidating vectorization for a whole set of VFs based on the 5284 // MaxVF. 5285 5286 // Disable scalable vectorization if the loop contains unsupported reductions. 5287 if (!canVectorizeReductions(MaxScalableVF)) { 5288 reportVectorizationInfo( 5289 "Scalable vectorization not supported for the reduction " 5290 "operations found in this loop.", 5291 "ScalableVFUnfeasible", ORE, TheLoop); 5292 return ElementCount::getScalable(0); 5293 } 5294 5295 // Disable scalable vectorization if the loop contains any instructions 5296 // with element types not supported for scalable vectors. 
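  //
  // For instance (hypothetical target), if the loop manipulates fp128 values
  // and the target's scalable vectors only cover i8-i64 and half-double,
  // isElementTypeLegalForScalableVector() rejects fp128 and scalable
  // vectorization is disabled for this loop (fixed-width VFs may still be
  // considered).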
5297 if (any_of(ElementTypesInLoop, [&](Type *Ty) { 5298 return !Ty->isVoidTy() && 5299 !this->TTI.isElementTypeLegalForScalableVector(Ty); 5300 })) { 5301 reportVectorizationInfo("Scalable vectorization is not supported " 5302 "for all element types found in this loop.", 5303 "ScalableVFUnfeasible", ORE, TheLoop); 5304 return ElementCount::getScalable(0); 5305 } 5306 5307 if (Legal->isSafeForAnyVectorWidth()) 5308 return MaxScalableVF; 5309 5310 // Limit MaxScalableVF by the maximum safe dependence distance. 5311 Optional<unsigned> MaxVScale = TTI.getMaxVScale(); 5312 if (!MaxVScale && TheFunction->hasFnAttribute(Attribute::VScaleRange)) 5313 MaxVScale = 5314 TheFunction->getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax(); 5315 MaxScalableVF = ElementCount::getScalable( 5316 MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0); 5317 if (!MaxScalableVF) 5318 reportVectorizationInfo( 5319 "Max legal vector width too small, scalable vectorization " 5320 "unfeasible.", 5321 "ScalableVFUnfeasible", ORE, TheLoop); 5322 5323 return MaxScalableVF; 5324 } 5325 5326 FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF( 5327 unsigned ConstTripCount, ElementCount UserVF, bool FoldTailByMasking) { 5328 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); 5329 unsigned SmallestType, WidestType; 5330 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); 5331 5332 // Get the maximum safe dependence distance in bits computed by LAA. 5333 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from 5334 // the memory accesses that is most restrictive (involved in the smallest 5335 // dependence distance). 5336 unsigned MaxSafeElements = 5337 PowerOf2Floor(Legal->getMaxSafeVectorWidthInBits() / WidestType); 5338 5339 auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements); 5340 auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements); 5341 5342 LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF 5343 << ".\n"); 5344 LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF 5345 << ".\n"); 5346 5347 // First analyze the UserVF, fall back if the UserVF should be ignored. 5348 if (UserVF) { 5349 auto MaxSafeUserVF = 5350 UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF; 5351 5352 if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) { 5353 // If `VF=vscale x N` is safe, then so is `VF=N` 5354 if (UserVF.isScalable()) 5355 return FixedScalableVFPair( 5356 ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF); 5357 else 5358 return UserVF; 5359 } 5360 5361 assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF)); 5362 5363 // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it 5364 // is better to ignore the hint and let the compiler choose a suitable VF. 
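    // Illustrative example (hypothetical numbers, not from the original
    // source): if the dependence analysis only allows 8 elements per vector
    // iteration, a user hint of VF=16 (fixed) is clamped to VF=8 below, while
    // a scalable hint such as VF=vscale x 16 is ignored entirely and the
    // compiler picks its own factor.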
5365 if (!UserVF.isScalable()) { 5366 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 5367 << " is unsafe, clamping to max safe VF=" 5368 << MaxSafeFixedVF << ".\n"); 5369 ORE->emit([&]() { 5370 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 5371 TheLoop->getStartLoc(), 5372 TheLoop->getHeader()) 5373 << "User-specified vectorization factor " 5374 << ore::NV("UserVectorizationFactor", UserVF) 5375 << " is unsafe, clamping to maximum safe vectorization factor " 5376 << ore::NV("VectorizationFactor", MaxSafeFixedVF); 5377 }); 5378 return MaxSafeFixedVF; 5379 } 5380 5381 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) { 5382 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 5383 << " is ignored because scalable vectors are not " 5384 "available.\n"); 5385 ORE->emit([&]() { 5386 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 5387 TheLoop->getStartLoc(), 5388 TheLoop->getHeader()) 5389 << "User-specified vectorization factor " 5390 << ore::NV("UserVectorizationFactor", UserVF) 5391 << " is ignored because the target does not support scalable " 5392 "vectors. The compiler will pick a more suitable value."; 5393 }); 5394 } else { 5395 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 5396 << " is unsafe. Ignoring scalable UserVF.\n"); 5397 ORE->emit([&]() { 5398 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 5399 TheLoop->getStartLoc(), 5400 TheLoop->getHeader()) 5401 << "User-specified vectorization factor " 5402 << ore::NV("UserVectorizationFactor", UserVF) 5403 << " is unsafe. Ignoring the hint to let the compiler pick a " 5404 "more suitable value."; 5405 }); 5406 } 5407 } 5408 5409 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType 5410 << " / " << WidestType << " bits.\n"); 5411 5412 FixedScalableVFPair Result(ElementCount::getFixed(1), 5413 ElementCount::getScalable(0)); 5414 if (auto MaxVF = 5415 getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType, 5416 MaxSafeFixedVF, FoldTailByMasking)) 5417 Result.FixedVF = MaxVF; 5418 5419 if (auto MaxVF = 5420 getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType, 5421 MaxSafeScalableVF, FoldTailByMasking)) 5422 if (MaxVF.isScalable()) { 5423 Result.ScalableVF = MaxVF; 5424 LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF 5425 << "\n"); 5426 } 5427 5428 return Result; 5429 } 5430 5431 FixedScalableVFPair 5432 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { 5433 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { 5434 // TODO: It may by useful to do since it's still likely to be dynamically 5435 // uniform if the target can skip. 5436 reportVectorizationFailure( 5437 "Not inserting runtime ptr check for divergent target", 5438 "runtime pointer checks needed. 
Not enabled for divergent target",
        "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
    return FixedScalableVFPair::getNone();
  }

  unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
  LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
  if (TC == 1) {
    reportVectorizationFailure("Single iteration (non) loop",
        "loop trip count is one, irrelevant for vectorization",
        "SingleIterationLoop", ORE, TheLoop);
    return FixedScalableVFPair::getNone();
  }

  switch (ScalarEpilogueStatus) {
  case CM_ScalarEpilogueAllowed:
    return computeFeasibleMaxVF(TC, UserVF, false);
  case CM_ScalarEpilogueNotAllowedUsePredicate:
    LLVM_FALLTHROUGH;
  case CM_ScalarEpilogueNotNeededUsePredicate:
    LLVM_DEBUG(
        dbgs() << "LV: vector predicate hint/switch found.\n"
               << "LV: Not allowing scalar epilogue, creating predicated "
               << "vector loop.\n");
    break;
  case CM_ScalarEpilogueNotAllowedLowTripLoop:
    // fallthrough as a special case of OptForSize
  case CM_ScalarEpilogueNotAllowedOptSize:
    if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
      LLVM_DEBUG(
          dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
    else
      LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
                        << "count.\n");

    // Bail if runtime checks are required, which are not good when optimising
    // for size.
    if (runtimeChecksRequired())
      return FixedScalableVFPair::getNone();

    break;
  }

  // The only loops we can vectorize without a scalar epilogue are loops with
  // a bottom-test and a single exiting block. We'd have to handle the fact
  // that not every instruction executes on the last iteration. This will
  // require a lane mask which varies through the vector loop body. (TODO)
  if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
    // If there was a tail-folding hint/switch, but we can't fold the tail by
    // masking, fall back to a vectorization with a scalar epilogue.
    if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
      LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
                           "scalar epilogue instead.\n");
      ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
      return computeFeasibleMaxVF(TC, UserVF, false);
    }
    return FixedScalableVFPair::getNone();
  }

  // Now try the tail folding

  // Invalidate interleave groups that require an epilogue if we can't mask
  // the interleave-group.
  if (!useMaskedInterleavedAccesses(TTI)) {
    assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
           "No decisions should have been taken at this point");
    // Note: There is no need to invalidate any cost modeling decisions here,
    // as none were taken so far.
    InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
  }

  FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF, true);
  // Avoid tail folding if the trip count is known to be a multiple of any VF
  // we chose.
  // FIXME: The condition below pessimises the case for fixed-width vectors,
  // when scalable VFs are also candidates for vectorization.
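  //
  // Illustrative example (hypothetical numbers, not from the original source):
  // with a constant trip count of 64, MaxFixedVF = 8 and a user interleave
  // count of 2, the check below evaluates 64 urem (8 * 2) == 0, so no scalar
  // tail remains and tail folding is not needed for the fixed VF.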
5514 if (MaxFactors.FixedVF.isVector() && !MaxFactors.ScalableVF) { 5515 ElementCount MaxFixedVF = MaxFactors.FixedVF; 5516 assert((UserVF.isNonZero() || isPowerOf2_32(MaxFixedVF.getFixedValue())) && 5517 "MaxFixedVF must be a power of 2"); 5518 unsigned MaxVFtimesIC = UserIC ? MaxFixedVF.getFixedValue() * UserIC 5519 : MaxFixedVF.getFixedValue(); 5520 ScalarEvolution *SE = PSE.getSE(); 5521 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 5522 const SCEV *ExitCount = SE->getAddExpr( 5523 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 5524 const SCEV *Rem = SE->getURemExpr( 5525 SE->applyLoopGuards(ExitCount, TheLoop), 5526 SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC)); 5527 if (Rem->isZero()) { 5528 // Accept MaxFixedVF if we do not have a tail. 5529 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); 5530 return MaxFactors; 5531 } 5532 } 5533 5534 // For scalable vectors, don't use tail folding as this is currently not yet 5535 // supported. The code is likely to have ended up here if the tripcount is 5536 // low, in which case it makes sense not to use scalable vectors. 5537 if (MaxFactors.ScalableVF.isVector()) 5538 MaxFactors.ScalableVF = ElementCount::getScalable(0); 5539 5540 // If we don't know the precise trip count, or if the trip count that we 5541 // found modulo the vectorization factor is not zero, try to fold the tail 5542 // by masking. 5543 // FIXME: look for a smaller MaxVF that does divide TC rather than masking. 5544 if (Legal->prepareToFoldTailByMasking()) { 5545 FoldTailByMasking = true; 5546 return MaxFactors; 5547 } 5548 5549 // If there was a tail-folding hint/switch, but we can't fold the tail by 5550 // masking, fallback to a vectorization with a scalar epilogue. 5551 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { 5552 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " 5553 "scalar epilogue instead.\n"); 5554 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 5555 return MaxFactors; 5556 } 5557 5558 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) { 5559 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n"); 5560 return FixedScalableVFPair::getNone(); 5561 } 5562 5563 if (TC == 0) { 5564 reportVectorizationFailure( 5565 "Unable to calculate the loop count due to complex control flow", 5566 "unable to calculate the loop count due to complex control flow", 5567 "UnknownLoopCountComplexCFG", ORE, TheLoop); 5568 return FixedScalableVFPair::getNone(); 5569 } 5570 5571 reportVectorizationFailure( 5572 "Cannot optimize for size and vectorize at the same time.", 5573 "cannot optimize for size and vectorize at the same time. " 5574 "Enable vectorization of this loop with '#pragma clang loop " 5575 "vectorize(enable)' when compiling with -Os/-Oz", 5576 "NoTailLoopWithOptForSize", ORE, TheLoop); 5577 return FixedScalableVFPair::getNone(); 5578 } 5579 5580 ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( 5581 unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType, 5582 const ElementCount &MaxSafeVF, bool FoldTailByMasking) { 5583 bool ComputeScalableMaxVF = MaxSafeVF.isScalable(); 5584 TypeSize WidestRegister = TTI.getRegisterBitWidth( 5585 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector 5586 : TargetTransformInfo::RGK_FixedWidthVector); 5587 5588 // Convenience function to return the minimum of two ElementCounts. 
  auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
    assert((LHS.isScalable() == RHS.isScalable()) &&
           "Scalable flags must match");
    return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS;
  };

  // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
  // Note that neither WidestRegister nor WidestType needs to be a power of 2.
  auto MaxVectorElementCount = ElementCount::get(
      PowerOf2Floor(WidestRegister.getKnownMinSize() / WidestType),
      ComputeScalableMaxVF);
  MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
  LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
                    << (MaxVectorElementCount * WidestType) << " bits.\n");

  if (!MaxVectorElementCount) {
    LLVM_DEBUG(dbgs() << "LV: The target has no "
                      << (ComputeScalableMaxVF ? "scalable" : "fixed")
                      << " vector registers.\n");
    return ElementCount::getFixed(1);
  }

  const auto TripCountEC = ElementCount::getFixed(ConstTripCount);
  if (ConstTripCount &&
      ElementCount::isKnownLE(TripCountEC, MaxVectorElementCount) &&
      (!FoldTailByMasking || isPowerOf2_32(ConstTripCount))) {
    // If the loop trip count (TC) is known at compile time, there is no point
    // in choosing a VF greater than TC (as done in the loop below). Select the
    // maximum power of two which doesn't exceed TC.
    // If MaxVectorElementCount is scalable, we only fall back on a fixed VF
    // when the TC is less than or equal to the known number of lanes.
    auto ClampedConstTripCount = PowerOf2Floor(ConstTripCount);
    LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
                         "exceeding the constant trip count: "
                      << ClampedConstTripCount << "\n");
    return ElementCount::getFixed(ClampedConstTripCount);
  }

  ElementCount MaxVF = MaxVectorElementCount;
  if (TTI.shouldMaximizeVectorBandwidth() ||
      (MaximizeBandwidth && isScalarEpilogueAllowed())) {
    auto MaxVectorElementCountMaxBW = ElementCount::get(
        PowerOf2Floor(WidestRegister.getKnownMinSize() / SmallestType),
        ComputeScalableMaxVF);
    MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);

    // Collect all viable vectorization factors larger than the default MaxVF
    // (i.e. MaxVectorElementCount).
    SmallVector<ElementCount, 8> VFs;
    for (ElementCount VS = MaxVectorElementCount * 2;
         ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2)
      VFs.push_back(VS);

    // For each VF calculate its register usage.
    auto RUs = calculateRegisterUsage(VFs);

    // Select the largest VF which doesn't require more registers than existing
    // ones.
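    //
    // Illustrative example (hypothetical numbers, not from the original
    // source): if MaxVectorElementCount is 4 and maximizing bandwidth adds the
    // candidates {8, 16}, but the register usage computed for VF=16 exceeds
    // the 32 vector registers of the target while VF=8 fits, the loop below
    // selects MaxVF = 8.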
5647 for (int i = RUs.size() - 1; i >= 0; --i) { 5648 bool Selected = true; 5649 for (auto &pair : RUs[i].MaxLocalUsers) { 5650 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5651 if (pair.second > TargetNumRegisters) 5652 Selected = false; 5653 } 5654 if (Selected) { 5655 MaxVF = VFs[i]; 5656 break; 5657 } 5658 } 5659 if (ElementCount MinVF = 5660 TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) { 5661 if (ElementCount::isKnownLT(MaxVF, MinVF)) { 5662 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF 5663 << ") with target's minimum: " << MinVF << '\n'); 5664 MaxVF = MinVF; 5665 } 5666 } 5667 } 5668 return MaxVF; 5669 } 5670 5671 bool LoopVectorizationCostModel::isMoreProfitable( 5672 const VectorizationFactor &A, const VectorizationFactor &B) const { 5673 InstructionCost CostA = A.Cost; 5674 InstructionCost CostB = B.Cost; 5675 5676 unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop); 5677 5678 if (!A.Width.isScalable() && !B.Width.isScalable() && FoldTailByMasking && 5679 MaxTripCount) { 5680 // If we are folding the tail and the trip count is a known (possibly small) 5681 // constant, the trip count will be rounded up to an integer number of 5682 // iterations. The total cost will be PerIterationCost*ceil(TripCount/VF), 5683 // which we compare directly. When not folding the tail, the total cost will 5684 // be PerIterationCost*floor(TC/VF) + Scalar remainder cost, and so is 5685 // approximated with the per-lane cost below instead of using the tripcount 5686 // as here. 5687 auto RTCostA = CostA * divideCeil(MaxTripCount, A.Width.getFixedValue()); 5688 auto RTCostB = CostB * divideCeil(MaxTripCount, B.Width.getFixedValue()); 5689 return RTCostA < RTCostB; 5690 } 5691 5692 // Improve estimate for the vector width if it is scalable. 5693 unsigned EstimatedWidthA = A.Width.getKnownMinValue(); 5694 unsigned EstimatedWidthB = B.Width.getKnownMinValue(); 5695 if (Optional<unsigned> VScale = TTI.getVScaleForTuning()) { 5696 if (A.Width.isScalable()) 5697 EstimatedWidthA *= VScale.getValue(); 5698 if (B.Width.isScalable()) 5699 EstimatedWidthB *= VScale.getValue(); 5700 } 5701 5702 // When set to preferred, for now assume vscale may be larger than 1 (or the 5703 // one being tuned for), so that scalable vectorization is slightly favorable 5704 // over fixed-width vectorization. 
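  //
  // Illustrative example (hypothetical numbers, not from the original source):
  // for A = {vscale x 2, cost 6} with an assumed vscale of 2 (so
  // EstimatedWidthA = 4) and B = {fixed 4, cost 8}, the cross-multiplied
  // comparison below evaluates 6 * 4 < 8 * 4, i.e. a per-lane cost of 1.5
  // versus 2.0, so A is considered more profitable without any FP division.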
5705 if (Hints->isScalableVectorizationPreferred()) 5706 if (A.Width.isScalable() && !B.Width.isScalable()) 5707 return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA); 5708 5709 // To avoid the need for FP division: 5710 // (CostA / A.Width) < (CostB / B.Width) 5711 // <=> (CostA * B.Width) < (CostB * A.Width) 5712 return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA); 5713 } 5714 5715 VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor( 5716 const ElementCountSet &VFCandidates) { 5717 InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first; 5718 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n"); 5719 assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop"); 5720 assert(VFCandidates.count(ElementCount::getFixed(1)) && 5721 "Expected Scalar VF to be a candidate"); 5722 5723 const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost); 5724 VectorizationFactor ChosenFactor = ScalarCost; 5725 5726 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; 5727 if (ForceVectorization && VFCandidates.size() > 1) { 5728 // Ignore scalar width, because the user explicitly wants vectorization. 5729 // Initialize cost to max so that VF = 2 is, at least, chosen during cost 5730 // evaluation. 5731 ChosenFactor.Cost = InstructionCost::getMax(); 5732 } 5733 5734 SmallVector<InstructionVFPair> InvalidCosts; 5735 for (const auto &i : VFCandidates) { 5736 // The cost for scalar VF=1 is already calculated, so ignore it. 5737 if (i.isScalar()) 5738 continue; 5739 5740 VectorizationCostTy C = expectedCost(i, &InvalidCosts); 5741 VectorizationFactor Candidate(i, C.first); 5742 5743 #ifndef NDEBUG 5744 unsigned AssumedMinimumVscale = 1; 5745 if (Optional<unsigned> VScale = TTI.getVScaleForTuning()) 5746 AssumedMinimumVscale = VScale.getValue(); 5747 unsigned Width = 5748 Candidate.Width.isScalable() 5749 ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale 5750 : Candidate.Width.getFixedValue(); 5751 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i 5752 << " costs: " << (Candidate.Cost / Width)); 5753 if (i.isScalable()) 5754 LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of " 5755 << AssumedMinimumVscale << ")"); 5756 LLVM_DEBUG(dbgs() << ".\n"); 5757 #endif 5758 5759 if (!C.second && !ForceVectorization) { 5760 LLVM_DEBUG( 5761 dbgs() << "LV: Not considering vector loop of width " << i 5762 << " because it will not generate any vector instructions.\n"); 5763 continue; 5764 } 5765 5766 // If profitable add it to ProfitableVF list. 5767 if (isMoreProfitable(Candidate, ScalarCost)) 5768 ProfitableVFs.push_back(Candidate); 5769 5770 if (isMoreProfitable(Candidate, ChosenFactor)) 5771 ChosenFactor = Candidate; 5772 } 5773 5774 // Emit a report of VFs with invalid costs in the loop. 5775 if (!InvalidCosts.empty()) { 5776 // Group the remarks per instruction, keeping the instruction order from 5777 // InvalidCosts. 5778 std::map<Instruction *, unsigned> Numbering; 5779 unsigned I = 0; 5780 for (auto &Pair : InvalidCosts) 5781 if (!Numbering.count(Pair.first)) 5782 Numbering[Pair.first] = I++; 5783 5784 // Sort the list, first on instruction(number) then on VF. 
5785 llvm::sort(InvalidCosts, 5786 [&Numbering](InstructionVFPair &A, InstructionVFPair &B) { 5787 if (Numbering[A.first] != Numbering[B.first]) 5788 return Numbering[A.first] < Numbering[B.first]; 5789 ElementCountComparator ECC; 5790 return ECC(A.second, B.second); 5791 }); 5792 5793 // For a list of ordered instruction-vf pairs: 5794 // [(load, vf1), (load, vf2), (store, vf1)] 5795 // Group the instructions together to emit separate remarks for: 5796 // load (vf1, vf2) 5797 // store (vf1) 5798 auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts); 5799 auto Subset = ArrayRef<InstructionVFPair>(); 5800 do { 5801 if (Subset.empty()) 5802 Subset = Tail.take_front(1); 5803 5804 Instruction *I = Subset.front().first; 5805 5806 // If the next instruction is different, or if there are no other pairs, 5807 // emit a remark for the collated subset. e.g. 5808 // [(load, vf1), (load, vf2))] 5809 // to emit: 5810 // remark: invalid costs for 'load' at VF=(vf, vf2) 5811 if (Subset == Tail || Tail[Subset.size()].first != I) { 5812 std::string OutString; 5813 raw_string_ostream OS(OutString); 5814 assert(!Subset.empty() && "Unexpected empty range"); 5815 OS << "Instruction with invalid costs prevented vectorization at VF=("; 5816 for (auto &Pair : Subset) 5817 OS << (Pair.second == Subset.front().second ? "" : ", ") 5818 << Pair.second; 5819 OS << "):"; 5820 if (auto *CI = dyn_cast<CallInst>(I)) 5821 OS << " call to " << CI->getCalledFunction()->getName(); 5822 else 5823 OS << " " << I->getOpcodeName(); 5824 OS.flush(); 5825 reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I); 5826 Tail = Tail.drop_front(Subset.size()); 5827 Subset = {}; 5828 } else 5829 // Grow the subset by one element 5830 Subset = Tail.take_front(Subset.size() + 1); 5831 } while (!Tail.empty()); 5832 } 5833 5834 if (!EnableCondStoresVectorization && NumPredStores) { 5835 reportVectorizationFailure("There are conditional stores.", 5836 "store that is conditionally executed prevents vectorization", 5837 "ConditionalStore", ORE, TheLoop); 5838 ChosenFactor = ScalarCost; 5839 } 5840 5841 LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() && 5842 ChosenFactor.Cost >= ScalarCost.Cost) dbgs() 5843 << "LV: Vectorization seems to be not beneficial, " 5844 << "but was forced by a user.\n"); 5845 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n"); 5846 return ChosenFactor; 5847 } 5848 5849 bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization( 5850 const Loop &L, ElementCount VF) const { 5851 // Cross iteration phis such as reductions need special handling and are 5852 // currently unsupported. 5853 if (any_of(L.getHeader()->phis(), [&](PHINode &Phi) { 5854 return Legal->isFirstOrderRecurrence(&Phi) || 5855 Legal->isReductionVariable(&Phi); 5856 })) 5857 return false; 5858 5859 // Phis with uses outside of the loop require special handling and are 5860 // currently unsupported. 5861 for (auto &Entry : Legal->getInductionVars()) { 5862 // Look for uses of the value of the induction at the last iteration. 5863 Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch()); 5864 for (User *U : PostInc->users()) 5865 if (!L.contains(cast<Instruction>(U))) 5866 return false; 5867 // Look for uses of penultimate value of the induction. 5868 for (User *U : Entry.first->users()) 5869 if (!L.contains(cast<Instruction>(U))) 5870 return false; 5871 } 5872 5873 // Induction variables that are widened require special handling that is 5874 // currently not supported. 
  if (any_of(Legal->getInductionVars(), [&](auto &Entry) {
        return !(this->isScalarAfterVectorization(Entry.first, VF) ||
                 this->isProfitableToScalarize(Entry.first, VF));
      }))
    return false;

  // Epilogue vectorization code has not been audited to ensure it handles
  // non-latch exits properly. It may be fine, but it needs to be audited and
  // tested.
  if (L.getExitingBlock() != L.getLoopLatch())
    return false;

  return true;
}

bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
    const ElementCount VF) const {
  // FIXME: We need a much better cost-model to take different parameters such
  // as register pressure, code size increase and cost of extra branches into
  // account. For now we apply a very crude heuristic and only consider loops
  // with vectorization factors larger than a certain value.
  // We also consider epilogue vectorization unprofitable for targets that
  // don't consider interleaving beneficial (e.g. MVE).
  if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1)
    return false;
  if (VF.getFixedValue() >= EpilogueVectorizationMinVF)
    return true;
  return false;
}

VectorizationFactor
LoopVectorizationCostModel::selectEpilogueVectorizationFactor(
    const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) {
  VectorizationFactor Result = VectorizationFactor::Disabled();
  if (!EnableEpilogueVectorization) {
    LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";);
    return Result;
  }

  if (!isScalarEpilogueAllowed()) {
    LLVM_DEBUG(
        dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is "
                  "allowed.\n";);
    return Result;
  }

  // Not really a cost consideration, but check for unsupported cases here to
  // simplify the logic.
  if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) {
    LLVM_DEBUG(
        dbgs() << "LEV: Unable to vectorize epilogue because the loop is "
                  "not a supported candidate.\n";);
    return Result;
  }

  if (EpilogueVectorizationForceVF > 1) {
    LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";);
    ElementCount ForcedEC =
        ElementCount::getFixed(EpilogueVectorizationForceVF);
    if (LVP.hasPlanWithVF(ForcedEC))
      return {ForcedEC, 0};
    else {
      LLVM_DEBUG(
          dbgs()
          << "LEV: Epilogue vectorization forced factor is not viable.\n";);
      return Result;
    }
  }

  if (TheLoop->getHeader()->getParent()->hasOptSize() ||
      TheLoop->getHeader()->getParent()->hasMinSize()) {
    LLVM_DEBUG(
        dbgs()
        << "LEV: Epilogue vectorization skipped due to opt for size.\n";);
    return Result;
  }

  auto FixedMainLoopVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue());
  if (MainLoopVF.isScalable())
    LLVM_DEBUG(
        dbgs() << "LEV: Epilogue vectorization using scalable vectors not "
                  "yet supported. 
Converting to fixed-width (VF=" 5956 << FixedMainLoopVF << ") instead\n"); 5957 5958 if (!isEpilogueVectorizationProfitable(FixedMainLoopVF)) { 5959 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for " 5960 "this loop\n"); 5961 return Result; 5962 } 5963 5964 for (auto &NextVF : ProfitableVFs) 5965 if (ElementCount::isKnownLT(NextVF.Width, FixedMainLoopVF) && 5966 (Result.Width.getFixedValue() == 1 || 5967 isMoreProfitable(NextVF, Result)) && 5968 LVP.hasPlanWithVF(NextVF.Width)) 5969 Result = NextVF; 5970 5971 if (Result != VectorizationFactor::Disabled()) 5972 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = " 5973 << Result.Width.getFixedValue() << "\n";); 5974 return Result; 5975 } 5976 5977 std::pair<unsigned, unsigned> 5978 LoopVectorizationCostModel::getSmallestAndWidestTypes() { 5979 unsigned MinWidth = -1U; 5980 unsigned MaxWidth = 8; 5981 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 5982 for (Type *T : ElementTypesInLoop) { 5983 MinWidth = std::min<unsigned>( 5984 MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize()); 5985 MaxWidth = std::max<unsigned>( 5986 MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize()); 5987 } 5988 return {MinWidth, MaxWidth}; 5989 } 5990 5991 void LoopVectorizationCostModel::collectElementTypesForWidening() { 5992 ElementTypesInLoop.clear(); 5993 // For each block. 5994 for (BasicBlock *BB : TheLoop->blocks()) { 5995 // For each instruction in the loop. 5996 for (Instruction &I : BB->instructionsWithoutDebug()) { 5997 Type *T = I.getType(); 5998 5999 // Skip ignored values. 6000 if (ValuesToIgnore.count(&I)) 6001 continue; 6002 6003 // Only examine Loads, Stores and PHINodes. 6004 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I)) 6005 continue; 6006 6007 // Examine PHI nodes that are reduction variables. Update the type to 6008 // account for the recurrence type. 6009 if (auto *PN = dyn_cast<PHINode>(&I)) { 6010 if (!Legal->isReductionVariable(PN)) 6011 continue; 6012 const RecurrenceDescriptor &RdxDesc = 6013 Legal->getReductionVars().find(PN)->second; 6014 if (PreferInLoopReductions || useOrderedReductions(RdxDesc) || 6015 TTI.preferInLoopReduction(RdxDesc.getOpcode(), 6016 RdxDesc.getRecurrenceType(), 6017 TargetTransformInfo::ReductionFlags())) 6018 continue; 6019 T = RdxDesc.getRecurrenceType(); 6020 } 6021 6022 // Examine the stored values. 6023 if (auto *ST = dyn_cast<StoreInst>(&I)) 6024 T = ST->getValueOperand()->getType(); 6025 6026 // Ignore loaded pointer types and stored pointer types that are not 6027 // vectorizable. 6028 // 6029 // FIXME: The check here attempts to predict whether a load or store will 6030 // be vectorized. We only know this for certain after a VF has 6031 // been selected. Here, we assume that if an access can be 6032 // vectorized, it will be. We should also look at extending this 6033 // optimization to non-pointer types. 6034 // 6035 if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) && 6036 !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I)) 6037 continue; 6038 6039 ElementTypesInLoop.insert(T); 6040 } 6041 } 6042 } 6043 6044 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, 6045 unsigned LoopCost) { 6046 // -- The interleave heuristics -- 6047 // We interleave the loop in order to expose ILP and reduce the loop overhead. 6048 // There are many micro-architectural considerations that we can't predict 6049 // at this level. 
// For example, frontend pressure (on decode or fetch) due to
  // code size, or the number and capabilities of the execution ports.
  //
  // We use the following heuristics to select the interleave count:
  // 1. If the code has reductions, then we interleave to break the cross
  //    iteration dependency.
  // 2. If the loop is really small, then we interleave to reduce the loop
  //    overhead.
  // 3. We don't interleave if we think that we will spill registers to memory
  //    due to the increased register pressure.

  if (!isScalarEpilogueAllowed())
    return 1;

  // The maximum safe dependence distance was already used to limit the VF, so
  // do not interleave in that case.
  if (Legal->getMaxSafeDepDistBytes() != -1U)
    return 1;

  auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
  const bool HasReductions = !Legal->getReductionVars().empty();
  // Do not interleave loops with a relatively small known or estimated trip
  // count. But we will interleave when InterleaveSmallLoopScalarReduction is
  // enabled, and the code has scalar reductions (HasReductions && VF == 1),
  // because with the above conditions interleaving can expose ILP and break
  // cross iteration dependences for reductions.
  if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) &&
      !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar()))
    return 1;

  RegisterUsage R = calculateRegisterUsage({VF})[0];
  // We divide by these constants so assume that we have at least one
  // instruction that uses at least one register.
  for (auto &pair : R.MaxLocalUsers) {
    pair.second = std::max(pair.second, 1U);
  }

  // We calculate the interleave count using the following formula.
  // Subtract the number of loop invariants from the number of available
  // registers. These registers are used by all of the interleaved instances.
  // Next, divide the remaining registers by the number of registers that is
  // required by the loop, in order to estimate how many parallel instances
  // fit without causing spills. All of this is rounded down if necessary to be
  // a power of two. We want a power-of-two interleave count to simplify any
  // addressing operations or alignment considerations.
  // We also want a power-of-two interleave count to ensure that the induction
  // variable of the vector loop wraps to zero when the tail is folded by
  // masking; this currently happens when OptForSize, in which case IC is set
  // to 1 above.
  unsigned IC = UINT_MAX;

  for (auto &pair : R.MaxLocalUsers) {
    unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
    LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
                      << " registers of "
                      << TTI.getRegisterClassName(pair.first)
                      << " register class\n");
    if (VF.isScalar()) {
      if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
        TargetNumRegisters = ForceTargetNumScalarRegs;
    } else {
      if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
        TargetNumRegisters = ForceTargetNumVectorRegs;
    }
    unsigned MaxLocalUsers = pair.second;
    unsigned LoopInvariantRegs = 0;
    if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
      LoopInvariantRegs = R.LoopInvariantRegs[pair.first];

    unsigned TmpIC =
        PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers);
    // Don't count the induction variable as interleaved.
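    // Illustrative example (hypothetical numbers, not from the original
    // source): with 32 registers in this class, 2 of them loop-invariant and
    // at most 5 local users, the plain formula above gives
    //   PowerOf2Floor((32 - 2) / 5) = PowerOf2Floor(6) = 4,
    // while the induction-variable-aware variant below gives
    //   PowerOf2Floor((32 - 2 - 1) / (5 - 1)) = PowerOf2Floor(7) = 4.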
    if (EnableIndVarRegisterHeur) {
      TmpIC =
          PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
                        std::max(1U, (MaxLocalUsers - 1)));
    }

    IC = std::min(IC, TmpIC);
  }

  // Clamp the interleave ranges to reasonable counts.
  unsigned MaxInterleaveCount =
      TTI.getMaxInterleaveFactor(VF.getKnownMinValue());

  // Check if the user has overridden the max.
  if (VF.isScalar()) {
    if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
      MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
  } else {
    if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
      MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
  }

  // If the trip count is a known or estimated compile-time constant, limit the
  // interleave count so that it does not exceed the trip count divided by VF,
  // provided it is at least 1.
  //
  // For scalable vectors we can't know if interleaving is beneficial. It may
  // not be beneficial for small loops if none of the lanes in the second
  // vector iteration are enabled. However, for larger loops, there is likely
  // to be a similar benefit as for fixed-width vectors. For now, we choose to
  // leave the InterleaveCount as if vscale is '1', although if some
  // information about the vector is known (e.g. min vector size), we can make
  // a better decision.
  if (BestKnownTC) {
    MaxInterleaveCount =
        std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount);
    // Make sure MaxInterleaveCount is greater than 0.
    MaxInterleaveCount = std::max(1u, MaxInterleaveCount);
  }

  assert(MaxInterleaveCount > 0 &&
         "Maximum interleave count must be greater than 0");

  // Clamp the calculated IC to be between 1 and the max interleave count
  // that the target and trip count allow.
  if (IC > MaxInterleaveCount)
    IC = MaxInterleaveCount;
  else
    // Make sure IC is greater than 0.
    IC = std::max(1u, IC);

  assert(IC > 0 && "Interleave count must be greater than 0.");

  // If we did not calculate the cost for VF (because the user selected the VF)
  // then we calculate the cost of VF here.
  if (LoopCost == 0) {
    InstructionCost C = expectedCost(VF).first;
    assert(C.isValid() && "Expected to have chosen a VF with valid cost");
    LoopCost = *C.getValue();
  }

  assert(LoopCost && "Non-zero loop cost expected");

  // Interleave if we vectorized this loop and there is a reduction that could
  // benefit from interleaving.
  if (VF.isVector() && HasReductions) {
    LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
    return IC;
  }

  // Note that if we've already vectorized the loop we will have done the
  // runtime check and so interleaving won't require further checks.
  bool InterleavingRequiresRuntimePointerCheck =
      (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);

  // We want to interleave small loops in order to reduce the loop overhead and
  // potentially expose ILP opportunities.
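  // Illustrative example (hypothetical numbers, not from the original source):
  // with SmallLoopCost = 20 and LoopCost = 4, the small-loop path below
  // computes SmallIC = min(IC, PowerOf2Floor(20 / 4)) = min(IC, 4), i.e. we
  // interleave until the loop overhead is roughly 5% of the cost of the loop.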
6193 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n' 6194 << "LV: IC is " << IC << '\n' 6195 << "LV: VF is " << VF << '\n'); 6196 const bool AggressivelyInterleaveReductions = 6197 TTI.enableAggressiveInterleaving(HasReductions); 6198 if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) { 6199 // We assume that the cost overhead is 1 and we use the cost model 6200 // to estimate the cost of the loop and interleave until the cost of the 6201 // loop overhead is about 5% of the cost of the loop. 6202 unsigned SmallIC = 6203 std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost)); 6204 6205 // Interleave until store/load ports (estimated by max interleave count) are 6206 // saturated. 6207 unsigned NumStores = Legal->getNumStores(); 6208 unsigned NumLoads = Legal->getNumLoads(); 6209 unsigned StoresIC = IC / (NumStores ? NumStores : 1); 6210 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1); 6211 6212 // There is little point in interleaving for reductions containing selects 6213 // and compares when VF=1 since it may just create more overhead than it's 6214 // worth for loops with small trip counts. This is because we still have to 6215 // do the final reduction after the loop. 6216 bool HasSelectCmpReductions = 6217 HasReductions && 6218 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 6219 const RecurrenceDescriptor &RdxDesc = Reduction.second; 6220 return RecurrenceDescriptor::isSelectCmpRecurrenceKind( 6221 RdxDesc.getRecurrenceKind()); 6222 }); 6223 if (HasSelectCmpReductions) { 6224 LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n"); 6225 return 1; 6226 } 6227 6228 // If we have a scalar reduction (vector reductions are already dealt with 6229 // by this point), we can increase the critical path length if the loop 6230 // we're interleaving is inside another loop. For tree-wise reductions 6231 // set the limit to 2, and for ordered reductions it's best to disable 6232 // interleaving entirely. 6233 if (HasReductions && TheLoop->getLoopDepth() > 1) { 6234 bool HasOrderedReductions = 6235 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 6236 const RecurrenceDescriptor &RdxDesc = Reduction.second; 6237 return RdxDesc.isOrdered(); 6238 }); 6239 if (HasOrderedReductions) { 6240 LLVM_DEBUG( 6241 dbgs() << "LV: Not interleaving scalar ordered reductions.\n"); 6242 return 1; 6243 } 6244 6245 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC); 6246 SmallIC = std::min(SmallIC, F); 6247 StoresIC = std::min(StoresIC, F); 6248 LoadsIC = std::min(LoadsIC, F); 6249 } 6250 6251 if (EnableLoadStoreRuntimeInterleave && 6252 std::max(StoresIC, LoadsIC) > SmallIC) { 6253 LLVM_DEBUG( 6254 dbgs() << "LV: Interleaving to saturate store or load ports.\n"); 6255 return std::max(StoresIC, LoadsIC); 6256 } 6257 6258 // If there are scalar reductions and TTI has enabled aggressive 6259 // interleaving for reductions, we will interleave to expose ILP. 6260 if (InterleaveSmallLoopScalarReduction && VF.isScalar() && 6261 AggressivelyInterleaveReductions) { 6262 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 6263 // Interleave no less than SmallIC but not as aggressive as the normal IC 6264 // to satisfy the rare situation when resources are too limited. 
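      // Illustrative example (hypothetical numbers, not from the original
      // source): with IC = 8 and SmallIC = 2, the statement below returns
      // max(8 / 2, 2) = 4, i.e. half the register-pressure-based count but
      // never less than SmallIC.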
      return std::max(IC / 2, SmallIC);
    } else {
      LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
      return SmallIC;
    }
  }

  // Interleave if this is a large loop (small loops are already dealt with by
  // this point) that could benefit from interleaving.
  if (AggressivelyInterleaveReductions) {
    LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
    return IC;
  }

  LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
  return 1;
}

SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
  // This function calculates the register usage by measuring the highest
  // number of values that are alive at a single location. Obviously, this is a
  // very rough estimation. We scan the loop in topological order and assign a
  // number to each instruction. We use RPO to ensure that defs are encountered
  // before their users. We assume that each instruction that has in-loop users
  // starts an interval. We record every time that an in-loop value is used, so
  // we have a list of the first and last occurrences of each instruction.
  // Next, we transpose this data structure into a multi map that holds the
  // list of intervals that *end* at a specific location. This multi map allows
  // us to perform a linear search. We scan the instructions linearly and
  // record each time that a new interval starts, by placing it in a set. If we
  // find this value in the multi-map then we remove it from the set. The max
  // register usage is the maximum size of the set. We also search for
  // instructions that are defined outside the loop, but are used inside the
  // loop. We need this number separately from the max-interval usage number
  // because, when we unroll, loop-invariant values do not take more registers.
  LoopBlocksDFS DFS(TheLoop);
  DFS.perform(LI);

  RegisterUsage RU;

  // Each 'key' in the map opens a new interval. The values
  // of the map are the index of the 'last seen' usage of the
  // instruction that is the key.
  using IntervalMap = DenseMap<Instruction *, unsigned>;

  // Maps instruction to its index.
  SmallVector<Instruction *, 64> IdxToInstr;
  // Marks the end of each interval.
  IntervalMap EndPoint;
  // Saves the list of instruction indices that are used in the loop.
  SmallPtrSet<Instruction *, 8> Ends;
  // Saves the list of values that are used in the loop but are
  // defined outside the loop, such as arguments and constants.
  SmallPtrSet<Value *, 8> LoopInvariants;

  for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
    for (Instruction &I : BB->instructionsWithoutDebug()) {
      IdxToInstr.push_back(&I);

      // Save the end location of each USE.
      for (Value *U : I.operands()) {
        auto *Instr = dyn_cast<Instruction>(U);

        // Ignore non-instruction values such as arguments, constants, etc.
        if (!Instr)
          continue;

        // If this instruction is outside the loop then record it and continue.
        if (!TheLoop->contains(Instr)) {
          LoopInvariants.insert(Instr);
          continue;
        }

        // Overwrite previous end points.
        EndPoint[Instr] = IdxToInstr.size();
        Ends.insert(Instr);
      }
    }
  }

  // Saves the list of intervals that end with the index in 'key'.
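  //
  // Illustrative example (hypothetical values, not from the original source):
  // if EndPoint is {a -> 3, b -> 3, c -> 5}, the transposed map built below is
  // {3 -> [a, b], 5 -> [c]}, so when the linear scan reaches index 3 both a
  // and b are removed from the set of open intervals.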
6348 using InstrList = SmallVector<Instruction *, 2>; 6349 DenseMap<unsigned, InstrList> TransposeEnds; 6350 6351 // Transpose the EndPoints to a list of values that end at each index. 6352 for (auto &Interval : EndPoint) 6353 TransposeEnds[Interval.second].push_back(Interval.first); 6354 6355 SmallPtrSet<Instruction *, 8> OpenIntervals; 6356 SmallVector<RegisterUsage, 8> RUs(VFs.size()); 6357 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size()); 6358 6359 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); 6360 6361 // A lambda that gets the register usage for the given type and VF. 6362 const auto &TTICapture = TTI; 6363 auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned { 6364 if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty)) 6365 return 0; 6366 InstructionCost::CostType RegUsage = 6367 *TTICapture.getRegUsageForType(VectorType::get(Ty, VF)).getValue(); 6368 assert(RegUsage >= 0 && RegUsage <= std::numeric_limits<unsigned>::max() && 6369 "Nonsensical values for register usage."); 6370 return RegUsage; 6371 }; 6372 6373 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { 6374 Instruction *I = IdxToInstr[i]; 6375 6376 // Remove all of the instructions that end at this location. 6377 InstrList &List = TransposeEnds[i]; 6378 for (Instruction *ToRemove : List) 6379 OpenIntervals.erase(ToRemove); 6380 6381 // Ignore instructions that are never used within the loop. 6382 if (!Ends.count(I)) 6383 continue; 6384 6385 // Skip ignored values. 6386 if (ValuesToIgnore.count(I)) 6387 continue; 6388 6389 // For each VF find the maximum usage of registers. 6390 for (unsigned j = 0, e = VFs.size(); j < e; ++j) { 6391 // Count the number of live intervals. 6392 SmallMapVector<unsigned, unsigned, 4> RegUsage; 6393 6394 if (VFs[j].isScalar()) { 6395 for (auto Inst : OpenIntervals) { 6396 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6397 if (RegUsage.find(ClassID) == RegUsage.end()) 6398 RegUsage[ClassID] = 1; 6399 else 6400 RegUsage[ClassID] += 1; 6401 } 6402 } else { 6403 collectUniformsAndScalars(VFs[j]); 6404 for (auto Inst : OpenIntervals) { 6405 // Skip ignored values for VF > 1. 6406 if (VecValuesToIgnore.count(Inst)) 6407 continue; 6408 if (isScalarAfterVectorization(Inst, VFs[j])) { 6409 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6410 if (RegUsage.find(ClassID) == RegUsage.end()) 6411 RegUsage[ClassID] = 1; 6412 else 6413 RegUsage[ClassID] += 1; 6414 } else { 6415 unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType()); 6416 if (RegUsage.find(ClassID) == RegUsage.end()) 6417 RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]); 6418 else 6419 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); 6420 } 6421 } 6422 } 6423 6424 for (auto& pair : RegUsage) { 6425 if (MaxUsages[j].find(pair.first) != MaxUsages[j].end()) 6426 MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second); 6427 else 6428 MaxUsages[j][pair.first] = pair.second; 6429 } 6430 } 6431 6432 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " 6433 << OpenIntervals.size() << '\n'); 6434 6435 // Add the current instruction to the list of open intervals. 6436 OpenIntervals.insert(I); 6437 } 6438 6439 for (unsigned i = 0, e = VFs.size(); i < e; ++i) { 6440 SmallMapVector<unsigned, unsigned, 4> Invariant; 6441 6442 for (auto Inst : LoopInvariants) { 6443 unsigned Usage = 6444 VFs[i].isScalar() ? 
1 : GetRegUsage(Inst->getType(), VFs[i]); 6445 unsigned ClassID = 6446 TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType()); 6447 if (Invariant.find(ClassID) == Invariant.end()) 6448 Invariant[ClassID] = Usage; 6449 else 6450 Invariant[ClassID] += Usage; 6451 } 6452 6453 LLVM_DEBUG({ 6454 dbgs() << "LV(REG): VF = " << VFs[i] << '\n'; 6455 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size() 6456 << " item\n"; 6457 for (const auto &pair : MaxUsages[i]) { 6458 dbgs() << "LV(REG): RegisterClass: " 6459 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6460 << " registers\n"; 6461 } 6462 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size() 6463 << " item\n"; 6464 for (const auto &pair : Invariant) { 6465 dbgs() << "LV(REG): RegisterClass: " 6466 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6467 << " registers\n"; 6468 } 6469 }); 6470 6471 RU.LoopInvariantRegs = Invariant; 6472 RU.MaxLocalUsers = MaxUsages[i]; 6473 RUs[i] = RU; 6474 } 6475 6476 return RUs; 6477 } 6478 6479 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){ 6480 // TODO: Cost model for emulated masked load/store is completely 6481 // broken. This hack guides the cost model to use an artificially 6482 // high enough value to practically disable vectorization with such 6483 // operations, except where previously deployed legality hack allowed 6484 // using very low cost values. This is to avoid regressions coming simply 6485 // from moving "masked load/store" check from legality to cost model. 6486 // Masked Load/Gather emulation was previously never allowed. 6487 // Limited number of Masked Store/Scatter emulation was allowed. 6488 assert(isPredicatedInst(I) && 6489 "Expecting a scalar emulated instruction"); 6490 return isa<LoadInst>(I) || 6491 (isa<StoreInst>(I) && 6492 NumPredStores > NumberOfStoresToPredicate); 6493 } 6494 6495 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) { 6496 // If we aren't vectorizing the loop, or if we've already collected the 6497 // instructions to scalarize, there's nothing to do. Collection may already 6498 // have occurred if we have a user-selected VF and are now computing the 6499 // expected cost for interleaving. 6500 if (VF.isScalar() || VF.isZero() || 6501 InstsToScalarize.find(VF) != InstsToScalarize.end()) 6502 return; 6503 6504 // Initialize a mapping for VF in InstsToScalalarize. If we find that it's 6505 // not profitable to scalarize any instructions, the presence of VF in the 6506 // map will indicate that we've analyzed it already. 6507 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF]; 6508 6509 // Find all the instructions that are scalar with predication in the loop and 6510 // determine if it would be better to not if-convert the blocks they are in. 6511 // If so, we also record the instructions to scalarize. 6512 for (BasicBlock *BB : TheLoop->blocks()) { 6513 if (!blockNeedsPredicationForAnyReason(BB)) 6514 continue; 6515 for (Instruction &I : *BB) 6516 if (isScalarWithPredication(&I)) { 6517 ScalarCostsTy ScalarCosts; 6518 // Do not apply discount if scalable, because that would lead to 6519 // invalid scalarization costs. 6520 // Do not apply discount logic if hacked cost is needed 6521 // for emulated masked memrefs. 6522 if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I) && 6523 computePredInstDiscount(&I, ScalarCosts, VF) >= 0) 6524 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end()); 6525 // Remember that BB will remain after vectorization. 
6526 PredicatedBBsAfterVectorization.insert(BB); 6527 } 6528 } 6529 } 6530 6531 int LoopVectorizationCostModel::computePredInstDiscount( 6532 Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) { 6533 assert(!isUniformAfterVectorization(PredInst, VF) && 6534 "Instruction marked uniform-after-vectorization will be predicated"); 6535 6536 // Initialize the discount to zero, meaning that the scalar version and the 6537 // vector version cost the same. 6538 InstructionCost Discount = 0; 6539 6540 // Holds instructions to analyze. The instructions we visit are mapped in 6541 // ScalarCosts. Those instructions are the ones that would be scalarized if 6542 // we find that the scalar version costs less. 6543 SmallVector<Instruction *, 8> Worklist; 6544 6545 // Returns true if the given instruction can be scalarized. 6546 auto canBeScalarized = [&](Instruction *I) -> bool { 6547 // We only attempt to scalarize instructions forming a single-use chain 6548 // from the original predicated block that would otherwise be vectorized. 6549 // Although not strictly necessary, we give up on instructions we know will 6550 // already be scalar to avoid traversing chains that are unlikely to be 6551 // beneficial. 6552 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() || 6553 isScalarAfterVectorization(I, VF)) 6554 return false; 6555 6556 // If the instruction is scalar with predication, it will be analyzed 6557 // separately. We ignore it within the context of PredInst. 6558 if (isScalarWithPredication(I)) 6559 return false; 6560 6561 // If any of the instruction's operands are uniform after vectorization, 6562 // the instruction cannot be scalarized. This prevents, for example, a 6563 // masked load from being scalarized. 6564 // 6565 // We assume we will only emit a value for lane zero of an instruction 6566 // marked uniform after vectorization, rather than VF identical values. 6567 // Thus, if we scalarize an instruction that uses a uniform, we would 6568 // create uses of values corresponding to the lanes we aren't emitting code 6569 // for. This behavior can be changed by allowing getScalarValue to clone 6570 // the lane zero values for uniforms rather than asserting. 6571 for (Use &U : I->operands()) 6572 if (auto *J = dyn_cast<Instruction>(U.get())) 6573 if (isUniformAfterVectorization(J, VF)) 6574 return false; 6575 6576 // Otherwise, we can scalarize the instruction. 6577 return true; 6578 }; 6579 6580 // Compute the expected cost discount from scalarizing the entire expression 6581 // feeding the predicated instruction. We currently only consider expressions 6582 // that are single-use instruction chains. 6583 Worklist.push_back(PredInst); 6584 while (!Worklist.empty()) { 6585 Instruction *I = Worklist.pop_back_val(); 6586 6587 // If we've already analyzed the instruction, there's nothing to do. 6588 if (ScalarCosts.find(I) != ScalarCosts.end()) 6589 continue; 6590 6591 // Compute the cost of the vector instruction. Note that this cost already 6592 // includes the scalarization overhead of the predicated instruction. 6593 InstructionCost VectorCost = getInstructionCost(I, VF).first; 6594 6595 // Compute the cost of the scalarized instruction. This cost is the cost of 6596 // the instruction as if it wasn't if-converted and instead remained in the 6597 // predicated block. We will scale this cost by block probability after 6598 // computing the scalarization overhead. 
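    // Illustrative example (hypothetical numbers, not from the original
    // source, ignoring the insert/extract overhead for brevity): for VF = 4, a
    // vector cost of 12 and a per-lane scalar cost of 2, the scalar estimate
    // starts at 4 * 2 = 8, is divided by the reciprocal block probability (2)
    // to give 4, and the resulting discount of 12 - 4 = 8 favours scalarizing
    // this chain.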
6599 InstructionCost ScalarCost = 6600 VF.getFixedValue() * 6601 getInstructionCost(I, ElementCount::getFixed(1)).first; 6602 6603 // Compute the scalarization overhead of needed insertelement instructions 6604 // and phi nodes. 6605 if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) { 6606 ScalarCost += TTI.getScalarizationOverhead( 6607 cast<VectorType>(ToVectorTy(I->getType(), VF)), 6608 APInt::getAllOnes(VF.getFixedValue()), true, false); 6609 ScalarCost += 6610 VF.getFixedValue() * 6611 TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput); 6612 } 6613 6614 // Compute the scalarization overhead of needed extractelement 6615 // instructions. For each of the instruction's operands, if the operand can 6616 // be scalarized, add it to the worklist; otherwise, account for the 6617 // overhead. 6618 for (Use &U : I->operands()) 6619 if (auto *J = dyn_cast<Instruction>(U.get())) { 6620 assert(VectorType::isValidElementType(J->getType()) && 6621 "Instruction has non-scalar type"); 6622 if (canBeScalarized(J)) 6623 Worklist.push_back(J); 6624 else if (needsExtract(J, VF)) { 6625 ScalarCost += TTI.getScalarizationOverhead( 6626 cast<VectorType>(ToVectorTy(J->getType(), VF)), 6627 APInt::getAllOnes(VF.getFixedValue()), false, true); 6628 } 6629 } 6630 6631 // Scale the total scalar cost by block probability. 6632 ScalarCost /= getReciprocalPredBlockProb(); 6633 6634 // Compute the discount. A non-negative discount means the vector version 6635 // of the instruction costs more, and scalarizing would be beneficial. 6636 Discount += VectorCost - ScalarCost; 6637 ScalarCosts[I] = ScalarCost; 6638 } 6639 6640 return *Discount.getValue(); 6641 } 6642 6643 LoopVectorizationCostModel::VectorizationCostTy 6644 LoopVectorizationCostModel::expectedCost( 6645 ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) { 6646 VectorizationCostTy Cost; 6647 6648 // For each block. 6649 for (BasicBlock *BB : TheLoop->blocks()) { 6650 VectorizationCostTy BlockCost; 6651 6652 // For each instruction in the old loop. 6653 for (Instruction &I : BB->instructionsWithoutDebug()) { 6654 // Skip ignored values. 6655 if (ValuesToIgnore.count(&I) || 6656 (VF.isVector() && VecValuesToIgnore.count(&I))) 6657 continue; 6658 6659 VectorizationCostTy C = getInstructionCost(&I, VF); 6660 6661 // Check if we should override the cost. 6662 if (C.first.isValid() && 6663 ForceTargetInstructionCost.getNumOccurrences() > 0) 6664 C.first = InstructionCost(ForceTargetInstructionCost); 6665 6666 // Keep a list of instructions with invalid costs. 6667 if (Invalid && !C.first.isValid()) 6668 Invalid->emplace_back(&I, VF); 6669 6670 BlockCost.first += C.first; 6671 BlockCost.second |= C.second; 6672 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first 6673 << " for VF " << VF << " For instruction: " << I 6674 << '\n'); 6675 } 6676 6677 // If we are vectorizing a predicated block, it will have been 6678 // if-converted. This means that the block's instructions (aside from 6679 // stores and instructions that may divide by zero) will now be 6680 // unconditionally executed. For the scalar case, we may not always execute 6681 // the predicated block, if it is an if-else block. Thus, scale the block's 6682 // cost by the probability of executing it. blockNeedsPredication from 6683 // Legal is used so as to not include all blocks in tail folded loops. 
6684 if (VF.isScalar() && Legal->blockNeedsPredication(BB)) 6685 BlockCost.first /= getReciprocalPredBlockProb(); 6686 6687 Cost.first += BlockCost.first; 6688 Cost.second |= BlockCost.second; 6689 } 6690 6691 return Cost; 6692 } 6693 6694 /// Gets Address Access SCEV after verifying that the access pattern 6695 /// is loop invariant except the induction variable dependence. 6696 /// 6697 /// This SCEV can be sent to the Target in order to estimate the address 6698 /// calculation cost. 6699 static const SCEV *getAddressAccessSCEV( 6700 Value *Ptr, 6701 LoopVectorizationLegality *Legal, 6702 PredicatedScalarEvolution &PSE, 6703 const Loop *TheLoop) { 6704 6705 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); 6706 if (!Gep) 6707 return nullptr; 6708 6709 // We are looking for a gep with all loop invariant indices except for one 6710 // which should be an induction variable. 6711 auto SE = PSE.getSE(); 6712 unsigned NumOperands = Gep->getNumOperands(); 6713 for (unsigned i = 1; i < NumOperands; ++i) { 6714 Value *Opd = Gep->getOperand(i); 6715 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 6716 !Legal->isInductionVariable(Opd)) 6717 return nullptr; 6718 } 6719 6720 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. 6721 return PSE.getSCEV(Ptr); 6722 } 6723 6724 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { 6725 return Legal->hasStride(I->getOperand(0)) || 6726 Legal->hasStride(I->getOperand(1)); 6727 } 6728 6729 InstructionCost 6730 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, 6731 ElementCount VF) { 6732 assert(VF.isVector() && 6733 "Scalarization cost of instruction implies vectorization."); 6734 if (VF.isScalable()) 6735 return InstructionCost::getInvalid(); 6736 6737 Type *ValTy = getLoadStoreType(I); 6738 auto SE = PSE.getSE(); 6739 6740 unsigned AS = getLoadStoreAddressSpace(I); 6741 Value *Ptr = getLoadStorePointerOperand(I); 6742 Type *PtrTy = ToVectorTy(Ptr->getType(), VF); 6743 // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost` 6744 // that it is being called from this specific place. 6745 6746 // Figure out whether the access is strided and get the stride value 6747 // if it's known in compile time 6748 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); 6749 6750 // Get the cost of the scalar memory instruction and address computation. 6751 InstructionCost Cost = 6752 VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); 6753 6754 // Don't pass *I here, since it is scalar but will actually be part of a 6755 // vectorized loop where the user of it is a vectorized instruction. 6756 const Align Alignment = getLoadStoreAlignment(I); 6757 Cost += VF.getKnownMinValue() * 6758 TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, 6759 AS, TTI::TCK_RecipThroughput); 6760 6761 // Get the overhead of the extractelement and insertelement instructions 6762 // we might create due to scalarization. 6763 Cost += getScalarizationOverhead(I, VF); 6764 6765 // If we have a predicated load/store, it will need extra i1 extracts and 6766 // conditional branches, but may not be executed for each vector lane. Scale 6767 // the cost by the probability of executing the predicated block. 
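// Rough shape of the final cost of a predicated access (illustrative only):
//   (VF * (address computation + scalar memory op) + insert/extract overhead)
//     / reciprocal-block-probability
//   + the i1 extract and branch overhead added below.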
6768 if (isPredicatedInst(I)) { 6769 Cost /= getReciprocalPredBlockProb(); 6770 6771 // Add the cost of an i1 extract and a branch 6772 auto *Vec_i1Ty = 6773 VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF); 6774 Cost += TTI.getScalarizationOverhead( 6775 Vec_i1Ty, APInt::getAllOnes(VF.getKnownMinValue()), 6776 /*Insert=*/false, /*Extract=*/true); 6777 Cost += TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput); 6778 6779 if (useEmulatedMaskMemRefHack(I)) 6780 // Artificially setting to a high enough value to practically disable 6781 // vectorization with such operations. 6782 Cost = 3000000; 6783 } 6784 6785 return Cost; 6786 } 6787 6788 InstructionCost 6789 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, 6790 ElementCount VF) { 6791 Type *ValTy = getLoadStoreType(I); 6792 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6793 Value *Ptr = getLoadStorePointerOperand(I); 6794 unsigned AS = getLoadStoreAddressSpace(I); 6795 int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr); 6796 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6797 6798 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6799 "Stride should be 1 or -1 for consecutive memory access"); 6800 const Align Alignment = getLoadStoreAlignment(I); 6801 InstructionCost Cost = 0; 6802 if (Legal->isMaskRequired(I)) 6803 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6804 CostKind); 6805 else 6806 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6807 CostKind, I); 6808 6809 bool Reverse = ConsecutiveStride < 0; 6810 if (Reverse) 6811 Cost += 6812 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0); 6813 return Cost; 6814 } 6815 6816 InstructionCost 6817 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, 6818 ElementCount VF) { 6819 assert(Legal->isUniformMemOp(*I)); 6820 6821 Type *ValTy = getLoadStoreType(I); 6822 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6823 const Align Alignment = getLoadStoreAlignment(I); 6824 unsigned AS = getLoadStoreAddressSpace(I); 6825 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6826 if (isa<LoadInst>(I)) { 6827 return TTI.getAddressComputationCost(ValTy) + 6828 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS, 6829 CostKind) + 6830 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); 6831 } 6832 StoreInst *SI = cast<StoreInst>(I); 6833 6834 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); 6835 return TTI.getAddressComputationCost(ValTy) + 6836 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, 6837 CostKind) + 6838 (isLoopInvariantStoreValue 6839 ? 
0 6840 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, 6841 VF.getKnownMinValue() - 1)); 6842 } 6843 6844 InstructionCost 6845 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, 6846 ElementCount VF) { 6847 Type *ValTy = getLoadStoreType(I); 6848 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6849 const Align Alignment = getLoadStoreAlignment(I); 6850 const Value *Ptr = getLoadStorePointerOperand(I); 6851 6852 return TTI.getAddressComputationCost(VectorTy) + 6853 TTI.getGatherScatterOpCost( 6854 I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment, 6855 TargetTransformInfo::TCK_RecipThroughput, I); 6856 } 6857 6858 InstructionCost 6859 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, 6860 ElementCount VF) { 6861 // TODO: Once we have support for interleaving with scalable vectors 6862 // we can calculate the cost properly here. 6863 if (VF.isScalable()) 6864 return InstructionCost::getInvalid(); 6865 6866 Type *ValTy = getLoadStoreType(I); 6867 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6868 unsigned AS = getLoadStoreAddressSpace(I); 6869 6870 auto Group = getInterleavedAccessGroup(I); 6871 assert(Group && "Fail to get an interleaved access group."); 6872 6873 unsigned InterleaveFactor = Group->getFactor(); 6874 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); 6875 6876 // Holds the indices of existing members in the interleaved group. 6877 SmallVector<unsigned, 4> Indices; 6878 for (unsigned IF = 0; IF < InterleaveFactor; IF++) 6879 if (Group->getMember(IF)) 6880 Indices.push_back(IF); 6881 6882 // Calculate the cost of the whole interleaved group. 6883 bool UseMaskForGaps = 6884 (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) || 6885 (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor())); 6886 InstructionCost Cost = TTI.getInterleavedMemoryOpCost( 6887 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(), 6888 AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps); 6889 6890 if (Group->isReverse()) { 6891 // TODO: Add support for reversed masked interleaved access. 6892 assert(!Legal->isMaskRequired(I) && 6893 "Reverse masked interleaved access not supported."); 6894 Cost += 6895 Group->getNumMembers() * 6896 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0); 6897 } 6898 return Cost; 6899 } 6900 6901 Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost( 6902 Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) { 6903 using namespace llvm::PatternMatch; 6904 // Early exit for no inloop reductions 6905 if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty)) 6906 return None; 6907 auto *VectorTy = cast<VectorType>(Ty); 6908 6909 // We are looking for one of the following patterns, finding the minimal acceptable cost: 6910 // reduce(mul(ext(A), ext(B))) or 6911 // reduce(mul(A, B)) or 6912 // reduce(ext(A)) or 6913 // reduce(A). 6914 // The basic idea is that we walk down the tree to do that, finding the root 6915 // reduction instruction in InLoopReductionImmediateChains. From there we find 6916 // the pattern of mul/ext and test the cost of the entire pattern vs the cost 6917 // of the components. If the reduction cost is lower, then we return it for the 6918 // reduction instruction and 0 for the other instructions in the pattern. If 6919 // it is not, we return an invalid cost specifying the original cost method 6920 // should be used.
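// For example (illustrative IR), with i16 inputs accumulated into an i32:
//   %a = sext i16 %x to i32
//   %b = sext i16 %y to i32
//   %m = mul i32 %a, %b
//   %r = add i32 %acc, %m
// may be costed as a single extended multiply-add reduction (via
// TTI::getExtendedAddReductionCost below) instead of separate ext/mul/add.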
6921 Instruction *RetI = I; 6922 if (match(RetI, m_ZExtOrSExt(m_Value()))) { 6923 if (!RetI->hasOneUser()) 6924 return None; 6925 RetI = RetI->user_back(); 6926 } 6927 if (match(RetI, m_Mul(m_Value(), m_Value())) && 6928 RetI->user_back()->getOpcode() == Instruction::Add) { 6929 if (!RetI->hasOneUser()) 6930 return None; 6931 RetI = RetI->user_back(); 6932 } 6933 6934 // Test if the found instruction is a reduction, and if not return an invalid 6935 // cost specifying the parent to use the original cost modelling. 6936 if (!InLoopReductionImmediateChains.count(RetI)) 6937 return None; 6938 6939 // Find the reduction this chain is a part of and calculate the basic cost of 6940 // the reduction on its own. 6941 Instruction *LastChain = InLoopReductionImmediateChains[RetI]; 6942 Instruction *ReductionPhi = LastChain; 6943 while (!isa<PHINode>(ReductionPhi)) 6944 ReductionPhi = InLoopReductionImmediateChains[ReductionPhi]; 6945 6946 const RecurrenceDescriptor &RdxDesc = 6947 Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second; 6948 6949 InstructionCost BaseCost = TTI.getArithmeticReductionCost( 6950 RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind); 6951 6952 // For a call to the llvm.fmuladd intrinsic we need to add the cost of a 6953 // normal fmul instruction to the cost of the fadd reduction. 6954 if (RdxDesc.getRecurrenceKind() == RecurKind::FMulAdd) 6955 BaseCost += 6956 TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind); 6957 6958 // If we're using ordered reductions then we can just return the base cost 6959 // here, since getArithmeticReductionCost calculates the full ordered 6960 // reduction cost when FP reassociation is not allowed. 6961 if (useOrderedReductions(RdxDesc)) 6962 return BaseCost; 6963 6964 // Get the operand that was not the reduction chain and match it to one of the 6965 // patterns, returning the better cost if it is found. 6966 Instruction *RedOp = RetI->getOperand(1) == LastChain 6967 ? dyn_cast<Instruction>(RetI->getOperand(0)) 6968 : dyn_cast<Instruction>(RetI->getOperand(1)); 6969 6970 VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy); 6971 6972 Instruction *Op0, *Op1; 6973 if (RedOp && 6974 match(RedOp, 6975 m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) && 6976 match(Op0, m_ZExtOrSExt(m_Value())) && 6977 Op0->getOpcode() == Op1->getOpcode() && 6978 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() && 6979 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) && 6980 (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) { 6981 6982 // Matched reduce(ext(mul(ext(A), ext(B))) 6983 // Note that the extend opcodes need to all match, or if A==B they will have 6984 // been converted to zext(mul(sext(A), sext(A))) as it is known positive, 6985 // which is equally fine. 
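// The comparison below is, roughly: keep the pattern cost only if
//   RedCost < 2 * ExtCost + MulCost + Ext2Cost + BaseCost
// i.e. the extended multiply-add reduction must beat the sum of the
// individually costed extends, multiply and plain reduction.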
6986 bool IsUnsigned = isa<ZExtInst>(Op0); 6987 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy); 6988 auto *MulType = VectorType::get(Op0->getType(), VectorTy); 6989 6990 InstructionCost ExtCost = 6991 TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType, 6992 TTI::CastContextHint::None, CostKind, Op0); 6993 InstructionCost MulCost = 6994 TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind); 6995 InstructionCost Ext2Cost = 6996 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType, 6997 TTI::CastContextHint::None, CostKind, RedOp); 6998 6999 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 7000 /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 7001 CostKind); 7002 7003 if (RedCost.isValid() && 7004 RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost) 7005 return I == RetI ? RedCost : 0; 7006 } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) && 7007 !TheLoop->isLoopInvariant(RedOp)) { 7008 // Matched reduce(ext(A)) 7009 bool IsUnsigned = isa<ZExtInst>(RedOp); 7010 auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy); 7011 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 7012 /*IsMLA=*/false, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 7013 CostKind); 7014 7015 InstructionCost ExtCost = 7016 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType, 7017 TTI::CastContextHint::None, CostKind, RedOp); 7018 if (RedCost.isValid() && RedCost < BaseCost + ExtCost) 7019 return I == RetI ? RedCost : 0; 7020 } else if (RedOp && 7021 match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) { 7022 if (match(Op0, m_ZExtOrSExt(m_Value())) && 7023 Op0->getOpcode() == Op1->getOpcode() && 7024 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) { 7025 bool IsUnsigned = isa<ZExtInst>(Op0); 7026 Type *Op0Ty = Op0->getOperand(0)->getType(); 7027 Type *Op1Ty = Op1->getOperand(0)->getType(); 7028 Type *LargestOpTy = 7029 Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty 7030 : Op0Ty; 7031 auto *ExtType = VectorType::get(LargestOpTy, VectorTy); 7032 7033 // Matched reduce(mul(ext(A), ext(B))), where the two ext may be of 7034 // different sizes. We take the largest type as the ext to reduce, and add 7035 // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))). 7036 InstructionCost ExtCost0 = TTI.getCastInstrCost( 7037 Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy), 7038 TTI::CastContextHint::None, CostKind, Op0); 7039 InstructionCost ExtCost1 = TTI.getCastInstrCost( 7040 Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy), 7041 TTI::CastContextHint::None, CostKind, Op1); 7042 InstructionCost MulCost = 7043 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 7044 7045 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 7046 /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 7047 CostKind); 7048 InstructionCost ExtraExtCost = 0; 7049 if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) { 7050 Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1; 7051 ExtraExtCost = TTI.getCastInstrCost( 7052 ExtraExtOp->getOpcode(), ExtType, 7053 VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy), 7054 TTI::CastContextHint::None, CostKind, ExtraExtOp); 7055 } 7056 7057 if (RedCost.isValid() && 7058 (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost)) 7059 return I == RetI ? 
RedCost : 0; 7060 } else if (!match(I, m_ZExtOrSExt(m_Value()))) { 7061 // Matched reduce(mul()) 7062 InstructionCost MulCost = 7063 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 7064 7065 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 7066 /*IsMLA=*/true, true, RdxDesc.getRecurrenceType(), VectorTy, 7067 CostKind); 7068 7069 if (RedCost.isValid() && RedCost < MulCost + BaseCost) 7070 return I == RetI ? RedCost : 0; 7071 } 7072 } 7073 7074 return I == RetI ? Optional<InstructionCost>(BaseCost) : None; 7075 } 7076 7077 InstructionCost 7078 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, 7079 ElementCount VF) { 7080 // Calculate scalar cost only. Vectorization cost should be ready at this 7081 // moment. 7082 if (VF.isScalar()) { 7083 Type *ValTy = getLoadStoreType(I); 7084 const Align Alignment = getLoadStoreAlignment(I); 7085 unsigned AS = getLoadStoreAddressSpace(I); 7086 7087 return TTI.getAddressComputationCost(ValTy) + 7088 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, 7089 TTI::TCK_RecipThroughput, I); 7090 } 7091 return getWideningCost(I, VF); 7092 } 7093 7094 LoopVectorizationCostModel::VectorizationCostTy 7095 LoopVectorizationCostModel::getInstructionCost(Instruction *I, 7096 ElementCount VF) { 7097 // If we know that this instruction will remain uniform, check the cost of 7098 // the scalar version. 7099 if (isUniformAfterVectorization(I, VF)) 7100 VF = ElementCount::getFixed(1); 7101 7102 if (VF.isVector() && isProfitableToScalarize(I, VF)) 7103 return VectorizationCostTy(InstsToScalarize[VF][I], false); 7104 7105 // Forced scalars do not have any scalarization overhead. 7106 auto ForcedScalar = ForcedScalars.find(VF); 7107 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) { 7108 auto InstSet = ForcedScalar->second; 7109 if (InstSet.count(I)) 7110 return VectorizationCostTy( 7111 (getInstructionCost(I, ElementCount::getFixed(1)).first * 7112 VF.getKnownMinValue()), 7113 false); 7114 } 7115 7116 Type *VectorTy; 7117 InstructionCost C = getInstructionCost(I, VF, VectorTy); 7118 7119 bool TypeNotScalarized = false; 7120 if (VF.isVector() && VectorTy->isVectorTy()) { 7121 unsigned NumParts = TTI.getNumberOfParts(VectorTy); 7122 if (NumParts) 7123 TypeNotScalarized = NumParts < VF.getKnownMinValue(); 7124 else 7125 C = InstructionCost::getInvalid(); 7126 } 7127 return VectorizationCostTy(C, TypeNotScalarized); 7128 } 7129 7130 InstructionCost 7131 LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, 7132 ElementCount VF) const { 7133 7134 // There is no mechanism yet to create a scalable scalarization loop, 7135 // so this is currently Invalid. 7136 if (VF.isScalable()) 7137 return InstructionCost::getInvalid(); 7138 7139 if (VF.isScalar()) 7140 return 0; 7141 7142 InstructionCost Cost = 0; 7143 Type *RetTy = ToVectorTy(I->getType(), VF); 7144 if (!RetTy->isVoidTy() && 7145 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) 7146 Cost += TTI.getScalarizationOverhead( 7147 cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()), true, 7148 false); 7149 7150 // Some targets keep addresses scalar. 7151 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) 7152 return Cost; 7153 7154 // Some targets support efficient element stores. 7155 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore()) 7156 return Cost; 7157 7158 // Collect operands to consider. 7159 CallInst *CI = dyn_cast<CallInst>(I); 7160 Instruction::op_range Ops = CI ? 
CI->args() : I->operands(); 7161 7162 // Skip operands that do not require extraction/scalarization and do not incur 7163 // any overhead. 7164 SmallVector<Type *> Tys; 7165 for (auto *V : filterExtractingOperands(Ops, VF)) 7166 Tys.push_back(MaybeVectorizeType(V->getType(), VF)); 7167 return Cost + TTI.getOperandsScalarizationOverhead( 7168 filterExtractingOperands(Ops, VF), Tys); 7169 } 7170 7171 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { 7172 if (VF.isScalar()) 7173 return; 7174 NumPredStores = 0; 7175 for (BasicBlock *BB : TheLoop->blocks()) { 7176 // For each instruction in the old loop. 7177 for (Instruction &I : *BB) { 7178 Value *Ptr = getLoadStorePointerOperand(&I); 7179 if (!Ptr) 7180 continue; 7181 7182 // TODO: We should generate better code and update the cost model for 7183 // predicated uniform stores. Today they are treated as any other 7184 // predicated store (see added test cases in 7185 // invariant-store-vectorization.ll). 7186 if (isa<StoreInst>(&I) && isScalarWithPredication(&I)) 7187 NumPredStores++; 7188 7189 if (Legal->isUniformMemOp(I)) { 7190 // TODO: Avoid replicating loads and stores instead of 7191 // relying on instcombine to remove them. 7192 // Load: Scalar load + broadcast 7193 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract 7194 InstructionCost Cost; 7195 if (isa<StoreInst>(&I) && VF.isScalable() && 7196 isLegalGatherOrScatter(&I)) { 7197 Cost = getGatherScatterCost(&I, VF); 7198 setWideningDecision(&I, VF, CM_GatherScatter, Cost); 7199 } else { 7200 assert((isa<LoadInst>(&I) || !VF.isScalable()) && 7201 "Cannot yet scalarize uniform stores"); 7202 Cost = getUniformMemOpCost(&I, VF); 7203 setWideningDecision(&I, VF, CM_Scalarize, Cost); 7204 } 7205 continue; 7206 } 7207 7208 // We assume that widening is the best solution when possible. 7209 if (memoryInstructionCanBeWidened(&I, VF)) { 7210 InstructionCost Cost = getConsecutiveMemOpCost(&I, VF); 7211 int ConsecutiveStride = Legal->isConsecutivePtr( 7212 getLoadStoreType(&I), getLoadStorePointerOperand(&I)); 7213 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 7214 "Expected consecutive stride."); 7215 InstWidening Decision = 7216 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; 7217 setWideningDecision(&I, VF, Decision, Cost); 7218 continue; 7219 } 7220 7221 // Choose between Interleaving, Gather/Scatter or Scalarization. 7222 InstructionCost InterleaveCost = InstructionCost::getInvalid(); 7223 unsigned NumAccesses = 1; 7224 if (isAccessInterleaved(&I)) { 7225 auto Group = getInterleavedAccessGroup(&I); 7226 assert(Group && "Fail to get an interleaved access group."); 7227 7228 // Make one decision for the whole group. 7229 if (getWideningDecision(&I, VF) != CM_Unknown) 7230 continue; 7231 7232 NumAccesses = Group->getNumMembers(); 7233 if (interleavedAccessCanBeWidened(&I, VF)) 7234 InterleaveCost = getInterleaveGroupCost(&I, VF); 7235 } 7236 7237 InstructionCost GatherScatterCost = 7238 isLegalGatherOrScatter(&I) 7239 ? getGatherScatterCost(&I, VF) * NumAccesses 7240 : InstructionCost::getInvalid(); 7241 7242 InstructionCost ScalarizationCost = 7243 getMemInstScalarizationCost(&I, VF) * NumAccesses; 7244 7245 // Choose better solution for the current VF, 7246 // write down this decision and use it during vectorization. 
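// Illustrative example (the costs are invented): with InterleaveCost = 8,
// GatherScatterCost = 8 and ScalarizationCost = 20, interleaving wins the tie
// against gather/scatter; scalarization is only picked when neither of the
// other two is cheaper.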
7247 InstructionCost Cost; 7248 InstWidening Decision; 7249 if (InterleaveCost <= GatherScatterCost && 7250 InterleaveCost < ScalarizationCost) { 7251 Decision = CM_Interleave; 7252 Cost = InterleaveCost; 7253 } else if (GatherScatterCost < ScalarizationCost) { 7254 Decision = CM_GatherScatter; 7255 Cost = GatherScatterCost; 7256 } else { 7257 Decision = CM_Scalarize; 7258 Cost = ScalarizationCost; 7259 } 7260 // If the instruction belongs to an interleave group, the whole group 7261 // receives the same decision. The whole group receives the cost, but 7262 // the cost will actually be assigned to one instruction. 7263 if (auto Group = getInterleavedAccessGroup(&I)) 7264 setWideningDecision(Group, VF, Decision, Cost); 7265 else 7266 setWideningDecision(&I, VF, Decision, Cost); 7267 } 7268 } 7269 7270 // Make sure that any load of an address and any other address computation 7271 // remains scalar unless there is gather/scatter support. This avoids 7272 // inevitable extracts into address registers, and also has the benefit of 7273 // activating LSR more, since that pass can't optimize vectorized 7274 // addresses. 7275 if (TTI.prefersVectorizedAddressing()) 7276 return; 7277 7278 // Start with all scalar pointer uses. 7279 SmallPtrSet<Instruction *, 8> AddrDefs; 7280 for (BasicBlock *BB : TheLoop->blocks()) 7281 for (Instruction &I : *BB) { 7282 Instruction *PtrDef = 7283 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 7284 if (PtrDef && TheLoop->contains(PtrDef) && 7285 getWideningDecision(&I, VF) != CM_GatherScatter) 7286 AddrDefs.insert(PtrDef); 7287 } 7288 7289 // Add all instructions used to generate the addresses. 7290 SmallVector<Instruction *, 4> Worklist; 7291 append_range(Worklist, AddrDefs); 7292 while (!Worklist.empty()) { 7293 Instruction *I = Worklist.pop_back_val(); 7294 for (auto &Op : I->operands()) 7295 if (auto *InstOp = dyn_cast<Instruction>(Op)) 7296 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) && 7297 AddrDefs.insert(InstOp).second) 7298 Worklist.push_back(InstOp); 7299 } 7300 7301 for (auto *I : AddrDefs) { 7302 if (isa<LoadInst>(I)) { 7303 // Setting the desired widening decision should ideally be handled 7304 // by the cost functions, but since this involves the task of finding out 7305 // if the loaded register is involved in an address computation, it is 7306 // instead changed here when we know this is the case. 7307 InstWidening Decision = getWideningDecision(I, VF); 7308 if (Decision == CM_Widen || Decision == CM_Widen_Reverse) 7309 // Scalarize a widened load of address. 7310 setWideningDecision( 7311 I, VF, CM_Scalarize, 7312 (VF.getKnownMinValue() * 7313 getMemoryInstructionCost(I, ElementCount::getFixed(1)))); 7314 else if (auto Group = getInterleavedAccessGroup(I)) { 7315 // Scalarize an interleave group of address loads. 7316 for (unsigned I = 0; I < Group->getFactor(); ++I) { 7317 if (Instruction *Member = Group->getMember(I)) 7318 setWideningDecision( 7319 Member, VF, CM_Scalarize, 7320 (VF.getKnownMinValue() * 7321 getMemoryInstructionCost(Member, ElementCount::getFixed(1)))); 7322 } 7323 } 7324 } else 7325 // Make sure I gets scalarized and a cost estimate without 7326 // scalarization overhead.
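// (Forced scalars are later costed as VF copies of the scalar instruction
// with no insert/extract overhead; see the ForcedScalars handling in
// getInstructionCost.)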
7327 ForcedScalars[VF].insert(I); 7328 } 7329 } 7330 7331 InstructionCost 7332 LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, 7333 Type *&VectorTy) { 7334 Type *RetTy = I->getType(); 7335 if (canTruncateToMinimalBitwidth(I, VF)) 7336 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); 7337 auto SE = PSE.getSE(); 7338 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 7339 7340 auto hasSingleCopyAfterVectorization = [this](Instruction *I, 7341 ElementCount VF) -> bool { 7342 if (VF.isScalar()) 7343 return true; 7344 7345 auto Scalarized = InstsToScalarize.find(VF); 7346 assert(Scalarized != InstsToScalarize.end() && 7347 "VF not yet analyzed for scalarization profitability"); 7348 return !Scalarized->second.count(I) && 7349 llvm::all_of(I->users(), [&](User *U) { 7350 auto *UI = cast<Instruction>(U); 7351 return !Scalarized->second.count(UI); 7352 }); 7353 }; 7354 (void) hasSingleCopyAfterVectorization; 7355 7356 if (isScalarAfterVectorization(I, VF)) { 7357 // With the exception of GEPs and PHIs, after scalarization there should 7358 // only be one copy of the instruction generated in the loop. This is 7359 // because the VF is either 1, or any instructions that need scalarizing 7360 // have already been dealt with by the time we get here. As a result, 7361 // we don't have to multiply the instruction cost by VF. 7362 assert(I->getOpcode() == Instruction::GetElementPtr || 7363 I->getOpcode() == Instruction::PHI || 7364 (I->getOpcode() == Instruction::BitCast && 7365 I->getType()->isPointerTy()) || 7366 hasSingleCopyAfterVectorization(I, VF)); 7367 VectorTy = RetTy; 7368 } else 7369 VectorTy = ToVectorTy(RetTy, VF); 7370 7371 // TODO: We need to estimate the cost of intrinsic calls. 7372 switch (I->getOpcode()) { 7373 case Instruction::GetElementPtr: 7374 // We mark this instruction as zero-cost because the cost of GEPs in 7375 // vectorized code depends on whether the corresponding memory instruction 7376 // is scalarized or not. Therefore, we handle GEPs with the memory 7377 // instruction cost. 7378 return 0; 7379 case Instruction::Br: { 7380 // In cases of scalarized and predicated instructions, there will be VF 7381 // predicated blocks in the vectorized loop. Each branch around these 7382 // blocks also requires an extract of its vector compare i1 element. 7383 bool ScalarPredicatedBB = false; 7384 BranchInst *BI = cast<BranchInst>(I); 7385 if (VF.isVector() && BI->isConditional() && 7386 (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) || 7387 PredicatedBBsAfterVectorization.count(BI->getSuccessor(1)))) 7388 ScalarPredicatedBB = true; 7389 7390 if (ScalarPredicatedBB) { 7391 // Not possible to scalarize a scalable vector with predicated instructions. 7392 if (VF.isScalable()) 7393 return InstructionCost::getInvalid(); 7394 // Return cost for branches around scalarized and predicated blocks. 7395 auto *Vec_i1Ty = 7396 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); 7397 return ( 7398 TTI.getScalarizationOverhead( 7399 Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()), false, true) + 7400 (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue())); 7401 } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar()) 7402 // The back-edge branch will remain, as will all scalar branches. 7403 return TTI.getCFInstrCost(Instruction::Br, CostKind); 7404 else 7405 // This branch will be eliminated by if-conversion.
7406 return 0; 7407 // Note: We currently assume zero cost for an unconditional branch inside 7408 // a predicated block since it will become a fall-through, although we 7409 // may decide in the future to call TTI for all branches. 7410 } 7411 case Instruction::PHI: { 7412 auto *Phi = cast<PHINode>(I); 7413 7414 // First-order recurrences are replaced by vector shuffles inside the loop. 7415 // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. 7416 if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi)) 7417 return TTI.getShuffleCost( 7418 TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy), 7419 None, VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1)); 7420 7421 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 7422 // converted into select instructions. We require N - 1 selects per phi 7423 // node, where N is the number of incoming values. 7424 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) 7425 return (Phi->getNumIncomingValues() - 1) * 7426 TTI.getCmpSelInstrCost( 7427 Instruction::Select, ToVectorTy(Phi->getType(), VF), 7428 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF), 7429 CmpInst::BAD_ICMP_PREDICATE, CostKind); 7430 7431 return TTI.getCFInstrCost(Instruction::PHI, CostKind); 7432 } 7433 case Instruction::UDiv: 7434 case Instruction::SDiv: 7435 case Instruction::URem: 7436 case Instruction::SRem: 7437 // If we have a predicated instruction, it may not be executed for each 7438 // vector lane. Get the scalarization cost and scale this amount by the 7439 // probability of executing the predicated block. If the instruction is not 7440 // predicated, we fall through to the next case. 7441 if (VF.isVector() && isScalarWithPredication(I)) { 7442 InstructionCost Cost = 0; 7443 7444 // These instructions have a non-void type, so account for the phi nodes 7445 // that we will create. This cost is likely to be zero. The phi node 7446 // cost, if any, should be scaled by the block probability because it 7447 // models a copy at the end of each predicated block. 7448 Cost += VF.getKnownMinValue() * 7449 TTI.getCFInstrCost(Instruction::PHI, CostKind); 7450 7451 // The cost of the non-predicated instruction. 7452 Cost += VF.getKnownMinValue() * 7453 TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind); 7454 7455 // The cost of insertelement and extractelement instructions needed for 7456 // scalarization. 7457 Cost += getScalarizationOverhead(I, VF); 7458 7459 // Scale the cost by the probability of executing the predicated blocks. 7460 // This assumes the predicated block for each vector lane is equally 7461 // likely. 7462 return Cost / getReciprocalPredBlockProb(); 7463 } 7464 LLVM_FALLTHROUGH; 7465 case Instruction::Add: 7466 case Instruction::FAdd: 7467 case Instruction::Sub: 7468 case Instruction::FSub: 7469 case Instruction::Mul: 7470 case Instruction::FMul: 7471 case Instruction::FDiv: 7472 case Instruction::FRem: 7473 case Instruction::Shl: 7474 case Instruction::LShr: 7475 case Instruction::AShr: 7476 case Instruction::And: 7477 case Instruction::Or: 7478 case Instruction::Xor: { 7479 // Since we will replace the stride by 1 the multiplication should go away. 
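// For example (illustrative), when the loop has been versioned for
// %stride == 1, an address computation such as
//   %offset = mul i64 %i, %stride
// folds away, so the multiply is given a cost of 0 here.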
7480 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) 7481 return 0; 7482 7483 // Detect reduction patterns 7484 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7485 return *RedCost; 7486 7487 // Certain instructions can be cheaper to vectorize if they have a constant 7488 // second vector operand. One example of this are shifts on x86. 7489 Value *Op2 = I->getOperand(1); 7490 TargetTransformInfo::OperandValueProperties Op2VP; 7491 TargetTransformInfo::OperandValueKind Op2VK = 7492 TTI.getOperandInfo(Op2, Op2VP); 7493 if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) 7494 Op2VK = TargetTransformInfo::OK_UniformValue; 7495 7496 SmallVector<const Value *, 4> Operands(I->operand_values()); 7497 return TTI.getArithmeticInstrCost( 7498 I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, 7499 Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); 7500 } 7501 case Instruction::FNeg: { 7502 return TTI.getArithmeticInstrCost( 7503 I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, 7504 TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None, 7505 TargetTransformInfo::OP_None, I->getOperand(0), I); 7506 } 7507 case Instruction::Select: { 7508 SelectInst *SI = cast<SelectInst>(I); 7509 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 7510 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 7511 7512 const Value *Op0, *Op1; 7513 using namespace llvm::PatternMatch; 7514 if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) || 7515 match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) { 7516 // select x, y, false --> x & y 7517 // select x, true, y --> x | y 7518 TTI::OperandValueProperties Op1VP = TTI::OP_None; 7519 TTI::OperandValueProperties Op2VP = TTI::OP_None; 7520 TTI::OperandValueKind Op1VK = TTI::getOperandInfo(Op0, Op1VP); 7521 TTI::OperandValueKind Op2VK = TTI::getOperandInfo(Op1, Op2VP); 7522 assert(Op0->getType()->getScalarSizeInBits() == 1 && 7523 Op1->getType()->getScalarSizeInBits() == 1); 7524 7525 SmallVector<const Value *, 2> Operands{Op0, Op1}; 7526 return TTI.getArithmeticInstrCost( 7527 match(I, m_LogicalOr()) ? 
Instruction::Or : Instruction::And, VectorTy, 7528 CostKind, Op1VK, Op2VK, Op1VP, Op2VP, Operands, I); 7529 } 7530 7531 Type *CondTy = SI->getCondition()->getType(); 7532 if (!ScalarCond) 7533 CondTy = VectorType::get(CondTy, VF); 7534 7535 CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE; 7536 if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition())) 7537 Pred = Cmp->getPredicate(); 7538 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred, 7539 CostKind, I); 7540 } 7541 case Instruction::ICmp: 7542 case Instruction::FCmp: { 7543 Type *ValTy = I->getOperand(0)->getType(); 7544 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 7545 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 7546 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); 7547 VectorTy = ToVectorTy(ValTy, VF); 7548 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, 7549 cast<CmpInst>(I)->getPredicate(), CostKind, 7550 I); 7551 } 7552 case Instruction::Store: 7553 case Instruction::Load: { 7554 ElementCount Width = VF; 7555 if (Width.isVector()) { 7556 InstWidening Decision = getWideningDecision(I, Width); 7557 assert(Decision != CM_Unknown && 7558 "CM decision should be taken at this point"); 7559 if (Decision == CM_Scalarize) 7560 Width = ElementCount::getFixed(1); 7561 } 7562 VectorTy = ToVectorTy(getLoadStoreType(I), Width); 7563 return getMemoryInstructionCost(I, VF); 7564 } 7565 case Instruction::BitCast: 7566 if (I->getType()->isPointerTy()) 7567 return 0; 7568 LLVM_FALLTHROUGH; 7569 case Instruction::ZExt: 7570 case Instruction::SExt: 7571 case Instruction::FPToUI: 7572 case Instruction::FPToSI: 7573 case Instruction::FPExt: 7574 case Instruction::PtrToInt: 7575 case Instruction::IntToPtr: 7576 case Instruction::SIToFP: 7577 case Instruction::UIToFP: 7578 case Instruction::Trunc: 7579 case Instruction::FPTrunc: { 7580 // Computes the CastContextHint from a Load/Store instruction. 7581 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint { 7582 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 7583 "Expected a load or a store!"); 7584 7585 if (VF.isScalar() || !TheLoop->contains(I)) 7586 return TTI::CastContextHint::Normal; 7587 7588 switch (getWideningDecision(I, VF)) { 7589 case LoopVectorizationCostModel::CM_GatherScatter: 7590 return TTI::CastContextHint::GatherScatter; 7591 case LoopVectorizationCostModel::CM_Interleave: 7592 return TTI::CastContextHint::Interleave; 7593 case LoopVectorizationCostModel::CM_Scalarize: 7594 case LoopVectorizationCostModel::CM_Widen: 7595 return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked 7596 : TTI::CastContextHint::Normal; 7597 case LoopVectorizationCostModel::CM_Widen_Reverse: 7598 return TTI::CastContextHint::Reversed; 7599 case LoopVectorizationCostModel::CM_Unknown: 7600 llvm_unreachable("Instr did not go through cost modelling?"); 7601 } 7602 7603 llvm_unreachable("Unhandled case!"); 7604 }; 7605 7606 unsigned Opcode = I->getOpcode(); 7607 TTI::CastContextHint CCH = TTI::CastContextHint::None; 7608 // For Trunc, the context is the only user, which must be a StoreInst. 7609 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) { 7610 if (I->hasOneUse()) 7611 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin())) 7612 CCH = ComputeCCH(Store); 7613 } 7614 // For Z/Sext, the context is the operand, which must be a LoadInst. 
7615 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || 7616 Opcode == Instruction::FPExt) { 7617 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0))) 7618 CCH = ComputeCCH(Load); 7619 } 7620 7621 // We optimize the truncation of induction variables having constant 7622 // integer steps. The cost of these truncations is the same as the scalar 7623 // operation. 7624 if (isOptimizableIVTruncate(I, VF)) { 7625 auto *Trunc = cast<TruncInst>(I); 7626 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 7627 Trunc->getSrcTy(), CCH, CostKind, Trunc); 7628 } 7629 7630 // Detect reduction patterns 7631 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7632 return *RedCost; 7633 7634 Type *SrcScalarTy = I->getOperand(0)->getType(); 7635 Type *SrcVecTy = 7636 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; 7637 if (canTruncateToMinimalBitwidth(I, VF)) { 7638 // This cast is going to be shrunk. This may remove the cast or it might 7639 // turn it into slightly different cast. For example, if MinBW == 16, 7640 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". 7641 // 7642 // Calculate the modified src and dest types. 7643 Type *MinVecTy = VectorTy; 7644 if (Opcode == Instruction::Trunc) { 7645 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); 7646 VectorTy = 7647 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7648 } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) { 7649 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); 7650 VectorTy = 7651 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7652 } 7653 } 7654 7655 return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); 7656 } 7657 case Instruction::Call: { 7658 if (RecurrenceDescriptor::isFMulAddIntrinsic(I)) 7659 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7660 return *RedCost; 7661 bool NeedToScalarize; 7662 CallInst *CI = cast<CallInst>(I); 7663 InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize); 7664 if (getVectorIntrinsicIDForCall(CI, TLI)) { 7665 InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF); 7666 return std::min(CallCost, IntrinsicCost); 7667 } 7668 return CallCost; 7669 } 7670 case Instruction::ExtractValue: 7671 return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput); 7672 case Instruction::Alloca: 7673 // We cannot easily widen alloca to a scalable alloca, as 7674 // the result would need to be a vector of pointers. 7675 if (VF.isScalable()) 7676 return InstructionCost::getInvalid(); 7677 LLVM_FALLTHROUGH; 7678 default: 7679 // This opcode is unknown. Assume that it is the same as 'mul'. 7680 return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 7681 } // end of switch. 
7682 } 7683 7684 char LoopVectorize::ID = 0; 7685 7686 static const char lv_name[] = "Loop Vectorization"; 7687 7688 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 7689 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 7690 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) 7691 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 7692 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) 7693 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 7694 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 7695 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 7696 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 7697 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 7698 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) 7699 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) 7700 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 7701 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 7702 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy) 7703 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 7704 7705 namespace llvm { 7706 7707 Pass *createLoopVectorizePass() { return new LoopVectorize(); } 7708 7709 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced, 7710 bool VectorizeOnlyWhenForced) { 7711 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced); 7712 } 7713 7714 } // end namespace llvm 7715 7716 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { 7717 // Check if the pointer operand of a load or store instruction is 7718 // consecutive. 7719 if (auto *Ptr = getLoadStorePointerOperand(Inst)) 7720 return Legal->isConsecutivePtr(getLoadStoreType(Inst), Ptr); 7721 return false; 7722 } 7723 7724 void LoopVectorizationCostModel::collectValuesToIgnore() { 7725 // Ignore ephemeral values. 7726 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); 7727 7728 // Ignore type-promoting instructions we identified during reduction 7729 // detection. 7730 for (auto &Reduction : Legal->getReductionVars()) { 7731 const RecurrenceDescriptor &RedDes = Reduction.second; 7732 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); 7733 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7734 } 7735 // Ignore type-casting instructions we identified during induction 7736 // detection. 7737 for (auto &Induction : Legal->getInductionVars()) { 7738 const InductionDescriptor &IndDes = Induction.second; 7739 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 7740 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7741 } 7742 } 7743 7744 void LoopVectorizationCostModel::collectInLoopReductions() { 7745 for (auto &Reduction : Legal->getReductionVars()) { 7746 PHINode *Phi = Reduction.first; 7747 const RecurrenceDescriptor &RdxDesc = Reduction.second; 7748 7749 // We don't collect reductions that are type promoted (yet). 7750 if (RdxDesc.getRecurrenceType() != Phi->getType()) 7751 continue; 7752 7753 // If the target would prefer this reduction to happen "in-loop", then we 7754 // want to record it as such. 7755 unsigned Opcode = RdxDesc.getOpcode(); 7756 if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) && 7757 !TTI.preferInLoopReduction(Opcode, Phi->getType(), 7758 TargetTransformInfo::ReductionFlags())) 7759 continue; 7760 7761 // Check that we can correctly put the reductions into the loop, by 7762 // finding the chain of operations that leads from the phi to the loop 7763 // exit value. 
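// For example (illustrative IR), for an integer add reduction
//   %sum = phi i32 [ 0, %preheader ], [ %sum.next, %latch ]
//   %sum.next = add i32 %sum, %val
// the chain is the single add; if no such chain can be found,
// ReductionOperations is empty and the reduction stays out-of-loop.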
7764 SmallVector<Instruction *, 4> ReductionOperations = 7765 RdxDesc.getReductionOpChain(Phi, TheLoop); 7766 bool InLoop = !ReductionOperations.empty(); 7767 if (InLoop) { 7768 InLoopReductionChains[Phi] = ReductionOperations; 7769 // Add the elements to InLoopReductionImmediateChains for cost modelling. 7770 Instruction *LastChain = Phi; 7771 for (auto *I : ReductionOperations) { 7772 InLoopReductionImmediateChains[I] = LastChain; 7773 LastChain = I; 7774 } 7775 } 7776 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop") 7777 << " reduction for phi: " << *Phi << "\n"); 7778 } 7779 } 7780 7781 // TODO: we could return a pair of values that specify the max VF and 7782 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of 7783 // `buildVPlans(VF, VF)`. We cannot do it because VPlan at the moment 7784 // doesn't have a cost model that can choose which plan to execute if 7785 // more than one is generated. 7786 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits, 7787 LoopVectorizationCostModel &CM) { 7788 unsigned WidestType; 7789 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes(); 7790 return WidestVectorRegBits / WidestType; 7791 } 7792 7793 VectorizationFactor 7794 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { 7795 assert(!UserVF.isScalable() && "scalable vectors not yet supported"); 7796 ElementCount VF = UserVF; 7797 // Outer loop handling: outer loops may require CFG and instruction level 7798 // transformations before even evaluating whether vectorization is profitable. 7799 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 7800 // the vectorization pipeline. 7801 if (!OrigLoop->isInnermost()) { 7802 // If the user doesn't provide a vectorization factor, determine a 7803 // reasonable one. 7804 if (UserVF.isZero()) { 7805 VF = ElementCount::getFixed(determineVPlanVF( 7806 TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector) 7807 .getFixedSize(), 7808 CM)); 7809 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n"); 7810 7811 // Make sure we have a VF > 1 for stress testing. 7812 if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) { 7813 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: " 7814 << "overriding computed VF.\n"); 7815 VF = ElementCount::getFixed(4); 7816 } 7817 } 7818 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 7819 assert(isPowerOf2_32(VF.getKnownMinValue()) && 7820 "VF needs to be a power of two"); 7821 LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "") 7822 << "VF " << VF << " to build VPlans.\n"); 7823 buildVPlans(VF, VF); 7824 7825 // For VPlan build stress testing, we bail out after VPlan construction. 7826 if (VPlanBuildStressTest) 7827 return VectorizationFactor::Disabled(); 7828 7829 return {VF, 0 /*Cost*/}; 7830 } 7831 7832 LLVM_DEBUG( 7833 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the " 7834 "VPlan-native path.\n"); 7835 return VectorizationFactor::Disabled(); 7836 } 7837 7838 Optional<VectorizationFactor> 7839 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { 7840 assert(OrigLoop->isInnermost() && "Inner loop expected."); 7841 FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC); 7842 if (!MaxFactors) // Cases that should not be vectorized or interleaved. 7843 return None; 7844 7845 // Invalidate interleave groups if all blocks of the loop will be predicated.
7846 if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) && 7847 !useMaskedInterleavedAccesses(*TTI)) { 7848 LLVM_DEBUG( 7849 dbgs() 7850 << "LV: Invalidate all interleaved groups due to fold-tail by masking " 7851 "which requires masked-interleaved support.\n"); 7852 if (CM.InterleaveInfo.invalidateGroups()) 7853 // Invalidating interleave groups also requires invalidating all decisions 7854 // based on them, which includes widening decisions and uniform and scalar 7855 // values. 7856 CM.invalidateCostModelingDecisions(); 7857 } 7858 7859 ElementCount MaxUserVF = 7860 UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF; 7861 bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF); 7862 if (!UserVF.isZero() && UserVFIsLegal) { 7863 assert(isPowerOf2_32(UserVF.getKnownMinValue()) && 7864 "VF needs to be a power of two"); 7865 // Collect the instructions (and their associated costs) that will be more 7866 // profitable to scalarize. 7867 if (CM.selectUserVectorizationFactor(UserVF)) { 7868 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); 7869 CM.collectInLoopReductions(); 7870 buildVPlansWithVPRecipes(UserVF, UserVF); 7871 LLVM_DEBUG(printPlans(dbgs())); 7872 return {{UserVF, 0}}; 7873 } else 7874 reportVectorizationInfo("UserVF ignored because of invalid costs.", 7875 "InvalidCost", ORE, OrigLoop); 7876 } 7877 7878 // Populate the set of Vectorization Factor Candidates. 7879 ElementCountSet VFCandidates; 7880 for (auto VF = ElementCount::getFixed(1); 7881 ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2) 7882 VFCandidates.insert(VF); 7883 for (auto VF = ElementCount::getScalable(1); 7884 ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2) 7885 VFCandidates.insert(VF); 7886 7887 for (const auto &VF : VFCandidates) { 7888 // Collect Uniform and Scalar instructions after vectorization with VF. 7889 CM.collectUniformsAndScalars(VF); 7890 7891 // Collect the instructions (and their associated costs) that will be more 7892 // profitable to scalarize. 7893 if (VF.isVector()) 7894 CM.collectInstsToScalarize(VF); 7895 } 7896 7897 CM.collectInLoopReductions(); 7898 buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF); 7899 buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF); 7900 7901 LLVM_DEBUG(printPlans(dbgs())); 7902 if (!MaxFactors.hasVector()) 7903 return VectorizationFactor::Disabled(); 7904 7905 // Select the optimal vectorization factor. 7906 auto SelectedVF = CM.selectVectorizationFactor(VFCandidates); 7907 7908 // Check if it is profitable to vectorize with runtime checks. 
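// For example (illustrative), two pointers that may alias need an overlap
// check of the form (PtrA + TC * Size <= PtrB) || (PtrB + TC * Size <= PtrA);
// if the number of such checks exceeds the thresholds below and reordering
// was not explicitly allowed, vectorization is abandoned.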
7909 unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks(); 7910 if (SelectedVF.Width.getKnownMinValue() > 1 && NumRuntimePointerChecks) { 7911 bool PragmaThresholdReached = 7912 NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold; 7913 bool ThresholdReached = 7914 NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold; 7915 if ((ThresholdReached && !Hints.allowReordering()) || 7916 PragmaThresholdReached) { 7917 ORE->emit([&]() { 7918 return OptimizationRemarkAnalysisAliasing( 7919 DEBUG_TYPE, "CantReorderMemOps", OrigLoop->getStartLoc(), 7920 OrigLoop->getHeader()) 7921 << "loop not vectorized: cannot prove it is safe to reorder " 7922 "memory operations"; 7923 }); 7924 LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n"); 7925 Hints.emitRemarkWithHints(); 7926 return VectorizationFactor::Disabled(); 7927 } 7928 } 7929 return SelectedVF; 7930 } 7931 7932 VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const { 7933 assert(count_if(VPlans, 7934 [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) == 7935 1 && 7936 "Best VF has not a single VPlan."); 7937 7938 for (const VPlanPtr &Plan : VPlans) { 7939 if (Plan->hasVF(VF)) 7940 return *Plan.get(); 7941 } 7942 llvm_unreachable("No plan found!"); 7943 } 7944 7945 void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF, 7946 VPlan &BestVPlan, 7947 InnerLoopVectorizer &ILV, 7948 DominatorTree *DT) { 7949 LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF << ", UF=" << BestUF 7950 << '\n'); 7951 7952 // Perform the actual loop transformation. 7953 7954 // 1. Create a new empty loop. Unlink the old loop and connect the new one. 7955 VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan}; 7956 State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(); 7957 State.TripCount = ILV.getOrCreateTripCount(nullptr); 7958 State.CanonicalIV = ILV.Induction; 7959 ILV.collectPoisonGeneratingRecipes(State); 7960 7961 ILV.printDebugTracesAtStart(); 7962 7963 //===------------------------------------------------===// 7964 // 7965 // Notice: any optimization or new instruction that go 7966 // into the code below should also be implemented in 7967 // the cost-model. 7968 // 7969 //===------------------------------------------------===// 7970 7971 // 2. Copy and widen instructions from the old loop into the new loop. 7972 BestVPlan.execute(&State); 7973 7974 // 3. Fix the vectorized code: take care of header phi's, live-outs, 7975 // predication, updating analyses. 
7976 ILV.fixVectorizedLoop(State); 7977 7978 ILV.printDebugTracesAtEnd(); 7979 } 7980 7981 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 7982 void LoopVectorizationPlanner::printPlans(raw_ostream &O) { 7983 for (const auto &Plan : VPlans) 7984 if (PrintVPlansInDotFormat) 7985 Plan->printDOT(O); 7986 else 7987 Plan->print(O); 7988 } 7989 #endif 7990 7991 void LoopVectorizationPlanner::collectTriviallyDeadInstructions( 7992 SmallPtrSetImpl<Instruction *> &DeadInstructions) { 7993 7994 // We create new control-flow for the vectorized loop, so the original exit 7995 // conditions will be dead after vectorization if they are only used by the 7996 // terminator. 7997 SmallVector<BasicBlock*> ExitingBlocks; 7998 OrigLoop->getExitingBlocks(ExitingBlocks); 7999 for (auto *BB : ExitingBlocks) { 8000 auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0)); 8001 if (!Cmp || !Cmp->hasOneUse()) 8002 continue; 8003 8004 // TODO: we should introduce a getUniqueExitingBlocks on Loop 8005 if (!DeadInstructions.insert(Cmp).second) 8006 continue; 8007 8008 // The operands of the icmp are often dead truncs, used by IndUpdate. 8009 // TODO: can recurse through operands in general 8010 for (Value *Op : Cmp->operands()) { 8011 if (isa<TruncInst>(Op) && Op->hasOneUse()) 8012 DeadInstructions.insert(cast<Instruction>(Op)); 8013 } 8014 } 8015 8016 // We create new "steps" for induction variable updates to which the original 8017 // induction variables map. An original update instruction will be dead if 8018 // all its users except the induction variable are dead. 8019 auto *Latch = OrigLoop->getLoopLatch(); 8020 for (auto &Induction : Legal->getInductionVars()) { 8021 PHINode *Ind = Induction.first; 8022 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 8023 8024 // If the tail is to be folded by masking, the primary induction variable, 8025 // if it exists, isn't dead: it will be used for masking. Don't kill it. 8026 if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction()) 8027 continue; 8028 8029 if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 8030 return U == Ind || DeadInstructions.count(cast<Instruction>(U)); 8031 })) 8032 DeadInstructions.insert(IndUpdate); 8033 } 8034 } 8035 8036 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; } 8037 8038 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } 8039 8040 Value *InnerLoopUnroller::getStepVector(Value *Val, Value *StartIdx, 8041 Value *Step, 8042 Instruction::BinaryOps BinOp) { 8043 // When unrolling and the VF is 1, we only need to add a simple scalar. 8044 Type *Ty = Val->getType(); 8045 assert(!Ty->isVectorTy() && "Val must be a scalar"); 8046 8047 if (Ty->isFloatingPointTy()) { 8048 // Floating-point operations inherit FMF via the builder's flags. 8049 Value *MulOp = Builder.CreateFMul(StartIdx, Step); 8050 return Builder.CreateBinOp(BinOp, Val, MulOp); 8051 } 8052 return Builder.CreateAdd(Val, Builder.CreateMul(StartIdx, Step), "induction"); 8053 } 8054 8055 static void AddRuntimeUnrollDisableMetaData(Loop *L) { 8056 SmallVector<Metadata *, 4> MDs; 8057 // Reserve first location for self reference to the LoopID metadata node. 8058 MDs.push_back(nullptr); 8059 bool IsUnrollMetadata = false; 8060 MDNode *LoopID = L->getLoopID(); 8061 if (LoopID) { 8062 // First find existing loop unrolling disable metadata.
8063 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { 8064 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); 8065 if (MD) { 8066 const auto *S = dyn_cast<MDString>(MD->getOperand(0)); 8067 IsUnrollMetadata = 8068 S && S->getString().startswith("llvm.loop.unroll.disable"); 8069 } 8070 MDs.push_back(LoopID->getOperand(i)); 8071 } 8072 } 8073 8074 if (!IsUnrollMetadata) { 8075 // Add runtime unroll disable metadata. 8076 LLVMContext &Context = L->getHeader()->getContext(); 8077 SmallVector<Metadata *, 1> DisableOperands; 8078 DisableOperands.push_back( 8079 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 8080 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 8081 MDs.push_back(DisableNode); 8082 MDNode *NewLoopID = MDNode::get(Context, MDs); 8083 // Set operand 0 to refer to the loop id itself. 8084 NewLoopID->replaceOperandWith(0, NewLoopID); 8085 L->setLoopID(NewLoopID); 8086 } 8087 } 8088 8089 //===--------------------------------------------------------------------===// 8090 // EpilogueVectorizerMainLoop 8091 //===--------------------------------------------------------------------===// 8092 8093 /// This function is partially responsible for generating the control flow 8094 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 8095 BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { 8096 MDNode *OrigLoopID = OrigLoop->getLoopID(); 8097 Loop *Lp = createVectorLoopSkeleton(""); 8098 8099 // Generate the code to check the minimum iteration count of the vector 8100 // epilogue (see below). 8101 EPI.EpilogueIterationCountCheck = 8102 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, true); 8103 EPI.EpilogueIterationCountCheck->setName("iter.check"); 8104 8105 // Generate the code to check any assumptions that we've made for SCEV 8106 // expressions. 8107 EPI.SCEVSafetyCheck = emitSCEVChecks(Lp, LoopScalarPreHeader); 8108 8109 // Generate the code that checks at runtime if arrays overlap. We put the 8110 // checks into a separate block to make the more common case of few elements 8111 // faster. 8112 EPI.MemSafetyCheck = emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 8113 8114 // Generate the iteration count check for the main loop, *after* the check 8115 // for the epilogue loop, so that the path-length is shorter for the case 8116 // that goes directly through the vector epilogue. The longer-path length for 8117 // the main loop is compensated for, by the gain from vectorizing the larger 8118 // trip count. Note: the branch will get updated later on when we vectorize 8119 // the epilogue. 8120 EPI.MainLoopIterationCountCheck = 8121 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false); 8122 8123 // Generate the induction variable. 8124 OldInduction = Legal->getPrimaryInduction(); 8125 Type *IdxTy = Legal->getWidestInductionType(); 8126 Value *StartIdx = ConstantInt::get(IdxTy, 0); 8127 8128 IRBuilder<> B(&*Lp->getLoopPreheader()->getFirstInsertionPt()); 8129 Value *Step = getRuntimeVF(B, IdxTy, VF * UF); 8130 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 8131 EPI.VectorTripCount = CountRoundDown; 8132 Induction = 8133 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 8134 getDebugLocFromInstOrOperands(OldInduction)); 8135 8136 // Skip induction resume value creation here because they will be created in 8137 // the second pass. 
If we created them here, they wouldn't be used anyway, 8138 // because the vplan in the second pass still contains the inductions from the 8139 // original loop. 8140 8141 return completeLoopSkeleton(Lp, OrigLoopID); 8142 } 8143 8144 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { 8145 LLVM_DEBUG({ 8146 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n" 8147 << "Main Loop VF:" << EPI.MainLoopVF 8148 << ", Main Loop UF:" << EPI.MainLoopUF 8149 << ", Epilogue Loop VF:" << EPI.EpilogueVF 8150 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 8151 }); 8152 } 8153 8154 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() { 8155 DEBUG_WITH_TYPE(VerboseDebug, { 8156 dbgs() << "intermediate fn:\n" 8157 << *OrigLoop->getHeader()->getParent() << "\n"; 8158 }); 8159 } 8160 8161 BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck( 8162 Loop *L, BasicBlock *Bypass, bool ForEpilogue) { 8163 assert(L && "Expected valid Loop."); 8164 assert(Bypass && "Expected valid bypass basic block."); 8165 ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF; 8166 unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF; 8167 Value *Count = getOrCreateTripCount(L); 8168 // Reuse existing vector loop preheader for TC checks. 8169 // Note that new preheader block is generated for vector loop. 8170 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 8171 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 8172 8173 // Generate code to check if the loop's trip count is less than VF * UF of the 8174 // main vector loop. 8175 auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF : VF) ? 8176 ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 8177 8178 Value *CheckMinIters = Builder.CreateICmp( 8179 P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor), 8180 "min.iters.check"); 8181 8182 if (!ForEpilogue) 8183 TCCheckBlock->setName("vector.main.loop.iter.check"); 8184 8185 // Create new preheader for vector loop. 8186 LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), 8187 DT, LI, nullptr, "vector.ph"); 8188 8189 if (ForEpilogue) { 8190 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 8191 DT->getNode(Bypass)->getIDom()) && 8192 "TC check is expected to dominate Bypass"); 8193 8194 // Update dominator for Bypass & LoopExit. 8195 DT->changeImmediateDominator(Bypass, TCCheckBlock); 8196 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF)) 8197 // For loops with multiple exits, there's no edge from the middle block 8198 // to exit blocks (as the epilogue must run) and thus no need to update 8199 // the immediate dominator of the exit blocks. 8200 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 8201 8202 LoopBypassBlocks.push_back(TCCheckBlock); 8203 8204 // Save the trip count so we don't have to regenerate it in the 8205 // vec.epilog.iter.check. This is safe to do because the trip count 8206 // generated here dominates the vector epilog iter check. 
8207 EPI.TripCount = Count; 8208 } 8209 8210 ReplaceInstWithInst( 8211 TCCheckBlock->getTerminator(), 8212 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 8213 8214 return TCCheckBlock; 8215 } 8216 8217 //===--------------------------------------------------------------------===// 8218 // EpilogueVectorizerEpilogueLoop 8219 //===--------------------------------------------------------------------===// 8220 8221 /// This function is partially responsible for generating the control flow 8222 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 8223 BasicBlock * 8224 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { 8225 MDNode *OrigLoopID = OrigLoop->getLoopID(); 8226 Loop *Lp = createVectorLoopSkeleton("vec.epilog."); 8227 8228 // Now, compare the remaining count and if there aren't enough iterations to 8229 // execute the vectorized epilogue skip to the scalar part. 8230 BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader; 8231 VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check"); 8232 LoopVectorPreHeader = 8233 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 8234 LI, nullptr, "vec.epilog.ph"); 8235 emitMinimumVectorEpilogueIterCountCheck(Lp, LoopScalarPreHeader, 8236 VecEpilogueIterationCountCheck); 8237 8238 // Adjust the control flow taking the state info from the main loop 8239 // vectorization into account. 8240 assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck && 8241 "expected this to be saved from the previous pass."); 8242 EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith( 8243 VecEpilogueIterationCountCheck, LoopVectorPreHeader); 8244 8245 DT->changeImmediateDominator(LoopVectorPreHeader, 8246 EPI.MainLoopIterationCountCheck); 8247 8248 EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith( 8249 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8250 8251 if (EPI.SCEVSafetyCheck) 8252 EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith( 8253 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8254 if (EPI.MemSafetyCheck) 8255 EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith( 8256 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8257 8258 DT->changeImmediateDominator( 8259 VecEpilogueIterationCountCheck, 8260 VecEpilogueIterationCountCheck->getSinglePredecessor()); 8261 8262 DT->changeImmediateDominator(LoopScalarPreHeader, 8263 EPI.EpilogueIterationCountCheck); 8264 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF)) 8265 // If there is an epilogue which must run, there's no edge from the 8266 // middle block to exit blocks and thus no need to update the immediate 8267 // dominator of the exit blocks. 8268 DT->changeImmediateDominator(LoopExitBlock, 8269 EPI.EpilogueIterationCountCheck); 8270 8271 // Keep track of bypass blocks, as they feed start values to the induction 8272 // phis in the scalar loop preheader. 
8273   if (EPI.SCEVSafetyCheck)
8274     LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck);
8275   if (EPI.MemSafetyCheck)
8276     LoopBypassBlocks.push_back(EPI.MemSafetyCheck);
8277   LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck);
8278
8279   // Generate a resume induction for the vector epilogue and put it in the
8280   // vector epilogue preheader.
8281   Type *IdxTy = Legal->getWidestInductionType();
8282   PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val",
8283                                          LoopVectorPreHeader->getFirstNonPHI());
8284   EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
8285   EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
8286                            EPI.MainLoopIterationCountCheck);
8287
8288   // Generate the induction variable.
8289   OldInduction = Legal->getPrimaryInduction();
8290   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
8291   Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF);
8292   Value *StartIdx = EPResumeVal;
8293   Induction =
8294       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
8295                               getDebugLocFromInstOrOperands(OldInduction));
8296
8297   // Generate induction resume values. These variables save the new starting
8298   // indexes for the scalar loop. They are used to test if there are any tail
8299   // iterations left once the vector loop has completed.
8300   // Note that when the vectorized epilogue is skipped due to the iteration count
8301   // check, the resume value for the induction variable comes from
8302   // the trip count of the main vector loop, hence passing the AdditionalBypass
8303   // argument.
8304   createInductionResumeValues(Lp, CountRoundDown,
8305                               {VecEpilogueIterationCountCheck,
8306                                EPI.VectorTripCount} /* AdditionalBypass */);
8307
8308   AddRuntimeUnrollDisableMetaData(Lp);
8309   return completeLoopSkeleton(Lp, OrigLoopID);
8310 }
8311
8312 BasicBlock *
8313 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
8314     Loop *L, BasicBlock *Bypass, BasicBlock *Insert) {
8315
8316   assert(EPI.TripCount &&
8317          "Expected trip count to have been saved in the first pass.");
8318   assert(
8319       (!isa<Instruction>(EPI.TripCount) ||
8320        DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
8321       "saved trip count does not dominate insertion point.");
8322   Value *TC = EPI.TripCount;
8323   IRBuilder<> Builder(Insert->getTerminator());
8324   Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");
8325
8326   // Generate code to check if the loop's trip count is less than VF * UF of the
8327   // vector epilogue loop.
8328   auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF) ?
8329 ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 8330 8331 Value *CheckMinIters = 8332 Builder.CreateICmp(P, Count, 8333 createStepForVF(Builder, Count->getType(), 8334 EPI.EpilogueVF, EPI.EpilogueUF), 8335 "min.epilog.iters.check"); 8336 8337 ReplaceInstWithInst( 8338 Insert->getTerminator(), 8339 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 8340 8341 LoopBypassBlocks.push_back(Insert); 8342 return Insert; 8343 } 8344 8345 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() { 8346 LLVM_DEBUG({ 8347 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n" 8348 << "Epilogue Loop VF:" << EPI.EpilogueVF 8349 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 8350 }); 8351 } 8352 8353 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() { 8354 DEBUG_WITH_TYPE(VerboseDebug, { 8355 dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n"; 8356 }); 8357 } 8358 8359 bool LoopVectorizationPlanner::getDecisionAndClampRange( 8360 const std::function<bool(ElementCount)> &Predicate, VFRange &Range) { 8361 assert(!Range.isEmpty() && "Trying to test an empty VF range."); 8362 bool PredicateAtRangeStart = Predicate(Range.Start); 8363 8364 for (ElementCount TmpVF = Range.Start * 2; 8365 ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2) 8366 if (Predicate(TmpVF) != PredicateAtRangeStart) { 8367 Range.End = TmpVF; 8368 break; 8369 } 8370 8371 return PredicateAtRangeStart; 8372 } 8373 8374 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, 8375 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range 8376 /// of VF's starting at a given VF and extending it as much as possible. Each 8377 /// vectorization decision can potentially shorten this sub-range during 8378 /// buildVPlan(). 8379 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF, 8380 ElementCount MaxVF) { 8381 auto MaxVFPlusOne = MaxVF.getWithIncrement(1); 8382 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { 8383 VFRange SubRange = {VF, MaxVFPlusOne}; 8384 VPlans.push_back(buildVPlan(SubRange)); 8385 VF = SubRange.End; 8386 } 8387 } 8388 8389 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, 8390 VPlanPtr &Plan) { 8391 assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); 8392 8393 // Look for cached value. 8394 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 8395 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); 8396 if (ECEntryIt != EdgeMaskCache.end()) 8397 return ECEntryIt->second; 8398 8399 VPValue *SrcMask = createBlockInMask(Src, Plan); 8400 8401 // The terminator has to be a branch inst! 8402 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 8403 assert(BI && "Unexpected terminator found"); 8404 8405 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) 8406 return EdgeMaskCache[Edge] = SrcMask; 8407 8408 // If source is an exiting block, we know the exit edge is dynamically dead 8409 // in the vector loop, and thus we don't need to restrict the mask. Avoid 8410 // adding uses of an otherwise potentially dead instruction. 8411 if (OrigLoop->isLoopExiting(Src)) 8412 return EdgeMaskCache[Edge] = SrcMask; 8413 8414 VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition()); 8415 assert(EdgeMask && "No Edge Mask found for condition"); 8416 8417 if (BI->getSuccessor(0) != Dst) 8418 EdgeMask = Builder.createNot(EdgeMask); 8419 8420 if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND. 
8421 // The condition is 'SrcMask && EdgeMask', which is equivalent to 8422 // 'select i1 SrcMask, i1 EdgeMask, i1 false'. 8423 // The select version does not introduce new UB if SrcMask is false and 8424 // EdgeMask is poison. Using 'and' here introduces undefined behavior. 8425 VPValue *False = Plan->getOrAddVPValue( 8426 ConstantInt::getFalse(BI->getCondition()->getType())); 8427 EdgeMask = Builder.createSelect(SrcMask, EdgeMask, False); 8428 } 8429 8430 return EdgeMaskCache[Edge] = EdgeMask; 8431 } 8432 8433 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { 8434 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 8435 8436 // Look for cached value. 8437 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); 8438 if (BCEntryIt != BlockMaskCache.end()) 8439 return BCEntryIt->second; 8440 8441 // All-one mask is modelled as no-mask following the convention for masked 8442 // load/store/gather/scatter. Initialize BlockMask to no-mask. 8443 VPValue *BlockMask = nullptr; 8444 8445 if (OrigLoop->getHeader() == BB) { 8446 if (!CM.blockNeedsPredicationForAnyReason(BB)) 8447 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. 8448 8449 // Create the block in mask as the first non-phi instruction in the block. 8450 VPBuilder::InsertPointGuard Guard(Builder); 8451 auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi(); 8452 Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint); 8453 8454 // Introduce the early-exit compare IV <= BTC to form header block mask. 8455 // This is used instead of IV < TC because TC may wrap, unlike BTC. 8456 // Start by constructing the desired canonical IV. 8457 VPValue *IV = nullptr; 8458 if (Legal->getPrimaryInduction()) 8459 IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction()); 8460 else { 8461 auto *IVRecipe = new VPWidenCanonicalIVRecipe(); 8462 Builder.getInsertBlock()->insert(IVRecipe, NewInsertionPoint); 8463 IV = IVRecipe; 8464 } 8465 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); 8466 bool TailFolded = !CM.isScalarEpilogueAllowed(); 8467 8468 if (TailFolded && CM.TTI.emitGetActiveLaneMask()) { 8469 // While ActiveLaneMask is a binary op that consumes the loop tripcount 8470 // as a second argument, we only pass the IV here and extract the 8471 // tripcount from the transform state where codegen of the VP instructions 8472 // happen. 8473 BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV}); 8474 } else { 8475 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); 8476 } 8477 return BlockMaskCache[BB] = BlockMask; 8478 } 8479 8480 // This is the block mask. We OR all incoming edges. 8481 for (auto *Predecessor : predecessors(BB)) { 8482 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); 8483 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too. 8484 return BlockMaskCache[BB] = EdgeMask; 8485 8486 if (!BlockMask) { // BlockMask has its initialized nullptr value. 
8487 BlockMask = EdgeMask; 8488 continue; 8489 } 8490 8491 BlockMask = Builder.createOr(BlockMask, EdgeMask); 8492 } 8493 8494 return BlockMaskCache[BB] = BlockMask; 8495 } 8496 8497 VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, 8498 ArrayRef<VPValue *> Operands, 8499 VFRange &Range, 8500 VPlanPtr &Plan) { 8501 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 8502 "Must be called with either a load or store"); 8503 8504 auto willWiden = [&](ElementCount VF) -> bool { 8505 if (VF.isScalar()) 8506 return false; 8507 LoopVectorizationCostModel::InstWidening Decision = 8508 CM.getWideningDecision(I, VF); 8509 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 8510 "CM decision should be taken at this point."); 8511 if (Decision == LoopVectorizationCostModel::CM_Interleave) 8512 return true; 8513 if (CM.isScalarAfterVectorization(I, VF) || 8514 CM.isProfitableToScalarize(I, VF)) 8515 return false; 8516 return Decision != LoopVectorizationCostModel::CM_Scalarize; 8517 }; 8518 8519 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8520 return nullptr; 8521 8522 VPValue *Mask = nullptr; 8523 if (Legal->isMaskRequired(I)) 8524 Mask = createBlockInMask(I->getParent(), Plan); 8525 8526 // Determine if the pointer operand of the access is either consecutive or 8527 // reverse consecutive. 8528 LoopVectorizationCostModel::InstWidening Decision = 8529 CM.getWideningDecision(I, Range.Start); 8530 bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse; 8531 bool Consecutive = 8532 Reverse || Decision == LoopVectorizationCostModel::CM_Widen; 8533 8534 if (LoadInst *Load = dyn_cast<LoadInst>(I)) 8535 return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask, 8536 Consecutive, Reverse); 8537 8538 StoreInst *Store = cast<StoreInst>(I); 8539 return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0], 8540 Mask, Consecutive, Reverse); 8541 } 8542 8543 VPWidenIntOrFpInductionRecipe * 8544 VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi, 8545 ArrayRef<VPValue *> Operands) const { 8546 // Check if this is an integer or fp induction. If so, build the recipe that 8547 // produces its scalar and vector values. 8548 if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi)) { 8549 assert(II->getStartValue() == 8550 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); 8551 return new VPWidenIntOrFpInductionRecipe(Phi, Operands[0], *II); 8552 } 8553 8554 return nullptr; 8555 } 8556 8557 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate( 8558 TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range, 8559 VPlan &Plan) const { 8560 // Optimize the special case where the source is a constant integer 8561 // induction variable. Notice that we can only optimize the 'trunc' case 8562 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and 8563 // (c) other casts depend on pointer size. 8564 8565 // Determine whether \p K is a truncation based on an induction variable that 8566 // can be optimized. 
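  // For instance (illustrative IR), given a canonical i64 induction %iv,
  //   %t = trunc i64 %iv to i32
  // can be recognized here and widened directly as a narrower i32 induction
  // <start, +, step>, rather than widening %iv to <VF x i64> and truncating
  // every element afterwards.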
8567 auto isOptimizableIVTruncate = 8568 [&](Instruction *K) -> std::function<bool(ElementCount)> { 8569 return [=](ElementCount VF) -> bool { 8570 return CM.isOptimizableIVTruncate(K, VF); 8571 }; 8572 }; 8573 8574 if (LoopVectorizationPlanner::getDecisionAndClampRange( 8575 isOptimizableIVTruncate(I), Range)) { 8576 8577 auto *Phi = cast<PHINode>(I->getOperand(0)); 8578 const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi); 8579 VPValue *Start = Plan.getOrAddVPValue(II.getStartValue()); 8580 return new VPWidenIntOrFpInductionRecipe(Phi, Start, II, I); 8581 } 8582 return nullptr; 8583 } 8584 8585 VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi, 8586 ArrayRef<VPValue *> Operands, 8587 VPlanPtr &Plan) { 8588 // If all incoming values are equal, the incoming VPValue can be used directly 8589 // instead of creating a new VPBlendRecipe. 8590 VPValue *FirstIncoming = Operands[0]; 8591 if (all_of(Operands, [FirstIncoming](const VPValue *Inc) { 8592 return FirstIncoming == Inc; 8593 })) { 8594 return Operands[0]; 8595 } 8596 8597 // We know that all PHIs in non-header blocks are converted into selects, so 8598 // we don't have to worry about the insertion order and we can just use the 8599 // builder. At this point we generate the predication tree. There may be 8600 // duplications since this is a simple recursive scan, but future 8601 // optimizations will clean it up. 8602 SmallVector<VPValue *, 2> OperandsWithMask; 8603 unsigned NumIncoming = Phi->getNumIncomingValues(); 8604 8605 for (unsigned In = 0; In < NumIncoming; In++) { 8606 VPValue *EdgeMask = 8607 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan); 8608 assert((EdgeMask || NumIncoming == 1) && 8609 "Multiple predecessors with one having a full mask"); 8610 OperandsWithMask.push_back(Operands[In]); 8611 if (EdgeMask) 8612 OperandsWithMask.push_back(EdgeMask); 8613 } 8614 return toVPRecipeResult(new VPBlendRecipe(Phi, OperandsWithMask)); 8615 } 8616 8617 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, 8618 ArrayRef<VPValue *> Operands, 8619 VFRange &Range) const { 8620 8621 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8622 [this, CI](ElementCount VF) { return CM.isScalarWithPredication(CI); }, 8623 Range); 8624 8625 if (IsPredicated) 8626 return nullptr; 8627 8628 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8629 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || 8630 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect || 8631 ID == Intrinsic::pseudoprobe || 8632 ID == Intrinsic::experimental_noalias_scope_decl)) 8633 return nullptr; 8634 8635 auto willWiden = [&](ElementCount VF) -> bool { 8636 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8637 // The following case may be scalarized depending on the VF. 8638 // The flag shows whether we use Intrinsic or a usual Call for vectorized 8639 // version of the instruction. 8640 // Is it beneficial to perform intrinsic call compared to lib call? 8641 bool NeedToScalarize = false; 8642 InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); 8643 InstructionCost IntrinsicCost = ID ? 
CM.getVectorIntrinsicCost(CI, VF) : 0; 8644 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 8645 return UseVectorIntrinsic || !NeedToScalarize; 8646 }; 8647 8648 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8649 return nullptr; 8650 8651 ArrayRef<VPValue *> Ops = Operands.take_front(CI->arg_size()); 8652 return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end())); 8653 } 8654 8655 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { 8656 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) && 8657 !isa<StoreInst>(I) && "Instruction should have been handled earlier"); 8658 // Instruction should be widened, unless it is scalar after vectorization, 8659 // scalarization is profitable or it is predicated. 8660 auto WillScalarize = [this, I](ElementCount VF) -> bool { 8661 return CM.isScalarAfterVectorization(I, VF) || 8662 CM.isProfitableToScalarize(I, VF) || CM.isScalarWithPredication(I); 8663 }; 8664 return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize, 8665 Range); 8666 } 8667 8668 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, 8669 ArrayRef<VPValue *> Operands) const { 8670 auto IsVectorizableOpcode = [](unsigned Opcode) { 8671 switch (Opcode) { 8672 case Instruction::Add: 8673 case Instruction::And: 8674 case Instruction::AShr: 8675 case Instruction::BitCast: 8676 case Instruction::FAdd: 8677 case Instruction::FCmp: 8678 case Instruction::FDiv: 8679 case Instruction::FMul: 8680 case Instruction::FNeg: 8681 case Instruction::FPExt: 8682 case Instruction::FPToSI: 8683 case Instruction::FPToUI: 8684 case Instruction::FPTrunc: 8685 case Instruction::FRem: 8686 case Instruction::FSub: 8687 case Instruction::ICmp: 8688 case Instruction::IntToPtr: 8689 case Instruction::LShr: 8690 case Instruction::Mul: 8691 case Instruction::Or: 8692 case Instruction::PtrToInt: 8693 case Instruction::SDiv: 8694 case Instruction::Select: 8695 case Instruction::SExt: 8696 case Instruction::Shl: 8697 case Instruction::SIToFP: 8698 case Instruction::SRem: 8699 case Instruction::Sub: 8700 case Instruction::Trunc: 8701 case Instruction::UDiv: 8702 case Instruction::UIToFP: 8703 case Instruction::URem: 8704 case Instruction::Xor: 8705 case Instruction::ZExt: 8706 return true; 8707 } 8708 return false; 8709 }; 8710 8711 if (!IsVectorizableOpcode(I->getOpcode())) 8712 return nullptr; 8713 8714 // Success: widen this instruction. 
8715 return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end())); 8716 } 8717 8718 void VPRecipeBuilder::fixHeaderPhis() { 8719 BasicBlock *OrigLatch = OrigLoop->getLoopLatch(); 8720 for (VPWidenPHIRecipe *R : PhisToFix) { 8721 auto *PN = cast<PHINode>(R->getUnderlyingValue()); 8722 VPRecipeBase *IncR = 8723 getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch))); 8724 R->addOperand(IncR->getVPSingleValue()); 8725 } 8726 } 8727 8728 VPBasicBlock *VPRecipeBuilder::handleReplication( 8729 Instruction *I, VFRange &Range, VPBasicBlock *VPBB, 8730 VPlanPtr &Plan) { 8731 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( 8732 [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); }, 8733 Range); 8734 8735 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8736 [&](ElementCount VF) { return CM.isPredicatedInst(I, IsUniform); }, 8737 Range); 8738 8739 // Even if the instruction is not marked as uniform, there are certain 8740 // intrinsic calls that can be effectively treated as such, so we check for 8741 // them here. Conservatively, we only do this for scalable vectors, since 8742 // for fixed-width VFs we can always fall back on full scalarization. 8743 if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) { 8744 switch (cast<IntrinsicInst>(I)->getIntrinsicID()) { 8745 case Intrinsic::assume: 8746 case Intrinsic::lifetime_start: 8747 case Intrinsic::lifetime_end: 8748 // For scalable vectors if one of the operands is variant then we still 8749 // want to mark as uniform, which will generate one instruction for just 8750 // the first lane of the vector. We can't scalarize the call in the same 8751 // way as for fixed-width vectors because we don't know how many lanes 8752 // there are. 8753 // 8754 // The reasons for doing it this way for scalable vectors are: 8755 // 1. For the assume intrinsic generating the instruction for the first 8756 // lane is still be better than not generating any at all. For 8757 // example, the input may be a splat across all lanes. 8758 // 2. For the lifetime start/end intrinsics the pointer operand only 8759 // does anything useful when the input comes from a stack object, 8760 // which suggests it should always be uniform. For non-stack objects 8761 // the effect is to poison the object, which still allows us to 8762 // remove the call. 8763 IsUniform = true; 8764 break; 8765 default: 8766 break; 8767 } 8768 } 8769 8770 auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()), 8771 IsUniform, IsPredicated); 8772 setRecipe(I, Recipe); 8773 Plan->addVPValue(I, Recipe); 8774 8775 // Find if I uses a predicated instruction. If so, it will use its scalar 8776 // value. Avoid hoisting the insert-element which packs the scalar value into 8777 // a vector value, as that happens iff all users use the vector value. 8778 for (VPValue *Op : Recipe->operands()) { 8779 auto *PredR = dyn_cast_or_null<VPPredInstPHIRecipe>(Op->getDef()); 8780 if (!PredR) 8781 continue; 8782 auto *RepR = 8783 cast_or_null<VPReplicateRecipe>(PredR->getOperand(0)->getDef()); 8784 assert(RepR->isPredicated() && 8785 "expected Replicate recipe to be predicated"); 8786 RepR->setAlsoPack(false); 8787 } 8788 8789 // Finalize the recipe for Instr, first if it is not predicated. 
8790 if (!IsPredicated) { 8791 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n"); 8792 VPBB->appendRecipe(Recipe); 8793 return VPBB; 8794 } 8795 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); 8796 assert(VPBB->getSuccessors().empty() && 8797 "VPBB has successors when handling predicated replication."); 8798 // Record predicated instructions for above packing optimizations. 8799 VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan); 8800 VPBlockUtils::insertBlockAfter(Region, VPBB); 8801 auto *RegSucc = new VPBasicBlock(); 8802 VPBlockUtils::insertBlockAfter(RegSucc, Region); 8803 return RegSucc; 8804 } 8805 8806 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr, 8807 VPRecipeBase *PredRecipe, 8808 VPlanPtr &Plan) { 8809 // Instructions marked for predication are replicated and placed under an 8810 // if-then construct to prevent side-effects. 8811 8812 // Generate recipes to compute the block mask for this region. 8813 VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan); 8814 8815 // Build the triangular if-then region. 8816 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str(); 8817 assert(Instr->getParent() && "Predicated instruction not in any basic block"); 8818 auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask); 8819 auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); 8820 auto *PHIRecipe = Instr->getType()->isVoidTy() 8821 ? nullptr 8822 : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr)); 8823 if (PHIRecipe) { 8824 Plan->removeVPValueFor(Instr); 8825 Plan->addVPValue(Instr, PHIRecipe); 8826 } 8827 auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); 8828 auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); 8829 VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true); 8830 8831 // Note: first set Entry as region entry and then connect successors starting 8832 // from it in order, to propagate the "parent" of each VPBasicBlock. 8833 VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry); 8834 VPBlockUtils::connectBlocks(Pred, Exit); 8835 8836 return Region; 8837 } 8838 8839 VPRecipeOrVPValueTy 8840 VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, 8841 ArrayRef<VPValue *> Operands, 8842 VFRange &Range, VPlanPtr &Plan) { 8843 // First, check for specific widening recipes that deal with calls, memory 8844 // operations, inductions and Phi nodes. 
8845 if (auto *CI = dyn_cast<CallInst>(Instr)) 8846 return toVPRecipeResult(tryToWidenCall(CI, Operands, Range)); 8847 8848 if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr)) 8849 return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan)); 8850 8851 VPRecipeBase *Recipe; 8852 if (auto Phi = dyn_cast<PHINode>(Instr)) { 8853 if (Phi->getParent() != OrigLoop->getHeader()) 8854 return tryToBlend(Phi, Operands, Plan); 8855 if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands))) 8856 return toVPRecipeResult(Recipe); 8857 8858 VPWidenPHIRecipe *PhiRecipe = nullptr; 8859 if (Legal->isReductionVariable(Phi) || Legal->isFirstOrderRecurrence(Phi)) { 8860 VPValue *StartV = Operands[0]; 8861 if (Legal->isReductionVariable(Phi)) { 8862 const RecurrenceDescriptor &RdxDesc = 8863 Legal->getReductionVars().find(Phi)->second; 8864 assert(RdxDesc.getRecurrenceStartValue() == 8865 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); 8866 PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV, 8867 CM.isInLoopReduction(Phi), 8868 CM.useOrderedReductions(RdxDesc)); 8869 } else { 8870 PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV); 8871 } 8872 8873 // Record the incoming value from the backedge, so we can add the incoming 8874 // value from the backedge after all recipes have been created. 8875 recordRecipeOf(cast<Instruction>( 8876 Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch()))); 8877 PhisToFix.push_back(PhiRecipe); 8878 } else { 8879 // TODO: record start and backedge value for remaining pointer induction 8880 // phis. 8881 assert(Phi->getType()->isPointerTy() && 8882 "only pointer phis should be handled here"); 8883 PhiRecipe = new VPWidenPHIRecipe(Phi); 8884 } 8885 8886 return toVPRecipeResult(PhiRecipe); 8887 } 8888 8889 if (isa<TruncInst>(Instr) && 8890 (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Operands, 8891 Range, *Plan))) 8892 return toVPRecipeResult(Recipe); 8893 8894 if (!shouldWiden(Instr, Range)) 8895 return nullptr; 8896 8897 if (auto GEP = dyn_cast<GetElementPtrInst>(Instr)) 8898 return toVPRecipeResult(new VPWidenGEPRecipe( 8899 GEP, make_range(Operands.begin(), Operands.end()), OrigLoop)); 8900 8901 if (auto *SI = dyn_cast<SelectInst>(Instr)) { 8902 bool InvariantCond = 8903 PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop); 8904 return toVPRecipeResult(new VPWidenSelectRecipe( 8905 *SI, make_range(Operands.begin(), Operands.end()), InvariantCond)); 8906 } 8907 8908 return toVPRecipeResult(tryToWiden(Instr, Operands)); 8909 } 8910 8911 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, 8912 ElementCount MaxVF) { 8913 assert(OrigLoop->isInnermost() && "Inner loop expected."); 8914 8915 // Collect instructions from the original loop that will become trivially dead 8916 // in the vectorized loop. We don't need to vectorize these instructions. For 8917 // example, original induction update instructions can become dead because we 8918 // separately emit induction "steps" when generating code for the new loop. 8919 // Similarly, we create a new latch condition when setting up the structure 8920 // of the new loop, so the old one can become dead. 8921 SmallPtrSet<Instruction *, 4> DeadInstructions; 8922 collectTriviallyDeadInstructions(DeadInstructions); 8923 8924 // Add assume instructions we need to drop to DeadInstructions, to prevent 8925 // them from being added to the VPlan. 8926 // TODO: We only need to drop assumes in blocks that get flattend. 
If the
8927   // control flow is preserved, we should keep them.
8928   auto &ConditionalAssumes = Legal->getConditionalAssumes();
8929   DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
8930
8931   MapVector<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
8932   // Dead instructions do not need sinking. Remove them from SinkAfter.
8933   for (Instruction *I : DeadInstructions)
8934     SinkAfter.erase(I);
8935
8936   // Cannot sink instructions after dead instructions (there won't be any
8937   // recipes for them). Instead, find the first non-dead previous instruction.
8938   for (auto &P : Legal->getSinkAfter()) {
8939     Instruction *SinkTarget = P.second;
8940     Instruction *FirstInst = &*SinkTarget->getParent()->begin();
8941     (void)FirstInst;
8942     while (DeadInstructions.contains(SinkTarget)) {
8943       assert(
8944           SinkTarget != FirstInst &&
8945           "Must find a live instruction (at least the one feeding the "
8946           "first-order recurrence PHI) before reaching beginning of the block");
8947       SinkTarget = SinkTarget->getPrevNode();
8948       assert(SinkTarget != P.first &&
8949              "sink source equals target, no sinking required");
8950     }
8951     P.second = SinkTarget;
8952   }
8953
8954   auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
8955   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
8956     VFRange SubRange = {VF, MaxVFPlusOne};
8957     VPlans.push_back(
8958         buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter));
8959     VF = SubRange.End;
8960   }
8961 }
8962
8963 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
8964     VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions,
8965     const MapVector<Instruction *, Instruction *> &SinkAfter) {
8966
8967   SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
8968
8969   VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);
8970
8971   // ---------------------------------------------------------------------------
8972   // Pre-construction: record ingredients whose recipes we'll need to further
8973   // process after constructing the initial VPlan.
8974   // ---------------------------------------------------------------------------
8975
8976   // Mark instructions we'll need to sink later and their targets as
8977   // ingredients whose recipe we'll need to record.
8978   for (auto &Entry : SinkAfter) {
8979     RecipeBuilder.recordRecipeOf(Entry.first);
8980     RecipeBuilder.recordRecipeOf(Entry.second);
8981   }
8982   for (auto &Reduction : CM.getInLoopReductionChains()) {
8983     PHINode *Phi = Reduction.first;
8984     RecurKind Kind =
8985         Legal->getReductionVars().find(Phi)->second.getRecurrenceKind();
8986     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
8987
8988     RecipeBuilder.recordRecipeOf(Phi);
8989     for (auto &R : ReductionOperations) {
8990       RecipeBuilder.recordRecipeOf(R);
8991       // For min/max reductions, where we have a pair of icmp/select, we also
8992       // need to record the ICmp recipe, so it can be removed later.
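      // Such a reduction typically looks like (illustrative IR):
      //   %cmp = icmp slt i32 %x, %min
      //   %min.next = select i1 %cmp, i32 %x, i32 %min
      // The select is the recorded reduction operation; the icmp only feeds it
      // and becomes dead once the select is rewritten as a reduction recipe.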
8993 assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) && 8994 "Only min/max recurrences allowed for inloop reductions"); 8995 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) 8996 RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0))); 8997 } 8998 } 8999 9000 // For each interleave group which is relevant for this (possibly trimmed) 9001 // Range, add it to the set of groups to be later applied to the VPlan and add 9002 // placeholders for its members' Recipes which we'll be replacing with a 9003 // single VPInterleaveRecipe. 9004 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) { 9005 auto applyIG = [IG, this](ElementCount VF) -> bool { 9006 return (VF.isVector() && // Query is illegal for VF == 1 9007 CM.getWideningDecision(IG->getInsertPos(), VF) == 9008 LoopVectorizationCostModel::CM_Interleave); 9009 }; 9010 if (!getDecisionAndClampRange(applyIG, Range)) 9011 continue; 9012 InterleaveGroups.insert(IG); 9013 for (unsigned i = 0; i < IG->getFactor(); i++) 9014 if (Instruction *Member = IG->getMember(i)) 9015 RecipeBuilder.recordRecipeOf(Member); 9016 }; 9017 9018 // --------------------------------------------------------------------------- 9019 // Build initial VPlan: Scan the body of the loop in a topological order to 9020 // visit each basic block after having visited its predecessor basic blocks. 9021 // --------------------------------------------------------------------------- 9022 9023 auto Plan = std::make_unique<VPlan>(); 9024 9025 // Scan the body of the loop in a topological order to visit each basic block 9026 // after having visited its predecessor basic blocks. 9027 LoopBlocksDFS DFS(OrigLoop); 9028 DFS.perform(LI); 9029 9030 VPBasicBlock *VPBB = nullptr; 9031 VPBasicBlock *HeaderVPBB = nullptr; 9032 SmallVector<VPWidenIntOrFpInductionRecipe *> InductionsToMove; 9033 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 9034 // Relevant instructions from basic block BB will be grouped into VPRecipe 9035 // ingredients and fill a new VPBasicBlock. 9036 unsigned VPBBsForBB = 0; 9037 auto *FirstVPBBForBB = new VPBasicBlock(BB->getName()); 9038 if (VPBB) 9039 VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB); 9040 else { 9041 auto *TopRegion = new VPRegionBlock("vector loop"); 9042 TopRegion->setEntry(FirstVPBBForBB); 9043 Plan->setEntry(TopRegion); 9044 HeaderVPBB = FirstVPBBForBB; 9045 } 9046 VPBB = FirstVPBBForBB; 9047 Builder.setInsertPoint(VPBB); 9048 9049 // Introduce each ingredient into VPlan. 9050 // TODO: Model and preserve debug instrinsics in VPlan. 9051 for (Instruction &I : BB->instructionsWithoutDebug()) { 9052 Instruction *Instr = &I; 9053 9054 // First filter out irrelevant instructions, to ensure no recipes are 9055 // built for them. 9056 if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr)) 9057 continue; 9058 9059 SmallVector<VPValue *, 4> Operands; 9060 auto *Phi = dyn_cast<PHINode>(Instr); 9061 if (Phi && Phi->getParent() == OrigLoop->getHeader()) { 9062 Operands.push_back(Plan->getOrAddVPValue( 9063 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()))); 9064 } else { 9065 auto OpRange = Plan->mapToVPValues(Instr->operands()); 9066 Operands = {OpRange.begin(), OpRange.end()}; 9067 } 9068 if (auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe( 9069 Instr, Operands, Range, Plan)) { 9070 // If Instr can be simplified to an existing VPValue, use it. 
9071 if (RecipeOrValue.is<VPValue *>()) { 9072 auto *VPV = RecipeOrValue.get<VPValue *>(); 9073 Plan->addVPValue(Instr, VPV); 9074 // If the re-used value is a recipe, register the recipe for the 9075 // instruction, in case the recipe for Instr needs to be recorded. 9076 if (auto *R = dyn_cast_or_null<VPRecipeBase>(VPV->getDef())) 9077 RecipeBuilder.setRecipe(Instr, R); 9078 continue; 9079 } 9080 // Otherwise, add the new recipe. 9081 VPRecipeBase *Recipe = RecipeOrValue.get<VPRecipeBase *>(); 9082 for (auto *Def : Recipe->definedValues()) { 9083 auto *UV = Def->getUnderlyingValue(); 9084 Plan->addVPValue(UV, Def); 9085 } 9086 9087 if (isa<VPWidenIntOrFpInductionRecipe>(Recipe) && 9088 HeaderVPBB->getFirstNonPhi() != VPBB->end()) { 9089 // Keep track of VPWidenIntOrFpInductionRecipes not in the phi section 9090 // of the header block. That can happen for truncates of induction 9091 // variables. Those recipes are moved to the phi section of the header 9092 // block after applying SinkAfter, which relies on the original 9093 // position of the trunc. 9094 assert(isa<TruncInst>(Instr)); 9095 InductionsToMove.push_back( 9096 cast<VPWidenIntOrFpInductionRecipe>(Recipe)); 9097 } 9098 RecipeBuilder.setRecipe(Instr, Recipe); 9099 VPBB->appendRecipe(Recipe); 9100 continue; 9101 } 9102 9103 // Otherwise, if all widening options failed, Instruction is to be 9104 // replicated. This may create a successor for VPBB. 9105 VPBasicBlock *NextVPBB = 9106 RecipeBuilder.handleReplication(Instr, Range, VPBB, Plan); 9107 if (NextVPBB != VPBB) { 9108 VPBB = NextVPBB; 9109 VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++) 9110 : ""); 9111 } 9112 } 9113 } 9114 9115 assert(isa<VPRegionBlock>(Plan->getEntry()) && 9116 !Plan->getEntry()->getEntryBasicBlock()->empty() && 9117 "entry block must be set to a VPRegionBlock having a non-empty entry " 9118 "VPBasicBlock"); 9119 RecipeBuilder.fixHeaderPhis(); 9120 9121 // --------------------------------------------------------------------------- 9122 // Transform initial VPlan: Apply previously taken decisions, in order, to 9123 // bring the VPlan to its final state. 9124 // --------------------------------------------------------------------------- 9125 9126 // Apply Sink-After legal constraints. 9127 auto GetReplicateRegion = [](VPRecipeBase *R) -> VPRegionBlock * { 9128 auto *Region = dyn_cast_or_null<VPRegionBlock>(R->getParent()->getParent()); 9129 if (Region && Region->isReplicator()) { 9130 assert(Region->getNumSuccessors() == 1 && 9131 Region->getNumPredecessors() == 1 && "Expected SESE region!"); 9132 assert(R->getParent()->size() == 1 && 9133 "A recipe in an original replicator region must be the only " 9134 "recipe in its block"); 9135 return Region; 9136 } 9137 return nullptr; 9138 }; 9139 for (auto &Entry : SinkAfter) { 9140 VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first); 9141 VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second); 9142 9143 auto *TargetRegion = GetReplicateRegion(Target); 9144 auto *SinkRegion = GetReplicateRegion(Sink); 9145 if (!SinkRegion) { 9146 // If the sink source is not a replicate region, sink the recipe directly. 9147 if (TargetRegion) { 9148 // The target is in a replication region, make sure to move Sink to 9149 // the block after it, not into the replication region itself. 
9150 VPBasicBlock *NextBlock = 9151 cast<VPBasicBlock>(TargetRegion->getSuccessors().front()); 9152 Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi()); 9153 } else 9154 Sink->moveAfter(Target); 9155 continue; 9156 } 9157 9158 // The sink source is in a replicate region. Unhook the region from the CFG. 9159 auto *SinkPred = SinkRegion->getSinglePredecessor(); 9160 auto *SinkSucc = SinkRegion->getSingleSuccessor(); 9161 VPBlockUtils::disconnectBlocks(SinkPred, SinkRegion); 9162 VPBlockUtils::disconnectBlocks(SinkRegion, SinkSucc); 9163 VPBlockUtils::connectBlocks(SinkPred, SinkSucc); 9164 9165 if (TargetRegion) { 9166 // The target recipe is also in a replicate region, move the sink region 9167 // after the target region. 9168 auto *TargetSucc = TargetRegion->getSingleSuccessor(); 9169 VPBlockUtils::disconnectBlocks(TargetRegion, TargetSucc); 9170 VPBlockUtils::connectBlocks(TargetRegion, SinkRegion); 9171 VPBlockUtils::connectBlocks(SinkRegion, TargetSucc); 9172 } else { 9173 // The sink source is in a replicate region, we need to move the whole 9174 // replicate region, which should only contain a single recipe in the 9175 // main block. 9176 auto *SplitBlock = 9177 Target->getParent()->splitAt(std::next(Target->getIterator())); 9178 9179 auto *SplitPred = SplitBlock->getSinglePredecessor(); 9180 9181 VPBlockUtils::disconnectBlocks(SplitPred, SplitBlock); 9182 VPBlockUtils::connectBlocks(SplitPred, SinkRegion); 9183 VPBlockUtils::connectBlocks(SinkRegion, SplitBlock); 9184 if (VPBB == SplitPred) 9185 VPBB = SplitBlock; 9186 } 9187 } 9188 9189 cast<VPRegionBlock>(Plan->getEntry())->setExit(VPBB); 9190 9191 VPlanTransforms::removeRedundantInductionCasts(*Plan); 9192 9193 // Now that sink-after is done, move induction recipes for optimized truncates 9194 // to the phi section of the header block. 9195 for (VPWidenIntOrFpInductionRecipe *Ind : InductionsToMove) 9196 Ind->moveBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi()); 9197 9198 // Adjust the recipes for any inloop reductions. 9199 adjustRecipesForReductions(VPBB, Plan, RecipeBuilder, Range.Start); 9200 9201 // Introduce a recipe to combine the incoming and previous values of a 9202 // first-order recurrence. 9203 for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) { 9204 auto *RecurPhi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R); 9205 if (!RecurPhi) 9206 continue; 9207 9208 VPRecipeBase *PrevRecipe = RecurPhi->getBackedgeRecipe(); 9209 VPBasicBlock *InsertBlock = PrevRecipe->getParent(); 9210 auto *Region = GetReplicateRegion(PrevRecipe); 9211 if (Region) 9212 InsertBlock = cast<VPBasicBlock>(Region->getSingleSuccessor()); 9213 if (Region || PrevRecipe->isPhi()) 9214 Builder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi()); 9215 else 9216 Builder.setInsertPoint(InsertBlock, std::next(PrevRecipe->getIterator())); 9217 9218 auto *RecurSplice = cast<VPInstruction>( 9219 Builder.createNaryOp(VPInstruction::FirstOrderRecurrenceSplice, 9220 {RecurPhi, RecurPhi->getBackedgeValue()})); 9221 9222 RecurPhi->replaceAllUsesWith(RecurSplice); 9223 // Set the first operand of RecurSplice to RecurPhi again, after replacing 9224 // all users. 9225 RecurSplice->setOperand(0, RecurPhi); 9226 } 9227 9228 // Interleave memory: for each Interleave Group we marked earlier as relevant 9229 // for this VPlan, replace the Recipes widening its memory instructions with a 9230 // single VPInterleaveRecipe at its insertion point. 
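  // As an illustrative example, an interleave group of factor 2 formed from
  //   %even = load i32, i32* %p.even   ; A[2*i]
  //   %odd  = load i32, i32* %p.odd    ; A[2*i+1]
  // is lowered to a single wide load covering both members, followed by
  // shuffles that de-interleave the even and odd lanes; the two original
  // memory recipes are folded into one VPInterleaveRecipe below.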
9231 for (auto IG : InterleaveGroups) { 9232 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( 9233 RecipeBuilder.getRecipe(IG->getInsertPos())); 9234 SmallVector<VPValue *, 4> StoredValues; 9235 for (unsigned i = 0; i < IG->getFactor(); ++i) 9236 if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) { 9237 auto *StoreR = 9238 cast<VPWidenMemoryInstructionRecipe>(RecipeBuilder.getRecipe(SI)); 9239 StoredValues.push_back(StoreR->getStoredValue()); 9240 } 9241 9242 auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues, 9243 Recipe->getMask()); 9244 VPIG->insertBefore(Recipe); 9245 unsigned J = 0; 9246 for (unsigned i = 0; i < IG->getFactor(); ++i) 9247 if (Instruction *Member = IG->getMember(i)) { 9248 if (!Member->getType()->isVoidTy()) { 9249 VPValue *OriginalV = Plan->getVPValue(Member); 9250 Plan->removeVPValueFor(Member); 9251 Plan->addVPValue(Member, VPIG->getVPValue(J)); 9252 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J)); 9253 J++; 9254 } 9255 RecipeBuilder.getRecipe(Member)->eraseFromParent(); 9256 } 9257 } 9258 9259 // From this point onwards, VPlan-to-VPlan transformations may change the plan 9260 // in ways that accessing values using original IR values is incorrect. 9261 Plan->disableValue2VPValue(); 9262 9263 VPlanTransforms::sinkScalarOperands(*Plan); 9264 VPlanTransforms::mergeReplicateRegions(*Plan); 9265 9266 std::string PlanName; 9267 raw_string_ostream RSO(PlanName); 9268 ElementCount VF = Range.Start; 9269 Plan->addVF(VF); 9270 RSO << "Initial VPlan for VF={" << VF; 9271 for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) { 9272 Plan->addVF(VF); 9273 RSO << "," << VF; 9274 } 9275 RSO << "},UF>=1"; 9276 RSO.flush(); 9277 Plan->setName(PlanName); 9278 9279 assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid"); 9280 return Plan; 9281 } 9282 9283 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { 9284 // Outer loop handling: They may require CFG and instruction level 9285 // transformations before even evaluating whether vectorization is profitable. 9286 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 9287 // the vectorization pipeline. 9288 assert(!OrigLoop->isInnermost()); 9289 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 9290 9291 // Create new empty VPlan 9292 auto Plan = std::make_unique<VPlan>(); 9293 9294 // Build hierarchical CFG 9295 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan); 9296 HCFGBuilder.buildHierarchicalCFG(); 9297 9298 for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End); 9299 VF *= 2) 9300 Plan->addVF(VF); 9301 9302 if (EnableVPlanPredication) { 9303 VPlanPredicator VPP(*Plan); 9304 VPP.predicate(); 9305 9306 // Avoid running transformation to recipes until masked code generation in 9307 // VPlan-native path is in place. 9308 return Plan; 9309 } 9310 9311 SmallPtrSet<Instruction *, 1> DeadInstructions; 9312 VPlanTransforms::VPInstructionsToVPRecipes( 9313 OrigLoop, Plan, 9314 [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); }, 9315 DeadInstructions, *PSE.getSE()); 9316 return Plan; 9317 } 9318 9319 // Adjust the recipes for reductions. For in-loop reductions the chain of 9320 // instructions leading from the loop exit instr to the phi need to be converted 9321 // to reductions, with one operand being vector and the other being the scalar 9322 // reduction chain. For other reductions, a select is introduced between the phi 9323 // and live-out recipes when folding the tail. 
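// As a rough example, for an in-loop add reduction the loop-carried update
//   %sum.next = add i32 %sum, %x
// is rewritten into a reduction recipe that takes the scalar chain value
// (%sum), the widened vector operand (%x) and an optional block mask, so each
// vector iteration folds its lanes into the scalar accumulator instead of
// keeping a vector accumulator that is only reduced after the loop.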
9324 void LoopVectorizationPlanner::adjustRecipesForReductions(
9325     VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder,
9326     ElementCount MinVF) {
9327   for (auto &Reduction : CM.getInLoopReductionChains()) {
9328     PHINode *Phi = Reduction.first;
9329     const RecurrenceDescriptor &RdxDesc =
9330         Legal->getReductionVars().find(Phi)->second;
9331     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
9332
9333     if (MinVF.isScalar() && !CM.useOrderedReductions(RdxDesc))
9334       continue;
9335
9336     // ReductionOperations are ordered top-down from the phi's use to the
9337     // LoopExitValue. We keep track of the previous item (the Chain) to tell
9338     // which of the two operands will remain scalar and which will be reduced.
9339     // For min/max reductions, the chain will be the select instruction.
9340     Instruction *Chain = Phi;
9341     for (Instruction *R : ReductionOperations) {
9342       VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
9343       RecurKind Kind = RdxDesc.getRecurrenceKind();
9344
9345       VPValue *ChainOp = Plan->getVPValue(Chain);
9346       unsigned FirstOpId;
9347       assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
9348              "Only min/max recurrences allowed for inloop reductions");
9349       // Recognize a call to the llvm.fmuladd intrinsic.
9350       bool IsFMulAdd = (Kind == RecurKind::FMulAdd);
9351       assert((!IsFMulAdd || RecurrenceDescriptor::isFMulAddIntrinsic(R)) &&
9352              "Expected instruction to be a call to the llvm.fmuladd intrinsic");
9353       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9354         assert(isa<VPWidenSelectRecipe>(WidenRecipe) &&
9355                "Expected to replace a VPWidenSelectSC");
9356         FirstOpId = 1;
9357       } else {
9358         assert((MinVF.isScalar() || isa<VPWidenRecipe>(WidenRecipe) ||
9359                 (IsFMulAdd && isa<VPWidenCallRecipe>(WidenRecipe))) &&
9360                "Expected to replace a VPWidenSC");
9361         FirstOpId = 0;
9362       }
9363       unsigned VecOpId =
9364           R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
9365       VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));
9366
9367       auto *CondOp = CM.foldTailByMasking()
9368                          ? RecipeBuilder.createBlockInMask(R->getParent(), Plan)
9369                          : nullptr;
9370
9371       if (IsFMulAdd) {
9372         // If the instruction is a call to the llvm.fmuladd intrinsic then we
9373         // need to create an fmul recipe to use as the vector operand for the
9374         // fadd reduction.
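        // Roughly, for
        //   %s = call float @llvm.fmuladd.f32(float %a, float %b, float %sum)
        // we emit %m = fmul float %a, %b and feed %m, instead of the call, as
        // the vector operand of the fadd reduction with chain value %sum.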
9375 VPInstruction *FMulRecipe = new VPInstruction( 9376 Instruction::FMul, {VecOp, Plan->getVPValue(R->getOperand(1))}); 9377 FMulRecipe->setFastMathFlags(R->getFastMathFlags()); 9378 WidenRecipe->getParent()->insert(FMulRecipe, 9379 WidenRecipe->getIterator()); 9380 VecOp = FMulRecipe; 9381 } 9382 VPReductionRecipe *RedRecipe = 9383 new VPReductionRecipe(&RdxDesc, R, ChainOp, VecOp, CondOp, TTI); 9384 WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe); 9385 Plan->removeVPValueFor(R); 9386 Plan->addVPValue(R, RedRecipe); 9387 WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator()); 9388 WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe); 9389 WidenRecipe->eraseFromParent(); 9390 9391 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9392 VPRecipeBase *CompareRecipe = 9393 RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0))); 9394 assert(isa<VPWidenRecipe>(CompareRecipe) && 9395 "Expected to replace a VPWidenSC"); 9396 assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 && 9397 "Expected no remaining users"); 9398 CompareRecipe->eraseFromParent(); 9399 } 9400 Chain = R; 9401 } 9402 } 9403 9404 // If tail is folded by masking, introduce selects between the phi 9405 // and the live-out instruction of each reduction, at the end of the latch. 9406 if (CM.foldTailByMasking()) { 9407 for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) { 9408 VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R); 9409 if (!PhiR || PhiR->isInLoop()) 9410 continue; 9411 Builder.setInsertPoint(LatchVPBB); 9412 VPValue *Cond = 9413 RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan); 9414 VPValue *Red = PhiR->getBackedgeValue(); 9415 Builder.createNaryOp(Instruction::Select, {Cond, Red, PhiR}); 9416 } 9417 } 9418 } 9419 9420 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 9421 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, 9422 VPSlotTracker &SlotTracker) const { 9423 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; 9424 IG->getInsertPos()->printAsOperand(O, false); 9425 O << ", "; 9426 getAddr()->printAsOperand(O, SlotTracker); 9427 VPValue *Mask = getMask(); 9428 if (Mask) { 9429 O << ", "; 9430 Mask->printAsOperand(O, SlotTracker); 9431 } 9432 9433 unsigned OpIdx = 0; 9434 for (unsigned i = 0; i < IG->getFactor(); ++i) { 9435 if (!IG->getMember(i)) 9436 continue; 9437 if (getNumStoreOperands() > 0) { 9438 O << "\n" << Indent << " store "; 9439 getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker); 9440 O << " to index " << i; 9441 } else { 9442 O << "\n" << Indent << " "; 9443 getVPValue(OpIdx)->printAsOperand(O, SlotTracker); 9444 O << " = load from index " << i; 9445 } 9446 ++OpIdx; 9447 } 9448 } 9449 #endif 9450 9451 void VPWidenCallRecipe::execute(VPTransformState &State) { 9452 State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this, 9453 *this, State); 9454 } 9455 9456 void VPWidenSelectRecipe::execute(VPTransformState &State) { 9457 auto &I = *cast<SelectInst>(getUnderlyingInstr()); 9458 State.ILV->setDebugLocFromInst(&I); 9459 9460 // The condition can be loop invariant but still defined inside the 9461 // loop. This means that we can't just use the original 'cond' value. 9462 // We have to take the 'vectorized' value and pick the first lane. 9463 // Instcombine will make this a no-op. 9464 auto *InvarCond = 9465 InvariantCond ? 
State.get(getOperand(0), VPIteration(0, 0)) : nullptr; 9466 9467 for (unsigned Part = 0; Part < State.UF; ++Part) { 9468 Value *Cond = InvarCond ? InvarCond : State.get(getOperand(0), Part); 9469 Value *Op0 = State.get(getOperand(1), Part); 9470 Value *Op1 = State.get(getOperand(2), Part); 9471 Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1); 9472 State.set(this, Sel, Part); 9473 State.ILV->addMetadata(Sel, &I); 9474 } 9475 } 9476 9477 void VPWidenRecipe::execute(VPTransformState &State) { 9478 auto &I = *cast<Instruction>(getUnderlyingValue()); 9479 auto &Builder = State.Builder; 9480 switch (I.getOpcode()) { 9481 case Instruction::Call: 9482 case Instruction::Br: 9483 case Instruction::PHI: 9484 case Instruction::GetElementPtr: 9485 case Instruction::Select: 9486 llvm_unreachable("This instruction is handled by a different recipe."); 9487 case Instruction::UDiv: 9488 case Instruction::SDiv: 9489 case Instruction::SRem: 9490 case Instruction::URem: 9491 case Instruction::Add: 9492 case Instruction::FAdd: 9493 case Instruction::Sub: 9494 case Instruction::FSub: 9495 case Instruction::FNeg: 9496 case Instruction::Mul: 9497 case Instruction::FMul: 9498 case Instruction::FDiv: 9499 case Instruction::FRem: 9500 case Instruction::Shl: 9501 case Instruction::LShr: 9502 case Instruction::AShr: 9503 case Instruction::And: 9504 case Instruction::Or: 9505 case Instruction::Xor: { 9506 // Just widen unops and binops. 9507 State.ILV->setDebugLocFromInst(&I); 9508 9509 for (unsigned Part = 0; Part < State.UF; ++Part) { 9510 SmallVector<Value *, 2> Ops; 9511 for (VPValue *VPOp : operands()) 9512 Ops.push_back(State.get(VPOp, Part)); 9513 9514 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); 9515 9516 if (auto *VecOp = dyn_cast<Instruction>(V)) { 9517 VecOp->copyIRFlags(&I); 9518 9519 // If the instruction is vectorized and was in a basic block that needed 9520 // predication, we can't propagate poison-generating flags (nuw/nsw, 9521 // exact, etc.). The control flow has been linearized and the 9522 // instruction is no longer guarded by the predicate, which could make 9523 // the flag properties to no longer hold. 9524 if (State.MayGeneratePoisonRecipes.count(this) > 0) 9525 VecOp->dropPoisonGeneratingFlags(); 9526 } 9527 9528 // Use this vector value for all users of the original instruction. 9529 State.set(this, V, Part); 9530 State.ILV->addMetadata(V, &I); 9531 } 9532 9533 break; 9534 } 9535 case Instruction::ICmp: 9536 case Instruction::FCmp: { 9537 // Widen compares. Generate vector compares. 9538 bool FCmp = (I.getOpcode() == Instruction::FCmp); 9539 auto *Cmp = cast<CmpInst>(&I); 9540 State.ILV->setDebugLocFromInst(Cmp); 9541 for (unsigned Part = 0; Part < State.UF; ++Part) { 9542 Value *A = State.get(getOperand(0), Part); 9543 Value *B = State.get(getOperand(1), Part); 9544 Value *C = nullptr; 9545 if (FCmp) { 9546 // Propagate fast math flags. 
9547 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 9548 Builder.setFastMathFlags(Cmp->getFastMathFlags()); 9549 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); 9550 } else { 9551 C = Builder.CreateICmp(Cmp->getPredicate(), A, B); 9552 } 9553 State.set(this, C, Part); 9554 State.ILV->addMetadata(C, &I); 9555 } 9556 9557 break; 9558 } 9559 9560 case Instruction::ZExt: 9561 case Instruction::SExt: 9562 case Instruction::FPToUI: 9563 case Instruction::FPToSI: 9564 case Instruction::FPExt: 9565 case Instruction::PtrToInt: 9566 case Instruction::IntToPtr: 9567 case Instruction::SIToFP: 9568 case Instruction::UIToFP: 9569 case Instruction::Trunc: 9570 case Instruction::FPTrunc: 9571 case Instruction::BitCast: { 9572 auto *CI = cast<CastInst>(&I); 9573 State.ILV->setDebugLocFromInst(CI); 9574 9575 /// Vectorize casts. 9576 Type *DestTy = (State.VF.isScalar()) 9577 ? CI->getType() 9578 : VectorType::get(CI->getType(), State.VF); 9579 9580 for (unsigned Part = 0; Part < State.UF; ++Part) { 9581 Value *A = State.get(getOperand(0), Part); 9582 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); 9583 State.set(this, Cast, Part); 9584 State.ILV->addMetadata(Cast, &I); 9585 } 9586 break; 9587 } 9588 default: 9589 // This instruction is not vectorized by simple widening. 9590 LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); 9591 llvm_unreachable("Unhandled instruction!"); 9592 } // end of switch. 9593 } 9594 9595 void VPWidenGEPRecipe::execute(VPTransformState &State) { 9596 auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr()); 9597 // Construct a vector GEP by widening the operands of the scalar GEP as 9598 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP 9599 // results in a vector of pointers when at least one operand of the GEP 9600 // is vector-typed. Thus, to keep the representation compact, we only use 9601 // vector-typed operands for loop-varying values. 9602 9603 if (State.VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { 9604 // If we are vectorizing, but the GEP has only loop-invariant operands, 9605 // the GEP we build (by only using vector-typed operands for 9606 // loop-varying values) would be a scalar pointer. Thus, to ensure we 9607 // produce a vector of pointers, we need to either arbitrarily pick an 9608 // operand to broadcast, or broadcast a clone of the original GEP. 9609 // Here, we broadcast a clone of the original. 9610 // 9611 // TODO: If at some point we decide to scalarize instructions having 9612 // loop-invariant operands, this special case will no longer be 9613 // required. We would add the scalarization decision to 9614 // collectLoopScalars() and teach getVectorValue() to broadcast 9615 // the lane-zero scalar value. 9616 auto *Clone = State.Builder.Insert(GEP->clone()); 9617 for (unsigned Part = 0; Part < State.UF; ++Part) { 9618 Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, Clone); 9619 State.set(this, EntryPart, Part); 9620 State.ILV->addMetadata(EntryPart, GEP); 9621 } 9622 } else { 9623 // If the GEP has at least one loop-varying operand, we are sure to 9624 // produce a vector of pointers. But if we are only unrolling, we want 9625 // to produce a scalar GEP for each unroll part. Thus, the GEP we 9626 // produce with the code below will be scalar (if VF == 1) or vector 9627 // (otherwise). Note that for the unroll-only case, we still maintain 9628 // values in the vector mapping with initVector, as we do for other 9629 // instructions. 
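// For illustration only (value names and element type below are hypothetical):
// with VF=4 and a loop-varying index, the loop below emits one GEP per unroll
// part, e.g.
//   %gep.0 = getelementptr inbounds i32, i32* %base, <4 x i64> %idx.0
//   %gep.1 = getelementptr inbounds i32, i32* %base, <4 x i64> %idx.1
// each yielding a <4 x i32*> vector of pointers.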
9630 for (unsigned Part = 0; Part < State.UF; ++Part) { 9631 // The pointer operand of the new GEP. If it's loop-invariant, we 9632 // won't broadcast it. 9633 auto *Ptr = IsPtrLoopInvariant 9634 ? State.get(getOperand(0), VPIteration(0, 0)) 9635 : State.get(getOperand(0), Part); 9636 9637 // Collect all the indices for the new GEP. If any index is 9638 // loop-invariant, we won't broadcast it. 9639 SmallVector<Value *, 4> Indices; 9640 for (unsigned I = 1, E = getNumOperands(); I < E; I++) { 9641 VPValue *Operand = getOperand(I); 9642 if (IsIndexLoopInvariant[I - 1]) 9643 Indices.push_back(State.get(Operand, VPIteration(0, 0))); 9644 else 9645 Indices.push_back(State.get(Operand, Part)); 9646 } 9647 9648 // If the GEP instruction is vectorized and was in a basic block that 9649 // needed predication, we can't propagate the poison-generating 'inbounds' 9650 // flag. The control flow has been linearized and the GEP is no longer 9651 // guarded by the predicate, which could make the 'inbounds' properties to 9652 // no longer hold. 9653 bool IsInBounds = 9654 GEP->isInBounds() && State.MayGeneratePoisonRecipes.count(this) == 0; 9655 9656 // Create the new GEP. Note that this GEP may be a scalar if VF == 1, 9657 // but it should be a vector, otherwise. 9658 auto *NewGEP = IsInBounds 9659 ? State.Builder.CreateInBoundsGEP( 9660 GEP->getSourceElementType(), Ptr, Indices) 9661 : State.Builder.CreateGEP(GEP->getSourceElementType(), 9662 Ptr, Indices); 9663 assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) && 9664 "NewGEP is not a pointer vector"); 9665 State.set(this, NewGEP, Part); 9666 State.ILV->addMetadata(NewGEP, GEP); 9667 } 9668 } 9669 } 9670 9671 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { 9672 assert(!State.Instance && "Int or FP induction being replicated."); 9673 State.ILV->widenIntOrFpInduction(IV, getInductionDescriptor(), 9674 getStartValue()->getLiveInIRValue(), 9675 getTruncInst(), getVPValue(0), State); 9676 } 9677 9678 void VPWidenPHIRecipe::execute(VPTransformState &State) { 9679 State.ILV->widenPHIInstruction(cast<PHINode>(getUnderlyingValue()), this, 9680 State); 9681 } 9682 9683 void VPBlendRecipe::execute(VPTransformState &State) { 9684 State.ILV->setDebugLocFromInst(Phi, &State.Builder); 9685 // We know that all PHIs in non-header blocks are converted into 9686 // selects, so we don't have to worry about the insertion order and we 9687 // can just use the builder. 9688 // At this point we generate the predication tree. There may be 9689 // duplications since this is a simple recursive scan, but future 9690 // optimizations will clean it up. 9691 9692 unsigned NumIncoming = getNumIncomingValues(); 9693 9694 // Generate a sequence of selects of the form: 9695 // SELECT(Mask3, In3, 9696 // SELECT(Mask2, In2, 9697 // SELECT(Mask1, In1, 9698 // In0))) 9699 // Note that Mask0 is never used: lanes for which no path reaches this phi and 9700 // are essentially undef are taken from In0. 9701 InnerLoopVectorizer::VectorParts Entry(State.UF); 9702 for (unsigned In = 0; In < NumIncoming; ++In) { 9703 for (unsigned Part = 0; Part < State.UF; ++Part) { 9704 // We might have single edge PHIs (blocks) - use an identity 9705 // 'select' for the first PHI operand. 9706 Value *In0 = State.get(getIncomingValue(In), Part); 9707 if (In == 0) 9708 Entry[Part] = In0; // Initialize with the first incoming value. 9709 else { 9710 // Select between the current value and the previous incoming edge 9711 // based on the incoming mask. 
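// For illustration (hypothetical names): for incoming value In this emits
// roughly
//   %predphi = select <4 x i1> %mask.In, <4 x i32> %in.In, <4 x i32> %prev
// folding all incoming values into a single select chain per unroll part.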
9712 Value *Cond = State.get(getMask(In), Part); 9713 Entry[Part] = 9714 State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi"); 9715 } 9716 } 9717 } 9718 for (unsigned Part = 0; Part < State.UF; ++Part) 9719 State.set(this, Entry[Part], Part); 9720 } 9721 9722 void VPInterleaveRecipe::execute(VPTransformState &State) { 9723 assert(!State.Instance && "Interleave group being replicated."); 9724 State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(), 9725 getStoredValues(), getMask()); 9726 } 9727 9728 void VPReductionRecipe::execute(VPTransformState &State) { 9729 assert(!State.Instance && "Reduction being replicated."); 9730 Value *PrevInChain = State.get(getChainOp(), 0); 9731 RecurKind Kind = RdxDesc->getRecurrenceKind(); 9732 bool IsOrdered = State.ILV->useOrderedReductions(*RdxDesc); 9733 // Propagate the fast-math flags carried by the underlying instruction. 9734 IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder); 9735 State.Builder.setFastMathFlags(RdxDesc->getFastMathFlags()); 9736 for (unsigned Part = 0; Part < State.UF; ++Part) { 9737 Value *NewVecOp = State.get(getVecOp(), Part); 9738 if (VPValue *Cond = getCondOp()) { 9739 Value *NewCond = State.get(Cond, Part); 9740 VectorType *VecTy = cast<VectorType>(NewVecOp->getType()); 9741 Value *Iden = RdxDesc->getRecurrenceIdentity( 9742 Kind, VecTy->getElementType(), RdxDesc->getFastMathFlags()); 9743 Value *IdenVec = 9744 State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden); 9745 Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec); 9746 NewVecOp = Select; 9747 } 9748 Value *NewRed; 9749 Value *NextInChain; 9750 if (IsOrdered) { 9751 if (State.VF.isVector()) 9752 NewRed = createOrderedReduction(State.Builder, *RdxDesc, NewVecOp, 9753 PrevInChain); 9754 else 9755 NewRed = State.Builder.CreateBinOp( 9756 (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), PrevInChain, 9757 NewVecOp); 9758 PrevInChain = NewRed; 9759 } else { 9760 PrevInChain = State.get(getChainOp(), Part); 9761 NewRed = createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp); 9762 } 9763 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9764 NextInChain = 9765 createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(), 9766 NewRed, PrevInChain); 9767 } else if (IsOrdered) 9768 NextInChain = NewRed; 9769 else 9770 NextInChain = State.Builder.CreateBinOp( 9771 (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), NewRed, 9772 PrevInChain); 9773 State.set(this, NextInChain, Part); 9774 } 9775 } 9776 9777 void VPReplicateRecipe::execute(VPTransformState &State) { 9778 if (State.Instance) { // Generate a single instance. 9779 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector"); 9780 State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *State.Instance, 9781 IsPredicated, State); 9782 // Insert scalar instance packing it into a vector. 9783 if (AlsoPack && State.VF.isVector()) { 9784 // If we're constructing lane 0, initialize to start from poison. 
9785 if (State.Instance->Lane.isFirstLane()) {
9786 assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
9787 Value *Poison = PoisonValue::get(
9788 VectorType::get(getUnderlyingValue()->getType(), State.VF));
9789 State.set(this, Poison, State.Instance->Part);
9790 }
9791 State.ILV->packScalarIntoVectorValue(this, *State.Instance, State);
9792 }
9793 return;
9794 }
9795
9796 // Generate scalar instances for all VF lanes of all UF parts, unless the
9797 // instruction is uniform, in which case generate only the first lane for each
9798 // of the UF parts.
9799 unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();
9800 assert((!State.VF.isScalable() || IsUniform) &&
9801 "Can't scalarize a scalable vector");
9802 for (unsigned Part = 0; Part < State.UF; ++Part)
9803 for (unsigned Lane = 0; Lane < EndLane; ++Lane)
9804 State.ILV->scalarizeInstruction(getUnderlyingInstr(), this,
9805 VPIteration(Part, Lane), IsPredicated,
9806 State);
9807 }
9808
9809 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
9810 assert(State.Instance && "Branch on Mask works only on single instance.");
9811
9812 unsigned Part = State.Instance->Part;
9813 unsigned Lane = State.Instance->Lane.getKnownLane();
9814
9815 Value *ConditionBit = nullptr;
9816 VPValue *BlockInMask = getMask();
9817 if (BlockInMask) {
9818 ConditionBit = State.get(BlockInMask, Part);
9819 if (ConditionBit->getType()->isVectorTy())
9820 ConditionBit = State.Builder.CreateExtractElement(
9821 ConditionBit, State.Builder.getInt32(Lane));
9822 } else // Block in mask is all-one.
9823 ConditionBit = State.Builder.getTrue();
9824
9825 // Replace the temporary unreachable terminator with a new conditional branch,
9826 // whose two destinations will be set later when they are created.
9827 auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
9828 assert(isa<UnreachableInst>(CurrentTerminator) &&
9829 "Expected to replace unreachable terminator with conditional branch.");
9830 auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
9831 CondBr->setSuccessor(0, nullptr);
9832 ReplaceInstWithInst(CurrentTerminator, CondBr);
9833 }
9834
9835 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
9836 assert(State.Instance && "Predicated instruction PHI works per instance.");
9837 Instruction *ScalarPredInst =
9838 cast<Instruction>(State.get(getOperand(0), *State.Instance));
9839 BasicBlock *PredicatedBB = ScalarPredInst->getParent();
9840 BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
9841 assert(PredicatingBB && "Predicated block has no single predecessor.");
9842 assert(isa<VPReplicateRecipe>(getOperand(0)) &&
9843 "operand must be VPReplicateRecipe");
9844
9845 // By current pack/unpack logic we need to generate only a single phi node: if
9846 // a vector value for the predicated instruction exists at this point it means
9847 // the instruction has vector users only, and a phi for the vector value is
9848 // needed. In this case the recipe of the predicated instruction is marked to
9849 // also do that packing, thereby "hoisting" the insert-element sequence.
9850 // Otherwise, a phi node for the scalar value is needed.
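// Illustrative sketch of the vector case (hypothetical names):
//   %vphi = phi <4 x i32> [ %partial.vec, %predicating.bb ],
//                         [ %insertelt, %predicated.bb ]
// i.e. the phi merges the vector from before the predicated block with the
// vector that has the newly inserted element.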
9851 unsigned Part = State.Instance->Part; 9852 if (State.hasVectorValue(getOperand(0), Part)) { 9853 Value *VectorValue = State.get(getOperand(0), Part); 9854 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); 9855 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); 9856 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. 9857 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. 9858 if (State.hasVectorValue(this, Part)) 9859 State.reset(this, VPhi, Part); 9860 else 9861 State.set(this, VPhi, Part); 9862 // NOTE: Currently we need to update the value of the operand, so the next 9863 // predicated iteration inserts its generated value in the correct vector. 9864 State.reset(getOperand(0), VPhi, Part); 9865 } else { 9866 Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType(); 9867 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); 9868 Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()), 9869 PredicatingBB); 9870 Phi->addIncoming(ScalarPredInst, PredicatedBB); 9871 if (State.hasScalarValue(this, *State.Instance)) 9872 State.reset(this, Phi, *State.Instance); 9873 else 9874 State.set(this, Phi, *State.Instance); 9875 // NOTE: Currently we need to update the value of the operand, so the next 9876 // predicated iteration inserts its generated value in the correct vector. 9877 State.reset(getOperand(0), Phi, *State.Instance); 9878 } 9879 } 9880 9881 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { 9882 VPValue *StoredValue = isStore() ? getStoredValue() : nullptr; 9883 9884 // Attempt to issue a wide load. 9885 LoadInst *LI = dyn_cast<LoadInst>(&Ingredient); 9886 StoreInst *SI = dyn_cast<StoreInst>(&Ingredient); 9887 9888 assert((LI || SI) && "Invalid Load/Store instruction"); 9889 assert((!SI || StoredValue) && "No stored value provided for widened store"); 9890 assert((!LI || !StoredValue) && "Stored value provided for widened load"); 9891 9892 Type *ScalarDataTy = getLoadStoreType(&Ingredient); 9893 9894 auto *DataTy = VectorType::get(ScalarDataTy, State.VF); 9895 const Align Alignment = getLoadStoreAlignment(&Ingredient); 9896 bool CreateGatherScatter = !Consecutive; 9897 9898 auto &Builder = State.Builder; 9899 InnerLoopVectorizer::VectorParts BlockInMaskParts(State.UF); 9900 bool isMaskRequired = getMask(); 9901 if (isMaskRequired) 9902 for (unsigned Part = 0; Part < State.UF; ++Part) 9903 BlockInMaskParts[Part] = State.get(getMask(), Part); 9904 9905 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { 9906 // Calculate the pointer for the specific unroll-part. 9907 GetElementPtrInst *PartPtr = nullptr; 9908 9909 bool InBounds = false; 9910 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) 9911 InBounds = gep->isInBounds(); 9912 if (Reverse) { 9913 // If the address is consecutive but reversed, then the 9914 // wide store needs to start at the last vector element. 
9915 // RunTimeVF = VScale * VF.getKnownMinValue() 9916 // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue() 9917 Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), State.VF); 9918 // NumElt = -Part * RunTimeVF 9919 Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF); 9920 // LastLane = 1 - RunTimeVF 9921 Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF); 9922 PartPtr = 9923 cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt)); 9924 PartPtr->setIsInBounds(InBounds); 9925 PartPtr = cast<GetElementPtrInst>( 9926 Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane)); 9927 PartPtr->setIsInBounds(InBounds); 9928 if (isMaskRequired) // Reverse of a null all-one mask is a null mask. 9929 BlockInMaskParts[Part] = 9930 Builder.CreateVectorReverse(BlockInMaskParts[Part], "reverse"); 9931 } else { 9932 Value *Increment = 9933 createStepForVF(Builder, Builder.getInt32Ty(), State.VF, Part); 9934 PartPtr = cast<GetElementPtrInst>( 9935 Builder.CreateGEP(ScalarDataTy, Ptr, Increment)); 9936 PartPtr->setIsInBounds(InBounds); 9937 } 9938 9939 unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); 9940 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); 9941 }; 9942 9943 // Handle Stores: 9944 if (SI) { 9945 State.ILV->setDebugLocFromInst(SI); 9946 9947 for (unsigned Part = 0; Part < State.UF; ++Part) { 9948 Instruction *NewSI = nullptr; 9949 Value *StoredVal = State.get(StoredValue, Part); 9950 if (CreateGatherScatter) { 9951 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 9952 Value *VectorGep = State.get(getAddr(), Part); 9953 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, 9954 MaskPart); 9955 } else { 9956 if (Reverse) { 9957 // If we store to reverse consecutive memory locations, then we need 9958 // to reverse the order of elements in the stored value. 9959 StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse"); 9960 // We don't want to update the value in the map as it might be used in 9961 // another expression. So don't call resetVectorValue(StoredVal). 9962 } 9963 auto *VecPtr = 9964 CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0))); 9965 if (isMaskRequired) 9966 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, 9967 BlockInMaskParts[Part]); 9968 else 9969 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); 9970 } 9971 State.ILV->addMetadata(NewSI, SI); 9972 } 9973 return; 9974 } 9975 9976 // Handle loads. 9977 assert(LI && "Must have a load instruction"); 9978 State.ILV->setDebugLocFromInst(LI); 9979 for (unsigned Part = 0; Part < State.UF; ++Part) { 9980 Value *NewLI; 9981 if (CreateGatherScatter) { 9982 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 9983 Value *VectorGep = State.get(getAddr(), Part); 9984 NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart, 9985 nullptr, "wide.masked.gather"); 9986 State.ILV->addMetadata(NewLI, LI); 9987 } else { 9988 auto *VecPtr = 9989 CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0))); 9990 if (isMaskRequired) 9991 NewLI = Builder.CreateMaskedLoad( 9992 DataTy, VecPtr, Alignment, BlockInMaskParts[Part], 9993 PoisonValue::get(DataTy), "wide.masked.load"); 9994 else 9995 NewLI = 9996 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); 9997 9998 // Add metadata to the load, but setVectorValue to the reverse shuffle. 
9999 State.ILV->addMetadata(NewLI, LI); 10000 if (Reverse) 10001 NewLI = Builder.CreateVectorReverse(NewLI, "reverse"); 10002 } 10003 10004 State.set(getVPSingleValue(), NewLI, Part); 10005 } 10006 } 10007 10008 // Determine how to lower the scalar epilogue, which depends on 1) optimising 10009 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing 10010 // predication, and 4) a TTI hook that analyses whether the loop is suitable 10011 // for predication. 10012 static ScalarEpilogueLowering getScalarEpilogueLowering( 10013 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, 10014 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, 10015 AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, 10016 LoopVectorizationLegality &LVL) { 10017 // 1) OptSize takes precedence over all other options, i.e. if this is set, 10018 // don't look at hints or options, and don't request a scalar epilogue. 10019 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from 10020 // LoopAccessInfo (due to code dependency and not being able to reliably get 10021 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection 10022 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without 10023 // versioning when the vectorization is forced, unlike hasOptSize. So revert 10024 // back to the old way and vectorize with versioning when forced. See D81345.) 10025 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, 10026 PGSOQueryType::IRPass) && 10027 Hints.getForce() != LoopVectorizeHints::FK_Enabled)) 10028 return CM_ScalarEpilogueNotAllowedOptSize; 10029 10030 // 2) If set, obey the directives 10031 if (PreferPredicateOverEpilogue.getNumOccurrences()) { 10032 switch (PreferPredicateOverEpilogue) { 10033 case PreferPredicateTy::ScalarEpilogue: 10034 return CM_ScalarEpilogueAllowed; 10035 case PreferPredicateTy::PredicateElseScalarEpilogue: 10036 return CM_ScalarEpilogueNotNeededUsePredicate; 10037 case PreferPredicateTy::PredicateOrDontVectorize: 10038 return CM_ScalarEpilogueNotAllowedUsePredicate; 10039 }; 10040 } 10041 10042 // 3) If set, obey the hints 10043 switch (Hints.getPredicate()) { 10044 case LoopVectorizeHints::FK_Enabled: 10045 return CM_ScalarEpilogueNotNeededUsePredicate; 10046 case LoopVectorizeHints::FK_Disabled: 10047 return CM_ScalarEpilogueAllowed; 10048 }; 10049 10050 // 4) if the TTI hook indicates this is profitable, request predication. 10051 if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, 10052 LVL.getLAI())) 10053 return CM_ScalarEpilogueNotNeededUsePredicate; 10054 10055 return CM_ScalarEpilogueAllowed; 10056 } 10057 10058 Value *VPTransformState::get(VPValue *Def, unsigned Part) { 10059 // If Values have been set for this Def return the one relevant for \p Part. 10060 if (hasVectorValue(Def, Part)) 10061 return Data.PerPartOutput[Def][Part]; 10062 10063 if (!hasScalarValue(Def, {Part, 0})) { 10064 Value *IRV = Def->getLiveInIRValue(); 10065 Value *B = ILV->getBroadcastInstrs(IRV); 10066 set(Def, B, Part); 10067 return B; 10068 } 10069 10070 Value *ScalarValue = get(Def, {Part, 0}); 10071 // If we aren't vectorizing, we can just copy the scalar map values over 10072 // to the vector map. 
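// (For VF == 1 the lane-zero scalar value also serves as the per-part value,
// so no broadcast or insertelement sequence is needed.)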
10073 if (VF.isScalar()) {
10074 set(Def, ScalarValue, Part);
10075 return ScalarValue;
10076 }
10077
10078 auto *RepR = dyn_cast<VPReplicateRecipe>(Def);
10079 bool IsUniform = RepR && RepR->isUniform();
10080
10081 unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1;
10082 // Check if there is a scalar value for the selected lane.
10083 if (!hasScalarValue(Def, {Part, LastLane})) {
10084 // At the moment, VPWidenIntOrFpInductionRecipes can also be uniform.
10085 assert(isa<VPWidenIntOrFpInductionRecipe>(Def->getDef()) &&
10086 "unexpected recipe found to be invariant");
10087 IsUniform = true;
10088 LastLane = 0;
10089 }
10090
10091 auto *LastInst = cast<Instruction>(get(Def, {Part, LastLane}));
10092 // Set the insert point after the last scalarized instruction or after the
10093 // last PHI, if LastInst is a PHI. This ensures the insertelement sequence
10094 // will directly follow the scalar definitions.
10095 auto OldIP = Builder.saveIP();
10096 auto NewIP =
10097 isa<PHINode>(LastInst)
10098 ? BasicBlock::iterator(LastInst->getParent()->getFirstNonPHI())
10099 : std::next(BasicBlock::iterator(LastInst));
10100 Builder.SetInsertPoint(&*NewIP);
10101
10102 // However, if we are vectorizing, we need to construct the vector values.
10103 // If the value is known to be uniform after vectorization, we can just
10104 // broadcast the scalar value corresponding to lane zero for each unroll
10105 // iteration. Otherwise, we construct the vector values using
10106 // insertelement instructions. Since the resulting vectors are stored in
10107 // State, we will only generate the insertelements once.
10108 Value *VectorValue = nullptr;
10109 if (IsUniform) {
10110 VectorValue = ILV->getBroadcastInstrs(ScalarValue);
10111 set(Def, VectorValue, Part);
10112 } else {
10113 // Initialize packing with insertelements to start from poison.
10114 assert(!VF.isScalable() && "VF is assumed to be non scalable.");
10115 Value *Poison = PoisonValue::get(VectorType::get(LastInst->getType(), VF));
10116 set(Def, Poison, Part);
10117 for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
10118 ILV->packScalarIntoVectorValue(Def, {Part, Lane}, *this);
10119 VectorValue = get(Def, Part);
10120 }
10121 Builder.restoreIP(OldIP);
10122 return VectorValue;
10123 }
10124
10125 // Process the loop in the VPlan-native vectorization path. This path builds
10126 // VPlan upfront in the vectorization pipeline, which allows applying
10127 // VPlan-to-VPlan transformations from the very beginning without modifying the
10128 // input LLVM IR.
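// (Illustrative usage note: this path is guarded by EnableVPlanNativePath,
// which is typically switched on via the -enable-vplan-native-path option.)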
10129 static bool processLoopInVPlanNativePath( 10130 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, 10131 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, 10132 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, 10133 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, 10134 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints, 10135 LoopVectorizationRequirements &Requirements) { 10136 10137 if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) { 10138 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n"); 10139 return false; 10140 } 10141 assert(EnableVPlanNativePath && "VPlan-native path is disabled."); 10142 Function *F = L->getHeader()->getParent(); 10143 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); 10144 10145 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 10146 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL); 10147 10148 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, 10149 &Hints, IAI); 10150 // Use the planner for outer loop vectorization. 10151 // TODO: CM is not used at this point inside the planner. Turn CM into an 10152 // optional argument if we don't need it in the future. 10153 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints, 10154 Requirements, ORE); 10155 10156 // Get user vectorization factor. 10157 ElementCount UserVF = Hints.getWidth(); 10158 10159 CM.collectElementTypesForWidening(); 10160 10161 // Plan how to best vectorize, return the best VF and its cost. 10162 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF); 10163 10164 // If we are stress testing VPlan builds, do not attempt to generate vector 10165 // code. Masked vector code generation support will follow soon. 10166 // Also, do not attempt to vectorize if no vector code will be produced. 10167 if (VPlanBuildStressTest || EnableVPlanPredication || 10168 VectorizationFactor::Disabled() == VF) 10169 return false; 10170 10171 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); 10172 10173 { 10174 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, 10175 F->getParent()->getDataLayout()); 10176 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL, 10177 &CM, BFI, PSI, Checks); 10178 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" 10179 << L->getHeader()->getParent()->getName() << "\"\n"); 10180 LVP.executePlan(VF.Width, 1, BestPlan, LB, DT); 10181 } 10182 10183 // Mark the loop as already vectorized to avoid vectorizing again. 10184 Hints.setAlreadyVectorized(); 10185 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 10186 return true; 10187 } 10188 10189 // Emit a remark if there are stores to floats that required a floating point 10190 // extension. If the vectorized loop was generated with floating point there 10191 // will be a performance penalty from the conversion overhead and the change in 10192 // the vector width. 10193 static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) { 10194 SmallVector<Instruction *, 4> Worklist; 10195 for (BasicBlock *BB : L->getBlocks()) { 10196 for (Instruction &Inst : *BB) { 10197 if (auto *S = dyn_cast<StoreInst>(&Inst)) { 10198 if (S->getValueOperand()->getType()->isFloatTy()) 10199 Worklist.push_back(S); 10200 } 10201 } 10202 } 10203 10204 // Traverse the floating point stores upwards searching, for floating point 10205 // conversions. 
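// Illustrative IR pattern that triggers the remark (hypothetical names):
//   %e = fpext float %x to double
//   %a = fadd double %e, %y
//   %t = fptrunc double %a to float
//   store float %t, float* %p
// Walking up the operands from the float store reaches the fpext, for which
// the mixed-precision remark is then emitted.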
10206 SmallPtrSet<const Instruction *, 4> Visited; 10207 SmallPtrSet<const Instruction *, 4> EmittedRemark; 10208 while (!Worklist.empty()) { 10209 auto *I = Worklist.pop_back_val(); 10210 if (!L->contains(I)) 10211 continue; 10212 if (!Visited.insert(I).second) 10213 continue; 10214 10215 // Emit a remark if the floating point store required a floating 10216 // point conversion. 10217 // TODO: More work could be done to identify the root cause such as a 10218 // constant or a function return type and point the user to it. 10219 if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second) 10220 ORE->emit([&]() { 10221 return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision", 10222 I->getDebugLoc(), L->getHeader()) 10223 << "floating point conversion changes vector width. " 10224 << "Mixed floating point precision requires an up/down " 10225 << "cast that will negatively impact performance."; 10226 }); 10227 10228 for (Use &Op : I->operands()) 10229 if (auto *OpI = dyn_cast<Instruction>(Op)) 10230 Worklist.push_back(OpI); 10231 } 10232 } 10233 10234 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts) 10235 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced || 10236 !EnableLoopInterleaving), 10237 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced || 10238 !EnableLoopVectorization) {} 10239 10240 bool LoopVectorizePass::processLoop(Loop *L) { 10241 assert((EnableVPlanNativePath || L->isInnermost()) && 10242 "VPlan-native path is not enabled. Only process inner loops."); 10243 10244 #ifndef NDEBUG 10245 const std::string DebugLocStr = getDebugLocString(L); 10246 #endif /* NDEBUG */ 10247 10248 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \"" 10249 << L->getHeader()->getParent()->getName() << "\" from " 10250 << DebugLocStr << "\n"); 10251 10252 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE); 10253 10254 LLVM_DEBUG( 10255 dbgs() << "LV: Loop hints:" 10256 << " force=" 10257 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled 10258 ? "disabled" 10259 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled 10260 ? "enabled" 10261 : "?")) 10262 << " width=" << Hints.getWidth() 10263 << " interleave=" << Hints.getInterleave() << "\n"); 10264 10265 // Function containing loop 10266 Function *F = L->getHeader()->getParent(); 10267 10268 // Looking at the diagnostic output is the only way to determine if a loop 10269 // was vectorized (other than looking at the IR or machine code), so it 10270 // is important to generate an optimization remark for each loop. Most of 10271 // these messages are generated as OptimizationRemarkAnalysis. Remarks 10272 // generated as OptimizationRemark and OptimizationRemarkMissed are 10273 // less verbose reporting vectorized loops and unvectorized loops that may 10274 // benefit from vectorization, respectively. 10275 10276 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) { 10277 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); 10278 return false; 10279 } 10280 10281 PredicatedScalarEvolution PSE(*SE, *L); 10282 10283 // Check if it is legal to vectorize the loop. 
10284 LoopVectorizationRequirements Requirements; 10285 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE, 10286 &Requirements, &Hints, DB, AC, BFI, PSI); 10287 if (!LVL.canVectorize(EnableVPlanNativePath)) { 10288 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); 10289 Hints.emitRemarkWithHints(); 10290 return false; 10291 } 10292 10293 // Check the function attributes and profiles to find out if this function 10294 // should be optimized for size. 10295 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 10296 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL); 10297 10298 // Entrance to the VPlan-native vectorization path. Outer loops are processed 10299 // here. They may require CFG and instruction level transformations before 10300 // even evaluating whether vectorization is profitable. Since we cannot modify 10301 // the incoming IR, we need to build VPlan upfront in the vectorization 10302 // pipeline. 10303 if (!L->isInnermost()) 10304 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, 10305 ORE, BFI, PSI, Hints, Requirements); 10306 10307 assert(L->isInnermost() && "Inner loop expected."); 10308 10309 // Check the loop for a trip count threshold: vectorize loops with a tiny trip 10310 // count by optimizing for size, to minimize overheads. 10311 auto ExpectedTC = getSmallBestKnownTC(*SE, L); 10312 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) { 10313 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " 10314 << "This loop is worth vectorizing only if no scalar " 10315 << "iteration overheads are incurred."); 10316 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) 10317 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); 10318 else { 10319 LLVM_DEBUG(dbgs() << "\n"); 10320 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; 10321 } 10322 } 10323 10324 // Check the function attributes to see if implicit floats are allowed. 10325 // FIXME: This check doesn't seem possibly correct -- what if the loop is 10326 // an integer loop and the vector instructions selected are purely integer 10327 // vector instructions? 10328 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { 10329 reportVectorizationFailure( 10330 "Can't vectorize when the NoImplicitFloat attribute is used", 10331 "loop not vectorized due to NoImplicitFloat attribute", 10332 "NoImplicitFloat", ORE, L); 10333 Hints.emitRemarkWithHints(); 10334 return false; 10335 } 10336 10337 // Check if the target supports potentially unsafe FP vectorization. 10338 // FIXME: Add a check for the type of safety issue (denormal, signaling) 10339 // for the target we're vectorizing for, to make sure none of the 10340 // additional fp-math flags can help. 10341 if (Hints.isPotentiallyUnsafe() && 10342 TTI->isFPVectorizationPotentiallyUnsafe()) { 10343 reportVectorizationFailure( 10344 "Potentially unsafe FP op prevents vectorization", 10345 "loop not vectorized due to unsafe FP support.", 10346 "UnsafeFP", ORE, L); 10347 Hints.emitRemarkWithHints(); 10348 return false; 10349 } 10350 10351 bool AllowOrderedReductions; 10352 // If the flag is set, use that instead and override the TTI behaviour. 
10353 if (ForceOrderedReductions.getNumOccurrences() > 0) 10354 AllowOrderedReductions = ForceOrderedReductions; 10355 else 10356 AllowOrderedReductions = TTI->enableOrderedReductions(); 10357 if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) { 10358 ORE->emit([&]() { 10359 auto *ExactFPMathInst = Requirements.getExactFPInst(); 10360 return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps", 10361 ExactFPMathInst->getDebugLoc(), 10362 ExactFPMathInst->getParent()) 10363 << "loop not vectorized: cannot prove it is safe to reorder " 10364 "floating-point operations"; 10365 }); 10366 LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to " 10367 "reorder floating-point operations\n"); 10368 Hints.emitRemarkWithHints(); 10369 return false; 10370 } 10371 10372 bool UseInterleaved = TTI->enableInterleavedAccessVectorization(); 10373 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI()); 10374 10375 // If an override option has been passed in for interleaved accesses, use it. 10376 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0) 10377 UseInterleaved = EnableInterleavedMemAccesses; 10378 10379 // Analyze interleaved memory accesses. 10380 if (UseInterleaved) { 10381 IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI)); 10382 } 10383 10384 // Use the cost model. 10385 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, 10386 F, &Hints, IAI); 10387 CM.collectValuesToIgnore(); 10388 CM.collectElementTypesForWidening(); 10389 10390 // Use the planner for vectorization. 10391 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints, 10392 Requirements, ORE); 10393 10394 // Get user vectorization factor and interleave count. 10395 ElementCount UserVF = Hints.getWidth(); 10396 unsigned UserIC = Hints.getInterleave(); 10397 10398 // Plan how to best vectorize, return the best VF and its cost. 10399 Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC); 10400 10401 VectorizationFactor VF = VectorizationFactor::Disabled(); 10402 unsigned IC = 1; 10403 10404 if (MaybeVF) { 10405 VF = *MaybeVF; 10406 // Select the interleave count. 10407 IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue()); 10408 } 10409 10410 // Identify the diagnostic messages that should be produced. 10411 std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg; 10412 bool VectorizeLoop = true, InterleaveLoop = true; 10413 if (VF.Width.isScalar()) { 10414 LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n"); 10415 VecDiagMsg = std::make_pair( 10416 "VectorizationNotBeneficial", 10417 "the cost-model indicates that vectorization is not beneficial"); 10418 VectorizeLoop = false; 10419 } 10420 10421 if (!MaybeVF && UserIC > 1) { 10422 // Tell the user interleaving was avoided up-front, despite being explicitly 10423 // requested. 10424 LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and " 10425 "interleaving should be avoided up front\n"); 10426 IntDiagMsg = std::make_pair( 10427 "InterleavingAvoided", 10428 "Ignoring UserIC, because interleaving was avoided up front"); 10429 InterleaveLoop = false; 10430 } else if (IC == 1 && UserIC <= 1) { 10431 // Tell the user interleaving is not beneficial. 
10432 LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
10433 IntDiagMsg = std::make_pair(
10434 "InterleavingNotBeneficial",
10435 "the cost-model indicates that interleaving is not beneficial");
10436 InterleaveLoop = false;
10437 if (UserIC == 1) {
10438 IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
10439 IntDiagMsg.second +=
10440 " and is explicitly disabled or interleave count is set to 1";
10441 }
10442 } else if (IC > 1 && UserIC == 1) {
10443 // Tell the user interleaving is beneficial, but it is explicitly disabled.
10444 LLVM_DEBUG(
10445 dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
10446 IntDiagMsg = std::make_pair(
10447 "InterleavingBeneficialButDisabled",
10448 "the cost-model indicates that interleaving is beneficial "
10449 "but is explicitly disabled or interleave count is set to 1");
10450 InterleaveLoop = false;
10451 }
10452
10453 // Override IC if user provided an interleave count.
10454 IC = UserIC > 0 ? UserIC : IC;
10455
10456 // Emit diagnostic messages, if any.
10457 const char *VAPassName = Hints.vectorizeAnalysisPassName();
10458 if (!VectorizeLoop && !InterleaveLoop) {
10459 // Do not vectorize or interleave the loop.
10460 ORE->emit([&]() {
10461 return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
10462 L->getStartLoc(), L->getHeader())
10463 << VecDiagMsg.second;
10464 });
10465 ORE->emit([&]() {
10466 return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
10467 L->getStartLoc(), L->getHeader())
10468 << IntDiagMsg.second;
10469 });
10470 return false;
10471 } else if (!VectorizeLoop && InterleaveLoop) {
10472 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10473 ORE->emit([&]() {
10474 return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
10475 L->getStartLoc(), L->getHeader())
10476 << VecDiagMsg.second;
10477 });
10478 } else if (VectorizeLoop && !InterleaveLoop) {
10479 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10480 << ") in " << DebugLocStr << '\n');
10481 ORE->emit([&]() {
10482 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
10483 L->getStartLoc(), L->getHeader())
10484 << IntDiagMsg.second;
10485 });
10486 } else if (VectorizeLoop && InterleaveLoop) {
10487 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10488 << ") in " << DebugLocStr << '\n');
10489 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10490 }
10491
10492 bool DisableRuntimeUnroll = false;
10493 MDNode *OrigLoopID = L->getLoopID();
10494 {
10495 // Optimistically generate runtime checks. Drop them if they turn out to not
10496 // be profitable. Limit the scope of Checks, so the cleanup happens
10497 // immediately after vector code generation is done.
10498 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
10499 F->getParent()->getDataLayout());
10500 if (!VF.Width.isScalar() || IC > 1)
10501 Checks.Create(L, *LVL.getLAI(), PSE.getUnionPredicate());
10502
10503 using namespace ore;
10504 if (!VectorizeLoop) {
10505 assert(IC > 1 && "interleave count should not be 1 or 0");
10506 // If we decided not to vectorize the loop, then
10507 // interleave it.
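// (As a sketch: the plan is executed with a scalar VF and UF = IC, so the
// scalar loop body is replicated IC times and no vector instructions are
// emitted.)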
10508 InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, 10509 &CM, BFI, PSI, Checks); 10510 10511 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); 10512 LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT); 10513 10514 ORE->emit([&]() { 10515 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(), 10516 L->getHeader()) 10517 << "interleaved loop (interleaved count: " 10518 << NV("InterleaveCount", IC) << ")"; 10519 }); 10520 } else { 10521 // If we decided that it is *legal* to vectorize the loop, then do it. 10522 10523 // Consider vectorizing the epilogue too if it's profitable. 10524 VectorizationFactor EpilogueVF = 10525 CM.selectEpilogueVectorizationFactor(VF.Width, LVP); 10526 if (EpilogueVF.Width.isVector()) { 10527 10528 // The first pass vectorizes the main loop and creates a scalar epilogue 10529 // to be vectorized by executing the plan (potentially with a different 10530 // factor) again shortly afterwards. 10531 EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1); 10532 EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, 10533 EPI, &LVL, &CM, BFI, PSI, Checks); 10534 10535 VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF); 10536 LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV, 10537 DT); 10538 ++LoopsVectorized; 10539 10540 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */); 10541 formLCSSARecursively(*L, *DT, LI, SE); 10542 10543 // Second pass vectorizes the epilogue and adjusts the control flow 10544 // edges from the first pass. 10545 EPI.MainLoopVF = EPI.EpilogueVF; 10546 EPI.MainLoopUF = EPI.EpilogueUF; 10547 EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC, 10548 ORE, EPI, &LVL, &CM, BFI, PSI, 10549 Checks); 10550 10551 VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF); 10552 LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV, 10553 DT); 10554 ++LoopsEpilogueVectorized; 10555 10556 if (!MainILV.areSafetyChecksAdded()) 10557 DisableRuntimeUnroll = true; 10558 } else { 10559 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC, 10560 &LVL, &CM, BFI, PSI, Checks); 10561 10562 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); 10563 LVP.executePlan(VF.Width, IC, BestPlan, LB, DT); 10564 ++LoopsVectorized; 10565 10566 // Add metadata to disable runtime unrolling a scalar loop when there 10567 // are no runtime checks about strides and memory. A scalar loop that is 10568 // rarely used is not worth unrolling. 10569 if (!LB.areSafetyChecksAdded()) 10570 DisableRuntimeUnroll = true; 10571 } 10572 // Report the vectorization decision. 10573 ORE->emit([&]() { 10574 return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(), 10575 L->getHeader()) 10576 << "vectorized loop (vectorization width: " 10577 << NV("VectorizationFactor", VF.Width) 10578 << ", interleaved count: " << NV("InterleaveCount", IC) << ")"; 10579 }); 10580 } 10581 10582 if (ORE->allowExtraAnalysis(LV_NAME)) 10583 checkMixedPrecision(L, ORE); 10584 } 10585 10586 Optional<MDNode *> RemainderLoopID = 10587 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 10588 LLVMLoopVectorizeFollowupEpilogue}); 10589 if (RemainderLoopID.hasValue()) { 10590 L->setLoopID(RemainderLoopID.getValue()); 10591 } else { 10592 if (DisableRuntimeUnroll) 10593 AddRuntimeUnrollDisableMetaData(L); 10594 10595 // Mark the loop as already vectorized to avoid vectorizing again. 
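// (This is assumed to attach llvm.loop.isvectorized metadata to the remaining
// scalar loop, which subsequent runs of the vectorizer respect.)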
10596 Hints.setAlreadyVectorized();
10597 }
10598
10599 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
10600 return true;
10601 }
10602
10603 LoopVectorizeResult LoopVectorizePass::runImpl(
10604 Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
10605 DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
10606 DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_,
10607 std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
10608 OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
10609 SE = &SE_;
10610 LI = &LI_;
10611 TTI = &TTI_;
10612 DT = &DT_;
10613 BFI = &BFI_;
10614 TLI = TLI_;
10615 AA = &AA_;
10616 AC = &AC_;
10617 GetLAA = &GetLAA_;
10618 DB = &DB_;
10619 ORE = &ORE_;
10620 PSI = PSI_;
10621
10622 // Don't attempt if
10623 // 1. the target claims to have no vector registers, and
10624 // 2. interleaving won't help ILP.
10625 //
10626 // The second condition is necessary because, even if the target has no
10627 // vector registers, loop vectorization may still enable scalar
10628 // interleaving.
10629 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
10630 TTI->getMaxInterleaveFactor(1) < 2)
10631 return LoopVectorizeResult(false, false);
10632
10633 bool Changed = false, CFGChanged = false;
10634
10635 // The vectorizer requires loops to be in simplified form.
10636 // Since simplification may add new inner loops, it has to run before the
10637 // legality and profitability checks. This means running the loop vectorizer
10638 // will simplify all loops, regardless of whether anything ends up being
10639 // vectorized.
10640 for (auto &L : *LI)
10641 Changed |= CFGChanged |=
10642 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
10643
10644 // Build up a worklist of inner-loops to vectorize. This is necessary as
10645 // the act of vectorizing or partially unrolling a loop creates new loops
10646 // and can invalidate iterators across the loops.
10647 SmallVector<Loop *, 8> Worklist;
10648
10649 for (Loop *L : *LI)
10650 collectSupportedLoops(*L, LI, ORE, Worklist);
10651
10652 LoopsAnalyzed += Worklist.size();
10653
10654 // Now walk the identified inner loops.
10655 while (!Worklist.empty()) {
10656 Loop *L = Worklist.pop_back_val();
10657
10658 // For the inner loops we actually process, form LCSSA to simplify the
10659 // transform.
10660 Changed |= formLCSSARecursively(*L, *DT, LI, SE);
10661
10662 Changed |= CFGChanged |= processLoop(L);
10663 }
10664
10665 // Process each loop nest in the function.
10666 return LoopVectorizeResult(Changed, CFGChanged);
10667 }
10668
10669 PreservedAnalyses LoopVectorizePass::run(Function &F,
10670 FunctionAnalysisManager &AM) {
10671 auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
10672 auto &LI = AM.getResult<LoopAnalysis>(F);
10673 auto &TTI = AM.getResult<TargetIRAnalysis>(F);
10674 auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
10675 auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
10676 auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
10677 auto &AA = AM.getResult<AAManager>(F);
10678 auto &AC = AM.getResult<AssumptionAnalysis>(F);
10679 auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
10680 auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
10681
10682 auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
10683 std::function<const LoopAccessInfo &(Loop &)> GetLAA =
10684 [&](Loop &L) -> const LoopAccessInfo & {
10685 LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE,
10686 TLI, TTI, nullptr, nullptr, nullptr};
10687 return LAM.getResult<LoopAccessAnalysis>(L, AR);
10688 };
10689 auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
10690 ProfileSummaryInfo *PSI =
10691 MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
10692 LoopVectorizeResult Result =
10693 runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
10694 if (!Result.MadeAnyChange)
10695 return PreservedAnalyses::all();
10696 PreservedAnalyses PA;
10697
10698 // We currently do not preserve loopinfo/dominator analyses with outer loop
10699 // vectorization. Until this is addressed, mark these analyses as preserved
10700 // only for non-VPlan-native path.
10701 // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
10702 if (!EnableVPlanNativePath) {
10703 PA.preserve<LoopAnalysis>();
10704 PA.preserve<DominatorTreeAnalysis>();
10705 }
10706
10707 if (Result.MadeCFGChange) {
10708 // Making CFG changes likely means a loop got vectorized. Indicate that
10709 // extra simplification passes should be run.
10710 // TODO: MadeCFGChange is not a perfect proxy. Extra passes should only
10711 // be run if runtime checks have been added.
10712 AM.getResult<ShouldRunExtraVectorPasses>(F);
10713 PA.preserve<ShouldRunExtraVectorPasses>();
10714 } else {
10715 PA.preserveSet<CFGAnalyses>();
10716 }
10717 return PA;
10718 }
10719
10720 void LoopVectorizePass::printPipeline(
10721 raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
10722 static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
10723 OS, MapClassName2PassName);
10724
10725 OS << "<";
10726 OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
10727 OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
10728 OS << ">";
10729 }
10730
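// Illustrative output of printPipeline with the default options: the pass
// parameters are printed as "<no-interleave-forced-only;no-vectorize-forced-only;>".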